Merge branch 'main' into feature-metacluster

A.J. Beamon 2022-07-21 14:48:53 -07:00
commit 3e1763e7ac
58 changed files with 1208 additions and 792 deletions


@ -194,11 +194,7 @@ if(NOT WIN32)
target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads)
target_include_directories(fdb_c_api_tester_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include ${CMAKE_BINARY_DIR}/flow/include)
if(USE_SANITIZER)
target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_asan)
else()
target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target)
endif()
target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target)
target_link_libraries(fdb_c_api_tester_impl PRIVATE SimpleOpt)
target_include_directories(fdb_c_unit_tests_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/)
@ -211,11 +207,7 @@ if(NOT WIN32)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
if (USE_SANITIZER)
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_asan rapidjson)
else()
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target rapidjson)
endif()
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target rapidjson)
if(NOT OPEN_FOR_IDE)
# Make sure that fdb_c.h is compatible with c90
@ -439,7 +431,7 @@ if (OPEN_FOR_IDE)
add_library(fdb_c_shim OBJECT fdb_c_shim.cpp)
target_link_libraries(fdb_c_shim PUBLIC dl)
elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only
elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE AND NOT USE_UBSAN) # Linux, non-ubsan only
set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
@ -472,7 +464,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only
--api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
)
endif() # End Linux only
endif() # End Linux, non-ubsan only
# TODO: re-enable once the old vcxproj-based build system is removed.
#generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT"


@ -941,13 +941,13 @@ static Value dataOfRecord(const int i) {
return Value(format("data-of-record-%08d", i));
}
static std::string indexEntryKey(const int i) {
return Tuple().append(StringRef(prefix)).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack().toString();
return Tuple::makeTuple(prefix, INDEX, indexKey(i), primaryKey(i)).pack().toString();
}
static std::string recordKey(const int i, const int split) {
return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack().toString();
return Tuple::makeTuple(prefix, RECORD, primaryKey(i), split).pack().toString();
}
static std::string recordValue(const int i, const int split) {
return Tuple().append(dataOfRecord(i)).append(split).pack().toString();
return Tuple::makeTuple(dataOfRecord(i), split).pack().toString();
}
const static int SPLIT_SIZE = 3;
@ -993,13 +993,8 @@ GetMappedRangeResult getMappedIndexEntries(int beginId,
fdb::Transaction& tr,
int matchIndex,
bool allMissing) {
std::string mapper = Tuple()
.append(prefix)
.append(RECORD)
.append(allMissing ? "{K[2]}"_sr : "{K[3]}"_sr)
.append("{...}"_sr)
.pack()
.toString();
std::string mapper =
Tuple::makeTuple(prefix, RECORD, (allMissing ? "{K[2]}"_sr : "{K[3]}"_sr), "{...}"_sr).pack().toString();
return getMappedIndexEntries(beginId, endId, tr, mapper, matchIndex);
}
@ -1037,7 +1032,7 @@ TEST_CASE("tuple_support_versionstamp") {
// a random 12 bytes long StringRef as a versionstamp
StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12"_sr;
Versionstamp vs(str);
const Tuple t = Tuple().append(prefix).append(RECORD).appendVersionstamp(vs).append("{K[3]}"_sr).append("{...}"_sr);
const Tuple t = Tuple::makeTuple(prefix, RECORD, vs, "{K[3]}"_sr, "{...}"_sr);
ASSERT(t.getVersionstamp(2) == vs);
// verify the round-way pack-unpack path for a Tuple containing a versionstamp
@ -1181,7 +1176,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_missing_all_secondary") {
}
TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") {
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString();
std::string mapper = Tuple::makeTuple(prefix, RECORD, "{K[3]}"_sr).pack().toString();
fdb::Transaction tr(db);
auto result = get_mapped_range(
tr,
@ -1200,7 +1195,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") {
}
TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_ryw_enable") {
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString();
std::string mapper = Tuple::makeTuple(prefix, RECORD, "{K[3]}"_sr).pack().toString();
fdb::Transaction tr(db);
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); // Not disable RYW
auto result = get_mapped_range(


@ -38,7 +38,10 @@ function(compile_boost)
set(BOOST_LINK_FLAGS "")
if(APPLE OR CLANG OR ICX OR USE_LIBCXX)
list(APPEND BOOST_COMPILER_FLAGS -stdlib=libc++ -nostdlib++)
list(APPEND BOOST_LINK_FLAGS -static-libgcc -lc++ -lc++abi)
list(APPEND BOOST_LINK_FLAGS -lc++ -lc++abi)
if (NOT APPLE)
list(APPEND BOOST_LINK_FLAGS -static-libgcc)
endif()
endif()
# Update the user-config.jam
@ -46,9 +49,9 @@ function(compile_boost)
foreach(flag IN LISTS BOOST_COMPILER_FLAGS COMPILE_BOOST_CXXFLAGS)
string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<cxxflags>${flag} ")
endforeach()
#foreach(flag IN LISTS BOOST_LINK_FLAGS COMPILE_BOOST_LDFLAGS)
# string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<linkflags>${flag} ")
#endforeach()
foreach(flag IN LISTS BOOST_LINK_FLAGS COMPILE_BOOST_LDFLAGS)
string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<linkflags>${flag} ")
endforeach()
configure_file(${CMAKE_SOURCE_DIR}/cmake/user-config.jam.cmake ${CMAKE_BINARY_DIR}/user-config.jam)
set(USER_CONFIG_FLAG --user-config=${CMAKE_BINARY_DIR}/user-config.jam)
@ -92,10 +95,10 @@ if(USE_SANITIZER)
endif()
message(STATUS "A sanitizer is enabled, need to build boost from source")
if (USE_VALGRIND)
compile_boost(TARGET boost_asan BUILD_ARGS valgrind=on
compile_boost(TARGET boost_target BUILD_ARGS valgrind=on
CXXFLAGS ${SANITIZER_COMPILE_OPTIONS} LDFLAGS ${SANITIZER_LINK_OPTIONS})
else()
compile_boost(TARGET boost_asan BUILD_ARGS context-impl=ucontext
compile_boost(TARGET boost_target BUILD_ARGS context-impl=ucontext
CXXFLAGS ${SANITIZER_COMPILE_OPTIONS} LDFLAGS ${SANITIZER_LINK_OPTIONS})
endif()
return()


@ -0,0 +1,227 @@
# Load Balancing in FoundationDB
## Introduction
FoundationDB is a distributed key-value database. A FoundationDB cluster consists of one or more processes running on one or more physical machines, where each process is a *worker* that takes on certain *role*s in the system, such as coordinator, proxy, TLog, or storage server.
The interprocess communication (IPC) between the processes is supported by the [`flow`](https://github.com/apple/foundationdb/tree/main/flow) infrastructure. In the `flow` context, each process exposes one or more *interface*s. Each interface accepts a given type of *request* and *replies* with `Void`, the requested data, or an error. The interfaces and their corresponding request/reply pairs form the IPC protocol of FoundationDB.
In many cases, the same request can be processed by multiple processes, e.g. all commit proxies can accept commit requests, and multiple storage server processes can provide values for a given key in double/triple redundancy mode. A load balancer (LB) can be used to distribute the requests over the possible interfaces, preventing one or a few processes from becoming overloaded. The interface candidates are also referred to as *alternative*s. The LB is also able to react when one or more interfaces are (temporarily) unavailable, by retrying or by re-routing the request to other candidates.
Two LBs are provided in FoundationDB: `basicLoadBalance` and `loadBalance`, both defined in [`LoadBalance.actor.h`](https://github.com/apple/foundationdb/blob/main/fdbrpc/include/fdbrpc/LoadBalance.actor.h). `basicLoadBalance` is a simple load balancer in which each interface is equally likely to be chosen, while `loadBalance` accepts a model object that provides [datacenter](https://apple.github.io/foundationdb/configuration.html#configuring-regions) (DC) aware balancing algorithms, allowing requests to be sent preferentially to interfaces in the same DC.
In the following sections, the two LBs are discussed in detail.
## `basicLoadBalance`
`basicLoadBalance` implements a simple load balancing algorithm. It applies to
* Commit proxy interface
* GetReadVersion proxy interface
* ConfigFollower interface
Here, the interfaces are assumed to be always *fresh*, i.e. the list of servers is fixed.
```mermaid
graph LR
H0{Has alternatives?}
H1[Pick an alternative]
H2[Backoff]
H3[Request]
H4([Reply])
H5([Error])
H6([Never])
H((Start)) --> H0
H0 --No--> H6
H0 --Yes--> H1
H1 --No healthy alternatives--> H2 --Retry--> H1
H1 --Has alternative--> H3 --Success--> H4
H3 --Exception--> H5
H3 --Broken Promise --> H2
```
### Alternative pick algorithm
In `basicLoadBalance`, a *best* alternative is picked and used at the beginning. At this stage, the best alternative is chosen randomly among all alternatives. If the best alternative does not work, the load balancer iteratively tries the other interfaces; see [here](#picking-an-alternative-in-basic-load-balancing-algorithm).
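As a rough illustration of the flow diagram above, the following sketch condenses the loop into plain Python. The `healthy` flag, the `make_request` callable, and the `BrokenPromise` stand-in are all hypothetical; the real actor operates on `flow` futures and interfaces in `LoadBalance.actor.h`.
```python
import random
import time


class BrokenPromise(Exception):
    """Stand-in for flow's broken_promise: the interface disappeared mid-request."""


def basic_load_balance(alternatives, make_request, max_backoff=1.0):
    # "Never": with no alternatives the real actor never returns; we just bail out.
    if not alternatives:
        return None
    next_alt = random.randrange(len(alternatives))  # the randomly chosen *best* alternative
    backoff = 0.01
    while True:
        alt = alternatives[next_alt]
        if alt["healthy"]:
            try:
                return make_request(alt)  # "Reply": hand the response to the caller
            except BrokenPromise:
                pass  # fall through to the backoff and try another alternative
        # No usable alternative right now: back off, then retry with the next one.
        time.sleep(backoff)
        backoff = min(backoff * 2.0, max_backoff)
        next_alt = (next_alt + 1) % len(alternatives)


alts = [{"name": "proxy-0", "healthy": False}, {"name": "proxy-1", "healthy": True}]
print(basic_load_balance(alts, lambda alt: "reply from " + alt["name"]))
```
Any other exception raised by the request propagates to the caller, matching the "Error" terminal in the diagram.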
## `loadBalance`
`loadBalance` provides a more sophisticated implementation of load balancing. In addition to the basic load balancing, it provides a variety of features:
* Support for Test Storage Servers ([TSS](https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/tss.rst))
* Datacenter-aware alternative selection
* Recording the latency and penalty from interfaces, and [prioritizing the interfaces based on previously stored data](#with-queuemodel)
* Handling timeouts and storage server exceptions with retries
Currently it is used for
* Storage Server interface
* BlobWorker interface
```mermaid
graph LR
H((Start))
H0{Has alternatives?}
H1[Choose initial candidates]
H4([Never])
H5[pick an alternative]
H6[Send request]
H7[Wait for available alternative]
H8([Response])
H9([All alternatives failed])
H --> H0 --No--> H4
H0 --Yes--> H1
H1 --> H5
H5 --Has alternative--> H6
H5 --No alternative-->H7
H6 --Success--> H8
H6 --Failure--> H5
H7 --At least one alternative--> H5
H7 --> H9
```
Note:
* The response could be either a reply or an `Error`, e.g. `process_behind` or `request_maybe_delivered`.
### Choose initial candidates
Two initial candidates are picked before the requests start; they are used as the first two alternatives for the load balancer. If both of them fail, the other alternatives are used in a round-robin way.
#### No `QueueModel`
If no `QueueModel` is provided, the initial candidates are picked randomly. The first candidate, or the *best* alternative, will be one in the same DC when possible.
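A minimal sketch of this selection, assuming each alternative carries a hypothetical `dc_id` field (the actual selection logic lives in `LoadBalance.actor.h`):
```python
import random


def choose_initial_candidates(alternatives, local_dc):
    """Pick two distinct starting indices, preferring a same-DC alternative as *best*."""
    local = [i for i, alt in enumerate(alternatives) if alt["dc_id"] == local_dc]
    best = random.choice(local) if local else random.randrange(len(alternatives))
    # The second candidate is any other alternative, chosen uniformly.
    second = random.randrange(len(alternatives) - 1)
    if second >= best:
        second += 1
    return best, second


alts = [{"dc_id": "dc1"}, {"dc_id": "dc2"}, {"dc_id": "dc1"}]
print(choose_initial_candidates(alts, local_dc="dc1"))
```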
#### With `QueueModel`
`QueueModel` holds information about each candidate, including future version errors, latency, and penalty:
* If a storage server is returning future version errors, it is marked as unavailable until a certain time.
* Penalty is reported by the storage server in each response (see `storageserver.actor.cpp:StorageServer::getPenalty`). It is determined by the write queue length and the durability lag.
If a `QueueModel` exists, the candidates are picked based on the penalty. Workers with high penalties are avoided when picking the first two candidates.
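The sketch below shows one way the first two candidates could be ordered by penalty while skipping servers still in a future-version backoff. The dictionary shape of the `QueueModel` and the penalty numbers are made up for illustration; this is not the exact ordering logic in `LoadBalance.actor.h`.
```python
import time


def pick_initial_candidates(queue_model, now=None):
    """Prefer low-penalty servers; skip ones marked unavailable after
    returning future_version errors."""
    now = time.time() if now is None else now
    usable = [server for server, info in queue_model.items()
              if info["unavailable_until"] <= now]
    usable.sort(key=lambda server: queue_model[server]["penalty"])
    return usable[:2]  # the first two alternatives for the load balancer


queue_model = {
    "ss-0": {"penalty": 2.5, "unavailable_until": 0},  # long write queue
    "ss-1": {"penalty": 1.0, "unavailable_until": 0},
    "ss-2": {"penalty": 1.0, "unavailable_until": time.time() + 60},  # future_version backoff
}
print(pick_initial_candidates(queue_model))  # -> ['ss-1', 'ss-0']
```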
### Pick an alternative
The alternatives are chosen in a round-robin way when the first two candidates fail. If all alternatives fail, a flag is set, so that if the next request fails with `process_behind`, the caller receives the `process_behind` error.
### Send requests to workers
Here it is assumed that at least one alternative is available. If no alternative is available, the LB waits.
```mermaid
graph LR
H((start))
H0{Is first request}
H1[Send first request]
H2([Response])
H3[Pick up next alternative]
H4[Send additional request]
H --> H3
H3 -->H0
H0 --Yes--> H1
H1 --Success--> H2
H1 --Timeout--> H3
H0 --No--> H4
H4 --First request succeed--> H2
H4 --Second request succeed--> H2
H4 --Additional request failed--> H3
```
The first request has a timeout option. If the LB is not able to retrieve the response within the timeout, additional requests are sent to the secondary and other available interfaces. If the first request fails, it is reset and the next request is considered the first request. Certain types of errors can also be returned as a response, e.g. `request_maybe_delivered` or `process_behind`, which may not trigger a load-balancer retry.
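A toy `asyncio` version of this hedging behavior, assuming a simple `request(alternative)` coroutine (error handling and the first-request reset described above are elided):
```python
import asyncio


async def load_balanced_get(alternatives, request, first_request_timeout=0.25):
    """Send to the first alternative; on timeout, hedge with additional
    requests and take whichever reply arrives first."""
    pending = {asyncio.create_task(request(alternatives[0]))}
    done, pending = await asyncio.wait(pending, timeout=first_request_timeout)
    if done:
        return done.pop().result()
    # Timed out: send additional requests to the remaining alternatives. The
    # slow first request is kept running, since any reply is acceptable.
    for alt in alternatives[1:]:
        pending.add(asyncio.create_task(request(alt)))
    done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
    for task in pending:
        task.cancel()
    return done.pop().result()


async def demo():
    async def request(alt):
        await asyncio.sleep(1.0 if alt == "ss-0" else 0.05)
        return "reply from " + alt

    print(await load_balanced_get(["ss-0", "ss-1"], request))


asyncio.run(demo())
```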
### Wait for available alternative
When no alternatives are available, the load balancer may wait until at least one interface is up.
```mermaid
graph LR
H0((start))
H1{Is first request in-flight}
H2[Wait for the first request]
H3([Response])
H4([Retry])
H5[Wait for alternatives]
H6([all_alternatives_failed])
H0 --> H1
H1 --Yes--> H2
H1 --No--> H5
H5 --Timeout-->H6
H5 --Success-->H4
H2 --Success-->H3
H2 --Failed-->H4
```
Note that "Wait for alternatives" will only timeout if the alternatives are always not fresh, i.e. this only happens when accessing storage servers. LB will throw `all_alternatives_failed` when timeout in this case.
#### Requests
Original requests in `loadBalance` are wrapped by `LoadBalance.actor.h:RequestData`. It provides the following additional operations on top of the original `flow` request:
* TSS support, if a `QueueModel` is available
* Translating some errors into `request_maybe_delivered` or `process_behind`, or into retries
* Updating the `QueueModel` information, including latency, penalty, etc.
## Appendix
### Picking an alternative in basic load balancing algorithm
The following script simulates the alternative-picking algorithm. The chosen alternatives are printed one by one. The `loadBalance` function uses a similar approach, though interfaces in the same DC are tried first.
```python
#! /usr/bin/env python3

import random
import time


class Alternatives:

    def __init__(self, num_alternatives):
        self._size = num_alternatives

    def size(self):
        return self._size

    def get_best(self):
        return random.randint(0, self._size - 1)


# Entry
NUM_ALTERNATIVES = 10
alts = Alternatives(NUM_ALTERNATIVES)

best_alt = alts.get_best()
next_alt = random.randint(0, alts.size() - 2)
if next_alt >= best_alt:
    next_alt += 1
start_alt = next_alt
start_distance = (best_alt + alts.size() - start_alt) % alts.size()

use_alt = None

print("best_alt = {}".format(best_alt))
print("start_alt = {}".format(start_alt))
print("start_distance = {}".format(start_distance))

while True:
    for alt_num in range(0, alts.size()):
        use_alt = next_alt
        if next_alt == start_alt:
            print("  Going back to the start_alt")
            use_alt = best_alt
        elif (next_alt + alts.size() - start_alt) % alts.size() <= start_distance:
            print("  Entering start_distance")
            use_alt = (next_alt + alts.size() - 1) % alts.size()
        print("Attempting alt: {}".format(use_alt))

        # Next loop
        next_alt = (next_alt + 1) % alts.size()
        time.sleep(.2)
```



@ -85,7 +85,7 @@ Values must always be encoded according to the :ref:`api-python-tuple-layer`.
const KeyRef myGlobalConfigKey = LiteralStringRef("config/key");
// When you want to set the value..
Tuple value = Tuple().appendDouble(1.5);
Tuple value = Tuple::makeTuple((double)1.5);
FDBTransaction* tr = ...;
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);


@ -97,8 +97,8 @@ ACTOR Future<bool> profileCommandActor(Database db,
}
}
Tuple rate = Tuple().appendDouble(sampleRate);
Tuple size = Tuple().append(sizeLimit);
Tuple rate = Tuple::makeTuple(sampleRate);
Tuple size = Tuple::makeTuple(sizeLimit);
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());


@ -1216,6 +1216,8 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
std::unordered_set<std::string> usedKeys;
Standalone<GranuleSnapshot> data;
int totalDataBytes = 0;
const int maxKeyGenAttempts = 1000;
int nAttempts = 0;
while (totalDataBytes < targetDataBytes) {
int keySize = deterministicRandom()->randomInt(targetKeyLength / 2, targetKeyLength * 3 / 2);
keySize = std::min(keySize, uidSize);
@ -1232,6 +1234,13 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
data.push_back_deep(data.arena(), KeyValueRef(KeyRef(key), ValueRef(value)));
totalDataBytes += key.size() + value.size();
nAttempts = 0;
} else if (nAttempts > maxKeyGenAttempts) {
// KeySpace exhausted, avoid infinite loop
break;
} else {
// Keep exploring the KeySpace
nAttempts++;
}
}


@ -71,13 +71,6 @@ if(WITH_AWS_BACKUP)
include(awssdk)
endif()
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
else()
message(STATUS "ZLIB package not found")
endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h)


@ -144,10 +144,7 @@ std::string configDBTypeToString(ConfigDBType configDBType) {
}
TEST_CASE("/fdbclient/ConfigDB/ConfigKey/EncodeDecode") {
Tuple tuple;
tuple << "class-A"_sr
<< "test_long"_sr;
auto packed = tuple.pack();
auto packed = Tuple::makeTuple("class-A"_sr, "test_long"_sr).pack();
auto unpacked = ConfigKeyRef::decodeKey(packed);
ASSERT(unpacked.configClass.get() == "class-A"_sr);
ASSERT(unpacked.knobName == "test_long"_sr);
@ -169,18 +166,8 @@ void decodeFailureTest(KeyRef key) {
} // namespace
TEST_CASE("/fdbclient/ConfigDB/ConfigKey/DecodeFailure") {
{
Tuple tuple;
tuple << "s1"_sr
<< "s2"_sr
<< "s3"_sr;
decodeFailureTest(tuple.pack());
}
{
Tuple tuple;
tuple << "s1"_sr << 5;
decodeFailureTest(tuple.pack());
}
decodeFailureTest(Tuple::makeTuple("s1"_sr, "s2"_sr, "s3"_sr).pack());
decodeFailureTest(Tuple::makeTuple("s1"_sr, 5).pack());
decodeFailureTest("non-tuple-key"_sr);
return Void();
}


@ -200,13 +200,7 @@ public:
Version endVersion{ ::invalidVersion }; // not meaningful for range files
Tuple pack() const {
return Tuple()
.append(version)
.append(StringRef(fileName))
.append(isRange)
.append(fileSize)
.append(blockSize)
.append(endVersion);
return Tuple::makeTuple(version, fileName, (int)isRange, fileSize, blockSize, endVersion);
}
static RestoreFile unpack(Tuple const& t) {
RestoreFile r;


@ -183,13 +183,13 @@ ACTOR Future<Void> GlobalConfig::migrate(GlobalConfig* self) {
if (sampleRate.present()) {
const double sampleRateDbl =
BinaryReader::fromStringRef<double>(sampleRate.get().contents(), Unversioned());
Tuple rate = Tuple().appendDouble(sampleRateDbl);
Tuple rate = Tuple::makeTuple(sampleRateDbl);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
}
if (sizeLimit.present()) {
const int64_t sizeLimitInt =
BinaryReader::fromStringRef<int64_t>(sizeLimit.get().contents(), Unversioned());
Tuple size = Tuple().append(sizeLimitInt);
Tuple size = Tuple::makeTuple(sizeLimitInt);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
}


@ -1132,10 +1132,9 @@ ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
for (const DetailedTSSMismatch& d : data.second) {
// <tssid, time, mismatchid> -> mismatch data
tssMismatchDB.set(
tr,
Tuple().append(data.first.toString()).append(d.timestamp).append(d.mismatchId.toString()),
d.traceString);
tssMismatchDB.set(tr,
Tuple::makeTuple(data.first.toString(), d.timestamp, d.mismatchId.toString()),
d.traceString);
}
wait(tr->commit());


@ -916,7 +916,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 );
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
@ -928,6 +928,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT, 1.5 );
init( BLOB_MANAGER_CONCURRENT_MERGE_CHECKS, 64 ); if( randomize && BUGGIFY ) BLOB_MANAGER_CONCURRENT_MERGE_CHECKS = 1 << deterministicRandom()->randomInt(0, 7);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );


@ -679,13 +679,14 @@ Future<RangeResult> ConflictingKeysImpl::getRange(ReadYourWritesTransaction* ryw
if (ryw->getTransactionState()->conflictingKeys) {
auto krMapPtr = ryw->getTransactionState()->conflictingKeys.get();
auto beginIter = krMapPtr->rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin)
++beginIter;
auto endIter = krMapPtr->rangeContaining(kr.end);
if (!kr.contains(beginIter->begin()) && beginIter != endIter)
++beginIter;
for (auto it = beginIter; it != endIter; ++it) {
result.push_back_deep(result.arena(), KeyValueRef(it->begin(), it->value()));
}
if (endIter->begin() != kr.end)
if (kr.contains(endIter->begin()))
result.push_back_deep(result.arena(), KeyValueRef(endIter->begin(), endIter->value()));
}
return result;
@ -2005,7 +2006,7 @@ Future<Optional<std::string>> ClientProfilingImpl::commit(ReadYourWritesTransact
} else {
try {
double sampleRate = boost::lexical_cast<double>(sampleRateStr);
Tuple rate = Tuple().appendDouble(sampleRate);
Tuple rate = Tuple::makeTuple(sampleRate);
insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSampleRate, rate.pack()));
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
@ -2024,7 +2025,7 @@ Future<Optional<std::string>> ClientProfilingImpl::commit(ReadYourWritesTransact
} else {
try {
int64_t sizeLimit = boost::lexical_cast<int64_t>(sizeLimitStr);
Tuple size = Tuple().append(sizeLimit);
Tuple size = Tuple::makeTuple(sizeLimit);
insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSizeLimit, size.pack()));
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
@ -2731,3 +2732,85 @@ Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr
// exclude locality with failed option as true.
return excludeLocalityCommitActor(ryw, true);
}
ACTOR Future<Void> validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse,
RangeResult result) {
if (!result.size()) {
RangeResult testResult = wait(ryw->getRange(begin, end, limits, Snapshot::True, reverse));
ASSERT(testResult == result);
return Void();
}
if (reverse) {
ASSERT(std::is_sorted(result.begin(), result.end(), KeyValueRef::OrderByKeyBack{}));
} else {
ASSERT(std::is_sorted(result.begin(), result.end(), KeyValueRef::OrderByKey{}));
}
// Generate a keyrange where we can determine the expected result based solely on the previous readrange, and whose
// boundaries may or may not be keys in result.
std::vector<Key> candidateKeys;
if (reverse) {
for (int i = result.size() - 1; i >= 0; --i) {
candidateKeys.emplace_back(result[i].key);
if (i - 1 >= 0) {
candidateKeys.emplace_back(keyBetween(KeyRangeRef(result[i].key, result[i - 1].key)));
}
}
} else {
for (int i = 0; i < result.size(); ++i) {
candidateKeys.emplace_back(result[i].key);
if (i + 1 < result.size()) {
candidateKeys.emplace_back(keyBetween(KeyRangeRef(result[i].key, result[i + 1].key)));
}
}
}
std::sort(candidateKeys.begin(), candidateKeys.end());
int originalSize = candidateKeys.size();
// Add more candidate keys so that we might read a range between two adjacent result keys.
for (int i = 0; i < originalSize - 1; ++i) {
candidateKeys.emplace_back(keyBetween(KeyRangeRef(candidateKeys[i], candidateKeys[i + 1])));
}
std::vector<Key> keys;
keys = { deterministicRandom()->randomChoice(candidateKeys), deterministicRandom()->randomChoice(candidateKeys) };
std::sort(keys.begin(), keys.end());
state KeySelector testBegin = firstGreaterOrEqual(keys[0]);
state KeySelector testEnd = firstGreaterOrEqual(keys[1]);
// Generate expected result. Linear time is ok here since we're in simulation, and there's a benefit to keeping this
// simple (as we're using it as an test oracle)
state RangeResult expectedResult;
// The reverse parameter should be the same as for the original read, so if
// reverse is true then the results are _already_ in reverse order.
for (const auto& kr : result) {
if (kr.key >= keys[0] && kr.key < keys[1]) {
expectedResult.push_back(expectedResult.arena(), kr);
}
}
// Test
RangeResult testResult = wait(ryw->getRange(testBegin, testEnd, limits, Snapshot::True, reverse));
if (testResult != expectedResult) {
fmt::print("Reverse: {}\n", reverse);
fmt::print("Original range: [{}, {})\n", begin.toString(), end.toString());
fmt::print("Original result:\n");
for (const auto& kr : result) {
fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable());
}
fmt::print("Test range: [{}, {})\n", testBegin.getKey().printable(), testEnd.getKey().printable());
fmt::print("Expected:\n");
for (const auto& kr : expectedResult) {
fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable());
}
fmt::print("Got:\n");
for (const auto& kr : testResult) {
fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable());
}
ASSERT(testResult == expectedResult);
}
return Void();
}


@ -1524,9 +1524,9 @@ std::pair<BlobGranuleSplitState, Version> decodeBlobGranuleSplitValue(const Valu
const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size());
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size() - 1);
ASSERT(parentGranuleIDs.size() == parentGranuleStartVersions.size());
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
@ -1536,12 +1536,12 @@ const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange,
wr << parentGranuleStartVersions;
return addVersionStampAtEnd(wr.toValue());
}
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<KeyRange>, std::vector<Version>>
decodeBlobGranuleMergeValue(ValueRef const& value) {
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<Key>, std::vector<Version>> decodeBlobGranuleMergeValue(
ValueRef const& value) {
KeyRange range;
Version v;
std::vector<UID> parentGranuleIDs;
std::vector<KeyRange> parentGranuleRanges;
std::vector<Key> parentGranuleRanges;
std::vector<Version> parentGranuleStartVersions;
BinaryReader reader(value, IncludeVersion());
@ -1551,7 +1551,7 @@ decodeBlobGranuleMergeValue(ValueRef const& value) {
reader >> parentGranuleStartVersions;
reader >> v;
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size());
ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size() - 1);
ASSERT(parentGranuleIDs.size() == parentGranuleStartVersions.size());
ASSERT(bigEndian64(v) >= 0);
@ -1581,6 +1581,8 @@ const KeyRange blobGranuleHistoryKeyRangeFor(KeyRangeRef const& range) {
}
const Value blobGranuleHistoryValueFor(Standalone<BlobGranuleHistoryValue> const& historyValue) {
ASSERT(historyValue.parentVersions.empty() ||
historyValue.parentBoundaries.size() - 1 == historyValue.parentVersions.size());
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << historyValue;
return wr.toValue();
@ -1590,6 +1592,8 @@ Standalone<BlobGranuleHistoryValue> decodeBlobGranuleHistoryValue(const ValueRef
Standalone<BlobGranuleHistoryValue> historyValue;
BinaryReader reader(value, IncludeVersion());
reader >> historyValue;
ASSERT(historyValue.parentVersions.empty() ||
historyValue.parentBoundaries.size() - 1 == historyValue.parentVersions.size());
return historyValue;
}


@ -141,12 +141,7 @@ bool ThrottleApi::TagQuotaValue::isValid() const {
}
Value ThrottleApi::TagQuotaValue::toValue() const {
Tuple tuple;
tuple.appendDouble(reservedReadQuota);
tuple.appendDouble(totalReadQuota);
tuple.appendDouble(reservedWriteQuota);
tuple.appendDouble(totalWriteQuota);
return tuple.pack();
return Tuple::makeTuple(reservedReadQuota, totalReadQuota, reservedWriteQuota, totalWriteQuota).pack();
}
ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {


@ -651,10 +651,7 @@ public:
Reference<Task> task) {
taskBucket->setOptions(tr);
Tuple t;
t.append(task->timeoutVersion);
t.append(task->key);
Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key);
RangeResult values = wait(tr->getRange(taskBucket->timeouts.range(t), 1));
if (values.size() > 0)
return false;
@ -996,9 +993,7 @@ Future<bool> TaskBucket::isEmpty(Reference<ReadYourWritesTransaction> tr) {
Future<Void> TaskBucket::finish(Reference<ReadYourWritesTransaction> tr, Reference<Task> task) {
setOptions(tr);
Tuple t;
t.append(task->timeoutVersion);
t.append(task->key);
Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key);
tr->atomicOp(prefix.pack(LiteralStringRef("task_count")),
LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff"),


@ -19,6 +19,7 @@
*/
#include "fdbclient/Tuple.h"
#include "flow/UnitTest.h"
const uint8_t VERSIONSTAMP_96_CODE = 0x33;
@ -103,7 +104,7 @@ Tuple& Tuple::append(Tuple const& tuple) {
return *this;
}
Tuple& Tuple::appendVersionstamp(Versionstamp const& vs) {
Tuple& Tuple::append(Versionstamp const& vs) {
offsets.push_back(data.size());
data.push_back(data.arena(), VERSIONSTAMP_96_CODE);
@ -134,6 +135,10 @@ Tuple& Tuple::append(StringRef const& str, bool utf8) {
return *this;
}
Tuple& Tuple::append(UnicodeStr const& str) {
return append(str.str, true);
}
Tuple& Tuple::appendRaw(StringRef const& str) {
offsets.push_back(data.size());
@ -166,7 +171,11 @@ Tuple& Tuple::append(int64_t value) {
return *this;
}
Tuple& Tuple::appendBool(bool value) {
Tuple& Tuple::append(int32_t value) {
return append((int64_t)value);
}
Tuple& Tuple::append(bool value) {
offsets.push_back(data.size());
if (value) {
data.push_back(data.arena(), 0x27);
@ -176,7 +185,7 @@ Tuple& Tuple::appendBool(bool value) {
return *this;
}
Tuple& Tuple::appendFloat(float value) {
Tuple& Tuple::append(float value) {
offsets.push_back(data.size());
float swap = bigEndianFloat(value);
uint8_t* bytes = (uint8_t*)&swap;
@ -187,7 +196,7 @@ Tuple& Tuple::appendFloat(float value) {
return *this;
}
Tuple& Tuple::appendDouble(double value) {
Tuple& Tuple::append(double value) {
offsets.push_back(data.size());
double swap = value;
swap = bigEndianDouble(swap);
@ -199,12 +208,16 @@ Tuple& Tuple::appendDouble(double value) {
return *this;
}
Tuple& Tuple::appendNull() {
Tuple& Tuple::append(nullptr_t) {
offsets.push_back(data.size());
data.push_back(data.arena(), (uint8_t)'\x00');
return *this;
}
Tuple& Tuple::appendNull() {
return append(nullptr);
}
Tuple::ElementType Tuple::getType(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();
@ -426,3 +439,30 @@ StringRef Tuple::subTupleRawString(size_t index) const {
size_t endPos = end < offsets.size() ? offsets[end] : data.size();
return StringRef(data.begin() + offsets[index], endPos - offsets[index]);
}
TEST_CASE("fdbclient/Tuple/makeTuple") {
Tuple t1 = Tuple::makeTuple(
1, 1.0f, 1.0, false, "byteStr"_sr, Tuple::UnicodeStr("str"_sr), nullptr, Versionstamp("000000000000"_sr));
Tuple t2 = Tuple()
.append(1)
.append(1.0f)
.append(1.0)
.append(false)
.append("byteStr"_sr)
.append(Tuple::UnicodeStr("str"_sr))
.append(nullptr)
.append(Versionstamp("000000000000"_sr));
ASSERT(t1.pack() == t2.pack());
ASSERT(t1.getType(0) == Tuple::INT);
ASSERT(t1.getType(1) == Tuple::FLOAT);
ASSERT(t1.getType(2) == Tuple::DOUBLE);
ASSERT(t1.getType(3) == Tuple::BOOL);
ASSERT(t1.getType(4) == Tuple::BYTES);
ASSERT(t1.getType(5) == Tuple::UTF8);
ASSERT(t1.getType(6) == Tuple::NULL_TYPE);
ASSERT(t1.getType(7) == Tuple::VERSIONSTAMP);
ASSERT(t1.size() == 8);
return Void();
}

View File

@ -333,7 +333,10 @@ Future<Void> BackupContainerAzureBlobStore::create() {
TraceEvent(SevDebug, "BCAzureBlobStoreCreateContainer").detail("ContainerName", containerName);
Future<Void> createContainerFuture =
asyncTaskThread.execAsync([containerName = this->containerName, client = this->client] {
waitAzureFuture(client->create_container(containerName), "create_container");
auto outcome = client->get_container_properties(containerName).get();
if (!outcome.success()) {
waitAzureFuture(client->create_container(containerName), "create_container");
}
return Void();
});
Future<Void> encryptionSetupFuture = usesEncryption() ? encryptionSetupComplete() : Void();


@ -361,7 +361,7 @@ public:
template <>
inline Standalone<StringRef> TupleCodec<FileBackupAgent::ERestoreState>::pack(
FileBackupAgent::ERestoreState const& val) {
return Tuple().append(val).pack();
return Tuple::makeTuple(static_cast<int>(val)).pack();
}
template <>
inline FileBackupAgent::ERestoreState TupleCodec<FileBackupAgent::ERestoreState>::unpack(
@ -578,7 +578,7 @@ ACTOR Future<Void> cleanupBackup(Database cx, DeleteData deleteData);
using EBackupState = BackupAgentBase::EnumState;
template <>
inline Standalone<StringRef> TupleCodec<EBackupState>::pack(EBackupState const& val) {
return Tuple().append(static_cast<int>(val)).pack();
return Tuple::makeTuple(static_cast<int>(val)).pack();
}
template <>
inline EBackupState TupleCodec<EBackupState>::unpack(Standalone<StringRef> const& val) {
@ -727,8 +727,7 @@ protected:
template <>
inline Standalone<StringRef> TupleCodec<Reference<IBackupContainer>>::pack(Reference<IBackupContainer> const& bc) {
Tuple tuple;
tuple.append(StringRef(bc->getURL()));
Tuple tuple = Tuple::makeTuple(bc->getURL());
if (bc->getEncryptionKeyFileName().present()) {
tuple.append(bc->getEncryptionKeyFileName().get());
@ -775,9 +774,7 @@ public:
Version version;
std::string fileName;
int64_t fileSize;
Tuple pack() const {
return Tuple().append(begin).append(version).append(StringRef(fileName)).append(fileSize);
}
Tuple pack() const { return Tuple::makeTuple(begin, version, fileName, fileSize); }
static RangeSlice unpack(Tuple const& t) {
RangeSlice r;
int i = 0;


@ -244,11 +244,13 @@ enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done =
struct BlobGranuleHistoryValue {
constexpr static FileIdentifier file_identifier = 991434;
UID granuleID;
VectorRef<std::pair<KeyRangeRef, Version>> parentGranules;
// VectorRef<std::pair<KeyRangeRef, Version>> parentGranules;
VectorRef<KeyRef> parentBoundaries;
VectorRef<Version> parentVersions;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, granuleID, parentGranules);
serializer(ar, granuleID, parentBoundaries, parentVersions);
}
};


@ -134,7 +134,7 @@ private:
if (!candidateValueFuture.get().present()) {
tr->addWriteConflictRange(singleKeyRange(self->recent.get(candidate).key()));
return Tuple().append(candidate).pack();
return Tuple::makeTuple(candidate).pack();
}
}
}


@ -59,7 +59,7 @@ inline Tuple TupleCodec<Tuple>::unpack(Standalone<StringRef> const& val) {
template <>
inline Standalone<StringRef> TupleCodec<int64_t>::pack(int64_t const& val) {
return Tuple().append(val).pack();
return Tuple::makeTuple(val).pack();
}
template <>
inline int64_t TupleCodec<int64_t>::unpack(Standalone<StringRef> const& val) {
@ -68,7 +68,7 @@ inline int64_t TupleCodec<int64_t>::unpack(Standalone<StringRef> const& val) {
template <>
inline Standalone<StringRef> TupleCodec<bool>::pack(bool const& val) {
return Tuple().append(val ? 1 : 0).pack();
return Tuple::makeTuple(val ? 1 : 0).pack();
}
template <>
inline bool TupleCodec<bool>::unpack(Standalone<StringRef> const& val) {
@ -77,7 +77,7 @@ inline bool TupleCodec<bool>::unpack(Standalone<StringRef> const& val) {
template <>
inline Standalone<StringRef> TupleCodec<Standalone<StringRef>>::pack(Standalone<StringRef> const& val) {
return Tuple().append(val).pack();
return Tuple::makeTuple(val).pack();
}
template <>
inline Standalone<StringRef> TupleCodec<Standalone<StringRef>>::unpack(Standalone<StringRef> const& val) {
@ -96,7 +96,7 @@ inline UID TupleCodec<UID>::unpack(Standalone<StringRef> const& val) {
// This is backward compatible with TupleCodec<Standalone<StringRef>>
template <>
inline Standalone<StringRef> TupleCodec<std::string>::pack(std::string const& val) {
return Tuple().append(StringRef(val)).pack();
return Tuple::makeTuple(val).pack();
}
template <>
inline std::string TupleCodec<std::string>::unpack(Standalone<StringRef> const& val) {
@ -143,7 +143,7 @@ struct TupleCodec<std::vector<T>> {
template <>
inline Standalone<StringRef> TupleCodec<KeyRange>::pack(KeyRange const& val) {
return Tuple().append(val.begin).append(val.end).pack();
return Tuple::makeTuple(val.begin, val.end).pack();
}
template <>
inline KeyRange TupleCodec<KeyRange>::unpack(Standalone<StringRef> const& val) {


@ -892,6 +892,7 @@ public:
int BG_CONSISTENCY_CHECK_TARGET_SPEED_KB;
bool BG_ENABLE_MERGING;
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
@ -902,6 +903,7 @@ public:
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT;
int BLOB_MANAGER_CONCURRENT_MERGE_CHECKS;
double BGCC_TIMEOUT;
double BGCC_MIN_INTERVAL;


@ -539,5 +539,15 @@ public:
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
// If the underlying set of key-value pairs of a key space is not changing, then we expect repeating a read to give the
// same result. Additionally, we can generate the expected result of any read if that read is reading a subrange. This
// actor performs a read of an arbitrary subrange of [begin, end) and validates the results.
ACTOR Future<Void> validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse,
RangeResult result);
#include "flow/unactorcompiler.h"
#endif


@ -605,7 +605,6 @@ struct StorageMetrics {
int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s)
int64_t iosPerKSecond = 0;
int64_t bytesReadPerKSecond = 0;
Optional<KeyRange> keys; // this metric belongs to which range
static const int64_t infinity = 1LL << 60;


@ -42,9 +42,7 @@ public:
template <class T>
Key pack(T const& item) const {
Tuple t;
t.append(item);
return pack(t);
return pack(Tuple::makeTuple(item));
}
Key pack(StringRef const& item, bool utf8 = false) const {
@ -58,9 +56,7 @@ public:
template <class T>
Subspace get(T const& item) const {
Tuple t;
t.append(item);
return get(t);
return get(Tuple::makeTuple(item));
}
Subspace get(StringRef const& item, bool utf8 = false) const {


@ -658,11 +658,11 @@ std::pair<BlobGranuleSplitState, Version> decodeBlobGranuleSplitValue(ValueRef c
const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions);
// FIXME: probably just define object type for this?
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<KeyRange>, std::vector<Version>>
decodeBlobGranuleMergeValue(ValueRef const& value);
std::tuple<KeyRange, Version, std::vector<UID>, std::vector<Key>, std::vector<Version>> decodeBlobGranuleMergeValue(
ValueRef const& value);
const Key blobGranuleHistoryKeyFor(KeyRangeRef const& range, Version version);
std::pair<KeyRange, Version> decodeBlobGranuleHistoryKey(KeyRef const& key);


@ -28,6 +28,11 @@
#include "fdbclient/Versionstamp.h"
struct Tuple {
struct UnicodeStr {
StringRef str;
explicit UnicodeStr(StringRef str) : str(str) {}
};
Tuple() {}
// Tuple parsing normally does not care of the final value is a numeric type and is incomplete.
@ -41,14 +46,15 @@ struct Tuple {
// the str needs to be a Tuple encoded string.
Tuple& appendRaw(StringRef const& str);
Tuple& append(StringRef const& str, bool utf8 = false);
Tuple& append(UnicodeStr const& str);
Tuple& append(int32_t);
Tuple& append(int64_t);
// There are some ambiguous append calls in fdbclient, so to make it easier
// to add append for floats and doubles, name them differently for now.
Tuple& appendBool(bool);
Tuple& appendFloat(float);
Tuple& appendDouble(double);
Tuple& append(bool);
Tuple& append(float);
Tuple& append(double);
Tuple& append(std::nullptr_t);
Tuple& appendNull();
Tuple& appendVersionstamp(Versionstamp const&);
Tuple& append(Versionstamp const&);
StringRef pack() const { return StringRef(data.begin(), data.size()); }
@ -84,10 +90,15 @@ struct Tuple {
Standalone<VectorRef<uint8_t>> getData() { return data; }
Standalone<StringRef> getDataAsStandalone() { return Standalone<StringRef>(pack(), data.arena()); }
// Create a tuple from a parameter pack
template <class... Types>
static Tuple makeTuple(Types&&... args) {
Tuple t;
(t << ... << args);
// Use a fold expression to append each argument using the << operator.
// https://en.cppreference.com/w/cpp/language/fold
(t << ... << std::forward<Types>(args));
return t;
}


@ -165,4 +165,35 @@ bool compareFDBAndBlob(RangeResult fdb,
}
}
return correct;
}
}
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
// clear key range and check whether it is merged or not, repeatedly
state Transaction tr(cx);
state int reClearCount = 1;
state int reClearInterval = 1; // do quadratic backoff on clear rate, b/c large keys can keep it not write-cold
loop {
try {
Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(range));
if (ranges.size() == 1) {
return Void();
}
CODE_PROBE(true, "ClearAndAwaitMerge doing clear");
reClearCount--;
if (reClearCount <= 0) {
tr.clear(range);
wait(tr.commit());
fmt::print("ClearAndAwaitMerge cleared [{0} - {1}) @ {2}\n",
range.begin.printable(),
range.end.printable(),
tr.getCommittedVersion());
reClearCount = reClearInterval;
reClearInterval++;
}
wait(delay(30.0)); // sleep a bit before checking on merge again
tr.reset();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}


@ -25,6 +25,7 @@
#include <vector>
#include <unordered_map>
#include "fdbrpc/simulator.h"
#include "fmt/format.h"
#include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/BlobGranuleCommon.h"
@ -281,6 +282,43 @@ struct BlobManagerStats {
}
};
enum MergeCandidateState {
MergeCandidateCannotMerge,
MergeCandidateCanMerge,
MergeCandidateUnknown,
MergeCandidateMerging
};
// The current merge algorithm, skipping just granules that will be merge-eligible on the next pass, but not
// their neighbors, is optimal for guaranteeing merges to make progress where possible, with decently
// optimal but not globally optimal merge behavior.
// Alternative algorithms include not doing a two-pass consideration at all and immediately considering
// all merge candidates, which guarantees the most progress but pretty much guarantees undesirably
// suboptimal merge decisions, because of the time variance of granules becoming merge candidates. Or,
// also skipping adjacent eligible granules in addition to the one that will be eligible next pass,
// which ensures optimally large merges in a future pass, but adds decent delay to doing the merge. Or,
// smarter considering of merge candidates adjacent to the one that will be eligible next pass
// (depending on whether potential future merges with adjacent ones could include this candidate), which
// would be the best of both worlds, but would add a decent amount of code complexity.
struct MergeCandidateInfo {
MergeCandidateState st;
UID granuleID;
Version startVersion;
bool mergeNow;
MergeCandidateInfo() : st(MergeCandidateUnknown), startVersion(invalidVersion), mergeNow(false) {}
MergeCandidateInfo(MergeCandidateState st) : st(st), startVersion(invalidVersion), mergeNow(false) {
ASSERT(st != MergeCandidateCanMerge);
}
MergeCandidateInfo(UID granuleID, Version startVersion)
: st(MergeCandidateCanMerge), granuleID(granuleID), startVersion(startVersion), mergeNow(false) {}
bool canMerge() const { return st == MergeCandidateCanMerge; }
bool canMergeNow() const { return st == MergeCandidateCanMerge && mergeNow; }
};
struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
UID id;
Database db;
@ -301,11 +339,13 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
KeyRangeMap<BoundaryEvaluation> boundaryEvaluations;
KeyRangeMap<bool> knownBlobRanges;
BGTenantMap tenantData;
KeyRangeMap<Optional<std::pair<UID, Version>>> mergeCandidates; // granule range to granule id + start version.
KeyRangeMap<MergeCandidateInfo> mergeCandidates; // granule range to granule id + start version.
KeyRangeMap<Version> activeGranuleMerges; // range map of active granule merges, because range in boundaryEval
// doesn't correspond to merge range. invalidVersion is no merge,
// 0 is no merge version determined yet
FlowLock concurrentMergeChecks;
AsyncTrigger startRecruiting;
Debouncer restartRecruiting;
std::set<NetworkAddress> recruitingLocalities; // the addrs of the workers being recruited on
@ -321,9 +361,10 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
BlobManagerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db, Optional<Key> dcId)
: id(id), db(db), dcId(dcId), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &workersById),
knownBlobRanges(false, normalKeys.end), tenantData(BGTenantMap(dbInfo)),
mergeCandidates(Optional<std::pair<UID, Version>>(), normalKeys.end),
activeGranuleMerges(invalidVersion, normalKeys.end), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
recruitingStream(0) {}
mergeCandidates(MergeCandidateInfo(MergeCandidateUnknown), normalKeys.end),
activeGranuleMerges(invalidVersion, normalKeys.end),
concurrentMergeChecks(SERVER_KNOBS->BLOB_MANAGER_CONCURRENT_MERGE_CHECKS),
restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), recruitingStream(0) {}
// only initialize blob store if actually needed
void initBStore() {
@ -347,6 +388,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
}
return false;
}
Version activeMergeVersion(const KeyRangeRef& range) {
auto ranges = activeGranuleMerges.intersectingRanges(range);
Version v = invalidVersion;
@ -355,6 +397,30 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
}
return v;
}
void setMergeCandidate(const KeyRangeRef& range, UID granuleID, Version startVersion) {
// Want this to be idempotent. If a granule was already reported as merge-eligible, we want to use the existing
// merge and mergeNow state.
auto it = mergeCandidates.rangeContaining(range.begin);
if (it->begin() == range.begin && it.end() == range.end) {
if (it->cvalue().st != MergeCandidateCanMerge) {
// same range, just update
it->value() = MergeCandidateInfo(granuleID, startVersion);
} else {
// else no-op, but validate data
ASSERT(granuleID == it->cvalue().granuleID);
ASSERT(startVersion == it->cvalue().startVersion);
}
} else if (it->cvalue().st != MergeCandidateMerging) {
mergeCandidates.insert(range, MergeCandidateInfo(granuleID, startVersion));
}
}
void clearMergeCandidate(const KeyRangeRef& range, MergeCandidateState st) {
ASSERT(st != MergeCandidateCanMerge);
mergeCandidates.insert(range, MergeCandidateInfo(st));
}
};
ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData> bmData,
@ -1273,8 +1339,9 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
Standalone<BlobGranuleHistoryValue> historyValue;
historyValue.granuleID = newGranuleIDs[i];
historyValue.parentGranules.push_back(historyValue.arena(),
std::pair(granuleRange, granuleStartVersion));
historyValue.parentBoundaries.push_back(historyValue.arena(), granuleRange.begin);
historyValue.parentBoundaries.push_back(historyValue.arena(), granuleRange.end);
historyValue.parentVersions.push_back(historyValue.arena(), granuleStartVersion);
tr->set(historyKey, blobGranuleHistoryValueFor(historyValue));
@ -1452,7 +1519,8 @@ ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange
break;
} else {
if (BM_DEBUG) {
fmt::print("ERROR: Error flushing range [{0} - {1}): {2}!\n",
fmt::print("ERROR: BM {0} Error flushing range [{1} - {2}): {3}!\n",
bmData->epoch,
blobGranuleMapping[j].key.printable(),
blobGranuleMapping[j + 1].key.printable(),
e.name());
@ -1491,7 +1559,7 @@ ACTOR Future<Void> forceGranuleFlush(Reference<BlobManagerData> bmData, KeyRange
ACTOR Future<std::pair<UID, Version>> persistMergeGranulesStart(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
state UID mergeGranuleID = deterministicRandom()->randomUniqueID();
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
@ -1549,7 +1617,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
Version mergeVersion,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
// pick worker that has part of old range, it will soon get overridden anyway
@ -1580,13 +1648,14 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
state int parentIdx;
// TODO: could parallelize these
for (parentIdx = 0; parentIdx < parentGranuleIDs.size(); parentIdx++) {
state Key lockKey = blobGranuleLockKeyFor(parentGranuleRanges[parentIdx]);
KeyRange parentRange(KeyRangeRef(parentGranuleRanges[parentIdx], parentGranuleRanges[parentIdx + 1]));
state Key lockKey = blobGranuleLockKeyFor(parentRange);
state Future<Optional<Value>> oldLockFuture = tr->get(lockKey);
wait(updateChangeFeed(tr,
granuleIDToCFKey(parentGranuleIDs[parentIdx]),
ChangeFeedStatus::CHANGE_FEED_DESTROY,
parentGranuleRanges[parentIdx]));
parentRange));
if (BM_DEBUG) {
fmt::print("Granule merge destroying CF {0} ({1})!\n",
parentGranuleIDs[parentIdx].shortString().substr(0, 6),
@ -1615,10 +1684,10 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
Standalone<BlobGranuleHistoryValue> historyValue;
historyValue.granuleID = mergeGranuleID;
for (parentIdx = 0; parentIdx < parentGranuleIDs.size(); parentIdx++) {
historyValue.parentGranules.push_back(
historyValue.arena(),
std::pair(parentGranuleRanges[parentIdx], parentGranuleStartVersions[parentIdx]));
historyValue.parentBoundaries.push_back(historyValue.arena(), parentGranuleRanges[parentIdx]);
historyValue.parentVersions.push_back(historyValue.arena(), parentGranuleStartVersions[parentIdx]);
}
historyValue.parentBoundaries.push_back(historyValue.arena(), parentGranuleRanges.back());
tr->set(historyKey, blobGranuleHistoryValueFor(historyValue));
@ -1646,7 +1715,7 @@ ACTOR Future<Void> finishMergeGranules(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
Version mergeVersion,
std::vector<UID> parentGranuleIDs,
std::vector<KeyRange> parentGranuleRanges,
std::vector<Key> parentGranuleRanges,
std::vector<Version> parentGranuleStartVersions) {
// wait for BM to be fully recovered before starting actual merges
@ -1685,307 +1754,196 @@ ACTOR Future<Void> finishMergeGranules(Reference<BlobManagerData> bmData,
bmData->boundaryEvaluations.insert(mergeRange,
BoundaryEvaluation(bmData->epoch, seqnoForEval, BoundaryEvalType::MERGE, 0, 0));
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
return Void();
}
// Make a decision on whether to merge this granule with surrounding ones.
ACTOR Future<Void> maybeMergeRange(Reference<BlobManagerData> bmData,
UID granuleID,
KeyRange granuleRange,
Version granuleStartVersion) {
state std::deque<std::tuple<UID, KeyRange, Version>> beforeCandidates, afterCandidates;
ACTOR Future<Void> doMerge(Reference<BlobManagerData> bmData,
KeyRange mergeRange,
std::vector<std::tuple<UID, KeyRange, Version>> toMerge) {
// switch to format persist merge wants
state std::vector<UID> ids;
state std::vector<Key> ranges;
state std::vector<Version> startVersions;
for (auto& it : toMerge) {
ids.push_back(std::get<0>(it));
ranges.push_back(std::get<1>(it).begin);
startVersions.push_back(std::get<2>(it));
}
ranges.push_back(std::get<1>(toMerge.back()).end);
try {
std::pair<UID, Version> persistMerge =
wait(persistMergeGranulesStart(bmData, mergeRange, ids, ranges, startVersions));
wait(finishMergeGranules(
bmData, persistMerge.first, mergeRange, persistMerge.second, ids, ranges, startVersions));
return Void();
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled || e.code() == error_code_blob_manager_replaced) {
throw;
}
TraceEvent(SevError, "UnexpectedErrorGranuleMerge").error(e).detail("Range", mergeRange);
throw e;
}
}
// Needs to not be an actor to run synchronously for the race checking.
// Technically this could just be the first part of doMerge, but this guarantees no waits happen for the checks before
// the logic starts
static void attemptStartMerge(Reference<BlobManagerData> bmData,
const std::vector<std::tuple<UID, KeyRange, Version>>& toMerge) {
if (toMerge.size() < 2) {
return;
}
// TODO REMOVE validation eventually
for (int i = 0; i < toMerge.size() - 1; i++) {
ASSERT(std::get<1>(toMerge[i]).end == std::get<1>(toMerge[i + 1]).begin);
}
KeyRange mergeRange(KeyRangeRef(std::get<1>(toMerge.front()).begin, std::get<1>(toMerge.back()).end));
// merge/merge races should not be possible because granuleMergeChecker should only start attemptMerges() for
// disjoint ranges, and merge candidate is not updated if it is already in the state MergeCandidateMerging
ASSERT(!bmData->isMergeActive(mergeRange));
// Check to avoid races where a split eval came in while merge was evaluating. This also effectively checks
// boundaryEvals because they're both updated before maybeSplitRange is called. This handles split/merge races.
auto reCheckMergeCandidates = bmData->mergeCandidates.intersectingRanges(mergeRange);
for (auto it : reCheckMergeCandidates) {
if (!it->cvalue().canMergeNow()) {
CODE_PROBE(true, " granule no longer merge candidate after checking metrics, aborting merge");
return;
}
}
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Start\n",
fmt::print("BM {0} Starting merge of [{1} - {2}) ({3})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable());
mergeRange.begin.printable(),
mergeRange.end.printable(),
toMerge.size());
}
CODE_PROBE(true, "Doing granule merge");
bmData->activeGranuleMerges.insert(mergeRange, 0);
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
// Now, after setting activeGranuleMerges, we have committed to doing the merge, so any subsequent split eval for
// any of the ranges will be ignored. This handles merge/split races.
bmData->addActor.send(doMerge(bmData, mergeRange, toMerge));
}
	// look for candidates to the left
	if (granuleRange.begin != normalKeys.begin) {
		auto rangeBefore = bmData->mergeCandidates.rangeContainingKeyBefore(granuleRange.begin);
		while (rangeBefore.cvalue().present() && beforeCandidates.size() < SERVER_KNOBS->BG_MAX_MERGE_FANIN - 1) {
			// if it is a merge candidate, add it to the list
			beforeCandidates.push_front(
			    std::tuple(rangeBefore.cvalue().get().first, rangeBefore.range(), rangeBefore.cvalue().get().second));
			if (BM_DEBUG) {
				fmt::print("BM {0} maybe merge [{1} - {2}): Before candidate [{3} - {4})\n",
				           bmData->epoch,
				           granuleRange.begin.printable(),
				           granuleRange.end.printable(),
				           rangeBefore.begin().printable(),
				           rangeBefore.end().printable());
			}
			ASSERT(rangeBefore.begin() >= normalKeys.begin);
			if (rangeBefore.begin() == normalKeys.begin) {
				break;
			} else {
				--rangeBefore;
			}
		}
	}
// Greedily merges any consecutive 2+ granules in a row that are mergeable
ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
                                 std::vector<std::tuple<UID, KeyRange, Version>> candidates) {
	ASSERT(candidates.size() >= 2);
	// TODO REMOVE validation eventually
	for (int i = 0; i < candidates.size() - 1; i++) {
		ASSERT(std::get<1>(candidates[i]).end == std::get<1>(candidates[i + 1]).begin);
	}
	CODE_PROBE(true, "Candidate ranges to merge");
	wait(bmData->concurrentMergeChecks.take());
	state FlowLock::Releaser holdingDVL(bmData->concurrentMergeChecks);
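For readers unfamiliar with FlowLock: it acts here as a counting semaphore that bounds how many attemptMerges actors run expensive metrics queries at once, with the Releaser returning the permit on every exit path. A rough standard-C++ analogue of take()/Releaser, illustrative only:

#include <semaphore>

std::counting_semaphore<64> concurrentMergeChecks(8); // allow 8 concurrent checks

struct Releaser { // RAII: the permit is returned on every exit path, including exceptions
	std::counting_semaphore<64>& sem;
	~Releaser() { sem.release(); }
};

void mergeCheck() {
	concurrentMergeChecks.acquire(); // like wait(bmData->concurrentMergeChecks.take())
	Releaser holding{ concurrentMergeChecks };
	// ... query metrics and decide merges under the concurrency limit ...
}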
// look for candidates to right
if (granuleRange.end != normalKeys.end) {
auto rangeAfter = bmData->mergeCandidates.rangeContaining(granuleRange.end);
while (rangeAfter.cvalue().present() && afterCandidates.size() < SERVER_KNOBS->BG_MAX_MERGE_FANIN - 1) {
// if it is a merge candidate, add it to the list
afterCandidates.push_back(
std::tuple(rangeAfter.cvalue().get().first, rangeAfter.range(), rangeAfter.cvalue().get().second));
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): After candidate [{3} - {4})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
rangeAfter.begin().printable(),
rangeAfter.end().printable());
}
ASSERT(rangeAfter.end() <= normalKeys.end);
if (rangeAfter.end() == normalKeys.end) {
break;
} else {
++rangeAfter;
}
}
}
if (beforeCandidates.empty() && afterCandidates.empty()) {
CODE_PROBE(true, "no consecutive merge candidates");
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): No merge candidates\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable());
}
return Void();
}
CODE_PROBE(true, "consecutive granule merge candidates");
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Checking metrics for {3} candidates ({4} - {5})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
beforeCandidates.size() + afterCandidates.size() + 1,
beforeCandidates.size(),
afterCandidates.size());
}
// get metrics for current granule to see if it is still mergeable
StorageMetrics targetGranuleMetrics = wait(bmData->db->getStorageMetrics(granuleRange, CLIENT_KNOBS->TOO_MANY));
if (targetGranuleMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC ||
targetGranuleMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES) {
CODE_PROBE(true, "granule merge candidate no longer mergeable");
return Void();
}
// best set of granules to merge
state std::vector<UID> bestGranuleIDs;
state std::vector<KeyRange> bestGranuleRanges;
state std::vector<Version> bestGranuleStartVersions;
state KeyRange bestGranuleRange;
// current set of granules being evaluated
state std::deque<std::tuple<UID, KeyRange, Version, int64_t>> windowGranules;
state int64_t windowBytes = targetGranuleMetrics.bytes;
windowGranules.push_back(std::tuple(granuleID, granuleRange, granuleStartVersion, windowBytes));
// first walk backwards through before candidates until combined granule would be too large to merge, or we hit a
// granule that has too high bytesPerKSec and isn't mergeable
// start merging any set of 2+ consecutive granules that can be merged
state int64_t currentBytes = 0;
	// with large keys, a merge spanning many granules can exceed the maximum value size
state int currentKeySumBytes = 0;
state std::vector<std::tuple<UID, KeyRange, Version>> currentCandidates;
state int i;
for (i = beforeCandidates.size() - 1; i >= 0; i--) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Checking before candidate [{3} - {4})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(beforeCandidates[i]).begin.printable(),
std::get<1>(beforeCandidates[i]).end.printable());
for (i = 0; i < candidates.size(); i++) {
StorageMetrics metrics =
wait(bmData->db->getStorageMetrics(std::get<1>(candidates[i]), CLIENT_KNOBS->TOO_MANY));
if (metrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
metrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
// This granule cannot be merged with any neighbors.
// If current candidates up to here can be merged, merge them and skip over this one
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
currentKeySumBytes = 0;
continue;
}
StorageMetrics beforeMetrics =
wait(bmData->db->getStorageMetrics(std::get<1>(beforeCandidates[i]), CLIENT_KNOBS->TOO_MANY));
if (windowBytes + beforeMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
beforeMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
break;
		// if the current window is already at the maximum merge size, or adding this granule would push the window
		// over the limit, merge the existing candidates if possible
ASSERT(currentCandidates.size() <= SERVER_KNOBS->BG_MAX_MERGE_FANIN);
if (currentCandidates.size() == SERVER_KNOBS->BG_MAX_MERGE_FANIN ||
currentBytes + metrics.bytes > SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2) {
ASSERT(currentBytes <= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2, "merge early because of key size");
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
currentKeySumBytes = 0;
}
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Before Candidate [{3} - {4}): {5} bytes\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(beforeCandidates[i]).begin.printable(),
std::get<1>(beforeCandidates[i]).end.printable(),
beforeMetrics.bytes);
// add this granule to the window
if (currentCandidates.empty()) {
currentKeySumBytes += std::get<1>(candidates[i]).begin.size();
}
windowBytes += beforeMetrics.bytes;
windowGranules.push_front(std::tuple(std::get<0>(beforeCandidates[i]),
std::get<1>(beforeCandidates[i]),
std::get<2>(beforeCandidates[i]),
beforeMetrics.bytes));
currentKeySumBytes += std::get<1>(candidates[i]).end.size();
currentCandidates.push_back(candidates[i]);
}
// set first window as the best range
bestGranuleRange = KeyRangeRef(std::get<1>(windowGranules.front()).begin, std::get<1>(windowGranules.back()).end);
for (auto& it : windowGranules) {
bestGranuleIDs.push_back(std::get<0>(it));
bestGranuleRanges.push_back(std::get<1>(it));
bestGranuleStartVersions.push_back(std::get<2>(it));
}
// Do moving window algorithm where we add the next afterCandidate to the merge window, and then remove the tail end
// of beforeCandidates until we are down to a mergeable granule
for (i = 0; i < afterCandidates.size(); i++) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Checking after candidate [{3} - {4})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(afterCandidates[i]).begin.printable(),
std::get<1>(afterCandidates[i]).end.printable());
}
// include this granule in the window
StorageMetrics afterMetrics =
wait(bmData->db->getStorageMetrics(std::get<1>(afterCandidates[i]), CLIENT_KNOBS->TOO_MANY));
if (afterMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
afterMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
break;
}
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): After Candidate [{3} - {4}): {5} bytes\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(afterCandidates[i]).begin.printable(),
std::get<1>(afterCandidates[i]).end.printable(),
afterMetrics.bytes);
}
windowBytes += afterMetrics.bytes;
windowGranules.push_back(std::tuple(std::get<0>(afterCandidates[i]),
std::get<1>(afterCandidates[i]),
std::get<2>(afterCandidates[i]),
afterMetrics.bytes));
// slide the window forward back down to mergeable size
while (windowBytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): window bytes {3} >= target {4}\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
windowBytes,
SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
}
ASSERT(!windowGranules.empty());
if (std::get<0>(windowGranules.front()) == granuleID) {
// merge must include target granule
break;
}
if (BM_DEBUG) {
fmt::print(
"BM {0} maybe merge [{1} - {2}): After Candidate [{3} - {4}) popping [{5} - {6}): {7} bytes\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
std::get<1>(afterCandidates[i]).begin.printable(),
std::get<1>(afterCandidates[i]).end.printable(),
std::get<1>(windowGranules.front()).begin.printable(),
std::get<1>(windowGranules.front()).end.printable(),
std::get<3>(windowGranules.front()));
}
windowBytes -= std::get<3>(windowGranules.front());
windowGranules.pop_front();
}
// compare this candidate window to previous best
if (windowBytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES) {
break;
} else if (windowGranules.size() > bestGranuleIDs.size()) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): new best granules {3}\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
windowGranules.size());
}
bestGranuleRange =
KeyRangeRef(std::get<1>(windowGranules.front()).begin, std::get<1>(windowGranules.back()).end);
bestGranuleIDs.clear();
bestGranuleRanges.clear();
bestGranuleStartVersions.clear();
for (auto& it : windowGranules) {
bestGranuleIDs.push_back(std::get<0>(it));
bestGranuleRanges.push_back(std::get<1>(it));
bestGranuleStartVersions.push_back(std::get<2>(it));
}
}
}
CODE_PROBE(bestGranuleIDs.size() == 1, "Cannot combine merge candidates into mergeable granule");
CODE_PROBE(bestGranuleIDs.size() > 1, "Granule ready for merge!");
if (bestGranuleIDs.size() > 1) {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): Found {3} consecutive granules in range [{4} - {5}):\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
bestGranuleIDs.size(),
bestGranuleRange.begin.printable(),
bestGranuleRange.end.printable());
}
	// This code block must execute without a wait, so that the lock checks (isMergeActive, mergeCandidates) cannot
	// deadlock and merge-merge races are avoided.
if ((!g_network->isSimulated() || !g_simulator.speedUpSimulation) && !bmData->isMergeActive(bestGranuleRange)) {
// check to avoid races where a split eval came in while merge was evaluating
auto reCheckMergeCandidates = bmData->mergeCandidates.intersectingRanges(bestGranuleRange);
bool mergeStillOk = true;
for (auto it : reCheckMergeCandidates) {
if (!it->cvalue().present()) {
CODE_PROBE(true, "granule no longer merge candidate after checking metrics, because of split eval");
mergeStillOk = false;
break;
}
}
if (mergeStillOk) {
fmt::print("BM {0} maybe merge [{1} - {2}): Starting merge of [{3} - {4}) ({5})\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable(),
bestGranuleRange.begin.printable(),
bestGranuleRange.end.printable(),
bestGranuleIDs.size());
CODE_PROBE(true, "Doing granule merge!");
bmData->activeGranuleMerges.insert(bestGranuleRange, 0);
bmData->mergeCandidates.insert(bestGranuleRange, Optional<std::pair<UID, Version>>());
state std::pair<UID, Version> persistMerge = wait(persistMergeGranulesStart(
bmData, bestGranuleRange, bestGranuleIDs, bestGranuleRanges, bestGranuleStartVersions));
wait(finishMergeGranules(bmData,
persistMerge.first,
bestGranuleRange,
persistMerge.second,
bestGranuleIDs,
bestGranuleRanges,
bestGranuleStartVersions));
}
}
} else {
if (BM_DEBUG) {
fmt::print("BM {0} maybe merge [{1} - {2}): No mergeable granules after checking metrics\n",
bmData->epoch,
granuleRange.begin.printable(),
granuleRange.end.printable());
}
}
attemptStartMerge(bmData, currentCandidates);
return Void();
}
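The loop above amounts to a greedy partition of a sorted, contiguous run of candidates: a batch is cut whenever a granule is individually too large or too hot, or when adding it would exceed the fan-in, byte, or key-size budget. A simplified stand-alone version of that batching logic; the knob values below are made up:

#include <string>
#include <vector>

struct Candidate {
	std::string begin, end; // contiguous: candidates[i].end == candidates[i+1].begin
	long bytes;             // granule size from storage metrics
	bool tooHot;            // write rate above the merge threshold
};

// Cuts the candidate run into mergeable batches of 2+ granules.
std::vector<std::vector<Candidate>> planMerges(const std::vector<Candidate>& candidates) {
	const size_t maxFanIn = 64;        // stand-in for BG_MAX_MERGE_FANIN
	const long maxBytes = 100'000'000; // stand-in for BG_SNAPSHOT_FILE_TARGET_BYTES
	const long maxKeyBytes = 50'000;   // stand-in for VALUE_SIZE_LIMIT / 2

	std::vector<std::vector<Candidate>> batches;
	std::vector<Candidate> current;
	long bytes = 0, keyBytes = 0;
	auto flush = [&] {
		if (current.size() >= 2)
			batches.push_back(current);
		current.clear();
		bytes = keyBytes = 0;
	};
	for (const Candidate& c : candidates) {
		if (c.tooHot || c.bytes >= maxBytes) { // unmergeable granule splits the run
			flush();
			continue;
		}
		if (current.size() == maxFanIn || bytes + c.bytes > maxBytes || keyBytes >= maxKeyBytes)
			flush(); // budget reached: emit what we have, start a new batch
		if (current.empty())
			keyBytes += c.begin.size(); // first granule also contributes its begin key
		bytes += c.bytes;
		keyBytes += c.end.size();
		current.push_back(c);
	}
	flush();
	return batches;
}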
// Uses a single-pass algorithm to identify mergeable sections of granules.
// To ensure each granule waits to see whether all of its neighbors are merge-eligible before merging it, a newly
// merge-eligible granule is ignored on the first pass
ACTOR Future<Void> granuleMergeChecker(Reference<BlobManagerData> bmData) {
// initial sleep
wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_DELAY_SECONDS));
// TODO could optimize to not check if there are no new merge-eligible granules and none in merge pending state
loop {
double sleepTime = SERVER_KNOBS->BG_MERGE_CANDIDATE_DELAY_SECONDS;
		// Check more frequently if speedUpSimulation is set.
if (g_network->isSimulated() && g_simulator.speedUpSimulation) {
sleepTime = std::min(5.0, sleepTime);
}
		// start the delay at the start of the loop, to account for time spent in the calculation
state Future<Void> intervalDelay = delayJittered(sleepTime);
// go over granule states, and start a findMergeableGranules for each sub-range of mergeable granules
// FIXME: avoid SlowTask by breaking this up periodically
// Break it up into parallel chunks. This makes it possible to process large ranges, but does mean the merges
// can be slightly suboptimal at boundaries. Use relatively large chunks to minimize the impact of this.
int maxRangeSize = SERVER_KNOBS->BG_MAX_MERGE_FANIN * 10;
state std::vector<Future<Void>> mergeChecks;
auto allRanges = bmData->mergeCandidates.ranges();
std::vector<std::tuple<UID, KeyRange, Version>> currentCandidates;
for (auto& it : allRanges) {
if (!it->cvalue().canMergeNow() || currentCandidates.size() == maxRangeSize) {
if (currentCandidates.size() >= 2) {
mergeChecks.push_back(attemptMerges(bmData, currentCandidates));
}
currentCandidates.clear();
}
if (it->cvalue().canMergeNow()) {
currentCandidates.push_back(std::tuple(it->cvalue().granuleID, it->range(), it->cvalue().startVersion));
} else if (it->cvalue().canMerge()) {
// set flag so this can get merged on the next pass
it->value().mergeNow = true;
}
}
if (currentCandidates.size() >= 2) {
mergeChecks.push_back(attemptMerges(bmData, currentCandidates));
}
CODE_PROBE(mergeChecks.size() > 1, "parallel merge checks");
wait(waitForAll(mergeChecks));
// if the calculation took longer than the desired interval, still wait a bit
wait(intervalDelay && delay(5.0));
}
}
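The canMerge/canMergeNow split is what implements the one-pass delay described above: a granule that just became eligible only has mergeNow set at the end of the pass, so it is first considered on the next pass, by which time its neighbors may be eligible too. A compact sketch of that delayed-run scan, over invented types:

#include <utility>
#include <vector>

struct Granule {
	bool eligible = false; // granule asked to be a merge candidate
	bool mergeNow = false; // set one pass after becoming eligible
};

// One checker pass: collect runs of mergeNow granules, promote the rest.
std::vector<std::pair<int, int>> checkerPass(std::vector<Granule>& gs) {
	std::vector<std::pair<int, int>> runs; // [first, last] index of each mergeable run
	int runStart = -1;
	for (int i = 0; i <= (int)gs.size(); i++) {
		bool now = i < (int)gs.size() && gs[i].eligible && gs[i].mergeNow;
		if (now && runStart < 0)
			runStart = i;
		if (!now) {
			if (runStart >= 0 && i - runStart >= 2)
				runs.push_back({ runStart, i - 1 }); // 2+ consecutive mergeable granules
			runStart = -1;
		}
		if (i < (int)gs.size() && gs[i].eligible && !gs[i].mergeNow)
			gs[i].mergeNow = true; // newly eligible: wait until the next pass
	}
	return runs;
}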
ACTOR Future<Void> deregisterBlobWorker(Reference<BlobManagerData> bmData, BlobWorkerInterface interf) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
loop {
@ -2310,34 +2268,22 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
// clear merge candidates for range, if not already merging
if (clearMergeCandidate) {
bmData->mergeCandidates.insert(rep.granuleRange, Optional<std::pair<UID, Version>>());
bmData->clearMergeCandidate(rep.granuleRange, MergeCandidateCannotMerge);
}
}
if (rep.mergeCandidate && !ignore) {
// mark granule as merge candidate
ASSERT(!rep.doSplit);
// TODO: do we need any sort of validation that this is coming from the worker that currently owns
// the granule?
if (existingInProgress.present()) {
// TODO LOG?
} else {
if (BM_DEBUG) {
fmt::print("Manager {0} evaluating [{1} - {2}) {3}\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
newEval.toString());
}
if (!bmData->isMergeActive(rep.granuleRange)) {
ASSERT(rep.mergeCandidate);
CODE_PROBE(true, "Granule merge candidate");
bmData->mergeCandidates.insert(rep.granuleRange,
std::pair(rep.granuleID, rep.startVersion));
newEval.inProgress =
maybeMergeRange(bmData, rep.granuleID, rep.granuleRange, rep.startVersion);
// still update epoch/seqno even if not doing a merge eval
bmData->boundaryEvaluations.insert(rep.granuleRange, newEval);
}
CODE_PROBE(true, "Granule merge candidate");
if (BM_DEBUG) {
fmt::print("Manager {0} merge candidate granule [{1} - {2}) {3}\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
newEval.toString());
}
bmData->boundaryEvaluations.insert(rep.granuleRange, newEval);
bmData->setMergeCandidate(rep.granuleRange, rep.granuleID, rep.startVersion);
}
}
} catch (Error& e) {
@ -2580,7 +2526,7 @@ ACTOR Future<Void> resumeActiveMerges(Reference<BlobManagerData> bmData) {
UID mergeGranuleID = decodeBlobGranuleMergeKey(it.key);
KeyRange mergeRange;
std::vector<UID> parentGranuleIDs;
std::vector<KeyRange> parentGranuleRanges;
std::vector<Key> parentGranuleRanges;
std::vector<Version> parentGranuleStartVersions;
Version mergeVersion;
std::tie(mergeRange, mergeVersion, parentGranuleIDs, parentGranuleRanges, parentGranuleStartVersions) =
@ -2598,15 +2544,16 @@ ACTOR Future<Void> resumeActiveMerges(Reference<BlobManagerData> bmData) {
// report updated status. Start with early (epoch, seqno) to guarantee lower than later status
BoundaryEvaluation eval(1, 0, BoundaryEvalType::MERGE, 1, 0);
ASSERT(!bmData->isMergeActive(mergeRange));
eval.inProgress = finishMergeGranules(bmData,
mergeGranuleID,
mergeRange,
mergeVersion,
parentGranuleIDs,
parentGranuleRanges,
parentGranuleStartVersions);
bmData->addActor.send(finishMergeGranules(bmData,
mergeGranuleID,
mergeRange,
mergeVersion,
parentGranuleIDs,
parentGranuleRanges,
parentGranuleStartVersions));
bmData->boundaryEvaluations.insert(mergeRange, eval);
bmData->activeGranuleMerges.insert(mergeRange, mergeVersion);
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
}
if (result.more) {
@ -3564,27 +3511,30 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
// add all of the node's parents to the queue
for (auto& parent : currHistoryNode.parentGranules) {
for (int i = 0; i < currHistoryNode.parentVersions.size(); i++) {
// for (auto& parent : currHistoryNode.parentVersions.size()) {
// if we already added this node to queue, skip it; otherwise, mark it as visited
if (visited.count({ parent.first.begin.begin(), parent.second })) {
KeyRangeRef parentRange(currHistoryNode.parentBoundaries[i], currHistoryNode.parentBoundaries[i + 1]);
Version parentVersion = currHistoryNode.parentVersions[i];
if (visited.count({ parentRange.begin.begin(), parentVersion })) {
if (BM_DEBUG) {
fmt::print("Already added {0} to queue, so skipping it\n", currHistoryNode.granuleID.toString());
}
continue;
}
visited.insert({ parent.first.begin.begin(), parent.second });
visited.insert({ parentRange.begin.begin(), parentVersion });
if (BM_DEBUG) {
fmt::print("Adding parent [{0} - {1}) with versions [{2} - {3}) to queue\n",
parent.first.begin.printable(),
parent.first.end.printable(),
parent.second,
parentRange.begin.printable(),
parentRange.end.printable(),
parentVersion,
startVersion);
}
			// the parent's end version is this node's startVersion,
			// since this node must have started where its parent finished
historyEntryQueue.push({ parent.first, parent.second, startVersion });
historyEntryQueue.push({ parentRange, parentVersion, startVersion });
}
}
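The history format change replaces (range, version) parent pairs with N+1 parentBoundaries plus N parentVersions, so the traversal reconstructs each parent range as [boundaries[i], boundaries[i+1]). A minimal sketch of that walk with the visited-set dedup, using simplified stand-in types:

#include <queue>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

struct HistoryNode {
	std::vector<std::string> parentBoundaries; // N+1 boundaries for N parents
	std::vector<long> parentVersions;          // start version of each parent
};

using WorkItem = std::tuple<std::string, std::string, long, long>; // begin, end, version, endVersion

void enqueueParents(const HistoryNode& node,
                    long startVersion,
                    std::set<std::pair<std::string, long>>& visited,
                    std::queue<WorkItem>& work) {
	for (size_t i = 0; i < node.parentVersions.size(); i++) {
		const std::string& begin = node.parentBoundaries[i];
		const std::string& end = node.parentBoundaries[i + 1];
		long parentVersion = node.parentVersions[i];
		if (!visited.insert({ begin, parentVersion }).second)
			continue; // already queued this parent
		// the parent's end version is this node's start version
		work.emplace(begin, end, parentVersion, startVersion);
	}
}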
@ -3984,6 +3934,9 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
if (SERVER_KNOBS->BG_CONSISTENCY_CHECK_ENABLED) {
self->addActor.send(bgConsistencyCheck(self));
}
if (SERVER_KNOBS->BG_ENABLE_MERGING) {
self->addActor.send(granuleMergeChecker(self));
}
if (BUGGIFY) {
self->addActor.send(chaosRangeMover(self));

View File

@ -1188,13 +1188,13 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
}
// wait for the last snapshot to finish, so that the delay is from the last snapshot
wait(waitStart);
wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS));
double jitter = deterministicRandom()->random01() * 0.8 * SERVER_KNOBS->BG_MERGE_CANDIDATE_DELAY_SECONDS;
wait(delay(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS + jitter));
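The explicit jitter term spreads re-candidacy over an extra 0-80% of BG_MERGE_CANDIDATE_DELAY_SECONDS, so neighboring granules do not all report back in the same checker pass. The computation, isolated as a sketch:

#include <random>

// Delay before a quiescent granule re-announces itself as a merge candidate.
double mergeCandidateDelay(double thresholdSeconds, double checkerIntervalSeconds, std::mt19937& rng) {
	std::uniform_real_distribution<double> u(0.0, 1.0);
	double jitter = u(rng) * 0.8 * checkerIntervalSeconds; // 0% to 80% of the checker interval
	return thresholdSeconds + jitter;
}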
loop {
// this actor will be cancelled if a split check happened, or if the granule was moved away, so this
// being here means that granule is cold enough during that period. Now we just need to check if it is
// also small enough to be a merge candidate.
StorageMetrics currentMetrics = wait(bwData->db->getStorageMetrics(metadata->keyRange, CLIENT_KNOBS->TOO_MANY));
state int64_t granuleBytes = currentMetrics.bytes;
// FIXME: maybe separate knob and/or value for write rate?
if (currentMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2 ||
@ -1241,11 +1241,9 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
metadata->originalEpoch,
metadata->originalSeqno));
// if a new manager appears, also tell it about this granule being mergeable
state int64_t lastSendEpoch = bwData->currentManagerEpoch;
while (lastSendEpoch == bwData->currentManagerEpoch) {
wait(bwData->currentManagerStatusStream.onChange());
wait(delay(0));
}
			// or, if the existing manager opens a new stream, re-send, since it may have missed the message due to a
			// network issue
wait(bwData->currentManagerStatusStream.onChange());
wait(delay(0));
CODE_PROBE(true, "Blob worker re-sending merge candidate to new manager");
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
@ -1926,6 +1924,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
.detail("RollbackVersion", rollbackVersion);
}
Version oldPendingSnapshot = metadata->pendingSnapshotVersion;
Version cfRollbackVersion = doGranuleRollback(metadata,
deltas.version,
rollbackVersion,
@ -1933,6 +1932,23 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
rollbacksInProgress,
rollbacksCompleted);
if (oldPendingSnapshot > metadata->pendingSnapshotVersion) {
// If rollback cancelled in-flight snapshot, merge candidate checker also got
// cancelled. Restart it
CODE_PROBE(true,
"Restarting merge candidate checker after rolling back snapshot");
checkMergeCandidate = granuleCheckMergeCandidate(
bwData,
metadata,
startState.granuleID,
inFlightFiles.empty() ? Future<Void>(Void())
: success(inFlightFiles.back().future));
}
// reset force flush state, requests should retry and add it back once feed is ready
forceFlushVersions.clear();
lastForceFlushVersion = 0;
metadata->forceFlushVersion = NotifiedVersion();
Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>();
if (!readOldChangeFeed && cfRollbackVersion < startState.changeFeedStartVersion) {
@ -2352,7 +2368,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
GranuleStartState startState = wait(assignFuture);
state Optional<GranuleHistory> activeHistory = startState.history;
if (activeHistory.present() && activeHistory.get().value.parentGranules.size() > 0) {
if (activeHistory.present() && activeHistory.get().value.parentVersions.size() > 0) {
state int64_t loadId = nextHistoryLoadId++;
if (BW_HISTORY_DEBUG) {
fmt::print("HL {0} {1}) Loading history data for [{2} - {3})\n",
@ -2368,7 +2384,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
std::priority_queue<OrderedHistoryKey, std::vector<OrderedHistoryKey>, std::greater<OrderedHistoryKey>>
rootGranules;
state Transaction tr(bwData->db);
if (!activeHistory.get().value.parentGranules.empty()) {
if (!activeHistory.get().value.parentVersions.empty()) {
if (BW_HISTORY_DEBUG) {
fmt::print("HL {0} {1}) Starting history [{2} - {3}) @ {4}\n",
bwData->id.shortString().substr(0, 5),
@ -2437,17 +2453,16 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
state bool noParentsPresent = true;
// FIXME: parallelize this for all parents/all entries in queue?
loop {
if (pIdx >= curHistory.value.parentGranules.size()) {
if (pIdx >= curHistory.value.parentVersions.size()) {
break;
}
try {
Optional<Value> v =
wait(tr.get(blobGranuleHistoryKeyFor(curHistory.value.parentGranules[pIdx].first,
curHistory.value.parentGranules[pIdx].second)));
state KeyRangeRef parentRange(curHistory.value.parentBoundaries[pIdx],
curHistory.value.parentBoundaries[pIdx + 1]);
state Version parentVersion = curHistory.value.parentVersions[pIdx];
Optional<Value> v = wait(tr.get(blobGranuleHistoryKeyFor(parentRange, parentVersion)));
if (v.present()) {
next = GranuleHistory(curHistory.value.parentGranules[pIdx].first,
curHistory.value.parentGranules[pIdx].second,
decodeBlobGranuleHistoryValue(v.get()));
next = GranuleHistory(parentRange, parentVersion, decodeBlobGranuleHistoryValue(v.get()));
ASSERT(next.version != invalidVersion);
auto inserted = forwardHistory.insert({ next.value.granuleID, ForwardHistoryValue() });
@ -3410,12 +3425,13 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// If anything in previousGranules, need to do the handoff logic and set
// ret.previousChangeFeedId, and the previous durable version will come from the previous
// granules
if (info.history.present() && info.history.get().value.parentGranules.size() > 0) {
if (info.history.present() && info.history.get().value.parentVersions.size() > 0) {
CODE_PROBE(true, "Granule open found parent");
if (info.history.get().value.parentGranules.size() == 1) { // split
state Key parentHistoryKey =
blobGranuleHistoryKeyFor(info.history.get().value.parentGranules[0].first,
info.history.get().value.parentGranules[0].second);
if (info.history.get().value.parentVersions.size() == 1) { // split
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],
info.history.get().value.parentBoundaries[1]);
state Version parentVersion = info.history.get().value.parentVersions[0];
state Key parentHistoryKey = blobGranuleHistoryKeyFor(parentRange, parentVersion);
Optional<Value> historyParentValue = wait(tr.get(parentHistoryKey));
@ -3424,8 +3440,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
decodeBlobGranuleHistoryValue(historyParentValue.get());
UID parentGranuleID = val.granuleID;
info.splitParentGranule =
std::pair(info.history.get().value.parentGranules[0].first, parentGranuleID);
info.splitParentGranule = std::pair(parentRange, parentGranuleID);
state std::pair<BlobGranuleSplitState, Version> granuleSplitState =
std::pair(BlobGranuleSplitState::Initialized, invalidVersion);
@ -3479,8 +3494,12 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// Can't roll back past re-snapshot version
info.changeFeedStartVersion = info.history.get().version;
for (auto& it : info.history.get().value.parentGranules) {
parentGranulesToSnapshot.push_back(loadParentGranuleForMergeSnapshot(&tr, it.first, it.second));
for (int i = 0; i < info.history.get().value.parentVersions.size(); i++) {
KeyRangeRef parentRange(info.history.get().value.parentBoundaries[i],
info.history.get().value.parentBoundaries[i + 1]);
Version parentVersion = info.history.get().value.parentVersions[i];
parentGranulesToSnapshot.push_back(
loadParentGranuleForMergeSnapshot(&tr, parentRange, parentVersion));
}
state int pIdx;
@ -4062,94 +4081,103 @@ ACTOR Future<Void> handleFlushGranuleReq(Reference<BlobWorkerData> self, FlushGr
}
}
// force granule to flush at this version, and wait
if (req.flushVersion > metadata->pendingDeltaVersion) {
// first, wait for granule active
loop {
// force granule to flush at this version, and wait
if (req.flushVersion > metadata->pendingDeltaVersion) {
// first, wait for granule active
// wait for change feed version to catch up to ensure we have all data
if (metadata->activeCFData.get()->getVersion() < req.flushVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting for CF version "
"(currently {4})\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion,
metadata->activeCFData.get()->getVersion());
}
// wait for change feed version to catch up to ensure we have all data
if (metadata->activeCFData.get()->getVersion() < req.flushVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting for CF version "
"(currently {4})\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion,
metadata->activeCFData.get()->getVersion());
}
loop {
choose {
when(wait(metadata->activeCFData.get().isValid()
? metadata->activeCFData.get()->whenAtLeast(req.flushVersion)
: Never())) {
break;
}
when(wait(metadata->activeCFData.onChange())) {}
when(wait(granuleCancelled.getFuture())) {
if (BW_DEBUG) {
fmt::print("BW {0} flush granule [{1} - {2}) cancelled 2\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable());
loop {
choose {
when(wait(metadata->activeCFData.get().isValid()
? metadata->activeCFData.get()->whenAtLeast(req.flushVersion)
: Never())) {
break;
}
when(wait(metadata->activeCFData.onChange())) {}
when(wait(granuleCancelled.getFuture())) {
if (BW_DEBUG) {
fmt::print("BW {0} flush granule [{1} - {2}) cancelled 2\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable());
}
req.reply.sendError(wrong_shard_server());
return Void();
}
req.reply.sendError(wrong_shard_server());
return Void();
}
}
ASSERT(metadata->activeCFData.get()->getVersion() >= req.flushVersion);
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got CF version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
}
ASSERT(metadata->activeCFData.get()->getVersion() >= req.flushVersion);
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got CF version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
if (req.flushVersion > metadata->pendingDeltaVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: setting force flush version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
// if after waiting for CF version, flushVersion still higher than pendingDeltaVersion,
// set forceFlushVersion
metadata->forceFlushVersion.set(req.flushVersion);
}
}
if (req.flushVersion > metadata->pendingDeltaVersion) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: setting force flush version\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
// if after waiting for CF version, flushVersion still higher than pendingDeltaVersion,
// set forceFlushVersion
metadata->forceFlushVersion.set(req.flushVersion);
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
}
choose {
when(wait(metadata->durableDeltaVersion.whenAtLeast(req.flushVersion))) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: waiting durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
choose {
when(wait(metadata->durableDeltaVersion.whenAtLeast(req.flushVersion))) {}
when(wait(granuleCancelled.getFuture())) {
						if (BW_DEBUG) {
							fmt::print("BW {0} flush granule [{1} - {2}) cancelled 3\n",
							           self->id.toString().substr(0, 5),
							           req.granuleRange.begin.printable(),
							           req.granuleRange.end.printable());
						}
						req.reply.send(Void());
						return Void();
					}
when(wait(metadata->activeCFData.onChange())) {
// if a rollback happens, need to restart flush process
}
when(wait(granuleCancelled.getFuture())) {
if (BW_DEBUG) {
fmt::print("BW {0} flush granule [{1} - {2}) cancelled 3\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable());
}
req.reply.sendError(wrong_shard_server());
return Void();
}
req.reply.sendError(wrong_shard_server());
return Void();
}
}
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got durable\n",
self->id.toString().substr(0, 5),
req.granuleRange.begin.printable(),
req.granuleRange.end.printable(),
req.flushVersion);
}
} catch (Error& e) {
if (BW_DEBUG) {
fmt::print("BW {0} flushing granule [{1} - {2}) @ {3}: got unexpected error {4}\n",
@ -4169,10 +4197,10 @@ ACTOR Future<Void> handleFlushGranuleReq(Reference<BlobWorkerData> self, FlushGr
req.granuleRange.end.printable(),
req.flushVersion);
}
}
req.reply.send(Void());
return Void();
req.reply.send(Void());
return Void();
}
}
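Structurally, this change wraps the whole flush sequence in a retry loop: a change-feed replacement (for example after a rollback) falls through and re-runs the catch-up wait and force-flush from the top, and only a successful durable wait leaves the loop and replies. The control flow, schematically, with trivial stand-ins for the actor's wait primitives (not real FDB calls):

enum class WaitResult { Durable, FeedChanged, Cancelled };

// Trivial stand-ins for the actor's wait primitives.
static bool feedCaughtUp(long) { return true; }
static void setForceFlush(long) {}
static WaitResult waitDurableOrChange() { return WaitResult::Durable; }

bool flushGranule(long flushVersion) {
	for (;;) { // restart from the top whenever the change feed is replaced
		while (!feedCaughtUp(flushVersion)) {
			// wait for the change feed to catch up to flushVersion
		}
		setForceFlush(flushVersion); // only needed if still ahead of pendingDeltaVersion
		WaitResult r = waitDurableOrChange();
		if (r == WaitResult::Durable)
			return true; // reply success
		if (r == WaitResult::Cancelled)
			return false; // reply wrong_shard_server
		// FeedChanged: a rollback happened, redo the flush sequence
	}
}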
ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,

View File

@ -23,13 +23,6 @@ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/workloads)
add_flow_target(EXECUTABLE NAME fdbserver SRCS ${FDBSERVER_SRCS})
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
else()
message(STATUS "ZLIB package not found")
endif()
target_include_directories(fdbserver PRIVATE
${CMAKE_SOURCE_DIR}/bindings/c
${CMAKE_BINARY_DIR}/bindings/c

View File

@ -1639,6 +1639,10 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
auto readLoad = metrics.bytesReadPerKSecond;
			// Note: It's equivalent to trigger([healthyDestinations, readLoad], ...), which would value-capture
			// healthyDestinations. We have to create a reference to healthyDestinations because in an ACTOR the
			// state variable is actually a member variable, so trigger([healthyDestinations, readLoad], ...) can't
			// be written directly.
auto& destinationRef = healthyDestinations;
self->noErrorActors.add(
trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); },
@ -1778,10 +1782,6 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
state Future<HealthMetrics> healthMetrics = self->cx->getHealthMetrics(true);
state GetTopKMetricsRequest req(
shards, topK, (srcLoad - destLoad) * SERVER_KNOBS->READ_REBALANCE_MAX_SHARD_FRAC, srcLoad / shards.size());
req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) {
return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0) >
b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0);
};
state GetTopKMetricsReply reply = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(req)));
wait(ready(healthMetrics));
auto cpu = getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs());
@ -1790,31 +1790,24 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
return false;
}
auto& metricsList = reply.metrics;
auto& metricsList = reply.shardMetrics;
	// NOTE: randomization is important here since we don't want to always push the same shard into the queue
deterministicRandom()->randomShuffle(metricsList);
traceEvent->detail("MinReadLoad", reply.minReadLoad).detail("MaxReadLoad", reply.maxReadLoad);
int chosenIdx = -1;
for (int i = 0; i < metricsList.size(); ++i) {
if (metricsList[i].keys.present()) {
chosenIdx = i;
break;
}
}
if (chosenIdx == -1) {
if (metricsList.empty()) {
traceEvent->detail("SkipReason", "NoEligibleShards");
return false;
}
auto& metrics = metricsList[chosenIdx];
auto& [shard, metrics] = metricsList[0];
traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond);
// Verify the shard is still in ShardsAffectedByTeamFailure
shards = self->shardsAffectedByTeamFailure->getShardsFor(
ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
for (int i = 0; i < shards.size(); i++) {
if (metrics.keys == shards[i]) {
self->output.send(RelocateShard(metrics.keys.get(), priority, RelocateReason::REBALANCE_READ));
if (shard == shards[i]) {
self->output.send(RelocateShard(shard, priority, RelocateReason::REBALANCE_READ));
self->updateLastAsSource(sourceTeam->getServerIDs());
return true;
}

View File

@ -831,9 +831,8 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker* self, Reference<I
}
ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, GetTopKMetricsRequest req) {
ASSERT(req.comparator);
state Future<Void> onChange;
state std::vector<StorageMetrics> returnMetrics;
state std::vector<GetTopKMetricsReply::KeyRangeStorageMetrics> returnMetrics;
// random pick a portion of shard
if (req.keys.size() > SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT) {
deterministicRandom()->randomShuffle(req.keys, SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT);
@ -867,8 +866,7 @@ ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get
maxReadLoad = std::max(metrics.bytesReadPerKSecond, maxReadLoad);
if (req.minBytesReadPerKSecond <= metrics.bytesReadPerKSecond &&
metrics.bytesReadPerKSecond <= req.maxBytesReadPerKSecond) {
metrics.keys = range;
returnMetrics.push_back(metrics);
returnMetrics.emplace_back(range, metrics);
}
}
@ -882,11 +880,11 @@ ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get
std::nth_element(returnMetrics.begin(),
returnMetrics.begin() + req.topK - 1,
returnMetrics.end(),
req.comparator);
req.reply.send(GetTopKMetricsReply(
std::vector<StorageMetrics>(returnMetrics.begin(), returnMetrics.begin() + req.topK),
minReadLoad,
maxReadLoad));
GetTopKMetricsRequest::compare);
req.reply.send(GetTopKMetricsReply(std::vector<GetTopKMetricsReply::KeyRangeStorageMetrics>(
returnMetrics.begin(), returnMetrics.begin() + req.topK),
minReadLoad,
maxReadLoad));
}
return Void();
}
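fetchTopKShardMetrics_impl leans on std::nth_element for the top-K selection: after the call, the first topK entries are the largest under the comparator in O(n), though unsorted among themselves. A self-contained illustration of that selection step:

#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

int main() {
	std::vector<int> load = { 7, 42, 3, 19, 25 };
	const size_t topK = 2;
	// After nth_element the first topK entries are the largest under the
	// comparator, in unspecified relative order; everything after is smaller.
	std::nth_element(load.begin(), load.begin() + topK - 1, load.end(), std::greater<int>());
	assert(load[0] + load[1] == 42 + 25);
	return 0;
}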

View File

@ -688,7 +688,7 @@ public:
return status;
}
metadataShard->readIterPool->update();
TraceEvent(SevVerbose, "InitializeMetaDataShard", this->logId)
TraceEvent(SevInfo, "InitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
@ -2063,7 +2063,9 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
? true
: false) {
for (const DataShard* shard : shards) {
shardRanges.emplace_back(shard->physicalShard, keys & shard->range);
if (shard != nullptr) {
shardRanges.emplace_back(shard->physicalShard, keys & shard->range);
}
}
}
double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
@ -2105,7 +2107,12 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
int accumulatedBytes = 0;
int numShards = 0;
for (auto& [shard, range] : a.shardRanges) {
ASSERT(shard != nullptr && shard->initialized());
if (shard == nullptr || !shard->initialized()) {
TraceEvent(SevWarn, "ShardedRocksReadRangeShardNotReady", logId)
.detail("Range", range)
.detail("Reason", shard == nullptr ? "Not Exist" : "Not Initialized");
continue;
}
auto bytesRead = readRangeInDb(shard, range, rowLimit, byteLimit, &result);
if (bytesRead < 0) {
// Error reading an instance.
@ -2293,7 +2300,9 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
		// TODO: reading a non-existent system key range should not cause an error.
TraceEvent(SevError, "ShardedRocksDB").detail("Detail", "Read non-exist key range").detail("ReadKey", key);
TraceEvent(SevWarnAlways, "ShardedRocksDB")
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
}
@ -2367,12 +2376,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
TraceEvent(SevVerbose, "ShardedRocksReadRangeBegin", this->id).detail("Range", keys);
auto shards = shardManager.getDataShardsByRange(keys);
for (DataShard* shard : shards) {
if (shard == nullptr || !shard->physicalShard->initialized()) {
return RangeResult();
}
}
if (!shouldThrottle(type, keys.begin)) {
auto a = new Reader::ReadRangeAction(keys, shards, rowLimit, byteLimit);
auto res = a->result.getFuture();
@ -2511,9 +2514,10 @@ TEST_CASE("noSim/ShardedRocksDB/RangeOps") {
addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("0"_sr, "3"_sr), "shard-1"));
addRangeFutures.push_back(kvStore->addRange(KeyRangeRef("4"_sr, "7"_sr), "shard-2"));
kvStore->persistRangeMapping(KeyRangeRef("0"_sr, "7"_sr), true);
wait(waitForAll(addRangeFutures));
kvStore->persistRangeMapping(KeyRangeRef("0"_sr, "7"_sr), true);
// write to shard 1
state RangeResult expectedRows;
for (int i = 0; i < 30; ++i) {
@ -2701,6 +2705,7 @@ TEST_CASE("noSim/ShardedRocksDB/ShardOps") {
mapping.push_back(std::make_pair(KeyRange(KeyRangeRef("m"_sr, "n"_sr)), "shard-3"));
mapping.push_back(std::make_pair(KeyRange(KeyRangeRef("u"_sr, "v"_sr)), "shard-3"));
mapping.push_back(std::make_pair(KeyRange(KeyRangeRef("x"_sr, "z"_sr)), "shard-1"));
mapping.push_back(std::make_pair(specialKeys, "default"));
for (auto it = dataMap.begin(); it != dataMap.end(); ++it) {
std::cout << "Begin " << it->first.begin.toString() << ", End " << it->first.end.toString() << ", id "
@ -2738,7 +2743,7 @@ TEST_CASE("noSim/ShardedRocksDB/ShardOps") {
wait(kvStore->cleanUpShardsIfNeeded(shardsToCleanUp));
auto dataMap = rocksdbStore->getDataMapping();
ASSERT_EQ(dataMap.size(), 1);
ASSERT_EQ(dataMap.size(), 2);
ASSERT(dataMap[0].second == "shard-2");
Future<Void> closed = kvStore->onClosed();

View File

@ -40,13 +40,7 @@ struct MetricsRule {
int minLevel;
Tuple pack() const {
return Tuple()
.append(namePattern)
.append(typePattern)
.append(addressPattern)
.append(idPattern)
.append(enabled ? 1 : 0)
.append(minLevel);
return Tuple::makeTuple(namePattern, typePattern, addressPattern, idPattern, enabled ? 1 : 0, minLevel);
}
static inline MetricsRule unpack(Tuple const& t) {

View File

@ -458,24 +458,6 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
specialCounter(
cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; });
specialCounter(
cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; });
specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() {
return tLogData->persistentData->getStorageBytes().available;
});
specialCounter(
cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; });
specialCounter(
cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; });
specialCounter(
cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; });
specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() {
return tLogData->rawPersistentQueue->getStorageBytes().available;
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
@ -1424,7 +1406,23 @@ ACTOR Future<Void> tLogCore(TLogData* self, Reference<LogData> logData) {
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics"));
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
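The counter rework replaces ten per-field specialCounters with one decorator callback that samples StorageBytes when the trace event is built, so each event carries a consistent snapshot instead of re-querying per field. The shape of that API, sketched with invented generic types:

#include <functional>
#include <iostream>
#include <string>

struct Event {
	void detail(const std::string& k, long v) { std::cout << k << "=" << v << " "; }
};

struct StorageBytesSnapshot {
	long used, free, available, total, temp;
};

// Periodic logger that lets the caller attach extra details to each event.
void logEvent(const std::function<void(Event&)>& decorate) {
	Event te;
	decorate(te); // called once per log interval, like the lambda passed to traceCounters
	std::cout << "\n";
}

int main() {
	logEvent([](Event& te) {
		StorageBytesSnapshot sb{ 100, 50, 40, 200, 10 }; // one snapshot per event
		te.detail("KvstoreBytesUsed", sb.used);
		te.detail("KvstoreBytesFree", sb.free);
		te.detail("KvstoreBytesTemp", sb.temp);
	});
	return 0;
}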
logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput));
try {

View File

@ -554,24 +554,6 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
specialCounter(cc, "SharedOverheadBytesInput", [tLogData]() { return tLogData->overheadBytesInput; });
specialCounter(cc, "SharedOverheadBytesDurable", [tLogData]() { return tLogData->overheadBytesDurable; });
specialCounter(
cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; });
specialCounter(
cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; });
specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() {
return tLogData->persistentData->getStorageBytes().available;
});
specialCounter(
cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; });
specialCounter(
cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; });
specialCounter(
cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; });
specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() {
return tLogData->rawPersistentQueue->getStorageBytes().available;
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
@ -2241,7 +2223,23 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics"));
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
logData->addActor.send(logPeekTrackers(logData.getPtr()));
@ -2770,8 +2768,10 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
req.reply.send(recruited);
TraceEvent("TLogReady", logData->logId)
.detail("AllTags", describe(req.allTags))
.detail("Locality", logData->locality);
.detail("Locality", logData->locality)
.setMaxEventLength(11000)
.setMaxFieldLength(10000)
.detail("AllTags", describe(req.allTags));
updater = Void();
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));

View File

@ -641,24 +641,6 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; });
specialCounter(cc, "SharedOverheadBytesInput", [tLogData]() { return tLogData->overheadBytesInput; });
specialCounter(cc, "SharedOverheadBytesDurable", [tLogData]() { return tLogData->overheadBytesDurable; });
specialCounter(
cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; });
specialCounter(
cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; });
specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() {
return tLogData->persistentData->getStorageBytes().available;
});
specialCounter(
cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; });
specialCounter(
cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; });
specialCounter(
cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; });
specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() {
return tLogData->rawPersistentQueue->getStorageBytes().available;
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); });
specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
@ -2701,7 +2683,23 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics"));
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
logData->addActor.send(logPeekTrackers(logData.getPtr()));
@ -3261,8 +3259,10 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
req.reply.send(recruited);
TraceEvent("TLogReady", logData->logId)
.detail("AllTags", describe(req.allTags))
.detail("Locality", logData->locality);
.detail("Locality", logData->locality)
.setMaxEventLength(11000)
.setMaxFieldLength(10000)
.detail("AllTags", describe(req.allTags));
updater = Void();
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));

View File

@ -472,7 +472,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
try {
++data->counters.getValueQueries;
++data->counters.allQueries;
//++data->readQueueSizeMetric;
// TODO later
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
// data->counters.finishedQueries.getValue());
@ -544,7 +543,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
}
++data->counters.finishedQueries;
//--data->readQueueSizeMetric;
// if(data->latencyBandConfig.present()) {
// int maxReadBytes =
// data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
@ -728,7 +726,6 @@ ACTOR Future<Void> getKeyValues(StorageCacheData* data, GetKeyValuesRequest req)
++data->counters.getRangeQueries;
++data->counters.allQueries;
// printf("\nSCGetKeyValues\n");
//++data->readQueueSizeMetric;
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
// data->counters.finishedQueries.getValue());

View File

@ -3502,8 +3502,10 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
req.reply.send(recruited);
TraceEvent("TLogReady", logData->logId)
.detail("AllTags", describe(req.allTags))
.detail("Locality", logData->locality);
.detail("Locality", logData->locality)
.setMaxEventLength(11000)
.setMaxFieldLength(10000)
.detail("AllTags", describe(req.allTags));
updater = Void();
wait(tLogCore(self, logData, recruited, pulledRecoveryVersions));

View File

@ -2614,6 +2614,8 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
req.tLogLocalities = localities;
req.tLogPolicy = logSet->tLogPolicy;
req.locality = remoteLocality;
TraceEvent("RemoteTLogRouterReplies", self->dbgid)
.detail("WorkerID", remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].id());
logRouterInitializationReplies.push_back(transformErrors(
throwErrorOr(
remoteWorkers.logRouters[i % remoteWorkers.logRouters.size()].logRouter.getReplyUnlessFailedFor(
@ -2693,11 +2695,13 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
}
remoteTLogInitializationReplies.reserve(remoteWorkers.remoteTLogs.size());
for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++)
for (int i = 0; i < remoteWorkers.remoteTLogs.size(); i++) {
TraceEvent("RemoteTLogReplies", self->dbgid).detail("WorkerID", remoteWorkers.remoteTLogs[i].id());
remoteTLogInitializationReplies.push_back(transformErrors(
throwErrorOr(remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor(
remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
cluster_recovery_failed()));
}
TraceEvent("RemoteLogRecruitment_InitializingRemoteLogs")
.detail("StartVersion", logSet->startVersion)
@ -2966,11 +2970,13 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
}
initializationReplies.reserve(recr.tLogs.size());
for (int i = 0; i < recr.tLogs.size(); i++)
for (int i = 0; i < recr.tLogs.size(); i++) {
TraceEvent("PrimaryTLogReplies", logSystem->getDebugID()).detail("WorkerID", recr.tLogs[i].id());
initializationReplies.push_back(transformErrors(
throwErrorOr(recr.tLogs[i].tLog.getReplyUnlessFailedFor(
reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
cluster_recovery_failed()));
}
state std::vector<Future<Void>> recoveryComplete;
@ -3034,11 +3040,14 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
}
satelliteInitializationReplies.reserve(recr.satelliteTLogs.size());
for (int i = 0; i < recr.satelliteTLogs.size(); i++)
for (int i = 0; i < recr.satelliteTLogs.size(); i++) {
TraceEvent("PrimarySatelliteTLogReplies", logSystem->getDebugID())
.detail("WorkerID", recr.satelliteTLogs[i].id());
satelliteInitializationReplies.push_back(transformErrors(
throwErrorOr(recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor(
sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)),
cluster_recovery_failed()));
}
wait(waitForAll(satelliteInitializationReplies) || oldRouterRecruitment);

View File

@ -51,6 +51,8 @@ bool compareFDBAndBlob(RangeResult fdb,
Version v,
bool debug);
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range);
#include "flow/unactorcompiler.h"
#endif

View File

@ -228,17 +228,20 @@ struct GetMetricsRequest {
};
struct GetTopKMetricsReply {
std::vector<StorageMetrics> metrics;
struct KeyRangeStorageMetrics {
KeyRange range;
StorageMetrics metrics;
KeyRangeStorageMetrics() = default;
KeyRangeStorageMetrics(const KeyRange& range, const StorageMetrics& s) : range(range), metrics(s) {}
};
std::vector<KeyRangeStorageMetrics> shardMetrics;
double minReadLoad = -1, maxReadLoad = -1;
GetTopKMetricsReply() {}
GetTopKMetricsReply(std::vector<StorageMetrics> const& m, double minReadLoad, double maxReadLoad)
: metrics(m), minReadLoad(minReadLoad), maxReadLoad(maxReadLoad) {}
GetTopKMetricsReply(std::vector<KeyRangeStorageMetrics> const& m, double minReadLoad, double maxReadLoad)
: shardMetrics(m), minReadLoad(minReadLoad), maxReadLoad(maxReadLoad) {}
};
struct GetTopKMetricsRequest {
// whether a > b
typedef std::function<bool(const StorageMetrics& a, const StorageMetrics& b)> MetricsComparator;
int topK = 1; // default only return the top 1 shard based on the comparator
MetricsComparator comparator; // Return true if a.score > b.score, return the largest topK in keys
int topK = 1; // default only return the top 1 shard based on the GetTopKMetricsRequest::compare function
std::vector<KeyRange> keys;
Promise<GetTopKMetricsReply> reply; // topK storage metrics
double maxBytesReadPerKSecond = 0, minBytesReadPerKSecond = 0; // all returned shards won't exceed this read load
@ -250,6 +253,20 @@ struct GetTopKMetricsRequest {
double minBytesReadPerKSecond = 0)
: topK(topK), keys(keys), maxBytesReadPerKSecond(maxBytesReadPerKSecond),
minBytesReadPerKSecond(minBytesReadPerKSecond) {}
// Return true if a.score > b.score, return the largest topK in keys
static bool compare(const GetTopKMetricsReply::KeyRangeStorageMetrics& a,
const GetTopKMetricsReply::KeyRangeStorageMetrics& b) {
return compareByReadDensity(a, b);
}
private:
// larger read density means higher score
static bool compareByReadDensity(const GetTopKMetricsReply::KeyRangeStorageMetrics& a,
const GetTopKMetricsReply::KeyRangeStorageMetrics& b) {
return a.metrics.bytesReadPerKSecond / std::max(a.metrics.bytes * 1.0, 1.0) >
b.metrics.bytesReadPerKSecond / std::max(b.metrics.bytes * 1.0, 1.0);
}
};
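Dividing bytesReadPerKSecond by max(bytes, 1.0) ranks shards by read density rather than raw read volume, so a small hot shard outranks a huge shard with the same total traffic, and the max() guard avoids dividing by zero on empty shards. A quick worked check of the comparator's intent:

#include <algorithm>
#include <cassert>

struct M {
	double bytes, bytesReadPerKSecond;
};

// Same shape as compareByReadDensity: higher read density ranks first.
static bool denser(const M& a, const M& b) {
	return a.bytesReadPerKSecond / std::max(a.bytes, 1.0) >
	       b.bytesReadPerKSecond / std::max(b.bytes, 1.0);
}

int main() {
	M hotShard{ 1e6, 5e5 };  // density 0.5: hot for its size
	M coldShard{ 1e9, 5e5 }; // density 0.0005: same traffic, far colder per byte
	assert(denser(hotShard, coldShard));
	M emptyShard{ 0, 100 }; // max(bytes, 1.0) guard: no division by zero
	assert(denser(emptyShard, coldShard));
	return 0;
}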
struct GetMetricsListRequest {

View File

@ -112,13 +112,7 @@ public:
Tuple pack() const {
// fprintf(stderr, "Filename:%s\n", fileName.c_str());
return Tuple()
.append(version)
.append(StringRef(fileName))
.append(isRange)
.append(fileSize)
.append(blockSize)
.append(endVersion);
return Tuple::makeTuple(version, fileName, (int)isRange, fileSize, blockSize, endVersion);
}
static RestoreFile unpack(Tuple const& t) {
RestoreFile r;
@ -190,17 +184,16 @@ struct RestoreFileFR {
int partitionId = -1; // Partition ID (Log Router Tag ID) for mutation files.
Tuple pack() const {
return Tuple()
.append(version)
.append(StringRef(fileName))
.append(isRange)
.append(fileSize)
.append(blockSize)
.append(endVersion)
.append(beginVersion)
.append(cursor)
.append(fileIndex)
.append(partitionId);
return Tuple::makeTuple(version,
fileName,
(int)isRange,
fileSize,
blockSize,
endVersion,
beginVersion,
cursor,
fileIndex,
partitionId);
}
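
Tuple::makeTuple, used throughout this diff, is a variadic shorthand for a chain of append calls, so the rewritten pack() bodies encode exactly the bytes the old ones did. A hedged sketch of the equivalence (Tuple and ASSERT are the FDB facilities shown elsewhere in this diff; this is not standalone code):

Tuple byAppend = Tuple().append(version).append(StringRef(fileName)).append(isRange);
Tuple byMake = Tuple::makeTuple(version, fileName, (int)isRange);
ASSERT(byAppend.pack() == byMake.pack()); // identical packed representation
// Note the (int)isRange cast: makeTuple deduces element types from its
// arguments, so the bool is widened explicitly, presumably to preserve the
// integer encoding the append chain produced.
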
static RestoreFileFR unpack(Tuple const& t) {
RestoreFileFR r;

View File

@ -839,8 +839,6 @@ public:
AsyncVar<bool> noRecentUpdates;
double lastUpdate;
Int64MetricHandle readQueueSizeMetric;
std::string folder;
// defined only during splitMutations()/addMutation()
@ -951,6 +949,9 @@ public:
LatencySample readLatencySample;
LatencyBands readLatencyBands;
LatencySample mappedRangeSample; // Samples getMappedRange latency
LatencySample mappedRangeRemoteSample; // Samples getMappedRange remote subquery latency
LatencySample mappedRangeLocalSample; // Samples getMappedRange local subquery latency
Counters(StorageServer* self)
: cc("StorageServer", self->thisServerID.toString()), allQueries("QueryQueue", cc),
@ -982,7 +983,19 @@ public:
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) {
readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
mappedRangeSample("GetMappedRangeMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
mappedRangeRemoteSample("GetMappedRangeRemoteMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
mappedRangeLocalSample("GetMappedRangeLocalMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE) {
specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; });
specialCounter(cc, "Version", [self]() { return self->version.get(); });
specialCounter(cc, "StorageVersion", [self]() { return self->storageVersion(); });
@ -1056,8 +1069,7 @@ public:
primaryLocality(tagLocalityInvalid), knownCommittedVersion(0), versionLag(0), logProtocol(0),
thisServerID(ssi.id()), tssInQuarantine(false), db(db), actors(false),
byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), durableInProgress(Void()), watchBytes(0),
numWatches(0), noRecentUpdates(false), lastUpdate(now()),
readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")), updateEagerReads(nullptr),
numWatches(0), noRecentUpdates(false), lastUpdate(now()), updateEagerReads(nullptr),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
@ -1561,7 +1573,6 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
try {
++data->counters.getValueQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
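
This hunk (and several like it below) drops the separate readQueueSizeMetric gauge: with monotonic started/finished counters, the instantaneous queue depth is just their difference, which the maxQueryQueue line above already computes. A standalone sketch of that derivation:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    int64_t allQueries = 0, finishedQueries = 0, maxQueryQueue = 0;
    auto onStart = [&] {
        ++allQueries;
        // depth of the in-flight queue, derived from the two counters
        maxQueryQueue = std::max(maxQueryQueue, allQueries - finishedQueries);
    };
    auto onFinish = [&] { ++finishedQueries; };
    onStart(); onStart(); onFinish(); onStart();
    printf("max queue depth seen: %lld\n", (long long)maxQueryQueue); // prints 2
    return 0;
}
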
@ -1669,7 +1680,6 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);
@ -2950,6 +2960,7 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
// To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq) {
state GetValueReqAndResultRef getValue;
state double getValueStart = g_network->timer();
getValue.key = key;
if (data->shards[key]->isReadable()) {
@ -2970,6 +2981,8 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
if (!reply.error.present()) {
++data->counters.quickGetValueHit;
copyOptionalValue(a, getValue, reply.value);
const double duration = g_network->timer() - getValueStart;
data->counters.mappedRangeLocalSample.addMeasurement(duration);
return getValue;
}
// Otherwise fallback.
@ -2989,6 +3002,8 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
// TODO: async in case it needs to read from other servers.
Optional<Value> valueOption = wait(valueFuture);
copyOptionalValue(a, getValue, valueOption);
double duration = g_network->timer() - getValueStart;
data->counters.mappedRangeRemoteSample.addMeasurement(duration);
return getValue;
} else {
throw quick_get_value_miss();
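
The pattern added in quickGetValue (and mirrored in quickGetKeyValues below) is: stamp the start time before the lookup, then credit the elapsed time to the local sample on a local-shard hit or to the remote sample on fallback. A standalone sketch with hypothetical stand-ins for g_network->timer() and LatencySample (the real LatencySample aggregates measurements and logs them periodically):

#include <algorithm>
#include <chrono>
#include <cstdio>

struct SampleSketch { // hypothetical stand-in for LatencySample
    int n = 0;
    double maxSeen = 0;
    void addMeasurement(double d) {
        n++;
        maxSeen = std::max(maxSeen, d);
    }
};

// Stand-in for g_network->timer(): seconds as a double.
static double timerNow() {
    using namespace std::chrono;
    return duration<double>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    SampleSketch localSample, remoteSample;
    double start = timerNow();
    bool servedLocally = true; // in the real actor: did the local shard satisfy the lookup?
    double duration = timerNow() - start;
    (servedLocally ? localSample : remoteSample).addMeasurement(duration);
    printf("local measurements=%d remote measurements=%d\n", localSample.n, remoteSample.n);
    return 0;
}
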
@ -3375,7 +3390,6 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
++data->counters.getRangeQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -3530,7 +3544,6 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);
@ -3556,6 +3569,7 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
// To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq) {
state GetRangeReqAndResultRef getRange;
state double getValuesStart = g_network->timer();
getRange.begin = firstGreaterOrEqual(KeyRef(*a, prefix));
getRange.end = firstGreaterOrEqual(strinc(prefix, *a));
try {
@ -3586,6 +3600,8 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
// Convert GetKeyValuesReply to RangeResult.
a->dependsOn(reply.arena);
getRange.result = RangeResultRef(reply.data, reply.more);
const double duration = g_network->timer() - getValuesStart;
data->counters.mappedRangeLocalSample.addMeasurement(duration);
return getRange;
}
// Otherwise fallback.
@ -3605,6 +3621,8 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
RangeResult rangeResult = wait(rangeResultFuture);
a->dependsOn(rangeResult.arena());
getRange.result = rangeResult;
const double duration = g_network->timer() - getValuesStart;
data->counters.mappedRangeRemoteSample.addMeasurement(duration);
return getRange;
} else {
throw quick_get_key_values_miss();
@ -3676,9 +3694,7 @@ void preprocessMappedKey(Tuple& mappedKeyFormatTuple, std::vector<Optional<Tuple
bool escaped = unescapeLiterals(s, "{{", "{");
escaped = unescapeLiterals(s, "}}", "}") || escaped;
if (escaped) {
Tuple escapedTuple;
escapedTuple.append(s);
vt.emplace_back(escapedTuple);
vt.emplace_back(Tuple::makeTuple(s));
} else if (singleKeyOrValue(s, sz)) {
// when it is SingleKeyOrValue, insert an empty Tuple into the vector as a placeholder
vt.emplace_back(Tuple());
@ -3750,16 +3766,12 @@ Key constructMappedKey(KeyValueRef* keyValue,
}
TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
Key key = Tuple().append("key-0"_sr).append("key-1"_sr).append("key-2"_sr).getDataAsStandalone();
Value value = Tuple().append("value-0"_sr).append("value-1"_sr).append("value-2"_sr).getDataAsStandalone();
Key key = Tuple::makeTuple("key-0"_sr, "key-1"_sr, "key-2"_sr).getDataAsStandalone();
Value value = Tuple::makeTuple("value-0"_sr, "value-1"_sr, "value-2"_sr).getDataAsStandalone();
state KeyValueRef kvr(key, value);
{
Tuple mappedKeyFormatTuple = Tuple()
.append("normal"_sr)
.append("{{escaped}}"_sr)
.append("{K[2]}"_sr)
.append("{V[0]}"_sr)
.append("{...}"_sr);
Tuple mappedKeyFormatTuple =
Tuple::makeTuple("normal"_sr, "{{escaped}}"_sr, "{K[2]}"_sr, "{V[0]}"_sr, "{...}"_sr);
Tuple mappedKeyTuple;
std::vector<Optional<Tuple>> vt;
@ -3768,19 +3780,15 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
Key expectedMappedKey = Tuple()
.append("normal"_sr)
.append("{escaped}"_sr)
.append("key-2"_sr)
.append("value-0"_sr)
.getDataAsStandalone();
Key expectedMappedKey =
Tuple::makeTuple("normal"_sr, "{escaped}"_sr, "key-2"_sr, "value-0"_sr).getDataAsStandalone();
// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
ASSERT(mappedKey.compare(expectedMappedKey) == 0);
ASSERT(isRangeQuery == true);
}
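
Spelled out, the substitution that the assertions above check: "{K[i]}" and "{V[i]}" pull the i-th element out of the packed key and value tuples, doubled braces unescape to literal braces, and a trailing "{...}" only flags the mapped read as a range query. Restated with the same FDB Tuple calls (a restatement of the test above, not standalone code):

// key   = ("key-0", "key-1", "key-2")        -> "{K[2]}" resolves to "key-2"
// value = ("value-0", "value-1", "value-2")  -> "{V[0]}" resolves to "value-0"
// "{{escaped}}" unescapes to the literal "{escaped}"; "{...}" is dropped from
// the mapped key and merely sets isRangeQuery.
Tuple fmt = Tuple::makeTuple("normal"_sr, "{{escaped}}"_sr, "{K[2]}"_sr, "{V[0]}"_sr, "{...}"_sr);
Tuple resolved = Tuple::makeTuple("normal"_sr, "{escaped}"_sr, "key-2"_sr, "value-0"_sr);
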
{
Tuple mappedKeyFormatTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{{{{}}"_sr, "}}"_sr);
Tuple mappedKeyTuple;
std::vector<Optional<Tuple>> vt;
@ -3788,13 +3796,13 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone();
Key expectedMappedKey = Tuple::makeTuple("{{}"_sr, "}"_sr).getDataAsStandalone();
// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
ASSERT(mappedKey.compare(expectedMappedKey) == 0);
ASSERT(isRangeQuery == false);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{{{{}}"_sr, "}}"_sr);
Tuple mappedKeyTuple;
std::vector<Optional<Tuple>> vt;
@ -3802,13 +3810,13 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone();
Key expectedMappedKey = Tuple::makeTuple("{{}"_sr, "}"_sr).getDataAsStandalone();
// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
ASSERT(mappedKey.compare(expectedMappedKey) == 0);
ASSERT(isRangeQuery == false);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{K[100]}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{K[100]}"_sr);
state bool throwException = false;
try {
Tuple mappedKeyTuple;
@ -3824,7 +3832,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
ASSERT(throwException);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{...}"_sr).append("last-element"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{...}"_sr, "last-element"_sr);
state bool throwException2 = false;
try {
Tuple mappedKeyTuple;
@ -3840,7 +3848,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
ASSERT(throwException2);
}
{
Tuple mappedKeyFormatTuple = Tuple().append("{K[not-a-number]}"_sr);
Tuple mappedKeyFormatTuple = Tuple::makeTuple("{K[not-a-number]}"_sr);
state bool throwException3 = false;
try {
Tuple mappedKeyTuple;
@ -4097,7 +4105,6 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
++data->counters.getMappedRangeQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -4270,10 +4277,10 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);
data->counters.mappedRangeSample.addMeasurement(duration);
if (data->latencyBandConfig.present()) {
int maxReadBytes =
data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
@ -4304,7 +4311,6 @@ ACTOR Future<Void> getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe
req.reply.setByteLimit(SERVER_KNOBS->RANGESTREAM_LIMIT_BYTES);
++data->counters.getRangeStreamQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -4490,7 +4496,6 @@ ACTOR Future<Void> getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
return Void();
}
@ -4505,7 +4510,6 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
++data->counters.getKeyQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
data->maxQueryQueue = std::max<int>(
data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
@ -4577,7 +4581,6 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
double duration = g_network->timer() - req.requestTime();
data->counters.readLatencySample.addMeasurement(duration);

View File

@ -149,6 +149,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
// parameters global across all clients
int64_t targetByteRate;
bool doMergeCheckAtEnd;
std::vector<Reference<ThreadData>> directories;
std::vector<Future<Void>> clients;
@ -162,6 +163,9 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
// different parameters within those constraints
int64_t randomness = sharedRandomNumber;
doMergeCheckAtEnd = randomness % 10 == 0;
randomness /= 10;
// randomize between low and high directory count
int64_t targetDirectories = 1 + (randomness % 8);
randomness /= 8;
@ -912,7 +916,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
}
wait(self->checkTenantRanges(self, cx, threadData));
bool initialCheck = result;
state bool initialCheck = result;
result &= threadData->mismatches == 0 && (threadData->timeTravelTooOld == 0);
fmt::print("Blob Granule Workload Directory {0} {1}:\n", threadData->directoryID, result ? "passed" : "failed");
@ -935,6 +939,11 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
// For some reason simulation is still passing when this fails... so assert for now
ASSERT(result);
if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && self->doMergeCheckAtEnd) {
CODE_PROBE(true, "BGCorrectness clearing database and awaiting merge");
wait(clearAndAwaitMerge(cx, threadData->directoryRange));
}
return result;
}

View File

@ -451,7 +451,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
if (BGV_DEBUG && startReadVersion != readVersion) {
fmt::print("Availability check updated read version from {0} to {1}\n", startReadVersion, readVersion);
}
bool result = availabilityPassed && self->mismatches == 0 && (checks > 0) && (self->timeTravelTooOld == 0);
state bool result =
availabilityPassed && self->mismatches == 0 && (checks > 0) && (self->timeTravelTooOld == 0);
fmt::print("Blob Granule Verifier {0} {1}:\n", self->clientId, result ? "passed" : "failed");
fmt::print(" {} successful final granule checks\n", checks);
fmt::print(" {} failed final granule checks\n", availabilityPassed ? 0 : 1);
@ -470,6 +471,11 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
// For some reason simulation is still passing when this fails... so assert for now
ASSERT(result);
if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && deterministicRandom()->random01() < 0.1) {
CODE_PROBE(true, "BGV clearing database and awaiting merge");
wait(clearAndAwaitMerge(cx, normalKeys));
}
return result;
}

View File

@ -294,8 +294,8 @@ struct ClientTransactionProfileCorrectnessWorkload : TestWorkload {
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Tuple rate = Tuple().appendDouble(sampleProbability);
Tuple size = Tuple().append(sizeLimit);
Tuple rate = Tuple::makeTuple(sampleProbability);
Tuple size = Tuple::makeTuple(sizeLimit);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
return Void();

View File

@ -36,12 +36,7 @@ class ConfigIncrementWorkload : public TestWorkload {
PerfIntCounter transactions, retries, commitUnknownResult;
static Key getConfigKey() {
Tuple tuple;
tuple.appendNull(); // config class
tuple << testKnobName;
return tuple.pack();
}
static Key getConfigKey() { return Tuple::makeTuple(/* config class */ nullptr, testKnobName).pack(); }
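
One subtlety in the one-liner above: Tuple::makeTuple treats nullptr as a null tuple element, so it reproduces the appendNull() of the deleted version. A hedged sketch of the equivalence, using the FDB Tuple API (not standalone code):

Tuple byAppend;
byAppend.appendNull(); // the null config-class slot
byAppend << testKnobName;
ASSERT(byAppend.pack() == Tuple::makeTuple(nullptr, testKnobName).pack());
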
ACTOR static Future<int> get(Reference<ISingleThreadTransaction> tr) {
TraceEvent(SevDebug, "ConfigIncrementGet");

View File

@ -83,15 +83,11 @@ struct GetMappedRangeWorkload : ApiWorkload {
static Value dataOfRecord(int i) { return Key(format("data-of-record-%08d", i)); }
static Value dataOfRecord(int i, int split) { return Key(format("data-of-record-%08d-split-%08d", i, split)); }
static Key indexEntryKey(int i) {
return Tuple().append(prefix).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack();
}
static Key recordKey(int i) { return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).pack(); }
static Key recordKey(int i, int split) {
return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack();
}
static Value recordValue(int i) { return Tuple().append(dataOfRecord(i)).pack(); }
static Value recordValue(int i, int split) { return Tuple().append(dataOfRecord(i, split)).pack(); }
static Key indexEntryKey(int i) { return Tuple::makeTuple(prefix, INDEX, indexKey(i), primaryKey(i)).pack(); }
static Key recordKey(int i) { return Tuple::makeTuple(prefix, RECORD, primaryKey(i)).pack(); }
static Key recordKey(int i, int split) { return Tuple::makeTuple(prefix, RECORD, primaryKey(i), split).pack(); }
static Value recordValue(int i) { return Tuple::makeTuple(dataOfRecord(i)).pack(); }
static Value recordValue(int i, int split) { return Tuple::makeTuple(dataOfRecord(i, split)).pack(); }
ACTOR Future<Void> fillInRecords(Database cx, int n, GetMappedRangeWorkload* self) {
state Transaction tr(cx);
@ -270,9 +266,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
GetMappedRangeWorkload* self,
int matchIndex,
bool allMissing = false) {
Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone();
Key beginTuple = Tuple::makeTuple(prefix, INDEX, indexKey(beginId)).getDataAsStandalone();
state KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple));
Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone();
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
state int limit = 100;
state int expectedBeginId = beginId;
@ -322,9 +318,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
Reference<TransactionWrapper>& tr,
GetMappedRangeWorkload* self) {
Key mapper = getMapper(self, false);
Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone();
Key beginTuple = Tuple::makeTuple(prefix, INDEX, indexKey(beginId)).getDataAsStandalone();
KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple));
Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone();
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
return tr->getMappedRange(beginSelector,
endSelector,

View File

@ -192,9 +192,16 @@ struct ReportConflictingKeysWorkload : TestWorkload {
LiteralStringRef("\xff\xff").withPrefix(conflictingKeysRange.begin));
// The getRange here using the special key prefix "\xff\xff/transaction/conflicting_keys/" happens
// locally. Thus, the error handling is not needed here
Future<RangeResult> conflictingKeyRangesFuture = tr2->getRange(ckr, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> conflictingKeyRangesFuture = tr2->getRange(ckr, CLIENT_KNOBS->TOO_MANY);
ASSERT(conflictingKeyRangesFuture.isReady());
wait(validateSpecialSubrangeRead(tr2.getPtr(),
firstGreaterOrEqual(ckr.begin),
firstGreaterOrEqual(ckr.end),
GetRangeLimits(),
Reverse::False,
conflictingKeyRangesFuture.get()));
tr2 = makeReference<ReadYourWritesTransaction>(cx);
const RangeResult conflictingKeyRanges = conflictingKeyRangesFuture.get();

View File

@ -359,8 +359,7 @@ TEST_CASE("/fdbclient/TaskBucket/Subspace") {
print_subspace_key(subspace_test1, 1);
ASSERT(subspace_test1.key() == LiteralStringRef("abc"));
Tuple t;
t.append(LiteralStringRef("user"));
Tuple t = Tuple::makeTuple("user"_sr);
Subspace subspace_test2(t);
print_subspace_key(subspace_test2, 2);
ASSERT(subspace_test2.key() == LiteralStringRef("\x01user\x00"));
@ -369,8 +368,7 @@ TEST_CASE("/fdbclient/TaskBucket/Subspace") {
print_subspace_key(subspace_test3, 3);
ASSERT(subspace_test3.key() == LiteralStringRef("abc\x01user\x00"));
Tuple t1;
t1.append(1);
Tuple t1 = Tuple::makeTuple(1);
Subspace subspace_test4(t1);
print_subspace_key(subspace_test4, 4);
ASSERT(subspace_test4.key() == LiteralStringRef("\x15\x01"));
@ -400,8 +398,7 @@ TEST_CASE("/fdbclient/TaskBucket/Subspace") {
ASSERT(subspace_test8.key() == LiteralStringRef("\x01subitem\x00"));
// pack
Tuple t3;
t3.append(StringRef());
Tuple t3 = Tuple::makeTuple(""_sr);
printf("%d==========%s===%d\n", 10, printable(subspace_test5.pack(t3)).c_str(), subspace_test5.pack(t3).size());
ASSERT(subspace_test5.pack(t3) == subspace_test5.pack(StringRef()));
ASSERT(subspace_test5.pack(t3) == LiteralStringRef("abc\x01user\x00\x15\x7b\x01\x00"));

View File

@ -774,3 +774,45 @@ TEST_CASE("/flow/Arena/Size") {
return Void();
}
TEST_CASE("flow/StringRef/eat") {
StringRef str = "test/case"_sr;
StringRef first = str.eat("/");
ASSERT(first == "test"_sr);
ASSERT(str == "case"_sr);
str = "test/case"_sr;
first = str.eat("/"_sr);
ASSERT(first == "test"_sr);
ASSERT(str == "case"_sr);
str = "testcase"_sr;
first = str.eat("/"_sr);
ASSERT(first == "testcase"_sr);
ASSERT(str == ""_sr);
str = "testcase/"_sr;
first = str.eat("/"_sr);
ASSERT(first == "testcase"_sr);
ASSERT(str == ""_sr);
str = "test/case/extra"_sr;
first = str.eat("/"_sr);
ASSERT(first == "test"_sr);
ASSERT(str == "case/extra"_sr);
bool hasSep;
str = "test/case"_sr;
first = str.eat("/"_sr, &hasSep);
ASSERT(hasSep);
ASSERT(first == "test"_sr);
ASSERT(str == "case"_sr);
str = "testcase"_sr;
first = str.eat("/", &hasSep);
ASSERT(!hasSep);
ASSERT(first == "testcase"_sr);
ASSERT(str == ""_sr);
return Void();
}
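
Beyond single splits, eat composes naturally into a tokenizer: each call consumes everything up to and including the first separator and leaves the remainder behind. A standalone sketch of the same semantics over std::string_view (a hypothetical stand-in, since flow's StringRef isn't redefined here):

#include <cstdio>
#include <string_view>

// Mimics StringRef::eat(sep, &hasSep): returns everything before the first
// separator, removes the separator, and leaves the remainder in s.
static std::string_view eat(std::string_view& s, std::string_view sep, bool* hasSep = nullptr) {
    auto pos = s.find(sep);
    if (hasSep)
        *hasSep = (pos != std::string_view::npos);
    std::string_view head = (pos == std::string_view::npos) ? s : s.substr(0, pos);
    s = (pos == std::string_view::npos) ? std::string_view() : s.substr(pos + sep.size());
    return head;
}

int main() {
    std::string_view path = "test/case/extra";
    bool hasSep = true;
    while (hasSep) {
        std::string_view token = eat(path, "/", &hasSep);
        printf("token: %.*s\n", (int)token.size(), token.data()); // test, case, extra
    }
    return 0;
}
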

View File

@ -26,8 +26,8 @@ target_link_libraries(flowlinktest PRIVATE flow stacktrace)
find_package(ZLIB)
if(ZLIB_FOUND)
add_compile_definitions(ZLIB_LIB_SUPPORTED)
target_link_libraries(flow PRIVATE ZLIB::ZLIB)
target_compile_definitions(flow PUBLIC ZLIB_LIB_SUPPORTED)
target_link_libraries(flow PUBLIC ZLIB::ZLIB)
else()
message(STATUS "ZLIB package not found")
endif()
@ -66,11 +66,7 @@ foreach(ft flow flow_sampling flowlinktest)
target_include_directories(${ft} SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl)
endif()
target_link_libraries(${ft} PUBLIC Threads::Threads ${CMAKE_DL_LIBS})
if(USE_SANITIZER)
target_link_libraries(${ft} PUBLIC boost_asan)
else()
target_link_libraries(${ft} PUBLIC boost_target)
endif()
target_link_libraries(${ft} PUBLIC boost_target)
if(USE_VALGRIND)
target_link_libraries(${ft} PUBLIC Valgrind)
endif()

View File

@ -7,6 +7,7 @@ PORT_PREFIX=1500
# default cluster settings, override with options
STATELESS_COUNT=4
REPLICATION_COUNT=1
LOGS_COUNT=8
STORAGE_COUNT=16
KNOBS=""
@ -25,6 +26,7 @@ function usage {
printf "\t--logs_taskset BITMASK\n\r\t\tbitmask of CPUs to pin logs to. Default is all CPUs.\n\r"
printf "\t--storage_count COUNT\n\r\t\tnumber of storage daemons to start. Default ${STORAGE_COUNT}\n\r"
printf "\t--storage_taskset BITMASK\n\r\t\tBitmask of CPUs to pin storage to. Default is all CPUs.\n\r"
printf "\t--replication_count COUNT\n\r\t\tReplication count may be 1,2 or 3. Default is 1.\n\r"
echo "Example"
printf "\t${0} . --knobs '--knob_proxy_use_resolver_private_mutations=1' --stateless_count 4 --stateless_taskset 0xf --logs_count 8 --logs_taskset 0xff0 --storage_taskset 0xffff000\n\r"
exit 1
@ -36,7 +38,8 @@ function start_servers {
DATA=${DIR}/${SERVER_COUNT}/data
mkdir -p ${LOG} ${DATA}
PORT=$(( $PORT_PREFIX + $SERVER_COUNT ))
$2 ${FDB} -p auto:${PORT} "$KNOBS" -c $3 -d $DATA -L $LOG -C $CLUSTER &
ZONE=$(( $j % $REPLICATION_COUNT ))
$2 ${FDB} -p auto:${PORT} "$KNOBS" -c $3 -d $DATA -L $LOG -C $CLUSTER --locality-zoneid Z-$ZONE --locality-machineid M-$SERVER_COUNT &
SERVER_COUNT=$(( $SERVER_COUNT + 1 ))
done
}
@ -76,6 +79,9 @@ while [[ $# -gt 0 ]]; do
--storage_count)
STORAGE_COUNT=$2
;;
--replication_count)
REPLICATION_COUNT=$2
;;
esac
shift; shift
done
@ -86,6 +92,15 @@ if [ ! -f ${FDB} ]; then
usage
fi
if [ $REPLICATION_COUNT -eq 1 ]; then
replication="single"
elif [ $REPLICATION_COUNT -eq 2 ]; then
replication="double"
elif [ $REPLICATION_COUNT -eq 3 ]; then
replication="triple"
else
usage
fi
DIR=./loopback-cluster
rm -rf $DIR
@ -102,7 +117,7 @@ start_servers $LOGS_COUNT "$LOGS_TASKSET" log
start_servers $STORAGE_COUNT "$STORAGE_TASKSET" storage
CLI="$BUILD/bin/fdbcli -C ${CLUSTER} --exec"
echo "configure new ssd single - stand by"
echo "configure new ssd $replication - stand by"
# sleep 2 seconds to wait for workers to join cluster, then configure database
( sleep 2 ; $CLI "configure new ssd single" )
# sleep 2 seconds to wait for workers to join cluster, then configure database and coordinators
( sleep 2 ; $CLI "configure new ssd $replication" ; $CLI "coordinators auto")