Merge remote-tracking branch 'origin/main' into features/private-request-streams
commit 16467262f0
@@ -91,6 +91,11 @@ if (FDB_RELEASE_CANDIDATE)
else()
  set(FDB_VERSION ${PROJECT_VERSION})
endif()
if (NOT FDB_RELEASE)
  string(TIMESTAMP FDB_BUILDTIME %Y%m%d%H%M%S)
  set(FDB_BUILDTIME_STRING ".${FDB_BUILDTIME}")
  set(PRERELEASE_TAG "prerelease")
endif()
set(FDB_VERSION_PLAIN ${FDB_VERSION})
string(REPLACE "." ";" FDB_VERSION_LIST ${FDB_VERSION_PLAIN})
list(GET FDB_VERSION_LIST 0 FDB_MAJOR)
@@ -466,6 +466,27 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db
}).extractPtr());
}

extern "C" DLLEXPORT FDBFuture* fdb_database_purge_blob_granules(FDBDatabase* db,
                                                                 uint8_t const* begin_key_name,
                                                                 int begin_key_name_length,
                                                                 uint8_t const* end_key_name,
                                                                 int end_key_name_length,
                                                                 int64_t purge_version,
                                                                 fdb_bool_t force) {
    return (FDBFuture*)(DB(db)
                            ->purgeBlobGranules(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
                                                            StringRef(end_key_name, end_key_name_length)),
                                                purge_version,
                                                force)
                            .extractPtr());
}

extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDatabase* db,
                                                                          uint8_t const* purge_key_name,
                                                                          int purge_key_name_length) {
    return (
        FDBFuture*)(DB(db)->waitPurgeGranulesComplete(StringRef(purge_key_name, purge_key_name_length)).extractPtr());
}
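// Illustrative sketch, not part of this change: how a C client might drive the purge
// flow end to end with the two functions above, using the standard fdb_c future
// helpers (fdb_future_block_until_ready, fdb_future_get_key, fdb_future_destroy).
// The key range and force flag are example values; error handling is abbreviated.
static fdb_error_t example_purge_and_wait(FDBDatabase* db, int64_t purge_version) {
    FDBFuture* purgeF = fdb_database_purge_blob_granules(
        db, (const uint8_t*)"bg", 2, (const uint8_t*)"bh", 2, purge_version, 0 /* force */);
    fdb_error_t err = fdb_future_block_until_ready(purgeF);
    const uint8_t* purge_key = nullptr;
    int purge_key_length = 0;
    if (!err)
        err = fdb_future_get_key(purgeF, &purge_key, &purge_key_length);
    if (!err) {
        // The returned key identifies this purge; hand it back to wait for completion.
        FDBFuture* waitF = fdb_database_wait_purge_granules_complete(db, purge_key, purge_key_length);
        err = fdb_future_block_until_ready(waitF);
        fdb_future_destroy(waitF);
    }
    fdb_future_destroy(purgeF); // purge_key points into purgeF, so destroy it last
    return err;
}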

extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) {
    CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr(););
}
@ -619,7 +640,7 @@ FDBFuture* fdb_transaction_get_range_impl(FDBTransaction* tr,
|
|||
.extractPtr());
|
||||
}
|
||||
|
||||
FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr,
|
||||
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* tr,
|
||||
uint8_t const* begin_key_name,
|
||||
int begin_key_name_length,
|
||||
fdb_bool_t begin_or_equal,
|
||||
|
@ -651,8 +672,7 @@ FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr,
|
|||
.extractPtr());
|
||||
}
|
||||
|
||||
// TODO: Support FDB_API_ADDED in generate_asm.py and then this can be replaced with fdb_api_ptr_unimpl.
|
||||
FDBFuture* fdb_transaction_get_mapped_range_v699(FDBTransaction* tr,
|
||||
FDBFuture* fdb_transaction_get_range_and_flat_map_v709(FDBTransaction* tr,
|
||||
uint8_t const* begin_key_name,
|
||||
int begin_key_name_length,
|
||||
fdb_bool_t begin_or_equal,
|
||||
|
@ -669,7 +689,7 @@ FDBFuture* fdb_transaction_get_mapped_range_v699(FDBTransaction* tr,
|
|||
int iteration,
|
||||
fdb_bool_t snapshot,
|
||||
fdb_bool_t reverse) {
|
||||
fprintf(stderr, "UNIMPLEMENTED FDB API FUNCTION\n");
|
||||
fprintf(stderr, "GetRangeAndFlatMap is removed from 7.0. Please upgrade to 7.1 and use GetMappedRange\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
|
@@ -900,13 +920,13 @@ extern "C" DLLEXPORT fdb_error_t fdb_select_api_version_impl(int runtime_version

// Versioned API changes -- descending order by version (new changes at top)
// FDB_API_CHANGED( function, ver ) means there is a new implementation as of ver, and a function function_(ver-1)
// is the old implementation FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and
// is the old implementation. FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and
// function_(ver-1) is the old implementation
//
// WARNING: use caution when implementing removed functions by calling public API functions. This can lead to
// undesired behavior when using the multi-version API. Instead, it is better to have both the removed and public
// functions call an internal implementation function. See fdb_create_database_impl for an example.
FDB_API_CHANGED(fdb_transaction_get_mapped_range, 700);
FDB_API_REMOVED(fdb_transaction_get_range_and_flat_map, 710);
FDB_API_REMOVED(fdb_future_get_version, 620);
FDB_API_REMOVED(fdb_create_cluster, 610);
FDB_API_REMOVED(fdb_cluster_create_database, 610);
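// Illustrative note, not part of this change: these macros drive the multi-version
// symbol table described above. For example, FDB_API_CHANGED(fdb_transaction_get_mapped_range, 700)
// routes clients that selected an API version below 700 to fdb_transaction_get_mapped_range_v699,
// while 700 and above get the current implementation, and
// FDB_API_REMOVED(fdb_transaction_get_range_and_flat_map, 710) leaves
// fdb_transaction_get_range_and_flat_map_v709 reachable only from API versions below 710.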
@ -299,6 +299,18 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat
|
|||
|
||||
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version);
|
||||
|
||||
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_purge_blob_granules(FDBDatabase* db,
|
||||
uint8_t const* begin_key_name,
|
||||
int begin_key_name_length,
|
||||
uint8_t const* end_key_name,
|
||||
int end_key_name_length,
|
||||
int64_t purge_version,
|
||||
fdb_bool_t force);
|
||||
|
||||
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDatabase* db,
|
||||
uint8_t const* purge_key_name,
|
||||
int purge_key_name_length);
|
||||
|
||||
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant,
|
||||
FDBTransaction** out_transaction);
|
||||
|
||||
|
|
|
@ -130,6 +130,25 @@ EmptyFuture Database::create_snapshot(FDBDatabase* db,
|
|||
return EmptyFuture(fdb_database_create_snapshot(db, uid, uid_length, snap_command, snap_command_length));
|
||||
}
|
||||
|
||||
KeyFuture Database::purge_blob_granules(FDBDatabase* db,
|
||||
std::string_view begin_key,
|
||||
std::string_view end_key,
|
||||
int64_t purge_version,
|
||||
fdb_bool_t force) {
|
||||
return KeyFuture(fdb_database_purge_blob_granules(db,
|
||||
(const uint8_t*)begin_key.data(),
|
||||
begin_key.size(),
|
||||
(const uint8_t*)end_key.data(),
|
||||
end_key.size(),
|
||||
purge_version,
|
||||
force));
|
||||
}
|
||||
|
||||
EmptyFuture Database::wait_purge_granules_complete(FDBDatabase* db, std::string_view purge_key) {
|
||||
return EmptyFuture(
|
||||
fdb_database_wait_purge_granules_complete(db, (const uint8_t*)purge_key.data(), purge_key.size()));
|
||||
}
|
||||
|
||||
// Tenant
|
||||
Tenant::Tenant(FDBDatabase* db, const uint8_t* name, int name_length) {
|
||||
if (fdb_error_t err = fdb_database_open_tenant(db, name, name_length, &tenant)) {
|
||||
|
|
|
@ -97,6 +97,7 @@ public:
|
|||
|
||||
private:
|
||||
friend class Transaction;
|
||||
friend class Database;
|
||||
KeyFuture(FDBFuture* f) : Future(f) {}
|
||||
};
|
||||
|
||||
|
@ -201,6 +202,14 @@ public:
|
|||
int uid_length,
|
||||
const uint8_t* snap_command,
|
||||
int snap_command_length);
|
||||
|
||||
static KeyFuture purge_blob_granules(FDBDatabase* db,
|
||||
std::string_view begin_key,
|
||||
std::string_view end_key,
|
||||
int64_t purge_version,
|
||||
fdb_bool_t force);
|
||||
|
||||
static EmptyFuture wait_purge_granules_complete(FDBDatabase* db, std::string_view purge_key);
|
||||
};
|
||||
|
||||
class Tenant final {
|
||||
|
|
|
@ -2592,7 +2592,6 @@ TEST_CASE("Blob Granule Functions") {
|
|||
}
|
||||
|
||||
// write some data
|
||||
|
||||
insert_data(db, create_data({ { "bg1", "a" }, { "bg2", "b" }, { "bg3", "c" } }));
|
||||
|
||||
// because wiring up files is non-trivial, just test the calls complete with the expected no_materialize error
|
||||
|
@ -2709,6 +2708,42 @@ TEST_CASE("Blob Granule Functions") {
|
|||
tr.reset();
|
||||
break;
|
||||
}
|
||||
|
||||
// do a purge + wait at that version to purge everything before originalReadVersion
|
||||
|
||||
fdb::KeyFuture purgeKeyFuture =
|
||||
fdb::Database::purge_blob_granules(db, key("bg"), key("bh"), originalReadVersion, false);
|
||||
|
||||
fdb_check(wait_future(purgeKeyFuture));
|
||||
|
||||
const uint8_t* purgeKeyData;
|
||||
int purgeKeyLen;
|
||||
|
||||
fdb_check(purgeKeyFuture.get(&purgeKeyData, &purgeKeyLen));
|
||||
|
||||
std::string purgeKey((const char*)purgeKeyData, purgeKeyLen);
|
||||
|
||||
fdb::EmptyFuture waitPurgeFuture = fdb::Database::wait_purge_granules_complete(db, purgeKey);
|
||||
fdb_check(wait_future(waitPurgeFuture));
|
||||
|
||||
// re-read again at the purge version to make sure it is still valid
|
||||
|
||||
while (1) {
|
||||
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
|
||||
fdb::KeyValueArrayResult r =
|
||||
tr.read_blob_granules(key("bg"), key("bh"), 0, originalReadVersion, granuleContext);
|
||||
fdb_error_t err = r.get(&out_kv, &out_count, &out_more);
|
||||
if (err && err != 2037 /* blob_granule_not_materialized */) {
|
||||
fdb::EmptyFuture f2 = tr.on_error(err);
|
||||
fdb_check(wait_future(f2));
|
||||
continue;
|
||||
}
|
||||
|
||||
CHECK(err == 2037 /* blob_granule_not_materialized */);
|
||||
|
||||
tr.reset();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
|
@ -260,6 +260,45 @@ def suspend(logger):
|
|||
assert get_value_from_status_json(False, 'client', 'database_status', 'available')
|
||||
|
||||
|
||||
def extract_version_epoch(cli_output):
|
||||
return int(cli_output.split("\n")[-1].split(" ")[-1])
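# Note (illustrative, not part of this change): the helper above assumes the epoch is
# printed as the last whitespace-separated token on the last line of the command's
# output, e.g. a final line of the form "Current version epoch is 1000000".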
|
||||
|
||||
|
||||
@enable_logging()
|
||||
def targetversion(logger):
|
||||
version1 = run_fdbcli_command('targetversion getepoch')
|
||||
assert version1 == "Version epoch is unset"
|
||||
version2 = int(run_fdbcli_command('getversion'))
|
||||
logger.debug("read version: {}".format(version2))
|
||||
assert version2 >= 0
|
||||
# set the version epoch to the default value
|
||||
logger.debug("setting version epoch to default")
|
||||
run_fdbcli_command('targetversion add 0')
|
||||
# get the version epoch
|
||||
versionepoch1 = extract_version_epoch(run_fdbcli_command('targetversion getepoch'))
|
||||
logger.debug("version epoch: {}".format(versionepoch1))
|
||||
# make sure the version increased
|
||||
version3 = int(run_fdbcli_command('getversion'))
|
||||
logger.debug("read version: {}".format(version3))
|
||||
assert version3 >= version2
|
||||
# slightly increase the version epoch
|
||||
versionepoch2 = extract_version_epoch(run_fdbcli_command("targetversion setepoch {}".format(versionepoch1 + 1000000)))
|
||||
logger.debug("version epoch: {}".format(versionepoch2))
|
||||
assert versionepoch2 == versionepoch1 + 1000000
|
||||
# slightly decrease the version epoch
|
||||
versionepoch3 = extract_version_epoch(run_fdbcli_command("targetversion add {}".format(-1000000)))
|
||||
logger.debug("version epoch: {}".format(versionepoch3))
|
||||
assert versionepoch3 == versionepoch2 - 1000000 == versionepoch1
|
||||
# the versions should still be increasing
|
||||
version4 = int(run_fdbcli_command('getversion'))
|
||||
logger.debug("read version: {}".format(version4))
|
||||
assert version4 >= version3
|
||||
# clear the version epoch and make sure it is now unset
|
||||
run_fdbcli_command("targetversion clearepoch")
|
||||
version5 = run_fdbcli_command('targetversion getepoch')
|
||||
assert version5 == "Version epoch is unset"
|
||||
|
||||
|
||||
def get_value_from_status_json(retry, *args):
|
||||
while True:
|
||||
result = json.loads(run_fdbcli_command('status', 'json'))
|
||||
|
@ -685,6 +724,9 @@ if __name__ == '__main__':
|
|||
throttle()
|
||||
triggerddteaminfolog()
|
||||
tenants()
|
||||
# TODO: similar to advanceversion, this seems to cause some issues, so disable for now
|
||||
# This must go last, otherwise the version advancement can mess with the other tests
|
||||
# targetversion()
|
||||
else:
|
||||
assert args.process_number > 1, "Process number should be positive"
|
||||
coordinators()
|
||||
|
|
|
@ -52,7 +52,6 @@ mark_as_advanced(
|
|||
|
||||
if (GPERFTOOLS_FOUND)
|
||||
add_library(gperftools UNKNOWN IMPORTED)
|
||||
target_compile_definitions(gperftools PUBLIC USE_GPERFTOOLS)
|
||||
set_target_properties(gperftools PROPERTIES
|
||||
IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC_AND_PROFILER}
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
|
||||
|
|
|
@ -105,7 +105,7 @@ install_destinations(TGZ
|
|||
ETC etc/foundationdb
|
||||
LOG log/foundationdb
|
||||
DATA lib/foundationdb)
|
||||
copy_install_destinations(TGZ VERSIONED PREFIX "usr/lib/foundationdb-${FDB_VERSION}/")
|
||||
copy_install_destinations(TGZ VERSIONED PREFIX "usr/lib/foundationdb-${FDB_VERSION}${FDB_BUILDTIME_STRING}/")
|
||||
install_destinations(DEB
|
||||
BIN usr/bin
|
||||
SBIN usr/sbin
|
||||
|
@ -193,11 +193,9 @@ set(CPACK_PACKAGE_CONTACT "fdb-dist@apple.com")
|
|||
set(CPACK_PACKAGE_VERSION_MAJOR ${FDB_MAJOR})
|
||||
set(CPACK_PACKAGE_VERSION_MINOR ${FDB_MINOR})
|
||||
set(CPACK_PACKAGE_VERSION_PATCH ${FDB_PATCH})
|
||||
set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${FDB_VERSION}-${CPACK_SYSTEM_NAME}")
|
||||
set(CPACK_OUTPUT_FILE_PREFIX "${CMAKE_BINARY_DIR}/packages")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_FILE ${CMAKE_SOURCE_DIR}/packaging/description)
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY
|
||||
"FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions.")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions.")
|
||||
set(CPACK_PACKAGE_ICON ${CMAKE_SOURCE_DIR}/packaging/foundationdb.ico)
|
||||
set(CPACK_PACKAGE_CONTACT "The FoundationDB Community")
|
||||
|
||||
|
@ -205,18 +203,16 @@ set(CPACK_COMPONENT_SERVER-EL7_DEPENDS clients-el7)
|
|||
set(CPACK_COMPONENT_SERVER-DEB_DEPENDS clients-deb)
|
||||
set(CPACK_COMPONENT_SERVER-TGZ_DEPENDS clients-tgz)
|
||||
set(CPACK_COMPONENT_SERVER-VERSIONED_DEPENDS clients-versioned)
|
||||
set(CPACK_RPM_SERVER-VERSIONED_PACKAGE_REQUIRES
|
||||
"foundationdb${FDB_VERSION}-clients")
|
||||
|
||||
set(CPACK_COMPONENT_SERVER-EL7_DISPLAY_NAME "foundationdb-server")
|
||||
set(CPACK_COMPONENT_SERVER-DEB_DISPLAY_NAME "foundationdb-server")
|
||||
set(CPACK_COMPONENT_SERVER-TGZ_DISPLAY_NAME "foundationdb-server")
|
||||
set(CPACK_COMPONENT_SERVER-VERSIONED_DISPLAY_NAME "foundationdb${FDB_VERSION}-server")
|
||||
set(CPACK_COMPONENT_SERVER-VERSIONED_DISPLAY_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-server")
|
||||
|
||||
set(CPACK_COMPONENT_CLIENTS-EL7_DISPLAY_NAME "foundationdb-clients")
|
||||
set(CPACK_COMPONENT_CLIENTS-DEB_DISPLAY_NAME "foundationdb-clients")
|
||||
set(CPACK_COMPONENT_CLIENTS-TGZ_DISPLAY_NAME "foundationdb-clients")
|
||||
set(CPACK_COMPONENT_CLIENTS-VERSIONED_DISPLAY_NAME "foundationdb${FDB_VERSION}-clients")
|
||||
set(CPACK_COMPONENT_CLIENTS-VERSIONED_DISPLAY_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-clients")
|
||||
|
||||
|
||||
# MacOS needs a file extension for the LICENSE file
|
||||
|
@ -228,63 +224,59 @@ configure_file(${CMAKE_SOURCE_DIR}/LICENSE ${CMAKE_BINARY_DIR}/License.txt COPYO
|
|||
|
||||
if(NOT FDB_RELEASE)
|
||||
if(CURRENT_GIT_VERSION)
|
||||
set(git_string ".${CURRENT_GIT_VERSION}")
|
||||
string(SUBSTRING ${CURRENT_GIT_VERSION} 0 9 git_hash)
|
||||
endif()
|
||||
set(CPACK_RPM_PACKAGE_RELEASE 0)
|
||||
set(not_fdb_release_string "-0${git_string}.SNAPSHOT")
|
||||
set(package_version_postfix "-0.${git_hash}.SNAPSHOT")
|
||||
set(git_string ".${git_hash}")
|
||||
else()
|
||||
set(CPACK_RPM_PACKAGE_RELEASE 1)
|
||||
set(not_fdb_release_string "-1")
|
||||
set(package_version_postfix "-1")
|
||||
endif()
|
||||
|
||||
#############
|
||||
# Filenames #
|
||||
#############
|
||||
set(unversioned_postfix "${FDB_VERSION}${not_fdb_release_string}")
|
||||
# RPM filenames
|
||||
set(rpm-clients-filename "foundationdb-clients-${unversioned_postfix}")
|
||||
set(rpm-server-filename "foundationdb-server-${unversioned_postfix}")
|
||||
set(rpm-clients-versioned-filename "foundationdb${FDB_VERSION}-clients${prerelease_string}")
|
||||
set(rpm-server-versioned-filename "foundationdb${FDB_VERSION}-server${prerelease_string}")
|
||||
|
||||
# Deb filenames
|
||||
set(deb-clients-filename "foundationdb-clients_${unversioned_postfix}")
|
||||
set(deb-server-filename "foundationdb-server_${unversioned_postfix}")
|
||||
set(deb-clients-versioned-filename "foundationdb${FDB_VERSION}-clients${prerelease_string}")
|
||||
set(deb-server-versioned-filename "foundationdb${FDB_VERSION}-server${prerelease_string}")
|
||||
|
||||
################################################################################
|
||||
# Configuration for RPM
|
||||
################################################################################
|
||||
|
||||
string(REPLACE "-" "_" FDB_PACKAGE_VERSION ${FDB_VERSION})
|
||||
set(CPACK_RPM_PACKAGE_LICENSE "Apache 2.0")
|
||||
|
||||
set(CPACK_RPM_PACKAGE_NAME "foundationdb")
|
||||
set(CPACK_RPM_CLIENTS-EL7_PACKAGE_NAME "foundationdb-clients")
|
||||
set(CPACK_RPM_SERVER-EL7_PACKAGE_NAME "foundationdb-server")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_PACKAGE_NAME "foundationdb${FDB_VERSION}-server")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_PACKAGE_NAME "foundationdb${FDB_VERSION}-clients")
|
||||
set(CPACK_RPM_CLIENTS-EL7_PACKAGE_NAME "${CPACK_RPM_PACKAGE_NAME}-clients")
|
||||
set(CPACK_RPM_CLIENTS-EL7_FILE_NAME "${CPACK_RPM_CLIENTS-EL7_PACKAGE_NAME}-${FDB_PACKAGE_VERSION}${package_version_postfix}.el7.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-EL7_DEBUGINFO_FILE_NAME "${CPACK_RPM_CLIENTS-EL7_PACKAGE_NAME}-${FDB_PACKAGE_VERSION}${package_version_postfix}.el7-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-EL7_PRE_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preclients.sh)
|
||||
set(CPACK_RPM_CLIENTS-EL7_POST_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/postclients.sh)
|
||||
set(CPACK_RPM_CLIENTS-EL7_USER_FILELIST "%dir /etc/foundationdb")
|
||||
|
||||
set(CPACK_RPM_CLIENTS-EL7_FILE_NAME "${rpm-clients-filename}.el7.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_FILE_NAME "${rpm-clients-versioned-filename}.versioned.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-EL7_FILE_NAME "${rpm-server-filename}.el7.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_FILE_NAME "${rpm-server-versioned-filename}.versioned.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-EL7_PACKAGE_NAME "${CPACK_RPM_PACKAGE_NAME}-server")
|
||||
set(CPACK_RPM_SERVER-EL7_FILE_NAME "${CPACK_RPM_SERVER-EL7_PACKAGE_NAME}-${FDB_PACKAGE_VERSION}${package_version_postfix}.el7.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-EL7_DEBUGINFO_FILE_NAME "${CPACK_RPM_SERVER-EL7_PACKAGE_NAME}-${FDB_PACKAGE_VERSION}${package_version_postfix}.el7-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-EL7_PACKAGE_REQUIRES "${CPACK_RPM_CLIENTS-EL7_PACKAGE_NAME} = ${FDB_PACKAGE_VERSION}")
|
||||
set(CPACK_RPM_SERVER-EL7_PRE_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preserver.sh)
|
||||
set(CPACK_RPM_SERVER-EL7_POST_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/postserver.sh)
|
||||
set(CPACK_RPM_SERVER-EL7_PRE_UNINSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preunserver.sh)
|
||||
set(CPACK_RPM_SERVER-EL7_USER_FILELIST "%config(noreplace) /etc/foundationdb/foundationdb.conf"
|
||||
"%attr(0700,foundationdb,foundationdb) /var/log/foundationdb"
|
||||
"%attr(0700,foundationdb,foundationdb) /var/lib/foundationdb")
|
||||
|
||||
set(CPACK_RPM_CLIENTS-EL7_DEBUGINFO_FILE_NAME "${rpm-clients-filename}.el7-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_DEBUGINFO_FILE_NAME "${rpm-clients-versioned-filename}.versioned-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-EL7_DEBUGINFO_FILE_NAME "${rpm-server-filename}.el7-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_DEBUGINFO_FILE_NAME "${rpm-server-versioned-filename}.versioned-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_PACKAGE_NAME "${CPACK_RPM_PACKAGE_NAME}${FDB_PACKAGE_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-clients")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_FILE_NAME "${CPACK_RPM_CLIENTS-VERSIONED_PACKAGE_NAME}${git_string}.versioned.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_DEBUGINFO_FILE_NAME "${CPACK_RPM_CLIENTS-VERSIONED_PACKAGE_NAME}${git_string}.versioned-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_POST_INSTALL_SCRIPT_FILE ${CMAKE_BINARY_DIR}/packaging/multiversion/clients/postinst-el7)
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_PRE_UNINSTALL_SCRIPT_FILE ${CMAKE_BINARY_DIR}/packaging/multiversion/clients/prerm)
|
||||
|
||||
set(CPACK_RPM_SERVER-VERSIONED_PACKAGE_NAME "${CPACK_RPM_PACKAGE_NAME}${FDB_PACKAGE_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-server")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_FILE_NAME "${CPACK_RPM_SERVER-VERSIONED_PACKAGE_NAME}${git_string}.versioned.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_DEBUGINFO_FILE_NAME "${CPACK_RPM_SERVER-VERSIONED_PACKAGE_NAME}${git_string}.versioned-debuginfo.${CMAKE_SYSTEM_PROCESSOR}.rpm")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_PACKAGE_REQUIRES "${CPACK_COMPONENT_CLIENTS-VERSIONED_DISPLAY_NAME}")
|
||||
set(CPACK_RPM_SERVER-VERSIONED_POST_INSTALL_SCRIPT_FILE ${CMAKE_BINARY_DIR}/packaging/multiversion/server/postinst-rpm)
|
||||
set(CPACK_RPM_SERVER-VERSIONED_PRE_UNINSTALL_SCRIPT_FILE ${CMAKE_BINARY_DIR}/packaging/multiversion/server/prerm)
|
||||
|
||||
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/packaging/emptydir")
|
||||
fdb_install(DIRECTORY "${CMAKE_BINARY_DIR}/packaging/emptydir/" DESTINATION data COMPONENT server)
|
||||
fdb_install(DIRECTORY "${CMAKE_BINARY_DIR}/packaging/emptydir/" DESTINATION log COMPONENT server)
|
||||
fdb_install(DIRECTORY "${CMAKE_BINARY_DIR}/packaging/emptydir/" DESTINATION etc COMPONENT clients)
|
||||
|
||||
set(CPACK_RPM_SERVER-EL7_USER_FILELIST
|
||||
"%config(noreplace) /etc/foundationdb/foundationdb.conf"
|
||||
"%attr(0700,foundationdb,foundationdb) /var/log/foundationdb"
|
||||
"%attr(0700, foundationdb, foundationdb) /var/lib/foundationdb")
|
||||
set(CPACK_RPM_CLIENTS-EL7_USER_FILELIST "%dir /etc/foundationdb")
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
|
||||
"/usr/sbin"
|
||||
"/usr/share/java"
|
||||
|
@ -304,56 +296,26 @@ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
|
|||
"/usr/lib/pkgconfig"
|
||||
"/usr/lib/foundationdb"
|
||||
"/usr/lib/cmake"
|
||||
"/usr/lib/foundationdb-${FDB_VERSION}/etc/foundationdb"
|
||||
"/usr/lib/foundationdb-${FDB_VERSION}${FDB_BUILDTIME_STRING}/etc/foundationdb"
|
||||
)
|
||||
set(CPACK_RPM_DEBUGINFO_PACKAGE ${GENERATE_DEBUG_PACKAGES})
|
||||
#set(CPACK_RPM_BUILD_SOURCE_FDB_INSTALL_DIRS_PREFIX /usr/src)
|
||||
set(CPACK_RPM_COMPONENT_INSTALL ON)
|
||||
|
||||
set(CPACK_RPM_clients-el7_PRE_INSTALL_SCRIPT_FILE
|
||||
${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preclients.sh)
|
||||
|
||||
set(CPACK_RPM_CLIENTS-EL7_POST_INSTALL_SCRIPT_FILE
|
||||
${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/postclients.sh)
|
||||
|
||||
set(CPACK_RPM_SERVER-EL7_PRE_INSTALL_SCRIPT_FILE
|
||||
${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preserver.sh)
|
||||
|
||||
set(CPACK_RPM_SERVER-EL7_POST_INSTALL_SCRIPT_FILE
|
||||
${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/postserver.sh)
|
||||
|
||||
set(CPACK_RPM_SERVER-EL7_PRE_UNINSTALL_SCRIPT_FILE
|
||||
${CMAKE_SOURCE_DIR}/packaging/rpm/scripts/preunserver.sh)
|
||||
|
||||
set(CPACK_RPM_SERVER-EL7_PACKAGE_REQUIRES
|
||||
"foundationdb-clients = ${FDB_MAJOR}.${FDB_MINOR}.${FDB_PATCH}")
|
||||
|
||||
set(CPACK_RPM_SERVER-VERSIONED_POST_INSTALL_SCRIPT_FILE
|
||||
${CMAKE_BINARY_DIR}/packaging/multiversion/server/postinst-rpm)
|
||||
|
||||
set(CPACK_RPM_SERVER-VERSIONED_PRE_UNINSTALL_SCRIPT_FILE
|
||||
${CMAKE_BINARY_DIR}/packaging/multiversion/server/prerm)
|
||||
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_POST_INSTALL_SCRIPT_FILE
|
||||
${CMAKE_BINARY_DIR}/packaging/multiversion/clients/postinst-el7)
|
||||
|
||||
set(CPACK_RPM_CLIENTS-VERSIONED_PRE_UNINSTALL_SCRIPT_FILE
|
||||
${CMAKE_BINARY_DIR}/packaging/multiversion/clients/prerm)
|
||||
|
||||
################################################################################
|
||||
# Configuration for DEB
|
||||
################################################################################
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
||||
set(CPACK_DEBIAN_CLIENTS-DEB_FILE_NAME "${deb-clients-filename}_amd64.deb")
|
||||
set(CPACK_DEBIAN_SERVER-DEB_FILE_NAME "${deb-server-filename}_amd64.deb")
|
||||
set(CPACK_DEBIAN_CLIENTS-VERSIONED_FILE_NAME "${deb-clients-versioned-filename}.versioned_amd64.deb")
|
||||
set(CPACK_DEBIAN_SERVER-VERSIONED_FILE_NAME "${deb-server-versioned-filename}.versioned_amd64.deb")
|
||||
set(CPACK_DEBIAN_CLIENTS-DEB_FILE_NAME "foundationdb-clients_${FDB_VERSION}${package_version_postfix}_amd64.deb")
|
||||
set(CPACK_DEBIAN_SERVER-DEB_FILE_NAME "foundationdb-server_${FDB_VERSION}${package_version_postfix}_amd64.deb")
|
||||
set(CPACK_DEBIAN_CLIENTS-VERSIONED_FILE_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-clients${git_string}.versioned_amd64.deb")
|
||||
set(CPACK_DEBIAN_SERVER-VERSIONED_FILE_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-server${git_string}.versioned_amd64.deb")
|
||||
else()
|
||||
set(CPACK_DEBIAN_CLIENTS-DEB_FILE_NAME "${deb-clients-filename}_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_SERVER-DEB_FILE_NAME "${deb-server-filename}_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_CLIENTS-VERSIONED_FILE_NAME "${deb-clients-versioned-filename}.versioned_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_SERVER-VERSIONED_FILE_NAME "${deb-server-versioned-filename}.versioned_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_CLIENTS-DEB_FILE_NAME "foundationdb-clients_${FDB_VERSION}${package_version_postfix}_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_SERVER-DEB_FILE_NAME "foundationdb-server_${FDB_VERSION}${package_version_postfix}_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_CLIENTS-VERSIONED_FILE_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-clients${git_string}.versioned_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
set(CPACK_DEBIAN_SERVER-VERSIONED_FILE_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-server${git_string}.versioned_${CMAKE_SYSTEM_PROCESSOR}.deb")
|
||||
endif()
|
||||
|
||||
set(CPACK_DEB_COMPONENT_INSTALL ON)
|
||||
|
@ -363,8 +325,8 @@ set(CPACK_DEBIAN_ENABLE_COMPONENT_DEPENDS ON)
|
|||
|
||||
set(CPACK_DEBIAN_SERVER-DEB_PACKAGE_NAME "foundationdb-server")
|
||||
set(CPACK_DEBIAN_CLIENTS-DEB_PACKAGE_NAME "foundationdb-clients")
|
||||
set(CPACK_DEBIAN_SERVER-VERSIONED_PACKAGE_NAME "foundationdb${FDB_VERSION}-server")
|
||||
set(CPACK_DEBIAN_CLIENTS-VERSIONED_PACKAGE_NAME "foundationdb${FDB_VERSION}-clients")
|
||||
set(CPACK_DEBIAN_SERVER-VERSIONED_PACKAGE_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-server")
|
||||
set(CPACK_DEBIAN_CLIENTS-VERSIONED_PACKAGE_NAME "foundationdb${FDB_VERSION}${FDB_BUILDTIME_STRING}${PRERELEASE_TAG}-clients")
|
||||
|
||||
set(CPACK_DEBIAN_SERVER-DEB_PACKAGE_DEPENDS "adduser, libc6 (>= 2.12), foundationdb-clients (= ${FDB_VERSION})")
|
||||
set(CPACK_DEBIAN_SERVER-DEB_PACKAGE_RECOMMENDS "python (>= 2.6)")
|
||||
|
@ -387,12 +349,12 @@ set(CPACK_DEBIAN_SERVER-VERSIONED_PACKAGE_CONTROL_EXTRA
|
|||
${CMAKE_BINARY_DIR}/packaging/multiversion/server/prerm)
|
||||
|
||||
################################################################################
|
||||
# Configuration for DEB
|
||||
# Configuration for TGZ
|
||||
################################################################################
|
||||
|
||||
set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
|
||||
set(CPACK_ARCHIVE_CLIENTS-TGZ_FILE_NAME "${deb-clients-filename}.${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_ARCHIVE_SERVER-TGZ_FILE_NAME "${deb-server-filename}.${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_ARCHIVE_CLIENTS-TGZ_FILE_NAME "foundationdb-clients_${FDB_VERSION}${package_version_postfix}.${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CPACK_ARCHIVE_SERVER-TGZ_FILE_NAME "foundationdb-server_${FDB_VERSION}${package_version_postfix}.${CMAKE_SYSTEM_PROCESSOR}")
|
||||
|
||||
################################################################################
|
||||
# Server configuration
|
||||
|
@ -418,10 +380,10 @@ if(NOT WIN32)
|
|||
RENAME "foundationdb"
|
||||
COMPONENT server-deb)
|
||||
install(FILES ${CMAKE_SOURCE_DIR}/packaging/rpm/foundationdb.service
|
||||
DESTINATION "usr/lib/foundationdb-${FDB_VERSION}/lib/systemd/system"
|
||||
DESTINATION "usr/lib/foundationdb-${FDB_VERSION}${FDB_BUILDTIME_STRING}/lib/systemd/system"
|
||||
COMPONENT server-versioned)
|
||||
install(PROGRAMS ${CMAKE_SOURCE_DIR}/packaging/deb/foundationdb-init
|
||||
DESTINATION "usr/lib/foundationdb-${FDB_VERSION}/etc/init.d"
|
||||
DESTINATION "usr/lib/foundationdb-${FDB_VERSION}${FDB_BUILDTIME_STRING}/etc/init.d"
|
||||
RENAME "foundationdb"
|
||||
COMPONENT server-versioned)
|
||||
endif()
|
||||
|
|
|
@ -28,7 +28,6 @@ Features
|
|||
* Improved the efficiency with which storage servers replicate data between themselves. `(PR #5017) <https://github.com/apple/foundationdb/pull/5017>`_
|
||||
* Added support to ``exclude command`` to exclude based on locality match. `(PR #5113) <https://github.com/apple/foundationdb/pull/5113>`_
|
||||
* Add the ``trace_partial_file_suffix`` network option. This option will give unfinished trace files a special suffix to indicate they're not complete yet. When the trace file is complete, it is renamed to remove the suffix. `(PR #5328) <https://github.com/apple/foundationdb/pull/5328>`_
|
||||
* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layer can push some computations down to FDB, to improve latency and bandwidth when read. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
@ -85,8 +84,6 @@ Bindings
|
|||
* C: Added a function, ``fdb_database_create_snapshot``, to create a snapshot of the database. `(PR #4241) <https://github.com/apple/foundationdb/pull/4241/files>`_
|
||||
* C: Added ``fdb_database_get_main_thread_busyness`` function to report how busy a client's main thread is. `(PR #4504) <https://github.com/apple/foundationdb/pull/4504>`_
|
||||
* Java: Added ``Database.getMainThreadBusyness`` function to report how busy a client's main thread is. `(PR #4564) <https://github.com/apple/foundationdb/pull/4564>`_
|
||||
* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
|
||||
* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
|
||||
|
||||
Other Changes
|
||||
-------------
|
||||
|
|
|
@ -10,6 +10,7 @@ Release Notes
|
|||
Features
|
||||
--------
|
||||
* Added ``USE_GRV_CACHE`` transaction option to allow read versions to be locally cached on the client side for latency optimizations. `(PR #5725) <https://github.com/apple/foundationdb/pull/5725>`_ `(PR #6664) <https://github.com/apple/foundationdb/pull/6664>`_
|
||||
* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layer can push some computations down to FDB, to improve latency and bandwidth when read. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_, `(PR #6181) <https://github.com/apple/foundationdb/pull/6181>`_, etc..
|
||||
|
||||
Performance
|
||||
-----------
|
||||
|
@ -25,6 +26,8 @@ Status
|
|||
|
||||
Bindings
|
||||
--------
|
||||
* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
|
||||
* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
|
||||
|
||||
Other Changes
|
||||
-------------
|
||||
|
|
|
@ -29,6 +29,7 @@ set(FDBCLI_SRCS
|
|||
TriggerDDTeamInfoLogCommand.actor.cpp
|
||||
TssqCommand.actor.cpp
|
||||
Util.actor.cpp
|
||||
VersionEpochCommand.actor.cpp
|
||||
linenoise/linenoise.h)
|
||||
|
||||
if(NOT WIN32)
|
||||
|
|
|
@ -0,0 +1,174 @@
|
|||
/*
|
||||
* VersionEpochCommand.actor.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "boost/lexical_cast.hpp"
|
||||
|
||||
#include "fdbcli/fdbcli.actor.h"
|
||||
|
||||
#include "fdbclient/IClientApi.h"
|
||||
#include "fdbclient/ManagementAPI.actor.h"
|
||||
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/ThreadHelper.actor.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
namespace fdb_cli {
|
||||
|
||||
const KeyRef versionEpochSpecialKey = LiteralStringRef("\xff\xff/management/version_epoch");
|
||||
|
||||
struct VersionInfo {
|
||||
int64_t version;
|
||||
int64_t expectedVersion;
|
||||
};
|
||||
|
||||
ACTOR static Future<Optional<VersionInfo>> getVersionInfo(Reference<IDatabase> db) {
|
||||
state Reference<ITransaction> tr = db->createTransaction();
|
||||
loop {
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
|
||||
state Version rv = wait(safeThreadFutureToFuture(tr->getReadVersion()));
|
||||
state ThreadFuture<Optional<Value>> versionEpochValFuture = tr->get(versionEpochKey);
|
||||
Optional<Value> versionEpochVal = wait(safeThreadFutureToFuture(versionEpochValFuture));
|
||||
if (!versionEpochVal.present()) {
|
||||
return Optional<VersionInfo>();
|
||||
}
|
||||
int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochVal.get(), Unversioned());
|
||||
int64_t expected = g_network->timer() * CLIENT_KNOBS->CORE_VERSIONSPERSECOND - versionEpoch;
|
||||
return VersionInfo{ rv, expected };
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
}
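// Worked example (illustrative, not part of this change), assuming the usual
// CLIENT_KNOBS->CORE_VERSIONSPERSECOND of 1,000,000: with the version epoch set to 0,
// the expected version at wall-clock time t (seconds since the Unix epoch) is
// t * 1,000,000 (roughly 1.6e15 in early 2022), and setting the epoch to E
// (expressed in versions) lowers the expected version by E. The difference reported
// by the command below is expectedVersion - version, i.e. how far the cluster's
// version lags or leads the clock.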
|
||||
|
||||
ACTOR static Future<Optional<int64_t>> getVersionEpoch(Reference<ITransaction> tr) {
|
||||
loop {
|
||||
try {
|
||||
state ThreadFuture<Optional<Value>> versionEpochValFuture = tr->get(versionEpochSpecialKey);
|
||||
Optional<Value> versionEpochVal = wait(safeThreadFutureToFuture(versionEpochValFuture));
|
||||
return versionEpochVal.present() ? boost::lexical_cast<int64_t>(versionEpochVal.get().toString())
|
||||
: Optional<int64_t>();
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<bool> versionEpochCommandActor(Reference<IDatabase> db, Database cx, std::vector<StringRef> tokens) {
|
||||
if (tokens.size() <= 3) {
|
||||
state Reference<ITransaction> tr = db->createTransaction();
|
||||
if (tokens.size() == 1) {
|
||||
Optional<VersionInfo> versionInfo = wait(getVersionInfo(db));
|
||||
if (versionInfo.present()) {
|
||||
int64_t diff = versionInfo.get().expectedVersion - versionInfo.get().version;
|
||||
printf("Version: %" PRId64 "\n", versionInfo.get().version);
|
||||
printf("Expected: %" PRId64 "\n", versionInfo.get().expectedVersion);
|
||||
printf("Difference: %" PRId64 " (%.2fs)\n", diff, 1.0 * diff / CLIENT_KNOBS->VERSIONS_PER_SECOND);
|
||||
} else {
|
||||
printf("Version epoch is unset\n");
|
||||
}
|
||||
return true;
|
||||
} else if (tokens.size() == 2 && tokencmp(tokens[1], "get")) {
|
||||
Optional<int64_t> versionEpoch = wait(getVersionEpoch(db->createTransaction()));
|
||||
if (versionEpoch.present()) {
|
||||
printf("Current version epoch is %" PRId64 "\n", versionEpoch.get());
|
||||
} else {
|
||||
printf("Version epoch is unset\n");
|
||||
}
|
||||
return true;
|
||||
} else if (tokens.size() == 2 && tokencmp(tokens[1], "disable")) {
|
||||
// Clearing the version epoch means versions will no longer attempt
|
||||
// to advance at the same rate as the clock. The current version
|
||||
// will remain unchanged.
|
||||
loop {
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
|
||||
Optional<int64_t> versionEpoch = wait(getVersionEpoch(db->createTransaction()));
|
||||
if (!versionEpoch.present()) {
|
||||
return true;
|
||||
} else {
|
||||
tr->clear(versionEpochSpecialKey);
|
||||
wait(safeThreadFutureToFuture(tr->commit()));
|
||||
}
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
} else if ((tokens.size() == 2 && tokencmp(tokens[1], "enable")) ||
|
||||
(tokens.size() == 3 && tokencmp(tokens[1], "set"))) {
|
||||
state int64_t v;
|
||||
if (tokens.size() == 3) {
|
||||
int n = 0;
|
||||
if (sscanf(tokens[2].toString().c_str(), "%" SCNd64 "%n", &v, &n) != 1 || n != tokens[2].size()) {
|
||||
printUsage(tokens[0]);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
v = 0; // default version epoch
|
||||
}
|
||||
|
||||
loop {
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
|
||||
Optional<int64_t> versionEpoch = wait(getVersionEpoch(tr));
|
||||
if (!versionEpoch.present() || (versionEpoch.get() != v && tokens.size() == 3)) {
|
||||
tr->set(versionEpochSpecialKey, BinaryWriter::toValue(v, Unversioned()));
|
||||
wait(safeThreadFutureToFuture(tr->commit()));
|
||||
} else {
|
||||
printf("Version epoch enabled. Run `versionepoch commit` to irreversibly jump to the target "
|
||||
"version\n");
|
||||
return true;
|
||||
}
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
} else if (tokens.size() == 2 && tokencmp(tokens[1], "commit")) {
|
||||
Optional<VersionInfo> versionInfo = wait(getVersionInfo(db));
|
||||
if (versionInfo.present()) {
|
||||
wait(advanceVersion(cx, versionInfo.get().expectedVersion));
|
||||
} else {
|
||||
printf("Must set the version epoch before committing it (see `versionepoch enable`)\n");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
printUsage(tokens[0]);
|
||||
return false;
|
||||
}
|
||||
|
||||
CommandFactory versionEpochFactory(
|
||||
"versionepoch",
|
||||
CommandHelp("versionepoch [<enable|commit|set|disable> [EPOCH]]",
|
||||
"Read or write the version epoch",
|
||||
"If no arguments are specified, reports the offset between the expected version "
|
||||
"and the actual version. Otherwise, enables, disables, or commits the version epoch. "
|
||||
"Setting the version epoch can be irreversible since it can cause a large verison jump. "
|
||||
"Thus, the version epoch must first by enabled with the enable or set command. This "
|
||||
"causes a recovery. Once the version epoch has been set, versions may be given out at "
|
||||
"a faster or slower rate to attempt to match the actual version to the expected version, "
|
||||
"based on the version epoch. After setting the version, run the commit command to perform "
|
||||
"a one time jump to the expected version. This is useful when there is a very large gap "
|
||||
"between the current version and the expected version. Note that once a version jump has "
|
||||
"occurred, it cannot be undone. Run this command without any arguments to see the current "
|
||||
"and expected version."));
|
||||
} // namespace fdb_cli
|
|
@ -1646,6 +1646,13 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (tokencmp(tokens[0], "versionepoch")) {
|
||||
bool _result = wait(makeInterruptable(versionEpochCommandActor(db, localDb, tokens)));
|
||||
if (!_result)
|
||||
is_error = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tokencmp(tokens[0], "kill")) {
|
||||
getTransaction(db, managementTenant, tr, options, intrans);
|
||||
bool _result = wait(makeInterruptable(killCommandActor(db, tr, tokens, &address_interface)));
|
||||
|
|
|
@ -210,6 +210,10 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
|
|||
ACTOR Future<bool> triggerddteaminfologCommandActor(Reference<IDatabase> db);
|
||||
// tssq command
|
||||
ACTOR Future<bool> tssqCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
|
||||
// versionepoch command
|
||||
ACTOR Future<bool> versionEpochCommandActor(Reference<IDatabase> db, Database cx, std::vector<StringRef> tokens);
|
||||
// targetversion command
|
||||
ACTOR Future<bool> targetVersionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
|
||||
|
||||
} // namespace fdb_cli
|
||||
|
||||
|
|
|
@ -44,7 +44,18 @@ struct BlobWorkerInterface {
|
|||
BlobWorkerInterface() {}
|
||||
explicit BlobWorkerInterface(const struct LocalityData& l, UID id) : locality(l), myId(id) {}
|
||||
|
||||
void initEndpoints() {}
|
||||
void initEndpoints() {
|
||||
// TODO: specify endpoint priorities?
|
||||
std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
|
||||
streams.push_back(waitFailure.getReceiver());
|
||||
streams.push_back(blobGranuleFileRequest.getReceiver());
|
||||
streams.push_back(assignBlobRangeRequest.getReceiver());
|
||||
streams.push_back(revokeBlobRangeRequest.getReceiver());
|
||||
streams.push_back(granuleAssignmentsRequest.getReceiver());
|
||||
streams.push_back(granuleStatusStreamRequest.getReceiver());
|
||||
streams.push_back(haltBlobWorker.getReceiver());
|
||||
FlowTransport::transport().addEndpoints(streams);
|
||||
}
|
||||
UID id() const { return myId; }
|
||||
NetworkAddress address() const { return blobGranuleFileRequest.getEndpoint().getPrimaryAddress(); }
|
||||
NetworkAddress stableAddress() const { return blobGranuleFileRequest.getEndpoint().getStableAddress(); }
|
||||
|
@ -54,16 +65,22 @@ struct BlobWorkerInterface {
|
|||
|
||||
template <class Archive>
|
||||
void serialize(Archive& ar) {
|
||||
serializer(ar,
|
||||
waitFailure,
|
||||
blobGranuleFileRequest,
|
||||
assignBlobRangeRequest,
|
||||
revokeBlobRangeRequest,
|
||||
granuleAssignmentsRequest,
|
||||
granuleStatusStreamRequest,
|
||||
haltBlobWorker,
|
||||
locality,
|
||||
myId);
|
||||
// use adjusted endpoints
|
||||
serializer(ar, myId, locality, waitFailure);
|
||||
if (Archive::isDeserializing) {
|
||||
blobGranuleFileRequest =
|
||||
RequestStream<struct BlobGranuleFileRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(1));
|
||||
assignBlobRangeRequest =
|
||||
RequestStream<struct AssignBlobRangeRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(2));
|
||||
revokeBlobRangeRequest =
|
||||
RequestStream<struct RevokeBlobRangeRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(3));
|
||||
granuleAssignmentsRequest =
|
||||
RequestStream<struct GetGranuleAssignmentsRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(4));
|
||||
granuleStatusStreamRequest =
|
||||
RequestStream<struct GranuleStatusStreamRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(5));
|
||||
haltBlobWorker =
|
||||
RequestStream<struct HaltBlobWorkerRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(6));
|
||||
}
|
||||
}
|
||||
};
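// Illustrative note, not part of this change: this is the private-request-streams
// pattern. initEndpoints() registers every receiver in a fixed order (waitFailure at
// index 0, blobGranuleFileRequest at index 1, and so on), only the base waitFailure
// stream is serialized, and a deserializing peer rebuilds the remaining streams from
// waitFailure.getEndpoint().getAdjustedEndpoint(1..6). The indices passed to
// getAdjustedEndpoint() must stay in sync with the registration order in
// initEndpoints(). A minimal hypothetical interface using the same pattern (names are
// examples only):
//
//   struct ExampleInterface {
//       RequestStream<ReplyPromise<Void>> waitFailure;       // index 0, serialized
//       RequestStream<struct ExampleRequest> exampleRequest; // index 1, rebuilt
//       void initEndpoints() {
//           std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
//           streams.push_back(waitFailure.getReceiver());
//           streams.push_back(exampleRequest.getReceiver());
//           FlowTransport::transport().addEndpoints(streams);
//       }
//       template <class Ar>
//       void serialize(Ar& ar) {
//           serializer(ar, waitFailure);
//           if (Ar::isDeserializing) {
//               exampleRequest = RequestStream<struct ExampleRequest>(
//                   waitFailure.getEndpoint().getAdjustedEndpoint(1));
//           }
//       }
//   };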
|
||||
|
||||
|
|
|
@ -150,6 +150,7 @@ set(FDBCLIENT_SRCS
|
|||
VersionedMap.actor.h
|
||||
VersionedMap.h
|
||||
VersionedMap.cpp
|
||||
VersionVector.h
|
||||
WellKnownEndpoints.h
|
||||
WriteMap.h
|
||||
json_spirit/json_spirit_error_position.h
|
||||
|
|
|
@ -56,6 +56,7 @@ void ClientKnobs::initialize(Randomize randomize) {
|
|||
init( MAX_COMMIT_PROXY_CONNECTIONS, 5 ); if( randomize && BUGGIFY ) MAX_COMMIT_PROXY_CONNECTIONS = 1;
|
||||
init( MAX_GRV_PROXY_CONNECTIONS, 3 ); if( randomize && BUGGIFY ) MAX_GRV_PROXY_CONNECTIONS = 1;
|
||||
init( STATUS_IDLE_TIMEOUT, 120.0 );
|
||||
init( SEND_ENTIRE_VERSION_VECTOR, false );
|
||||
|
||||
// wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin
|
||||
|
||||
|
|
|
@ -55,6 +55,7 @@ public:
|
|||
int MAX_COMMIT_PROXY_CONNECTIONS;
|
||||
int MAX_GRV_PROXY_CONNECTIONS;
|
||||
double STATUS_IDLE_TIMEOUT;
|
||||
bool SEND_ENTIRE_VERSION_VECTOR;
|
||||
|
||||
// wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin
|
||||
double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/TagThrottle.actor.h"
|
||||
#include "fdbclient/GlobalConfig.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
|
||||
#include "fdbrpc/Stats.h"
|
||||
#include "fdbrpc/TimedRequest.h"
|
||||
|
@ -205,6 +206,9 @@ struct GetReadVersionReply : public BasicLoadBalancedReply {
|
|||
|
||||
TransactionTagMap<ClientTagThrottleLimits> tagThrottleInfo;
|
||||
|
||||
VersionVector ssVersionVectorDelta;
|
||||
UID proxyId; // GRV proxy ID to detect old GRV proxies at client side
|
||||
|
||||
GetReadVersionReply() : version(invalidVersion), locked(false) {}
|
||||
|
||||
template <class Ar>
|
||||
|
@ -217,7 +221,9 @@ struct GetReadVersionReply : public BasicLoadBalancedReply {
|
|||
tagThrottleInfo,
|
||||
midShardSize,
|
||||
rkDefaultThrottled,
|
||||
rkBatchThrottled);
|
||||
rkBatchThrottled,
|
||||
ssVersionVectorDelta,
|
||||
proxyId);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -246,15 +252,18 @@ struct GetReadVersionRequest : TimedRequest {
|
|||
Optional<UID> debugID;
|
||||
ReplyPromise<GetReadVersionReply> reply;
|
||||
|
||||
GetReadVersionRequest() : transactionCount(1), flags(0) {}
|
||||
Version maxVersion; // max version in the client's version vector cache
|
||||
|
||||
GetReadVersionRequest() : transactionCount(1), flags(0), maxVersion(invalidVersion) {}
|
||||
GetReadVersionRequest(SpanID spanContext,
|
||||
uint32_t transactionCount,
|
||||
TransactionPriority priority,
|
||||
Version maxVersion,
|
||||
uint32_t flags = 0,
|
||||
TransactionTagMap<uint32_t> tags = TransactionTagMap<uint32_t>(),
|
||||
Optional<UID> debugID = Optional<UID>())
|
||||
: spanContext(spanContext), transactionCount(transactionCount), flags(flags), priority(priority), tags(tags),
|
||||
debugID(debugID) {
|
||||
debugID(debugID), maxVersion(maxVersion) {
|
||||
flags = flags & ~FLAG_PRIORITY_MASK;
|
||||
switch (priority) {
|
||||
case TransactionPriority::BATCH:
|
||||
|
@ -275,7 +284,7 @@ struct GetReadVersionRequest : TimedRequest {
|
|||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, transactionCount, flags, tags, debugID, reply, spanContext);
|
||||
serializer(ar, transactionCount, flags, tags, debugID, reply, spanContext, maxVersion);
|
||||
|
||||
if (ar.isDeserializing) {
|
||||
if ((flags & PRIORITY_SYSTEM_IMMEDIATE) == PRIORITY_SYSTEM_IMMEDIATE) {
|
||||
|
@ -300,9 +309,16 @@ struct GetKeyServerLocationsReply {
|
|||
// if any storage servers in results have a TSS pair, that mapping is in here
|
||||
std::vector<std::pair<UID, StorageServerInterface>> resultsTssMapping;
|
||||
|
||||
// maps storage server interfaces (captured in "results") to the tags of
|
||||
// their corresponding storage servers
|
||||
// @note this map allows the client to identify the latest commit versions
|
||||
// of storage servers (the version vector, which captures the latest commit
|
||||
// versions of storage servers, identifies storage servers by their tags).
|
||||
std::vector<std::pair<UID, Tag>> resultsTagMapping;
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, results, resultsTssMapping, tenantEntry, arena);
|
||||
serializer(ar, results, resultsTssMapping, tenantEntry, arena, resultsTagMapping);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -348,6 +364,7 @@ struct GetRawCommittedVersionReply {
|
|||
bool locked;
|
||||
Optional<Value> metadataVersion;
|
||||
Version minKnownCommittedVersion;
|
||||
VersionVector ssVersionVectorDelta;
|
||||
|
||||
GetRawCommittedVersionReply()
|
||||
: debugID(Optional<UID>()), version(invalidVersion), locked(false), metadataVersion(Optional<Value>()),
|
||||
|
@ -355,7 +372,7 @@ struct GetRawCommittedVersionReply {
|
|||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, debugID, version, locked, metadataVersion, minKnownCommittedVersion);
|
||||
serializer(ar, debugID, version, locked, metadataVersion, minKnownCommittedVersion, ssVersionVectorDelta);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -364,14 +381,17 @@ struct GetRawCommittedVersionRequest {
|
|||
SpanID spanContext;
|
||||
Optional<UID> debugID;
|
||||
ReplyPromise<GetRawCommittedVersionReply> reply;
|
||||
Version maxVersion; // max version in the grv proxy's version vector cache
|
||||
|
||||
explicit GetRawCommittedVersionRequest(SpanID spanContext, Optional<UID> const& debugID = Optional<UID>())
|
||||
: spanContext(spanContext), debugID(debugID) {}
|
||||
explicit GetRawCommittedVersionRequest() : spanContext(), debugID() {}
|
||||
explicit GetRawCommittedVersionRequest(SpanID spanContext,
|
||||
Optional<UID> const& debugID = Optional<UID>(),
|
||||
Version maxVersion = invalidVersion)
|
||||
: spanContext(spanContext), debugID(debugID), maxVersion(maxVersion) {}
|
||||
explicit GetRawCommittedVersionRequest() : spanContext(), debugID(), maxVersion(invalidVersion) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, debugID, reply, spanContext);
|
||||
serializer(ar, debugID, reply, spanContext, maxVersion);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -177,27 +177,40 @@ static inline bool isNonAssociativeOp(MutationRef::Type mutationType) {
|
|||
}
|
||||
|
||||
struct CommitTransactionRef {
|
||||
CommitTransactionRef() : read_snapshot(0), report_conflicting_keys(false) {}
|
||||
CommitTransactionRef() = default;
|
||||
CommitTransactionRef(Arena& a, const CommitTransactionRef& from)
|
||||
: read_conflict_ranges(a, from.read_conflict_ranges), write_conflict_ranges(a, from.write_conflict_ranges),
|
||||
mutations(a, from.mutations), read_snapshot(from.read_snapshot),
|
||||
report_conflicting_keys(from.report_conflicting_keys) {}
|
||||
report_conflicting_keys(from.report_conflicting_keys), lock_aware(from.lock_aware),
|
||||
spanContext(from.spanContext) {}
|
||||
|
||||
VectorRef<KeyRangeRef> read_conflict_ranges;
|
||||
VectorRef<KeyRangeRef> write_conflict_ranges;
|
||||
VectorRef<MutationRef> mutations;
|
||||
Version read_snapshot;
|
||||
bool report_conflicting_keys;
|
||||
VectorRef<MutationRef> mutations; // metadata mutations
|
||||
Version read_snapshot = 0;
|
||||
bool report_conflicting_keys = false;
|
||||
bool lock_aware = false; // set when metadata mutations are present
|
||||
Optional<SpanID> spanContext;
|
||||
|
||||
template <class Ar>
|
||||
force_inline void serialize(Ar& ar) {
|
||||
if constexpr (is_fb_function<Ar>) {
|
||||
serializer(
|
||||
ar, read_conflict_ranges, write_conflict_ranges, mutations, read_snapshot, report_conflicting_keys);
|
||||
serializer(ar,
|
||||
read_conflict_ranges,
|
||||
write_conflict_ranges,
|
||||
mutations,
|
||||
read_snapshot,
|
||||
report_conflicting_keys,
|
||||
lock_aware,
|
||||
spanContext);
|
||||
} else {
|
||||
serializer(ar, read_conflict_ranges, write_conflict_ranges, mutations, read_snapshot);
|
||||
if (ar.protocolVersion().hasReportConflictingKeys()) {
|
||||
serializer(ar, report_conflicting_keys);
|
||||
}
|
||||
if (ar.protocolVersion().hasSpanContext()) {
|
||||
serializer(ar, lock_aware, spanContext);
|
||||
}
|
||||
}
|
||||
}
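// Note (illustrative, not part of this change): on the non-flatbuffers path the new
// fields are protocol-gated, so a peer on a protocol version without hasSpanContext()
// never sends or expects lock_aware/spanContext, and a deserializing reader simply
// keeps their defaults (false and an empty Optional). That is what keeps this struct
// wire-compatible with older releases.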
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#include "fdbclient/KeyRangeMap.h"
|
||||
#include "fdbclient/CommitProxyInterface.h"
|
||||
#include "fdbclient/SpecialKeySpace.actor.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
#include "fdbrpc/QueueModel.h"
|
||||
#include "fdbrpc/MultiInterface.h"
|
||||
#include "flow/TDMetric.actor.h"
|
||||
|
@ -288,6 +289,7 @@ public:
|
|||
Reference<CommitProxyInfo> getCommitProxies(UseProvisionalProxies useProvisionalProxies);
|
||||
Future<Reference<CommitProxyInfo>> getCommitProxiesFuture(UseProvisionalProxies useProvisionalProxies);
|
||||
Reference<GrvProxyInfo> getGrvProxies(UseProvisionalProxies useProvisionalProxies);
|
||||
bool isCurrentGrvProxy(UID proxyId) const;
|
||||
Future<Void> onProxiesChanged() const;
|
||||
Future<HealthMetrics> getHealthMetrics(bool detailed);
|
||||
// Pass a negative value for `shardLimit` to indicate no limit on the shard number.
|
||||
|
@ -372,6 +374,9 @@ public:
|
|||
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
|
||||
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
|
||||
|
||||
Future<Key> purgeBlobGranules(KeyRange keyRange, Version purgeVersion, bool force = false);
|
||||
Future<Void> waitPurgeGranulesComplete(Key purgeKey);
|
||||
|
||||
// private:
|
||||
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,
|
||||
Reference<AsyncVar<ClientDBInfo>> clientDBInfo,
|
||||
|
@ -463,6 +468,12 @@ public:
|
|||
|
||||
Reference<ChangeFeedStorageData> getStorageData(StorageServerInterface interf);
|
||||
|
||||
// map from ssid -> ss tag
|
||||
// @note this map allows the client to identify the latest commit versions
|
||||
// of storage servers (note that "ssVersionVectorCache" identifies storage
|
||||
// servers by their tags).
|
||||
std::unordered_map<UID, Tag> ssidTagMapping;
|
||||
|
||||
UID dbId;
|
||||
IsInternal internal; // Only contexts created through the C client and fdbcli are non-internal
|
||||
|
||||
|
@ -513,6 +524,7 @@ public:
|
|||
Counter transactionsExpensiveClearCostEstCount;
|
||||
Counter transactionGrvFullBatches;
|
||||
Counter transactionGrvTimedOutBatches;
|
||||
Counter transactionsStaleVersionVectors;
|
||||
|
||||
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
|
||||
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
|
||||
|
@ -589,6 +601,9 @@ public:
|
|||
static bool debugUseTags;
|
||||
static const std::vector<std::string> debugTransactionTagChoices;
|
||||
|
||||
// Cache of the latest commit versions of storage servers.
|
||||
VersionVector ssVersionVectorCache;
|
||||
|
||||
// Adds or updates the specified (SS, TSS) pair in the TSS mapping (if not already present).
|
||||
// Requests to the storage server will be duplicated to the TSS.
|
||||
void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi);
|
||||
|
@ -597,6 +612,17 @@ public:
|
|||
// Requests to the storage server will no longer be duplicated to its pair TSS.
|
||||
void removeTssMapping(StorageServerInterface const& ssi);
|
||||
|
||||
// Adds or updates the specified (UID, Tag) pair in the tag mapping.
|
||||
void addSSIdTagMapping(const UID& uid, const Tag& tag);
|
||||
|
||||
// Returns the latest commit versions that mutated the specified storage servers
|
||||
// @note returns the latest commit version for a storage server only if the latest
|
||||
// commit version of that storage server is below the specified "readVersion".
|
||||
void getLatestCommitVersions(const Reference<LocationInfo>& locationInfo,
|
||||
Version readVersion,
|
||||
Reference<TransactionState> info,
|
||||
VersionVector& latestCommitVersions);
|
||||
|
||||
// used in template functions to create a transaction
|
||||
using TransactionT = ReadYourWritesTransaction;
|
||||
Reference<TransactionT> createTransaction();
|
||||
|
|
|
@ -22,10 +22,12 @@
|
|||
#define FDBCLIENT_FDBTYPES_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
#include <boost/functional/hash.hpp>
|
||||
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/FastRef.h"
|
||||
|
@ -94,6 +96,8 @@ struct Tag {
|
|||
|
||||
int toTagDataIndex() const { return locality >= 0 ? 2 * locality : 1 - (2 * locality); }
|
||||
|
||||
bool isNonPrimaryTLogType() const { return locality < 0; }
|
||||
|
||||
std::string toString() const { return format("%d:%d", locality, id); }
|
||||
|
||||
template <class Ar>
|
||||
|
@ -147,6 +151,18 @@ struct Traceable<Tag> : std::true_type {
|
|||
static std::string toString(const Tag& value) { return value.toString(); }
|
||||
};
|
||||
|
||||
namespace std {
template <>
struct hash<Tag> {
	std::size_t operator()(const Tag& tag) const {
		std::size_t seed = 0;
		boost::hash_combine(seed, std::hash<int8_t>{}(tag.locality));
		boost::hash_combine(seed, std::hash<uint16_t>{}(tag.id));
		return seed;
	}
};
} // namespace std

static const Tag invalidTag{ tagLocalitySpecial, 0 };
static const Tag txsTag{ tagLocalitySpecial, 1 };
static const Tag cacheTag{ tagLocalitySpecial, 2 };
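
With the std::hash<Tag> specialization above, Tag can be used directly as a key in unordered containers, which is presumably why <unordered_set> and <boost/functional/hash.hpp> are added at the top of this file. A minimal sketch, assuming only this header:

// Sketch only: unordered containers of Tag now work out of the box.
std::unordered_set<Tag> seenTags;
seenTags.insert(cacheTag);
seenTags.insert(Tag{ tagLocalitySpecial, 3 }); // illustrative tag value
bool hasCacheTag = seenTags.count(cacheTag) > 0;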
|
||||
|
|
|
@ -29,6 +29,8 @@
|
|||
|
||||
#include "flow/ThreadHelper.actor.h"
|
||||
|
||||
struct VersionVector;
|
||||
|
||||
// An interface that represents a transaction created by a client
|
||||
class ITransaction {
|
||||
public:
|
||||
|
@ -94,6 +96,11 @@ public:
|
|||
|
||||
virtual ThreadFuture<Void> commit() = 0;
|
||||
virtual Version getCommittedVersion() = 0;
|
||||
// @todo This API and the "getSpanID()" API may help with debugging simulation
|
||||
// test failures. (These APIs are not currently invoked anywhere.) Remove them
|
||||
// later if they are not really needed.
|
||||
virtual VersionVector getVersionVector() = 0;
|
||||
virtual UID getSpanID() = 0;
|
||||
virtual ThreadFuture<int64_t> getApproximateSize() = 0;
|
||||
|
||||
virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
|
||||
|
@ -152,6 +159,11 @@ public:
|
|||
// Management API, create snapshot
|
||||
virtual ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) = 0;
|
||||
|
||||
// Purge blob granules API. purgeBlobGranules is asynchronous; calling waitPurgeGranulesComplete
// afterwards guarantees completion.
virtual ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0;
virtual ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0;
|
||||
|
||||
// Interface to manage shared state across multiple connections to the same Database
|
||||
virtual ThreadFuture<DatabaseSharedState*> createSharedState() = 0;
|
||||
virtual void setSharedState(DatabaseSharedState* p) = 0;
|
||||
|
|
|
@ -44,6 +44,8 @@ public:
|
|||
|
||||
// Not implemented:
|
||||
void setVersion(Version) override { throw client_invalid_operation(); }
|
||||
VersionVector getVersionVector() const override { throw client_invalid_operation(); }
|
||||
UID getSpanID() const override { throw client_invalid_operation(); }
|
||||
Future<Key> getKey(KeySelector const& key, Snapshot snapshot = Snapshot::False) override {
|
||||
throw client_invalid_operation();
|
||||
}
|
||||
|
|
|
@ -94,6 +94,8 @@ public:
|
|||
virtual void addWriteConflictRange(KeyRangeRef const& keys) = 0;
|
||||
virtual Future<Void> commit() = 0;
|
||||
virtual Version getCommittedVersion() const = 0;
|
||||
virtual VersionVector getVersionVector() const = 0;
|
||||
virtual UID getSpanID() const = 0;
|
||||
virtual int64_t getApproximateSize() const = 0;
|
||||
virtual Future<Standalone<StringRef>> getVersionstamp() = 0;
|
||||
virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
|
||||
|
|
|
@ -364,6 +364,32 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString") {
|
|||
return Void();
|
||||
}
|
||||
|
||||
TEST_CASE("/fdbclient/MonitorLeader/PartialResolve") {
|
||||
std::string connectionString = "TestCluster:0@host.name:1234,host-name:5678";
|
||||
std::string hn = "host-name", port = "5678";
|
||||
|
||||
state NetworkAddress address = NetworkAddress::parse("1.0.0.0:5678");
|
||||
|
||||
INetworkConnections::net()->addMockTCPEndpoint(hn, port, { address });
|
||||
|
||||
state ClusterConnectionString cs(connectionString);
|
||||
|
||||
state std::unordered_set<NetworkAddress> coordinatorAddresses;
|
||||
std::vector<Future<Void>> fs;
|
||||
for (auto& hostname : cs.hostnames) {
|
||||
fs.push_back(map(hostname.resolve(), [&](Optional<NetworkAddress> const& addr) -> Void {
|
||||
if (addr.present()) {
|
||||
coordinatorAddresses.insert(addr.get());
|
||||
}
|
||||
return Void();
|
||||
}));
|
||||
}
|
||||
wait(waitForAll(fs));
|
||||
ASSERT(coordinatorAddresses.size() == 1 && coordinatorAddresses.count(address) == 1);
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
TEST_CASE("/flow/FlatBuffers/LeaderInfo") {
|
||||
{
|
||||
LeaderInfo in;
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "fdbclient/MultiVersionAssignmentVars.h"
|
||||
#include "fdbclient/ClientVersion.h"
|
||||
#include "fdbclient/LocalClientAPI.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
|
||||
#include "flow/ThreadPrimitives.h"
|
||||
#include "flow/network.h"
|
||||
|
@ -386,6 +387,10 @@ void DLTransaction::reset() {
|
|||
api->transactionReset(tr);
|
||||
}
|
||||
|
||||
VersionVector DLTransaction::getVersionVector() {
|
||||
return VersionVector(); // not implemented
|
||||
}
|
||||
|
||||
// DLTenant
|
||||
Reference<ITransaction> DLTenant::createTransaction() {
|
||||
ASSERT(api->tenantCreateTransaction != nullptr);
|
||||
|
@ -516,6 +521,38 @@ ThreadFuture<ProtocolVersion> DLDatabase::getServerProtocol(Optional<ProtocolVer
|
|||
});
|
||||
}
|
||||
|
||||
ThreadFuture<Key> DLDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
|
||||
if (!api->purgeBlobGranules) {
|
||||
return unsupported_operation();
|
||||
}
|
||||
FdbCApi::FDBFuture* f = api->purgeBlobGranules(db,
|
||||
keyRange.begin.begin(),
|
||||
keyRange.begin.size(),
|
||||
keyRange.end.begin(),
|
||||
keyRange.end.size(),
|
||||
purgeVersion,
|
||||
force);
|
||||
|
||||
return toThreadFuture<Key>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
|
||||
const uint8_t* key;
|
||||
int keyLength;
|
||||
FdbCApi::fdb_error_t error = api->futureGetKey(f, &key, &keyLength);
|
||||
ASSERT(!error);
|
||||
|
||||
// The memory for this is stored in the FDBFuture and is released when the future gets destroyed
|
||||
return Key(KeyRef(key, keyLength), Arena());
|
||||
});
|
||||
}
|
||||
|
||||
ThreadFuture<Void> DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
|
||||
if (!api->waitPurgeGranulesComplete) {
|
||||
return unsupported_operation();
|
||||
}
|
||||
|
||||
FdbCApi::FDBFuture* f = api->waitPurgeGranulesComplete(db, purgeKey.begin(), purgeKey.size());
|
||||
return toThreadFuture<Void>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); });
|
||||
}
|
||||
|
||||
// DLApi
|
||||
|
||||
// Loads the specified function from a dynamic library
|
||||
|
@ -590,6 +627,15 @@ void DLApi::init() {
|
|||
loadClientFunction(
|
||||
&api->databaseCreateSnapshot, lib, fdbCPath, "fdb_database_create_snapshot", headerVersion >= 700);
|
||||
|
||||
loadClientFunction(
|
||||
&api->purgeBlobGranules, lib, fdbCPath, "fdb_database_purge_blob_granules", headerVersion >= 710);
|
||||
|
||||
loadClientFunction(&api->waitPurgeGranulesComplete,
|
||||
lib,
|
||||
fdbCPath,
|
||||
"fdb_database_wait_purge_granules_complete",
|
||||
headerVersion >= 710);
|
||||
|
||||
loadClientFunction(
|
||||
&api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710);
|
||||
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710);
|
||||
|
@ -609,7 +655,7 @@ void DLApi::init() {
|
|||
headerVersion >= 0);
|
||||
loadClientFunction(&api->transactionGetRange, lib, fdbCPath, "fdb_transaction_get_range", headerVersion >= 0);
|
||||
loadClientFunction(
|
||||
&api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 700);
|
||||
&api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 710);
|
||||
loadClientFunction(
|
||||
&api->transactionGetVersionstamp, lib, fdbCPath, "fdb_transaction_get_versionstamp", headerVersion >= 410);
|
||||
loadClientFunction(&api->transactionSet, lib, fdbCPath, "fdb_transaction_set", headerVersion >= 0);
|
||||
|
@ -667,7 +713,7 @@ void DLApi::init() {
|
|||
loadClientFunction(
|
||||
&api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0);
|
||||
loadClientFunction(
|
||||
&api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 700);
|
||||
&api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 710);
|
||||
loadClientFunction(&api->futureGetSharedState, lib, fdbCPath, "fdb_future_get_shared_state", headerVersion >= 710);
|
||||
loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0);
|
||||
loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0);
|
||||
|
@ -866,6 +912,7 @@ void MultiVersionTransaction::setVersion(Version v) {
|
|||
tr.transaction->setVersion(v);
|
||||
}
|
||||
}
|
||||
|
||||
ThreadFuture<Version> MultiVersionTransaction::getReadVersion() {
|
||||
auto tr = getTransaction();
|
||||
auto f = tr.transaction ? tr.transaction->getReadVersion() : makeTimeout<Version>();
|
||||
|
@ -1053,6 +1100,24 @@ Version MultiVersionTransaction::getCommittedVersion() {
|
|||
return invalidVersion;
|
||||
}
|
||||
|
||||
VersionVector MultiVersionTransaction::getVersionVector() {
|
||||
auto tr = getTransaction();
|
||||
if (tr.transaction) {
|
||||
return tr.transaction->getVersionVector();
|
||||
}
|
||||
|
||||
return VersionVector();
|
||||
}
|
||||
|
||||
UID MultiVersionTransaction::getSpanID() {
|
||||
auto tr = getTransaction();
|
||||
if (tr.transaction) {
|
||||
return tr.transaction->getSpanID();
|
||||
}
|
||||
|
||||
return UID();
|
||||
}
|
||||
|
||||
ThreadFuture<int64_t> MultiVersionTransaction::getApproximateSize() {
|
||||
auto tr = getTransaction();
|
||||
auto f = tr.transaction ? tr.transaction->getApproximateSize() : makeTimeout<int64_t>();
|
||||
|
@ -1442,6 +1507,17 @@ double MultiVersionDatabase::getMainThreadBusyness() {
|
|||
return localClientBusyness;
|
||||
}
|
||||
|
||||
ThreadFuture<Key> MultiVersionDatabase::purgeBlobGranules(const KeyRangeRef& keyRange,
|
||||
Version purgeVersion,
|
||||
bool force) {
|
||||
auto f = dbState->db ? dbState->db->purgeBlobGranules(keyRange, purgeVersion, force) : ThreadFuture<Key>(Never());
|
||||
return abortableFuture(f, dbState->dbVar->get().onChange);
|
||||
}
|
||||
ThreadFuture<Void> MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
|
||||
auto f = dbState->db ? dbState->db->waitPurgeGranulesComplete(purgeKey) : ThreadFuture<Void>(Never());
|
||||
return abortableFuture(f, dbState->dbVar->get().onChange);
|
||||
}
|
||||
|
||||
// Returns the protocol version reported by the coordinator this client is connected to
|
||||
// If an expected version is given, the future won't return until the protocol version is different than expected
|
||||
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
|
||||
|
|
|
@ -156,6 +156,16 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
|
|||
double (*databaseGetMainThreadBusyness)(FDBDatabase* database);
|
||||
FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion);
|
||||
|
||||
FDBFuture* (*purgeBlobGranules)(FDBDatabase* db,
|
||||
uint8_t const* begin_key_name,
|
||||
int begin_key_name_length,
|
||||
uint8_t const* end_key_name,
|
||||
int end_key_name_length,
|
||||
int64_t purge_version,
|
||||
fdb_bool_t force);
|
||||
|
||||
FDBFuture* (*waitPurgeGranulesComplete)(FDBDatabase* db, uint8_t const* purge_key_name, int purge_key_name_length);
|
||||
|
||||
// Tenant
|
||||
fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);
|
||||
void (*tenantDestroy)(FDBTenant* tenant);
|
||||
|
@ -367,6 +377,8 @@ public:
|
|||
|
||||
ThreadFuture<Void> commit() override;
|
||||
Version getCommittedVersion() override;
|
||||
VersionVector getVersionVector() override;
|
||||
UID getSpanID() override { return UID(); };
|
||||
ThreadFuture<int64_t> getApproximateSize() override;
|
||||
|
||||
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
|
||||
|
@ -438,6 +450,9 @@ public:
|
|||
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
|
||||
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
|
||||
|
||||
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
|
||||
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
|
||||
|
||||
ThreadFuture<DatabaseSharedState*> createSharedState() override;
|
||||
void setSharedState(DatabaseSharedState* p) override;
|
||||
|
||||
|
@ -551,6 +566,8 @@ public:
|
|||
|
||||
ThreadFuture<Void> commit() override;
|
||||
Version getCommittedVersion() override;
|
||||
VersionVector getVersionVector() override;
|
||||
UID getSpanID() override;
|
||||
ThreadFuture<int64_t> getApproximateSize() override;
|
||||
|
||||
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
|
||||
|
@ -716,6 +733,9 @@ public:
|
|||
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
|
||||
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
|
||||
|
||||
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
|
||||
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
|
||||
|
||||
ThreadFuture<DatabaseSharedState*> createSharedState() override;
|
||||
void setSharedState(DatabaseSharedState* p) override;
|
||||
|
||||
|
|
|
@ -71,6 +71,7 @@
|
|||
#include "flow/Error.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/ProtocolVersion.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/genericactors.actor.h"
|
||||
|
@ -215,6 +216,52 @@ void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) {
|
|||
}
|
||||
}
|
||||
|
||||
void DatabaseContext::addSSIdTagMapping(const UID& uid, const Tag& tag) {
|
||||
ssidTagMapping[uid] = tag;
|
||||
}
|
||||
|
||||
void DatabaseContext::getLatestCommitVersions(const Reference<LocationInfo>& locationInfo,
|
||||
Version readVersion,
|
||||
Reference<TransactionState> info,
|
||||
VersionVector& latestCommitVersions) {
|
||||
latestCommitVersions.clear();
|
||||
|
||||
if (info->debugID.present()) {
|
||||
g_traceBatch.addEvent("TransactionDebug", info->debugID.get().first(), "NativeAPI.getLatestCommitVersions");
|
||||
}
|
||||
|
||||
if (!info->readVersionObtainedFromGrvProxy) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ssVersionVectorCache.getMaxVersion() != invalidVersion && readVersion > ssVersionVectorCache.getMaxVersion()) {
|
||||
TraceEvent(SevDebug, "GetLatestCommitVersions")
|
||||
.detail("ReadVersion", readVersion)
|
||||
.detail("VersionVector", ssVersionVectorCache.toString());
|
||||
ssVersionVectorCache.clear();
|
||||
throw stale_version_vector(); // TODO: investigate why
|
||||
}
|
||||
|
||||
std::map<Version, std::set<Tag>> versionMap; // order the versions to be returned
|
||||
for (int i = 0; i < locationInfo->locations()->size(); i++) {
|
||||
UID uid = locationInfo->locations()->getId(i);
|
||||
if (ssidTagMapping.find(uid) != ssidTagMapping.end()) {
|
||||
Tag tag = ssidTagMapping[uid];
|
||||
if (ssVersionVectorCache.hasVersion(tag)) {
|
||||
Version commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version
|
||||
if (commitVersion < readVersion) {
|
||||
versionMap[commitVersion].insert(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// insert the commit versions in the version vector.
|
||||
for (auto& iter : versionMap) {
|
||||
latestCommitVersions.setVersion(iter.second, iter.first);
|
||||
}
|
||||
}
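
Every read path touched later in this diff follows the same call-site pattern around this function: populate a VersionVector scoped to the shard's replicas and attach it to the storage request. A condensed sketch of that pattern (it mirrors the getValue/getKey changes below):

// Sketch of the common call site: gather per-replica commit versions, then attach them to the request.
state VersionVector ssLatestCommitVersions;
trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions);
// ... ssLatestCommitVersions is then passed as the last argument of GetValueRequest / GetKeyRequest / etc.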
|
||||
|
||||
void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) {
|
||||
MutexHolder mutex(p->mutexLock);
|
||||
if (v >= p->grvCacheSpace.cachedReadVersion) {
|
||||
|
@ -1389,13 +1436,13 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
|
|||
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
|
||||
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
|
||||
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
|
||||
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
|
||||
bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr),
|
||||
lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0),
|
||||
lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo),
|
||||
clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0),
|
||||
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0),
|
||||
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
transactionsStaleVersionVectors("NumStaleVersionVectors", cc), latencies(1000), readLatencies(1000),
|
||||
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
|
||||
bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0),
|
||||
lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
|
||||
transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
|
||||
coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
|
||||
detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
|
||||
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
|
||||
dbId = deterministicRandom()->randomUniqueID();
|
||||
|
@ -1519,6 +1566,12 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
|
|||
std::make_unique<AdvanceVersionImpl>(
|
||||
singleKeyRange(LiteralStringRef("min_required_commit_version"))
|
||||
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
|
||||
registerSpecialKeySpaceModule(
|
||||
SpecialKeySpace::MODULE::MANAGEMENT,
|
||||
SpecialKeySpace::IMPLTYPE::READWRITE,
|
||||
std::make_unique<VersionEpochImpl>(
|
||||
singleKeyRange(LiteralStringRef("version_epoch"))
|
||||
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
|
||||
registerSpecialKeySpaceModule(
|
||||
SpecialKeySpace::MODULE::MANAGEMENT,
|
||||
SpecialKeySpace::IMPLTYPE::READWRITE,
|
||||
|
@ -1656,8 +1709,9 @@ DatabaseContext::DatabaseContext(const Error& err)
|
|||
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
|
||||
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
|
||||
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
|
||||
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
|
||||
bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false),
|
||||
transactionsStaleVersionVectors("NumStaleVersionVectors", cc), latencies(1000), readLatencies(1000),
|
||||
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
|
||||
bgGranulesPerRequest(1000), transactionTracingSample(false),
|
||||
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
|
||||
|
||||
|
@ -1998,6 +2052,8 @@ ACTOR static Future<Void> switchConnectionRecordImpl(Reference<IClusterConnectio
|
|||
self->tenantCache.clear();
|
||||
self->invalidateCache(Key(), allKeys);
|
||||
|
||||
self->ssVersionVectorCache.clear();
|
||||
|
||||
auto clearedClientInfo = self->clientInfo->get();
|
||||
clearedClientInfo.commitProxies.clear();
|
||||
clearedClientInfo.grvProxies.clear();
|
||||
|
@ -2496,6 +2552,7 @@ void DatabaseContext::updateProxies() {
|
|||
proxiesLastChange = clientInfo->get().id;
|
||||
commitProxies.clear();
|
||||
grvProxies.clear();
|
||||
ssVersionVectorCache.clear();
|
||||
bool commitProxyProvisional = false, grvProxyProvisional = false;
|
||||
if (clientInfo->get().commitProxies.size()) {
|
||||
commitProxies = makeReference<CommitProxyInfo>(clientInfo->get().commitProxies);
|
||||
|
@ -2527,6 +2584,15 @@ Reference<GrvProxyInfo> DatabaseContext::getGrvProxies(UseProvisionalProxies use
|
|||
return grvProxies;
|
||||
}
|
||||
|
||||
bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const {
|
||||
for (const auto& proxy : clientInfo->get().grvProxies) {
|
||||
if (proxy.id() == proxyId)
|
||||
return true;
|
||||
}
|
||||
TEST(true); // stale GRV proxy detected
|
||||
return false;
|
||||
}
|
||||
|
||||
// Actor which will wait until the MultiInterface<CommitProxyInterface> returned by the DatabaseContext cx is not
|
||||
// nullptr
|
||||
ACTOR Future<Reference<CommitProxyInfo>> getCommitProxiesFuture(DatabaseContext* cx,
|
||||
|
@ -2707,6 +2773,12 @@ void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) {
|
|||
}
|
||||
}
|
||||
|
||||
void updateTagMappings(Database cx, const GetKeyServerLocationsReply& reply) {
|
||||
for (const auto& mapping : reply.resultsTagMapping) {
|
||||
cx->addSSIdTagMapping(mapping.first, mapping.second);
|
||||
}
|
||||
}
|
||||
|
||||
// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
|
||||
// Otherwise returns the shard containing key
|
||||
ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,
|
||||
|
@ -2912,6 +2984,7 @@ ACTOR Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations_internal(
|
|||
wait(yield());
|
||||
}
|
||||
updateTssMappings(cx, rep);
|
||||
updateTagMappings(cx, rep);
|
||||
|
||||
return results;
|
||||
}
|
||||
|
@ -3074,8 +3147,8 @@ TransactionState::TransactionState(Database cx,
|
|||
TaskPriority taskID,
|
||||
SpanID spanID,
|
||||
Reference<TransactionLogInfo> trLogInfo)
|
||||
: cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID), tenant_(tenant),
|
||||
tenantSet(tenant.present()) {}
|
||||
: cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID), readVersionObtainedFromGrvProxy(true),
|
||||
tenant_(tenant), tenantSet(tenant.present()) {}
|
||||
|
||||
Reference<TransactionState> TransactionState::cloneAndReset(Reference<TransactionLogInfo> newTrLogInfo,
|
||||
bool generateNewSpan) const {
|
||||
|
@ -3155,6 +3228,8 @@ ACTOR Future<Optional<Value>> getValue(Reference<TransactionState> trState,
|
|||
state Optional<UID> getValueID = Optional<UID>();
|
||||
state uint64_t startTime;
|
||||
state double startTimeD;
|
||||
state VersionVector ssLatestCommitVersions;
|
||||
trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions);
|
||||
try {
|
||||
if (trState->debugID.present()) {
|
||||
getValueID = nondeterministicRandom()->randomUniqueID();
|
||||
|
@ -3192,7 +3267,8 @@ ACTOR Future<Optional<Value>> getValue(Reference<TransactionState> trState,
|
|||
ver,
|
||||
trState->cx->sampleReadTags() ? trState->options.readTags
|
||||
: Optional<TagSet>(),
|
||||
getValueID),
|
||||
getValueID,
|
||||
ssLatestCommitVersions),
|
||||
TaskPriority::DefaultPromiseEndpoint,
|
||||
AtMostOnce::False,
|
||||
trState->cx->enableLocalityLoadBalance ? &trState->cx->queueModel : nullptr))) {
|
||||
|
@ -3298,6 +3374,9 @@ ACTOR Future<Key> getKey(Reference<TransactionState> trState,
|
|||
useTenant,
|
||||
version.get()));
|
||||
|
||||
state VersionVector ssLatestCommitVersions;
|
||||
trState->cx->getLatestCommitVersions(locationInfo.locations, version.get(), trState, ssLatestCommitVersions);
|
||||
|
||||
try {
|
||||
if (getKeyID.present())
|
||||
g_traceBatch.addEvent(
|
||||
|
@ -3312,7 +3391,8 @@ ACTOR Future<Key> getKey(Reference<TransactionState> trState,
|
|||
k,
|
||||
version.get(),
|
||||
trState->cx->sampleReadTags() ? trState->options.readTags : Optional<TagSet>(),
|
||||
getKeyID);
|
||||
getKeyID,
|
||||
ssLatestCommitVersions);
|
||||
req.arena.dependsOn(k.arena());
|
||||
|
||||
state GetKeyReply reply;
|
||||
|
@ -3369,14 +3449,20 @@ ACTOR Future<Version> waitForCommittedVersion(Database cx, Version version, Span
|
|||
loop {
|
||||
choose {
|
||||
when(wait(cx->onProxiesChanged())) {}
|
||||
when(GetReadVersionReply v =
|
||||
wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies::False),
|
||||
when(GetReadVersionReply v = wait(basicLoadBalance(
|
||||
cx->getGrvProxies(UseProvisionalProxies::False),
|
||||
&GrvProxyInterface::getConsistentReadVersion,
|
||||
GetReadVersionRequest(span.context, 0, TransactionPriority::IMMEDIATE),
|
||||
GetReadVersionRequest(
|
||||
span.context, 0, TransactionPriority::IMMEDIATE, cx->ssVersionVectorCache.getMaxVersion()),
|
||||
cx->taskID))) {
|
||||
cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);
|
||||
if (v.midShardSize > 0)
|
||||
cx->smoothMidShardSize.setTotal(v.midShardSize);
|
||||
if (cx->isCurrentGrvProxy(v.proxyId)) {
|
||||
cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta);
|
||||
} else {
|
||||
cx->ssVersionVectorCache.clear();
|
||||
}
|
||||
if (v.version >= version)
|
||||
return v.version;
|
||||
// SOMEDAY: Do the wait on the server side, possibly use less expensive source of committed version
|
||||
|
@ -3399,8 +3485,16 @@ ACTOR Future<Version> getRawVersion(Reference<TransactionState> trState) {
|
|||
when(GetReadVersionReply v =
|
||||
wait(basicLoadBalance(trState->cx->getGrvProxies(UseProvisionalProxies::False),
|
||||
&GrvProxyInterface::getConsistentReadVersion,
|
||||
GetReadVersionRequest(trState->spanID, 0, TransactionPriority::IMMEDIATE),
|
||||
GetReadVersionRequest(trState->spanID,
|
||||
0,
|
||||
TransactionPriority::IMMEDIATE,
|
||||
trState->cx->ssVersionVectorCache.getMaxVersion()),
|
||||
trState->cx->taskID))) {
|
||||
if (trState->cx->isCurrentGrvProxy(v.proxyId)) {
|
||||
trState->cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta);
|
||||
} else {
|
||||
trState->cx->ssVersionVectorCache.clear();
|
||||
}
|
||||
return v.version;
|
||||
}
|
||||
}
|
||||
|
@ -3726,6 +3820,8 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
|
|||
req.begin = firstGreaterOrEqual(range.begin);
|
||||
req.end = firstGreaterOrEqual(range.end);
|
||||
req.spanContext = span.context;
|
||||
trState->cx->getLatestCommitVersions(
|
||||
locations[shard].locations, req.version, trState, req.ssLatestCommitVersions);
|
||||
|
||||
// keep shard's arena around in case of async tss comparison
|
||||
req.arena.dependsOn(locations[shard].range.arena());
|
||||
|
@ -4103,6 +4199,9 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
|
|||
req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys);
|
||||
req.version = readVersion;
|
||||
|
||||
trState->cx->getLatestCommitVersions(
|
||||
beginServer.locations, req.version, trState, req.ssLatestCommitVersions);
|
||||
|
||||
// In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending
|
||||
// on which is used
|
||||
bool dependOnShard = false;
|
||||
|
@ -4258,13 +4357,16 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
|
|||
return output;
|
||||
}
|
||||
|
||||
if (readVersion == latestVersion) {
|
||||
readVersion = rep.version; // see above comment
|
||||
}
|
||||
|
||||
if (!rep.more) {
|
||||
ASSERT(modifiedSelectors);
|
||||
TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange
|
||||
|
||||
if (!rep.data.size()) {
|
||||
// VERSION_VECTOR change version to readVersion in getRangeFallback
|
||||
RangeResultFamily result = wait(
|
||||
getRangeFallback<GetKeyValuesFamilyRequest, GetKeyValuesFamilyReply, RangeResultFamily>(
|
||||
trState,
|
||||
|
@ -4548,6 +4650,8 @@ ACTOR Future<Void> getRangeStreamFragment(Reference<TransactionState> trState,
|
|||
req.spanContext = spanContext;
|
||||
req.limit = reverse ? -CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT;
|
||||
req.limitBytes = std::numeric_limits<int>::max();
|
||||
trState->cx->getLatestCommitVersions(
|
||||
locations[shard].locations, req.version, trState, req.ssLatestCommitVersions);
|
||||
|
||||
// keep shard's arena around in case of async tss comparison
|
||||
req.arena.dependsOn(range.arena());
|
||||
|
@ -4971,13 +5075,19 @@ void Transaction::flushTrLogsIfEnabled() {
|
|||
}
|
||||
}
|
||||
|
||||
VersionVector Transaction::getVersionVector() const {
	return trState->cx->ssVersionVectorCache;
}

void Transaction::setVersion(Version v) {
	trState->startTime = now();
	if (readVersion.isValid())
		throw read_version_already_set();
	if (v <= 0)
		throw version_invalid();

	readVersion = v;
	trState->readVersionObtainedFromGrvProxy = false;
}
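
Because setVersion() clears readVersionObtainedFromGrvProxy, a transaction whose read version is supplied by the caller opts out of version-vector-assisted reads: getLatestCommitVersions() returns early and the requests carry an empty VersionVector. A short sketch (db, explicitVersion, and someKey are illustrative names):

// Sketch only: manually versioned transactions skip the version-vector fast path.
Transaction tr(db);
tr.setVersion(explicitVersion); // readVersionObtainedFromGrvProxy becomes false
Future<Optional<Value>> value = tr.get(someKey); // request is sent with an empty ssLatestCommitVersions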
|
||||
|
||||
Future<Optional<Value>> Transaction::get(const Key& key, Snapshot snapshot) {
|
||||
|
@ -6469,7 +6579,13 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanID parentSpan,
|
|||
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before");
|
||||
loop {
|
||||
try {
|
||||
state GetReadVersionRequest req(span.context, transactionCount, priority, flags, tags, debugID);
|
||||
state GetReadVersionRequest req(span.context,
|
||||
transactionCount,
|
||||
priority,
|
||||
cx->ssVersionVectorCache.getMaxVersion(),
|
||||
flags,
|
||||
tags,
|
||||
debugID);
|
||||
state Future<Void> onProxiesChanged = cx->onProxiesChanged();
|
||||
|
||||
choose {
|
||||
|
@ -6502,6 +6618,11 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanID parentSpan,
|
|||
"TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After");
|
||||
ASSERT(v.version > 0);
|
||||
cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);
|
||||
if (cx->isCurrentGrvProxy(v.proxyId)) {
|
||||
cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta);
|
||||
} else {
|
||||
continue; // stale GRV reply, retry
|
||||
}
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
@ -6687,6 +6808,11 @@ ACTOR Future<Version> extractReadVersion(Reference<TransactionState> trState,
|
|||
}
|
||||
|
||||
metadataVersion.send(rep.metadataVersion);
|
||||
if (trState->cx->isCurrentGrvProxy(rep.proxyId)) {
|
||||
trState->cx->ssVersionVectorCache.applyDelta(rep.ssVersionVectorDelta);
|
||||
} else {
|
||||
trState->cx->ssVersionVectorCache.clear();
|
||||
}
|
||||
return rep.version;
|
||||
}
|
||||
|
||||
|
@ -6933,11 +7059,14 @@ Future<Void> Transaction::onError(Error const& e) {
|
|||
reset();
|
||||
return delay(backoff, trState->taskID);
|
||||
}
|
||||
if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version) {
|
||||
if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version ||
|
||||
e.code() == error_code_stale_version_vector) {
|
||||
if (e.code() == error_code_transaction_too_old)
|
||||
++trState->cx->transactionsTooOld;
|
||||
else if (e.code() == error_code_future_version)
|
||||
++trState->cx->transactionsFutureVersions;
|
||||
else if (e.code() == error_code_stale_version_vector)
|
||||
++trState->cx->transactionsStaleVersionVectors;
|
||||
|
||||
double maxBackoff = trState->options.maxBackoff;
|
||||
reset();
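
Because stale_version_vector is now handled alongside transaction_too_old and future_version, the standard retry idiom picks it up with no caller changes; onError() counts it, backs off, and resets the transaction. A minimal sketch of that idiom:

// Sketch only: the usual retry loop now transparently retries stale_version_vector as well.
ACTOR Future<Void> setKeyWithRetry(Database cx, Key key, Value value) {
	state Transaction tr(cx);
	loop {
		try {
			tr.set(key, value);
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}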
|
||||
|
@ -9178,3 +9307,86 @@ Future<Void> DatabaseContext::popChangeFeedMutations(Key rangeID, Version versio
|
|||
Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
|
||||
return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
|
||||
}
|
||||
|
||||
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
|
||||
KeyRange range,
|
||||
Version purgeVersion,
|
||||
bool force) {
|
||||
state Database cx(db);
|
||||
state Transaction tr(cx);
|
||||
state Key purgeKey;
|
||||
|
||||
// FIXME: implement force
|
||||
if (!force) {
|
||||
throw unsupported_operation();
|
||||
}
|
||||
loop {
|
||||
try {
|
||||
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
Value purgeValue = blobGranulePurgeValueFor(purgeVersion, range, force);
|
||||
tr.atomicOp(
|
||||
addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey);
|
||||
tr.set(blobGranulePurgeChangeKey, deterministicRandom()->randomUniqueID().toString());
|
||||
state Future<Standalone<StringRef>> fTrVs = tr.getVersionstamp();
|
||||
wait(tr.commit());
|
||||
Standalone<StringRef> vs = wait(fTrVs);
|
||||
purgeKey = blobGranulePurgeKeys.begin.withSuffix(vs);
|
||||
if (BG_REQUEST_DEBUG) {
|
||||
fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} registered {3}\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
purgeVersion,
|
||||
purgeKey.printable());
|
||||
}
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
if (BG_REQUEST_DEBUG) {
|
||||
fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} encountered error {3}\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
purgeVersion,
|
||||
e.name());
|
||||
}
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
return purgeKey;
|
||||
}
|
||||
|
||||
Future<Key> DatabaseContext::purgeBlobGranules(KeyRange range, Version purgeVersion, bool force) {
	return purgeBlobGranulesActor(Reference<DatabaseContext>::addRef(this), range, purgeVersion, force);
}

ACTOR Future<Void> waitPurgeGranulesCompleteActor(Reference<DatabaseContext> db, Key purgeKey) {
	state Database cx(db);
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
	loop {
		try {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

			Optional<Value> purgeVal = wait(tr->get(purgeKey));
			if (!purgeVal.present()) {
				if (BG_REQUEST_DEBUG) {
					fmt::print("purgeBlobGranules for {0} succeeded\n", purgeKey.printable());
				}
				return Void();
			}
			if (BG_REQUEST_DEBUG) {
				fmt::print("purgeBlobGranules for {0} watching\n", purgeKey.printable());
			}
			state Future<Void> watchFuture = tr->watch(purgeKey);
			wait(tr->commit());
			wait(watchFuture);
			tr->reset();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}

Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
	return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
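
Taken together, the two entry points above define the client-side purge workflow: register a purge (which returns a versionstamped purge key) and then wait for that key to disappear. A sketch of how a caller might drive it; purgeAndWait is a hypothetical helper, and force is passed as true because non-forced purges are not yet implemented at this point in the diff:

// Sketch only: purge a blob granule range at a version and block until the purge completes.
ACTOR Future<Void> purgeAndWait(Database cx, KeyRange range, Version purgeVersion) {
	Key purgeKey = wait(cx->purgeBlobGranules(range, purgeVersion, true));
	wait(cx->waitPurgeGranulesComplete(purgeKey));
	return Void();
}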
|
|
@ -245,6 +245,7 @@ struct TransactionState : ReferenceCounted<TransactionState> {
|
|||
TaskPriority taskID;
|
||||
SpanID spanID;
|
||||
UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False;
|
||||
bool readVersionObtainedFromGrvProxy;
|
||||
|
||||
int numErrors = 0;
|
||||
double startTime = 0;
|
||||
|
@ -260,6 +261,7 @@ struct TransactionState : ReferenceCounted<TransactionState> {
|
|||
// Only available so that Transaction can have a default constructor, for use in state variables
|
||||
TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID), tenantSet(false) {}
|
||||
|
||||
// VERSION_VECTOR changed the default value of readVersionObtainedFromGrvProxy
|
||||
TransactionState(Database cx,
|
||||
Optional<TenantName> tenant,
|
||||
TaskPriority taskID,
|
||||
|
@ -430,7 +432,10 @@ public:
|
|||
void reset();
|
||||
void fullReset();
|
||||
double getBackoff(int errCode);
|
||||
|
||||
void debugTransaction(UID dID) { trState->debugID = dID; }
|
||||
VersionVector getVersionVector() const;
|
||||
UID getSpanID() const { return trState->spanID; }
|
||||
|
||||
Future<Void> commitMutations();
|
||||
void setupWatches();
|
||||
|
|
|
@ -139,6 +139,9 @@ public:
|
|||
|
||||
[[nodiscard]] Future<Void> commit() override;
|
||||
Version getCommittedVersion() const override { return tr.getCommittedVersion(); }
|
||||
VersionVector getVersionVector() const override { return tr.getVersionVector(); }
|
||||
UID getSpanID() const override { return tr.getSpanID(); }
|
||||
|
||||
int64_t getApproximateSize() const override { return approximateSize; }
|
||||
[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp() override;
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "fdbclient/ServerKnobs.h"
|
||||
#include "flow/IRandom.h"
|
||||
|
||||
#define init(...) KNOB_FN(__VA_ARGS__, INIT_ATOMIC_KNOB, INIT_KNOB)(__VA_ARGS__)
|
||||
|
||||
|
@ -36,6 +37,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
|
||||
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
|
||||
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
|
||||
init( ENABLE_VERSION_VECTOR, false );
|
||||
init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
|
||||
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
|
||||
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
|
||||
|
||||
// TLogs
|
||||
init( TLOG_TIMEOUT, 0.4 ); //cannot buggify because of availability
|
||||
|
@ -102,6 +107,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( PUSH_STATS_SLOW_RATIO, 0.5 );
|
||||
init( TLOG_POP_BATCH_SIZE, 1000 ); if ( randomize && BUGGIFY ) TLOG_POP_BATCH_SIZE = 10;
|
||||
init( TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE, 250e6 );
|
||||
init( BLOCKING_PEEK_TIMEOUT, 0.4 );
|
||||
init( ENABLE_DETAILED_TLOG_POP_TRACE, false ); if ( randomize && BUGGIFY ) ENABLE_DETAILED_TLOG_POP_TRACE = true;
|
||||
init( PEEK_BATCHING_EMPTY_MSG, false ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG = true;
|
||||
init( PEEK_BATCHING_EMPTY_MSG_INTERVAL, 0.001 ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.01;
|
||||
|
@ -109,6 +115,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
|
||||
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
|
||||
init( SNAP_CREATE_MAX_TIMEOUT, 300.0 );
|
||||
init( MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE, 1 );
|
||||
init( MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE, 1 );
|
||||
|
||||
// Data distribution queue
|
||||
init( HEALTH_POLL_TIME, 1.0 );
|
||||
|
@ -384,6 +392,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD, 1 );
|
||||
init( ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD, 5 );
|
||||
init( ROCKSDB_COMPACTION_READAHEAD_SIZE, 32768 ); // 32 KB, performs bigger reads when doing compaction.
|
||||
init( ROCKSDB_BLOCK_SIZE, 32768 ); // 32 KB, size of the block in rocksdb cache.
|
||||
|
||||
// Leader election
|
||||
bool longLeaderElection = randomize && BUGGIFY;
|
||||
|
@ -445,6 +454,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( TXN_STATE_SEND_AMOUNT, 4 );
|
||||
init( REPORT_TRANSACTION_COST_ESTIMATION_DELAY, 0.1 );
|
||||
init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true );
|
||||
init( PROXY_USE_RESOLVER_PRIVATE_MUTATIONS, false ); if( !ENABLE_VERSION_VECTOR_TLOG_UNICAST && randomize && BUGGIFY ) PROXY_USE_RESOLVER_PRIVATE_MUTATIONS = deterministicRandom()->coinflip();
|
||||
|
||||
init( RESET_MASTER_BATCHES, 200 );
|
||||
init( RESET_RESOLVER_BATCHES, 200 );
|
||||
|
@ -830,6 +840,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( REDWOOD_METRICS_INTERVAL, 5.0 );
|
||||
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
|
||||
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
|
||||
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
|
||||
|
||||
// Server request latency measurement
|
||||
init( LATENCY_SAMPLE_SIZE, 100000 );
|
||||
|
|
|
@ -37,8 +37,12 @@ public:
|
|||
int64_t MAX_VERSIONS_IN_FLIGHT_FORCED;
|
||||
int64_t MAX_READ_TRANSACTION_LIFE_VERSIONS;
|
||||
int64_t MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
|
||||
bool ENABLE_VERSION_VECTOR;
|
||||
bool ENABLE_VERSION_VECTOR_TLOG_UNICAST;
|
||||
double MAX_COMMIT_BATCH_INTERVAL; // Each commit proxy generates a CommitTransactionBatchRequest at least this
|
||||
// often, so that versions always advance smoothly
|
||||
double MAX_VERSION_RATE_MODIFIER;
|
||||
int64_t MAX_VERSION_RATE_OFFSET;
|
||||
|
||||
// TLogs
|
||||
bool PEEK_USING_STREAMING;
|
||||
|
@ -106,6 +110,7 @@ public:
|
|||
double PUSH_STATS_SLOW_AMOUNT;
|
||||
double PUSH_STATS_SLOW_RATIO;
|
||||
int TLOG_POP_BATCH_SIZE;
|
||||
double BLOCKING_PEEK_TIMEOUT;
|
||||
bool PEEK_BATCHING_EMPTY_MSG;
|
||||
double PEEK_BATCHING_EMPTY_MSG_INTERVAL;
|
||||
|
||||
|
@ -316,6 +321,7 @@ public:
|
|||
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
|
||||
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
|
||||
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
|
||||
int64_t ROCKSDB_BLOCK_SIZE;
|
||||
|
||||
// Leader election
|
||||
int MAX_NOTIFICATIONS;
|
||||
|
@ -372,6 +378,7 @@ public:
|
|||
int TXN_STATE_SEND_AMOUNT;
|
||||
double REPORT_TRANSACTION_COST_ESTIMATION_DELAY;
|
||||
bool PROXY_REJECT_BATCH_QUEUED_TOO_LONG;
|
||||
bool PROXY_USE_RESOLVER_PRIVATE_MUTATIONS;
|
||||
|
||||
int RESET_MASTER_BATCHES;
|
||||
int RESET_RESOLVER_BATCHES;
|
||||
|
@ -581,6 +588,12 @@ public:
|
|||
// disk snapshot
|
||||
int64_t MAX_FORKED_PROCESS_OUTPUT;
|
||||
double SNAP_CREATE_MAX_TIMEOUT;
|
||||
// Maximum number of storage servers a snapshot can fail to
|
||||
// capture while still succeeding
|
||||
int64_t MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE;
|
||||
// Maximum number of coordinators a snapshot can fail to
|
||||
// capture while still succeeding
|
||||
int64_t MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE;
|
||||
|
||||
// Storage Metrics
|
||||
double STORAGE_METRICS_AVERAGE_INTERVAL;
|
||||
|
@ -794,6 +807,7 @@ public:
|
|||
double REDWOOD_METRICS_INTERVAL;
|
||||
double REDWOOD_HISTOGRAM_INTERVAL;
|
||||
bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache.
|
||||
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
|
||||
|
||||
// Server request latency measurement
|
||||
int LATENCY_SAMPLE_SIZE;
|
||||
|
|
|
@ -106,6 +106,8 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
|
|||
{ "advanceversion",
|
||||
singleKeyRange(LiteralStringRef("min_required_commit_version"))
|
||||
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
|
||||
{ "versionepoch",
|
||||
singleKeyRange(LiteralStringRef("version_epoch")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
|
||||
{ "profile",
|
||||
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
|
||||
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
|
||||
|
@ -1909,6 +1911,42 @@ Future<Optional<std::string>> AdvanceVersionImpl::commit(ReadYourWritesTransacti
|
|||
return Optional<std::string>();
|
||||
}
|
||||
|
||||
ACTOR static Future<RangeResult> getVersionEpochActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
|
||||
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS);
|
||||
Optional<Value> val = wait(ryw->getTransaction().get(versionEpochKey));
|
||||
RangeResult result;
|
||||
if (val.present()) {
|
||||
int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(val.get(), Unversioned());
|
||||
ValueRef version(result.arena(), boost::lexical_cast<std::string>(versionEpoch));
|
||||
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, version));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
VersionEpochImpl::VersionEpochImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
|
||||
|
||||
Future<RangeResult> VersionEpochImpl::getRange(ReadYourWritesTransaction* ryw,
|
||||
KeyRangeRef kr,
|
||||
GetRangeLimits limitsHint) const {
|
||||
ASSERT(kr == getKeyRange());
|
||||
return getVersionEpochActor(ryw, kr);
|
||||
}
|
||||
|
||||
Future<Optional<std::string>> VersionEpochImpl::commit(ReadYourWritesTransaction* ryw) {
|
||||
auto versionEpoch =
|
||||
ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("versionepoch")].second;
|
||||
if (versionEpoch.present()) {
|
||||
int64_t epoch = BinaryReader::fromStringRef<int64_t>(versionEpoch.get(), Unversioned());
|
||||
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS);
|
||||
ryw->getTransaction().set(versionEpochKey, BinaryWriter::toValue(epoch, Unversioned()));
|
||||
} else {
|
||||
ryw->getTransaction().clear(versionEpochKey);
|
||||
}
|
||||
return Optional<std::string>();
|
||||
}
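
VersionEpochImpl exposes the \xff/versionEpoch system key through the management module of the special key space, so the epoch can be read or set with an ordinary client transaction. A hedged sketch of the write side; the option set and key path are inferred from the registration earlier in this diff, so treat the helper as illustrative:

// Illustrative sketch: write the version epoch through the special key space.
ACTOR Future<Void> setVersionEpoch(Database cx, int64_t epochMicros) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
	loop {
		try {
			tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
			tr->set(SpecialKeySpace::getManagementApiCommandPrefix("versionepoch"),
			        BinaryWriter::toValue(epochMicros, Unversioned()));
			wait(tr->commit());
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}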
|
||||
|
||||
ClientProfilingImpl::ClientProfilingImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
|
||||
|
||||
ACTOR static Future<RangeResult> ClientProfilingGetRangeActor(ReadYourWritesTransaction* ryw,
|
||||
|
|
|
@ -476,6 +476,15 @@ public:
|
|||
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
|
||||
};
|
||||
|
||||
class VersionEpochImpl : public SpecialKeyRangeRWImpl {
|
||||
public:
|
||||
explicit VersionEpochImpl(KeyRangeRef kr);
|
||||
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
|
||||
KeyRangeRef kr,
|
||||
GetRangeLimits limitsHint) const override;
|
||||
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
|
||||
};
|
||||
|
||||
class ClientProfilingImpl : public SpecialKeyRangeRWImpl {
|
||||
public:
|
||||
explicit ClientProfilingImpl(KeyRangeRef kr);
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
#include "fdbclient/TagThrottle.actor.h"
|
||||
#include "fdbclient/Tenant.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
|
||||
// Dead code, removed in the next protocol version
|
||||
struct VersionReply {
|
||||
|
@ -88,6 +89,7 @@ struct StorageServerInterface {
|
|||
PublicRequestStream<struct ChangeFeedVersionUpdateRequest> changeFeedVersionUpdate;
|
||||
PublicRequestStream<struct GetCheckpointRequest> checkpoint;
|
||||
PublicRequestStream<struct FetchCheckpointRequest> fetchCheckpoint;
|
||||
PublicRequestStream<struct FetchCheckpointKeyValuesRequest> fetchCheckpointKeyValues;
|
||||
|
||||
private:
|
||||
bool acceptingRequests;
|
||||
|
@ -156,7 +158,13 @@ public:
|
|||
checkpoint =
    PublicRequestStream<struct GetCheckpointRequest>(getValue.getEndpoint().getAdjustedEndpoint(19));
fetchCheckpoint =
    PublicRequestStream<struct FetchCheckpointRequest>(getValue.getEndpoint().getAdjustedEndpoint(20));
fetchCheckpointKeyValues = PublicRequestStream<struct FetchCheckpointKeyValuesRequest>(
    getValue.getEndpoint().getAdjustedEndpoint(21));
}
|
||||
} else {
|
||||
ASSERT(Ar::isDeserializing);
|
||||
|
@ -206,6 +214,7 @@ public:
|
|||
streams.push_back(changeFeedVersionUpdate.getReceiver());
|
||||
streams.push_back(checkpoint.getReceiver());
|
||||
streams.push_back(fetchCheckpoint.getReceiver());
|
||||
streams.push_back(fetchCheckpointKeyValues.getReceiver());
|
||||
FlowTransport::transport().addEndpoints(streams);
|
||||
}
|
||||
};
|
||||
|
@ -273,6 +282,9 @@ struct GetValueRequest : TimedRequest {
|
|||
Optional<TagSet> tags;
|
||||
Optional<UID> debugID;
|
||||
ReplyPromise<GetValueReply> reply;
|
||||
VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known
|
||||
// to this client, of all storage replicas that
|
||||
// serve the given key
|
||||
|
||||
GetValueRequest() {}
|
||||
GetValueRequest(SpanID spanContext,
|
||||
|
@ -280,12 +292,14 @@ struct GetValueRequest : TimedRequest {
|
|||
const Key& key,
|
||||
Version ver,
|
||||
Optional<TagSet> tags,
|
||||
Optional<UID> debugID)
|
||||
: spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), debugID(debugID) {}
|
||||
Optional<UID> debugID,
|
||||
VersionVector latestCommitVersions)
|
||||
: spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), debugID(debugID),
|
||||
ssLatestCommitVersions(latestCommitVersions) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, key, version, tags, debugID, reply, spanContext, tenantInfo);
|
||||
serializer(ar, key, version, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -363,6 +377,9 @@ struct GetKeyValuesRequest : TimedRequest {
|
|||
Optional<TagSet> tags;
|
||||
Optional<UID> debugID;
|
||||
ReplyPromise<GetKeyValuesReply> reply;
|
||||
VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known
|
||||
// to this client, of all storage replicas that
|
||||
// serve the given key
|
||||
|
||||
GetKeyValuesRequest() : isFetchKeys(false) {}
|
||||
|
||||
|
@ -380,7 +397,8 @@ struct GetKeyValuesRequest : TimedRequest {
|
|||
reply,
|
||||
spanContext,
|
||||
tenantInfo,
|
||||
arena);
|
||||
arena,
|
||||
ssLatestCommitVersions);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -415,6 +433,9 @@ struct GetMappedKeyValuesRequest : TimedRequest {
|
|||
Optional<TagSet> tags;
|
||||
Optional<UID> debugID;
|
||||
ReplyPromise<GetMappedKeyValuesReply> reply;
|
||||
VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known
|
||||
// to this client, of all storage replicas that
|
||||
// serve the given key range
|
||||
|
||||
GetMappedKeyValuesRequest() : isFetchKeys(false) {}
|
||||
template <class Ar>
|
||||
|
@ -432,7 +453,8 @@ struct GetMappedKeyValuesRequest : TimedRequest {
|
|||
reply,
|
||||
spanContext,
|
||||
tenantInfo,
|
||||
arena);
|
||||
arena,
|
||||
ssLatestCommitVersions);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -475,6 +497,9 @@ struct GetKeyValuesStreamRequest {
|
|||
Optional<TagSet> tags;
|
||||
Optional<UID> debugID;
|
||||
ReplyPromiseStream<GetKeyValuesStreamReply> reply;
|
||||
VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known
|
||||
// to this client, of all storage replicas that
|
||||
// serve the given key range
|
||||
|
||||
GetKeyValuesStreamRequest() : isFetchKeys(false) {}
|
||||
|
||||
|
@ -492,7 +517,8 @@ struct GetKeyValuesStreamRequest {
|
|||
reply,
|
||||
spanContext,
|
||||
tenantInfo,
|
||||
arena);
|
||||
arena,
|
||||
ssLatestCommitVersions);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -520,6 +546,9 @@ struct GetKeyRequest : TimedRequest {
|
|||
Optional<TagSet> tags;
|
||||
Optional<UID> debugID;
|
||||
ReplyPromise<GetKeyReply> reply;
|
||||
VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known
|
||||
// to this client, of all storage replicas that
|
||||
// serve the given key
|
||||
|
||||
GetKeyRequest() {}
|
||||
|
||||
|
@ -528,12 +557,14 @@ struct GetKeyRequest : TimedRequest {
|
|||
KeySelectorRef const& sel,
|
||||
Version version,
|
||||
Optional<TagSet> tags,
|
||||
Optional<UID> debugID)
|
||||
: spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), debugID(debugID) {}
|
||||
Optional<UID> debugID,
|
||||
VersionVector latestCommitVersions)
|
||||
: spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), debugID(debugID),
|
||||
ssLatestCommitVersions(latestCommitVersions) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, sel, version, tags, debugID, reply, spanContext, tenantInfo, arena);
|
||||
serializer(ar, sel, version, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -900,6 +931,37 @@ struct FetchCheckpointRequest {
|
|||
}
|
||||
};
|
||||
|
||||
struct FetchCheckpointKeyValuesStreamReply : public ReplyPromiseStreamReply {
|
||||
constexpr static FileIdentifier file_identifier = 13804353;
|
||||
Arena arena;
|
||||
VectorRef<KeyValueRef> data;
|
||||
|
||||
FetchCheckpointKeyValuesStreamReply() = default;
|
||||
|
||||
int expectedSize() const { return data.expectedSize(); }
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, ReplyPromiseStreamReply::sequence, data, arena);
|
||||
}
|
||||
};
|
||||
|
||||
// Fetch checkpoint in the format of key-value pairs.
|
||||
struct FetchCheckpointKeyValuesRequest {
|
||||
constexpr static FileIdentifier file_identifier = 13804354;
|
||||
UID checkpointID;
|
||||
KeyRange range;
|
||||
ReplyPromiseStream<FetchCheckpointKeyValuesStreamReply> reply;
|
||||
|
||||
FetchCheckpointKeyValuesRequest() = default;
|
||||
FetchCheckpointKeyValuesRequest(UID checkpointID, KeyRange range) : checkpointID(checkpointID), range(range) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, checkpointID, range, reply);
|
||||
}
|
||||
};
|
||||
|
||||
struct OverlappingChangeFeedEntry {
|
||||
Key rangeId;
|
||||
KeyRange range;
|
||||
|
|
|
@ -823,6 +823,7 @@ std::vector<std::pair<UID, Version>> decodeBackupStartedValue(const ValueRef& va
|
|||
const KeyRef coordinatorsKey = LiteralStringRef("\xff/coordinators");
|
||||
const KeyRef logsKey = LiteralStringRef("\xff/logs");
|
||||
const KeyRef minRequiredCommitVersionKey = LiteralStringRef("\xff/minRequiredCommitVersion");
|
||||
const KeyRef versionEpochKey = LiteralStringRef("\xff/versionEpoch");
|
||||
|
||||
const KeyRef globalKeysPrefix = LiteralStringRef("\xff/globals");
|
||||
const KeyRef lastEpochEndKey = LiteralStringRef("\xff/globals/lastEpochEnd");
|
||||
|
@ -1155,9 +1156,9 @@ const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), Lite
|
|||
const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0"));
|
||||
const KeyRangeRef blobGranuleSplitKeys(LiteralStringRef("\xff\x02/bgs/"), LiteralStringRef("\xff\x02/bgs0"));
|
||||
const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0"));
|
||||
const KeyRangeRef blobGranulePruneKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
|
||||
const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
|
||||
const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0"));
|
||||
const KeyRef blobGranulePruneChangeKey = LiteralStringRef("\xff\x02/bgpChange");
|
||||
const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange");
|
||||
|
||||
const uint8_t BG_FILE_TYPE_DELTA = 'D';
|
||||
const uint8_t BG_FILE_TYPE_SNAPSHOT = 'S';
|
||||
|
@ -1214,7 +1215,7 @@ std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFi
|
|||
return std::tuple(filename, offset, length, fullFileLength);
|
||||
}
|
||||
|
||||
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force) {
|
||||
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force) {
|
||||
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
|
||||
wr << version;
|
||||
wr << range;
|
||||
|
@ -1222,7 +1223,7 @@ const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force
|
|||
return wr.toValue();
|
||||
}
|
||||
|
||||
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value) {
|
||||
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value) {
|
||||
Version version;
|
||||
KeyRange range;
|
||||
bool force;
|
||||
|
|
|
@ -348,6 +348,11 @@ extern const KeyRef logsKey;
|
|||
// Used during backup/recovery to restrict version requirements
|
||||
extern const KeyRef minRequiredCommitVersionKey;
|
||||
|
||||
// "\xff/versionEpochKey" = "[[uint64_t]]"
|
||||
// Defines the base epoch representing version 0. The value itself is the
|
||||
// number of microseconds since the Unix epoch.
|
||||
extern const KeyRef versionEpochKey;
|
||||
|
||||
const Value logsValue(const std::vector<std::pair<UID, NetworkAddress>>& logs,
|
||||
const std::vector<std::pair<UID, NetworkAddress>>& oldLogs);
|
||||
std::pair<std::vector<std::pair<UID, NetworkAddress>>, std::vector<std::pair<UID, NetworkAddress>>> decodeLogsValue(
|
||||
|
@ -564,9 +569,9 @@ extern const KeyRangeRef blobGranuleSplitKeys;
|
|||
extern const KeyRangeRef blobGranuleHistoryKeys;
|
||||
|
||||
// \xff\x02/bgp/(start,end) = (version, force)
|
||||
extern const KeyRangeRef blobGranulePruneKeys;
|
||||
extern const KeyRangeRef blobGranulePurgeKeys;
|
||||
extern const KeyRangeRef blobGranuleVersionKeys;
|
||||
extern const KeyRef blobGranulePruneChangeKey;
|
||||
extern const KeyRef blobGranulePurgeChangeKey;
|
||||
|
||||
const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType);
|
||||
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key);
|
||||
|
@ -575,8 +580,8 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID);
|
|||
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength);
|
||||
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
|
||||
|
||||
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force);
|
||||
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value);
|
||||
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force);
|
||||
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value);
|
||||
|
||||
const Value blobGranuleMappingValueFor(UID const& workerID);
|
||||
UID decodeBlobGranuleMappingValue(ValueRef const& value);
|
||||
|
|
|
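For orientation, the renamed purge codec round-trips the (version, range, force) triple; a minimal sketch using only the declarations above (the literal range is illustrative):

#include "fdbclient/SystemData.h"

void purgeValueRoundTrip() {
	KeyRange range = KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("b"));
	Value v = blobGranulePurgeValueFor(100, range, /*force=*/false);
	auto [version, decodedRange, force] = decodeBlobGranulePurgeValue(v);
	ASSERT(version == 100 && decodedRange == range && !force);
}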
@ -127,6 +127,20 @@ ThreadFuture<ProtocolVersion> ThreadSafeDatabase::getServerProtocol(Optional<Pro
|
|||
[db, expectedVersion]() -> Future<ProtocolVersion> { return db->getClusterProtocol(expectedVersion); });
|
||||
}
|
||||
|
||||
ThreadFuture<Key> ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
|
||||
DatabaseContext* db = this->db;
|
||||
KeyRange range = keyRange;
|
||||
return onMainThread([db, range, purgeVersion, force]() -> Future<Key> {
|
||||
return db->purgeBlobGranules(range, purgeVersion, force);
|
||||
});
|
||||
}
|
||||
|
||||
ThreadFuture<Void> ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
|
||||
DatabaseContext* db = this->db;
|
||||
Key key = purgeKey;
|
||||
return onMainThread([db, key]() -> Future<Void> { return db->waitPurgeGranulesComplete(key); });
|
||||
}
|
||||
|
||||
ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) {
|
||||
ClusterConnectionFile* connFile =
|
||||
new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFilename).first);
|
||||
|
@ -447,6 +461,14 @@ Version ThreadSafeTransaction::getCommittedVersion() {
|
|||
return tr->getCommittedVersion();
|
||||
}
|
||||
|
||||
VersionVector ThreadSafeTransaction::getVersionVector() {
|
||||
return tr->getVersionVector();
|
||||
}
|
||||
|
||||
UID ThreadSafeTransaction::getSpanID() {
|
||||
return tr->getSpanID();
|
||||
}
|
||||
|
||||
ThreadFuture<int64_t> ThreadSafeTransaction::getApproximateSize() {
|
||||
ISingleThreadTransaction* tr = this->tr;
|
||||
return onMainThread([tr]() -> Future<int64_t> { return tr->getApproximateSize(); });
|
||||
|
|
|
@ -59,6 +59,9 @@ public:
|
|||
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
|
||||
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
|
||||
|
||||
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
|
||||
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
|
||||
|
||||
ThreadFuture<DatabaseSharedState*> createSharedState() override;
|
||||
void setSharedState(DatabaseSharedState* p) override;
|
||||
|
||||
|
@ -163,6 +166,8 @@ public:
|
|||
|
||||
ThreadFuture<Void> commit() override;
|
||||
Version getCommittedVersion() override;
|
||||
VersionVector getVersionVector() override;
|
||||
UID getSpanID() override;
|
||||
ThreadFuture<int64_t> getApproximateSize() override;
|
||||
|
||||
ThreadFuture<uint64_t> getProtocolVersion();
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
* VersionVector.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FDBCLIENT_VERSION_VECTOR_H
|
||||
#define FDBCLIENT_VERSION_VECTOR_H
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <boost/container/flat_map.hpp>
|
||||
#include <set>
|
||||
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/Knobs.h"
|
||||
|
||||
struct VersionVector {
|
||||
boost::container::flat_map<Tag, Version> versions;
|
||||
Version maxVersion; // Specifies the max version in this version vector. (Note:
|
||||
// there may or may not be a corresponding entry for this
|
||||
// version in the "versions" map.)
|
||||
|
||||
VersionVector() : maxVersion(invalidVersion) {}
|
||||
VersionVector(Version version) : maxVersion(version) {}
|
||||
|
||||
private:
|
||||
// Only invoked by getDelta() and applyDelta(), where tag has been validated
|
||||
// and version is guaranteed to be larger than the existing value.
|
||||
inline void setVersionNoCheck(const Tag& tag, Version version) { versions[tag] = version; }
|
||||
|
||||
public:
|
||||
Version getMaxVersion() const { return maxVersion; }
|
||||
|
||||
int size() const { return versions.size(); }
|
||||
|
||||
void setVersion(const Tag& tag, Version version) {
|
||||
ASSERT(tag != invalidTag);
|
||||
ASSERT(version > maxVersion);
|
||||
versions[tag] = version;
|
||||
maxVersion = version;
|
||||
}
|
||||
|
||||
void setVersion(const std::set<Tag>& tags, Version version) {
|
||||
ASSERT(version > maxVersion);
|
||||
for (auto& tag : tags) {
|
||||
ASSERT(tag != invalidTag);
|
||||
versions[tag] = version;
|
||||
}
|
||||
maxVersion = version;
|
||||
}
|
||||
|
||||
bool hasVersion(const Tag& tag) const {
|
||||
ASSERT(tag != invalidTag);
|
||||
return versions.find(tag) != versions.end();
|
||||
}
|
||||
|
||||
// @pre assumes that the given tag has an entry in the version vector.
|
||||
Version getVersion(const Tag& tag) const {
|
||||
ASSERT(tag != invalidTag);
|
||||
auto iter = versions.find(tag);
|
||||
ASSERT(iter != versions.end());
|
||||
return iter->second;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
versions.clear();
|
||||
maxVersion = invalidVersion;
|
||||
}
|
||||
|
||||
// @note this method, together with method applyDelta(), helps minimize
|
||||
// the number of version vector entries that get sent from sequencer to
|
||||
// grv proxy (and from grv proxy to client) on the read path.
|
||||
void getDelta(Version refVersion, VersionVector& delta) const {
|
||||
ASSERT(refVersion <= maxVersion);
|
||||
|
||||
delta.clear();
|
||||
|
||||
if (refVersion == maxVersion) {
|
||||
return; // return an invalid version vector
|
||||
}
|
||||
|
||||
if (CLIENT_KNOBS->SEND_ENTIRE_VERSION_VECTOR) {
|
||||
delta = *this;
|
||||
} else {
|
||||
for (const auto& [tag, version] : versions) {
|
||||
if (version > refVersion) {
|
||||
delta.setVersionNoCheck(tag, version);
|
||||
}
|
||||
}
|
||||
delta.maxVersion = maxVersion;
|
||||
}
|
||||
}
|
||||
|
||||
// @note this method, together with method getDelta(), helps minimize
|
||||
// the number of version vector entries that get sent from sequencer to
|
||||
// grv proxy (and from grv proxy to client) on the read path.
|
||||
void applyDelta(const VersionVector& delta) {
|
||||
if (delta.maxVersion == invalidVersion) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (maxVersion >= delta.maxVersion) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (CLIENT_KNOBS->SEND_ENTIRE_VERSION_VECTOR) {
|
||||
*this = delta;
|
||||
} else {
|
||||
for (const auto& [tag, version] : delta.versions) {
|
||||
if (version > maxVersion) {
|
||||
setVersionNoCheck(tag, version);
|
||||
}
|
||||
}
|
||||
maxVersion = delta.maxVersion;
|
||||
}
|
||||
}
|
||||
|
||||
std::string toString() const {
|
||||
std::stringstream vector;
|
||||
vector << "[";
|
||||
for (const auto& [tag, version] : versions) {
|
||||
vector << '{' << tag.toString() << "," << version << '}';
|
||||
}
|
||||
vector << " maxversion: " << maxVersion << "]";
|
||||
return vector.str();
|
||||
}
|
||||
|
||||
bool operator==(const VersionVector& vv) const { return maxVersion == vv.maxVersion; }
|
||||
bool operator!=(const VersionVector& vv) const { return maxVersion != vv.maxVersion; }
|
||||
bool operator<(const VersionVector& vv) const { return maxVersion < vv.maxVersion; }
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, versions, maxVersion);
|
||||
}
|
||||
};
|
||||
|
||||
static const VersionVector minVersionVector{ 0 };
|
||||
static const VersionVector maxVersionVector{ MAX_VERSION };
|
||||
static const VersionVector invalidVersionVector{ invalidVersion };
|
||||
|
||||
#endif /* FDBCLIENT_VERSION_VECTOR_H */
|
|
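A short sketch of how getDelta() and applyDelta() compose on the read path, using only the struct above. Tag(locality, id) construction is assumed from the rest of fdbclient, and client knobs must be initialized for the SEND_ENTIRE_VERSION_VECTOR check.

#include "fdbclient/VersionVector.h"

void versionVectorDeltaSketch() {
	VersionVector sequencerVV;
	sequencerVV.setVersion(Tag(0, 1), 100);
	sequencerVV.setVersion(Tag(0, 2), 110);

	// The reader already knows everything up to version 100, so the delta only carries
	// entries newer than that.
	VersionVector delta;
	sequencerVV.getDelta(100, delta);

	VersionVector readerCache(100);
	readerCache.applyDelta(delta);
	ASSERT(readerCache.getMaxVersion() == 110);
}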
@ -330,6 +330,7 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
|
|||
AcknowledgementReceiver acknowledgements;
|
||||
Endpoint requestStreamEndpoint;
|
||||
bool sentError = false;
|
||||
bool notifiedFailed = false;
|
||||
Promise<Void> onConnect;
|
||||
|
||||
NetNotifiedQueueWithAcknowledgements(int futures, int promises)
|
||||
|
@ -408,14 +409,20 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
|
|||
return res;
|
||||
}
|
||||
|
||||
~NetNotifiedQueueWithAcknowledgements() {
|
||||
if (acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() && !this->hasError()) {
|
||||
void notifyFailed() {
|
||||
if (!notifiedFailed && acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() &&
|
||||
!this->hasError()) {
|
||||
// Notify the server that a client is not using this ReplyPromiseStream anymore
|
||||
FlowTransport::transport().sendUnreliable(
|
||||
SerializeSource<ErrorOr<AcknowledgementReply>>(operation_obsolete()),
|
||||
acknowledgements.getEndpoint(TaskPriority::ReadSocket),
|
||||
false);
|
||||
notifiedFailed = true;
|
||||
}
|
||||
}
|
||||
|
||||
~NetNotifiedQueueWithAcknowledgements() {
|
||||
notifyFailed();
|
||||
if (isRemoteEndpoint() && !sentError && !acknowledgements.failures.isReady()) {
|
||||
// Notify the client ReplyPromiseStream was cancelled before sending an error, so the storage server must
|
||||
// have died
|
||||
|
@ -511,6 +518,8 @@ public:
|
|||
return queue->onConnect.getFuture();
|
||||
}
|
||||
|
||||
void notifyFailed() { queue->notifyFailed(); }
|
||||
|
||||
~ReplyPromiseStream() {
|
||||
if (queue)
|
||||
queue->delPromiseRef();
|
||||
|
|
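The new notifyFailed() hook lets a caller tell the server that the stream is abandoned before the last local reference is dropped (this is what endStreamOnDisconnect does further down). A minimal sketch of that ordering, with an illustrative stream type:

void abandonStream(ReplyPromiseStream<FetchCheckpointKeyValuesStreamReply>& stream) {
	// Unreliably sends operation_obsolete to the server's acknowledgement endpoint first...
	stream.notifyFailed();
	// ...so the server can stop producing before the caller releases its reference to the stream.
}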
|
@ -80,14 +80,21 @@ Future<ErrorOr<REPLY_TYPE(Req)>> tryGetReplyFromHostname(RequestStream<Req>* to,
|
|||
// A wrapper of tryGetReply(request), except that the request is sent to an address resolved from a hostname.
|
||||
// If resolving fails, return lookup_failed().
|
||||
// Otherwise, return tryGetReply(request).
|
||||
try {
|
||||
wait(hostname.resolve());
|
||||
} catch (...) {
|
||||
Optional<NetworkAddress> address = wait(hostname.resolve());
|
||||
if (!address.present()) {
|
||||
return ErrorOr<REPLY_TYPE(Req)>(lookup_failed());
|
||||
}
|
||||
Optional<NetworkAddress> address = hostname.resolvedAddress;
|
||||
*to = RequestStream<Req>(Endpoint::wellKnown({ address.get() }, token));
|
||||
return to->tryGetReply(request);
|
||||
ErrorOr<REPLY_TYPE(Req)> reply = wait(to->tryGetReply(request));
|
||||
if (reply.isError()) {
|
||||
resetReply(request);
|
||||
if (reply.getError().code() == error_code_request_maybe_delivered) {
|
||||
// Connection failure.
|
||||
hostname.resetToUnresolved();
|
||||
INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service);
|
||||
}
|
||||
}
|
||||
return reply;
|
||||
}
|
||||
|
||||
ACTOR template <class Req>
|
||||
|
@ -99,14 +106,21 @@ Future<ErrorOr<REPLY_TYPE(Req)>> tryGetReplyFromHostname(RequestStream<Req>* to,
|
|||
// A wrapper of tryGetReply(request), except that the request is sent to an address resolved from a hostname.
|
||||
// If resolving fails, return lookup_failed().
|
||||
// Otherwise, return tryGetReply(request).
|
||||
try {
|
||||
wait(hostname.resolve());
|
||||
} catch (...) {
|
||||
Optional<NetworkAddress> address = wait(hostname.resolve());
|
||||
if (!address.present()) {
|
||||
return ErrorOr<REPLY_TYPE(Req)>(lookup_failed());
|
||||
}
|
||||
Optional<NetworkAddress> address = hostname.resolvedAddress;
|
||||
*to = RequestStream<Req>(Endpoint::wellKnown({ address.get() }, token));
|
||||
return to->tryGetReply(request, taskID);
|
||||
ErrorOr<REPLY_TYPE(Req)> reply = wait(to->tryGetReply(request, taskID));
|
||||
if (reply.isError()) {
|
||||
resetReply(request);
|
||||
if (reply.getError().code() == error_code_request_maybe_delivered) {
|
||||
// Connection failure.
|
||||
hostname.resetToUnresolved();
|
||||
INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service);
|
||||
}
|
||||
}
|
||||
return reply;
|
||||
}
|
||||
|
||||
ACTOR template <class Req>
|
||||
|
@ -118,9 +132,8 @@ Future<REPLY_TYPE(Req)> retryGetReplyFromHostname(RequestStream<Req>* to,
|
|||
// Suitable for use with hostname, where RequestStream is NOT initialized yet.
|
||||
// Not normally useful for endpoints initialized with NetworkAddress.
|
||||
loop {
|
||||
wait(hostname.resolveWithRetry());
|
||||
state Optional<NetworkAddress> address = hostname.resolvedAddress;
|
||||
*to = RequestStream<Req>(Endpoint::wellKnown({ address.get() }, token));
|
||||
NetworkAddress address = wait(hostname.resolveWithRetry());
|
||||
*to = RequestStream<Req>(Endpoint::wellKnown({ address }, token));
|
||||
ErrorOr<REPLY_TYPE(Req)> reply = wait(to->tryGetReply(request));
|
||||
if (reply.isError()) {
|
||||
resetReply(request);
|
||||
|
@ -147,9 +160,8 @@ Future<REPLY_TYPE(Req)> retryGetReplyFromHostname(RequestStream<Req>* to,
|
|||
// Suitable for use with hostname, where RequestStream is NOT initialized yet.
|
||||
// Not normally useful for endpoints initialized with NetworkAddress.
|
||||
loop {
|
||||
wait(hostname.resolveWithRetry());
|
||||
state Optional<NetworkAddress> address = hostname.resolvedAddress;
|
||||
*to = RequestStream<Req>(Endpoint::wellKnown({ address.get() }, token));
|
||||
NetworkAddress address = wait(hostname.resolveWithRetry());
|
||||
*to = RequestStream<Req>(Endpoint::wellKnown({ address }, token));
|
||||
ErrorOr<REPLY_TYPE(Req)> reply = wait(to->tryGetReply(request, taskID));
|
||||
if (reply.isError()) {
|
||||
resetReply(request);
|
||||
|
@ -321,6 +333,8 @@ void endStreamOnDisconnect(Future<Void> signal,
|
|||
wait(signal || stream.onConnected());
|
||||
}
|
||||
}
|
||||
// Notify BEFORE dropping last reference, causing broken_promise to send on stream before destructor is called
|
||||
stream.notifyFailed();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -27,6 +27,8 @@
|
|||
#include "fdbserver/IKeyValueStore.h"
|
||||
#include "fdbserver/LogProtocolMessage.h"
|
||||
#include "fdbserver/LogSystem.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/Trace.h"
|
||||
|
||||
Reference<StorageInfo> getStorageInfo(UID id,
|
||||
std::map<UID, Reference<StorageInfo>>* storageCache,
|
||||
|
@ -45,10 +47,6 @@ Reference<StorageInfo> getStorageInfo(UID id,
|
|||
}
|
||||
namespace {
|
||||
|
||||
inline bool isSystemKey(KeyRef key) {
|
||||
return key.size() && key[0] == systemKeys.begin[0];
|
||||
}
|
||||
|
||||
// It is incredibly important that any modifications to txnStateStore are done in such a way that the same operations
|
||||
// will be done on all commit proxies at the same time. Otherwise, the data stored in txnStateStore will become
|
||||
// corrupted.
|
||||
|
@ -84,6 +82,15 @@ public:
|
|||
tssMapping(&proxyCommitData_.tssMapping), tenantMap(&proxyCommitData_.tenantMap),
|
||||
initialCommit(initialCommit_) {}
|
||||
|
||||
ApplyMetadataMutationsImpl(const SpanID& spanContext_,
|
||||
ResolverData& resolverData_,
|
||||
const VectorRef<MutationRef>& mutations_)
|
||||
: spanContext(spanContext_), dbgid(resolverData_.dbgid), arena(resolverData_.arena), mutations(mutations_),
|
||||
txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit),
|
||||
confChange(resolverData_.confChanges), logSystem(resolverData_.logSystem), popVersion(resolverData_.popVersion),
|
||||
keyInfo(resolverData_.keyInfo), storageCache(resolverData_.storageCache),
|
||||
initialCommit(resolverData_.initialCommit), forResolver(true) {}
|
||||
|
||||
private:
|
||||
// The following variables are incoming parameters
|
||||
|
||||
|
@ -123,6 +130,9 @@ private:
|
|||
// true if the mutations were already written to the txnStateStore as part of recovery
|
||||
bool initialCommit = false;
|
||||
|
||||
// true if called from Resolver
|
||||
bool forResolver = false;
|
||||
|
||||
private:
|
||||
// The following variables are used internally
|
||||
|
||||
|
@ -188,12 +198,16 @@ private:
|
|||
}
|
||||
uniquify(info.tags);
|
||||
keyInfo->insert(insertRange, info);
|
||||
if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
toCommit->setShardChanged();
|
||||
}
|
||||
}
|
||||
|
||||
void checkSetServerKeysPrefix(MutationRef m) {
|
||||
if (!m.param1.startsWith(serverKeysPrefix)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (toCommit) {
|
||||
Tag tag = decodeServerTagValue(
|
||||
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get());
|
||||
|
@ -215,6 +229,7 @@ private:
|
|||
if (!m.param1.startsWith(serverTagPrefix)) {
|
||||
return;
|
||||
}
|
||||
|
||||
UID id = decodeServerTagKey(m.param1);
|
||||
Tag tag = decodeServerTagValue(m.param2);
|
||||
|
||||
|
@ -223,8 +238,10 @@ private:
|
|||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent("ServerTag", dbgid).detail("Server", id).detail("Tag", tag.toString());
|
||||
|
||||
TraceEvent(SevDebug, "SendingPrivatized_ServerTag", dbgid).detail("M", "LogProtocolMessage");
|
||||
toCommit->addTag(tag);
|
||||
toCommit->writeTypedMessage(LogProtocolMessage(), true);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_ServerTag", dbgid).detail("M", privatized);
|
||||
toCommit->addTag(tag);
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -254,7 +271,7 @@ private:
|
|||
void checkSetStorageCachePrefix(MutationRef m) {
|
||||
if (!m.param1.startsWith(storageCachePrefix))
|
||||
return;
|
||||
if (cacheInfo) {
|
||||
if (cacheInfo || forResolver) {
|
||||
KeyRef k = m.param1.removePrefix(storageCachePrefix);
|
||||
|
||||
// Create a private mutation for storage servers
|
||||
|
@ -265,7 +282,7 @@ private:
|
|||
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
|
||||
cachedRangeInfo[k] = privatized;
|
||||
}
|
||||
if (k != allKeys.end) {
|
||||
if (cacheInfo && k != allKeys.end) {
|
||||
KeyRef end = cacheInfo->rangeContaining(k).end();
|
||||
std::vector<uint16_t> serverIndices;
|
||||
decodeStorageCacheValue(m.param2, serverIndices);
|
||||
|
@ -280,11 +297,11 @@ private:
|
|||
if (!m.param1.startsWith(cacheKeysPrefix) || toCommit == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Create a private mutation for cache servers
|
||||
// This is done to make the cache servers aware of the cached key-ranges
|
||||
MutationRef privatized = m;
|
||||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_CacheTag", dbgid).detail("M", privatized);
|
||||
toCommit->addTag(cacheTag);
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -336,6 +353,7 @@ private:
|
|||
}
|
||||
toCommit->addTags(allSources);
|
||||
}
|
||||
TraceEvent(SevDebug, "SendingPrivatized_ChangeFeed", dbgid).detail("M", privatized);
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
}
|
||||
|
@ -370,6 +388,7 @@ private:
|
|||
if (!m.param1.startsWith(tssMappingKeys.begin)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Normally uses key backed map, so have to use same unpacking code here.
|
||||
UID ssId = Codec<UID>::unpack(Tuple::unpack(m.param1.removePrefix(tssMappingKeys.begin)));
|
||||
UID tssId = Codec<UID>::unpack(Tuple::unpack(m.param2));
|
||||
|
@ -387,6 +406,7 @@ private:
|
|||
|
||||
Optional<Value> tagV = txnStateStore->readValue(serverTagKeyFor(ssId)).get();
|
||||
if (tagV.present()) {
|
||||
TraceEvent(SevDebug, "SendingPrivatized_TSSID", dbgid).detail("M", privatized);
|
||||
toCommit->addTag(decodeServerTagValue(tagV.get()));
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -415,6 +435,7 @@ private:
|
|||
if (tagV.present()) {
|
||||
MutationRef privatized = m;
|
||||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_TSSQuarantine", dbgid).detail("M", privatized);
|
||||
toCommit->addTag(decodeServerTagValue(tagV.get()));
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -510,7 +531,6 @@ private:
|
|||
if (!toCommit) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Notifies all servers that a Master's server epoch ends
|
||||
auto allServers = txnStateStore->readRange(serverTagKeys).get();
|
||||
std::set<Tag> allTags;
|
||||
|
@ -533,10 +553,12 @@ private:
|
|||
if (m.param1 == lastEpochEndKey) {
|
||||
toCommit->addTags(allTags);
|
||||
toCommit->writeTypedMessage(LogProtocolMessage(), true);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_GlobalKeys", dbgid).detail("M", "LogProtocolMessage");
|
||||
}
|
||||
|
||||
MutationRef privatized = m;
|
||||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_GlobalKeys", dbgid).detail("M", privatized);
|
||||
toCommit->addTags(allTags);
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -590,6 +612,18 @@ private:
|
|||
TEST(true); // Recovering at a higher version.
|
||||
}
|
||||
|
||||
void checkSetVersionEpochKey(MutationRef m) {
|
||||
if (m.param1 != versionEpochKey) {
|
||||
return;
|
||||
}
|
||||
int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(m.param2, Unversioned());
|
||||
TraceEvent("VersionEpoch", dbgid).detail("Epoch", versionEpoch);
|
||||
if (!initialCommit)
|
||||
txnStateStore->set(KeyValueRef(m.param1, m.param2));
|
||||
confChange = true;
|
||||
TEST(true); // Setting version epoch
|
||||
}
|
||||
|
||||
void checkSetWriteRecoverKey(MutationRef m) {
|
||||
if (m.param1 != writeRecoveryKey) {
|
||||
return;
|
||||
|
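checkSetVersionEpochKey() above reads the value as an Unversioned int64_t of microseconds since the Unix epoch. A hypothetical writer-side helper (not part of this diff) would mirror that encoding:

Value versionEpochValueFor(int64_t epochMicros) {
	// Must match the BinaryReader::fromStringRef<int64_t>(..., Unversioned()) read above.
	return BinaryWriter::toValue(epochMicros, Unversioned());
}
// hypothetical usage: tr.set(versionEpochKey, versionEpochValueFor(epochMicros));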
@ -646,6 +680,9 @@ private:
|
|||
clearRange.begin == StringRef()
|
||||
? ServerCacheInfo()
|
||||
: keyInfo->rangeContainingKeyBefore(clearRange.begin).value());
|
||||
if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
toCommit->setShardChanged();
|
||||
}
|
||||
}
|
||||
|
||||
if (!initialCommit)
|
||||
|
@ -687,10 +724,7 @@ private:
|
|||
}
|
||||
|
||||
void checkClearTagLocalityListKeys(KeyRangeRef range) {
|
||||
if (!tagLocalityListKeys.intersects(range)) {
|
||||
return;
|
||||
}
|
||||
if (initialCommit) {
|
||||
if (!tagLocalityListKeys.intersects(range) || initialCommit) {
|
||||
return;
|
||||
}
|
||||
txnStateStore->clear(range & tagLocalityListKeys);
|
||||
|
@ -700,7 +734,6 @@ private:
|
|||
if (!serverTagKeys.intersects(range)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Storage server removal always happens in a separate version from any prior writes (or any subsequent
|
||||
// reuse of the tag) so we can safely destroy the tag here without any concern about intra-batch
|
||||
// ordering
|
||||
|
@ -713,14 +746,19 @@ private:
|
|||
.detail("PopVersion", popVersion)
|
||||
.detail("Tag", tag.toString())
|
||||
.detail("Server", decodeServerTagKey(kv.key));
|
||||
if (!forResolver) {
|
||||
logSystem->pop(popVersion, decodeServerTagValue(kv.value));
|
||||
(*tag_popped)[tag] = popVersion;
|
||||
}
|
||||
ASSERT_WE_THINK(forResolver ^ (tag_popped != nullptr));
|
||||
|
||||
if (toCommit) {
|
||||
MutationRef privatized = m;
|
||||
privatized.param1 = kv.key.withPrefix(systemKeys.begin, arena);
|
||||
privatized.param2 = keyAfter(kv.key, arena).withPrefix(systemKeys.begin, arena);
|
||||
|
||||
TraceEvent(SevDebug, "SendingPrivatized_ClearServerTag", dbgid).detail("M", privatized);
|
||||
|
||||
toCommit->addTag(decodeServerTagValue(kv.value));
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -743,6 +781,8 @@ private:
|
|||
privatized.param2 =
|
||||
keyAfter(maybeTssRange.begin, arena).withPrefix(systemKeys.begin, arena);
|
||||
|
||||
TraceEvent(SevDebug, "SendingPrivatized_TSSClearServerTag", dbgid)
|
||||
.detail("M", privatized);
|
||||
toCommit->addTag(decodeServerTagValue(tagV.get()));
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -776,9 +816,12 @@ private:
|
|||
.detail("PopVersion", popVersion)
|
||||
.detail("Tag", tag.toString())
|
||||
.detail("Version", decodeServerTagHistoryKey(kv.key));
|
||||
if (!forResolver) {
|
||||
logSystem->pop(popVersion, tag);
|
||||
(*tag_popped)[tag] = popVersion;
|
||||
}
|
||||
ASSERT_WE_THINK(forResolver ^ (tag_popped != nullptr));
|
||||
}
|
||||
}
|
||||
if (!initialCommit)
|
||||
txnStateStore->clear(range & serverTagHistoryKeys);
|
||||
|
@ -924,6 +967,7 @@ private:
|
|||
MutationRef privatized = m;
|
||||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
privatized.param2 = m.param2.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_ClearTSSMapping", dbgid).detail("M", privatized);
|
||||
toCommit->addTag(decodeServerTagValue(tagV.get()));
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -950,6 +994,7 @@ private:
|
|||
MutationRef privatized = m;
|
||||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
privatized.param2 = m.param2.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent(SevDebug, "SendingPrivatized_ClearTSSQuarantine", dbgid).detail("M", privatized);
|
||||
toCommit->addTag(decodeServerTagValue(tagV.get()));
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
@ -957,6 +1002,16 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
void checkClearVersionEpochKeys(MutationRef m, KeyRangeRef range) {
|
||||
if (!range.contains(versionEpochKey)) {
|
||||
return;
|
||||
}
|
||||
if (!initialCommit)
|
||||
txnStateStore->clear(singleKeyRange(versionEpochKey));
|
||||
TraceEvent("MutationRequiresRestart", dbgid).detail("M", m);
|
||||
confChange = true;
|
||||
}
|
||||
|
||||
void checkClearTenantMapPrefix(KeyRangeRef range) {
|
||||
if (tenantMapKeys.intersects(range)) {
|
||||
if (tenantMap) {
|
||||
|
@ -1087,6 +1142,9 @@ private:
|
|||
}
|
||||
|
||||
// Add the tags to both begin and end mutations
|
||||
TraceEvent(SevDebug, "SendingPrivatized_CachedKeyRange", dbgid)
|
||||
.detail("MBegin", mutationBegin)
|
||||
.detail("MEnd", mutationEnd);
|
||||
toCommit->addTags(allTags);
|
||||
toCommit->writeTypedMessage(mutationBegin);
|
||||
toCommit->addTags(allTags);
|
||||
|
@ -1119,6 +1177,7 @@ public:
|
|||
checkSetGlobalKeys(m);
|
||||
checkSetWriteRecoverKey(m);
|
||||
checkSetMinRequiredCommitVersionKey(m);
|
||||
checkSetVersionEpochKey(m);
|
||||
checkSetTenantMapPrefix(m);
|
||||
checkSetOtherKeys(m);
|
||||
} else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) {
|
||||
|
@ -1135,6 +1194,7 @@ public:
|
|||
checkClearLogRangesRange(range);
|
||||
checkClearTssMappingKeys(m, range);
|
||||
checkClearTssQuarantineKeys(m, range);
|
||||
checkClearVersionEpochKeys(m, range);
|
||||
checkClearTenantMapPrefix(range);
|
||||
checkClearMiscRangeKeys(range);
|
||||
}
|
||||
|
@ -1181,6 +1241,12 @@ void applyMetadataMutations(SpanID const& spanContext,
|
|||
.apply();
|
||||
}
|
||||
|
||||
void applyMetadataMutations(SpanID const& spanContext,
|
||||
ResolverData& resolverData,
|
||||
const VectorRef<MutationRef>& mutations) {
|
||||
ApplyMetadataMutationsImpl(spanContext, resolverData, mutations).apply();
|
||||
}
|
||||
|
||||
void applyMetadataMutations(SpanID const& spanContext,
|
||||
const UID& dbgid,
|
||||
Arena& arena,
|
||||
|
|
|
@ -22,21 +22,59 @@
|
|||
#define FDBSERVER_APPLYMETADATAMUTATION_H
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#include "fdbclient/BackupAgent.actor.h"
|
||||
#include "fdbclient/MutationList.h"
|
||||
#include "fdbclient/Notified.h"
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbserver/IKeyValueStore.h"
|
||||
#include "fdbserver/LogProtocolMessage.h"
|
||||
#include "fdbserver/LogSystem.h"
|
||||
#include "fdbserver/ProxyCommitData.actor.h"
|
||||
#include "flow/FastRef.h"
|
||||
|
||||
// Resolver's data for applyMetadataMutations() calls.
|
||||
struct ResolverData {
|
||||
const UID dbgid;
|
||||
IKeyValueStore* txnStateStore = nullptr;
|
||||
KeyRangeMap<ServerCacheInfo>* keyInfo = nullptr;
|
||||
Arena arena;
|
||||
// Whether configuration changes. If so, a recovery is forced.
|
||||
bool& confChanges;
|
||||
bool initialCommit = false;
|
||||
Reference<ILogSystem> logSystem = Reference<ILogSystem>();
|
||||
LogPushData* toCommit = nullptr;
|
||||
Version popVersion = 0; // exclusive, usually set to commitVersion + 1
|
||||
std::map<UID, Reference<StorageInfo>>* storageCache = nullptr;
|
||||
std::unordered_map<UID, StorageServerInterface>* tssMapping = nullptr;
|
||||
|
||||
// For initial broadcast
|
||||
ResolverData(UID debugId, IKeyValueStore* store, KeyRangeMap<ServerCacheInfo>* info, bool& forceRecovery)
|
||||
: dbgid(debugId), txnStateStore(store), keyInfo(info), confChanges(forceRecovery), initialCommit(true) {}
|
||||
|
||||
// For transaction batches that contain metadata mutations
|
||||
ResolverData(UID debugId,
|
||||
Reference<ILogSystem> logSystem,
|
||||
IKeyValueStore* store,
|
||||
KeyRangeMap<ServerCacheInfo>* info,
|
||||
LogPushData* toCommit,
|
||||
bool& forceRecovery,
|
||||
Version popVersion,
|
||||
std::map<UID, Reference<StorageInfo>>* storageCache,
|
||||
std::unordered_map<UID, StorageServerInterface>* tssMapping)
|
||||
: dbgid(debugId), txnStateStore(store), keyInfo(info), confChanges(forceRecovery), logSystem(logSystem),
|
||||
toCommit(toCommit), popVersion(popVersion), storageCache(storageCache), tssMapping(tssMapping) {}
|
||||
};
|
||||
|
||||
inline bool isMetadataMutation(MutationRef const& m) {
|
||||
// FIXME: This is conservative - not everything in system keyspace is necessarily processed by
|
||||
// applyMetadataMutations
|
||||
if (m.type == MutationRef::SetValue) {
|
||||
return m.param1.size() && m.param1[0] == systemKeys.begin[0] &&
|
||||
!m.param1.startsWith(nonMetadataSystemKeys.begin);
|
||||
return (m.param1.size() && m.param1[0] == systemKeys.begin[0] &&
|
||||
!m.param1.startsWith(nonMetadataSystemKeys.begin)) ||
|
||||
m.param1.startsWith(changeFeedPrefix);
|
||||
} else if (m.type == MutationRef::ClearRange) {
|
||||
return m.param2.size() > 1 && m.param2[0] == systemKeys.begin[0] &&
|
||||
!nonMetadataSystemKeys.contains(KeyRangeRef(m.param1, m.param2));
|
||||
|
@ -65,4 +103,45 @@ void applyMetadataMutations(SpanID const& spanContext,
|
|||
const VectorRef<MutationRef>& mutations,
|
||||
IKeyValueStore* txnStateStore);
|
||||
|
||||
inline bool isSystemKey(KeyRef key) {
|
||||
return key.size() && key[0] == systemKeys.begin[0];
|
||||
}
|
||||
|
||||
inline bool containsMetadataMutation(const VectorRef<MutationRef>& mutations) {
|
||||
for (auto const& m : mutations) {
|
||||
|
||||
if (m.type == MutationRef::SetValue && isSystemKey(m.param1)) {
|
||||
if (m.param1.startsWith(globalKeysPrefix) || (m.param1.startsWith(cacheKeysPrefix)) ||
|
||||
(m.param1.startsWith(configKeysPrefix)) || (m.param1.startsWith(serverListPrefix)) ||
|
||||
(m.param1.startsWith(storageCachePrefix)) || (m.param1.startsWith(serverTagPrefix)) ||
|
||||
(m.param1.startsWith(tssMappingKeys.begin)) || (m.param1.startsWith(tssQuarantineKeys.begin)) ||
|
||||
(m.param1.startsWith(applyMutationsEndRange.begin)) ||
|
||||
(m.param1.startsWith(applyMutationsKeyVersionMapRange.begin)) ||
|
||||
(m.param1.startsWith(logRangesRange.begin)) || (m.param1.startsWith(serverKeysPrefix)) ||
|
||||
(m.param1.startsWith(keyServersPrefix)) || (m.param1.startsWith(cacheKeysPrefix))) {
|
||||
return true;
|
||||
}
|
||||
} else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) {
|
||||
KeyRangeRef range(m.param1, m.param2);
|
||||
if ((keyServersKeys.intersects(range)) || (configKeys.intersects(range)) ||
|
||||
(serverListKeys.intersects(range)) || (tagLocalityListKeys.intersects(range)) ||
|
||||
(serverTagKeys.intersects(range)) || (serverTagHistoryKeys.intersects(range)) ||
|
||||
(range.intersects(applyMutationsEndRange)) || (range.intersects(applyMutationsKeyVersionMapRange)) ||
|
||||
(range.intersects(logRangesRange)) || (tssMappingKeys.intersects(range)) ||
|
||||
(tssQuarantineKeys.intersects(range)) || (range.contains(coordinatorsKey)) ||
|
||||
(range.contains(databaseLockedKey)) || (range.contains(metadataVersionKey)) ||
|
||||
(range.contains(mustContainSystemMutationsKey)) || (range.contains(writeRecoveryKey)) ||
|
||||
(range.intersects(testOnlyTxnStateStorePrefixRange))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Resolver's version
|
||||
void applyMetadataMutations(SpanID const& spanContext,
|
||||
ResolverData& resolverData,
|
||||
const VectorRef<MutationRef>& mutations);
|
||||
|
||||
#endif
|
||||
|
|
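Putting the new declarations in this header together, a resolver-side caller could pre-filter with containsMetadataMutation() and then use the initial-broadcast ResolverData constructor. This is a sketch under the assumption that the caller already owns the txnStateStore, keyInfo map, and forceRecovery flag:

void applyResolverMetadata(SpanID spanContext,
                           UID dbgid,
                           IKeyValueStore* txnStateStore,
                           KeyRangeMap<ServerCacheInfo>* keyInfo,
                           bool& forceRecovery,
                           const VectorRef<MutationRef>& mutations) {
	// Cheap pre-filter: skip the pass entirely when the batch touches no metadata keys.
	if (!containsMetadataMutation(mutations)) {
		return;
	}
	ResolverData resolverData(dbgid, txnStateStore, keyInfo, forceRecovery);
	applyMetadataMutations(spanContext, resolverData, mutations);
}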
|
@ -431,6 +431,7 @@ struct BackupData {
|
|||
GetReadVersionRequest request(span.context,
|
||||
0,
|
||||
TransactionPriority::DEFAULT,
|
||||
invalidVersion,
|
||||
GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
|
||||
choose {
|
||||
when(wait(self->cx->onProxiesChanged())) {}
|
||||
|
@ -439,6 +440,7 @@ struct BackupData {
|
|||
&GrvProxyInterface::getConsistentReadVersion,
|
||||
request,
|
||||
self->cx->taskID))) {
|
||||
self->cx->ssVersionVectorCache.applyDelta(reply.ssVersionVectorDelta);
|
||||
return reply.version;
|
||||
}
|
||||
}
|
||||
|
@ -744,7 +746,6 @@ ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int
|
|||
continue;
|
||||
|
||||
DEBUG_MUTATION("addMutation", message.version.version, m)
|
||||
.detail("Version", message.version.toString())
|
||||
.detail("KCV", self->minKnownCommittedVersion)
|
||||
.detail("SavedVersion", self->savedVersion);
|
||||
|
||||
|
|
|
@ -216,7 +216,7 @@ struct SplitEvaluation {
|
|||
struct BlobManagerStats {
|
||||
CounterCollection cc;
|
||||
|
||||
// FIXME: pruning stats
|
||||
// FIXME: purging stats
|
||||
|
||||
Counter granuleSplits;
|
||||
Counter granuleWriteHotSplits;
|
||||
|
@ -226,6 +226,10 @@ struct BlobManagerStats {
|
|||
Counter ccMismatches;
|
||||
Counter ccTimeouts;
|
||||
Counter ccErrors;
|
||||
Counter purgesProcessed;
|
||||
Counter granulesFullyPurged;
|
||||
Counter granulesPartiallyPurged;
|
||||
Counter filesPurged;
|
||||
Future<Void> logger;
|
||||
|
||||
// Current stats maintained for a given blob worker process
|
||||
|
@ -233,7 +237,9 @@ struct BlobManagerStats {
|
|||
: cc("BlobManagerStats", id.toString()), granuleSplits("GranuleSplits", cc),
|
||||
granuleWriteHotSplits("GranuleWriteHotSplits", cc), ccGranulesChecked("CCGranulesChecked", cc),
|
||||
ccRowsChecked("CCRowsChecked", cc), ccBytesChecked("CCBytesChecked", cc), ccMismatches("CCMismatches", cc),
|
||||
ccTimeouts("CCTimeouts", cc), ccErrors("CCErrors", cc) {
|
||||
ccTimeouts("CCTimeouts", cc), ccErrors("CCErrors", cc), purgesProcessed("PurgesProcessed", cc),
|
||||
granulesFullyPurged("GranulesFullyPurged", cc), granulesPartiallyPurged("GranulesPartiallyPurged", cc),
|
||||
filesPurged("FilesPurged", cc) {
|
||||
specialCounter(cc, "WorkerCount", [workers]() { return workers->size(); });
|
||||
logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
|
||||
}
|
||||
|
@ -438,6 +444,7 @@ ACTOR Future<UID> pickWorkerForAssign(Reference<BlobManagerData> bmData) {
|
|||
ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
|
||||
RangeAssignment assignment,
|
||||
Optional<UID> workerID,
|
||||
int64_t epoch,
|
||||
int64_t seqNo) {
|
||||
// WorkerId is set, except in case of assigning to any worker. Then we pick the worker to assign to in here
|
||||
|
||||
|
@ -468,7 +475,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
|
|||
assignment.isAssign ? "assigning" : "revoking",
|
||||
assignment.keyRange.begin.printable(),
|
||||
assignment.keyRange.end.printable(),
|
||||
bmData->epoch,
|
||||
epoch,
|
||||
seqNo,
|
||||
workerID.get().toString());
|
||||
}
|
||||
|
@ -481,7 +488,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
|
|||
AssignBlobRangeRequest req;
|
||||
req.keyRange = KeyRangeRef(StringRef(req.arena, assignment.keyRange.begin),
|
||||
StringRef(req.arena, assignment.keyRange.end));
|
||||
req.managerEpoch = bmData->epoch;
|
||||
req.managerEpoch = epoch;
|
||||
req.managerSeqno = seqNo;
|
||||
req.type = assignment.assign.get().type;
|
||||
|
||||
|
@ -497,7 +504,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
|
|||
RevokeBlobRangeRequest req;
|
||||
req.keyRange = KeyRangeRef(StringRef(req.arena, assignment.keyRange.begin),
|
||||
StringRef(req.arena, assignment.keyRange.end));
|
||||
req.managerEpoch = bmData->epoch;
|
||||
req.managerEpoch = epoch;
|
||||
req.managerSeqno = seqNo;
|
||||
req.dispose = assignment.revoke.get().dispose;
|
||||
|
||||
|
@ -637,10 +644,10 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
|
|||
}
|
||||
count++;
|
||||
}
|
||||
ASSERT(count == 1);
|
||||
if (skip) {
|
||||
continue;
|
||||
}
|
||||
ASSERT(count == 1);
|
||||
|
||||
if (assignment.worker.present() && assignment.worker.get().isValid()) {
|
||||
if (BM_DEBUG) {
|
||||
|
@ -653,7 +660,7 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
|
|||
|
||||
bmData->workerAssignments.insert(assignment.keyRange, workerId);
|
||||
bmData->assignsInProgress.insert(assignment.keyRange,
|
||||
doRangeAssignment(bmData, assignment, workerId, seqNo));
|
||||
doRangeAssignment(bmData, assignment, workerId, bmData->epoch, seqNo));
|
||||
// If we know about the worker and this is not a continue, then this is a new range for the worker
|
||||
if (bmData->workerStats.count(workerId) &&
|
||||
assignment.assign.get().type != AssignRequestType::Continue) {
|
||||
|
@ -662,8 +669,8 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
|
|||
} else {
|
||||
// Ensure the key boundaries are updated before we pick a worker
|
||||
bmData->workerAssignments.insert(assignment.keyRange, UID());
|
||||
bmData->assignsInProgress.insert(assignment.keyRange,
|
||||
doRangeAssignment(bmData, assignment, Optional<UID>(), seqNo));
|
||||
bmData->assignsInProgress.insert(
|
||||
assignment.keyRange, doRangeAssignment(bmData, assignment, Optional<UID>(), bmData->epoch, seqNo));
|
||||
}
|
||||
|
||||
} else {
|
||||
|
@ -677,7 +684,8 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
|
|||
if (existingRange.range() == assignment.keyRange && existingRange.cvalue() == assignment.worker.get()) {
|
||||
bmData->workerAssignments.insert(assignment.keyRange, UID());
|
||||
}
|
||||
bmData->addActor.send(doRangeAssignment(bmData, assignment, assignment.worker.get(), seqNo));
|
||||
bmData->addActor.send(
|
||||
doRangeAssignment(bmData, assignment, assignment.worker.get(), bmData->epoch, seqNo));
|
||||
} else {
|
||||
auto currentAssignments = bmData->workerAssignments.intersectingRanges(assignment.keyRange);
|
||||
for (auto& it : currentAssignments) {
|
||||
|
@ -693,7 +701,7 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
|
|||
}
|
||||
|
||||
// revoke the range for the worker that owns it, not the worker specified in the revoke
|
||||
bmData->addActor.send(doRangeAssignment(bmData, assignment, it.value(), seqNo));
|
||||
bmData->addActor.send(doRangeAssignment(bmData, assignment, it.value(), bmData->epoch, seqNo));
|
||||
}
|
||||
bmData->workerAssignments.insert(assignment.keyRange, UID());
|
||||
}
|
||||
|
@ -1356,26 +1364,6 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
// back is to split the range.
|
||||
ASSERT(rep.doSplit);
|
||||
|
||||
// only evaluate for split if this worker currently owns the granule in this blob manager's mapping
|
||||
auto currGranuleAssignment = bmData->workerAssignments.rangeContaining(rep.granuleRange.begin);
|
||||
if (!(currGranuleAssignment.begin() == rep.granuleRange.begin &&
|
||||
currGranuleAssignment.end() == rep.granuleRange.end &&
|
||||
currGranuleAssignment.cvalue() == bwInterf.id())) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Manager {0} ignoring status from BW {1} for granule [{2} - {3}) since BW {4} owns "
|
||||
"[{5} - {6}).\n",
|
||||
bmData->epoch,
|
||||
bwInterf.id().toString().substr(0, 5),
|
||||
rep.granuleRange.begin.printable(),
|
||||
rep.granuleRange.end.printable(),
|
||||
currGranuleAssignment.cvalue().toString().substr(0, 5),
|
||||
currGranuleAssignment.begin().printable(),
|
||||
currGranuleAssignment.end().printable());
|
||||
}
|
||||
// FIXME: could send revoke request
|
||||
continue;
|
||||
}
|
||||
|
||||
// FIXME: We will need to go over all splits in the range once we're doing merges, instead of first one
|
||||
auto lastSplitEval = bmData->splitEvaluations.rangeContaining(rep.granuleRange.begin);
|
||||
if (rep.granuleRange.begin == lastSplitEval.begin() && rep.granuleRange.end == lastSplitEval.end() &&
|
||||
|
@ -1386,10 +1374,36 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
rep.granuleRange.begin.printable(),
|
||||
rep.granuleRange.end.printable());
|
||||
}
|
||||
} else {
|
||||
ASSERT(lastSplitEval.cvalue().epoch < rep.epoch ||
|
||||
(lastSplitEval.cvalue().epoch == rep.epoch && lastSplitEval.cvalue().seqno < rep.seqno));
|
||||
if (lastSplitEval.cvalue().inProgress.isValid() && !lastSplitEval.cvalue().inProgress.isReady()) {
|
||||
} else if (!(lastSplitEval.cvalue().epoch < rep.epoch ||
|
||||
(lastSplitEval.cvalue().epoch == rep.epoch && lastSplitEval.cvalue().seqno < rep.seqno))) {
|
||||
TEST(true); // BM got out-of-date split request
|
||||
if (BM_DEBUG) {
|
||||
fmt::print(
|
||||
"Manager {0} ignoring status from BW {1} for granule [{2} - {3}) since it already processed"
|
||||
"[{4} - {5}) @ ({6}, {7}).\n",
|
||||
bmData->epoch,
|
||||
bwInterf.id().toString().substr(0, 5),
|
||||
rep.granuleRange.begin.printable(),
|
||||
rep.granuleRange.end.printable(),
|
||||
lastSplitEval.begin().printable(),
|
||||
lastSplitEval.end().printable(),
|
||||
lastSplitEval.cvalue().epoch,
|
||||
lastSplitEval.cvalue().seqno);
|
||||
}
|
||||
|
||||
// revoke range from out-of-date worker, but bypass rangeAssigner and hack (epoch, seqno) to be
|
||||
// (requesting epoch, requesting seqno + 1) to ensure no race with then reassigning the range to the
|
||||
// worker at a later version
|
||||
RangeAssignment revokeOld;
|
||||
revokeOld.isAssign = false;
|
||||
revokeOld.worker = bwInterf.id();
|
||||
revokeOld.keyRange = rep.granuleRange;
|
||||
revokeOld.revoke = RangeRevokeData(false);
|
||||
|
||||
bmData->addActor.send(
|
||||
doRangeAssignment(bmData, revokeOld, bwInterf.id(), rep.epoch, rep.seqno + 1));
|
||||
} else if (lastSplitEval.cvalue().inProgress.isValid() &&
|
||||
!lastSplitEval.cvalue().inProgress.isReady()) {
|
||||
TEST(true); // racing BM splits
|
||||
// For example, one worker asked BM to split, then died, granule was moved, new worker asks to
|
||||
// split on recovery. We need to ensure that they are semantically the same split.
|
||||
|
@ -1417,17 +1431,12 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
rep.epoch,
|
||||
rep.seqno);
|
||||
}
|
||||
Future<Void> doSplitEval = maybeSplitRange(bmData,
|
||||
bwInterf.id(),
|
||||
rep.granuleRange,
|
||||
rep.granuleID,
|
||||
rep.startVersion,
|
||||
rep.writeHotSplit);
|
||||
Future<Void> doSplitEval = maybeSplitRange(
|
||||
bmData, bwInterf.id(), rep.granuleRange, rep.granuleID, rep.startVersion, rep.writeHotSplit);
|
||||
bmData->splitEvaluations.insert(rep.granuleRange,
|
||||
SplitEvaluation(rep.epoch, rep.seqno, doSplitEval));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_operation_cancelled) {
|
||||
throw e;
|
||||
|
@ -2160,23 +2169,84 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Reference<BlobManagerData> bmData, U
|
|||
}
|
||||
}
|
||||
|
||||
// FIXME: trace events for pruning
|
||||
// FIXME: trace events for purging
|
||||
|
||||
ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId) {
|
||||
state Transaction tr(self->db);
|
||||
state KeyRange splitRange = blobGranuleSplitKeyRangeFor(granuleId);
|
||||
|
||||
loop {
|
||||
try {
|
||||
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
state RangeResult splitState = wait(tr.getRange(splitRange, SERVER_KNOBS->BG_MAX_SPLIT_FANOUT));
|
||||
state int i = 0;
|
||||
state bool retry = false;
|
||||
for (; i < splitState.size(); i++) {
|
||||
UID parent, child;
|
||||
BlobGranuleSplitState st;
|
||||
Version v;
|
||||
std::tie(parent, child) = decodeBlobGranuleSplitKey(splitState[i].key);
|
||||
std::tie(st, v) = decodeBlobGranuleSplitValue(splitState[i].value);
|
||||
// if split state is done, this granule has definitely persisted a snapshot
|
||||
if (st >= BlobGranuleSplitState::Done) {
|
||||
continue;
|
||||
}
|
||||
// if split state isn't even assigned, this granule has definitely not persisted a snapshot
|
||||
if (st <= BlobGranuleSplitState::Initialized) {
|
||||
retry = true;
|
||||
break;
|
||||
}
|
||||
|
||||
ASSERT(st == BlobGranuleSplitState::Assigned);
|
||||
// if assigned, granule may or may not have snapshotted. Check files to confirm. Since a re-snapshot is
|
||||
// the first file written for a new granule, any files present mean it has re-snapshotted from this
|
||||
// granule
|
||||
KeyRange granuleFileRange = blobGranuleFileKeyRangeFor(child);
|
||||
RangeResult files = wait(tr.getRange(granuleFileRange, 1));
|
||||
if (files.empty()) {
|
||||
retry = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (retry) {
|
||||
tr.reset();
|
||||
wait(delay(1.0));
|
||||
} else {
|
||||
if (splitState.empty() || !splitState.more) {
|
||||
break;
|
||||
}
|
||||
splitRange = KeyRangeRef(keyAfter(splitState.back().key), splitRange.end);
|
||||
}
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
/*
|
||||
* Deletes all files pertaining to the granule with id granuleId and
|
||||
* also removes the history entry for this granule from the system keyspace
|
||||
* TODO: ensure cannot fully delete granule that is still splitting!
|
||||
*/
|
||||
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Key historyKey) {
|
||||
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
|
||||
UID granuleId,
|
||||
Key historyKey,
|
||||
Version purgeVersion) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
|
||||
}
|
||||
|
||||
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
|
||||
// delete the granule, since we need to keep the last snapshot and deltas for splitting
|
||||
wait(canDeleteFullGranule(self, granuleId));
|
||||
|
||||
// get files
|
||||
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
|
||||
|
||||
std::vector<Future<Void>> deletions;
|
||||
std::vector<std::string> filesToDelete; // TODO: remove, just for debugging
|
||||
state std::vector<std::string> filesToDelete; // TODO: remove, just for debugging
|
||||
|
||||
for (auto snapshotFile : files.snapshotFiles) {
|
||||
std::string fname = snapshotFile.filename;
|
||||
|
@ -2191,7 +2261,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granu
|
|||
}
|
||||
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), deletions.size());
|
||||
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
|
||||
for (auto filename : filesToDelete) {
|
||||
fmt::print(" - {}\n", filename.c_str());
|
||||
}
|
||||
|
@ -2228,18 +2298,27 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granu
|
|||
fmt::print("Fully deleting granule {0}: success\n", granuleId.toString());
|
||||
}
|
||||
|
||||
TraceEvent("GranuleFullPurge", self->id)
|
||||
.detail("Epoch", self->epoch)
|
||||
.detail("GranuleID", granuleId)
|
||||
.detail("PurgeVersion", purgeVersion)
|
||||
.detail("FilesPurged", filesToDelete.size());
|
||||
|
||||
++self->stats.granulesFullyPurged;
|
||||
self->stats.filesPurged += filesToDelete.size();
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
/*
|
||||
* For the granule with id granuleId, finds the first snapshot file at a
|
||||
* version <= pruneVersion and deletes all files older than it.
|
||||
* version <= purgeVersion and deletes all files older than it.
|
||||
*
|
||||
* Assumption: this granule's startVersion might change because the first snapshot
|
||||
* file might be deleted. We will need to ensure we don't rely on the granule's startVersion
|
||||
* (that's persisted as part of the key), but rather use the granule's first snapshot's version when needed
|
||||
*/
|
||||
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version pruneVersion) {
|
||||
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version purgeVersion) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
|
||||
}
|
||||
|
@ -2247,7 +2326,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
|
|||
// get files
|
||||
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
|
||||
|
||||
// represents the version of the latest snapshot file in this granule with G.version < pruneVersion
|
||||
// represents the version of the latest snapshot file in this granule with G.version < purgeVersion
|
||||
Version latestSnapshotVersion = invalidVersion;
|
||||
|
||||
state std::vector<Future<Void>> deletions; // deletion work per file
|
||||
|
@ -2262,8 +2341,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
|
|||
deletions.emplace_back(self->bstore->deleteFile(fname));
|
||||
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, files.snapshotFiles[idx].version, 'S'));
|
||||
filesToDelete.emplace_back(fname);
|
||||
} else if (files.snapshotFiles[idx].version <= pruneVersion) {
|
||||
// otherwise if this is the FIRST snapshot file with version < pruneVersion,
|
||||
} else if (files.snapshotFiles[idx].version <= purgeVersion) {
|
||||
// otherwise if this is the FIRST snapshot file with version < purgeVersion,
|
||||
// then we found our latestSnapshotVersion (FIRST since we are traversing in reverse)
|
||||
latestSnapshotVersion = files.snapshotFiles[idx].version;
|
||||
}
|
||||
|
@ -2289,19 +2368,19 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
|
|||
}
|
||||
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), deletions.size());
|
||||
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
|
||||
for (auto filename : filesToDelete) {
|
||||
fmt::print(" - {0}\n", filename);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: the following comment relies on the assumption that BWs will not get requests to
|
||||
// read data that was already pruned. confirm assumption is fine. otherwise, we'd need
|
||||
// to communicate with BWs here and have them ack the pruneVersion
|
||||
// read data that was already purged. confirm assumption is fine. otherwise, we'd need
|
||||
// to communicate with BWs here and have them ack the purgeVersion
|
||||
|
||||
// delete the files before the corresponding metadata.
|
||||
// this could lead to dangling pointers in fdb, but we should never read data older than
|
||||
// pruneVersion anyways, and we can clean up the keys the next time around.
|
||||
// purgeVersion anyways, and we can clean up the keys the next time around.
|
||||
// deleting files before corresponding metadata reduces the # of orphaned files.
|
||||
wait(waitForAll(deletions));
|
||||
|
||||
|
@ -2329,26 +2408,41 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
|
|||
if (BM_DEBUG) {
|
||||
fmt::print("Partially deleting granule {0}: success\n", granuleId.toString());
|
||||
}
|
||||
TraceEvent("GranulePartialPurge", self->id)
|
||||
.detail("Epoch", self->epoch)
|
||||
.detail("GranuleID", granuleId)
|
||||
.detail("PurgeVersion", purgeVersion)
|
||||
.detail("FilesPurged", filesToDelete.size());
|
||||
|
||||
++self->stats.granulesPartiallyPurged;
|
||||
self->stats.filesPurged += filesToDelete.size();
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
/*
|
||||
* This method is used to prune the range [startKey, endKey) at (and including) pruneVersion.
|
||||
* This method is used to purge the range [startKey, endKey) at (and including) purgeVersion.
|
||||
* To do this, we do a BFS traversal starting at the active granules. Then we classify granules
|
||||
* in the history as nodes that can be fully deleted (i.e. their files and history can be deleted)
|
||||
* and nodes that can be partially deleted (i.e. some of their files can be deleted).
|
||||
* Once all this is done, we finally clear the pruneIntent key, if possible, to indicate we are done
|
||||
* processing this prune intent.
|
||||
* Once all this is done, we finally clear the purgeIntent key, if possible, to indicate we are done
|
||||
* processing this purge intent.
|
||||
*/
|
||||
ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range, Version pruneVersion, bool force) {
|
||||
ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range, Version purgeVersion, bool force) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("pruneRange starting for range [{0} - {1}) @ pruneVersion={2}, force={3}\n",
|
||||
fmt::print("purgeRange starting for range [{0} - {1}) @ purgeVersion={2}, force={3}\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
pruneVersion,
|
||||
purgeVersion,
|
||||
force);
|
||||
}
|
||||
|
||||
TraceEvent("PurgeGranulesBegin", self->id)
|
||||
.detail("Epoch", self->epoch)
|
||||
.detail("Range", range)
|
||||
.detail("PurgeVersion", purgeVersion)
|
||||
.detail("Force", force);
|
||||
|
||||
// queue of <range, startVersion, endVersion> for BFS traversal of history
|
||||
state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue;
|
||||
|
||||
|
@ -2371,18 +2465,18 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
|
|||
state KeyRangeMap<UID>::iterator activeRange;
|
||||
for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be pruned\n",
|
||||
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be purged\n",
|
||||
activeRange.begin().printable(),
|
||||
activeRange.end().printable(),
|
||||
activeRange.value().toString());
|
||||
}
|
||||
|
||||
// assumption: prune boundaries must respect granule boundaries
|
||||
// assumption: purge boundaries must respect granule boundaries
|
||||
if (activeRange.begin() < range.begin || activeRange.end() > range.end) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: if this is a force prune, then revoke the assignment from the corresponding BW first
|
||||
// TODO: if this is a force purge, then revoke the assignment from the corresponding BW first
|
||||
// so that it doesn't try to interact with the granule (i.e. force it to give up gLock).
|
||||
// we'll need some way to ack that the revoke was successful
|
||||
|
||||
|
@ -2456,17 +2550,17 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
|
|||
}
|
||||
|
||||
// There are three cases this granule can fall into:
// - if the granule's end version is at or before the prune version or this is a force delete,
// - if the granule's end version is at or before the purge version or this is a force delete,
//   this granule should be completely deleted
// - else if the startVersion <= pruneVersion, then G.startVersion < pruneVersion < G.endVersion
// - else if the startVersion <= purgeVersion, then G.startVersion < purgeVersion < G.endVersion
//   and so this granule should be partially deleted
// - otherwise, this granule is active, so don't schedule it for deletion
|
||||
if (force || endVersion <= pruneVersion) {
|
||||
if (force || endVersion <= purgeVersion) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
|
||||
}
|
||||
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey });
|
||||
} else if (startVersion < pruneVersion) {
|
||||
} else if (startVersion < purgeVersion) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
|
||||
}
|
||||
|
@ -2513,70 +2607,79 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
|
|||
// we won't run into any issues with trying to "re-delete" a blob file since deleting
|
||||
// a file that doesn't exist is considered successful
|
||||
|
||||
state std::vector<Future<Void>> partialDeletions;
|
||||
state int i;
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("{0} granules to fully delete\n", toFullyDelete.size());
|
||||
}
|
||||
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
|
||||
UID granuleId;
|
||||
state UID granuleId;
|
||||
Key historyKey;
|
||||
std::tie(granuleId, historyKey) = toFullyDelete[i];
|
||||
// FIXME: consider batching into a single txn (need to take care of txn size limit)
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("About to fully delete granule {0}\n", granuleId.toString());
|
||||
}
|
||||
wait(fullyDeleteGranule(self, granuleId, historyKey));
|
||||
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion));
|
||||
}
|
||||
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("{0} granules to partially delete\n", toPartiallyDelete.size());
|
||||
}
|
||||
std::vector<Future<Void>> partialDeletions;
|
||||
|
||||
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
|
||||
UID granuleId = toPartiallyDelete[i];
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("About to partially delete granule {0}\n", granuleId.toString());
|
||||
}
|
||||
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, pruneVersion));
|
||||
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion));
|
||||
}
|
||||
|
||||
wait(waitForAll(partialDeletions));
|
||||
|
||||
// Now that all the necessary granules and their files have been deleted, we can
// clear the pruneIntent key to signify that the work is done. However, there could have been
// another pruneIntent that got written for this table while we were processing this one.
// clear the purgeIntent key to signify that the work is done. However, there could have been
// another purgeIntent that got written for this table while we were processing this one.
// If that is the case, we should not clear the key. Otherwise, we can just clear the key.
|
||||
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Successfully pruned range [{0} - {1}) at pruneVersion={2}\n",
|
||||
fmt::print("Successfully purged range [{0} - {1}) at purgeVersion={2}\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
pruneVersion);
|
||||
purgeVersion);
|
||||
}
|
||||
|
||||
TraceEvent("PurgeGranulesComplete", self->id)
|
||||
.detail("Epoch", self->epoch)
|
||||
.detail("Range", range)
|
||||
.detail("PurgeVersion", purgeVersion)
|
||||
.detail("Force", force);
|
||||
|
||||
++self->stats.purgesProcessed;
|
||||
return Void();
|
||||
}
|
||||
|
||||
/*
 * This monitor watches for changes to a key K that gets updated whenever there is a new prune intent.
 * On this change, we scan through all blobGranulePruneKeys (which look like <startKey, endKey>=<prune_version,
 * force>) and prune any intents.
 * This monitor watches for changes to a key K that gets updated whenever there is a new purge intent.
 * On this change, we scan through all blobGranulePurgeKeys (which look like <startKey, endKey>=<purge_version,
 * force>) and purge any intents.
 *
 * Once the prune has succeeded, we clear the key IF the version is still the same one that was pruned.
 * That way, if another prune intent arrived for the same range while we were working on an older one,
 * Once the purge has succeeded, we clear the key IF the version is still the same one that was purged.
 * That way, if another purge intent arrived for the same range while we were working on an older one,
 * we wouldn't end up clearing the intent.
 *
 * When watching for changes, we might end up in scenarios where we failed to do the work
 * for a prune intent even though the watch was triggered (maybe the BM had a blip). This is problematic
 * if the intent is a force and there isn't another prune intent for quite some time. To remedy this,
 * if we don't see a watch change in X (configurable) seconds, we will just sweep through the prune intents,
 * for a purge intent even though the watch was triggered (maybe the BM had a blip). This is problematic
 * if the intent is a force and there isn't another purge intent for quite some time. To remedy this,
 * if we don't see a watch change in X (configurable) seconds, we will just sweep through the purge intents,
 * consolidating any work we might have missed before.
 *
 * Note: we could potentially use a changefeed here to get the exact pruneIntent that was added
 * Note: we could potentially use a changefeed here to get the exact purgeIntent that was added
 * rather than iterating through all of them, but this might have too much overhead for latency
 * improvements we don't really need here (also we need to go over all prune intents anyways in the
 * case that the timer is up before any new prune intents arrive).
 * improvements we don't really need here (also we need to go over all purge intents anyways in the
 * case that the timer is up before any new purge intents arrive).
 */
ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
self->initBStore();
|
||||
|
||||
loop {
|
||||
|
@ -2585,35 +2688,35 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
|
|||
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
// Wait for the watch to change, or some time to expire (whichever comes first)
// before checking through the prune intents. We write a UID into the change key value
// before checking through the purge intents. We write a UID into the change key value
// so that we can still recognize when the watch key has been changed while we weren't
// monitoring it
|
||||
|
||||
state Key lastPruneKey = blobGranulePruneKeys.begin;
|
||||
state Key lastPurgeKey = blobGranulePurgeKeys.begin;
|
||||
|
||||
loop {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
state std::vector<Future<Void>> prunes;
|
||||
state CoalescedKeyRangeMap<std::pair<Version, bool>> pruneMap;
|
||||
pruneMap.insert(allKeys, std::make_pair<Version, bool>(0, false));
|
||||
state std::vector<Future<Void>> purges;
|
||||
state CoalescedKeyRangeMap<std::pair<Version, bool>> purgeMap;
|
||||
purgeMap.insert(allKeys, std::make_pair<Version, bool>(0, false));
|
||||
try {
|
||||
// TODO: replace 10000 with a knob
|
||||
state RangeResult pruneIntents = wait(tr->getRange(blobGranulePruneKeys, BUGGIFY ? 1 : 10000));
|
||||
if (pruneIntents.size()) {
|
||||
state RangeResult purgeIntents = wait(tr->getRange(blobGranulePurgeKeys, BUGGIFY ? 1 : 10000));
|
||||
if (purgeIntents.size()) {
|
||||
int rangeIdx = 0;
|
||||
for (; rangeIdx < pruneIntents.size(); ++rangeIdx) {
|
||||
Version pruneVersion;
|
||||
for (; rangeIdx < purgeIntents.size(); ++rangeIdx) {
|
||||
Version purgeVersion;
|
||||
KeyRange range;
|
||||
bool force;
|
||||
std::tie(pruneVersion, range, force) =
|
||||
decodeBlobGranulePruneValue(pruneIntents[rangeIdx].value);
|
||||
auto ranges = pruneMap.intersectingRanges(range);
|
||||
std::tie(purgeVersion, range, force) =
|
||||
decodeBlobGranulePurgeValue(purgeIntents[rangeIdx].value);
|
||||
auto ranges = purgeMap.intersectingRanges(range);
|
||||
bool foundConflict = false;
|
||||
for (auto it : ranges) {
|
||||
if ((it.value().second && !force && it.value().first < pruneVersion) ||
|
||||
(!it.value().second && force && pruneVersion < it.value().first)) {
|
||||
if ((it.value().second && !force && it.value().first < purgeVersion) ||
|
||||
(!it.value().second && force && purgeVersion < it.value().first)) {
|
||||
foundConflict = true;
|
||||
break;
|
||||
}
|
||||
|
@ -2621,39 +2724,41 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
|
|||
if (foundConflict) {
|
||||
break;
|
||||
}
|
||||
pruneMap.insert(range, std::make_pair(pruneVersion, force));
|
||||
purgeMap.insert(range, std::make_pair(purgeVersion, force));
|
||||
|
||||
fmt::print("about to prune range [{0} - {1}) @ {2}, force={3}\n",
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("about to purge range [{0} - {1}) @ {2}, force={3}\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
pruneVersion,
|
||||
purgeVersion,
|
||||
force ? "T" : "F");
|
||||
}
|
||||
lastPruneKey = pruneIntents[rangeIdx - 1].key;
|
||||
}
|
||||
lastPurgeKey = purgeIntents[rangeIdx - 1].key;
|
||||
|
||||
for (auto it : pruneMap.ranges()) {
|
||||
for (auto it : purgeMap.ranges()) {
|
||||
if (it.value().first > 0) {
|
||||
prunes.emplace_back(pruneRange(self, it.range(), it.value().first, it.value().second));
|
||||
purges.emplace_back(purgeRange(self, it.range(), it.value().first, it.value().second));
|
||||
}
|
||||
}
|
||||
|
||||
// wait for this set of prunes to complete before starting the next ones since if we
// prune a range R at version V and while we are doing that, the time expires, we will
// end up trying to prune the same range again since the work isn't finished and the
// prunes will race
// wait for this set of purges to complete before starting the next ones since if we
// purge a range R at version V and while we are doing that, the time expires, we will
// end up trying to purge the same range again since the work isn't finished and the
// purges will race
//
// TODO: this isn't that efficient though. Instead we could keep metadata as part of the
// BM's memory that tracks which prunes are active. Once done, we can mark that work as
// done. If the BM fails then all prunes will fail and so the next BM will have a clear
// BM's memory that tracks which purges are active. Once done, we can mark that work as
// done. If the BM fails then all purges will fail and so the next BM will have a clear
// set of metadata (i.e. no work in progress) so we will end up doing the work in the
// new BM
|
||||
|
||||
wait(waitForAll(prunes));
|
||||
wait(waitForAll(purges));
|
||||
break;
|
||||
} else {
|
||||
state Future<Void> watchPruneIntentsChange = tr->watch(blobGranulePruneChangeKey);
|
||||
state Future<Void> watchPurgeIntentsChange = tr->watch(blobGranulePurgeChangeKey);
|
||||
wait(tr->commit());
|
||||
wait(watchPruneIntentsChange);
|
||||
wait(watchPurgeIntentsChange);
|
||||
tr->reset();
|
||||
}
|
||||
} catch (Error& e) {
|
||||
|
@ -2666,7 +2771,7 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
|
|||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
tr->clear(KeyRangeRef(blobGranulePruneKeys.begin, keyAfter(lastPruneKey)));
|
||||
tr->clear(KeyRangeRef(blobGranulePurgeKeys.begin, keyAfter(lastPurgeKey)));
|
||||
wait(tr->commit());
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
|
@ -2675,7 +2780,7 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
|
|||
}
|
||||
|
||||
if (BM_DEBUG) {
|
||||
printf("Done pruning current set of prune intents.\n");
|
||||
printf("Done clearing current set of purge intents.\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2876,7 +2981,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
|
|||
|
||||
self->addActor.send(doLockChecks(self));
|
||||
self->addActor.send(monitorClientRanges(self));
|
||||
self->addActor.send(monitorPruneKeys(self));
|
||||
self->addActor.send(monitorPurgeKeys(self));
|
||||
if (SERVER_KNOBS->BG_CONSISTENCY_CHECK_ENABLED) {
|
||||
self->addActor.send(bgConsistencyCheck(self));
|
||||
}
|
||||
|
|
|
@ -86,6 +86,7 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
|
|||
NotifiedVersion durableSnapshotVersion; // same as delta vars, except for snapshots
|
||||
Version pendingSnapshotVersion = 0;
|
||||
Version initialSnapshotVersion = invalidVersion;
|
||||
Version historyVersion = invalidVersion;
|
||||
Version knownCommittedVersion;
|
||||
|
||||
int64_t originalEpoch;
|
||||
|
@ -756,7 +757,11 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
|
|||
bytesRead);
|
||||
}
|
||||
state Error err = e;
|
||||
if (e.code() == error_code_server_overloaded) {
|
||||
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
|
||||
} else {
|
||||
wait(tr->onError(e));
|
||||
}
|
||||
retries++;
|
||||
TEST(true); // Granule initial snapshot failed
|
||||
// FIXME: why can't we suppress error event?
|
||||
|
@ -935,13 +940,8 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
|
|||
break;
|
||||
}
|
||||
|
||||
bwData->currentManagerStatusStream.get().send(GranuleStatusReply(metadata->keyRange,
|
||||
true,
|
||||
writeHot,
|
||||
statusEpoch,
|
||||
statusSeqno,
|
||||
granuleID,
|
||||
metadata->initialSnapshotVersion));
|
||||
bwData->currentManagerStatusStream.get().send(GranuleStatusReply(
|
||||
metadata->keyRange, true, writeHot, statusEpoch, statusSeqno, granuleID, metadata->historyVersion));
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_operation_cancelled) {
|
||||
|
@ -1037,10 +1037,14 @@ static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
|
|||
// if we get an i/o error updating files, or a rollback, reassign the granule to ourselves and start fresh
|
||||
static bool granuleCanRetry(const Error& e) {
|
||||
switch (e.code()) {
|
||||
case error_code_please_reboot:
|
||||
case error_code_io_error:
|
||||
case error_code_io_timeout:
|
||||
// FIXME: handle connection errors in tighter retry loop around individual files.
|
||||
// FIXME: if these requests fail at a high enough rate, the whole worker should be marked as unhealthy and its
|
||||
// granules should be moved away, as there may be some problem with this host contacting blob storage
|
||||
case error_code_http_request_failed:
|
||||
case error_code_connection_failed:
|
||||
case error_code_lookup_failed: // dns
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
@ -1119,10 +1123,15 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
}
|
||||
metadata->pendingDeltaVersion = cfRollbackVersion;
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("[{0} - {1}) rollback discarding all {2} in-memory mutations\n",
|
||||
fmt::print("[{0} - {1}) rollback discarding all {2} in-memory mutations",
|
||||
metadata->keyRange.begin.printable(),
|
||||
metadata->keyRange.end.printable(),
|
||||
metadata->currentDeltas.size());
|
||||
if (metadata->currentDeltas.size()) {
|
||||
fmt::print(
|
||||
" {0} - {1}", metadata->currentDeltas.front().version, metadata->currentDeltas.back().version);
|
||||
}
|
||||
fmt::print("\n");
|
||||
}
|
||||
|
||||
// discard all in-memory mutations
|
||||
|
@ -1150,6 +1159,8 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
|
||||
// FIXME: could binary search?
|
||||
int mIdx = metadata->currentDeltas.size() - 1;
|
||||
Version firstDiscarded = invalidVersion;
|
||||
Version lastDiscarded = invalidVersion;
|
||||
while (mIdx >= 0) {
|
||||
if (metadata->currentDeltas[mIdx].version <= rollbackVersion) {
|
||||
break;
|
||||
|
@ -1157,19 +1168,37 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
for (auto& m : metadata->currentDeltas[mIdx].mutations) {
|
||||
metadata->bufferedDeltaBytes -= m.totalSize();
|
||||
}
|
||||
if (firstDiscarded == invalidVersion) {
|
||||
firstDiscarded = metadata->currentDeltas[mIdx].version;
|
||||
}
|
||||
lastDiscarded = metadata->currentDeltas[mIdx].version;
|
||||
mIdx--;
|
||||
}
|
||||
mIdx++;
|
||||
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("[{0} - {1}) rollback discarding {2} in-memory mutations, {3} mutations and {4} bytes left\n",
|
||||
fmt::print("[{0} - {1}) rollback discarding {2} in-memory mutations",
|
||||
metadata->keyRange.begin.printable(),
|
||||
metadata->keyRange.end.printable(),
|
||||
metadata->currentDeltas.size() - mIdx,
|
||||
mIdx,
|
||||
metadata->bufferedDeltaBytes);
|
||||
metadata->currentDeltas.size() - mIdx - 1);
|
||||
|
||||
if (firstDiscarded != invalidVersion) {
|
||||
fmt::print(" {0} - {1}", lastDiscarded, firstDiscarded);
|
||||
}
|
||||
|
||||
metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx);
|
||||
fmt::print(", {0} mutations", mIdx);
|
||||
if (mIdx >= 0) {
|
||||
fmt::print(
|
||||
" ({0} - {1})", metadata->currentDeltas.front().version, metadata->currentDeltas[mIdx].version);
|
||||
}
|
||||
fmt::print(" and {0} bytes left\n", metadata->bufferedDeltaBytes);
|
||||
}
|
||||
|
||||
if (mIdx < 0) {
|
||||
metadata->currentDeltas = Standalone<GranuleDeltas>();
|
||||
metadata->bufferedDeltaBytes = 0;
|
||||
} else {
|
||||
metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx + 1);
|
||||
}
|
||||
|
||||
// delete all deltas in rollback range, but we can optimize here to just skip the uncommitted mutations
|
||||
// directly and immediately pop the rollback out of inProgress to completed
|
||||
|
@ -1328,6 +1357,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
metadata->pendingSnapshotVersion = metadata->files.snapshotFiles.back().version;
|
||||
metadata->durableSnapshotVersion.set(metadata->pendingSnapshotVersion);
|
||||
metadata->initialSnapshotVersion = metadata->files.snapshotFiles.front().version;
|
||||
metadata->historyVersion = startState.history.get().version;
|
||||
} else {
|
||||
if (startState.blobFilesToSnapshot.present()) {
|
||||
startVersion = startState.previousDurableVersion;
|
||||
|
@ -1350,6 +1380,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
metadata->initialSnapshotVersion = startVersion;
|
||||
metadata->pendingSnapshotVersion = startVersion;
|
||||
metadata->historyVersion = startState.history.present() ? startState.history.get().version : startVersion;
|
||||
}
|
||||
|
||||
metadata->durableDeltaVersion.set(startVersion);
|
||||
|
@ -1459,8 +1490,16 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
ASSERT(mutations.front().version > metadata->bufferedDeltaVersion);
|
||||
|
||||
// If this assert trips we should have gotten change_feed_popped from SS and didn't
|
||||
ASSERT(mutations.front().version >= metadata->activeCFData.get()->popVersion);
|
||||
// Rare race from merge cursor where no individual server detected popped in their response
|
||||
if (mutations.front().version < metadata->activeCFData.get()->popVersion) {
|
||||
TEST(true); // Blob Worker detected popped instead of change feed
|
||||
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
|
||||
.detail("Granule", metadata->keyRange)
|
||||
.detail("GranuleID", startState.granuleID)
|
||||
.detail("MutationVersion", mutations.front().version)
|
||||
.detail("PopVersion", metadata->activeCFData.get()->popVersion);
|
||||
throw change_feed_popped();
|
||||
}
|
||||
}
|
||||
when(wait(inFlightFiles.empty() ? Never() : success(inFlightFiles.front().future))) {}
|
||||
}
|
||||
|
@ -1623,6 +1662,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
metadata->activeCFData.set(cfData);
|
||||
|
||||
justDidRollback = true;
|
||||
lastDeltaVersion = cfRollbackVersion;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1841,6 +1881,12 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
}
|
||||
} catch (Error& e) {
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("Granule file updater for [{0} - {1}) got error {2}, exiting\n",
|
||||
metadata->keyRange.begin.printable(),
|
||||
metadata->keyRange.end.printable(),
|
||||
e.name());
|
||||
}
|
||||
// Free last change feed data
|
||||
metadata->activeCFData.set(Reference<ChangeFeedData>());
|
||||
|
||||
|
@ -1871,12 +1917,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
return Void();
|
||||
}
|
||||
++bwData->stats.granuleUpdateErrors;
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("Granule file updater for [{0} - {1}) got error {2}, exiting\n",
|
||||
metadata->keyRange.begin.printable(),
|
||||
metadata->keyRange.end.printable(),
|
||||
e.name());
|
||||
}
|
||||
|
||||
if (granuleCanRetry(e)) {
|
||||
TEST(true); // Granule close and re-open on error
|
||||
|
@ -2002,6 +2042,14 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
|||
int skipped = historyEntryStack.size() - 1 - i;
|
||||
|
||||
while (i >= 0) {
|
||||
auto intersectingRanges = bwData->granuleHistory.intersectingRanges(historyEntryStack[i]->range);
|
||||
std::vector<std::pair<KeyRange, Reference<GranuleHistoryEntry>>> newerHistory;
|
||||
for (auto& r : intersectingRanges) {
|
||||
if (r.value().isValid() && r.value()->endVersion >= historyEntryStack[i]->endVersion) {
|
||||
newerHistory.push_back(std::make_pair(r.range(), r.value()));
|
||||
}
|
||||
}
|
||||
|
||||
auto prevRanges = bwData->granuleHistory.rangeContaining(historyEntryStack[i]->range.begin);
|
||||
|
||||
if (prevRanges.value().isValid() &&
|
||||
|
@ -2012,6 +2060,9 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
|
||||
bwData->granuleHistory.insert(historyEntryStack[i]->range, historyEntryStack[i]);
|
||||
for (auto& it : newerHistory) {
|
||||
bwData->granuleHistory.insert(it.first, it.second);
|
||||
}
|
||||
i--;
|
||||
}
|
||||
|
||||
|
@ -2137,7 +2188,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
if (req.beginVersion > 0) {
|
||||
fmt::print("{0} - {1}\n", req.beginVersion, req.readVersion);
|
||||
} else {
|
||||
fmt::print("{}", req.readVersion);
|
||||
fmt::print("{}\n", req.readVersion);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2210,7 +2261,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
state KeyRange chunkRange;
|
||||
state GranuleFiles chunkFiles;
|
||||
|
||||
if (metadata->initialSnapshotVersion > req.readVersion) {
|
||||
if (req.readVersion < metadata->historyVersion) {
|
||||
TEST(true); // Granule Time Travel Read
|
||||
// this is a time travel query, find previous granule
|
||||
if (metadata->historyLoaded.canBeSet()) {
|
||||
|
@ -2226,7 +2277,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
Reference<GranuleHistoryEntry> cur = bwData->granuleHistory.rangeContaining(historySearchKey).value();
|
||||
|
||||
// FIXME: use skip pointers here
|
||||
Version expectedEndVersion = metadata->initialSnapshotVersion;
|
||||
Version expectedEndVersion = metadata->historyVersion;
|
||||
if (cur.isValid()) {
|
||||
ASSERT(cur->endVersion == expectedEndVersion);
|
||||
}
|
||||
|
@ -2269,17 +2320,22 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
}
|
||||
|
||||
if (chunkFiles.snapshotFiles.empty()) {
|
||||
// a snapshot file must have been pruned
|
||||
// a snapshot file must have been purged
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
|
||||
ASSERT(!chunkFiles.deltaFiles.empty());
|
||||
ASSERT(chunkFiles.deltaFiles.back().version > req.readVersion);
|
||||
if (chunkFiles.snapshotFiles.front().version > req.readVersion) {
|
||||
// a snapshot file must have been pruned
|
||||
// a snapshot file must have been purged
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
} else {
|
||||
if (req.readVersion < metadata->initialSnapshotVersion) {
|
||||
// a snapshot file must have been pruned
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
|
||||
TEST(true); // Granule Active Read
|
||||
// this is an active granule query
|
||||
loop {
|
||||
|
@ -2287,7 +2343,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
throw wrong_shard_server();
|
||||
}
|
||||
Future<Void> waitForVersionFuture = waitForVersion(metadata, req.readVersion);
|
||||
if (waitForVersionFuture.isReady()) {
|
||||
if (waitForVersionFuture.isReady() && !waitForVersionFuture.isError()) {
|
||||
// didn't wait, so no need to check rollback stuff
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1099,21 +1099,16 @@ void haltRegisteringOrCurrentSingleton(ClusterControllerData* self,
|
|||
|
||||
void registerWorker(RegisterWorkerRequest req,
|
||||
ClusterControllerData* self,
|
||||
ServerCoordinators coordinators,
|
||||
std::unordered_set<NetworkAddress> coordinatorAddresses,
|
||||
ConfigBroadcaster* configBroadcaster) {
|
||||
const WorkerInterface& w = req.wi;
|
||||
ProcessClass newProcessClass = req.processClass;
|
||||
auto info = self->id_worker.find(w.locality.processId());
|
||||
ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo;
|
||||
newPriorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
|
||||
Optional<ConfigFollowerInterface> cfi;
|
||||
bool isCoordinator =
|
||||
std::find_if(coordinators.configServers.begin(),
|
||||
coordinators.configServers.end(),
|
||||
[&req](const ConfigFollowerInterface& cfi) {
|
||||
return cfi.address() == req.wi.address() || (req.wi.secondaryAddress().present() &&
|
||||
cfi.address() == req.wi.secondaryAddress().get());
|
||||
}) != coordinators.configServers.end();
|
||||
(coordinatorAddresses.count(req.wi.address()) > 0) ||
|
||||
(req.wi.secondaryAddress().present() && coordinatorAddresses.count(req.wi.secondaryAddress().get()) > 0);
|
||||
|
||||
for (auto it : req.incompatiblePeers) {
|
||||
self->db.incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
|
||||
|
@ -2547,10 +2542,30 @@ ACTOR Future<Void> clusterControllerCore(Reference<IClusterConnectionRecord> con
|
|||
when(RecruitBlobWorkerRequest req = waitNext(interf.recruitBlobWorker.getFuture())) {
|
||||
clusterRecruitBlobWorker(&self, req);
|
||||
}
|
||||
when(RegisterWorkerRequest req = waitNext(interf.registerWorker.getFuture())) {
|
||||
when(state RegisterWorkerRequest req = waitNext(interf.registerWorker.getFuture())) {
|
||||
++self.registerWorkerRequests;
|
||||
registerWorker(
|
||||
req, &self, coordinators, (configDBType == ConfigDBType::DISABLED) ? nullptr : &configBroadcaster);
|
||||
state ClusterConnectionString ccs = coordinators.ccr->getConnectionString();
|
||||
|
||||
state std::unordered_set<NetworkAddress> coordinatorAddresses;
|
||||
std::vector<Future<Void>> fs;
|
||||
for (auto& hostname : ccs.hostnames) {
|
||||
fs.push_back(map(hostname.resolve(), [&](Optional<NetworkAddress> const& addr) -> Void {
|
||||
if (addr.present()) {
|
||||
coordinatorAddresses.insert(addr.get());
|
||||
}
|
||||
return Void();
|
||||
}));
|
||||
}
|
||||
wait(waitForAll(fs));
|
||||
|
||||
for (const auto& coord : ccs.coordinators()) {
|
||||
coordinatorAddresses.insert(coord);
|
||||
}
|
||||
|
||||
registerWorker(req,
|
||||
&self,
|
||||
coordinatorAddresses,
|
||||
(configDBType == ConfigDBType::DISABLED) ? nullptr : &configBroadcaster);
|
||||
}
|
||||
when(GetWorkersRequest req = waitNext(interf.getWorkers.getFuture())) {
|
||||
++self.getWorkersRequests;
|
||||
|
|
|
@ -226,6 +226,7 @@ ACTOR Future<Void> newResolvers(Reference<ClusterRecoveryData> self, RecruitFrom
|
|||
std::vector<Future<ResolverInterface>> initializationReplies;
|
||||
for (int i = 0; i < recr.resolvers.size(); i++) {
|
||||
InitializeResolverRequest req;
|
||||
req.masterLifetime = self->masterLifetime;
|
||||
req.recoveryCount = self->cstate.myDBState.recoveryCount + 1;
|
||||
req.commitProxyCount = recr.commitProxies.size();
|
||||
req.resolverCount = recr.resolvers.size();
|
||||
|
@ -342,6 +343,7 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
|
|||
isr.reqId = deterministicRandom()->randomUniqueID();
|
||||
isr.interfaceId = deterministicRandom()->randomUniqueID();
|
||||
isr.clusterId = self->clusterId;
|
||||
isr.initialClusterVersion = self->recoveryTransactionVersion;
|
||||
|
||||
ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));
|
||||
|
||||
|
@ -989,8 +991,12 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
|
|||
newTLogServers(self, recruits, oldLogSystem, &confChanges));
|
||||
|
||||
// Update recovery related information to the newly elected sequencer (master) process.
|
||||
wait(brokenPromiseToNever(self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(
|
||||
self->recoveryTransactionVersion, self->lastEpochEnd, self->commitProxies, self->resolvers))));
|
||||
wait(brokenPromiseToNever(
|
||||
self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(self->recoveryTransactionVersion,
|
||||
self->lastEpochEnd,
|
||||
self->commitProxies,
|
||||
self->resolvers,
|
||||
self->versionEpoch))));
|
||||
|
||||
return confChanges;
|
||||
}
|
||||
|
@ -1004,6 +1010,12 @@ ACTOR Future<Void> updateLocalityForDcId(Optional<Key> dcId,
|
|||
if (ver == invalidVersion) {
|
||||
ver = oldLogSystem->getKnownCommittedVersion();
|
||||
}
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
// Do not try to split peeks between data centers in peekTxns() to recover mem kvstore.
|
||||
// This recovery optimization won't work in UNICAST mode.
|
||||
loc.first = -1;
|
||||
}
|
||||
|
||||
locality->set(PeekTxsInfo(loc.first, loc.second, ver));
|
||||
TraceEvent("UpdatedLocalityForDcId")
|
||||
.detail("DcId", dcId)
|
||||
|
@ -1036,6 +1048,14 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
|
|||
self->txnStateStore =
|
||||
keyValueStoreLogSystem(self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true);
|
||||
|
||||
// Version 0 occurs at the version epoch. The version epoch is the number
|
||||
// of microseconds since the Unix epoch. It can be set through fdbcli.
|
||||
self->versionEpoch.reset();
|
||||
Optional<Standalone<StringRef>> versionEpochValue = wait(self->txnStateStore->readValue(versionEpochKey));
|
||||
if (versionEpochValue.present()) {
|
||||
self->versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
|
||||
}
|
||||
|
||||
// Versionstamped operations (particularly those applied from DR) define a minimum commit version
|
||||
// that we may recover to, as they embed the version in user-readable data and require that no
|
||||
// transactions will be committed at a lower version.
|
||||
|
@ -1046,6 +1066,11 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
|
|||
if (requiredCommitVersion.present()) {
|
||||
minRequiredCommitVersion = BinaryReader::fromStringRef<Version>(requiredCommitVersion.get(), Unversioned());
|
||||
}
|
||||
if (g_network->isSimulated() && self->versionEpoch.present()) {
|
||||
minRequiredCommitVersion = std::max(
|
||||
minRequiredCommitVersion,
|
||||
static_cast<Version>(g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->versionEpoch.get()));
|
||||
}
|
||||
|
||||
// Recover version info
|
||||
self->lastEpochEnd = oldLogSystem->getEnd() - 1;
|
||||
|
@ -1058,14 +1083,14 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
|
|||
self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
|
||||
}
|
||||
|
||||
if (BUGGIFY) {
|
||||
self->recoveryTransactionVersion +=
|
||||
deterministicRandom()->randomInt64(0, SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT);
|
||||
}
|
||||
if (self->recoveryTransactionVersion < minRequiredCommitVersion)
|
||||
self->recoveryTransactionVersion = minRequiredCommitVersion;
|
||||
}
|
||||
|
||||
if (BUGGIFY) {
|
||||
self->recoveryTransactionVersion += deterministicRandom()->randomInt64(0, 10000000);
|
||||
}
|
||||
|
||||
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME).c_str(),
|
||||
self->dbgid)
|
||||
.detail("LastEpochEnd", self->lastEpochEnd)
|
||||
|
@ -1146,7 +1171,12 @@ ACTOR Future<Void> sendInitialCommitToResolvers(Reference<ClusterRecoveryData> s
|
|||
for (auto& it : self->commitProxies) {
|
||||
endpoints.push_back(it.txnState.getEndpoint());
|
||||
}
|
||||
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
// Broadcasts transaction state store to resolvers.
|
||||
for (auto& it : self->resolvers) {
|
||||
endpoints.push_back(it.txnState.getEndpoint());
|
||||
}
|
||||
}
|
||||
loop {
|
||||
if (!data.size())
|
||||
break;
|
||||
|
|
|
@ -169,6 +169,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
|
|||
AsyncTrigger registrationTrigger;
|
||||
Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery
|
||||
recoveryTransactionVersion; // The first version in this epoch
|
||||
Optional<int64_t> versionEpoch; // The epoch which all versions are based off of
|
||||
double lastCommitTime;
|
||||
|
||||
Version liveCommittedVersion; // The largest live committed version reported by commit proxies.
|
||||
|
@ -209,6 +210,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
|
|||
std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;
|
||||
|
||||
UID clusterId;
|
||||
Version initialClusterVersion = -1;
|
||||
Standalone<StringRef> dbId;
|
||||
|
||||
MasterInterface masterInterface;
|
||||
|
|
|
@ -21,8 +21,9 @@
|
|||
#include <algorithm>
|
||||
#include <tuple>
|
||||
|
||||
#include <fdbclient/DatabaseContext.h>
|
||||
#include "fdbclient/Atomic.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/DatabaseContext.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/Knobs.h"
|
||||
#include "fdbclient/CommitProxyInterface.h"
|
||||
|
@ -47,6 +48,7 @@
|
|||
#include "fdbserver/WaitFailure.h"
|
||||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Knobs.h"
|
||||
#include "flow/Trace.h"
|
||||
|
@ -86,14 +88,19 @@ ACTOR void discardCommit(UID id, Future<LogSystemDiskQueueAdapter::CommitMessage
|
|||
}
|
||||
|
||||
struct ResolutionRequestBuilder {
|
||||
ProxyCommitData* self;
|
||||
const ProxyCommitData* self;
|
||||
|
||||
// One request per resolver.
|
||||
std::vector<ResolveTransactionBatchRequest> requests;
|
||||
|
||||
// Txn i to resolvers that have i'th data sent
|
||||
std::vector<std::vector<int>> transactionResolverMap;
|
||||
std::vector<CommitTransactionRef*> outTr;
|
||||
std::vector<std::vector<std::vector<int>>>
|
||||
txReadConflictRangeIndexMap; // Used to report conflicting keys, the format is
|
||||
|
||||
// Used to report conflicting keys, the format is
|
||||
// [CommitTransactionRef_Index][Resolver_Index][Read_Conflict_Range_Index_on_Resolver]
|
||||
// -> read_conflict_range's original index in the commitTransactionRef
|
||||
std::vector<std::vector<std::vector<int>>> txReadConflictRangeIndexMap;
|
||||
|
||||
ResolutionRequestBuilder(ProxyCommitData* self,
|
||||
Version version,
|
||||
|
@ -120,35 +127,10 @@ struct ResolutionRequestBuilder {
|
|||
return *out;
|
||||
}
|
||||
|
||||
void addTransaction(CommitTransactionRequest& trRequest, int transactionNumberInBatch) {
|
||||
auto& trIn = trRequest.transaction;
|
||||
// SOMEDAY: There are a couple of unnecessary O( # resolvers ) steps here
|
||||
outTr.assign(requests.size(), nullptr);
|
||||
ASSERT(transactionNumberInBatch >= 0 && transactionNumberInBatch < 32768);
|
||||
|
||||
bool isTXNStateTransaction = false;
|
||||
for (auto& m : trIn.mutations) {
|
||||
if (m.type == MutationRef::SetVersionstampedKey) {
|
||||
transformVersionstampMutation(m, &MutationRef::param1, requests[0].version, transactionNumberInBatch);
|
||||
trIn.write_conflict_ranges.push_back(requests[0].arena, singleKeyRange(m.param1, requests[0].arena));
|
||||
} else if (m.type == MutationRef::SetVersionstampedValue) {
|
||||
transformVersionstampMutation(m, &MutationRef::param2, requests[0].version, transactionNumberInBatch);
|
||||
}
|
||||
if (isMetadataMutation(m)) {
|
||||
isTXNStateTransaction = true;
|
||||
getOutTransaction(0, trIn.read_snapshot).mutations.push_back(requests[0].arena, m);
|
||||
}
|
||||
}
|
||||
if (isTXNStateTransaction && !trRequest.isLockAware()) {
|
||||
// This mitigates https://github.com/apple/foundationdb/issues/3647. Since this transaction is not lock
|
||||
// aware, if this transaction got a read version then \xff/dbLocked must not have been set at this
|
||||
// transaction's read snapshot. If that changes by commit time, then it won't commit on any proxy because of
|
||||
// a conflict. A client could set a read version manually so this isn't totally bulletproof.
|
||||
trIn.read_conflict_ranges.push_back(trRequest.arena, KeyRangeRef(databaseLockedKey, databaseLockedKeyEnd));
|
||||
}
|
||||
std::vector<std::vector<int>> rCRIndexMap(
|
||||
requests.size()); // [resolver_index][read_conflict_range_index_on_the_resolver]
|
||||
// Returns a read conflict index map: [resolver_index][read_conflict_range_index_on_the_resolver]
|
||||
// -> read_conflict_range's original index
|
||||
std::vector<std::vector<int>> addReadConflictRanges(CommitTransactionRef& trIn) {
|
||||
std::vector<std::vector<int>> rCRIndexMap(requests.size());
|
||||
for (int idx = 0; idx < trIn.read_conflict_ranges.size(); ++idx) {
|
||||
const auto& r = trIn.read_conflict_ranges[idx];
|
||||
auto ranges = self->keyResolvers.intersectingRanges(r);
|
||||
|
@ -161,6 +143,11 @@ struct ResolutionRequestBuilder {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && systemKeys.intersects(r)) {
|
||||
for (int k = 0; k < self->resolvers.size(); k++) {
|
||||
resolvers.insert(k);
|
||||
}
|
||||
}
|
||||
ASSERT(resolvers.size());
|
||||
for (int resolver : resolvers) {
|
||||
getOutTransaction(resolver, trIn.read_snapshot)
|
||||
|
@ -168,23 +155,76 @@ struct ResolutionRequestBuilder {
|
|||
rCRIndexMap[resolver].push_back(idx);
|
||||
}
|
||||
}
|
||||
txReadConflictRangeIndexMap.push_back(std::move(rCRIndexMap));
|
||||
return rCRIndexMap;
|
||||
}
|
||||
|
||||
void addWriteConflictRanges(CommitTransactionRef& trIn) {
|
||||
for (auto& r : trIn.write_conflict_ranges) {
|
||||
auto ranges = self->keyResolvers.intersectingRanges(r);
|
||||
std::set<int> resolvers;
|
||||
for (auto& ir : ranges)
|
||||
resolvers.insert(ir.value().back().second);
|
||||
for (auto& ir : ranges) {
|
||||
auto& version_resolver = ir.value();
|
||||
if (!version_resolver.empty()) {
|
||||
resolvers.insert(version_resolver.back().second);
|
||||
}
|
||||
}
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && systemKeys.intersects(r)) {
|
||||
for (int k = 0; k < self->resolvers.size(); k++) {
|
||||
resolvers.insert(k);
|
||||
}
|
||||
}
|
||||
ASSERT(resolvers.size());
|
||||
for (int resolver : resolvers)
|
||||
getOutTransaction(resolver, trIn.read_snapshot)
|
||||
.write_conflict_ranges.push_back(requests[resolver].arena, r);
|
||||
}
|
||||
if (isTXNStateTransaction)
|
||||
}
|
||||
|
||||
void addTransaction(CommitTransactionRequest& trRequest, Version ver, int transactionNumberInBatch) {
|
||||
auto& trIn = trRequest.transaction;
|
||||
// SOMEDAY: There are a couple of unnecessary O( # resolvers ) steps here
|
||||
outTr.assign(requests.size(), nullptr);
|
||||
ASSERT(transactionNumberInBatch >= 0 && transactionNumberInBatch < 32768);
|
||||
|
||||
bool isTXNStateTransaction = false;
|
||||
for (auto& m : trIn.mutations) {
|
||||
DEBUG_MUTATION("AddTr", ver, m, self->dbgid).detail("Idx", transactionNumberInBatch);
|
||||
if (m.type == MutationRef::SetVersionstampedKey) {
|
||||
transformVersionstampMutation(m, &MutationRef::param1, requests[0].version, transactionNumberInBatch);
|
||||
trIn.write_conflict_ranges.push_back(requests[0].arena, singleKeyRange(m.param1, requests[0].arena));
|
||||
} else if (m.type == MutationRef::SetVersionstampedValue) {
|
||||
transformVersionstampMutation(m, &MutationRef::param2, requests[0].version, transactionNumberInBatch);
|
||||
}
|
||||
if (isMetadataMutation(m)) {
|
||||
isTXNStateTransaction = true;
|
||||
auto& tr = getOutTransaction(0, trIn.read_snapshot);
|
||||
tr.mutations.push_back(requests[0].arena, m);
|
||||
tr.lock_aware = trRequest.isLockAware();
|
||||
}
|
||||
}
|
||||
if (isTXNStateTransaction && !trRequest.isLockAware()) {
// This mitigates https://github.com/apple/foundationdb/issues/3647. Since this transaction is not lock
// aware, if this transaction got a read version then \xff/dbLocked must not have been set at this
// transaction's read snapshot. If that changes by commit time, then it won't commit on any proxy because of
// a conflict. A client could set a read version manually so this isn't totally bulletproof.
trIn.read_conflict_ranges.push_back(trRequest.arena, KeyRangeRef(databaseLockedKey, databaseLockedKeyEnd));
}
|
||||
|
||||
std::vector<std::vector<int>> rCRIndexMap = addReadConflictRanges(trIn);
|
||||
txReadConflictRangeIndexMap.push_back(std::move(rCRIndexMap));
|
||||
|
||||
addWriteConflictRanges(trIn);
|
||||
|
||||
if (isTXNStateTransaction) {
|
||||
for (int r = 0; r < requests.size(); r++) {
|
||||
int transactionNumberInRequest =
|
||||
&getOutTransaction(r, trIn.read_snapshot) - requests[r].transactions.begin();
|
||||
requests[r].txnStateTransactions.push_back(requests[r].arena, transactionNumberInRequest);
|
||||
}
|
||||
// Note only Resolver 0 got the correct spanContext, which means
|
||||
// the reply from Resolver 0 has the right one back.
|
||||
getOutTransaction(0, trIn.read_snapshot).spanContext = trRequest.spanContext;
|
||||
}
|
||||
|
||||
std::vector<int> resolversUsed;
|
||||
for (int r = 0; r < outTr.size(); r++)
|
||||
|
@ -520,14 +560,64 @@ struct CommitBatchContext {
|
|||
|
||||
double commitStartTime;
|
||||
|
||||
std::unordered_map<uint16_t, Version> tpcvMap; // obtained from resolver
|
||||
std::set<Tag> writtenTags; // final set tags written to in the batch
|
||||
std::set<Tag> writtenTagsPreResolution; // tags written to in the batch not including any changes from the resolver.
|
||||
|
||||
CommitBatchContext(ProxyCommitData*, const std::vector<CommitTransactionRequest>*, const int);
|
||||
|
||||
void setupTraceBatch();
|
||||
|
||||
std::set<Tag> getWrittenTagsPreResolution();
|
||||
|
||||
private:
|
||||
void evaluateBatchSize();
|
||||
};
|
||||
|
||||
std::set<Tag> CommitBatchContext::getWrittenTagsPreResolution() {
|
||||
std::set<Tag> transactionTags;
|
||||
std::vector<Tag> cacheVector = { cacheTag };
|
||||
for (int transactionNum = 0; transactionNum < trs.size(); transactionNum++) {
|
||||
int mutationNum = 0;
|
||||
VectorRef<MutationRef>* pMutations = &trs[transactionNum].transaction.mutations;
|
||||
for (; mutationNum < pMutations->size(); mutationNum++) {
|
||||
auto& m = (*pMutations)[mutationNum];
|
||||
if (isSingleKeyMutation((MutationRef::Type)m.type)) {
|
||||
auto& tags = pProxyCommitData->tagsForKey(m.param1);
|
||||
transactionTags.insert(tags.begin(), tags.end());
|
||||
if (pProxyCommitData->cacheInfo[m.param1]) {
|
||||
transactionTags.insert(cacheTag);
|
||||
}
|
||||
} else if (m.type == MutationRef::ClearRange) {
|
||||
KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2));
|
||||
auto ranges = pProxyCommitData->keyInfo.intersectingRanges(clearRange);
|
||||
auto firstRange = ranges.begin();
|
||||
++firstRange;
|
||||
if (firstRange == ranges.end()) {
|
||||
std::set<Tag> filteredTags;
|
||||
ranges.begin().value().populateTags();
|
||||
filteredTags.insert(ranges.begin().value().tags.begin(), ranges.begin().value().tags.end());
|
||||
transactionTags.insert(ranges.begin().value().tags.begin(), ranges.begin().value().tags.end());
|
||||
} else {
|
||||
std::set<Tag> allSources;
|
||||
for (auto r : ranges) {
|
||||
r.value().populateTags();
|
||||
allSources.insert(r.value().tags.begin(), r.value().tags.end());
|
||||
transactionTags.insert(r.value().tags.begin(), r.value().tags.end());
|
||||
}
|
||||
}
|
||||
if (pProxyCommitData->needsCacheTag(clearRange)) {
|
||||
transactionTags.insert(cacheTag);
|
||||
}
|
||||
} else {
|
||||
UNREACHABLE();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return transactionTags;
|
||||
}
|
||||
|
||||
CommitBatchContext::CommitBatchContext(ProxyCommitData* const pProxyCommitData_,
|
||||
const std::vector<CommitTransactionRequest>* trs_,
|
||||
const int currentBatchMemBytesCount)
|
||||
|
@ -656,6 +746,9 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
|
|||
"CommitDebug", debugID.get().first(), "CommitProxyServer.commitBatch.GettingCommitVersion");
|
||||
}
|
||||
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
self->writtenTagsPreResolution = self->getWrittenTagsPreResolution();
|
||||
}
|
||||
GetCommitVersionRequest req(span.context,
|
||||
pProxyCommitData->commitVersionRequestNumber++,
|
||||
pProxyCommitData->mostRecentProcessedRequestNumber,
|
||||
|
@ -703,7 +796,7 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
|
|||
int conflictRangeCount = 0;
|
||||
self->maxTransactionBytes = 0;
|
||||
for (int t = 0; t < trs.size(); t++) {
|
||||
requests.addTransaction(trs[t], t);
|
||||
requests.addTransaction(trs[t], self->commitVersion, t);
|
||||
conflictRangeCount +=
|
||||
trs[t].transaction.read_conflict_ranges.size() + trs[t].transaction.write_conflict_ranges.size();
|
||||
//TraceEvent("MPTransactionDump", self->dbgid).detail("Snapshot", trs[t].transaction.read_snapshot);
|
||||
|
@ -720,6 +813,7 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
|
|||
std::vector<Future<ResolveTransactionBatchReply>> replies;
|
||||
for (int r = 0; r < pProxyCommitData->resolvers.size(); r++) {
|
||||
requests.requests[r].debugID = self->debugID;
|
||||
requests.requests[r].writtenTags = self->writtenTagsPreResolution;
|
||||
replies.push_back(trackResolutionMetrics(pProxyCommitData->stats.resolverDist[r],
|
||||
brokenPromiseToNever(pProxyCommitData->resolvers[r].resolve.getReply(
|
||||
requests.requests[r], TaskPriority::ProxyResolverReply))));
|
||||
|
@ -757,9 +851,11 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
|
|||
}
|
||||
|
||||
void assertResolutionStateMutationsSizeConsistent(const std::vector<ResolveTransactionBatchReply>& resolution) {
|
||||
|
||||
for (int r = 1; r < resolution.size(); r++) {
|
||||
ASSERT(resolution[r].stateMutations.size() == resolution[0].stateMutations.size());
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
ASSERT_EQ(resolution[0].tpcvMap.size(), resolution[r].tpcvMap.size());
|
||||
}
|
||||
for (int s = 0; s < resolution[r].stateMutations.size(); s++) {
|
||||
ASSERT(resolution[r].stateMutations[s].size() == resolution[0].stateMutations[s].size());
|
||||
}
|
||||
|
@ -888,7 +984,7 @@ ACTOR Future<Void> applyMetadataToCommittedTransactions(CommitBatchContext* self
|
|||
self->arena,
|
||||
pProxyCommitData->logSystem,
|
||||
trs[t].transaction.mutations,
|
||||
&self->toCommit,
|
||||
SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? nullptr : &self->toCommit,
|
||||
self->forceRecovery,
|
||||
self->commitVersion,
|
||||
self->commitVersion + 1,
|
||||
|
@ -906,6 +1002,17 @@ ACTOR Future<Void> applyMetadataToCommittedTransactions(CommitBatchContext* self
|
|||
self->committed[t] = ConflictBatch::TransactionConflict;
|
||||
TraceEvent(SevWarn, "RestartingTxnSubsystem", pProxyCommitData->dbgid).detail("Stage", "AwaitCommit");
|
||||
}
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
// Resolver also calculates forceRecovery and only applies metadata mutations
|
||||
// in the same set of transactions as this proxy.
|
||||
ResolveTransactionBatchReply& reply = self->resolution[0];
|
||||
self->toCommit.setMutations(reply.privateMutationCount, reply.privateMutations);
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
// TraceEvent("ResolverReturn").detail("ReturnTags",reply.writtenTags).detail("TPCVsize",reply.tpcvMap.size()).detail("ReqTags",self->writtenTagsPreResolution);
|
||||
self->tpcvMap = reply.tpcvMap;
|
||||
}
|
||||
self->toCommit.addWrittenTags(reply.writtenTags);
|
||||
}
|
||||
|
||||
self->lockedKey = pProxyCommitData->txnStateStore->readValue(databaseLockedKey).get();
|
||||
self->lockedAfter = self->lockedKey.present() && self->lockedKey.get().size();
|
||||
|
@ -1012,7 +1119,6 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
|
|||
// Fast path
|
||||
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid)
|
||||
.detail("To", ranges.begin().value().tags);
|
||||
|
||||
ranges.begin().value().populateTags();
|
||||
self->toCommit.addTags(ranges.begin().value().tags);
|
||||
|
||||
|
@ -1046,9 +1152,10 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
|
|||
trCost->get().clearIdxCosts.pop_front();
|
||||
}
|
||||
}
|
||||
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid)
|
||||
.detail("To", allSources);
|
||||
|
||||
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m)
|
||||
.detail("Dbgid", pProxyCommitData->dbgid)
|
||||
.detail("To", allSources);
|
||||
self->toCommit.addTags(allSources);
|
||||
}
|
||||
|
||||
|
@ -1175,6 +1282,8 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
|
|||
&self->computeStart));
|
||||
}
|
||||
|
||||
self->toCommit.saveTags(self->writtenTags);
|
||||
|
||||
pProxyCommitData->stats.mutations += self->mutationCount;
|
||||
pProxyCommitData->stats.mutationBytes += self->mutationBytes;
|
||||
|
||||
|
@ -1200,8 +1309,11 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
|
|||
break;
|
||||
}
|
||||
when(wait(pProxyCommitData->cx->onProxiesChanged())) {}
|
||||
// @todo probably there is no need to get the (entire) version vector from the sequencer
// in this case, and if so, consider adding a flag to the request to tell the sequencer
// to not send the version vector information.
|
||||
when(GetRawCommittedVersionReply v = wait(pProxyCommitData->master.getLiveCommittedVersion.getReply(
|
||||
GetRawCommittedVersionRequest(waitVersionSpan.context, debugID),
|
||||
GetRawCommittedVersionRequest(waitVersionSpan.context, debugID, invalidVersion),
|
||||
TaskPriority::GetLiveCommittedVersionReply))) {
|
||||
if (v.version > pProxyCommitData->committedVersion.get()) {
|
||||
pProxyCommitData->locked = v.locked;
|
||||
|
@ -1238,22 +1350,30 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
|
|||
if (self->prevVersion && self->commitVersion - self->prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT / 2)
|
||||
debug_advanceMaxCommittedVersion(UID(), self->commitVersion); //< Is this valid?
|
||||
|
||||
//TraceEvent("ProxyPush", pProxyCommitData->dbgid).detail("PrevVersion", prevVersion).detail("Version", commitVersion)
|
||||
// .detail("TransactionsSubmitted", trs.size()).detail("TransactionsCommitted", commitCount).detail("TxsPopTo",
|
||||
// msg.popTo);
|
||||
// TraceEvent("ProxyPush", pProxyCommitData->dbgid)
|
||||
// .detail("PrevVersion", self->prevVersion)
|
||||
// .detail("Version", self->commitVersion)
|
||||
// .detail("TransactionsSubmitted", trs.size())
|
||||
// .detail("TransactionsCommitted", self->commitCount)
|
||||
// .detail("TxsPopTo", self->msg.popTo);
|
||||
|
||||
if (self->prevVersion && self->commitVersion - self->prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT / 2)
|
||||
debug_advanceMaxCommittedVersion(UID(), self->commitVersion);
|
||||
|
||||
self->commitStartTime = now();
|
||||
pProxyCommitData->lastStartCommit = self->commitStartTime;
|
||||
Optional<std::unordered_map<uint16_t, Version>> tpcvMap = Optional<std::unordered_map<uint16_t, Version>>();
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR) {
|
||||
tpcvMap = self->tpcvMap;
|
||||
}
|
||||
self->loggingComplete = pProxyCommitData->logSystem->push(self->prevVersion,
|
||||
self->commitVersion,
|
||||
pProxyCommitData->committedVersion.get(),
|
||||
pProxyCommitData->minKnownCommittedVersion,
|
||||
self->toCommit,
|
||||
span.context,
|
||||
self->debugID);
|
||||
self->debugID,
|
||||
tpcvMap);
|
||||
|
||||
float ratio = self->toCommit.getEmptyMessageRatio();
|
||||
pProxyCommitData->stats.commitBatchingEmptyMessageRatio.addMeasurement(ratio);
|
||||
|
@ -1338,7 +1458,9 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
|
|||
debug_advanceMinCommittedVersion(UID(), self->commitVersion);
|
||||
}
|
||||
|
||||
//TraceEvent("ProxyPushed", pProxyCommitData->dbgid).detail("PrevVersion", prevVersion).detail("Version", commitVersion);
|
||||
// TraceEvent("ProxyPushed", pProxyCommitData->dbgid)
|
||||
// .detail("PrevVersion", self->prevVersion)
|
||||
// .detail("Version", self->commitVersion);
|
||||
if (debugID.present())
|
||||
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "CommitProxyServer.commitBatch.AfterLogPush");
|
||||
|
||||
|
@ -1353,13 +1475,21 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
|
|||
// self->committedVersion by reporting commit version first before updating self->committedVersion. Otherwise, a
|
||||
// client may get a commit version that the master is not aware of, and next GRV request may get a version less than
|
||||
// self->committedVersion.
|
||||
|
||||
TEST(pProxyCommitData->committedVersion.get() > self->commitVersion); // later version was reported committed first
|
||||
|
||||
if (self->commitVersion >= pProxyCommitData->committedVersion.get()) {
|
||||
state Optional<std::set<Tag>> writtenTags;
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR) {
|
||||
writtenTags = self->writtenTags;
|
||||
}
|
||||
wait(pProxyCommitData->master.reportLiveCommittedVersion.getReply(
|
||||
ReportRawCommittedVersionRequest(self->commitVersion,
|
||||
self->lockedAfter,
|
||||
self->metadataVersionAfter,
|
||||
pProxyCommitData->minKnownCommittedVersion),
|
||||
pProxyCommitData->minKnownCommittedVersion,
|
||||
self->prevVersion,
|
||||
writtenTags),
|
||||
TaskPriority::ProxyMasterVersionReply));
|
||||
}
|
||||
if (self->commitVersion > pProxyCommitData->committedVersion.get()) {
|
||||
|
@ -1444,7 +1574,19 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
|
|||
if (r->value().size() && r->value().front().first < oldestVersion)
|
||||
r->value().front().first = 0;
|
||||
}
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
// Only normal key space, because \xff key space is processed by all resolvers.
|
||||
pProxyCommitData->keyResolvers.coalesce(normalKeys);
|
||||
auto& versions = pProxyCommitData->systemKeyVersions;
|
||||
while (versions.size() > 1 && versions[1] < oldestVersion) {
|
||||
versions.pop_front();
|
||||
}
|
||||
if (!versions.empty() && versions[0] < oldestVersion) {
|
||||
versions[0] = 0;
|
||||
}
|
||||
} else {
|
||||
pProxyCommitData->keyResolvers.coalesce(allKeys);
|
||||
}
|
||||
if (pProxyCommitData->keyResolvers.size() != lastSize)
|
||||
TraceEvent("KeyResolverSize", pProxyCommitData->dbgid)
|
||||
.detail("Size", pProxyCommitData->keyResolvers.size());
|
||||
|
@ -1526,6 +1668,16 @@ void maybeAddTssMapping(GetKeyServerLocationsReply& reply,
|
|||
}
|
||||
}
|
||||
|
||||
void addTagMapping(GetKeyServerLocationsReply& reply, ProxyCommitData* commitData) {
|
||||
for (const auto& [_, shard] : reply.results) {
|
||||
for (auto& ssi : shard) {
|
||||
auto iter = commitData->storageCache.find(ssi.id());
|
||||
ASSERT_WE_THINK(iter != commitData->storageCache.end());
|
||||
reply.resultsTagMapping.emplace_back(ssi.id(), iter->second->tag);
|
||||
}
|
||||
}
|
||||
}
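addTagMapping attaches each storage server's tag to the key-server-locations reply by looking the server up in the commit proxy's storage cache. A standalone sketch of the same lookup pattern, in plain C++ with stand-in types (UID, Tag and the cache below are placeholders, not the FDB definitions):

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

using UID = uint64_t; // stand-in for FDB's UID
using Tag = int;      // stand-in for FDB's Tag

int main() {
    // Cache of storage-server id -> tag, analogous to commitData->storageCache.
    std::unordered_map<UID, Tag> storageCache = { { 1, 10 }, { 2, 20 } };

    // Storage servers referenced by a reply's shards.
    std::vector<UID> shardServers = { 1, 2, 1 };

    // Build the id -> tag mapping attached to the reply.
    std::vector<std::pair<UID, Tag>> resultsTagMapping;
    for (UID id : shardServers) {
        auto iter = storageCache.find(id);
        assert(iter != storageCache.end()); // analogous to ASSERT_WE_THINK
        resultsTagMapping.emplace_back(id, iter->second);
    }
    assert(resultsTagMapping.size() == 3);
    return 0;
}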
|
||||
|
||||
ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsRequest req, ProxyCommitData* commitData) {
|
||||
// We can't respond to these requests until we have valid txnStateStore
|
||||
getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKeyServersLocations;
|
||||
|
@ -1620,6 +1772,7 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
|
|||
--r;
|
||||
}
|
||||
}
|
||||
addTagMapping(rep, commitData);
|
||||
req.reply.send(rep);
|
||||
++commitData->stats.keyServerLocationOut;
|
||||
return Void();
|
||||
|
@ -2167,9 +2320,12 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
|
|||
"ToResolver_" + commitData.resolvers[i].id().toString(),
|
||||
Histogram::Unit::microseconds));
|
||||
}
|
||||
auto rs = commitData.keyResolvers.modify(allKeys);
|
||||
|
||||
// Initialize keyResolvers map
|
||||
auto rs = commitData.keyResolvers.modify(SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? normalKeys : allKeys);
|
||||
for (auto r = rs.begin(); r != rs.end(); ++r)
|
||||
r->value().emplace_back(0, 0);
|
||||
commitData.systemKeyVersions.push_back(0);
|
||||
|
||||
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor);
|
||||
commitData.logAdapter =
@ -41,6 +41,9 @@ ConfigFollowerInterface::ConfigFollowerInterface(NetworkAddress const& remote)
|
|||
rollforward(Endpoint::wellKnown({ remote }, WLTOKEN_CONFIGFOLLOWER_ROLLFORWARD)),
|
||||
getCommittedVersion(Endpoint::wellKnown({ remote }, WLTOKEN_CONFIGFOLLOWER_GETCOMMITTEDVERSION)) {}
|
||||
|
||||
ConfigFollowerInterface::ConfigFollowerInterface(Hostname const& remote)
|
||||
: _id(deterministicRandom()->randomUniqueID()), hostname(remote) {}
|
||||
|
||||
bool ConfigFollowerInterface::operator==(ConfigFollowerInterface const& rhs) const {
|
||||
return _id == rhs._id;
|
||||
}
@ -221,7 +221,7 @@ public:
|
|||
ConfigFollowerInterface();
|
||||
void setupWellKnownEndpoints();
|
||||
ConfigFollowerInterface(NetworkAddress const& remote);
|
||||
ConfigFollowerInterface(Hostname hostname) : hostname(hostname) {}
|
||||
ConfigFollowerInterface(Hostname const& hostname);
|
||||
bool operator==(ConfigFollowerInterface const& rhs) const;
|
||||
bool operator!=(ConfigFollowerInterface const& rhs) const;
|
||||
UID id() const { return _id; }
@ -42,6 +42,7 @@
|
|||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/BooleanParam.h"
|
||||
#include "flow/genericactors.actor.h"
|
||||
#include "flow/serialize.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/UnitTest.h"
|
||||
|
@ -752,7 +753,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
|
|||
shardsAffectedByTeamFailure,
|
||||
lock,
|
||||
getAverageShardBytes,
|
||||
getUnhealthyRelocationCount,
|
||||
getUnhealthyRelocationCount.getFuture(),
|
||||
self->ddId,
|
||||
storageTeamSize,
|
||||
configuration.storageTeamSize,
|
||||
|
@ -900,8 +901,39 @@ Future<Void> sendSnapReq(RequestStream<Req> stream, Req req, Error e) {
|
|||
return Void();
|
||||
}
|
||||
|
||||
ACTOR template <class Req>
|
||||
Future<ErrorOr<Void>> trySendSnapReq(RequestStream<Req> stream, Req req) {
|
||||
ErrorOr<REPLY_TYPE(Req)> reply = wait(stream.tryGetReply(req));
|
||||
if (reply.isError()) {
|
||||
TraceEvent("SnapDataDistributor_ReqError")
|
||||
.errorUnsuppressed(reply.getError())
|
||||
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
|
||||
return ErrorOr<Void>(reply.getError());
|
||||
}
|
||||
return ErrorOr<Void>(Void());
|
||||
}
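trySendSnapReq converts a failed snapshot request into a value-carrying ErrorOr instead of letting the error propagate, which is what allows waitForMost below to count failures rather than abort on the first one. A rough standalone analogue of that pattern using std::variant (ErrorOr itself is an FDB type and is not reproduced here):

#include <cassert>
#include <stdexcept>
#include <string>
#include <variant>

// Simplified stand-in for FDB's ErrorOr<T>: either an error message or a value.
template <class T>
using ErrorOrLike = std::variant<std::string, T>;

ErrorOrLike<int> tryCall(bool fail) {
    try {
        if (fail)
            throw std::runtime_error("request failed");
        return 42; // the successful reply
    } catch (const std::exception& e) {
        return std::string(e.what()); // error captured as a value, not rethrown
    }
}

int main() {
    assert(std::holds_alternative<int>(tryCall(false)));
    assert(std::holds_alternative<std::string>(tryCall(true)));
    return 0;
}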
|
||||
|
||||
ACTOR static Future<Void> waitForMost(std::vector<Future<ErrorOr<Void>>> futures,
|
||||
int faultTolerance,
|
||||
Error e,
|
||||
double waitMultiplierForSlowFutures = 1.0) {
|
||||
state std::vector<Future<bool>> successFutures;
|
||||
state double startTime = now();
|
||||
successFutures.reserve(futures.size());
|
||||
for (const auto& future : futures) {
|
||||
successFutures.push_back(fmap([](auto const& result) { return result.present(); }, future));
|
||||
}
|
||||
bool success = wait(quorumEqualsTrue(successFutures, successFutures.size() - faultTolerance));
|
||||
if (!success) {
|
||||
throw e;
|
||||
}
|
||||
wait(delay((now() - startTime) * waitMultiplierForSlowFutures) || waitForAll(successFutures));
|
||||
return Void();
|
||||
}
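waitForMost succeeds once all but faultTolerance of the requests have completed without error, and then gives the stragglers a grace period proportional to how long the quorum took. The quorum arithmetic can be shown on plain booleans; this sketch has no flow dependencies and the names are illustrative:

#include <algorithm>
#include <cassert>
#include <vector>

// Quorum rule used by waitForMost: with N requests and a fault tolerance of F,
// at least N - F of them must succeed, otherwise the whole operation fails.
bool mostSucceeded(const std::vector<bool>& results, int faultTolerance) {
    int successes = std::count(results.begin(), results.end(), true);
    return successes >= static_cast<int>(results.size()) - faultTolerance;
}

int main() {
    assert(mostSucceeded({ true, true, false }, 1));   // one failure tolerated
    assert(!mostSucceeded({ true, false, false }, 1)); // two failures is one too many
    assert(mostSucceeded({ true, true, true }, 0));    // no tolerance, but all succeeded
    return 0;
}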
|
||||
|
||||
ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<AsyncVar<ServerDBInfo> const> db) {
|
||||
state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
|
||||
|
||||
state ReadYourWritesTransaction tr(cx);
|
||||
loop {
|
||||
try {
|
||||
|
@ -936,19 +968,29 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
// snap local storage nodes
|
||||
std::vector<WorkerInterface> storageWorkers =
|
||||
// TODO: Atomically read configuration and storage worker list in a single transaction
|
||||
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
|
||||
std::pair<std::vector<WorkerInterface>, int> storageWorkersAndFailures =
|
||||
wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
|
||||
const auto& [storageWorkers, storageFailures] = storageWorkersAndFailures;
|
||||
auto const storageFaultTolerance =
|
||||
std::min(static_cast<int>(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE),
|
||||
configuration.storageTeamSize - 1) -
|
||||
storageFailures;
|
||||
if (storageFaultTolerance < 0) {
|
||||
TEST(true); // Too many failed storage servers to complete snapshot
|
||||
throw snap_storage_failed();
|
||||
}
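// Worked example (numbers are illustrative, not taken from a real cluster): with
// MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE = 1, storageTeamSize = 3 and one unreachable
// storage worker, storageFaultTolerance = min(1, 3 - 1) - 1 = 0, so the snapshot proceeds
// but cannot absorb any further storage failure; one more failure would make it negative
// and the branch above would abandon the snapshot.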
|
||||
TraceEvent("SnapDataDistributor_GotStorageWorkers")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
std::vector<Future<Void>> storageSnapReqs;
|
||||
std::vector<Future<ErrorOr<Void>>> storageSnapReqs;
|
||||
storageSnapReqs.reserve(storageWorkers.size());
|
||||
for (const auto& worker : storageWorkers) {
|
||||
storageSnapReqs.push_back(sendSnapReq(worker.workerSnapReq,
|
||||
WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr),
|
||||
snap_storage_failed()));
|
||||
storageSnapReqs.push_back(trySendSnapReq(
|
||||
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
|
||||
}
|
||||
wait(waitForAll(storageSnapReqs));
|
||||
wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
|
||||
|
||||
TraceEvent("SnapDataDistributor_AfterSnapStorage")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
|
@ -983,14 +1025,15 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
|
|||
TraceEvent("SnapDataDistributor_GotCoordWorkers")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
std::vector<Future<Void>> coordSnapReqs;
|
||||
std::vector<Future<ErrorOr<Void>>> coordSnapReqs;
|
||||
coordSnapReqs.reserve(coordWorkers.size());
|
||||
for (const auto& worker : coordWorkers) {
|
||||
coordSnapReqs.push_back(sendSnapReq(worker.workerSnapReq,
|
||||
WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr),
|
||||
snap_coord_failed()));
|
||||
coordSnapReqs.push_back(trySendSnapReq(
|
||||
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr)));
|
||||
}
|
||||
wait(waitForAll(coordSnapReqs));
|
||||
auto const coordFaultTolerance = std::min<int>(std::max<int>(0, coordSnapReqs.size() / 2 - 1),
|
||||
SERVER_KNOBS->MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE);
|
||||
wait(waitForMost(coordSnapReqs, coordFaultTolerance, snap_coord_failed()));
|
||||
TraceEvent("SnapDataDistributor_AfterSnapCoords")
|
||||
.detail("SnapPayload", snapReq.snapPayload)
|
||||
.detail("SnapUID", snapReq.snapUID);
|
||||
|
@ -1260,3 +1303,44 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
|
|||
|
||||
return Void();
|
||||
}
|
||||
|
||||
static Future<ErrorOr<Void>> goodTestFuture(double duration) {
|
||||
return tag(delay(duration), ErrorOr<Void>(Void()));
|
||||
}
|
||||
|
||||
static Future<ErrorOr<Void>> badTestFuture(double duration, Error e) {
|
||||
return tag(delay(duration), ErrorOr<Void>(e));
|
||||
}
|
||||
|
||||
TEST_CASE("/DataDistribution/WaitForMost") {
|
||||
state std::vector<Future<ErrorOr<Void>>> futures;
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
|
||||
wait(waitForMost(futures, 1, operation_failed(), 0.0)); // Don't wait for slowest future
|
||||
ASSERT(!futures[2].isReady());
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
|
||||
wait(waitForMost(futures, 0, operation_failed(), 0.0)); // Wait for all futures
|
||||
ASSERT(futures[2].isReady());
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
|
||||
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Wait for slowest future
|
||||
ASSERT(futures[2].isReady());
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
|
||||
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Error ignored
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
|
||||
try {
|
||||
wait(waitForMost(futures, 0, operation_failed(), 1.0));
|
||||
ASSERT(false);
|
||||
} catch (Error& e) {
|
||||
ASSERT_EQ(e.code(), error_code_operation_failed);
|
||||
}
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
|
|
@ -267,7 +267,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
|
|||
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
|
||||
MoveKeysLock lock,
|
||||
PromiseStream<Promise<int64_t>> getAverageShardBytes,
|
||||
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
|
||||
FutureStream<Promise<int>> getUnhealthyRelocationCount,
|
||||
UID distributorId,
|
||||
int teamSize,
|
||||
int singleRegionTeamSize,
|
||||
|
|
|
@ -1027,6 +1027,16 @@ struct DDQueueData {
|
|||
|
||||
validate();
|
||||
}
|
||||
|
||||
int getHighestPriorityRelocation() const {
|
||||
int highestPriority{ 0 };
|
||||
for (const auto& [priority, count] : priority_relocations) {
|
||||
if (count > 0) {
|
||||
highestPriority = std::max(highestPriority, priority);
|
||||
}
|
||||
}
|
||||
return highestPriority;
|
||||
}
|
||||
};
|
||||
|
||||
static std::string destServersString(std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> const& bestTeams) {
|
||||
|
@ -1698,7 +1708,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
|
|||
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
|
||||
MoveKeysLock lock,
|
||||
PromiseStream<Promise<int64_t>> getAverageShardBytes,
|
||||
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
|
||||
FutureStream<Promise<int>> getUnhealthyRelocationCount,
|
||||
UID distributorId,
|
||||
int teamSize,
|
||||
int singleRegionTeamSize,
|
||||
|
@ -1792,12 +1802,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
|
|||
|
||||
recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL, TaskPriority::FlushTrace);
|
||||
|
||||
int highestPriorityRelocation = 0;
|
||||
for (auto it = self.priority_relocations.begin(); it != self.priority_relocations.end(); ++it) {
|
||||
if (it->second) {
|
||||
highestPriorityRelocation = std::max(highestPriorityRelocation, it->first);
|
||||
}
|
||||
}
|
||||
auto const highestPriorityRelocation = self.getHighestPriorityRelocation();
|
||||
|
||||
TraceEvent("MovingData", distributorId)
|
||||
.detail("InFlight", self.activeRelocations)
|
||||
|
@ -1833,9 +1838,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
|
|||
}
|
||||
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
|
||||
when(wait(waitForAll(balancingFutures))) {}
|
||||
when(Promise<int> r = waitNext(getUnhealthyRelocationCount.getFuture())) {
|
||||
r.send(self.unhealthyRelocations);
|
||||
}
|
||||
when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); }
|
||||
}
|
||||
}
|
||||
} catch (Error& e) {
@ -1177,15 +1177,16 @@ public:
|
|||
struct Cursor {
|
||||
Cursor() : cache(nullptr), nodeIndex(-1) {}
|
||||
|
||||
Cursor(DecodeCache* cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {}
|
||||
Cursor(Reference<DecodeCache> cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {}
|
||||
|
||||
Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : tree(tree), cache(cache), nodeIndex(nodeIndex) {}
|
||||
Cursor(Reference<DecodeCache> cache, DeltaTree2* tree, int nodeIndex)
|
||||
: tree(tree), cache(cache), nodeIndex(nodeIndex) {}
|
||||
|
||||
// Copy constructor does not copy item because normally a copied cursor will be immediately moved.
|
||||
Cursor(const Cursor& c) : tree(c.tree), cache(c.cache), nodeIndex(c.nodeIndex) {}
|
||||
|
||||
~Cursor() {
|
||||
if (cache != nullptr) {
|
||||
if (cache.isValid()) {
|
||||
cache->updateUsedMemory();
|
||||
}
|
||||
}
|
||||
|
@ -1212,7 +1213,7 @@ public:
|
|||
}
|
||||
|
||||
DeltaTree2* tree;
|
||||
DecodeCache* cache;
|
||||
Reference<DecodeCache> cache;
|
||||
int nodeIndex;
|
||||
mutable Optional<T> item;
|
||||
|
||||
|
@ -1274,6 +1275,7 @@ public:
|
|||
return item.get();
|
||||
}
|
||||
|
||||
// Switch the cursor to point to a new DeltaTree
|
||||
void switchTree(DeltaTree2* newTree) {
|
||||
tree = newTree;
|
||||
// Reset item because it may point into tree memory
|
||||
|
@ -1709,7 +1711,13 @@ public:
|
|||
} else {
|
||||
nodeBytesUsed = 0;
|
||||
}
|
||||
|
||||
ASSERT(size() <= spaceAvailable);
|
||||
nodeBytesFree = spaceAvailable - size();
|
||||
|
||||
// Zero unused available space
|
||||
memset((uint8_t*)this + size(), 0, nodeBytesFree);
|
||||
|
||||
return size();
|
||||
}
|
||||
|
||||
|
@ -1782,8 +1790,15 @@ private:
|
|||
node.setLeftChildOffset(largeNodes, leftChildOffset);
|
||||
node.setRightChildOffset(largeNodes, rightChildOffset);
|
||||
|
||||
deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str());
|
||||
int written = wptr - (uint8_t*)&node;
|
||||
deltatree_printf("Built subtree tree=%p subtreeRoot=%p written=%d end=%p serialized subtreeRoot %s as %s \n",
|
||||
this,
|
||||
&node,
|
||||
written,
|
||||
(uint8_t*)&node + written,
|
||||
item.toString().c_str(),
|
||||
node.toString(this).c_str());
|
||||
|
||||
return wptr - (uint8_t*)&node;
|
||||
return written;
|
||||
}
|
||||
};
@ -25,6 +25,7 @@
|
|||
#include "fdbserver/LogSystemDiskQueueAdapter.h"
|
||||
#include "fdbclient/CommitProxyInterface.h"
|
||||
#include "fdbclient/GrvProxyInterface.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
#include "fdbserver/WaitFailure.h"
|
||||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "fdbrpc/sim_validation.h"
|
||||
|
@ -247,11 +248,15 @@ struct GrvProxyData {
|
|||
Optional<LatencyBandConfig> latencyBandConfig;
|
||||
double lastStartCommit;
|
||||
double lastCommitLatency;
|
||||
LatencySample versionVectorSizeOnGRVReply;
|
||||
int updateCommitRequests;
|
||||
NotifiedDouble lastCommitTime;
|
||||
|
||||
Version minKnownCommittedVersion; // we should ask master for this version.
|
||||
|
||||
// Cache of the latest commit versions of storage servers.
|
||||
VersionVector ssVersionVectorCache;
|
||||
|
||||
void updateLatencyBandConfig(Optional<LatencyBandConfig> newLatencyBandConfig) {
|
||||
if (newLatencyBandConfig.present() != latencyBandConfig.present() ||
|
||||
(newLatencyBandConfig.present() &&
|
||||
|
@ -274,8 +279,12 @@ struct GrvProxyData {
|
|||
Reference<AsyncVar<ServerDBInfo> const> db)
|
||||
: dbgid(dbgid), stats(dbgid), master(master), getConsistentReadVersion(getConsistentReadVersion),
|
||||
cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, LockAware::True)), db(db), lastStartCommit(0),
|
||||
lastCommitLatency(SERVER_KNOBS->REQUIRED_MIN_RECOVERY_DURATION), updateCommitRequests(0), lastCommitTime(0),
|
||||
minKnownCommittedVersion(invalidVersion) {}
|
||||
lastCommitLatency(SERVER_KNOBS->REQUIRED_MIN_RECOVERY_DURATION),
|
||||
versionVectorSizeOnGRVReply("VersionVectorSizeOnGRVReply",
|
||||
dbgid,
|
||||
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
|
||||
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
|
||||
updateCommitRequests(0), lastCommitTime(0), minKnownCommittedVersion(invalidVersion) {}
|
||||
};
|
||||
|
||||
ACTOR Future<Void> healthMetricsRequestServer(GrvProxyInterface grvProxy,
|
||||
|
@ -552,7 +561,8 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanID parentSpan,
|
|||
state double grvStart = now();
|
||||
state Future<GetRawCommittedVersionReply> replyFromMasterFuture;
|
||||
replyFromMasterFuture = grvProxyData->master.getLiveCommittedVersion.getReply(
|
||||
GetRawCommittedVersionRequest(span.context, debugID), TaskPriority::GetLiveCommittedVersionReply);
|
||||
GetRawCommittedVersionRequest(span.context, debugID, grvProxyData->ssVersionVectorCache.getMaxVersion()),
|
||||
TaskPriority::GetLiveCommittedVersionReply);
|
||||
|
||||
if (!SERVER_KNOBS->ALWAYS_CAUSAL_READ_RISKY && !(flags & GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)) {
|
||||
wait(updateLastCommit(grvProxyData, debugID));
|
||||
|
@ -571,7 +581,8 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanID parentSpan,
|
|||
GetRawCommittedVersionReply repFromMaster = wait(replyFromMasterFuture);
|
||||
grvProxyData->minKnownCommittedVersion =
|
||||
std::max(grvProxyData->minKnownCommittedVersion, repFromMaster.minKnownCommittedVersion);
|
||||
|
||||
// TODO add to "status json"
|
||||
grvProxyData->ssVersionVectorCache.applyDelta(repFromMaster.ssVersionVectorDelta);
|
||||
grvProxyData->stats.grvGetCommittedVersionRpcDist->sampleSeconds(now() - grvConfirmEpochLive);
|
||||
GetReadVersionReply rep;
|
||||
rep.version = repFromMaster.version;
|
||||
|
@ -603,6 +614,7 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanID parentSpan,
|
|||
// Update GRV statistics according to the request's priority.
|
||||
ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
|
||||
std::vector<GetReadVersionRequest> requests,
|
||||
GrvProxyData* grvProxyData,
|
||||
GrvProxyStats* stats,
|
||||
Version minKnownCommittedVersion,
|
||||
PrioritizedTransactionTagMap<ClientTagThrottleLimits> throttledTags,
|
||||
|
@ -634,6 +646,9 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
|
|||
}
|
||||
reply.midShardSize = midShardSize;
|
||||
reply.tagThrottleInfo.clear();
|
||||
grvProxyData->ssVersionVectorCache.getDelta(request.maxVersion, reply.ssVersionVectorDelta);
|
||||
grvProxyData->versionVectorSizeOnGRVReply.addMeasurement(reply.ssVersionVectorDelta.size());
|
||||
reply.proxyId = grvProxyData->dbgid;
|
||||
|
||||
if (!request.tags.empty()) {
|
||||
auto& priorityThrottledTags = throttledTags[request.priority];
|
||||
|
@ -936,6 +951,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
|
|||
batchPriTransactionsStarted[i]);
|
||||
addActor.send(sendGrvReplies(readVersionReply,
|
||||
start[i],
|
||||
grvProxyData,
|
||||
&grvProxyData->stats,
|
||||
grvProxyData->minKnownCommittedVersion,
|
||||
throttledTags,
|
||||
|
|
|
@ -20,22 +20,24 @@
|
|||
|
||||
#ifndef FDBSERVER_IPAGER_H
|
||||
#define FDBSERVER_IPAGER_H
|
||||
#include "flow/Error.h"
|
||||
#include "flow/FastAlloc.h"
|
||||
#include "flow/ProtocolVersion.h"
|
||||
#include <cstddef>
|
||||
#include <stdint.h>
|
||||
#pragma once
|
||||
|
||||
#include "fdbserver/IKeyValueStore.h"
|
||||
|
||||
#include "flow/flow.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#define XXH_INLINE_ALL
|
||||
#include "flow/xxhash.h"
|
||||
|
||||
#ifndef VALGRIND
|
||||
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
|
||||
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
|
||||
#endif
|
||||
|
||||
typedef uint32_t LogicalPageID;
|
||||
typedef uint32_t PhysicalPageID;
|
||||
#define invalidLogicalPageID std::numeric_limits<LogicalPageID>::max()
|
||||
#define invalidPhysicalPageID std::numeric_limits<PhysicalPageID>::max()
|
||||
|
||||
typedef uint32_t QueueID;
|
||||
#define invalidQueueID std::numeric_limits<QueueID>::max()
|
||||
|
@ -76,90 +78,509 @@ static const std::vector<std::pair<PagerEvents, PagerEventReasons>> L0PossibleEv
|
|||
{ PagerEvents::PageWrite, PagerEventReasons::MetaData },
|
||||
};
|
||||
|
||||
// Represents a block of memory in a 4096-byte aligned location held by an Arena.
|
||||
enum EncodingType : uint8_t {
|
||||
XXHash64 = 0,
|
||||
// For testing purposes
|
||||
XOREncryption = 1
|
||||
};
|
||||
|
||||
enum PageType : uint8_t {
|
||||
HeaderPage = 0,
|
||||
BackupHeaderPage = 1,
|
||||
BTreeNode = 2,
|
||||
BTreeSuperNode = 3,
|
||||
QueuePageStandalone = 4,
|
||||
QueuePageInExtent = 5
|
||||
};
|
||||
|
||||
// Encryption key ID
|
||||
typedef uint64_t KeyID;
|
||||
|
||||
// EncryptionKeyRef is somewhat multi-variant, it will contain members representing the union
|
||||
// of all fields relevant to any implemented encryption scheme. They are generally of
|
||||
// the form
|
||||
// Page Fields - fields which come from or are stored in the Page
|
||||
// Secret Fields - fields which are only known by the Key Provider
|
||||
// but it is up to each encoding and provider which fields are which and which ones are used
|
||||
struct EncryptionKeyRef {
|
||||
|
||||
EncryptionKeyRef(){};
|
||||
EncryptionKeyRef(Arena& arena, const EncryptionKeyRef& toCopy) : secret(arena, toCopy.secret), id(toCopy.id) {}
|
||||
int expectedSize() const { return secret.size(); }
|
||||
|
||||
StringRef secret;
|
||||
Optional<KeyID> id;
|
||||
};
|
||||
typedef Standalone<EncryptionKeyRef> EncryptionKey;
|
||||
|
||||
// Interface used by pager to get encryption keys by ID when reading pages from disk
|
||||
// and by the BTree to get encryption keys to use for new pages
|
||||
class IEncryptionKeyProvider {
|
||||
public:
|
||||
virtual ~IEncryptionKeyProvider() {}
|
||||
|
||||
// Get an EncryptionKey with Secret Fields populated based on the given Page Fields.
|
||||
// It is up to the implementation which fields those are.
|
||||
// The output Page Fields must match the input Page Fields.
|
||||
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) = 0;
|
||||
|
||||
// Get encryption key that should be used for a given user Key-Value range
|
||||
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) = 0;
|
||||
};
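The split between Page Fields (stored on the page, e.g. the key id) and Secret Fields (known only to the provider) means getSecrets() takes a partially filled key and returns the same key with the secret attached. A standalone, non-FDB sketch of that contract, using a std::map and plain structs in place of Future and EncryptionKey:

#include <cassert>
#include <cstdint>
#include <map>
#include <optional>
#include <string>

// Simplified analogue of EncryptionKeyRef: an optional id (a Page Field)
// plus a secret (a Secret Field, filled in by the provider).
struct KeyInfo {
    std::optional<uint64_t> id;
    std::string secret;
};

// Analogue of IEncryptionKeyProvider::getSecrets(): look up the secret for the id
// carried by the page and return the key with both halves populated.
KeyInfo getSecrets(const std::map<uint64_t, std::string>& secrets, KeyInfo key) {
    assert(key.id.has_value());
    key.secret = secrets.at(*key.id);
    return key;
}

int main() {
    std::map<uint64_t, std::string> secrets = { { 7, "s3cret" } };
    KeyInfo fromPage;
    fromPage.id = 7; // as read from the page's encoding header
    KeyInfo full = getSecrets(secrets, fromPage);
    assert(full.secret == "s3cret");
    return 0;
}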
|
||||
|
||||
// This is a hacky way to attach an additional object of an arbitrary type at runtime to another object.
|
||||
// It stores an arbitrary void pointer and a void pointer function to call when the ArbitraryObject
|
||||
// is destroyed.
|
||||
// It has helper operator= methods for storing heap-allocated T's or Reference<T>'s into it via
// x = thing;
// Examples:
// ArbitraryObject x;
// x = new Widget(); // x owns the new object
// x = Reference<SomeClass>(new SomeClass()); // x holds a reference now too
// x = Reference<SomeReferenceCountedType>(new SomeReferenceCountedType());
|
||||
struct ArbitraryObject {
|
||||
ArbitraryObject() : ptr(nullptr), onDestruct(nullptr) {}
|
||||
ArbitraryObject(const ArbitraryObject&) = delete;
|
||||
|
||||
~ArbitraryObject() { destructOnly(); }
|
||||
|
||||
bool valid() const { return ptr != nullptr; }
|
||||
|
||||
template <typename T>
|
||||
void operator=(T* p) {
|
||||
destructOnly();
|
||||
ptr = p;
|
||||
onDestruct = [](void* ptr) { delete (T*)ptr; };
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void operator=(Reference<T>& r) {
|
||||
destructOnly();
|
||||
ptr = r.getPtr();
|
||||
r.getPtr()->addref();
|
||||
onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void operator=(Reference<T>&& r) {
|
||||
destructOnly();
|
||||
ptr = r.extractPtr();
|
||||
onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* getPtr() {
|
||||
return (T*)ptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Reference<T> getReference() {
|
||||
return Reference<T>::addRef((T*)ptr);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
destructOnly();
|
||||
ptr = nullptr;
|
||||
onDestruct = nullptr;
|
||||
}
|
||||
|
||||
// ptr can be set to any arbitrary thing. If it is not null at destruct time then
|
||||
// onDestruct(ptr) will be called if onDestruct is not null.
|
||||
void* ptr = nullptr;
|
||||
void (*onDestruct)(void*) = nullptr;
|
||||
|
||||
private:
|
||||
// Call onDestruct(ptr) if needed but don't reset any state
|
||||
void destructOnly() {
|
||||
if (ptr != nullptr && onDestruct != nullptr) {
|
||||
onDestruct(ptr);
|
||||
}
|
||||
}
|
||||
};
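Because ArbitraryObject only stores a void pointer plus a destructor callback, it can hold any owned object without templating the holder itself. A minimal standalone demonstration of the same type-erased cleanup idea (this is an illustrative re-implementation, not the FDB class):

#include <cassert>
#include <string>

// Minimal type-erased holder: remembers how to destroy whatever it was given.
struct Erased {
    void* ptr = nullptr;
    void (*onDestruct)(void*) = nullptr;

    template <typename T>
    void set(T* p) {
        reset();
        ptr = p;
        onDestruct = [](void* q) { delete static_cast<T*>(q); };
    }
    void reset() {
        if (ptr && onDestruct)
            onDestruct(ptr);
        ptr = nullptr;
        onDestruct = nullptr;
    }
    ~Erased() { reset(); }
};

int main() {
    Erased x;
    x.set(new std::string("owned by x")); // x now owns the string
    assert(*static_cast<std::string*>(x.ptr) == "owned by x");
    x.reset(); // the stored destructor callback deletes it
    assert(x.ptr == nullptr);
    return 0;
}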
|
||||
|
||||
// ArenaPage represents a data page meant to be stored on disk, located in a block of
|
||||
// 4k-aligned memory held by an Arena
|
||||
//
|
||||
// Page Format:
|
||||
// PageHeader - describes main header version, encoding type, and offsets of subheaders and payload.
|
||||
// MainHeader - structure based on header version. It is responsible for protecting all bytes
|
||||
// of PageHeader, MainHeader, and EncodingHeader with some sort of checksum.
|
||||
// EncodingHeader - structure based on encoding type. It is responsible for protecting and
|
||||
// possibly encrypting all payload bytes.
|
||||
// Payload - User accessible bytes, protected and possibly encrypted based on the encoding
|
||||
//
|
||||
// preWrite() must be called before writing a page to disk to update checksums and encrypt as needed
|
||||
// After reading a page from disk,
|
||||
// postReadHeader() must be called to verify the version, main, and encoding headers
|
||||
// postReadPayload() must be called, after potentially setting encryption secret, to verify and possibly
|
||||
// decrypt the payload
|
||||
class ArenaPage : public ReferenceCounted<ArenaPage>, public FastAllocated<ArenaPage> {
|
||||
public:
|
||||
// The page's logical size includes an opaque checksum, use size() to get usable size
|
||||
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) {
|
||||
// This is the header version that new page init() calls will use.
|
||||
// It is not necessarily the latest header version, as read/modify support for
|
||||
// a new header version may be added prior to using that version as the default
|
||||
// for new pages as part of downgrade support.
|
||||
static constexpr uint8_t HEADER_WRITE_VERSION = 1;
|
||||
|
||||
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), pPayload(nullptr) {
|
||||
if (bufferSize > 0) {
|
||||
buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize);
|
||||
|
||||
// Mark any unused page portion defined
|
||||
VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
|
||||
// Zero unused region
|
||||
memset(buffer + logicalSize, 0, bufferSize - logicalSize);
|
||||
} else {
|
||||
buffer = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
~ArenaPage() {
|
||||
if (userData != nullptr && userDataDestructor != nullptr) {
|
||||
userDataDestructor(userData);
|
||||
~ArenaPage() {}
|
||||
|
||||
// Before using these, either init() or postReadHeader() and postReadPayload() must be called
|
||||
const uint8_t* data() const { return pPayload; }
|
||||
uint8_t* mutateData() const { return (uint8_t*)pPayload; }
|
||||
int dataSize() const { return payloadSize; }
|
||||
|
||||
StringRef dataAsStringRef() const { return StringRef((uint8_t*)pPayload, payloadSize); }
|
||||
|
||||
const uint8_t* rawData() const { return buffer; }
|
||||
uint8_t* rawData() { return buffer; }
|
||||
int rawSize() const { return bufferSize; }
|
||||
|
||||
#pragma pack(push, 1)
|
||||
|
||||
// The next few structs describe the byte-packed physical structure. The fields of Page
|
||||
// cannot change, but new header versions and encoding types can be added and existing
|
||||
// header versions and encoding type headers could change size as offset information
|
||||
// is stored to enable efficient jumping to the encoding header or payload.
|
||||
// Page members are only initialized in init()
|
||||
struct PageHeader {
|
||||
uint8_t headerVersion;
|
||||
EncodingType encodingType;
|
||||
|
||||
// Encoding header comes after main header
|
||||
uint8_t encodingHeaderOffset;
|
||||
|
||||
// Payload comes after encoding header
|
||||
uint8_t payloadOffset;
|
||||
|
||||
// Get main header pointer, casting to its type
|
||||
template <typename T>
|
||||
T* getMainHeader() const {
|
||||
return (T*)(this + 1);
|
||||
}
|
||||
|
||||
// Get encoding header pointer, casting to its type
|
||||
template <typename T>
|
||||
T* getEncodingHeader() const {
|
||||
return (T*)((uint8_t*)this + encodingHeaderOffset);
|
||||
}
|
||||
|
||||
// Get payload pointer
|
||||
uint8_t* getPayload() const { return (uint8_t*)this + payloadOffset; }
|
||||
};
|
||||
|
||||
// Redwood header version 1
|
||||
// Protects all headers with a 64-bit XXHash checksum
|
||||
// Most other fields are forensic in nature and are not required to be set for correct
|
||||
// behavior but they can facilitate forensic investigation of data on disk. Some of them
|
||||
// could be used for sanity checks at runtime.
|
||||
struct RedwoodHeaderV1 {
|
||||
PageType pageType;
|
||||
// The meaning of pageSubType is based on pageType
|
||||
// For Queue pages, pageSubType is the QueueID
|
||||
// For BTree nodes, pageSubType is Height (also stored in BTreeNode)
|
||||
uint8_t pageSubType;
|
||||
// Format identifier, normally specific to the page Type and SubType
|
||||
uint8_t pageFormat;
|
||||
XXH64_hash_t checksum;
|
||||
|
||||
// Physical page ID of first block on disk of the ArenaPage
|
||||
PhysicalPageID firstPhysicalPageID;
|
||||
// The first logical page ID the ArenaPage was referenced by when last written
|
||||
LogicalPageID lastKnownLogicalPageID;
|
||||
// The first logical page ID of the parent of this ArenaPage when last written
|
||||
LogicalPageID lastKnownParentLogicalPageID;
|
||||
|
||||
// Time and write version as of the last update to this page.
|
||||
// Note that for relocated pages, writeVersion should not be updated.
|
||||
double writeTime;
|
||||
Version writeVersion;
|
||||
|
||||
// Update checksum
|
||||
void updateChecksum(uint8_t* headerBytes, int len) {
|
||||
// Checksum is within the checksum input so clear it first
|
||||
checksum = 0;
|
||||
checksum = XXH3_64bits(headerBytes, len);
|
||||
}
|
||||
|
||||
// Verify checksum
|
||||
void verifyChecksum(uint8_t* headerBytes, int len) {
|
||||
// Checksum is within the checksum input so save it and restore it afterwards
|
||||
XXH64_hash_t saved = checksum;
|
||||
checksum = 0;
|
||||
XXH64_hash_t calculated = XXH3_64bits(headerBytes, len);
|
||||
checksum = saved;
|
||||
|
||||
if (saved != calculated) {
|
||||
throw page_header_checksum_failed();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// An encoding that validates the payload with an XXHash checksum
|
||||
struct XXHashEncodingHeader {
|
||||
XXH64_hash_t checksum;
|
||||
void encode(uint8_t* payload, int len, PhysicalPageID seed) {
|
||||
checksum = XXH3_64bits_withSeed(payload, len, seed);
|
||||
}
|
||||
void decode(uint8_t* payload, int len, PhysicalPageID seed) {
|
||||
if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
|
||||
throw page_decoding_failed();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// A dummy "encrypting" encoding which uses XOR with a 1 byte secret key on
|
||||
// the payload to obfuscate it and protects the payload with an XXHash checksum.
|
||||
struct XOREncryptionEncodingHeader {
|
||||
// Checksum is on unencrypted payload
|
||||
XXH64_hash_t checksum;
|
||||
uint8_t keyID;
|
||||
|
||||
void encode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
|
||||
checksum = XXH3_64bits_withSeed(payload, len, seed);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
payload[i] ^= secret;
|
||||
}
|
||||
}
|
||||
void decode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
|
||||
for (int i = 0; i < len; ++i) {
|
||||
payload[i] ^= secret;
|
||||
}
|
||||
if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
|
||||
throw page_decoding_failed();
|
||||
}
|
||||
}
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
// Get the size of the encoding header based on type
|
||||
// Note that this is only to be used in operations involving new pages to calculate the payload offset. For
|
||||
// existing pages, the payload offset is stored in the page.
|
||||
static int encodingHeaderSize(EncodingType t) {
|
||||
if (t == EncodingType::XXHash64) {
|
||||
return sizeof(XXHashEncodingHeader);
|
||||
} else if (t == EncodingType::XOREncryption) {
|
||||
return sizeof(XOREncryptionEncodingHeader);
|
||||
} else {
|
||||
throw page_encoding_not_supported();
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t const* begin() const { return (uint8_t*)buffer; }
|
||||
// Get the usable size for a new page of pageSize using HEADER_WRITE_VERSION with encoding type t
|
||||
static int getUsableSize(int pageSize, EncodingType t) {
|
||||
return pageSize - sizeof(PageHeader) - sizeof(RedwoodHeaderV1) - encodingHeaderSize(t);
|
||||
}
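// Worked example (the header sizes below are illustrative assumptions, not the real sizeof
// values): with a 4096-byte page, a 4-byte PageHeader, a 40-byte RedwoodHeaderV1, and an
// 8-byte XXHashEncodingHeader, getUsableSize(4096, EncodingType::XXHash64) would return
// 4096 - 4 - 40 - 8 = 4044 bytes of usable payload.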
|
||||
|
||||
uint8_t* mutate() { return (uint8_t*)buffer; }
|
||||
// Initialize the header for a new page so that the payload can be written to
|
||||
// Pre: Buffer is allocated and logical size is set
|
||||
// Post: Page header is initialized and space is reserved for subheaders for
|
||||
// HEADER_WRITE_VERSION main header and the given encoding type.
|
||||
// Payload can be written to with mutateData() and dataSize()
|
||||
void init(EncodingType t, PageType pageType, uint8_t pageSubType, uint8_t pageFormat = 0) {
|
||||
// Carefully cast away constness to modify page header
|
||||
PageHeader* p = const_cast<PageHeader*>(page);
|
||||
p->headerVersion = HEADER_WRITE_VERSION;
|
||||
p->encodingHeaderOffset = sizeof(PageHeader) + sizeof(RedwoodHeaderV1);
|
||||
p->encodingType = t;
|
||||
p->payloadOffset = page->encodingHeaderOffset + encodingHeaderSize(t);
|
||||
|
||||
typedef XXH64_hash_t Checksum;
|
||||
pPayload = page->getPayload();
|
||||
payloadSize = logicalSize - (pPayload - buffer);
|
||||
|
||||
// Usable size, without checksum
|
||||
int size() const { return logicalSize - sizeof(Checksum); }
|
||||
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
|
||||
h->pageType = pageType;
|
||||
h->pageSubType = pageSubType;
|
||||
h->pageFormat = pageFormat;
|
||||
|
||||
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(begin(), size()), arena); }
|
||||
// Write dummy values for these in new pages. They should be updated when possible before calling preWrite()
|
||||
// when modifying existing pages
|
||||
h->lastKnownLogicalPageID = invalidLogicalPageID;
|
||||
h->lastKnownParentLogicalPageID = invalidLogicalPageID;
|
||||
h->writeVersion = invalidVersion;
|
||||
}
|
||||
|
||||
// Get an ArenaPage which is a copy of this page, in its own Arena
|
||||
Reference<ArenaPage> cloneContents() const {
|
||||
// Get the logical page buffer as a StringRef
|
||||
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(buffer, logicalSize)); }
|
||||
|
||||
// Get a new ArenaPage that contains a copy of this page's data.
|
||||
// extra is not copied to the returned page
|
||||
Reference<ArenaPage> clone() const {
|
||||
ArenaPage* p = new ArenaPage(logicalSize, bufferSize);
|
||||
memcpy(p->buffer, buffer, logicalSize);
|
||||
|
||||
// Non-verifying header parse just to initialize members
|
||||
p->postReadHeader(invalidPhysicalPageID, false);
|
||||
p->encryptionKey = encryptionKey;
|
||||
|
||||
return Reference<ArenaPage>(p);
|
||||
}
|
||||
|
||||
// Get an ArenaPage which depends on this page's Arena and references some of its memory
|
||||
Reference<ArenaPage> subPage(int offset, int len) const {
|
||||
Reference<ArenaPage> getSubPage(int offset, int len) const {
|
||||
ASSERT(offset + len <= logicalSize);
|
||||
ArenaPage* p = new ArenaPage(len, 0);
|
||||
p->buffer = buffer + offset;
|
||||
p->arena.dependsOn(arena);
|
||||
|
||||
// Non-verifying header parse just to initialize component pointers
|
||||
p->postReadHeader(invalidPhysicalPageID, false);
|
||||
p->encryptionKey = encryptionKey;
|
||||
|
||||
return Reference<ArenaPage>(p);
|
||||
}
|
||||
|
||||
// Given a vector of pages with the same ->size(), create a new ArenaPage with a ->size() that is
|
||||
// equivalent to all of the input pages and has all of their contents copied into it.
|
||||
static Reference<ArenaPage> concatPages(const std::vector<Reference<const ArenaPage>>& pages) {
|
||||
int usableSize = pages.front()->size();
|
||||
int totalUsableSize = pages.size() * usableSize;
|
||||
int totalBufferSize = pages.front()->bufferSize * pages.size();
|
||||
ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize);
|
||||
|
||||
uint8_t* wptr = superpage->mutate();
|
||||
for (auto& p : pages) {
|
||||
ASSERT(p->size() == usableSize);
|
||||
memcpy(wptr, p->begin(), usableSize);
|
||||
wptr += usableSize;
|
||||
// The next two functions set mostly forensic info that may help in an investigation to identify data on disk. The
|
||||
// exception is pageID which must be set to the physical page ID on disk where the page is written or post-read
|
||||
// verification will fail.
|
||||
void setWriteInfo(PhysicalPageID pageID, Version writeVersion) {
|
||||
if (page->headerVersion == 1) {
|
||||
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
|
||||
h->firstPhysicalPageID = pageID;
|
||||
h->writeVersion = writeVersion;
|
||||
h->writeTime = now();
|
||||
}
|
||||
}
|
||||
|
||||
return Reference<ArenaPage>(superpage);
|
||||
// These should be updated before writing a BTree page. Note that the logical ID that refers to a page can change
|
||||
// after the page is written, if its parent is updated to point directly to its physical page ID. Therefore, the
|
||||
// last known logical page ID should always be updated before writing an updated version of a BTree page.
|
||||
void setLogicalPageInfo(LogicalPageID lastKnownLogicalPageID, LogicalPageID lastKnownParentLogicalPageID) {
|
||||
if (page->headerVersion == 1) {
|
||||
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
|
||||
h->lastKnownLogicalPageID = lastKnownLogicalPageID;
|
||||
h->lastKnownParentLogicalPageID = lastKnownParentLogicalPageID;
|
||||
}
|
||||
}
|
||||
|
||||
Checksum& getChecksum() { return *(Checksum*)(buffer + size()); }
|
||||
// Must be called before writing to disk to update headers and encrypt page
|
||||
// Pre: Encoding-specific header fields are set if needed
|
||||
// Secret is set if needed
|
||||
// Post: Main and Encoding subheaders are updated
|
||||
// Payload is possibly encrypted
|
||||
void preWrite(PhysicalPageID pageID) const {
|
||||
// Explicitly check payload definedness to make the source of valgrind errors more clear.
|
||||
// Without this check, calculating a checksum on a payload with undefined bytes does not
|
||||
// cause a valgrind error but the resulting checksum is undefined which causes errors later.
|
||||
ASSERT(VALGRIND_CHECK_MEM_IS_DEFINED(pPayload, payloadSize) == 0);
|
||||
|
||||
Checksum calculateChecksum(LogicalPageID pageID) { return XXH3_64bits_withSeed(buffer, size(), pageID); }
|
||||
if (page->encodingType == EncodingType::XXHash64) {
|
||||
page->getEncodingHeader<XXHashEncodingHeader>()->encode(pPayload, payloadSize, pageID);
|
||||
} else if (page->encodingType == EncodingType::XOREncryption) {
|
||||
ASSERT(encryptionKey.secret.size() == 1);
|
||||
XOREncryptionEncodingHeader* xh = page->getEncodingHeader<XOREncryptionEncodingHeader>();
|
||||
xh->keyID = encryptionKey.id.orDefault(0);
|
||||
xh->encode(encryptionKey.secret[0], pPayload, payloadSize, pageID);
|
||||
} else {
|
||||
throw page_encoding_not_supported();
|
||||
}
|
||||
|
||||
void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); }
|
||||
if (page->headerVersion == 1) {
|
||||
page->getMainHeader<RedwoodHeaderV1>()->updateChecksum(buffer, pPayload - buffer);
|
||||
} else {
|
||||
throw page_header_version_not_supported();
|
||||
}
|
||||
}
|
||||
|
||||
bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); }
|
||||
// Must be called after reading from disk to verify all non-payload bytes
|
||||
// Pre: Bytes from storage medium copied into raw buffer space
|
||||
// Post: Page headers outside of payload are verified (unless verify is false)
|
||||
// encryptionKey is updated with information from encoding header if needed
|
||||
// Payload is accessible via data(), dataSize(), etc.
|
||||
//
|
||||
// Exceptions are thrown for unknown header types or pages which fail verification
|
||||
void postReadHeader(PhysicalPageID pageID, bool verify = true) {
|
||||
pPayload = page->getPayload();
|
||||
payloadSize = logicalSize - (pPayload - buffer);
|
||||
|
||||
// Populate encryption key with relevant fields from page
|
||||
if (page->encodingType == EncodingType::XOREncryption) {
|
||||
encryptionKey.id = page->getEncodingHeader<XOREncryptionEncodingHeader>()->keyID;
|
||||
}
|
||||
|
||||
if (page->headerVersion == 1) {
|
||||
if (verify) {
|
||||
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
|
||||
h->verifyChecksum(buffer, pPayload - buffer);
|
||||
if (pageID != h->firstPhysicalPageID) {
|
||||
throw page_header_wrong_page_id();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw page_header_version_not_supported();
|
||||
}
|
||||
}
|
||||
|
||||
// Pre: postReadHeader has been called, encoding-specific parameters (such as the encryption secret) have been set
|
||||
// Post: Payload has been verified and decrypted if necessary
|
||||
void postReadPayload(PhysicalPageID pageID) {
|
||||
if (page->encodingType == EncodingType::XXHash64) {
|
||||
page->getEncodingHeader<XXHashEncodingHeader>()->decode(pPayload, payloadSize, pageID);
|
||||
} else if (page->encodingType == EncodingType::XOREncryption) {
|
||||
ASSERT(encryptionKey.secret.size() == 1);
|
||||
page->getEncodingHeader<XOREncryptionEncodingHeader>()->decode(
|
||||
encryptionKey.secret[0], pPayload, payloadSize, pageID);
|
||||
} else {
|
||||
throw page_encoding_not_supported();
|
||||
}
|
||||
}
|
||||
|
||||
const Arena& getArena() const { return arena; }
|
||||
|
||||
static bool isEncodingTypeEncrypted(EncodingType t) { return t == EncodingType::XOREncryption; }
|
||||
|
||||
// Returns true if the page's encoding type employs encryption
|
||||
bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); }
|
||||
|
||||
private:
|
||||
Arena arena;
|
||||
|
||||
// The logical size of the page, which can be smaller than bufferSize. This is only of
// practical use in simulation, where arbitrarily small page sizes help test edge cases
// with shorter execution time
|
||||
int logicalSize;
|
||||
|
||||
// The 4k-aligned physical size of allocated memory for the page which also represents the
|
||||
// block size to be written to disk
|
||||
int bufferSize;
|
||||
|
||||
// buffer is a pointer to the page's memory
|
||||
// For convenience, it is unioned with a Page pointer which defines the page structure
|
||||
union {
|
||||
uint8_t* buffer;
|
||||
const PageHeader* page;
|
||||
};
|
||||
|
||||
// Pointer and length of page space available to the user
|
||||
// These are accessed very often so they are stored directly
|
||||
uint8_t* pPayload;
|
||||
int payloadSize;
|
||||
|
||||
public:
|
||||
mutable void* userData;
|
||||
mutable void (*userDataDestructor)(void*);
|
||||
EncodingType getEncodingType() const { return page->encodingType; }
|
||||
|
||||
PhysicalPageID getPhysicalPageID() const {
|
||||
if (page->headerVersion == 1) {
|
||||
return page->getMainHeader<RedwoodHeaderV1>()->firstPhysicalPageID;
|
||||
} else {
|
||||
throw page_header_version_not_supported();
|
||||
}
|
||||
}
|
||||
|
||||
// Used by encodings that do encryption
|
||||
EncryptionKey encryptionKey;
|
||||
|
||||
mutable ArbitraryObject extra;
|
||||
};
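The write/read lifecycle documented above (init the headers, fill the payload, preWrite() computes the header checksum, postReadHeader() re-derives and compares it after the page comes back from disk) can be shown end to end with a much smaller standalone page. Everything below is a simplified analogue: a trivial byte sum stands in for XXHash and there is no encoding header or encryption.

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// A much simplified page: one version byte, a checksum slot, and a payload.
struct MiniPage {
    uint8_t version = 1;
    uint64_t checksum = 0;
    std::vector<uint8_t> payload;

    uint64_t calculate() const {
        // The checksum field itself is excluded from the sum (FDB instead zeroes it before
        // hashing, because there the field sits inside the hashed byte range).
        return version + std::accumulate(payload.begin(), payload.end(), uint64_t{ 0 });
    }
    void preWrite() { checksum = calculate(); }                     // before writing to "disk"
    bool postReadHeader() const { return checksum == calculate(); } // after reading it back
};

int main() {
    MiniPage p;
    p.payload = { 'r', 'e', 'd', 'w', 'o', 'o', 'd' };
    p.preWrite();
    assert(p.postReadHeader()); // an intact page verifies

    p.payload[0] ^= 0xff; // simulate on-disk corruption
    assert(!p.postReadHeader());
    return 0;
}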
|
||||
|
||||
class IPagerSnapshot {
|
||||
|
@ -184,18 +605,21 @@ public:
|
|||
|
||||
virtual void addref() = 0;
|
||||
virtual void delref() = 0;
|
||||
|
||||
ArbitraryObject extra;
|
||||
};
|
||||
|
||||
// This API is probably too customized to the behavior of DWALPager and probably needs some changes to be more generic.
|
||||
class IPager2 : public IClosable {
|
||||
public:
|
||||
virtual std::string getName() const = 0;
|
||||
|
||||
// Returns an ArenaPage that can be passed to writePage. The data in the returned ArenaPage might not be zeroed.
|
||||
virtual Reference<ArenaPage> newPageBuffer(size_t size = 1) = 0;
|
||||
virtual Reference<ArenaPage> newPageBuffer(size_t blocks = 1) = 0;
|
||||
|
||||
// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
|
||||
// For a given pager instance, separate calls to this function must return the same value.
|
||||
// Only valid to call after recovery is complete.
|
||||
virtual int getUsablePageSize() const = 0;
|
||||
virtual int getPhysicalPageSize() const = 0;
|
||||
virtual int getLogicalPageSize() const = 0;
|
||||
virtual int getPagesPerExtent() const = 0;
|
||||
|
@ -251,7 +675,7 @@ public:
|
|||
bool noHit) = 0;
|
||||
virtual Future<Reference<ArenaPage>> readMultiPage(PagerEventReasons reason,
|
||||
unsigned int level,
|
||||
Standalone<VectorRef<PhysicalPageID>> pageIDs,
|
||||
VectorRef<PhysicalPageID> pageIDs,
|
||||
int priority,
|
||||
bool cacheable,
|
||||
bool noHit) = 0;
|
||||
|
@ -271,16 +695,13 @@ public:
|
|||
// The snapshot shall be usable until setOldVersion() is called with a version > v.
|
||||
virtual Reference<IPagerSnapshot> getReadSnapshot(Version v) = 0;
|
||||
|
||||
// Atomically make durable all pending page writes, page frees, and update the metadata string,
|
||||
// setting the committed version to v
|
||||
// v must be >= the highest versioned page write.
|
||||
virtual Future<Void> commit(Version v) = 0;
|
||||
// Atomically make durable all pending page writes, page frees, and update the user commit
|
||||
// record at version v
|
||||
// v must be higher than the highest committed version
|
||||
virtual Future<Void> commit(Version v, Value commitRecord) = 0;
|
||||
|
||||
// Get the latest meta key set or committed
|
||||
virtual Key getMetaKey() const = 0;
|
||||
|
||||
// Set the metakey which will be stored in the next commit
|
||||
virtual void setMetaKey(KeyRef metaKey) = 0;
|
||||
// Get the latest committed user commit record
|
||||
virtual Value getCommitRecord() const = 0;
|
||||
|
||||
virtual StorageBytes getStorageBytes() const = 0;
|
||||
|
||||
|
@ -318,4 +739,52 @@ protected:
|
|||
~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface
|
||||
};
|
||||
|
||||
// The null key provider is useful to simplify page decoding.
|
||||
// It throws an error for any key info requested.
|
||||
class NullKeyProvider : public IEncryptionKeyProvider {
|
||||
public:
|
||||
virtual ~NullKeyProvider() {}
|
||||
Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override { throw encryption_key_not_found(); }
|
||||
Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
|
||||
throw encryption_key_not_found();
|
||||
}
|
||||
};
|
||||
|
||||
// Key provider for dummy XOR encryption scheme
|
||||
class XOREncryptionKeyProvider : public IEncryptionKeyProvider {
|
||||
public:
|
||||
XOREncryptionKeyProvider(std::string filename) {
|
||||
ASSERT(g_network->isSimulated());
|
||||
|
||||
// Deterministically choose a byte from the filename (without its path) to use for secret generation
|
||||
// Remove any leading directory names
|
||||
size_t lastSlash = filename.find_last_of("\\/");
|
||||
if (lastSlash != filename.npos) {
|
||||
filename.erase(0, lastSlash);
|
||||
}
|
||||
xorWith = filename.empty() ? 0x5e
|
||||
: (uint8_t)filename[XXH3_64bits(filename.data(), filename.size()) % filename.size()];
|
||||
}
|
||||
|
||||
virtual ~XOREncryptionKeyProvider() {}
|
||||
|
||||
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
|
||||
if (!key.id.present()) {
|
||||
throw encryption_key_not_found();
|
||||
}
|
||||
EncryptionKey s = key;
|
||||
uint8_t secret = ~(uint8_t)key.id.get() ^ xorWith;
|
||||
s.secret = StringRef(s.arena(), &secret, 1);
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
|
||||
EncryptionKeyRef k;
|
||||
k.id = end.empty() ? 0 : *(end.end() - 1);
|
||||
return getSecrets(k);
|
||||
}
|
||||
|
||||
uint8_t xorWith;
|
||||
};
|
||||
|
||||
#endif
@ -294,6 +294,9 @@ rocksdb::ColumnFamilyOptions getCFOptions() {
|
|||
}
|
||||
bbOpts.block_cache = rocksdb_block_cache;
|
||||
}
|
||||
if (SERVER_KNOBS->ROCKSDB_BLOCK_SIZE > 0) {
|
||||
bbOpts.block_size = SERVER_KNOBS->ROCKSDB_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbOpts));
|
||||
|
||||
|
@ -790,6 +793,7 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
|
|||
{ "EstPendCompactBytes", rocksdb::DB::Properties::kEstimatePendingCompactionBytes },
|
||||
{ "BlockCacheUsage", rocksdb::DB::Properties::kBlockCacheUsage },
|
||||
{ "BlockCachePinnedUsage", rocksdb::DB::Properties::kBlockCachePinnedUsage },
|
||||
{ "LiveSstFilesSize", rocksdb::DB::Properties::kLiveSstFilesSize },
|
||||
};
|
||||
|
||||
state std::unordered_map<std::string, uint64_t> readIteratorPoolStats = {
|
||||
|
@ -811,7 +815,8 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
|
|||
for (auto& p : propertyStats) {
|
||||
auto& [name, property] = p;
|
||||
stat = 0;
|
||||
ASSERT(db->GetIntProperty(property, &stat));
|
||||
// GetAggregatedIntProperty gets the aggregated int property from all column families.
|
||||
ASSERT(db->GetAggregatedIntProperty(property, &stat));
|
||||
e.detail(name, stat);
|
||||
}
|
||||
|
||||
|
@ -1933,7 +1938,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
|
|||
|
||||
StorageBytes getStorageBytes() const override {
|
||||
uint64_t live = 0;
|
||||
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
|
||||
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
|
||||
|
||||
int64_t free;
|
||||
int64_t total;
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "fdbserver/LogSystem.h"
|
||||
#include "flow/serialize.h"
|
||||
|
||||
std::string LogSet::logRouterString() {
|
||||
std::string result;
|
||||
|
@ -293,6 +294,7 @@ void LogPushData::writeMessage(StringRef rawMessageWithoutLength, bool usePrevio
|
|||
}
|
||||
msg_locations.clear();
|
||||
logSystem->getPushLocations(prev_tags, msg_locations);
|
||||
written_tags.insert(next_message_tags.begin(), next_message_tags.end());
|
||||
next_message_tags.clear();
|
||||
}
|
||||
uint32_t subseq = this->subsequence++;
|
||||
|
@ -307,6 +309,15 @@ void LogPushData::writeMessage(StringRef rawMessageWithoutLength, bool usePrevio
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<Standalone<StringRef>> LogPushData::getAllMessages() {
|
||||
std::vector<Standalone<StringRef>> results;
|
||||
results.reserve(messagesWriter.size());
|
||||
for (int loc = 0; loc < messagesWriter.size(); loc++) {
|
||||
results.push_back(getMessages(loc));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
void LogPushData::recordEmptyMessage(int loc, const Standalone<StringRef>& value) {
|
||||
if (!messagesWritten[loc]) {
|
||||
BinaryWriter w(AssumeVersion(g_network->protocolVersion()));
|
||||
|
@ -344,3 +355,17 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) {
|
|||
*(uint32_t*)((uint8_t*)wr.getData() + offset) = length - sizeof(uint32_t);
|
||||
return true;
|
||||
}
|
||||
|
||||
void LogPushData::setMutations(uint32_t totalMutations, VectorRef<StringRef> mutations) {
|
||||
ASSERT_EQ(subsequence, 1);
|
||||
subsequence = totalMutations + 1; // set to next mutation number
|
||||
|
||||
ASSERT_EQ(messagesWriter.size(), mutations.size());
|
||||
BinaryWriter w(AssumeVersion(g_network->protocolVersion()));
|
||||
Standalone<StringRef> v = w.toValue();
|
||||
const int header = v.size();
|
||||
for (int i = 0; i < mutations.size(); i++) {
|
||||
BinaryWriter& wr = messagesWriter[i];
|
||||
wr.serializeBytes(mutations[i].substr(header));
|
||||
}
|
||||
}
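getAllMessages() and setMutations() let one LogPushData hand its per-location message buffers to another: the receiver strips from each incoming buffer the prefix that a freshly constructed writer produces, appends only the remaining mutation bytes, and resumes numbering at totalMutations + 1. A standalone analogue of that prefix-skipping handoff using plain strings (the buffer layout here is invented purely for illustration):

#include <cassert>
#include <string>
#include <vector>

int main() {
    const std::string prefix = "HDR:"; // stand-in for whatever an empty writer serializes first
    // Buffers produced by a "source" writer, one per tlog location, each starting with the prefix.
    std::vector<std::string> sourceMessages = { prefix + "mutA", prefix + "mutB" };

    // A "destination" writer set that already wrote its own prefixes.
    std::vector<std::string> destWriters = { prefix, prefix };

    // Analogue of setMutations(): append everything after the prefix of each source buffer.
    assert(sourceMessages.size() == destWriters.size());
    for (size_t i = 0; i < sourceMessages.size(); ++i) {
        destWriters[i] += sourceMessages[i].substr(prefix.size());
    }

    assert(destWriters[0] == "HDR:mutA");
    assert(destWriters[1] == "HDR:mutB");
    return 0;
}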
@ -22,6 +22,7 @@
|
|||
#define FDBSERVER_LOGSYSTEM_H
|
||||
|
||||
#include <set>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
#include "fdbserver/SpanContextMessage.h"
|
||||
|
@ -519,7 +520,9 @@ struct ILogSystem {
|
|||
Version minKnownCommittedVersion,
|
||||
LogPushData& data,
|
||||
SpanID const& spanContext,
|
||||
Optional<UID> debugID = Optional<UID>()) = 0;
|
||||
Optional<UID> debugID = Optional<UID>(),
|
||||
Optional<std::unordered_map<uint16_t, Version>> tpcvMap =
|
||||
Optional<std::unordered_map<uint16_t, Version>>()) = 0;
|
||||
// Waits for the version number of the bundle (in this epoch) to be prevVersion (i.e. for all pushes ordered
|
||||
// earlier) Puts the given messages into the bundle, each with the given tags, and with message versions (version,
|
||||
// 0) - (version, N) Changes the version number of the bundle to be version (unblocking the next push) Returns when
|
||||
|
@ -761,6 +764,36 @@ struct LogPushData : NonCopyable {
|
|||
	// Add transaction info to be written before the first mutation in the transaction.
	void addTransactionInfo(SpanID const& context);

	// copy written_tags, after filtering, into given set
	void saveTags(std::set<Tag>& filteredTags) const {
		for (const auto& tag : written_tags) {
			filteredTags.insert(tag);
		}
	}

	void addWrittenTags(const std::set<Tag>& tags) { written_tags.insert(tags.begin(), tags.end()); }

	void getLocations(const std::set<Tag>& tags, std::set<uint16_t>& writtenTLogs) {
		std::vector<Tag> vtags(tags.begin(), tags.end());
		std::vector<int> msg_locations;
		logSystem->getPushLocations(vtags, msg_locations, false /*allLocations*/);
		writtenTLogs.insert(msg_locations.begin(), msg_locations.end());
	}

	void getLocations(const std::vector<Tag>& vtags, std::set<uint16_t>& writtenTLogs) {
		std::vector<int> msg_locations;
		logSystem->getPushLocations(vtags, msg_locations, false /*allLocations*/);
		writtenTLogs.insert(msg_locations.begin(), msg_locations.end());
	}

	// store tlogs as represented by index
	void saveLocations(std::set<uint16_t>& writtenTLogs) {
		writtenTLogs.insert(msg_locations.begin(), msg_locations.end());
	}

	void setShardChanged() { shardChanged = true; }
	bool isShardChanged() const { return shardChanged; }

	void writeMessage(StringRef rawMessageWithoutLength, bool usePreviousLocations);

	template <class T>
|
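The getLocations() helpers above only translate a set of tags into the de-duplicated set of tlog indices a push would touch. A rough standalone sketch of that shape, where Tag and the routing function are simplified stand-ins rather than FDB's real types:

#include <cstdint>
#include <set>
#include <tuple>
#include <vector>

// Simplified stand-in for FDB's Tag; only what this example needs.
struct Tag {
	int16_t locality;
	uint16_t id;
	bool operator<(const Tag& o) const { return std::tie(locality, id) < std::tie(o.locality, o.id); }
};

// Placeholder routing: the real code asks the log system's replication policy for push locations.
std::vector<int> getPushLocations(const std::vector<Tag>& tags, int numLogs) {
	std::vector<int> locations;
	locations.reserve(tags.size());
	for (const auto& t : tags)
		locations.push_back(t.id % numLogs);
	return locations;
}

// Mirrors the shape of LogPushData::getLocations(): tags in, de-duplicated tlog indices out.
void locationsForTags(const std::set<Tag>& tags, int numLogs, std::set<uint16_t>& writtenTLogs) {
	std::vector<Tag> vtags(tags.begin(), tags.end());
	std::vector<int> msgLocations = getPushLocations(vtags, numLogs);
	writtenTLogs.insert(msgLocations.begin(), msgLocations.end());
}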
@ -768,6 +801,9 @@ struct LogPushData : NonCopyable {
|
|||
|
||||
Standalone<StringRef> getMessages(int loc) { return messagesWriter[loc].toValue(); }
|
||||
|
||||
// Returns all locations' messages, including empty ones.
|
||||
std::vector<Standalone<StringRef>> getAllMessages();
|
||||
|
||||
// Records if a tlog (specified by "loc") will receive an empty version batch message.
|
||||
// "value" is the message returned by getMessages() call.
|
||||
void recordEmptyMessage(int loc, const Standalone<StringRef>& value);
|
||||
|
@ -776,10 +812,18 @@ struct LogPushData : NonCopyable {
|
|||
// MUST be called after getMessages() and recordEmptyMessage().
|
||||
float getEmptyMessageRatio() const;
|
||||
|
||||
// Returns the total number of mutations.
|
||||
uint32_t getMutationCount() const { return subsequence; }
|
||||
|
||||
// Sets mutations for all internal writers. "mutations" is the output from
|
||||
// getAllMessages() and is used before writing any other mutations.
|
||||
void setMutations(uint32_t totalMutations, VectorRef<StringRef> mutations);
|
||||
|
||||
private:
|
||||
Reference<ILogSystem> logSystem;
|
||||
std::vector<Tag> next_message_tags;
|
||||
std::vector<Tag> prev_tags;
|
||||
std::set<Tag> written_tags;
|
||||
std::vector<BinaryWriter> messagesWriter;
|
||||
std::vector<bool> messagesWritten; // if messagesWriter has written anything
|
||||
std::vector<int> msg_locations;
|
||||
|
@ -789,6 +833,7 @@ private:
|
|||
std::unordered_set<int> writtenLocations;
|
||||
uint32_t subsequence;
|
||||
SpanID spanContext;
|
||||
bool shardChanged = false; // if keyServers has any changes, i.e., shard boundary modifications.
|
||||
|
||||
// Writes transaction info to the message stream at the given location if
|
||||
// it has not already been written (for the current transaction). Returns
|
||||
|
@ -860,6 +905,7 @@ void LogPushData::writeTypedMessage(T const& item, bool metadataMessage, bool al
|
|||
wr.serializeBytes((uint8_t*)from.getData() + firstOffset, firstLength);
|
||||
}
|
||||
}
|
||||
written_tags.insert(next_message_tags.begin(), next_message_tags.end());
|
||||
next_message_tags.clear();
|
||||
}
|
||||
|
||||
|
|
|
@ -142,6 +142,16 @@ std::vector<TLogInterface> LogSystemConfig::allLocalLogs(bool includeSatellite)
|
|||
return results;
|
||||
}
|
||||
|
||||
int LogSystemConfig::numLogs() const {
	int numLogs = 0;
	for (auto& tLogSet : tLogs) {
		if (tLogSet.isLocal == true) {
			numLogs += tLogSet.tLogs.size();
		}
	}
	return numLogs;
}
|
||||
|
||||
std::vector<TLogInterface> LogSystemConfig::allPresentLogs() const {
|
||||
std::vector<TLogInterface> results;
|
||||
for (int i = 0; i < tLogs.size(); i++) {
|
||||
|
|
|
@ -192,6 +192,8 @@ struct LogSystemConfig {
|
|||
|
||||
std::vector<TLogInterface> allLocalLogs(bool includeSatellite = true) const;
|
||||
|
||||
int numLogs() const;
|
||||
|
||||
std::vector<TLogInterface> allPresentLogs() const;
|
||||
|
||||
std::pair<int8_t, int8_t> getLocalityForDcId(Optional<Key> dcId) const;
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
#include "fdbclient/CommitProxyInterface.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/DatabaseConfiguration.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
#include "fdbserver/TLogInterface.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/Notified.h"
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
|
@ -151,25 +153,36 @@ struct GetCommitVersionRequest {
|
|||
}
|
||||
};
|
||||
|
||||
struct GetTLogPrevCommitVersionReply {
|
||||
constexpr static FileIdentifier file_identifier = 16683183;
|
||||
GetTLogPrevCommitVersionReply() {}
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar);
|
||||
}
|
||||
};
|
||||
|
||||
struct UpdateRecoveryDataRequest {
|
||||
constexpr static FileIdentifier file_identifier = 13605417;
|
||||
Version recoveryTransactionVersion;
|
||||
Version lastEpochEnd;
|
||||
std::vector<CommitProxyInterface> commitProxies;
|
||||
std::vector<ResolverInterface> resolvers;
|
||||
Optional<int64_t> versionEpoch;
|
||||
ReplyPromise<Void> reply;
|
||||
|
||||
UpdateRecoveryDataRequest() = default;
|
||||
UpdateRecoveryDataRequest(Version recoveryTransactionVersion,
|
||||
Version lastEpochEnd,
|
||||
const std::vector<CommitProxyInterface>& commitProxies,
|
||||
const std::vector<ResolverInterface>& resolvers)
|
||||
const std::vector<ResolverInterface>& resolvers,
|
||||
Optional<int64_t> versionEpoch)
|
||||
: recoveryTransactionVersion(recoveryTransactionVersion), lastEpochEnd(lastEpochEnd),
|
||||
commitProxies(commitProxies), resolvers(resolvers) {}
|
||||
commitProxies(commitProxies), resolvers(resolvers), versionEpoch(versionEpoch) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, resolvers, reply);
|
||||
serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, resolvers, versionEpoch, reply);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -179,20 +192,23 @@ struct ReportRawCommittedVersionRequest {
|
|||
bool locked;
|
||||
Optional<Value> metadataVersion;
|
||||
Version minKnownCommittedVersion;
|
||||
|
||||
Optional<Version> prevVersion; // if present, wait for prevVersion to be committed before replying
|
||||
Optional<std::set<Tag>> writtenTags;
|
||||
ReplyPromise<Void> reply;
|
||||
|
||||
ReportRawCommittedVersionRequest() : version(invalidVersion), locked(false), minKnownCommittedVersion(0) {}
|
||||
ReportRawCommittedVersionRequest(Version version,
|
||||
bool locked,
|
||||
Optional<Value> metadataVersion,
|
||||
Version minKnownCommittedVersion)
|
||||
Version minKnownCommittedVersion,
|
||||
Optional<Version> prevVersion,
|
||||
Optional<std::set<Tag>> writtenTags = Optional<std::set<Tag>>())
|
||||
: version(version), locked(locked), metadataVersion(metadataVersion),
|
||||
minKnownCommittedVersion(minKnownCommittedVersion) {}
|
||||
minKnownCommittedVersion(minKnownCommittedVersion), prevVersion(prevVersion), writtenTags(writtenTags) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, version, locked, metadataVersion, minKnownCommittedVersion, reply);
|
||||
serializer(ar, version, locked, metadataVersion, minKnownCommittedVersion, prevVersion, writtenTags, reply);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -180,6 +180,9 @@ struct ProxyCommitData {
|
|||
uint64_t commitVersionRequestNumber;
|
||||
uint64_t mostRecentProcessedRequestNumber;
|
||||
KeyRangeMap<Deque<std::pair<Version, int>>> keyResolvers;
|
||||
// When all resolvers process system keys (for private mutations), the "keyResolvers"
|
||||
// only tracks normalKeys. This is used for tracking versions for systemKeys.
|
||||
Deque<Version> systemKeyVersions;
|
||||
KeyRangeMap<ServerCacheInfo> keyInfo; // keyrange -> all storage servers in all DCs for the keyrange
|
||||
KeyRangeMap<bool> cacheInfo;
|
||||
std::map<Key, ApplyMutationsData> uid_applyMutationsData;
|
||||
|
|
|
@ -277,9 +277,8 @@ ACTOR Future<std::vector<StorageServerInterface>> getStorageServers(Database cx,
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
|
||||
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
|
||||
bool localOnly) {
|
||||
ACTOR Future<std::pair<std::vector<WorkerInterface>, int>>
|
||||
getStorageWorkers(Database cx, Reference<AsyncVar<ServerDBInfo> const> dbInfo, bool localOnly) {
|
||||
state std::vector<StorageServerInterface> servers = wait(getStorageServers(cx));
|
||||
state std::map<NetworkAddress, WorkerInterface> workersMap;
|
||||
std::vector<WorkerDetails> workers = wait(getWorkers(dbInfo));
|
||||
|
@ -299,7 +298,9 @@ ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
|
|||
}
|
||||
auto masterDcId = dbInfo->get().master.locality.dcId();
|
||||
|
||||
std::vector<WorkerInterface> result;
|
||||
std::pair<std::vector<WorkerInterface>, int> result;
|
||||
auto& [workerInterfaces, failures] = result;
|
||||
failures = 0;
|
||||
for (const auto& server : servers) {
|
||||
TraceEvent(SevDebug, "DcIdInfo")
|
||||
.detail("ServerLocalityID", server.locality.dcId())
|
||||
|
@ -310,9 +311,10 @@ ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
|
|||
TraceEvent(SevWarn, "GetStorageWorkers")
|
||||
.detail("Reason", "Could not find worker for storage server")
|
||||
.detail("SS", server.id());
|
||||
throw operation_failed();
|
||||
++failures;
|
||||
} else {
|
||||
workerInterfaces.push_back(itr->second);
|
||||
}
|
||||
result.push_back(itr->second);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
@ -598,6 +600,31 @@ ACTOR Future<bool> getStorageServersRecruiting(Database cx, WorkerInterface dist
|
|||
}
|
||||
}
|
||||
|
||||
// Gets the difference between the expected version (based on the version
// epoch) and the actual version.
ACTOR Future<int64_t> getVersionOffset(Database cx,
                                       WorkerInterface distributorWorker,
                                       Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
	loop {
		state Transaction tr(cx);
		try {
			TraceEvent("GetVersionOffset").detail("Stage", "ReadingVersionEpoch");

			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			state Version rv = wait(tr.getReadVersion());
			Optional<Standalone<StringRef>> versionEpochValue = wait(tr.get(versionEpochKey));
			if (!versionEpochValue.present()) {
				return 0;
			}
			int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
			int64_t versionOffset = abs(rv - (g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - versionEpoch));
			return versionOffset;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}
|
||||
|
||||
ACTOR Future<Void> repairDeadDatacenter(Database cx,
|
||||
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
|
||||
std::string context) {
|
||||
|
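The arithmetic in getVersionOffset() reduces to comparing the read version against the version the cluster would have if versions advanced at VERSIONS_PER_SECOND starting from the configured version epoch. A minimal sketch of just that calculation, with an illustrative default for the knob:

#include <cstdint>
#include <cstdlib>

// expected version = now * VERSIONS_PER_SECOND - versionEpoch; the offset is its distance
// from the read version actually observed. 1e6 versions/second matches the usual knob default.
int64_t versionOffset(int64_t readVersion, double nowSeconds, int64_t versionEpoch,
                      int64_t versionsPerSecond = 1000000) {
	int64_t expected = static_cast<int64_t>(nowSeconds * versionsPerSecond) - versionEpoch;
	return std::llabs(readVersion - expected);
}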
@ -652,7 +679,8 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
|
|||
int64_t maxTLogQueueGate = 5e6,
|
||||
int64_t maxStorageServerQueueGate = 5e6,
|
||||
int64_t maxDataDistributionQueueSize = 0,
|
||||
int64_t maxPoppedVersionLag = 30e6) {
|
||||
int64_t maxPoppedVersionLag = 30e6,
|
||||
int64_t maxVersionOffset = 1e6) {
|
||||
state Future<Void> reconfig =
|
||||
reconfigureAfter(cx, 100 + (deterministicRandom()->random01() * 100), dbInfo, "QuietDatabase");
|
||||
state Future<int64_t> dataInFlight;
|
||||
|
@ -662,6 +690,7 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
|
|||
state Future<int64_t> storageQueueSize;
|
||||
state Future<bool> dataDistributionActive;
|
||||
state Future<bool> storageServersRecruiting;
|
||||
state Future<int64_t> versionOffset;
|
||||
auto traceMessage = "QuietDatabase" + phase + "Begin";
|
||||
TraceEvent(traceMessage.c_str()).log();
|
||||
|
||||
|
@ -698,10 +727,11 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
|
|||
storageQueueSize = getMaxStorageServerQueueSize(cx, dbInfo);
|
||||
dataDistributionActive = getDataDistributionActive(cx, distributorWorker);
|
||||
storageServersRecruiting = getStorageServersRecruiting(cx, distributorWorker, distributorUID);
|
||||
versionOffset = getVersionOffset(cx, distributorWorker, dbInfo);
|
||||
|
||||
wait(success(dataInFlight) && success(tLogQueueInfo) && success(dataDistributionQueueSize) &&
|
||||
success(teamCollectionValid) && success(storageQueueSize) && success(dataDistributionActive) &&
|
||||
success(storageServersRecruiting));
|
||||
success(storageServersRecruiting) && success(versionOffset));
|
||||
|
||||
TraceEvent(("QuietDatabase" + phase).c_str())
|
||||
.detail("DataInFlight", dataInFlight.get())
|
||||
|
@ -717,13 +747,17 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
|
|||
.detail("MaxStorageServerQueueGate", maxStorageServerQueueGate)
|
||||
.detail("DataDistributionActive", dataDistributionActive.get())
|
||||
.detail("StorageServersRecruiting", storageServersRecruiting.get())
|
||||
.detail("RecoveryCount", dbInfo->get().recoveryCount)
|
||||
.detail("VersionOffset", versionOffset.get())
|
||||
.detail("NumSuccesses", numSuccesses);
|
||||
|
||||
maxVersionOffset += dbInfo->get().recoveryCount * SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
|
||||
if (dataInFlight.get() > dataInFlightGate || tLogQueueInfo.get().first > maxTLogQueueGate ||
|
||||
tLogQueueInfo.get().second > maxPoppedVersionLag ||
|
||||
dataDistributionQueueSize.get() > maxDataDistributionQueueSize ||
|
||||
storageQueueSize.get() > maxStorageServerQueueGate || !dataDistributionActive.get() ||
|
||||
storageServersRecruiting.get() || !teamCollectionValid.get()) {
|
||||
storageServersRecruiting.get() || versionOffset.get() > maxVersionOffset ||
|
||||
!teamCollectionValid.get()) {
|
||||
|
||||
wait(delay(1.0));
|
||||
numSuccesses = 0;
|
||||
|
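The new gate above simply lets the measured offset grow with the number of recoveries, since each recovery may legitimately advance versions by up to MAX_VERSIONS_IN_FLIGHT. Condensed into a single predicate (knob names kept, values illustrative):

#include <cstdint>

// Returns true when the version offset is within budget and the database can be considered quiet
// on this dimension. maxVersionOffset defaults to 1e6 in quietDatabase(); the recovery allowance
// mirrors maxVersionOffset += recoveryCount * SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT above.
bool versionOffsetQuiet(int64_t versionOffset,
                        int64_t maxVersionOffset,
                        int64_t recoveryCount,
                        int64_t maxVersionsInFlight) {
	return versionOffset <= maxVersionOffset + recoveryCount * maxVersionsInFlight;
}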
@ -779,6 +813,10 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
|
|||
auto key = "NotReady" + std::to_string(notReadyCount++);
|
||||
evt.detail(key.c_str(), "storageServersRecruiting");
|
||||
}
|
||||
if (versionOffset.isReady() && versionOffset.isError()) {
|
||||
auto key = "NotReady" + std::to_string(notReadyCount++);
|
||||
evt.detail(key.c_str(), "versionOffset");
|
||||
}
|
||||
wait(delay(1.0));
|
||||
numSuccesses = 0;
|
||||
}
|
||||
|
@ -794,7 +832,8 @@ Future<Void> quietDatabase(Database const& cx,
|
|||
int64_t maxTLogQueueGate,
|
||||
int64_t maxStorageServerQueueGate,
|
||||
int64_t maxDataDistributionQueueSize,
|
||||
int64_t maxPoppedVersionLag) {
|
||||
int64_t maxPoppedVersionLag,
|
||||
int64_t maxVersionOffset) {
|
||||
return waitForQuietDatabase(cx,
|
||||
dbInfo,
|
||||
phase,
|
||||
|
@ -802,5 +841,6 @@ Future<Void> quietDatabase(Database const& cx,
|
|||
maxTLogQueueGate,
|
||||
maxStorageServerQueueGate,
|
||||
maxDataDistributionQueueSize,
|
||||
maxPoppedVersionLag);
|
||||
maxPoppedVersionLag,
|
||||
maxVersionOffset);
|
||||
}
|
||||
|
|
|
@ -46,9 +46,11 @@ Future<WorkerInterface> getMasterWorker(Database const& cx, Reference<AsyncVar<S
|
|||
Future<Void> repairDeadDatacenter(Database const& cx,
|
||||
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
|
||||
std::string const& context);
|
||||
Future<std::vector<WorkerInterface>> getStorageWorkers(Database const& cx,
|
||||
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
|
||||
bool const& localOnly);
|
||||
|
||||
// Returns list of worker interfaces for available storage servers and the number of unavailable
|
||||
// storage servers
|
||||
Future<std::pair<std::vector<WorkerInterface>, int>>
|
||||
getStorageWorkers(Database const& cx, Reference<AsyncVar<ServerDBInfo> const> const& dbInfo, bool const& localOnly);
|
||||
Future<std::vector<WorkerInterface>> getCoordWorkers(Database const& cx,
|
||||
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo);
|
||||
|
||||
|
|
|
@ -115,8 +115,9 @@ static std::pair<KeyRangeRef, bool> findRange(CoalescedKeyRangeMap<int>& key_res
|
|||
ACTOR Future<Void> ResolutionBalancer::resolutionBalancing_impl(ResolutionBalancer* self) {
|
||||
wait(self->triggerResolution.onTrigger());
|
||||
|
||||
state CoalescedKeyRangeMap<int> key_resolver;
|
||||
key_resolver.insert(allKeys, 0);
|
||||
state CoalescedKeyRangeMap<int> key_resolver(
|
||||
0, SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? normalKeys.end : allKeys.end);
|
||||
key_resolver.insert(SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? normalKeys : allKeys, 0);
|
||||
loop {
|
||||
wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics));
|
||||
while (self->resolverChanges.get().size())
|
||||
|
|
|
@ -20,16 +20,23 @@
|
|||
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbclient/Notified.h"
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbserver/ApplyMetadataMutation.h"
|
||||
#include "fdbserver/ConflictSet.h"
|
||||
#include "fdbserver/IKeyValueStore.h"
|
||||
#include "fdbserver/Knobs.h"
|
||||
#include "fdbserver/LogSystem.h"
|
||||
#include "fdbserver/LogSystemDiskQueueAdapter.h"
|
||||
#include "fdbserver/MasterInterface.h"
|
||||
#include "fdbserver/ResolverInterface.h"
|
||||
#include "fdbserver/RestoreUtil.h"
|
||||
#include "fdbserver/ServerDBInfo.h"
|
||||
#include "fdbserver/StorageMetrics.h"
|
||||
#include "fdbserver/WaitFailure.h"
|
||||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Error.h"
|
||||
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
|
@ -43,21 +50,102 @@ struct ProxyRequestsInfo {
|
|||
} // namespace
|
||||
|
||||
namespace {
|
||||
|
||||
class RecentStateTransactionsInfo {
|
||||
public:
|
||||
RecentStateTransactionsInfo() = default;
|
||||
|
||||
// Erases state transactions up to the given version (inclusive) and returns
|
||||
// the number of bytes for the erased mutations.
|
||||
int64_t eraseUpTo(Version oldestVersion) {
|
||||
recentStateTransactions.erase(recentStateTransactions.begin(),
|
||||
recentStateTransactions.upper_bound(oldestVersion));
|
||||
|
||||
int64_t stateBytes = 0;
|
||||
while (recentStateTransactionSizes.size() && recentStateTransactionSizes.front().first <= oldestVersion) {
|
||||
stateBytes += recentStateTransactionSizes.front().second;
|
||||
recentStateTransactionSizes.pop_front();
|
||||
}
|
||||
return stateBytes;
|
||||
}
|
||||
|
||||
// Adds state transactions between two versions to the reply message.
|
||||
// "initialShardChanged" indicates if commitVersion has shard changes.
|
||||
// Returns if shardChanged has ever happened for these versions.
|
||||
[[nodiscard]] bool applyStateTxnsToBatchReply(ResolveTransactionBatchReply* reply,
|
||||
Version firstUnseenVersion,
|
||||
Version commitVersion,
|
||||
bool initialShardChanged) {
|
||||
bool shardChanged = initialShardChanged;
|
||||
auto stateTransactionItr = recentStateTransactions.lower_bound(firstUnseenVersion);
|
||||
auto endItr = recentStateTransactions.lower_bound(commitVersion);
|
||||
        // The resolver only sends back prior state txns, because the proxy that sent this request
        // already has them and will apply them via applyMetadataToCommittedTransactions();
        // other proxies will get this version's state txns as a prior version.
|
||||
for (; stateTransactionItr != endItr; ++stateTransactionItr) {
|
||||
shardChanged = shardChanged || stateTransactionItr->value.first;
|
||||
reply->stateMutations.push_back(reply->arena, stateTransactionItr->value.second);
|
||||
reply->arena.dependsOn(stateTransactionItr->value.second.arena());
|
||||
}
|
||||
return shardChanged;
|
||||
}
|
||||
|
||||
bool empty() const { return recentStateTransactionSizes.empty(); }
|
||||
// Returns the number of versions with non-empty state transactions.
|
||||
uint32_t size() const { return recentStateTransactionSizes.size(); }
|
||||
|
||||
// Returns the first/smallest version of the state transactions.
|
||||
// This can only be called when empty() returns false or size() > 0.
|
||||
Version firstVersion() const { return recentStateTransactionSizes.front().first; }
|
||||
|
||||
// Records non-zero stateBytes for a version.
|
||||
void addVersionBytes(Version commitVersion, int64_t stateBytes) {
|
||||
if (stateBytes > 0)
|
||||
recentStateTransactionSizes.emplace_back(commitVersion, stateBytes);
|
||||
}
|
||||
|
||||
// Returns the reference to the pair of (shardChanged, stateMutations) for the given version
|
||||
std::pair<bool, Standalone<VectorRef<StateTransactionRef>>>& getStateTransactionsRef(Version commitVersion) {
|
||||
return recentStateTransactions[commitVersion];
|
||||
}
|
||||
|
||||
private:
|
||||
// Commit version to a pair of (shardChanged, stateMutations).
|
||||
Map<Version, std::pair<bool, Standalone<VectorRef<StateTransactionRef>>>> recentStateTransactions;
|
||||
|
||||
// Only keep versions with non-zero size state transactions.
|
||||
Deque<std::pair<Version, int64_t>> recentStateTransactionSizes;
|
||||
};
|
||||
|
||||
struct Resolver : ReferenceCounted<Resolver> {
|
||||
UID dbgid;
|
||||
int commitProxyCount, resolverCount;
|
||||
const UID dbgid;
|
||||
const int commitProxyCount, resolverCount;
|
||||
NotifiedVersion version;
|
||||
AsyncVar<Version> neededVersion;
|
||||
|
||||
Map<Version, Standalone<VectorRef<StateTransactionRef>>> recentStateTransactions;
|
||||
Deque<std::pair<Version, int64_t>> recentStateTransactionSizes;
|
||||
RecentStateTransactionsInfo recentStateTransactionsInfo;
|
||||
AsyncVar<int64_t> totalStateBytes;
|
||||
AsyncTrigger checkNeededVersion;
|
||||
std::map<NetworkAddress, ProxyRequestsInfo> proxyInfoMap;
|
||||
ConflictSet* conflictSet;
|
||||
TransientStorageMetricSample iopsSample;
|
||||
|
||||
Version debugMinRecentStateVersion;
|
||||
// Use LogSystem as backend for txnStateStore. However, the real commit
|
||||
// happens at commit proxies and we never "write" to the LogSystem at
|
||||
// Resolvers.
|
||||
LogSystemDiskQueueAdapter* logAdapter = nullptr;
|
||||
Reference<ILogSystem> logSystem;
|
||||
IKeyValueStore* txnStateStore = nullptr;
|
||||
|
||||
std::map<UID, Reference<StorageInfo>> storageCache;
|
||||
KeyRangeMap<ServerCacheInfo> keyInfo; // keyrange -> all storage servers in all DCs for the keyrange
|
||||
std::unordered_map<UID, StorageServerInterface> tssMapping;
|
||||
bool forceRecovery = false;
|
||||
|
||||
Version debugMinRecentStateVersion = 0;
|
||||
|
||||
// The previous commit versions per tlog
|
||||
std::vector<Version> tpcvVector;
|
||||
|
||||
CounterCollection cc;
|
||||
Counter resolveBatchIn;
|
||||
|
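RecentStateTransactionsInfo keeps two structures in step: a map from commit version to the (shardChanged, state mutations) pair, and a deque of (version, byte size) for the versions that actually carried state. A self-contained analogue using std containers, with payloads modelled as std::string instead of FDB's Arena-backed types:

#include <cstdint>
#include <deque>
#include <map>
#include <string>
#include <utility>

// Simplified model of RecentStateTransactionsInfo's bookkeeping; illustrative only.
class RecentStateTxns {
public:
	void add(int64_t version, std::string payload, bool shardChanged) {
		int64_t bytes = static_cast<int64_t>(payload.size());
		txns[version] = { shardChanged, std::move(payload) };
		if (bytes > 0)
			sizes.emplace_back(version, bytes); // only remember versions with non-empty state
	}

	// Erase everything at or below oldestVersion; return how many payload bytes were freed.
	int64_t eraseUpTo(int64_t oldestVersion) {
		txns.erase(txns.begin(), txns.upper_bound(oldestVersion));
		int64_t freed = 0;
		while (!sizes.empty() && sizes.front().first <= oldestVersion) {
			freed += sizes.front().second;
			sizes.pop_front();
		}
		return freed;
	}

	bool empty() const { return sizes.empty(); }
	int64_t firstVersion() const { return sizes.front().first; } // caller must check empty() first

private:
	std::map<int64_t, std::pair<bool, std::string>> txns; // version -> (shardChanged, state txn bytes)
	std::deque<std::pair<int64_t, int64_t>> sizes;        // versions with non-zero state bytes
};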
@ -75,15 +163,16 @@ struct Resolver : ReferenceCounted<Resolver> {
|
|||
Counter resolveBatchOut;
|
||||
Counter metricsRequests;
|
||||
Counter splitRequests;
|
||||
int numLogs;
|
||||
|
||||
Future<Void> logger;
|
||||
|
||||
Resolver(UID dbgid, int commitProxyCount, int resolverCount)
|
||||
: dbgid(dbgid), commitProxyCount(commitProxyCount), resolverCount(resolverCount), version(-1),
|
||||
conflictSet(newConflictSet()), iopsSample(SERVER_KNOBS->KEY_BYTES_PER_SAMPLE), debugMinRecentStateVersion(0),
|
||||
cc("Resolver", dbgid.toString()), resolveBatchIn("ResolveBatchIn", cc),
|
||||
resolveBatchStart("ResolveBatchStart", cc), resolvedTransactions("ResolvedTransactions", cc),
|
||||
resolvedBytes("ResolvedBytes", cc), resolvedReadConflictRanges("ResolvedReadConflictRanges", cc),
|
||||
conflictSet(newConflictSet()), iopsSample(SERVER_KNOBS->KEY_BYTES_PER_SAMPLE), cc("Resolver", dbgid.toString()),
|
||||
resolveBatchIn("ResolveBatchIn", cc), resolveBatchStart("ResolveBatchStart", cc),
|
||||
resolvedTransactions("ResolvedTransactions", cc), resolvedBytes("ResolvedBytes", cc),
|
||||
resolvedReadConflictRanges("ResolvedReadConflictRanges", cc),
|
||||
resolvedWriteConflictRanges("ResolvedWriteConflictRanges", cc),
|
||||
transactionsAccepted("TransactionsAccepted", cc), transactionsTooOld("TransactionsTooOld", cc),
|
||||
transactionsConflicted("TransactionsConflicted", cc),
|
||||
|
@ -118,18 +207,20 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.Before");
|
||||
}
|
||||
|
||||
/*TraceEvent("ResolveBatchStart", self->dbgid).detail("From", proxyAddress).detail("Version", req.version).detail("PrevVersion", req.prevVersion).detail("StateTransactions", req.txnStateTransactions.size())
|
||||
.detail("RecentStateTransactions", self->recentStateTransactionSizes.size()).detail("LastVersion",
|
||||
proxyInfo.lastVersion).detail("FirstVersion", self->recentStateTransactionSizes.empty() ? -1 :
|
||||
self->recentStateTransactionSizes.front().first) .detail("ResolverVersion", self->version.get());*/
|
||||
/* TraceEvent("ResolveBatchStart", self->dbgid).detail("From", proxyAddress).detail("Version",
|
||||
req.version).detail("PrevVersion", req.prevVersion).detail("StateTransactions", req.txnStateTransactions.size())
|
||||
.detail("RecentStateTransactions", self->recentStateTransactionsInfo.size()).detail("LastVersion",
|
||||
proxyInfo.lastVersion).detail("FirstVersion", self->recentStateTransactionsInfo.empty() ? -1 :
|
||||
self->recentStateTransactionsInfo.firstVersion()) .detail("ResolverVersion", self->version.get()); */
|
||||
|
||||
while (self->totalStateBytes.get() > SERVER_KNOBS->RESOLVER_STATE_MEMORY_LIMIT &&
|
||||
self->recentStateTransactionSizes.size() &&
|
||||
proxyInfo.lastVersion > self->recentStateTransactionSizes.front().first &&
|
||||
self->recentStateTransactionsInfo.size() &&
|
||||
proxyInfo.lastVersion > self->recentStateTransactionsInfo.firstVersion() &&
|
||||
req.version > self->neededVersion.get()) {
|
||||
/*TraceEvent("ResolveBatchDelay").detail("From", proxyAddress).detail("StateBytes", self->totalStateBytes.get()).detail("RecentStateTransactionSize", self->recentStateTransactionSizes.size())
|
||||
/* TraceEvent("ResolveBatchDelay").detail("From", proxyAddress).detail("StateBytes",
|
||||
self->totalStateBytes.get()).detail("RecentStateTransactionSize", self->recentStateTransactionsInfo.size())
|
||||
.detail("LastVersion", proxyInfo.lastVersion).detail("RequestVersion", req.version).detail("NeededVersion",
|
||||
self->neededVersion.get()) .detail("RecentStateVer", self->recentStateTransactions.begin()->key);*/
|
||||
self->neededVersion.get()) .detail("RecentStateVer", self->recentStateTransactionsInfo.firstVersion());*/
|
||||
|
||||
wait(self->totalStateBytes.onChange() || self->neededVersion.onChange());
|
||||
}
|
||||
|
@ -139,8 +230,8 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
}
|
||||
|
||||
loop {
|
||||
if (self->recentStateTransactionSizes.size() &&
|
||||
proxyInfo.lastVersion <= self->recentStateTransactionSizes.front().first) {
|
||||
if (self->recentStateTransactionsInfo.size() &&
|
||||
proxyInfo.lastVersion <= self->recentStateTransactionsInfo.firstVersion()) {
|
||||
self->neededVersion.set(std::max(self->neededVersion.get(), req.prevVersion));
|
||||
}
|
||||
|
||||
|
@ -173,6 +264,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.AfterOrderer");
|
||||
|
||||
ResolveTransactionBatchReply& reply = proxyInfo.outstandingBatches[req.version];
|
||||
reply.writtenTags = req.writtenTags;
|
||||
|
||||
std::vector<int> commitList;
|
||||
std::vector<int> tooOldList;
|
||||
|
@ -214,9 +306,25 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
ASSERT(req.prevVersion >= 0 ||
|
||||
req.txnStateTransactions.size() == 0); // The master's request should not have any state transactions
|
||||
|
||||
auto& stateTransactions = self->recentStateTransactions[req.version];
|
||||
auto& stateTransactionsPair = self->recentStateTransactionsInfo.getStateTransactionsRef(req.version);
|
||||
auto& stateTransactions = stateTransactionsPair.second;
|
||||
int64_t stateMutations = 0;
|
||||
int64_t stateBytes = 0;
|
||||
LogPushData toCommit(self->logSystem); // For accumulating private mutations
|
||||
ResolverData resolverData(self->dbgid,
|
||||
self->logSystem,
|
||||
self->txnStateStore,
|
||||
&self->keyInfo,
|
||||
&toCommit,
|
||||
self->forceRecovery,
|
||||
req.version + 1,
|
||||
&self->storageCache,
|
||||
&self->tssMapping);
|
||||
bool isLocked = false;
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
auto lockedKey = self->txnStateStore->readValue(databaseLockedKey).get();
|
||||
isLocked = lockedKey.present() && lockedKey.get().size();
|
||||
}
|
||||
for (int t : req.txnStateTransactions) {
|
||||
stateMutations += req.transactions[t].mutations.size();
|
||||
stateBytes += req.transactions[t].mutations.expectedSize();
|
||||
|
@ -224,28 +332,52 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
stateTransactions.arena(),
|
||||
StateTransactionRef(reply.committed[t] == ConflictBatch::TransactionCommitted,
|
||||
req.transactions[t].mutations));
|
||||
|
||||
// for (const auto& m : req.transactions[t].mutations)
|
||||
// DEBUG_MUTATION("Resolver", req.version, m, self->dbgid);
|
||||
|
||||
// Generate private mutations for metadata mutations
|
||||
// The condition here must match CommitBatch::applyMetadataToCommittedTransactions()
|
||||
if (reply.committed[t] == ConflictBatch::TransactionCommitted && !self->forceRecovery &&
|
||||
SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && (!isLocked || req.transactions[t].lock_aware)) {
|
||||
SpanID spanContext =
|
||||
req.transactions[t].spanContext.present() ? req.transactions[t].spanContext.get() : SpanID();
|
||||
|
||||
applyMetadataMutations(spanContext, resolverData, req.transactions[t].mutations);
|
||||
}
|
||||
TEST(self->forceRecovery); // Resolver detects forced recovery
|
||||
}
|
||||
|
||||
self->resolvedStateTransactions += req.txnStateTransactions.size();
|
||||
self->resolvedStateMutations += stateMutations;
|
||||
self->resolvedStateBytes += stateBytes;
|
||||
|
||||
if (stateBytes > 0)
|
||||
self->recentStateTransactionSizes.emplace_back(req.version, stateBytes);
|
||||
self->recentStateTransactionsInfo.addVersionBytes(req.version, stateBytes);
|
||||
|
||||
ASSERT(req.version >= firstUnseenVersion);
|
||||
ASSERT(firstUnseenVersion >= self->debugMinRecentStateVersion);
|
||||
|
||||
TEST(firstUnseenVersion == req.version); // Resolver first unseen version is current version
|
||||
|
||||
auto stateTransactionItr = self->recentStateTransactions.lower_bound(firstUnseenVersion);
|
||||
auto endItr = self->recentStateTransactions.lower_bound(req.version);
|
||||
for (; stateTransactionItr != endItr; ++stateTransactionItr) {
|
||||
reply.stateMutations.push_back(reply.arena, stateTransactionItr->value);
|
||||
reply.arena.dependsOn(stateTransactionItr->value.arena());
|
||||
        // If a shard changed at or before this commit version, the proxy may have computed
        // the wrong set of groups, so we need to broadcast to all groups below.
|
||||
stateTransactionsPair.first = toCommit.isShardChanged();
|
||||
bool shardChanged = self->recentStateTransactionsInfo.applyStateTxnsToBatchReply(
|
||||
&reply, firstUnseenVersion, req.version, toCommit.isShardChanged());
|
||||
|
||||
// Adds private mutation messages to the reply message.
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
auto privateMutations = toCommit.getAllMessages();
|
||||
for (const auto& mutations : privateMutations) {
|
||||
reply.privateMutations.push_back(reply.arena, mutations);
|
||||
reply.arena.dependsOn(mutations.arena());
|
||||
}
|
||||
// merge mutation tags with sent client tags
|
||||
toCommit.saveTags(reply.writtenTags);
|
||||
reply.privateMutationCount = toCommit.getMutationCount();
|
||||
}
|
||||
|
||||
//TraceEvent("ResolveBatch", self->dbgid).detail("PrevVersion", req.prevVersion).detail("Version", req.version).detail("StateTransactionVersions", self->recentStateTransactionSizes.size()).detail("StateBytes", stateBytes).detail("FirstVersion", self->recentStateTransactionSizes.empty() ? -1 : self->recentStateTransactionSizes.front().first).detail("StateMutationsIn", req.txnStateTransactions.size()).detail("StateMutationsOut", reply.stateMutations.size()).detail("From", proxyAddress);
|
||||
//TraceEvent("ResolveBatch", self->dbgid).detail("PrevVersion", req.prevVersion).detail("Version", req.version).detail("StateTransactionVersions", self->recentStateTransactionsInfo.size()).detail("StateBytes", stateBytes).detail("FirstVersion", self->recentStateTransactionsInfo.empty() ? -1 : self->recentStateTransactionsInfo.firstVersion()).detail("StateMutationsIn", req.txnStateTransactions.size()).detail("StateMutationsOut", reply.stateMutations.size()).detail("From", proxyAddress);
|
||||
|
||||
ASSERT(!proxyInfo.outstandingBatches.empty());
|
||||
ASSERT(self->proxyInfoMap.size() <= self->commitProxyCount + 1);
|
||||
|
@ -270,18 +402,33 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
bool anyPopped = false;
|
||||
if (firstUnseenVersion <= oldestProxyVersion && self->proxyInfoMap.size() == self->commitProxyCount + 1) {
|
||||
TEST(true); // Deleting old state transactions
|
||||
self->recentStateTransactions.erase(self->recentStateTransactions.begin(),
|
||||
self->recentStateTransactions.upper_bound(oldestProxyVersion));
|
||||
int64_t erasedBytes = self->recentStateTransactionsInfo.eraseUpTo(oldestProxyVersion);
|
||||
self->debugMinRecentStateVersion = oldestProxyVersion + 1;
|
||||
|
||||
while (self->recentStateTransactionSizes.size() &&
|
||||
self->recentStateTransactionSizes.front().first <= oldestProxyVersion) {
|
||||
anyPopped = true;
|
||||
stateBytes -= self->recentStateTransactionSizes.front().second;
|
||||
self->recentStateTransactionSizes.pop_front();
|
||||
}
|
||||
            anyPopped = erasedBytes > 0;
|
||||
stateBytes -= erasedBytes;
|
||||
}
|
||||
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
if (!self->numLogs) {
|
||||
reply.tpcvMap.clear();
|
||||
} else {
|
||||
std::set<uint16_t> writtenTLogs;
|
||||
if (shardChanged || reply.privateMutationCount) {
|
||||
for (int i = 0; i < self->numLogs; i++) {
|
||||
writtenTLogs.insert(i);
|
||||
}
|
||||
} else {
|
||||
toCommit.getLocations(reply.writtenTags, writtenTLogs);
|
||||
}
|
||||
if (self->tpcvVector[0] == invalidVersion) {
|
||||
std::fill(self->tpcvVector.begin(), self->tpcvVector.end(), req.prevVersion);
|
||||
}
|
||||
for (uint16_t tLog : writtenTLogs) {
|
||||
reply.tpcvMap[tLog] = self->tpcvVector[tLog];
|
||||
self->tpcvVector[tLog] = req.version;
|
||||
}
|
||||
}
|
||||
}
|
||||
self->version.set(req.version);
|
||||
bool breachedLimit = self->totalStateBytes.get() <= SERVER_KNOBS->RESOLVER_STATE_MEMORY_LIMIT &&
|
||||
self->totalStateBytes.get() + stateBytes > SERVER_KNOBS->RESOLVER_STATE_MEMORY_LIMIT;
|
||||
|
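Under TLog unicast, the resolver remembers, per tlog index, the last commit version it routed there; each written tlog is told the previous commit version it should have seen, and the tracker then advances to the new version. A trimmed-down model of that bookkeeping (the invalidVersion sentinel is kept, while the "broadcast to all tlogs on shard change or private mutations" case is simplified away):

#include <algorithm>
#include <cstdint>
#include <map>
#include <set>
#include <vector>

const int64_t kInvalidVersion = -1;

// Rough analogue of how reply.tpcvMap is filled from the resolver's tpcvVector.
std::map<uint16_t, int64_t> assignPrevCommitVersions(std::vector<int64_t>& tpcv, // per-tlog previous commit version
                                                     const std::set<uint16_t>& writtenTLogs,
                                                     int64_t prevVersion,
                                                     int64_t commitVersion) {
	if (!tpcv.empty() && tpcv[0] == kInvalidVersion)
		std::fill(tpcv.begin(), tpcv.end(), prevVersion); // first batch after recovery
	std::map<uint16_t, int64_t> tpcvMap;
	for (uint16_t log : writtenTLogs) {
		tpcvMap[log] = tpcv[log];  // tell this tlog what it last saw
		tpcv[log] = commitVersion; // and remember that it now advances to commitVersion
	}
	return tpcvMap;
}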
@ -319,7 +466,166 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> resolverCore(ResolverInterface resolver, InitializeResolverRequest initReq) {
|
||||
namespace {
|
||||
|
||||
// TODO: refactor with the one in CommitProxyServer.actor.cpp
|
||||
struct TransactionStateResolveContext {
|
||||
// Maximum sequence for txnStateRequest, this is defined when the request last flag is set.
|
||||
Sequence maxSequence = std::numeric_limits<Sequence>::max();
|
||||
|
||||
	// Marks received transaction state requests; the transaction state is only processed once *all* requests have
	// been received.
|
||||
std::unordered_set<Sequence> receivedSequences;
|
||||
|
||||
Reference<Resolver> pResolverData;
|
||||
|
||||
// Pointer to transaction state store, shortcut for commitData.txnStateStore
|
||||
IKeyValueStore* pTxnStateStore = nullptr;
|
||||
|
||||
// Actor streams
|
||||
PromiseStream<Future<Void>>* pActors = nullptr;
|
||||
|
||||
	// Flag reporting whether the transaction state request is complete. This request should only happen during recovery, i.e.
|
||||
// once per Resolver.
|
||||
bool processed = false;
|
||||
|
||||
TransactionStateResolveContext() = default;
|
||||
|
||||
TransactionStateResolveContext(Reference<Resolver> pResolverData_, PromiseStream<Future<Void>>* pActors_)
|
||||
: pResolverData(pResolverData_), pTxnStateStore(pResolverData_->txnStateStore), pActors(pActors_) {
|
||||
ASSERT(pTxnStateStore != nullptr || !SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS);
|
||||
}
|
||||
};
|
||||
|
||||
ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolveContext* pContext) {
|
||||
state KeyRange txnKeys = allKeys;
|
||||
state std::map<Tag, UID> tag_uid;
|
||||
|
||||
RangeResult UIDtoTagMap = pContext->pTxnStateStore->readRange(serverTagKeys).get();
|
||||
for (const KeyValueRef& kv : UIDtoTagMap) {
|
||||
tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key);
|
||||
}
|
||||
|
||||
loop {
|
||||
wait(yield());
|
||||
|
||||
RangeResult data =
|
||||
pContext->pTxnStateStore
|
||||
->readRange(txnKeys, SERVER_KNOBS->BUGGIFIED_ROW_LIMIT, SERVER_KNOBS->APPLY_MUTATION_BYTES)
|
||||
.get();
|
||||
if (!data.size())
|
||||
break;
|
||||
|
||||
((KeyRangeRef&)txnKeys) = KeyRangeRef(keyAfter(data.back().key, txnKeys.arena()), txnKeys.end);
|
||||
|
||||
MutationsVec mutations;
|
||||
std::vector<std::pair<MapPair<Key, ServerCacheInfo>, int>> keyInfoData;
|
||||
std::vector<UID> src, dest;
|
||||
ServerCacheInfo info;
|
||||
// NOTE: An ACTOR will be compiled into several classes, the this pointer is from one of them.
|
||||
auto updateTagInfo = [this](const std::vector<UID>& uids,
|
||||
std::vector<Tag>& tags,
|
||||
std::vector<Reference<StorageInfo>>& storageInfoItems) {
|
||||
for (const auto& id : uids) {
|
||||
auto storageInfo = getStorageInfo(id, &pContext->pResolverData->storageCache, pContext->pTxnStateStore);
|
||||
ASSERT(storageInfo->tag != invalidTag);
|
||||
tags.push_back(storageInfo->tag);
|
||||
storageInfoItems.push_back(storageInfo);
|
||||
}
|
||||
};
|
||||
for (auto& kv : data) {
|
||||
if (!kv.key.startsWith(keyServersPrefix)) {
|
||||
mutations.emplace_back(mutations.arena(), MutationRef::SetValue, kv.key, kv.value);
|
||||
continue;
|
||||
}
|
||||
|
||||
KeyRef k = kv.key.removePrefix(keyServersPrefix);
|
||||
if (k == allKeys.end) {
|
||||
continue;
|
||||
}
|
||||
decodeKeyServersValue(tag_uid, kv.value, src, dest);
|
||||
|
||||
info.tags.clear();
|
||||
|
||||
info.src_info.clear();
|
||||
updateTagInfo(src, info.tags, info.src_info);
|
||||
|
||||
info.dest_info.clear();
|
||||
updateTagInfo(dest, info.tags, info.dest_info);
|
||||
|
||||
uniquify(info.tags);
|
||||
keyInfoData.emplace_back(MapPair<Key, ServerCacheInfo>(k, info), 1);
|
||||
}
|
||||
|
||||
// insert keyTag data separately from metadata mutations so that we can do one bulk insert which
|
||||
// avoids a lot of map lookups.
|
||||
pContext->pResolverData->keyInfo.rawInsert(keyInfoData);
|
||||
|
||||
bool confChanges; // Ignore configuration changes for initial commits.
|
||||
ResolverData resolverData(
|
||||
pContext->pResolverData->dbgid, pContext->pTxnStateStore, &pContext->pResolverData->keyInfo, confChanges);
|
||||
|
||||
applyMetadataMutations(SpanID(), resolverData, mutations);
|
||||
} // loop
|
||||
|
||||
auto lockedKey = pContext->pTxnStateStore->readValue(databaseLockedKey).get();
|
||||
// pContext->pCommitData->locked = lockedKey.present() && lockedKey.get().size();
|
||||
// pContext->pCommitData->metadataVersion = pContext->pTxnStateStore->readValue(metadataVersionKey).get();
|
||||
|
||||
pContext->pTxnStateStore->enableSnapshot();
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> processTransactionStateRequestPart(TransactionStateResolveContext* pContext,
|
||||
TxnStateRequest request) {
|
||||
state const TxnStateRequest& req = request;
|
||||
state Resolver& resolverData = *pContext->pResolverData;
|
||||
state PromiseStream<Future<Void>>& addActor = *pContext->pActors;
|
||||
state Sequence& maxSequence = pContext->maxSequence;
|
||||
state ReplyPromise<Void> reply = req.reply;
|
||||
state std::unordered_set<Sequence>& txnSequences = pContext->receivedSequences;
|
||||
|
||||
ASSERT(pContext->pResolverData.getPtr() != nullptr);
|
||||
ASSERT(pContext->pActors != nullptr);
|
||||
|
||||
if (pContext->receivedSequences.count(request.sequence)) {
|
||||
// This part is already received. Still we will re-broadcast it to other CommitProxies & Resolvers
|
||||
pContext->pActors->send(broadcastTxnRequest(request, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, true));
|
||||
wait(yield());
|
||||
return Void();
|
||||
}
|
||||
|
||||
if (request.last) {
|
||||
		// This is the last piece of the sequence, yet other pieces might still be on the way.
|
||||
pContext->maxSequence = request.sequence + 1;
|
||||
}
|
||||
pContext->receivedSequences.insert(request.sequence);
|
||||
|
||||
// ASSERT(!pContext->pResolverData->validState.isSet());
|
||||
|
||||
for (auto& kv : request.data) {
|
||||
pContext->pTxnStateStore->set(kv, &request.arena);
|
||||
}
|
||||
pContext->pTxnStateStore->commit(true);
|
||||
|
||||
if (pContext->receivedSequences.size() == pContext->maxSequence) {
|
||||
// Received all components of the txnStateRequest
|
||||
ASSERT(!pContext->processed);
|
||||
wait(processCompleteTransactionStateRequest(pContext));
|
||||
pContext->processed = true;
|
||||
}
|
||||
|
||||
pContext->pActors->send(broadcastTxnRequest(request, SERVER_KNOBS->TXN_STATE_SEND_AMOUNT, true));
|
||||
wait(yield());
|
||||
return Void();
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
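The txn-state broadcast arrives as numbered parts in arbitrary order; the part flagged "last" fixes how many parts exist, and the complete request is processed only once every sequence number has been seen. A minimal model of that handshake, mirroring receivedSequences/maxSequence above:

#include <cstdint>
#include <limits>
#include <unordered_set>

struct TxnStateAssembler {
	uint64_t maxSequence = std::numeric_limits<uint64_t>::max();
	std::unordered_set<uint64_t> received;

	// Returns true once all parts have arrived and the request can be processed.
	bool onPart(uint64_t sequence, bool last) {
		if (last)
			maxSequence = sequence + 1; // sequences are 0-based, so the total count is last + 1
		received.insert(sequence);
		return received.size() == maxSequence;
	}
};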
||||
ACTOR Future<Void> resolverCore(ResolverInterface resolver,
|
||||
InitializeResolverRequest initReq,
|
||||
Reference<AsyncVar<ServerDBInfo> const> db) {
|
||||
state Reference<Resolver> self(new Resolver(resolver.id(), initReq.commitProxyCount, initReq.resolverCount));
|
||||
state ActorCollection actors(false);
|
||||
state Future<Void> doPollMetrics = self->resolverCount > 1 ? Void() : Future<Void>(Never());
|
||||
|
@ -327,13 +633,45 @@ ACTOR Future<Void> resolverCore(ResolverInterface resolver, InitializeResolverRe
|
|||
actors.add(traceRole(Role::RESOLVER, resolver.id()));
|
||||
|
||||
TraceEvent("ResolverInit", resolver.id()).detail("RecoveryCount", initReq.recoveryCount);
|
||||
|
||||
// Wait until we can load the "real" logsystem, since we don't support switching them currently
|
||||
while (!(initReq.masterLifetime.isEqual(db->get().masterLifetime) &&
|
||||
db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) {
|
||||
// TraceEvent("ResolverInit2", resolver.id()).detail("LSEpoch", db->get().logSystemConfig.epoch);
|
||||
wait(db->onChange());
|
||||
}
|
||||
|
||||
// Initialize txnStateStore
|
||||
self->logSystem = ILogSystem::fromServerDBInfo(resolver.id(), db->get(), false, addActor);
|
||||
state PromiseStream<Future<Void>> addActor;
|
||||
state Future<Void> onError =
|
||||
transformError(actorCollection(addActor.getFuture()), broken_promise(), resolver_failed());
|
||||
state TransactionStateResolveContext transactionStateResolveContext;
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
self->logAdapter = new LogSystemDiskQueueAdapter(self->logSystem, Reference<AsyncVar<PeekTxsInfo>>(), 1, false);
|
||||
self->txnStateStore = keyValueStoreLogSystem(self->logAdapter, resolver.id(), 2e9, true, true, true);
|
||||
|
||||
// wait for txnStateStore recovery
|
||||
wait(success(self->txnStateStore->readValue(StringRef())));
|
||||
|
||||
// This has to be declared after the self->txnStateStore get initialized
|
||||
transactionStateResolveContext = TransactionStateResolveContext(self, &addActor);
|
||||
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
self->numLogs = db->get().logSystemConfig.numLogs();
|
||||
self->tpcvVector.resize(1 + self->numLogs, 0);
|
||||
std::fill(self->tpcvVector.begin(), self->tpcvVector.end(), invalidVersion);
|
||||
}
|
||||
}
|
||||
|
||||
loop choose {
|
||||
when(ResolveTransactionBatchRequest batch = waitNext(resolver.resolve.getFuture())) {
|
||||
actors.add(resolveBatch(self, batch));
|
||||
}
|
||||
when(ResolutionMetricsRequest req = waitNext(resolver.metrics.getFuture())) {
|
||||
++self->metricsRequests;
|
||||
req.reply.send(self->iopsSample.getEstimate(allKeys));
|
||||
req.reply.send(self->iopsSample.getEstimate(SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? normalKeys
|
||||
: allKeys));
|
||||
}
|
||||
when(ResolutionSplitRequest req = waitNext(resolver.split.getFuture())) {
|
||||
++self->splitRequests;
|
||||
|
@ -344,10 +682,18 @@ ACTOR Future<Void> resolverCore(ResolverInterface resolver, InitializeResolverRe
|
|||
req.reply.send(rep);
|
||||
}
|
||||
when(wait(actors.getResult())) {}
|
||||
when(wait(onError)) {}
|
||||
when(wait(doPollMetrics)) {
|
||||
self->iopsSample.poll();
|
||||
doPollMetrics = delay(SERVER_KNOBS->SAMPLE_POLL_TIME);
|
||||
}
|
||||
when(TxnStateRequest request = waitNext(resolver.txnState.getFuture())) {
|
||||
if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
|
||||
addActor.send(processTransactionStateRequestPart(&transactionStateResolveContext, request));
|
||||
} else {
|
||||
ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -366,7 +712,7 @@ ACTOR Future<Void> resolver(ResolverInterface resolver,
|
|||
InitializeResolverRequest initReq,
|
||||
Reference<AsyncVar<ServerDBInfo> const> db) {
|
||||
try {
|
||||
state Future<Void> core = resolverCore(resolver, initReq);
|
||||
state Future<Void> core = resolverCore(resolver, initReq, db);
|
||||
loop choose {
|
||||
when(wait(core)) { return Void(); }
|
||||
when(wait(checkRemoved(db, initReq.recoveryCount, resolver))) {}
|
||||
|
|
|
@ -20,15 +20,15 @@
|
|||
|
||||
#ifndef FDBSERVER_RESOLVERINTERFACE_H
|
||||
#define FDBSERVER_RESOLVERINTERFACE_H
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbrpc/Locality.h"
|
||||
#include "fdbrpc/fdbrpc.h"
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "fdbclient/CommitProxyInterface.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbrpc/Locality.h"
|
||||
#include "fdbrpc/fdbrpc.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
|
||||
struct ResolverInterface {
|
||||
constexpr static FileIdentifier file_identifier = 1755944;
|
||||
|
@ -42,6 +42,8 @@ struct ResolverInterface {
|
|||
RequestStream<struct ResolutionSplitRequest> split;
|
||||
|
||||
RequestStream<ReplyPromise<Void>> waitFailure;
|
||||
// For receiving initial transaction state store broadcast from the master
|
||||
RequestStream<TxnStateRequest> txnState;
|
||||
|
||||
ResolverInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {}
|
||||
UID id() const { return uniqueID; }
|
||||
|
@ -53,11 +55,14 @@ struct ResolverInterface {
|
|||
void initEndpoints() {
|
||||
metrics.getEndpoint(TaskPriority::ResolutionMetrics);
|
||||
split.getEndpoint(TaskPriority::ResolutionMetrics);
|
||||
waitFailure.getEndpoint();
|
||||
txnState.getEndpoint();
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, uniqueID, locality, resolve, metrics, split, waitFailure);
|
||||
// TODO: save space by using getAdjustedEndpoint() as in CommitProxyInterface
|
||||
serializer(ar, uniqueID, locality, resolve, metrics, split, waitFailure, txnState);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -87,9 +92,25 @@ struct ResolveTransactionBatchReply {
|
|||
std::map<int, VectorRef<int>>
|
||||
conflictingKeyRangeMap; // transaction index -> conflicting read_conflict_range ids given by the resolver
|
||||
|
||||
// Privatized mutations with tags, one for each TLog location
|
||||
VectorRef<StringRef> privateMutations;
|
||||
uint32_t privateMutationCount;
|
||||
|
||||
std::unordered_map<uint16_t, Version> tpcvMap;
|
||||
std::set<Tag> writtenTags;
|
||||
|
||||
template <class Archive>
|
||||
void serialize(Archive& ar) {
|
||||
serializer(ar, committed, stateMutations, debugID, conflictingKeyRangeMap, arena);
|
||||
serializer(ar,
|
||||
committed,
|
||||
stateMutations,
|
||||
debugID,
|
||||
conflictingKeyRangeMap,
|
||||
privateMutations,
|
||||
privateMutationCount,
|
||||
tpcvMap,
|
||||
writtenTags,
|
||||
arena);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -107,6 +128,8 @@ struct ResolveTransactionBatchRequest {
|
|||
ReplyPromise<ResolveTransactionBatchReply> reply;
|
||||
Optional<UID> debugID;
|
||||
|
||||
std::set<Tag> writtenTags;
|
||||
|
||||
template <class Archive>
|
||||
void serialize(Archive& ar) {
|
||||
serializer(ar,
|
||||
|
@ -118,6 +141,7 @@ struct ResolveTransactionBatchRequest {
|
|||
reply,
|
||||
arena,
|
||||
debugID,
|
||||
writtenTags,
|
||||
spanContext);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -1294,6 +1294,8 @@ ACTOR static Future<double> doCommitProbe(Future<double> grvProbe, Transaction*
|
|||
|
||||
ASSERT(sourceTr->getReadVersion().isReady());
|
||||
tr->setVersion(sourceTr->getReadVersion().get());
|
||||
tr->getDatabase()->ssVersionVectorCache = sourceTr->getDatabase()->ssVersionVectorCache;
|
||||
tr->trState->readVersionObtainedFromGrvProxy = sourceTr->trState->readVersionObtainedFromGrvProxy;
|
||||
|
||||
state double start = g_network->timer_monotonic();
|
||||
|
||||
|
|
|
@ -132,10 +132,12 @@ struct TLogLockResult {
|
|||
constexpr static FileIdentifier file_identifier = 11822027;
|
||||
Version end;
|
||||
Version knownCommittedVersion;
|
||||
std::deque<std::tuple<Version, int>> unknownCommittedVersions;
|
||||
UID id;
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, end, knownCommittedVersion);
|
||||
serializer(ar, end, knownCommittedVersion, unknownCommittedVersions, id);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -301,6 +303,7 @@ struct TLogCommitRequest {
|
|||
StringRef messages; // Each message prefixed by a 4-byte length
|
||||
|
||||
ReplyPromise<TLogCommitReply> reply;
|
||||
int tLogCount;
|
||||
Optional<UID> debugID;
|
||||
|
||||
TLogCommitRequest() {}
|
||||
|
@ -311,10 +314,11 @@ struct TLogCommitRequest {
|
|||
Version knownCommittedVersion,
|
||||
Version minKnownCommittedVersion,
|
||||
StringRef messages,
|
||||
int tLogCount,
|
||||
Optional<UID> debugID)
|
||||
: spanContext(context), arena(a), prevVersion(prevVersion), version(version),
|
||||
knownCommittedVersion(knownCommittedVersion), minKnownCommittedVersion(minKnownCommittedVersion),
|
||||
messages(messages), debugID(debugID) {}
|
||||
messages(messages), tLogCount(tLogCount), debugID(debugID) {}
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar,
|
||||
|
@ -326,6 +330,7 @@ struct TLogCommitRequest {
|
|||
reply,
|
||||
arena,
|
||||
debugID,
|
||||
tLogCount,
|
||||
spanContext);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -53,7 +53,6 @@ struct TLogQueueEntryRef {
|
|||
Version version;
|
||||
Version knownCommittedVersion;
|
||||
StringRef messages;
|
||||
|
||||
TLogQueueEntryRef() : version(0), knownCommittedVersion(0) {}
|
||||
TLogQueueEntryRef(Arena& a, TLogQueueEntryRef const& from)
|
||||
: id(from.id), version(from.version), knownCommittedVersion(from.knownCommittedVersion),
|
||||
|
@ -330,6 +329,8 @@ struct TLogData : NonCopyable {
|
|||
// interface should work without directly accessing rawPersistentQueue
|
||||
TLogQueue* persistentQueue; // Logical queue the log operates on and persist its data.
|
||||
|
||||
std::deque<std::tuple<Version, int>> unknownCommittedVersions;
|
||||
|
||||
int64_t diskQueueCommitBytes;
|
||||
AsyncVar<bool>
|
||||
largeDiskQueueCommitBytes; // becomes true when diskQueueCommitBytes is greater than MAX_QUEUE_COMMIT_BYTES
|
||||
|
@ -522,6 +523,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
Deque<std::pair<Version, Standalone<VectorRef<uint8_t>>>> messageBlocks;
|
||||
std::vector<std::vector<Reference<TagData>>> tag_data; // tag.locality | tag.id
|
||||
int unpoppedRecoveredTags;
|
||||
std::map<Tag, Promise<Void>> waitingTags;
|
||||
|
||||
Reference<TagData> getTagData(Tag tag) {
|
||||
int idx = tag.toTagDataIndex();
|
||||
|
@ -554,6 +556,12 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
CounterCollection cc;
|
||||
Counter bytesInput;
|
||||
Counter bytesDurable;
|
||||
Counter blockingPeeks;
|
||||
Counter blockingPeekTimeouts;
|
||||
Counter emptyPeeks;
|
||||
Counter nonEmptyPeeks;
|
||||
std::map<Tag, LatencySample> blockingPeekLatencies;
|
||||
std::map<Tag, LatencySample> peekVersionCounts;
|
||||
|
||||
UID logId;
|
||||
ProtocolVersion protocolVersion;
|
||||
|
@ -635,13 +643,14 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
: stopped(false), initialized(false), queueCommittingVersion(0), knownCommittedVersion(0),
|
||||
durableKnownCommittedVersion(0), minKnownCommittedVersion(0), queuePoppedVersion(0), minPoppedTagVersion(0),
|
||||
minPoppedTag(invalidTag), unpoppedRecoveredTags(0), cc("TLog", interf.id().toString()),
|
||||
bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), logId(interf.id()),
|
||||
protocolVersion(protocolVersion), newPersistentDataVersion(invalidVersion), tLogData(tLogData),
|
||||
unrecoveredBefore(1), recoveredAt(1), logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag),
|
||||
isPrimary(isPrimary), logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0),
|
||||
locality(tagLocalityInvalid), recruitmentID(recruitmentID), logSpillType(logSpillType),
|
||||
allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false),
|
||||
txsTags(txsTags) {
|
||||
bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), blockingPeeks("BlockingPeeks", cc),
|
||||
blockingPeekTimeouts("BlockingPeekTimeouts", cc), emptyPeeks("EmptyPeeks", cc),
|
||||
nonEmptyPeeks("NonEmptyPeeks", cc), logId(interf.id()), protocolVersion(protocolVersion),
|
||||
newPersistentDataVersion(invalidVersion), tLogData(tLogData), unrecoveredBefore(1), recoveredAt(1),
|
||||
logSystem(new AsyncVar<Reference<ILogSystem>>()), remoteTag(remoteTag), isPrimary(isPrimary),
|
||||
logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0), locality(tagLocalityInvalid),
|
||||
recruitmentID(recruitmentID), logSpillType(logSpillType), allTags(tags.begin(), tags.end()),
|
||||
terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false), txsTags(txsTags) {
|
||||
startRole(Role::TRANSACTION_LOG,
|
||||
interf.id(),
|
||||
tLogData->workerID,
|
||||
|
@ -732,6 +741,18 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
}
|
||||
|
||||
bool shouldSpillByReference(Tag t) const { return !shouldSpillByValue(t); }
|
||||
|
||||
void unblockWaitingPeeks() {
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR) {
|
||||
for (auto& iter : waitingTags) {
|
||||
TraceEvent("UnblockWaitingPeeks", tLogData->dbgid)
|
||||
.detail("LogId", logId)
|
||||
.detail("Tag", iter.first.toString());
|
||||
iter.second.send(Void());
|
||||
}
|
||||
waitingTags.clear();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
|
@ -793,6 +814,7 @@ ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply,
	    .detail("QueueCommitted", logData->queueCommittedVersion.get());

	logData->stopped = true;
	logData->unblockWaitingPeeks();
	if (!logData->recoveryComplete.isSet()) {
		logData->recoveryComplete.sendError(end_of_stream());
	}

@ -805,6 +827,8 @@ ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply,
	TLogLockResult result;
	result.end = stopVersion;
	result.knownCommittedVersion = logData->knownCommittedVersion;
	result.unknownCommittedVersions = self->unknownCommittedVersions;
	result.id = self->dbgid;

	TraceEvent("TLogStop2", self->dbgid)
	    .detail("LogId", logData->logId)

@ -1074,7 +1098,6 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa

	TEST(anyData); // TLog moved data to persistentData
	logData->persistentDataDurableVersion = newPersistentDataVersion;

	for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
		for (tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
			if (logData->tag_data[tagLocality][tagId]) {
@ -1489,6 +1512,14 @@ void commitMessages(TLogData* self,
			} else {
				txsBytes += tagData->versionMessages.back().second.expectedSize();
			}
			if (SERVER_KNOBS->ENABLE_VERSION_VECTOR) {
				auto iter = logData->waitingTags.find(tag);
				if (iter != logData->waitingTags.end()) {
					auto promise = iter->second;
					logData->waitingTags.erase(iter);
					promise.send(Void());
				}
			}

			// The factor of VERSION_MESSAGES_OVERHEAD is intended to be an overestimate of the actual memory used
			// to store this data in a std::deque. In practice, this number is probably something like 528/512
@ -1544,6 +1575,22 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>>& getVersionMessages(Refe
	return tagData->versionMessages;
};

ACTOR Future<Void> waitForMessagesForTag(Reference<LogData> self, Tag reqTag, Version reqBegin, double timeout) {
	self->blockingPeeks += 1;
	auto tagData = self->getTagData(reqTag);
	if (tagData.isValid() && !tagData->versionMessages.empty() && tagData->versionMessages.back().first >= reqBegin) {
		return Void();
	}
	choose {
		when(wait(self->waitingTags[reqTag].getFuture())) {
			// we want the caller to finish first, otherwise the data structure it is building might not be complete
			wait(delay(0.0));
		}
		when(wait(delay(timeout))) { self->blockingPeekTimeouts += 1; }
	}
	return Void();
}

void peekMessagesFromMemory(Reference<LogData> self,
                            Tag tag,
                            Version begin,
@ -1716,7 +1763,27 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,

	state double workStart = now();

	Version poppedVer = poppedVersion(logData, reqTag);
	state Version poppedVer = poppedVersion(logData, reqTag);

	if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin &&
	    reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 &&
	    !reqReturnIfBlocked) {
		state double startTime = now();
		// TODO (version vector) check if this should be included in "status details" json
		// TODO (version vector) all tags may be too many, instead, standard deviation?
		wait(waitForMessagesForTag(logData, reqTag, reqBegin, SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT));
		double latency = now() - startTime;
		if (logData->blockingPeekLatencies.find(reqTag) == logData->blockingPeekLatencies.end()) {
			UID ssID = nondeterministicRandom()->randomUniqueID();
			std::string s = "BlockingPeekLatencies-" + reqTag.toString();
			logData->blockingPeekLatencies.try_emplace(
			    reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE);
		}
		LatencySample& sample = logData->blockingPeekLatencies.at(reqTag);
		sample.addMeasurement(latency);
		poppedVer = poppedVersion(logData, reqTag);
	}

	if (poppedVer > reqBegin) {
		TLogPeekReply rep;
		rep.maxKnownVersion = logData->version.get();
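
The per-tag blockingPeekLatencies bookkeeping above is a create-on-first-use metric pattern: the first blocking peek for a tag constructs its LatencySample, later peeks just record into it. A minimal standalone sketch of that pattern follows (not FDB code; LatencySampleSketch is an assumed stand-in for LatencySample, and the tag string and latencies are made up).

#include <iostream>
#include <map>
#include <string>

struct LatencySampleSketch {
    std::string name;
    double total = 0;
    int count = 0;
    explicit LatencySampleSketch(std::string n) : name(std::move(n)) {}
    void addMeasurement(double latency) {
        total += latency;
        ++count;
    }
};

std::map<std::string, LatencySampleSketch> blockingPeekLatencies;

void recordBlockingPeek(const std::string& tag, double latency) {
    // try_emplace only constructs the sample if this tag has not been seen before
    blockingPeekLatencies.try_emplace(tag, "BlockingPeekLatencies-" + tag);
    blockingPeekLatencies.at(tag).addMeasurement(latency);
}

int main() {
    recordBlockingPeek("0:5", 0.012);
    recordBlockingPeek("0:5", 0.020);
    const LatencySampleSketch& s = blockingPeekLatencies.at("0:5");
    std::cout << s.name << " mean=" << s.total / s.count << "\n"; // prints mean=0.016
}
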
@ -2201,9 +2268,15 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
	if (self->diskQueueCommitBytes > SERVER_KNOBS->MAX_QUEUE_COMMIT_BYTES) {
		self->largeDiskQueueCommitBytes.set(true);
	}

	// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
	logData->version.set(req.version);
	if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
		self->unknownCommittedVersions.push_front(std::make_tuple(req.version, req.tLogCount));
		while (!self->unknownCommittedVersions.empty() &&
		       std::get<0>(self->unknownCommittedVersions.back()) <= req.knownCommittedVersion) {
			self->unknownCommittedVersions.pop_back();
		}
	}

	if (req.debugID.present())
		g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.AfterTLogCommit");
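
Under TLOG_UNICAST the unknownCommittedVersions deque above appears to act as a sliding window of versions above the known committed version: each commit is pushed at the front as (version, tLogCount), and entries at the back are dropped once req.knownCommittedVersion covers them. A standalone sketch of that maintenance, with made-up version numbers, is below (not FDB code).

#include <cstdint>
#include <deque>
#include <iostream>
#include <tuple>

using Version = int64_t;

int main() {
    std::deque<std::tuple<Version, int>> unknownCommittedVersions;

    auto onCommit = [&](Version version, int tLogCount, Version knownCommittedVersion) {
        unknownCommittedVersions.push_front(std::make_tuple(version, tLogCount));
        while (!unknownCommittedVersions.empty() &&
               std::get<0>(unknownCommittedVersions.back()) <= knownCommittedVersion) {
            unknownCommittedVersions.pop_back(); // covered by knownCommittedVersion, no longer tracked
        }
    };

    onCommit(101, 3, 100); // window: {101}
    onCommit(102, 3, 100); // window: {102, 101}
    onCommit(103, 3, 101); // 101 is now known committed -> window: {103, 102}

    for (auto& [v, count] : unknownCommittedVersions)
        std::cout << v << " (sent to " << count << " tLogs)\n"; // prints 103 then 102
}
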
@ -2634,6 +2707,7 @@ void removeLog(TLogData* self, Reference<LogData> logData) {
|
|||
.detail("Input", logData->bytesInput.getValue())
|
||||
.detail("Durable", logData->bytesDurable.getValue());
|
||||
logData->stopped = true;
|
||||
logData->unblockWaitingPeeks();
|
||||
if (!logData->recoveryComplete.isSet()) {
|
||||
logData->recoveryComplete.sendError(end_of_stream());
|
||||
}
|
||||
|
@ -3045,6 +3119,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
"Restored");
|
||||
logData->locality = id_locality[id1];
|
||||
logData->stopped = true;
|
||||
logData->unblockWaitingPeeks();
|
||||
self->id_data[id1] = logData;
|
||||
id_interf[id1] = recruited;
|
||||
|
||||
|
@ -3263,6 +3338,7 @@ void stopAllTLogs(TLogData* self, UID newLogId) {
|
|||
}
|
||||
}
|
||||
it.second->stopped = true;
|
||||
it.second->unblockWaitingPeeks();
|
||||
if (!it.second->recoveryComplete.isSet()) {
|
||||
it.second->recoveryComplete.sendError(end_of_stream());
|
||||
}
|
||||
|
@ -3320,7 +3396,8 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
|
|||
|
||||
if (recovering) {
|
||||
logData->unrecoveredBefore = req.startVersion;
|
||||
logData->recoveredAt = req.recoverAt;
|
||||
state Version recoverAt = req.recoverAt;
|
||||
logData->recoveredAt = recoverAt;
|
||||
logData->knownCommittedVersion = req.startVersion - 1;
|
||||
logData->persistentDataVersion = logData->unrecoveredBefore - 1;
|
||||
logData->persistentDataDurableVersion = logData->unrecoveredBefore - 1;
|
||||
|
@ -3332,7 +3409,7 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
|
|||
|
||||
TraceEvent("TLogRecover", self->dbgid)
|
||||
.detail("LogId", logData->logId)
|
||||
.detail("At", req.recoverAt)
|
||||
.detail("At", recoverAt)
|
||||
.detail("Known", req.knownCommittedVersion)
|
||||
.detail("Unrecovered", logData->unrecoveredBefore)
|
||||
.detail("Tags", describe(req.recoverTags))
|
||||
|
@ -3349,28 +3426,30 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
|
|||
self->newLogData.trigger();
|
||||
|
||||
if ((req.isPrimary || req.recoverFrom.logRouterTags == 0) && !logData->stopped &&
|
||||
logData->unrecoveredBefore <= req.recoverAt) {
|
||||
logData->unrecoveredBefore <= recoverAt) {
|
||||
if (req.recoverFrom.logRouterTags > 0 && req.locality != tagLocalitySatellite) {
|
||||
logData->logRouterPopToVersion = req.recoverAt;
|
||||
logData->logRouterPopToVersion = recoverAt;
|
||||
std::vector<Tag> tags;
|
||||
tags.push_back(logData->remoteTag);
|
||||
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) ||
|
||||
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, recoverAt, true) ||
|
||||
logData->removed || logData->stopCommit.onTrigger());
|
||||
} else if (!req.recoverTags.empty()) {
|
||||
ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion);
|
||||
wait(pullAsyncData(
|
||||
self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) ||
|
||||
self, logData, req.recoverTags, req.knownCommittedVersion + 1, recoverAt, false) ||
|
||||
logData->removed || logData->stopCommit.onTrigger());
|
||||
}
|
||||
pulledRecoveryVersions = true;
|
||||
logData->knownCommittedVersion = req.recoverAt;
|
||||
logData->knownCommittedVersion = recoverAt;
|
||||
}
|
||||
|
||||
if ((req.isPrimary || req.recoverFrom.logRouterTags == 0) && logData->version.get() < req.recoverAt &&
|
||||
!logData->stopped) {
|
||||
state Version lastVersionPrevEpoch = req.recoverAt;
|
||||
|
||||
if ((req.isPrimary || req.recoverFrom.logRouterTags == 0) &&
|
||||
logData->version.get() < lastVersionPrevEpoch && !logData->stopped) {
|
||||
// Log the changes to the persistent queue, to be committed by commitQueue()
|
||||
TLogQueueEntryRef qe;
|
||||
qe.version = req.recoverAt;
|
||||
qe.version = lastVersionPrevEpoch;
|
||||
qe.knownCommittedVersion = logData->knownCommittedVersion;
|
||||
qe.messages = StringRef();
|
||||
qe.id = logData->logId;
|
||||
|
@ -3380,8 +3459,7 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
|
|||
if (self->diskQueueCommitBytes > SERVER_KNOBS->MAX_QUEUE_COMMIT_BYTES) {
|
||||
self->largeDiskQueueCommitBytes.set(true);
|
||||
}
|
||||
|
||||
logData->version.set(req.recoverAt);
|
||||
logData->version.set(lastVersionPrevEpoch);
|
||||
}
|
||||
|
||||
if (logData->recoveryComplete.isSet()) {
|
||||
|
|
|
@ -508,12 +508,32 @@ Future<Version> TagPartitionedLogSystem::push(Version prevVersion,
|
|||
Version minKnownCommittedVersion,
|
||||
LogPushData& data,
|
||||
SpanID const& spanContext,
|
||||
Optional<UID> debugID) {
|
||||
Optional<UID> debugID,
|
||||
Optional<std::unordered_map<uint16_t, Version>> tpcvMap) {
|
||||
// FIXME: Randomize request order as in LegacyLogSystem?
|
||||
std::vector<Future<Void>> quorumResults;
|
||||
std::vector<Future<TLogCommitReply>> allReplies;
|
||||
int location = 0;
|
||||
Span span("TPLS:push"_loc, spanContext);
|
||||
|
||||
std::unordered_map<int, int> tLogCount;
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
int location = 0;
|
||||
int logGroupLocal = 0;
|
||||
for (auto& it : tLogs) {
|
||||
if (!it->isLocal) {
|
||||
continue;
|
||||
}
|
||||
for (int loc = 0; loc < it->logServers.size(); loc++) {
|
||||
if (tpcvMap.get().find(location) != tpcvMap.get().end()) {
|
||||
tLogCount[logGroupLocal]++;
|
||||
}
|
||||
location++;
|
||||
}
|
||||
logGroupLocal++;
|
||||
}
|
||||
}
|
||||
int logGroupLocal = 0;
|
||||
for (auto& it : tLogs) {
|
||||
if (it->isLocal && it->logServers.size()) {
|
||||
if (it->connectionResetTrackers.size() == 0) {
|
||||
|
@ -531,6 +551,14 @@ Future<Version> TagPartitionedLogSystem::push(Version prevVersion,
|
|||
}
|
||||
std::vector<Future<Void>> tLogCommitResults;
|
||||
for (int loc = 0; loc < it->logServers.size(); loc++) {
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
if (tpcvMap.get().find(location) != tpcvMap.get().end()) {
|
||||
prevVersion = tpcvMap.get()[location];
|
||||
} else {
|
||||
location++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Standalone<StringRef> msg = data.getMessages(location);
|
||||
data.recordEmptyMessage(location, msg);
|
||||
allReplies.push_back(recordPushMetrics(
|
||||
|
@ -544,6 +572,7 @@ Future<Version> TagPartitionedLogSystem::push(Version prevVersion,
|
|||
knownCommittedVersion,
|
||||
minKnownCommittedVersion,
|
||||
msg,
|
||||
tLogCount[logGroupLocal],
|
||||
debugID),
|
||||
TaskPriority::ProxyTLogCommitReply)));
|
||||
Future<Void> commitSuccess = success(allReplies.back());
|
||||
|
@ -552,6 +581,7 @@ Future<Version> TagPartitionedLogSystem::push(Version prevVersion,
|
|||
location++;
|
||||
}
|
||||
quorumResults.push_back(quorum(tLogCommitResults, tLogCommitResults.size() - it->tLogWriteAntiQuorum));
|
||||
logGroupLocal++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1310,7 +1340,7 @@ Version TagPartitionedLogSystem::getKnownCommittedVersion() {
|
|||
for (auto& it : lockResults) {
|
||||
auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, it);
|
||||
if (versions.present()) {
|
||||
result = std::max(result, versions.get().first);
|
||||
result = std::max(result, std::get<0>(versions.get()));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
@ -1819,7 +1849,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::monitorLog(Reference<AsyncVar<Option
|
|||
}
|
||||
}
|
||||
|
||||
Optional<std::pair<Version, Version>> TagPartitionedLogSystem::getDurableVersion(
|
||||
Optional<std::tuple<Version, Version, std::vector<TLogLockResult>>> TagPartitionedLogSystem::getDurableVersion(
|
||||
UID dbgid,
|
||||
LogLockInfo lockInfo,
|
||||
std::vector<Reference<AsyncVar<bool>>> failed,
|
||||
|
@ -1841,7 +1871,6 @@ Optional<std::pair<Version, Version>> TagPartitionedLogSystem::getDurableVersion
|
|||
std::vector<TLogLockResult> results;
|
||||
std::string sServerState;
|
||||
LocalityGroup unResponsiveSet;
|
||||
|
||||
for (int t = 0; t < logSet->logServers.size(); t++) {
|
||||
if (lockInfo.replies[t].isReady() && !lockInfo.replies[t].isError() && (!failed.size() || !failed[t]->get())) {
|
||||
results.push_back(lockInfo.replies[t].get());
|
||||
|
@ -1895,6 +1924,7 @@ Optional<std::pair<Version, Version>> TagPartitionedLogSystem::getDurableVersion
|
|||
TraceEvent("GetDurableResult", dbgid)
|
||||
.detail("Required", requiredCount)
|
||||
.detail("Present", results.size())
|
||||
.detail("Anti", logSet->tLogWriteAntiQuorum)
|
||||
.detail("ServerState", sServerState)
|
||||
.detail("RecoveryVersion",
|
||||
((safe_range_end > 0) && (safe_range_end - 1 < results.size()))
|
||||
|
@ -1907,14 +1937,14 @@ Optional<std::pair<Version, Version>> TagPartitionedLogSystem::getDurableVersion
|
|||
.detail("KnownCommittedVersion", knownCommittedVersion)
|
||||
.detail("EpochEnd", lockInfo.epochEnd);
|
||||
|
||||
return std::make_pair(knownCommittedVersion, results[new_safe_range_begin].end);
|
||||
return std::make_tuple(knownCommittedVersion, results[new_safe_range_begin].end, results);
|
||||
}
|
||||
}
|
||||
TraceEvent("GetDurableResultWaiting", dbgid)
|
||||
.detail("Required", requiredCount)
|
||||
.detail("Present", results.size())
|
||||
.detail("ServerState", sServerState);
|
||||
return Optional<std::pair<Version, Version>>();
|
||||
return Optional<std::tuple<Version, Version, std::vector<TLogLockResult>>>();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> TagPartitionedLogSystem::getDurableVersionChanged(LogLockInfo lockInfo,
|
||||
|
@ -1936,6 +1966,44 @@ ACTOR Future<Void> TagPartitionedLogSystem::getDurableVersionChanged(LogLockInfo
	return Void();
}

// If VERSION_VECTOR_UNICAST is enabled, one tLog's DV may advance beyond the min(DV) over all tLogs.
// This function finds the highest recoverable version for each tLog group over all log groups.
// All prior versions to the chosen RV must also be recoverable.
// TODO: unit tests to stress UNICAST
Version getRecoverVersionUnicast(std::vector<std::tuple<int, std::vector<TLogLockResult>>>& logGroupResults,
                                 Version minEnd) {
	Version minLogGroup = std::numeric_limits<Version>::max();
	for (auto& logGroupResult : logGroupResults) {
		std::unordered_map<Version, int> versionRepCount;
		std::map<Version, int> versionTLogCount;
		int replicationFactor = std::get<0>(logGroupResult);
		for (auto& tLogResult : std::get<1>(logGroupResult)) {
			bool logGroupCandidate = false;
			for (auto& unknownCommittedVersion : tLogResult.unknownCommittedVersions) {
				Version k = std::get<0>(unknownCommittedVersion);
				if (k > minEnd) {
					versionRepCount[k]++;
					versionTLogCount[k] = std::get<1>(unknownCommittedVersion);
					logGroupCandidate = true;
				}
			}
			if (!logGroupCandidate) {
				return minEnd;
			}
		}
		Version minTLogs = minEnd;
		for (auto const& [version, tLogCount] : versionTLogCount) {
			if (versionRepCount[version] >= tLogCount - replicationFactor + 1) {
				minTLogs = version;
			} else {
				break;
			}
		}
		minLogGroup = std::min(minLogGroup, minTLogs);
	}
	return minLogGroup;
}

ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Reference<ILogSystem>>> outLogSystem,
                                                     UID dbgid,
                                                     DBCoreState prevState,
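
The comment above notes that unit tests for the unicast recovery-version selection are still a TODO. The following is a minimal standalone sketch (not FDB code) of the same selection with plain types so the arithmetic can be checked by hand; TLogLockResultSketch and all version numbers, tLog counts and the replication factor are assumptions for illustration.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <tuple>
#include <unordered_map>
#include <vector>

using Version = int64_t;

struct TLogLockResultSketch {
    // (version, number of tLogs the commit targeted), as in unknownCommittedVersions
    std::vector<std::pair<Version, int>> unknownCommittedVersions;
};

Version getRecoverVersionUnicastSketch(
    const std::vector<std::tuple<int, std::vector<TLogLockResultSketch>>>& logGroupResults,
    Version minEnd) {
    Version minLogGroup = std::numeric_limits<Version>::max();
    for (const auto& [replicationFactor, tLogResults] : logGroupResults) {
        std::unordered_map<Version, int> versionRepCount; // how many tLogs report this version
        std::map<Version, int> versionTLogCount;          // how many tLogs the commit was sent to
        for (const auto& tLog : tLogResults) {
            bool candidate = false;
            for (const auto& [v, tLogCount] : tLog.unknownCommittedVersions) {
                if (v > minEnd) {
                    versionRepCount[v]++;
                    versionTLogCount[v] = tLogCount;
                    candidate = true;
                }
            }
            if (!candidate)
                return minEnd; // a tLog with nothing beyond minEnd pins recovery to minEnd
        }
        Version minTLogs = minEnd;
        for (const auto& [v, tLogCount] : versionTLogCount) {
            // v is recoverable if enough copies survive: reports >= tLogCount - replicationFactor + 1
            if (versionRepCount[v] >= tLogCount - replicationFactor + 1)
                minTLogs = v;
            else
                break; // all prior versions to the chosen RV must also be recoverable
        }
        minLogGroup = std::min(minLogGroup, minTLogs);
    }
    return minLogGroup;
}

int main() {
    // One group, replication factor 2, minEnd = 100. Version 101 was sent to 3 tLogs and 3 of
    // them still report it; version 102 was sent to 3 tLogs but only 1 reports it.
    TLogLockResultSketch a{ { { 102, 3 }, { 101, 3 } } };
    TLogLockResultSketch b{ { { 101, 3 } } };
    TLogLockResultSketch c{ { { 101, 3 } } };
    std::vector<std::tuple<int, std::vector<TLogLockResultSketch>>> groups = { { 2, { a, b, c } } };
    std::cout << getRecoverVersionUnicastSketch(groups, 100) << "\n"; // prints 101
}
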
@ -2139,7 +2207,6 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*forceRecovery) {
|
||||
state std::vector<LogLockInfo> allLockResults;
|
||||
ASSERT(lockResults.size() == 1);
|
||||
|
@ -2154,18 +2221,18 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
}
|
||||
allLockResults.push_back(lockResult);
|
||||
}
|
||||
|
||||
state int lockNum = 0;
|
||||
state Version maxRecoveryVersion = 0;
|
||||
state int maxRecoveryIndex = 0;
|
||||
while (lockNum < allLockResults.size()) {
|
||||
|
||||
auto versions = TagPartitionedLogSystem::getDurableVersion(dbgid, allLockResults[lockNum]);
|
||||
if (versions.present()) {
|
||||
if (versions.get().second > maxRecoveryVersion) {
|
||||
if (std::get<1>(versions.get()) > maxRecoveryVersion) {
|
||||
TraceEvent("HigherRecoveryVersion", dbgid)
|
||||
.detail("Idx", lockNum)
|
||||
.detail("Ver", versions.get().second);
|
||||
maxRecoveryVersion = versions.get().second;
|
||||
.detail("Ver", std::get<1>(versions.get()));
|
||||
maxRecoveryVersion = std::get<1>(versions.get());
|
||||
maxRecoveryIndex = lockNum;
|
||||
}
|
||||
lockNum++;
|
||||
|
@ -2196,6 +2263,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
Version minEnd = std::numeric_limits<Version>::max();
|
||||
Version maxEnd = 0;
|
||||
std::vector<Future<Void>> changes;
|
||||
std::vector<std::tuple<int, std::vector<TLogLockResult>>> logGroupResults;
|
||||
for (int log = 0; log < logServers.size(); log++) {
|
||||
if (!logServers[log]->isLocal) {
|
||||
continue;
|
||||
|
@ -2203,19 +2271,26 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
auto versions =
|
||||
TagPartitionedLogSystem::getDurableVersion(dbgid, lockResults[log], logFailed[log], lastEnd);
|
||||
if (versions.present()) {
|
||||
knownCommittedVersion = std::max(knownCommittedVersion, versions.get().first);
|
||||
maxEnd = std::max(maxEnd, versions.get().second);
|
||||
minEnd = std::min(minEnd, versions.get().second);
|
||||
knownCommittedVersion = std::max(knownCommittedVersion, std::get<0>(versions.get()));
|
||||
logGroupResults.emplace_back(logServers[log]->tLogReplicationFactor, std::get<2>(versions.get()));
|
||||
maxEnd = std::max(maxEnd, std::get<1>(versions.get()));
|
||||
minEnd = std::min(minEnd, std::get<1>(versions.get()));
|
||||
}
|
||||
changes.push_back(TagPartitionedLogSystem::getDurableVersionChanged(lockResults[log], logFailed[log]));
|
||||
}
|
||||
|
||||
if (maxEnd > 0 && (!lastEnd.present() || maxEnd < lastEnd.get())) {
|
||||
TEST(lastEnd.present()); // Restarting recovery at an earlier point
|
||||
|
||||
auto logSystem = makeReference<TagPartitionedLogSystem>(dbgid, locality, prevState.recoveryCount);
|
||||
|
||||
logSystem->recoverAt = minEnd;
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
|
||||
logSystem->recoverAt = getRecoverVersionUnicast(logGroupResults, minEnd);
|
||||
TraceEvent("RecoveryVersionInfo").detail("RecoverAt", logSystem->recoverAt);
|
||||
}
|
||||
|
||||
lastEnd = minEnd;
|
||||
|
||||
logSystem->tLogs = logServers;
|
||||
logSystem->logRouterTags = prevState.logRouterTags;
|
||||
logSystem->txsTags = prevState.txsTags;
|
||||
|
@ -2226,7 +2301,6 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
if (knownCommittedVersion > minEnd) {
|
||||
knownCommittedVersion = minEnd;
|
||||
}
|
||||
logSystem->recoverAt = minEnd;
|
||||
logSystem->knownCommittedVersion = knownCommittedVersion;
|
||||
TraceEvent(SevDebug, "FinalRecoveryVersionInfo")
|
||||
.detail("KCV", knownCommittedVersion)
|
||||
|
@ -2458,12 +2532,13 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
|
|||
state int lockNum = 0;
|
||||
while (lockNum < oldLogSystem->lockResults.size()) {
|
||||
if (oldLogSystem->lockResults[lockNum].logSet->locality == remoteLocality) {
|
||||
|
||||
loop {
|
||||
auto versions =
|
||||
TagPartitionedLogSystem::getDurableVersion(self->dbgid, oldLogSystem->lockResults[lockNum]);
|
||||
if (versions.present()) {
|
||||
logSet->startVersion =
|
||||
std::min(std::min(versions.get().first + 1, oldLogSystem->lockResults[lockNum].epochEnd),
|
||||
std::min(std::min(std::get<0>(versions.get()) + 1, oldLogSystem->lockResults[lockNum].epochEnd),
|
||||
logSet->startVersion);
|
||||
break;
|
||||
}
|
||||
|
@ -2742,7 +2817,7 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
|
|||
TagPartitionedLogSystem::getDurableVersion(logSystem->dbgid, oldLogSystem->lockResults[lockNum]);
|
||||
if (versions.present()) {
|
||||
logSystem->tLogs[0]->startVersion =
|
||||
std::min(std::min(versions.get().first + 1, oldLogSystem->lockResults[lockNum].epochEnd),
|
||||
std::min(std::min(std::get<0>(versions.get()) + 1, oldLogSystem->lockResults[lockNum].epochEnd),
|
||||
logSystem->tLogs[0]->startVersion);
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -105,6 +105,7 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCounted<TagPartition
|
|||
Optional<Version> recoveredAt;
|
||||
Version knownCommittedVersion;
|
||||
Version backupStartVersion = invalidVersion; // max(tLogs[0].startVersion, previous epochEnd).
|
||||
std::map<UID, Version> rvLogs; // recovery versions per tlog
|
||||
LocalityData locality;
|
||||
// For each currently running popFromLog actor, outstandingPops is
|
||||
// (logID, tag)->(max popped version, durableKnownCommittedVersion).
|
||||
|
@ -191,7 +192,8 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCounted<TagPartition
|
|||
Version minKnownCommittedVersion,
|
||||
LogPushData& data,
|
||||
SpanID const& spanContext,
|
||||
Optional<UID> debugID) final;
|
||||
Optional<UID> debugID,
|
||||
Optional<std::unordered_map<uint16_t, Version>> tpcvMap) final;
|
||||
|
||||
Reference<IPeekCursor> peekAll(UID dbgid, Version begin, Version end, Tag tag, bool parallelGetMore);
|
||||
|
||||
|
@ -306,11 +308,11 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCounted<TagPartition
|
|||
LogEpoch getOldestBackupEpoch() const final;
|
||||
|
||||
void setOldestBackupEpoch(LogEpoch epoch) final;
|
||||
|
||||
ACTOR static Future<Void> monitorLog(Reference<AsyncVar<OptionalInterface<TLogInterface>>> logServer,
|
||||
Reference<AsyncVar<bool>> failed);
|
||||
|
||||
Optional<std::pair<Version, Version>> static getDurableVersion(
|
||||
// returns the log group's knownComittedVersion, DV, and a vector of TLogLockResults for each tLog in the group.
|
||||
Optional<std::tuple<Version, Version, std::vector<TLogLockResult>>> static getDurableVersion(
|
||||
UID dbgid,
|
||||
LogLockInfo lockInfo,
|
||||
std::vector<Reference<AsyncVar<bool>>> failed = std::vector<Reference<AsyncVar<bool>>>(),
|
||||
|
|
File diff suppressed because it is too large
@ -736,14 +736,16 @@ struct InitializeBlobManagerRequest {

struct InitializeResolverRequest {
	constexpr static FileIdentifier file_identifier = 7413317;
	LifetimeToken masterLifetime;
	uint64_t recoveryCount;
	int commitProxyCount;
	int resolverCount;
	UID masterId; // master's UID
	ReplyPromise<ResolverInterface> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, recoveryCount, commitProxyCount, resolverCount, reply);
		serializer(ar, masterLifetime, recoveryCount, commitProxyCount, resolverCount, masterId, reply);
	}
};

@ -767,11 +769,13 @@ struct InitializeStorageRequest {
	Optional<std::pair<UID, Version>>
	    tssPairIDAndVersion; // Only set if recruiting a tss. Will be the UID and Version of its SS pair.
	UID clusterId; // Unique cluster identifier. Only needed at recruitment, will be read from txnStateStore on recovery
	Version initialClusterVersion;
	ReplyPromise<InitializeStorageReply> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion, clusterId);
		serializer(
		    ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion, clusterId, initialClusterVersion);
	}
};

@ -1086,6 +1090,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
                                 StorageServerInterface ssi,
                                 Tag seedTag,
                                 UID clusterId,
                                 Version startVersion,
                                 Version tssSeedVersion,
                                 ReplyPromise<InitializeStorageReply> recruitReply,
                                 Reference<AsyncVar<ServerDBInfo> const> db,

@ -18,6 +18,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
|
||||
#include "fdbrpc/sim_validation.h"
|
||||
|
@ -29,6 +30,7 @@
|
|||
#include "fdbserver/ServerDBInfo.h"
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "fdbclient/VersionVector.h"
|
||||
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
|
@ -38,7 +40,9 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
|
|||
Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery
|
||||
recoveryTransactionVersion; // The first version in this epoch
|
||||
|
||||
Version liveCommittedVersion; // The largest live committed version reported by commit proxies.
|
||||
NotifiedVersion prevTLogVersion; // Order of transactions to tlogs
|
||||
|
||||
NotifiedVersion liveCommittedVersion; // The largest live committed version reported by commit proxies.
|
||||
bool databaseLocked;
|
||||
Optional<Value> proxyMetadataVersion;
|
||||
Version minKnownCommittedVersion;
|
||||
|
@ -47,6 +51,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
|
|||
|
||||
Version version; // The last version assigned to a proxy by getVersion()
|
||||
double lastVersionTime;
|
||||
Optional<Version> referenceVersion;
|
||||
|
||||
std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;
|
||||
|
||||
|
@ -56,10 +61,24 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
|
|||
|
||||
bool forceRecovery;
|
||||
|
||||
// Captures the latest commit version targeted for each storage server in the cluster.
|
||||
// @todo We need to ensure that the latest commit versions of storage servers stay
|
||||
// up-to-date in the presence of key range splits/merges.
|
||||
VersionVector ssVersionVector;
|
||||
|
||||
CounterCollection cc;
|
||||
Counter getCommitVersionRequests;
|
||||
Counter getLiveCommittedVersionRequests;
|
||||
Counter reportLiveCommittedVersionRequests;
|
||||
// This counter gives an estimate of the number of non-empty peeks that storage servers
|
||||
// should do from tlogs (in the worst case, ignoring blocking peek timeouts).
|
||||
Counter versionVectorTagUpdates;
|
||||
Counter waitForPrevCommitRequests;
|
||||
Counter nonWaitForPrevCommitRequests;
|
||||
LatencySample versionVectorSizeOnCVReply;
|
||||
LatencySample waitForPrevLatencies;
|
||||
|
||||
PromiseStream<Future<Void>> addActor;
|
||||
|
||||
Future<Void> logger;
|
||||
Future<Void> balancer;
|
||||
|
@ -69,15 +88,27 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
|
|||
ServerCoordinators const& coordinators,
|
||||
ClusterControllerFullInterface const& clusterController,
|
||||
Standalone<StringRef> const& dbId,
|
||||
PromiseStream<Future<Void>> addActor,
|
||||
bool forceRecovery)
|
||||
|
||||
: dbgid(myInterface.id()), lastEpochEnd(invalidVersion), recoveryTransactionVersion(invalidVersion),
|
||||
liveCommittedVersion(invalidVersion), databaseLocked(false), minKnownCommittedVersion(invalidVersion),
|
||||
coordinators(coordinators), version(invalidVersion), lastVersionTime(0), myInterface(myInterface),
|
||||
resolutionBalancer(&version), forceRecovery(forceRecovery), cc("Master", dbgid.toString()),
|
||||
getCommitVersionRequests("GetCommitVersionRequests", cc),
|
||||
getLiveCommittedVersionRequests("GetLiveCommittedVersionRequests", cc),
|
||||
reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc) {
|
||||
reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc),
|
||||
versionVectorTagUpdates("VersionVectorTagUpdates", cc),
|
||||
waitForPrevCommitRequests("WaitForPrevCommitRequests", cc),
|
||||
nonWaitForPrevCommitRequests("NonWaitForPrevCommitRequests", cc),
|
||||
versionVectorSizeOnCVReply("VersionVectorSizeOnCVReply",
|
||||
dbgid,
|
||||
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
|
||||
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
|
||||
waitForPrevLatencies("WaitForPrevLatencies",
|
||||
dbgid,
|
||||
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
|
||||
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
|
||||
addActor(addActor) {
|
||||
logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics");
|
||||
if (forceRecovery && !myInterface.locality.dcId().present()) {
|
||||
TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
|
||||
|
@ -120,17 +151,42 @@ ACTOR Future<Void> getVersion(Reference<MasterData> self, GetCommitVersionReques
		self->lastVersionTime = now();
		self->version = self->recoveryTransactionVersion;
		rep.prevVersion = self->lastEpochEnd;

	} else {
		double t1 = now();
		if (BUGGIFY) {
			t1 = self->lastVersionTime;
		}
		rep.prevVersion = self->version;
		self->version +=

		// Versions should roughly follow wall-clock time, based on the
		// system clock of the current machine and an FDB-specific epoch.
		// Calculate the expected version and determine whether we need to
		// hand out versions faster or slower to stay in sync with the
		// clock.
		Version toAdd =
		    std::max<Version>(1,
		                      std::min<Version>(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS,
		                                        SERVER_KNOBS->VERSIONS_PER_SECOND * (t1 - self->lastVersionTime)));

		rep.prevVersion = self->version;
		if (self->referenceVersion.present()) {
			Version expected =
			    g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->referenceVersion.get();

			// Attempt to jump directly to the expected version. But make
			// sure that versions are still being handed out at a rate
			// around VERSIONS_PER_SECOND. This rate is scaled depending on
			// how far off the calculated version is from the expected
			// version.
			int64_t maxOffset = std::min(static_cast<int64_t>(toAdd * SERVER_KNOBS->MAX_VERSION_RATE_MODIFIER),
			                             SERVER_KNOBS->MAX_VERSION_RATE_OFFSET);
			self->version =
			    std::clamp(expected, self->version + toAdd - maxOffset, self->version + toAdd + maxOffset);
			ASSERT_GT(self->version, rep.prevVersion);
		} else {
			self->version = self->version + toAdd;
		}

		TEST(self->version - rep.prevVersion == 1); // Minimum possible version gap

		bool maxVersionGap = self->version - rep.prevVersion == SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS;
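
A small standalone sketch of the version-advance arithmetic above, useful for checking the clamp by hand (not FDB code). The knob values and timestamps below are assumptions chosen for illustration, not the real defaults.

#include <algorithm>
#include <cstdint>
#include <iostream>

using Version = int64_t;

int main() {
    const Version VERSIONS_PER_SECOND = 1'000'000;
    const Version MAX_READ_TRANSACTION_LIFE_VERSIONS = 5'000'000;
    const double MAX_VERSION_RATE_MODIFIER = 0.1;
    const int64_t MAX_VERSION_RATE_OFFSET = 1'000'000;

    Version version = 7'000'000;           // last version handed out
    double lastVersionTime = 100.0;        // when it was handed out (seconds)
    double t1 = 100.5;                     // now
    double timerNow = 100.5;               // stand-in for g_network->timer()
    Version referenceVersion = 93'400'000; // assumed version epoch offset

    // Advance proportionally to elapsed wall-clock time, within [1, MAX_READ_TRANSACTION_LIFE_VERSIONS].
    Version toAdd = std::max<Version>(
        1, std::min<Version>(MAX_READ_TRANSACTION_LIFE_VERSIONS, VERSIONS_PER_SECOND * (t1 - lastVersionTime)));

    // Where the clock says the version "should" be.
    Version expected = static_cast<Version>(timerNow * VERSIONS_PER_SECOND) - referenceVersion;

    // Jump toward expected, but never deviate from the nominal rate by more than maxOffset.
    int64_t maxOffset =
        std::min(static_cast<int64_t>(toAdd * MAX_VERSION_RATE_MODIFIER), MAX_VERSION_RATE_OFFSET);
    Version next = std::clamp(expected, version + toAdd - maxOffset, version + toAdd + maxOffset);

    // Prints toAdd=500000 expected=7100000 next=7450000: versions are ahead of the clock here,
    // so the advance is clamped to the slowest allowed rate.
    std::cout << "toAdd=" << toAdd << " expected=" << expected << " next=" << next << "\n";
}
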
@ -147,6 +203,7 @@ ACTOR Future<Void> getVersion(Reference<MasterData> self, GetCommitVersionReques
|
|||
proxyItr->second.replies.upper_bound(req.mostRecentProcessedRequestNum));
|
||||
proxyItr->second.replies[req.requestNum] = rep;
|
||||
ASSERT(rep.prevVersion >= 0);
|
||||
|
||||
req.reply.send(rep);
|
||||
|
||||
ASSERT(proxyItr->second.latestRequestNum.get() == req.requestNum - 1);
|
||||
|
@ -167,6 +224,39 @@ ACTOR Future<Void> provideVersions(Reference<MasterData> self) {
	}
}

void updateLiveCommittedVersion(Reference<MasterData> self, ReportRawCommittedVersionRequest req) {
	self->minKnownCommittedVersion = std::max(self->minKnownCommittedVersion, req.minKnownCommittedVersion);

	if (req.version > self->liveCommittedVersion.get()) {
		if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && req.writtenTags.present()) {
			// TraceEvent("Received ReportRawCommittedVersionRequest").detail("Version",req.version);
			self->ssVersionVector.setVersion(req.writtenTags.get(), req.version);
			self->versionVectorTagUpdates += req.writtenTags.get().size();
		}
		auto curTime = now();
		// add debug here to change liveCommittedVersion to time bound of now()
		debug_advanceVersionTimestamp(self->liveCommittedVersion.get(), curTime + CLIENT_KNOBS->MAX_VERSION_CACHE_LAG);
		// also add req.version but with no time bound
		debug_advanceVersionTimestamp(req.version, std::numeric_limits<double>::max());
		self->databaseLocked = req.locked;
		self->proxyMetadataVersion = req.metadataVersion;
		// Note the set call switches context to any waiters on liveCommittedVersion before continuing.
		self->liveCommittedVersion.set(req.version);
	}
	++self->reportLiveCommittedVersionRequests;
}

ACTOR Future<Void> waitForPrev(Reference<MasterData> self, ReportRawCommittedVersionRequest req) {
	state double startTime = now();
	wait(self->liveCommittedVersion.whenAtLeast(req.prevVersion.get()));
	double latency = now() - startTime;
	self->waitForPrevLatencies.addMeasurement(latency);
	++self->waitForPrevCommitRequests;
	updateLiveCommittedVersion(self, req);
	req.reply.send(Void());
	return Void();
}

ACTOR Future<Void> serveLiveCommittedVersion(Reference<MasterData> self) {
	loop {
		choose {
@ -176,37 +266,34 @@ ACTOR Future<Void> serveLiveCommittedVersion(Reference<MasterData> self) {
|
|||
req.debugID.get().first(),
|
||||
"MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion");
|
||||
|
||||
if (self->liveCommittedVersion == invalidVersion) {
|
||||
self->liveCommittedVersion = self->recoveryTransactionVersion;
|
||||
if (self->liveCommittedVersion.get() == invalidVersion) {
|
||||
self->liveCommittedVersion.set(self->recoveryTransactionVersion);
|
||||
}
|
||||
++self->getLiveCommittedVersionRequests;
|
||||
GetRawCommittedVersionReply reply;
|
||||
reply.version = self->liveCommittedVersion;
|
||||
reply.version = self->liveCommittedVersion.get();
|
||||
reply.locked = self->databaseLocked;
|
||||
reply.metadataVersion = self->proxyMetadataVersion;
|
||||
reply.minKnownCommittedVersion = self->minKnownCommittedVersion;
|
||||
self->ssVersionVector.getDelta(req.maxVersion, reply.ssVersionVectorDelta);
|
||||
self->versionVectorSizeOnCVReply.addMeasurement(reply.ssVersionVectorDelta.size());
|
||||
req.reply.send(reply);
|
||||
}
|
||||
when(ReportRawCommittedVersionRequest req =
|
||||
waitNext(self->myInterface.reportLiveCommittedVersion.getFuture())) {
|
||||
self->minKnownCommittedVersion = std::max(self->minKnownCommittedVersion, req.minKnownCommittedVersion);
|
||||
if (req.version > self->liveCommittedVersion) {
|
||||
auto curTime = now();
|
||||
// add debug here to change liveCommittedVersion to time bound of now()
|
||||
debug_advanceVersionTimestamp(self->liveCommittedVersion,
|
||||
curTime + CLIENT_KNOBS->MAX_VERSION_CACHE_LAG);
|
||||
// also add req.version but with no time bound
|
||||
debug_advanceVersionTimestamp(req.version, std::numeric_limits<double>::max());
|
||||
self->liveCommittedVersion = req.version;
|
||||
self->databaseLocked = req.locked;
|
||||
self->proxyMetadataVersion = req.metadataVersion;
|
||||
}
|
||||
++self->reportLiveCommittedVersionRequests;
|
||||
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && req.prevVersion.present() &&
|
||||
(self->liveCommittedVersion.get() != invalidVersion) &&
|
||||
(self->liveCommittedVersion.get() < req.prevVersion.get())) {
|
||||
self->addActor.send(waitForPrev(self, req));
|
||||
} else {
|
||||
updateLiveCommittedVersion(self, req);
|
||||
++self->nonWaitForPrevCommitRequests;
|
||||
req.reply.send(Void());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
|
||||
loop {
|
||||
|
@ -214,7 +301,8 @@ ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
|
|||
TraceEvent("UpdateRecoveryData", self->dbgid)
|
||||
.detail("RecoveryTxnVersion", req.recoveryTransactionVersion)
|
||||
.detail("LastEpochEnd", req.lastEpochEnd)
|
||||
.detail("NumCommitProxies", req.commitProxies.size());
|
||||
.detail("NumCommitProxies", req.commitProxies.size())
|
||||
.detail("VersionEpoch", req.versionEpoch);
|
||||
|
||||
if (self->recoveryTransactionVersion == invalidVersion ||
|
||||
req.recoveryTransactionVersion > self->recoveryTransactionVersion) {
|
||||
|
@ -230,6 +318,16 @@ ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
|
|||
self->lastCommitProxyVersionReplies[p.id()] = CommitProxyVersionReplies();
|
||||
}
|
||||
}
|
||||
if (req.versionEpoch.present()) {
|
||||
self->referenceVersion = req.versionEpoch.get();
|
||||
} else if (BUGGIFY) {
|
||||
// Cannot use a positive version epoch in simulation because of the
|
||||
// clock starting at 0. A positive version epoch would mean the initial
|
||||
// cluster version was negative.
|
||||
// TODO: Increase the size of this interval after fixing the issue
|
||||
// with restoring ranges with large version gaps.
|
||||
self->referenceVersion = deterministicRandom()->randomInt64(-1e6, 0);
|
||||
}
|
||||
|
||||
self->resolutionBalancer.setCommitProxies(req.commitProxies);
|
||||
self->resolutionBalancer.setResolvers(req.resolvers);
|
||||
|
@ -279,8 +377,8 @@ ACTOR Future<Void> masterServer(MasterInterface mi,
|
|||
|
||||
state Future<Void> onDBChange = Void();
|
||||
state PromiseStream<Future<Void>> addActor;
|
||||
state Reference<MasterData> self(
|
||||
new MasterData(db, mi, coordinators, db->get().clusterInterface, LiteralStringRef(""), forceRecovery));
|
||||
state Reference<MasterData> self(new MasterData(
|
||||
db, mi, coordinators, db->get().clusterInterface, LiteralStringRef(""), addActor, forceRecovery));
|
||||
state Future<Void> collection = actorCollection(addActor.getFuture());
|
||||
|
||||
addActor.send(traceRole(Role::MASTER, mi.id()));
|
||||
|
|
|
@ -440,6 +440,8 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
|
|||
Version metadataCreateVersion = invalidVersion;
|
||||
|
||||
bool removing = false;
|
||||
bool destroyed = false;
|
||||
bool possiblyDestroyed = false;
|
||||
|
||||
KeyRangeMap<std::unordered_map<UID, Promise<Void>>> moveTriggers;
|
||||
|
||||
|
@ -472,6 +474,13 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
|
|||
}
|
||||
// TODO: may be more cleanup possible here
|
||||
}
|
||||
|
||||
void destroy(Version destroyVersion) {
|
||||
removing = true;
|
||||
destroyed = true;
|
||||
moved(range);
|
||||
newMutations.trigger();
|
||||
}
|
||||
};
|
||||
|
||||
class ServerWatchMetadata : public ReferenceCounted<ServerWatchMetadata> {
|
||||
|
@ -795,6 +804,9 @@ public:
|
|||
Reference<ILogSystem::IPeekCursor> logCursor;
|
||||
|
||||
Promise<UID> clusterId;
|
||||
// The version the cluster starts on. This value is not persisted and may
|
||||
// not be valid after a recovery.
|
||||
Version initialClusterVersion = invalidVersion;
|
||||
UID thisServerID;
|
||||
Optional<UID> tssPairID; // if this server is a tss, this is the id of its (ss) pair
|
||||
Optional<UID> ssPairID; // if this server is an ss, this is the id of its (tss) pair
|
||||
|
@ -1404,6 +1416,23 @@ ACTOR Future<Version> waitForVersionActor(StorageServer* data, Version version,
	}
}

// If the latest commit version that mutated the shard(s) being served by the specified storage
// server is below the client specified read version then do a read at the latest commit version
// of the storage server.
Version getRealReadVersion(VersionVector& ssLatestCommitVersions, Tag& tag, Version specifiedReadVersion) {
	Version realReadVersion =
	    ssLatestCommitVersions.hasVersion(tag) ? ssLatestCommitVersions.getVersion(tag) : specifiedReadVersion;
	ASSERT(realReadVersion <= specifiedReadVersion);
	return realReadVersion;
}

// Find the latest commit version of the given tag.
Version getLatestCommitVersion(VersionVector& ssLatestCommitVersions, Tag& tag) {
	Version commitVersion =
	    ssLatestCommitVersions.hasVersion(tag) ? ssLatestCommitVersions.getVersion(tag) : invalidVersion;
	return commitVersion;
}

Future<Version> waitForVersion(StorageServer* data, Version version, SpanID spanContext) {
	if (version == latestVersion) {
		version = std::max(Version(1), data->version.get());

@ -1425,6 +1454,37 @@ Future<Version> waitForVersion(StorageServer* data, Version version, SpanID span
	return waitForVersionActor(data, version, spanContext);
}

Future<Version> waitForVersion(StorageServer* data, Version commitVersion, Version readVersion, SpanID spanContext) {
	ASSERT(commitVersion == invalidVersion || commitVersion < readVersion);

	if (commitVersion == invalidVersion) {
		return waitForVersion(data, readVersion, spanContext);
	}

	if (readVersion == latestVersion) {
		readVersion = std::max(Version(1), data->version.get());
	}

	if (readVersion < data->oldestVersion.get() || readVersion <= 0) {
		return transaction_too_old();
	} else {
		if (commitVersion < data->oldestVersion.get()) {
			return data->oldestVersion.get();
		} else if (commitVersion <= data->version.get()) {
			return commitVersion;
		}
	}

	if ((data->behind || data->versionBehind) && commitVersion > data->version.get()) {
		return process_behind();
	}

	if (deterministicRandom()->random01() < 0.001) {
		TraceEvent("WaitForVersion1000x");
	}
	return waitForVersionActor(data, std::max(commitVersion, data->oldestVersion.get()), spanContext);
}

ACTOR Future<Version> waitForVersionNoTooOld(StorageServer* data, Version version) {
	// This could become an Actor transparently, but for now it just does the lookup
	if (version == latestVersion)
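
A standalone sketch (not FDB code) of how the tag's latest commit version from the request's version vector changes what the storage server has to wait for: if the last commit that wrote to this server's tag is already applied, the read can proceed even though the client's read version is newer. The VersionVectorSketch stand-in, the tag id and the version numbers are assumptions for illustration.

#include <cstdint>
#include <iostream>
#include <unordered_map>

using Version = int64_t;
constexpr Version invalidVersion = -1;

struct VersionVectorSketch {
    std::unordered_map<int, Version> tagVersions; // tag id -> latest commit version for that tag
    bool hasVersion(int tag) const { return tagVersions.count(tag) > 0; }
    Version getVersion(int tag) const { return tagVersions.at(tag); }
};

Version getLatestCommitVersionSketch(const VersionVectorSketch& vv, int tag) {
    return vv.hasVersion(tag) ? vv.getVersion(tag) : invalidVersion;
}

int main() {
    VersionVectorSketch vv;
    vv.tagVersions[3] = 950; // last commit that actually wrote to this server's tag

    Version readVersion = 1000; // client's read version
    Version ssVersion = 960;    // how far this storage server has applied mutations

    Version commitVersion = getLatestCommitVersionSketch(vv, 3);
    // With the version vector, the server only needs to have caught up to commitVersion (950),
    // not to the full read version (1000), so this read can be served without waiting.
    bool canServeNow = (commitVersion != invalidVersion && commitVersion <= ssVersion);
    std::cout << "commitVersion=" << commitVersion << " canServeNow=" << canServeNow << "\n";
}
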
@ -1493,7 +1553,8 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
|
|||
"getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
|
||||
|
||||
state Optional<Value> v;
|
||||
state Version version = wait(waitForVersion(data, req.version, req.spanContext));
|
||||
Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag);
|
||||
state Version version = wait(waitForVersion(data, commitVersion, req.version, req.spanContext));
|
||||
if (req.debugID.present())
|
||||
g_traceBatch.addEvent("GetValueDebug",
|
||||
req.debugID.get().first(),
|
||||
|
@ -1629,7 +1690,7 @@ ACTOR Future<Version> watchWaitForValueChange(StorageServer* data, SpanID parent
|
|||
TEST(latest >= minVersion &&
|
||||
latest < data->data().latestVersion); // Starting watch loop with latestVersion > data->version
|
||||
GetValueRequest getReq(
|
||||
span.context, TenantInfo(), metadata->key, latest, metadata->tags, metadata->debugID);
|
||||
span.context, TenantInfo(), metadata->key, latest, metadata->tags, metadata->debugID, VersionVector());
|
||||
state Future<Void> getValue = getValueQ(
|
||||
data, getReq); // we are relying on the delay zero at the top of getValueQ, if removed we need one here
|
||||
GetValueReply reply = wait(getReq.reply.getFuture());
|
||||
|
@ -1908,6 +1969,12 @@ ACTOR Future<Void> overlappingChangeFeedsQ(StorageServer* data, OverlappingChang
|
|||
for (auto& it : rangeIds) {
|
||||
reply.rangeIds.push_back(OverlappingChangeFeedEntry(
|
||||
it.first, std::get<0>(it.second), std::get<1>(it.second), std::get<2>(it.second)));
|
||||
TraceEvent(SevDebug, "OverlappingChangeFeedEntry", data->thisServerID)
|
||||
.detail("MinVersion", req.minVersion)
|
||||
.detail("FeedID", it.first)
|
||||
.detail("Range", std::get<0>(it.second))
|
||||
.detail("EmptyVersion", std::get<1>(it.second))
|
||||
.detail("StopVersion", std::get<2>(it.second));
|
||||
}
|
||||
|
||||
// Make sure all of the metadata we are sending won't get rolled back
|
||||
|
@ -2751,7 +2818,8 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
|
|||
key,
|
||||
version,
|
||||
pOriginalReq->tags,
|
||||
pOriginalReq->debugID);
|
||||
pOriginalReq->debugID,
|
||||
VersionVector());
|
||||
// Note that it does not use readGuard to avoid server being overloaded here. Throttling is enforced at the
|
||||
// original request level, rather than individual underlying lookups. The reason is that throttle any
|
||||
// individual underlying lookup will fail the original request, which is not productive.
|
||||
|
@ -3178,7 +3246,9 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
|
|||
try {
|
||||
if (req.debugID.present())
|
||||
g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before");
|
||||
state Version version = wait(waitForVersion(data, req.version, span.context));
|
||||
|
||||
Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag);
|
||||
state Version version = wait(waitForVersion(data, commitVersion, req.version, span.context));
|
||||
|
||||
state Optional<TenantMapEntry> tenantEntry = data->getTenantEntry(version, req.tenantInfo);
|
||||
state Optional<Key> tenantPrefix = tenantEntry.map<Key>([](TenantMapEntry e) { return e.prefix; });
|
||||
|
@ -3359,6 +3429,7 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
|
|||
req.limitBytes = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT_BYTES;
|
||||
req.isFetchKeys = false;
|
||||
req.tags = pOriginalReq->tags;
|
||||
req.ssLatestCommitVersions = VersionVector();
|
||||
req.debugID = pOriginalReq->debugID;
|
||||
|
||||
// Note that it does not use readGuard to avoid server being overloaded here. Throttling is enforced at the
|
||||
|
@ -3669,7 +3740,9 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
|
|||
if (req.debugID.present())
|
||||
g_traceBatch.addEvent(
|
||||
"TransactionDebug", req.debugID.get().first(), "storageserver.getMappedKeyValues.Before");
|
||||
state Version version = wait(waitForVersion(data, req.version, span.context));
|
||||
// VERSION_VECTOR change
|
||||
Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag);
|
||||
state Version version = wait(waitForVersion(data, commitVersion, req.version, span.context));
|
||||
|
||||
state Optional<TenantMapEntry> tenantEntry = data->getTenantEntry(req.version, req.tenantInfo);
|
||||
state Optional<Key> tenantPrefix = tenantEntry.map<Key>([](TenantMapEntry e) { return e.prefix; });
|
||||
|
@ -3880,7 +3953,9 @@ ACTOR Future<Void> getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe
|
|||
if (req.debugID.present())
|
||||
g_traceBatch.addEvent(
|
||||
"TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesStream.Before");
|
||||
state Version version = wait(waitForVersion(data, req.version, span.context));
|
||||
|
||||
Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag);
|
||||
state Version version = wait(waitForVersion(data, commitVersion, req.version, span.context));
|
||||
|
||||
state Optional<TenantMapEntry> tenantEntry = data->getTenantEntry(version, req.tenantInfo);
|
||||
state Optional<Key> tenantPrefix = tenantEntry.map<Key>([](TenantMapEntry e) { return e.prefix; });
|
||||
|
@ -4072,7 +4147,8 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
|
|||
wait(data->getQueryDelay());
|
||||
|
||||
try {
|
||||
state Version version = wait(waitForVersion(data, req.version, req.spanContext));
|
||||
Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag);
|
||||
state Version version = wait(waitForVersion(data, commitVersion, req.version, req.spanContext));
|
||||
|
||||
state Optional<TenantMapEntry> tenantEntry = data->getTenantEntry(version, req.tenantInfo);
|
||||
if (tenantEntry.present()) {
|
||||
|
@ -4699,6 +4775,9 @@ ACTOR Future<Void> tryGetRange(PromiseStream<RangeResult> results, Transaction*
|
|||
}
|
||||
}
|
||||
|
||||
// global validation that missing refreshed feeds were previously destroyed
|
||||
static std::unordered_set<Key> allDestroyedChangeFeeds;
|
||||
|
||||
// We have to store the version the change feed was stopped at in the SS instead of just the stopped status
|
||||
// In addition to simplifying stopping logic, it enables communicating stopped status when fetching change feeds
|
||||
// from other SS correctly
|
||||
|
@ -4739,13 +4818,14 @@ ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req)
|
|||
.detail("RangeID", req.rangeID.printable())
|
||||
.detail("Version", req.version)
|
||||
.detail("SSVersion", self->version.get())
|
||||
.detail("Range", req.range.toString());
|
||||
.detail("Range", req.range);
|
||||
|
||||
if (req.version - 1 > feed->second->emptyVersion) {
|
||||
feed->second->emptyVersion = req.version - 1;
|
||||
while (!feed->second->mutations.empty() && feed->second->mutations.front().version < req.version) {
|
||||
feed->second->mutations.pop_front();
|
||||
}
|
||||
if (!feed->second->destroyed) {
|
||||
Version durableVersion = self->data().getLatestVersion();
|
||||
auto& mLV = self->addVersionToMutationLog(durableVersion);
|
||||
self->addMutationToMutationLog(
|
||||
|
@ -4767,6 +4847,7 @@ ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req)
|
|||
}
|
||||
wait(self->durableVersion.whenAtLeast(durableVersion));
|
||||
}
|
||||
}
|
||||
req.reply.send(Void());
|
||||
return Void();
|
||||
}
|
||||
|
@ -4944,7 +5025,9 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
|
|||
.errorUnsuppressed(e)
|
||||
.detail("RangeID", rangeId.printable())
|
||||
.detail("Range", range.toString())
|
||||
.detail("EndVersion", endVersion);
|
||||
.detail("EndVersion", endVersion)
|
||||
.detail("Removing", changeFeedInfo->removing)
|
||||
.detail("Destroyed", changeFeedInfo->destroyed);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
@ -5041,6 +5124,7 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
|
|||
}
|
||||
}
|
||||
|
||||
state bool seenNotRegistered = false;
|
||||
loop {
|
||||
try {
|
||||
Version maxFetched = wait(fetchChangeFeedApplier(data,
|
||||
|
@ -5057,19 +5141,110 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
|
|||
throw;
|
||||
}
|
||||
}
|
||||
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
|
||||
|
||||
// TODO REMOVE
|
||||
fmt::print("DBG: SS {} Feed {} possibly destroyed {}, {} metadata create, {} desired committed\n",
|
||||
data->thisServerID.toString().substr(0, 4),
|
||||
changeFeedInfo->id.printable(),
|
||||
changeFeedInfo->possiblyDestroyed,
|
||||
changeFeedInfo->metadataCreateVersion,
|
||||
data->desiredOldestVersion.get());
|
||||
|
||||
// There are two reasons for change_feed_not_registered:
|
||||
// 1. The feed was just created, but the ss mutation stream is ahead of the GRV that fetchChangeFeedApplier
|
||||
// uses to read the change feed data from the database. In this case we need to wait and retry
|
||||
// 2. The feed was destroyed, but we missed a metadata update telling us this. In this case we need to destroy
|
||||
// the feed
|
||||
// endVersion >= the metadata create version, so we can safely use it as a proxy
|
||||
if (beginVersion != 0 || seenNotRegistered || endVersion <= data->desiredOldestVersion.get()) {
|
||||
// If any of these are true, the feed must be destroyed.
|
||||
Version cleanupVersion = data->data().getLatestVersion();
|
||||
|
||||
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetch", data->thisServerID)
|
||||
.detail("RangeID", changeFeedInfo->id.printable())
|
||||
.detail("Range", changeFeedInfo->range.toString())
|
||||
.detail("Version", cleanupVersion);
|
||||
|
||||
if (g_network->isSimulated()) {
|
||||
ASSERT(allDestroyedChangeFeeds.count(changeFeedInfo->id));
|
||||
}
|
||||
|
||||
Key beginClearKey = changeFeedInfo->id.withPrefix(persistChangeFeedKeys.begin);
|
||||
|
||||
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
|
||||
data->addMutationToMutationLog(
|
||||
mLV, MutationRef(MutationRef::ClearRange, beginClearKey, keyAfter(beginClearKey)));
|
||||
++data->counters.kvSystemClearRanges;
|
||||
data->addMutationToMutationLog(mLV,
|
||||
MutationRef(MutationRef::ClearRange,
|
||||
changeFeedDurableKey(changeFeedInfo->id, 0),
|
||||
changeFeedDurableKey(changeFeedInfo->id, cleanupVersion)));
|
||||
++data->counters.kvSystemClearRanges;
|
||||
|
||||
changeFeedInfo->destroy(cleanupVersion);
|
||||
data->changeFeedCleanupDurable[changeFeedInfo->id] = cleanupVersion;
|
||||
|
||||
for (auto& it : data->changeFeedRemovals) {
|
||||
it.second.send(changeFeedInfo->id);
|
||||
}
|
||||
|
||||
return invalidVersion;
|
||||
}
|
||||
|
||||
// otherwise assume the feed just hasn't been created on the SS we tried to read it from yet, wait for it to
|
||||
// definitely be committed and retry
|
||||
seenNotRegistered = true;
|
||||
wait(data->desiredOldestVersion.whenAtLeast(endVersion));
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
|
||||
KeyRange keys,
|
||||
Version fetchVersion,
|
||||
PromiseStream<Key> removals) {
|
||||
PromiseStream<Key> removals,
|
||||
UID fetchKeysID) {
|
||||
|
||||
// Wait for current TLog batch to finish to ensure that we're fetching metadata at a version >= the version of the
|
||||
// ChangeServerKeys mutation. This guarantees we don't miss any metadata between the previous batch's version
|
||||
// (data->version) and the mutation version.
|
||||
wait(data->version.whenAtLeast(data->version.get() + 1));
|
||||
state Version fetchVersion = data->version.get();
|
||||
|
||||
TraceEvent(SevDebug, "FetchChangeFeedMetadata", data->thisServerID)
|
||||
.detail("Range", keys.toString())
|
||||
.detail("FetchVersion", fetchVersion);
|
||||
state std::vector<OverlappingChangeFeedEntry> feeds =
|
||||
wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion + 1));
|
||||
.detail("Range", keys)
|
||||
.detail("FetchVersion", fetchVersion)
|
||||
.detail("FKID", fetchKeysID);
|
||||
|
||||
state std::set<Key> refreshedFeedIds;
|
||||
state std::set<Key> destroyedFeedIds;
|
||||
// before fetching feeds from other SS's, refresh any feeds we already have that are being marked as removed
|
||||
auto ranges = data->keyChangeFeed.intersectingRanges(keys);
|
||||
for (auto& r : ranges) {
|
||||
for (auto& cfInfo : r.value()) {
|
||||
auto feedCleanup = data->changeFeedCleanupDurable.find(cfInfo->id);
|
||||
if (feedCleanup != data->changeFeedCleanupDurable.end() && cfInfo->removing && !cfInfo->destroyed) {
|
||||
TEST(true); // re-fetching feed scheduled for deletion! Un-mark it as removing
|
||||
destroyedFeedIds.insert(cfInfo->id);
|
||||
|
||||
cfInfo->removing = false;
|
||||
// because we now have a gap in the metadata, it's possible this feed was destroyed
|
||||
cfInfo->possiblyDestroyed = true;
|
||||
// reset fetch versions because everything previously fetched was cleaned up
|
||||
cfInfo->fetchVersion = invalidVersion;
|
||||
cfInfo->durableFetchVersion = NotifiedVersion();
|
||||
|
||||
TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID)
|
||||
.detail("RangeID", cfInfo->id.printable())
|
||||
.detail("Range", cfInfo->range)
|
||||
.detail("FetchVersion", fetchVersion)
|
||||
.detail("EmptyVersion", cfInfo->emptyVersion)
|
||||
.detail("StopVersion", cfInfo->stopVersion)
|
||||
.detail("FKID", fetchKeysID);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
state std::vector<OverlappingChangeFeedEntry> feeds = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion));
|
||||
// handle change feeds removed while fetching overlapping
|
||||
while (removals.getFuture().isReady()) {
|
||||
Key remove = waitNext(removals.getFuture());
|
||||
for (int i = 0; i < feeds.size(); i++) {
|
||||
|
@ -5078,6 +5253,7 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Key> feedIds;
|
||||
feedIds.reserve(feeds.size());
|
||||
// create change feed metadata if it does not exist
|
||||
|
@ -5090,16 +5266,23 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
|
|||
|
||||
TraceEvent(SevDebug, "FetchedChangeFeedInfo", data->thisServerID)
|
||||
.detail("RangeID", cfEntry.rangeId.printable())
|
||||
.detail("Range", cfEntry.range.toString())
|
||||
.detail("Range", cfEntry.range)
|
||||
.detail("FetchVersion", fetchVersion)
|
||||
.detail("EmptyVersion", cfEntry.emptyVersion)
|
||||
.detail("StopVersion", cfEntry.stopVersion)
|
||||
.detail("Existing", existing)
|
||||
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion);
|
||||
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion)
|
||||
.detail("FKID", fetchKeysID);
|
||||
|
||||
bool addMutationToLog = false;
|
||||
Reference<ChangeFeedInfo> changeFeedInfo;
|
||||
|
||||
auto fid = destroyedFeedIds.find(cfEntry.rangeId);
|
||||
if (fid != destroyedFeedIds.end()) {
|
||||
refreshedFeedIds.insert(cfEntry.rangeId);
|
||||
destroyedFeedIds.erase(fid);
|
||||
}
|
||||
|
||||
if (!existing) {
|
||||
TEST(cleanupPending); // Fetch change feed which is cleanup pending. This means there was a move away and a
|
||||
// move back, this will remake the metadata
|
||||
|
@ -5120,30 +5303,26 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
|
|||
addMutationToLog = true;
|
||||
} else {
|
||||
changeFeedInfo = existingEntry->second;
|
||||
auto feedCleanup = data->changeFeedCleanupDurable.find(cfEntry.rangeId);
|
||||
|
||||
if (changeFeedInfo->destroyed) {
|
||||
// race where multiple feeds fetched overlapping change feed, one realized feed was missing and marked
|
||||
// it removed+destroyed, then this one fetched the same info
|
||||
continue;
|
||||
}
|
||||
|
||||
// we checked all feeds we already owned in this range at the start to reset them if they were removing, and
|
||||
// this actor would have been cancelled if a later remove happened
|
||||
ASSERT(!changeFeedInfo->removing);
|
||||
if (cfEntry.stopVersion < changeFeedInfo->stopVersion) {
|
||||
TEST(true); // Change feed updated stop version from fetch metadata
|
||||
changeFeedInfo->stopVersion = cfEntry.stopVersion;
|
||||
addMutationToLog = true;
|
||||
}
|
||||
|
||||
if (feedCleanup != data->changeFeedCleanupDurable.end() && changeFeedInfo->removing) {
|
||||
TEST(true); // re-fetching feed scheduled for deletion! Un-mark it as removing
|
||||
if (cfEntry.emptyVersion < data->version.get()) {
|
||||
// don't update empty version past SS version if SS is behind, it can cause issues
|
||||
if (cfEntry.emptyVersion < data->version.get() && cfEntry.emptyVersion > changeFeedInfo->emptyVersion) {
|
||||
TEST(true); // Change feed updated empty version from fetch metadata
|
||||
changeFeedInfo->emptyVersion = cfEntry.emptyVersion;
|
||||
}
|
||||
|
||||
changeFeedInfo->removing = false;
|
||||
// reset fetch versions because everything previously fetched was cleaned up
|
||||
changeFeedInfo->fetchVersion = invalidVersion;
|
||||
changeFeedInfo->durableFetchVersion = NotifiedVersion();
|
||||
|
||||
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
|
||||
// it
|
||||
// We may just want to refactor this so updateStorage does explicit deletes based on
|
||||
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
|
||||
// Then we wouldn't have to reset anything here
|
||||
addMutationToLog = true;
|
||||
}
|
||||
}
|
||||
|
@ -5163,6 +5342,84 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(!refreshedFeedIds.empty()); // Feed refreshed between move away and move back
|
||||
TEST(!destroyedFeedIds.empty()); // Feed destroyed between move away and move back
|
||||
for (auto& feedId : refreshedFeedIds) {
|
||||
auto existingEntry = data->uidChangeFeed.find(feedId);
|
||||
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
|
||||
TEST(true); // feed refreshed
|
||||
continue;
|
||||
}
|
||||
|
||||
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
|
||||
// it
|
||||
// We may just want to refactor this so updateStorage does explicit deletes based on
|
||||
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
|
||||
// Then we wouldn't have to reset anything here or above
|
||||
// Do the mutation log update here instead of above to ensure we only add it back to the mutation log if we're
|
||||
// sure it wasn't deleted in the metadata gap
|
||||
Version metadataVersion = data->data().getLatestVersion();
|
||||
auto& mLV = data->addVersionToMutationLog(metadataVersion);
|
||||
data->addMutationToMutationLog(
|
||||
mLV,
|
||||
MutationRef(MutationRef::SetValue,
|
||||
persistChangeFeedKeys.begin.toString() + existingEntry->second->id.toString(),
|
||||
changeFeedSSValue(existingEntry->second->range,
|
||||
existingEntry->second->emptyVersion + 1,
|
||||
existingEntry->second->stopVersion)));
|
||||
TraceEvent(SevDebug, "PersistingResetChangeFeedInfo", data->thisServerID)
|
||||
.detail("RangeID", existingEntry->second->id.printable())
|
||||
.detail("Range", existingEntry->second->range)
|
||||
.detail("FetchVersion", fetchVersion)
|
||||
.detail("EmptyVersion", existingEntry->second->emptyVersion)
|
||||
.detail("StopVersion", existingEntry->second->stopVersion)
|
||||
.detail("FKID", fetchKeysID)
|
||||
.detail("MetadataVersion", metadataVersion);
|
||||
}
|
||||
for (auto& feedId : destroyedFeedIds) {
|
||||
auto existingEntry = data->uidChangeFeed.find(feedId);
|
||||
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
|
||||
TEST(true); // feed refreshed but then destroyed elsewhere
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO REMOVE print
|
||||
fmt::print("DBG: SS {} fetching feed {} was refreshed but not present!! assuming destroyed\n",
|
||||
data->thisServerID.toString().substr(0, 4),
|
||||
feedId.printable());
|
||||
|
||||
Version cleanupVersion = data->data().getLatestVersion();
|
||||
|
||||
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetchMetadata", data->thisServerID)
|
||||
.detail("RangeID", feedId.printable())
|
||||
.detail("Range", existingEntry->second->range)
|
||||
.detail("Version", cleanupVersion)
|
||||
.detail("FKID", fetchKeysID);
|
||||
|
||||
if (g_network->isSimulated()) {
|
||||
ASSERT(allDestroyedChangeFeeds.count(feedId));
|
||||
}
|
||||
|
||||
Key beginClearKey = feedId.withPrefix(persistChangeFeedKeys.begin);
|
||||
|
||||
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
|
||||
data->addMutationToMutationLog(mLV,
|
||||
MutationRef(MutationRef::ClearRange, beginClearKey, keyAfter(beginClearKey)));
|
||||
++data->counters.kvSystemClearRanges;
|
||||
data->addMutationToMutationLog(mLV,
|
||||
MutationRef(MutationRef::ClearRange,
|
||||
changeFeedDurableKey(feedId, 0),
|
||||
changeFeedDurableKey(feedId, cleanupVersion)));
|
||||
++data->counters.kvSystemClearRanges;
|
||||
|
||||
existingEntry->second->destroy(cleanupVersion);
|
||||
data->changeFeedCleanupDurable[feedId] = cleanupVersion;
|
||||
|
||||
for (auto& it : data->changeFeedRemovals) {
|
||||
it.second.send(feedId);
|
||||
}
|
||||
}
|
||||
return feedIds;
|
||||
}
|
||||
|
||||
|
@ -5218,7 +5475,6 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
|
|||
}
|
||||
}
|
||||
if (done) {
|
||||
data->changeFeedRemovals.erase(fetchKeysID);
|
||||
return feedMaxFetched;
|
||||
}
|
||||
}
|
||||
|
@ -5283,8 +5539,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
|
|||
|
||||
state PromiseStream<Key> removals;
data->changeFeedRemovals[fetchKeysID] = removals;
state Future<std::vector<Key>> fetchCFMetadata =
fetchChangeFeedMetadata(data, keys, data->version.get(), removals);
state Future<std::vector<Key>> fetchCFMetadata = fetchChangeFeedMetadata(data, keys, removals, fetchKeysID);

validate(data);
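// How the removals stream is wired (illustrative summary of code elsewhere in this diff, not new behavior):
// fetchKeys registers a PromiseStream<Key> under its fetchKeysID so that any actor destroying a change feed can
// notify in-flight fetches, and fetchChangeFeedMetadata drains that stream while it works. A minimal sketch:
//
//   data->changeFeedRemovals[fetchKeysID] = removals;        // register (above)
//   for (auto& it : data->changeFeedRemovals) {
//       it.second.send(feedId);                              // sender side: a feed was destroyed elsewhere
//   }
//   while (removals.getFuture().isReady()) {
//       Key remove = waitNext(removals.getFuture());         // receiver side: drop that feed from `feeds`
//   }
//   data->changeFeedRemovals.erase(fetchKeysID);             // unregister when fetchKeys finishes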
|
||||
|
||||
|
@ -5629,6 +5884,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
|
|||
}
|
||||
}
|
||||
|
||||
data->changeFeedRemovals.erase(fetchKeysID);
|
||||
|
||||
shard->phase = AddingShard::Waiting;
|
||||
|
||||
// Similar to transferred version, but wait for all feed data and
|
||||
|
@ -5849,7 +6106,8 @@ void changeServerKeys(StorageServer* data,
|
|||
data->watches.triggerRange(range.begin, range.end);
} else if (!dataAvailable) {
// SOMEDAY: Avoid restarting adding/transferred shards
if (version == 0) { // bypass fetchkeys; shard is known empty at version 0
// bypass fetchkeys; shard is known empty at initial cluster version
if (version == data->initialClusterVersion - 1) {
TraceEvent("ChangeServerKeysInitialRange", data->thisServerID)
.detail("Begin", range.begin)
.detail("End", range.end);
|
||||
|
@ -5940,7 +6198,6 @@ void changeServerKeys(StorageServer* data,
|
|||
|
||||
auto feed = data->uidChangeFeed.find(f.first);
|
||||
if (feed != data->uidChangeFeed.end()) {
|
||||
feed->second->emptyVersion = version - 1;
|
||||
feed->second->removing = true;
|
||||
feed->second->moved(feed->second->range);
|
||||
feed->second->newMutations.trigger();
|
||||
|
@ -6039,7 +6296,7 @@ public:
|
|||
} else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(checkpointPrefix)) {
|
||||
registerPendingCheckpoint(data, m, ver);
|
||||
} else {
|
||||
applyPrivateData(data, m);
|
||||
applyPrivateData(data, ver, m);
|
||||
}
|
||||
} else {
|
||||
if (MUTATION_TRACKING_ENABLED) {
|
||||
|
@ -6067,8 +6324,8 @@ private:
|
|||
KeyRef cacheStartKey;
|
||||
bool processedCacheStartKey;
|
||||
|
||||
void applyPrivateData(StorageServer* data, MutationRef const& m) {
|
||||
TraceEvent(SevDebug, "SSPrivateMutation", data->thisServerID).detail("Mutation", m);
|
||||
void applyPrivateData(StorageServer* data, Version ver, MutationRef const& m) {
|
||||
TraceEvent(SevDebug, "SSPrivateMutation", data->thisServerID).detail("Mutation", m).detail("Version", ver);
|
||||
|
||||
if (processedStartKey) {
|
||||
// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
|
||||
|
@ -6242,8 +6499,11 @@ private:
|
|||
feed->second->durableVersion = invalidVersion;
|
||||
}
|
||||
}
|
||||
if (!feed->second->destroyed) {
|
||||
// if feed is destroyed, adding an extra mutation here would re-create it if SS restarted
|
||||
addMutationToLog = true;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (status == ChangeFeedStatus::CHANGE_FEED_CREATE && createdFeed) {
|
||||
TraceEvent(SevDebug, "CreatingChangeFeed", data->thisServerID)
|
||||
|
@ -6278,13 +6538,12 @@ private:
|
|||
changeFeedDurableKey(feed->second->id, currentVersion)));
|
||||
++data->counters.kvSystemClearRanges;
|
||||
|
||||
feed->second->emptyVersion = currentVersion - 1;
|
||||
feed->second->stopVersion = currentVersion;
|
||||
feed->second->removing = true;
|
||||
feed->second->moved(feed->second->range);
|
||||
feed->second->newMutations.trigger();
|
||||
|
||||
feed->second->destroy(currentVersion);
|
||||
data->changeFeedCleanupDurable[feed->first] = cleanupVersion;
|
||||
|
||||
if (g_network->isSimulated()) {
|
||||
allDestroyedChangeFeeds.insert(changeFeedId);
|
||||
}
|
||||
}
|
||||
|
||||
if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) {
|
||||
|
@ -6734,7 +6993,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
|
|||
.detail("Version", cloneCursor2->version().toString());
|
||||
} else if (ver != invalidVersion) { // This change belongs to a version < minVersion
|
||||
DEBUG_MUTATION("SSPeek", ver, msg, data->thisServerID);
|
||||
if (ver == 1) {
|
||||
if (ver == data->initialClusterVersion) {
|
||||
//TraceEvent("SSPeekMutation", data->thisServerID).log();
|
||||
// The following trace event may produce a value with special characters
|
||||
TraceEvent("SSPeekMutation", data->thisServerID)
|
||||
|
@ -6850,6 +7109,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
|
|||
proposedOldestVersion = std::min(proposedOldestVersion, data->version.get() - 1);
|
||||
proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get());
|
||||
proposedOldestVersion = std::max(proposedOldestVersion, data->desiredOldestVersion.get());
|
||||
proposedOldestVersion = std::max(proposedOldestVersion, data->initialClusterVersion);
|
||||
|
||||
//TraceEvent("StorageServerUpdated", data->thisServerID).detail("Ver", ver).detail("DataVersion", data->version.get())
|
||||
// .detail("LastTLogVersion", data->lastTLogVersion).detail("NewOldest",
|
||||
|
@ -8212,8 +8472,13 @@ ACTOR Future<Void> serveWatchValueRequestsImpl(StorageServer* self, FutureStream
|
|||
loop {
|
||||
try {
|
||||
state Version latest = self->version.get();
|
||||
GetValueRequest getReq(
|
||||
span.context, TenantInfo(), metadata->key, latest, metadata->tags, metadata->debugID);
|
||||
GetValueRequest getReq(span.context,
|
||||
TenantInfo(),
|
||||
metadata->key,
|
||||
latest,
|
||||
metadata->tags,
|
||||
metadata->debugID,
|
||||
VersionVector());
|
||||
state Future<Void> getValue = getValueQ(self, getReq);
|
||||
GetValueReply reply = wait(getReq.reply.getFuture());
|
||||
metadata = self->getWatchMetadata(req.key.contents());
|
||||
|
@ -8715,6 +8980,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
|
|||
StorageServerInterface ssi,
|
||||
Tag seedTag,
|
||||
UID clusterId,
|
||||
Version startVersion,
|
||||
Version tssSeedVersion,
|
||||
ReplyPromise<InitializeStorageReply> recruitReply,
|
||||
Reference<AsyncVar<ServerDBInfo> const> db,
|
||||
|
@ -8722,6 +8988,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
|
|||
state StorageServer self(persistentData, db, ssi);
|
||||
state Future<Void> ssCore;
|
||||
self.clusterId.send(clusterId);
|
||||
self.initialClusterVersion = startVersion;
|
||||
if (ssi.isTss()) {
|
||||
self.setTssPair(ssi.tssPairID.get());
|
||||
ASSERT(self.isTss());
@ -2183,6 +2183,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
|
|||
[&req](const auto& p) { return p.second != req.storeType; }) ||
|
||||
req.seedTag != invalidTag)) {
|
||||
ASSERT(req.clusterId.isValid());
|
||||
ASSERT(req.initialClusterVersion >= 0);
|
||||
LocalLineage _;
|
||||
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage;
|
||||
bool isTss = req.tssPairIDAndVersion.present();
|
||||
|
@ -2244,6 +2245,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
|
|||
recruited,
|
||||
req.seedTag,
|
||||
req.clusterId,
|
||||
req.initialClusterVersion,
|
||||
isTss ? req.tssPairIDAndVersion.get().second : 0,
|
||||
storageReady,
|
||||
dbInfo,
@ -77,6 +77,12 @@ struct TransactionWrapper : public ReferenceCounted<TransactionWrapper> {
|
|||
// Gets the committed version of a transaction
virtual Version getCommittedVersion() = 0;

// Gets the version vector cached in a transaction
virtual VersionVector getVersionVector() = 0;

// Gets the spanID of a transaction
virtual UID getSpanID() = 0;

// Prints debugging messages for a transaction; not implemented for all transaction types
virtual void debugTransaction(UID debugId) {}
|
||||
|
||||
|
@ -152,6 +158,12 @@ struct FlowTransactionWrapper : public TransactionWrapper {
|
|||
// Gets the committed version of a transaction
|
||||
Version getCommittedVersion() override { return transaction.getCommittedVersion(); }
|
||||
|
||||
// Gets the version vector cached in a transaction
|
||||
VersionVector getVersionVector() override { return transaction.getVersionVector(); }
|
||||
|
||||
// Gets the spanID of a transaction
|
||||
UID getSpanID() override { return transaction.getSpanID(); }
|
||||
|
||||
// Prints debugging messages for a transaction
|
||||
void debugTransaction(UID debugId) override { transaction.debugTransaction(debugId); }
|
||||
|
||||
|
@ -214,6 +226,12 @@ struct ThreadTransactionWrapper : public TransactionWrapper {
|
|||
// Gets the committed version of a transaction
|
||||
Version getCommittedVersion() override { return transaction->getCommittedVersion(); }
|
||||
|
||||
// Gets the version vector cached in a transaction
|
||||
VersionVector getVersionVector() override { return transaction->getVersionVector(); }
|
||||
|
||||
// Gets the spanID of a transaction
|
||||
UID getSpanID() override { return transaction->getSpanID(); }
|
||||
|
||||
void addReadConflictRange(KeyRangeRef const& keys) override { transaction->addReadConflictRange(keys); }
|
||||
};
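// Illustrative use of the new accessor (not part of this diff): a workload holding any TransactionWrapper can read
// the version vector the underlying transaction has cached, regardless of which concrete wrapper backs it.
//
//   Reference<TransactionWrapper> tr = ...;                 // elided; obtained from the workload's factory
//   VersionVector vv = tr->getVersionVector();              // new in this change
//   UID span = tr->getSpanID();
//   Version committed = tr->getCommittedVersion();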
@ -62,8 +62,9 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
int64_t timeTravelTooOld = 0;
int64_t rowsRead = 0;
int64_t bytesRead = 0;
int64_t purges = 0;
std::vector<Future<Void>> clients;
bool enablePruning;
bool enablePurging;

DatabaseConfiguration config;
|
||||
|
||||
|
@ -79,7 +80,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
timeTravelLimit = getOption(options, LiteralStringRef("timeTravelLimit"), testDuration);
|
||||
timeTravelBufferSize = getOption(options, LiteralStringRef("timeTravelBufferSize"), 100000000);
|
||||
threads = getOption(options, LiteralStringRef("threads"), 1);
|
||||
enablePruning = getOption(options, LiteralStringRef("enablePruning"), false /*sharedRandomNumber % 2 == 0*/);
|
||||
enablePurging = getOption(options, LiteralStringRef("enablePurging"), false /*sharedRandomNumber % 2 == 0*/);
|
||||
ASSERT(threads >= 1);
|
||||
|
||||
if (BGV_DEBUG) {
|
||||
|
@ -177,60 +178,6 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
OldRead(KeyRange range, Version v, RangeResult oldResult) : range(range), v(v), oldResult(oldResult) {}
|
||||
};
|
||||
|
||||
// utility to prune <range> at pruneVersion=<version> with the <force> flag
|
||||
ACTOR Future<Void> pruneAtVersion(Database cx, KeyRange range, Version version, bool force) {
|
||||
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
|
||||
state Key pruneKey;
|
||||
loop {
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
Value pruneValue = blobGranulePruneValueFor(version, range, force);
|
||||
tr->atomicOp(
|
||||
addVersionStampAtEnd(blobGranulePruneKeys.begin), pruneValue, MutationRef::SetVersionstampedKey);
|
||||
tr->set(blobGranulePruneChangeKey, deterministicRandom()->randomUniqueID().toString());
|
||||
state Future<Standalone<StringRef>> fTrVs = tr->getVersionstamp();
|
||||
wait(tr->commit());
|
||||
Standalone<StringRef> vs = wait(fTrVs);
|
||||
pruneKey = blobGranulePruneKeys.begin.withSuffix(vs);
|
||||
if (BGV_DEBUG) {
|
||||
fmt::print("pruneAtVersion for range [{0} - {1}) at version {2} succeeded\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
version);
|
||||
}
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
if (BGV_DEBUG) {
|
||||
fmt::print("pruneAtVersion for range [{0} - {1}) at version {2} encountered error {3}\n",
|
||||
range.begin.printable(),
|
||||
range.end.printable(),
|
||||
version,
|
||||
e.name());
|
||||
}
|
||||
wait(tr->onError(e));
|
||||
}
|
||||
}
|
||||
tr->reset();
|
||||
loop {
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
|
||||
Optional<Value> pruneVal = wait(tr->get(pruneKey));
|
||||
if (!pruneVal.present()) {
|
||||
return Void();
|
||||
}
|
||||
state Future<Void> watchFuture = tr->watch(pruneKey);
|
||||
wait(tr->commit());
|
||||
wait(watchFuture);
|
||||
} catch (Error& e) {
|
||||
wait(tr->onError(e));
|
||||
}
|
||||
}
|
||||
}
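// A minimal sketch of the replacement cleanup path used further down in verifyGranules: instead of writing the
// blobGranulePruneKeys system keys directly (as the removed pruneAtVersion helper above did), the workload now goes
// through the purge API on DatabaseContext. The helper name purgeAndWait is illustrative and not part of this diff.
ACTOR Future<Void> purgeAndWait(Database cx, KeyRange range, Version purgeVersion, bool force) {
	// purgeBlobGranules registers the purge and returns the key that tracks its progress
	state Key purgeKey = wait(cx->purgeBlobGranules(range, purgeVersion, force));
	// waitPurgeGranulesComplete returns once the granules covered by purgeKey have been purged
	wait(cx->waitPurgeGranulesComplete(purgeKey));
	return Void();
}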
|
||||
|
||||
ACTOR Future<Void> killBlobWorkers(Database cx, BlobGranuleVerifierWorkload* self) {
|
||||
state Transaction tr(cx);
|
||||
state std::set<UID> knownWorkers;
|
||||
|
@ -272,12 +219,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> verifyGranules(Database cx, BlobGranuleVerifierWorkload* self, bool allowPruning) {
|
||||
ACTOR Future<Void> verifyGranules(Database cx, BlobGranuleVerifierWorkload* self, bool allowPurging) {
|
||||
state double last = now();
|
||||
state double endTime = last + self->testDuration;
|
||||
state std::map<double, OldRead> timeTravelChecks;
|
||||
state int64_t timeTravelChecksMemory = 0;
|
||||
state Version prevPruneVersion = -1;
|
||||
state Version prevPurgeVersion = -1;
|
||||
state UID dbgId = debugRandom()->randomUniqueID();
|
||||
|
||||
TraceEvent("BlobGranuleVerifierStart");
|
||||
|
@ -300,25 +247,27 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
state OldRead oldRead = timeTravelIt->second;
|
||||
timeTravelChecksMemory -= oldRead.oldResult.expectedSize();
|
||||
timeTravelIt = timeTravelChecks.erase(timeTravelIt);
|
||||
if (prevPruneVersion == -1) {
|
||||
prevPruneVersion = oldRead.v;
|
||||
if (prevPurgeVersion == -1) {
|
||||
prevPurgeVersion = oldRead.v;
|
||||
}
|
||||
// advance iterator before doing read, so if it gets error we don't retry it
|
||||
|
||||
try {
state Version newPruneVersion = 0;
state bool doPruning = allowPruning && deterministicRandom()->random01() < 0.5;
if (doPruning) {
Version maxPruneVersion = oldRead.v;
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPruneVersion = std::min(it.second.v, maxPruneVersion);
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPruneVersion < maxPruneVersion) {
newPruneVersion = deterministicRandom()->randomInt64(prevPruneVersion, maxPruneVersion);
prevPruneVersion = std::max(prevPruneVersion, newPruneVersion);
wait(self->pruneAtVersion(cx, normalKeys, newPruneVersion, false));
if (prevPurgeVersion < maxPurgeVersion) {
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
self->purges++;
} else {
doPruning = false;
doPurging = false;
}
}
|
||||
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> reReadResult =
|
||||
|
@ -328,12 +277,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
}
|
||||
self->timeTravelReads++;
|
||||
|
||||
if (doPruning) {
|
||||
if (doPurging) {
|
||||
wait(self->killBlobWorkers(cx, self));
|
||||
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
|
||||
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPruneVersion));
|
||||
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
|
||||
try {
|
||||
Version minSnapshotVersion = newPruneVersion;
|
||||
Version minSnapshotVersion = newPurgeVersion;
|
||||
for (auto& it : versionRead.second) {
|
||||
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
|
||||
}
|
||||
|
@ -395,10 +344,10 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
Future<Void> start(Database const& cx) override {
|
||||
clients.reserve(threads + 1);
|
||||
clients.push_back(timeout(findGranules(cx, this), testDuration, Void()));
|
||||
if (enablePruning && clientId == 0) {
|
||||
if (enablePurging && clientId == 0) {
|
||||
clients.push_back(
|
||||
timeout(reportErrors(verifyGranules(cx, this, true), "BlobGranuleVerifier"), testDuration, Void()));
|
||||
} else if (!enablePruning) {
|
||||
} else if (!enablePurging) {
|
||||
for (int i = 0; i < threads; i++) {
|
||||
clients.push_back(timeout(
|
||||
reportErrors(verifyGranules(cx, this, false), "BlobGranuleVerifier"), testDuration, Void()));
|
||||
|
@ -518,6 +467,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
fmt::print(" {} time travel reads\n", self->timeTravelReads);
|
||||
fmt::print(" {} rows\n", self->rowsRead);
|
||||
fmt::print(" {} bytes\n", self->bytesRead);
|
||||
fmt::print(" {} purges\n", self->purges);
|
||||
// FIXME: add above as details to trace event
|
||||
|
||||
TraceEvent("BlobGranuleVerifierChecked").detail("Result", result);
@ -227,7 +227,8 @@ struct ConfigureDatabaseWorkload : TestWorkload {
|
|||
double testDuration;
|
||||
int additionalDBs;
|
||||
bool allowDescriptorChange;
|
||||
bool allowTestStorageMigration;
|
||||
bool allowTestStorageMigration; // allow change storage migration and perpetual wiggle conf
|
||||
bool storageMigrationCompatibleConf; // only allow generating configuration suitable for storage migration test
|
||||
bool waitStoreTypeCheck;
|
||||
bool downgradeTest1; // if this is true, don't pick up downgrade incompatible config
|
||||
std::vector<Future<Void>> clients;
|
||||
|
@ -239,6 +240,7 @@ struct ConfigureDatabaseWorkload : TestWorkload {
|
|||
getOption(options, LiteralStringRef("allowDescriptorChange"), SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT);
|
||||
allowTestStorageMigration =
|
||||
getOption(options, "allowTestStorageMigration"_sr, false) && g_simulator.allowStorageMigrationTypeChange;
|
||||
storageMigrationCompatibleConf = getOption(options, "storageMigrationCompatibleConf"_sr, false);
|
||||
waitStoreTypeCheck = getOption(options, "waitStoreTypeCheck"_sr, false);
|
||||
downgradeTest1 = getOption(options, "downgradeTest1"_sr, false);
|
||||
g_simulator.usableRegions = 1;
|
||||
|
@ -349,7 +351,11 @@ struct ConfigureDatabaseWorkload : TestWorkload {
|
|||
}
state int randomChoice;
if (self->allowTestStorageMigration) {
randomChoice = deterministicRandom()->randomInt(4, 9);
randomChoice = (deterministicRandom()->random01() < 0.375) ? deterministicRandom()->randomInt(0, 3)
: deterministicRandom()->randomInt(4, 9);
} else if (self->storageMigrationCompatibleConf) {
randomChoice = (deterministicRandom()->random01() < 3.0 / 7) ? deterministicRandom()->randomInt(0, 3)
: deterministicRandom()->randomInt(5, 9);
} else {
randomChoice = deterministicRandom()->randomInt(0, 8);
}
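// For reference, the weights above make each eligible configuration equally likely: with allowTestStorageMigration,
// 0.375 = 3/8 spread over randomInt(0, 3)'s 3 values and 0.625 = 5/8 over randomInt(4, 9)'s 5 values gives each of
// the 8 choices probability 1/8; with storageMigrationCompatibleConf, 3/7 over 3 values and 4/7 over randomInt(5, 9)'s
// 4 values gives each of the 7 choices probability 1/7.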
@ -154,9 +154,13 @@ struct DiskFailureInjectionWorkload : TestWorkload {
|
|||
loop {
|
||||
wait(poisson(&lastTime, 1));
|
||||
try {
|
||||
wait(store(machines, getStorageWorkers(cx, self->dbInfo, false)));
|
||||
std::pair<std::vector<W>, int> m = wait(getStorageWorkers(cx, self->dbInfo, false));
|
||||
if (m.second > 0) {
|
||||
throw operation_failed();
|
||||
}
|
||||
machines = std::move(m.first);
|
||||
} catch (Error& e) {
|
||||
// If we failed to get a list of storage servers, we can't inject failure events
|
||||
// If we failed to get a complete list of storage servers, we can't inject failure events
|
||||
// But don't throw the error in that case
|
||||
continue;
|
||||
}
@ -20,6 +20,7 @@
|
|||
|
||||
#include "fdbclient/DatabaseContext.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "flow/EncryptUtils.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "fdbserver/workloads/workloads.actor.h"
|
||||
|
@ -116,9 +117,10 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
Arena arena;
|
||||
std::unique_ptr<WorkloadMetrics> metrics;
|
||||
|
||||
BlobCipherDomainId minDomainId;
|
||||
BlobCipherDomainId maxDomainId;
|
||||
BlobCipherBaseKeyId minBaseCipherId;
|
||||
EncryptCipherDomainId minDomainId;
|
||||
EncryptCipherDomainId maxDomainId;
|
||||
EncryptCipherBaseKeyId minBaseCipherId;
|
||||
EncryptCipherBaseKeyId headerBaseCipherId;
|
||||
|
||||
EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
mode = getOption(options, LiteralStringRef("fixedSize"), 1);
|
||||
|
@ -131,6 +133,7 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
minDomainId = wcx.clientId * 100 + mode * 30 + 1;
|
||||
maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
|
||||
minBaseCipherId = 100;
|
||||
headerBaseCipherId = wcx.clientId * 100 + 1;
|
||||
|
||||
metrics = std::make_unique<WorkloadMetrics>();
|
||||
|
||||
|
@ -167,17 +170,21 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
|
||||
uint8_t buff[AES_256_KEY_LENGTH];
|
||||
std::vector<Reference<BlobCipherKey>> cipherKeys;
|
||||
for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
|
||||
int cipherLen = 0;
|
||||
for (EncryptCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
|
||||
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
|
||||
cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen);
|
||||
|
||||
ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH);
|
||||
|
||||
cipherKeys = cipherKeyCache.getAllCiphers(id);
|
||||
ASSERT(cipherKeys.size() == 1);
|
||||
ASSERT_EQ(cipherKeys.size(), 1);
|
||||
}
|
||||
|
||||
// insert the Encrypt Header cipherKey
|
||||
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
|
||||
cipherKeyCache.insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId, buff, cipherLen);
|
||||
|
||||
TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
|
||||
}
|
||||
|
||||
|
@ -188,10 +195,10 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
TraceEvent("ResetCipherEssentials_Done").log();
|
||||
}
|
||||
|
||||
void updateLatestBaseCipher(const BlobCipherDomainId encryptDomainId,
|
||||
void updateLatestBaseCipher(const EncryptCipherDomainId encryptDomainId,
|
||||
uint8_t* baseCipher,
|
||||
int* baseCipherLen,
|
||||
BlobCipherBaseKeyId* nextBaseCipherId) {
|
||||
EncryptCipherBaseKeyId* nextBaseCipherId) {
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
|
||||
*nextBaseCipherId = cipherKey->getBaseCipherId() + 1;
|
||||
|
@ -202,22 +209,24 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId);
|
||||
}
|
||||
|
||||
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> key,
|
||||
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> textCipherKey,
|
||||
Reference<BlobCipherKey> headerCipherKey,
|
||||
uint8_t* payload,
|
||||
int len,
|
||||
const EncryptAuthTokenMode authMode,
|
||||
BlobCipherEncryptHeader* header) {
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
generateRandomData(&iv[0], AES_256_IV_LENGTH);
|
||||
EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH);
|
||||
EncryptBlobCipherAes265Ctr encryptor(textCipherKey, headerCipherKey, &iv[0], AES_256_IV_LENGTH, authMode);
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, len, header, arena);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// validate encrypted buffer size and contents (not matching with plaintext)
|
||||
ASSERT(encrypted->getLogicalSize() == len);
|
||||
ASSERT(memcmp(encrypted->begin(), payload, len) != 0);
|
||||
ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT_EQ(encrypted->getLogicalSize(), len);
|
||||
ASSERT_NE(memcmp(encrypted->begin(), payload, len), 0);
|
||||
ASSERT_EQ(header->flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
|
||||
metrics->updateEncryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
return encrypted;
|
||||
|
@ -228,23 +237,30 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* originalPayload,
|
||||
Reference<BlobCipherKey> orgCipherKey) {
|
||||
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
|
||||
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
|
||||
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
|
||||
header.cipherTextDetails.baseCipherId);
|
||||
Reference<BlobCipherKey> headerCipherKey = cipherKeyCache.getCipherKey(
|
||||
header.cipherHeaderDetails.encryptDomainId, header.cipherHeaderDetails.baseCipherId);
|
||||
ASSERT(cipherKey.isValid());
|
||||
ASSERT(cipherKey->isEqual(orgCipherKey));
|
||||
|
||||
DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]);
|
||||
DecryptBlobCipherAes256Ctr decryptor(cipherKey, headerCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
const bool validateHeaderAuthToken = deterministicRandom()->randomInt(0, 100) < 65;
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
if (validateHeaderAuthToken) {
|
||||
decryptor.verifyHeaderAuthToken(header, arena);
|
||||
}
|
||||
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// validate decrypted buffer size and contents (matching with original plaintext)
|
||||
ASSERT(decrypted->getLogicalSize() == len);
|
||||
ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0);
|
||||
ASSERT_EQ(decrypted->getLogicalSize(), len);
|
||||
ASSERT_EQ(memcmp(decrypted->begin(), originalPayload, len), 0);
|
||||
|
||||
metrics->updateDecryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
}
|
||||
|
@ -256,7 +272,7 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
Future<Void> start(Database const& cx) override {
|
||||
uint8_t baseCipher[AES_256_KEY_LENGTH];
|
||||
int baseCipherLen = 0;
|
||||
BlobCipherBaseKeyId nextBaseCipherId;
|
||||
EncryptCipherBaseKeyId nextBaseCipherId;
|
||||
|
||||
// Setup encryptDomainIds and corresponding baseCipher details
|
||||
setupCipherEssentials();
|
||||
|
@ -268,7 +284,7 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
|
||||
// randomly select a domainId
|
||||
const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
|
||||
const EncryptCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
|
||||
ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId);
|
||||
|
||||
if (updateBaseCipher) {
|
||||
|
@ -279,14 +295,17 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
|
||||
// Each client working with their own version of encryptHeaderCipherKey, avoid using getLatest()
|
||||
Reference<BlobCipherKey> headerCipherKey =
|
||||
cipherKeyCache.getCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
|
||||
// Validate sanity of "getLatestCipher", especially when baseCipher gets updated
|
||||
if (updateBaseCipher) {
|
||||
ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId);
|
||||
ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen);
|
||||
ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0);
|
||||
ASSERT_EQ(cipherKey->getBaseCipherId(), nextBaseCipherId);
|
||||
ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipherLen);
|
||||
ASSERT_EQ(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen), 0);
|
||||
}
|
||||
|
||||
int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize);
|
||||
|
@ -294,8 +313,12 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
|
||||
// Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later
BlobCipherEncryptHeader header;
const EncryptAuthTokenMode authMode = deterministicRandom()->randomInt(0, 100) < 50
? ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
: ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI;
try {
Reference<EncryptBuf> encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header);
Reference<EncryptBuf> encrypted =
doEncryption(cipherKey, headerCipherKey, buff.get(), dataLen, authMode, &header);
|
||||
|
||||
// Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and
|
||||
// decrypt
|
||||
|
@ -303,7 +326,8 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
} catch (Error& e) {
|
||||
TraceEvent("Failed")
|
||||
.detail("DomainId", encryptDomainId)
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId());
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId())
|
||||
.detail("AuthMode", authMode);
|
||||
throw;
|
||||
}
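// A minimal round-trip sketch of the two-key BlobCipher API exercised above (illustrative only; the variable names
// are assumptions, while the calls mirror doEncryption/doDecryption in this workload):
//
//   uint8_t iv[AES_256_IV_LENGTH];
//   generateRandomData(&iv[0], AES_256_IV_LENGTH);
//   BlobCipherEncryptHeader header;
//   EncryptBlobCipherAes265Ctr encryptor(textCipherKey, headerCipherKey, &iv[0], AES_256_IV_LENGTH, authMode);
//   Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, len, &header, arena);
//
//   DecryptBlobCipherAes256Ctr decryptor(textCipherKey, headerCipherKey, &header.cipherTextDetails.iv[0]);
//   decryptor.verifyHeaderAuthToken(header, arena);   // only meaningful for the MULTI auth-token mode
//   Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena);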
@ -273,7 +273,8 @@ Future<Void> quietDatabase(Database const& cx,
|
|||
int64_t maxTLogQueueGate = 5e6,
|
||||
int64_t maxStorageServerQueueGate = 5e6,
|
||||
int64_t maxDataDistributionQueueSize = 0,
|
||||
int64_t maxPoppedVersionLag = 30e6);
|
||||
int64_t maxPoppedVersionLag = 30e6,
|
||||
int64_t maxVersionOffset = 1e6);
|
||||
|
||||
/**
|
||||
* A utility function for testing error situations. It succeeds if the given test
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "flow/EncryptUtils.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/IRandom.h"
|
||||
|
@ -29,21 +30,23 @@
|
|||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#if ENCRYPTION_ENABLED
|
||||
|
||||
// BlobCipherEncryptHeader
|
||||
BlobCipherEncryptHeader::BlobCipherEncryptHeader() {
|
||||
flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE;
|
||||
namespace {
|
||||
bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) {
|
||||
return mode >= ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && mode < ENCRYPT_HEADER_AUTH_TOKEN_LAST;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// BlobCipherKey class methods
|
||||
|
||||
BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId,
|
||||
const EncryptCipherBaseKeyId& baseCiphId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen) {
|
||||
BlobCipherRandomSalt salt;
|
||||
EncryptCipherRandomSalt salt;
|
||||
if (g_network->isSimulated()) {
|
||||
salt = deterministicRandom()->randomUInt64();
|
||||
} else {
|
||||
|
@ -58,11 +61,11 @@ BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
|
|||
.detail("CreationTime", creationTime);*/
|
||||
}
|
||||
|
||||
void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
|
||||
void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
const BlobCipherRandomSalt& salt) {
|
||||
const EncryptCipherBaseKeyId& baseCiphId,
|
||||
const EncryptCipherRandomSalt& salt) {
|
||||
// Set the base encryption key properties
|
||||
baseCipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
|
||||
memset(baseCipher.get(), 0, AES_256_KEY_LENGTH);
|
||||
|
@ -82,11 +85,11 @@ void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
|
|||
|
||||
void BlobCipherKey::applyHmacSha256Derivation() {
|
||||
Arena arena;
|
||||
uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)];
|
||||
uint8_t buf[baseCipherLen + sizeof(EncryptCipherRandomSalt)];
|
||||
memcpy(&buf[0], baseCipher.get(), baseCipherLen);
|
||||
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt));
|
||||
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt));
|
||||
HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen);
|
||||
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena);
|
||||
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), arena);
|
||||
std::copy(digest.begin(), digest.end(), cipher.get());
|
||||
if (digest.size() < AES_256_KEY_LENGTH) {
|
||||
memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size());
|
||||
|
@ -101,10 +104,10 @@ void BlobCipherKey::reset() {
|
|||
// BlobKeyIdCache class methods
|
||||
|
||||
BlobCipherKeyIdCache::BlobCipherKeyIdCache()
|
||||
: domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {}
|
||||
: domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {}
|
||||
|
||||
BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId)
|
||||
: domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {
|
||||
BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId)
|
||||
: domainId(dId), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {
|
||||
TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId);
|
||||
}
|
||||
|
||||
|
@ -112,7 +115,7 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::getLatestCipherKey() {
|
|||
return getCipherByBaseCipherId(latestBaseCipherKeyId);
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) {
|
||||
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId) {
|
||||
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId);
|
||||
if (itr == keyIdCache.end()) {
|
||||
throw encrypt_key_not_found();
|
||||
|
@ -120,10 +123,10 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCiphe
|
|||
return itr->second;
|
||||
}
|
||||
|
||||
void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId,
|
||||
void BlobCipherKeyIdCache::insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId,
|
||||
const uint8_t* baseCipher,
|
||||
int baseCipherLen) {
|
||||
ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID);
|
||||
ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID);
|
||||
|
||||
// BaseCipherKeys are immutable, ensure that cached value doesn't get updated.
|
||||
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId);
|
||||
|
@ -165,11 +168,11 @@ std::vector<Reference<BlobCipherKey>> BlobCipherKeyIdCache::getAllCipherKeys() {
|
|||
|
||||
// BlobCipherKeyCache class methods
|
||||
|
||||
void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCipherId,
|
||||
void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId,
|
||||
const EncryptCipherBaseKeyId& baseCipherId,
|
||||
const uint8_t* baseCipher,
|
||||
int baseCipherLen) {
|
||||
if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) {
|
||||
if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID) {
|
||||
throw encrypt_invalid_id();
|
||||
}
|
||||
|
||||
|
@ -193,7 +196,7 @@ void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
|
|||
}
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) {
|
||||
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId);
|
||||
|
@ -212,8 +215,8 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipher
|
|||
return cipherKey;
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCipherId) {
|
||||
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const EncryptCipherDomainId& domainId,
|
||||
const EncryptCipherBaseKeyId& baseCipherId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
throw encrypt_key_not_found();
|
||||
|
@ -223,7 +226,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomain
|
|||
return keyIdCache->getCipherByBaseCipherId(baseCipherId);
|
||||
}
|
||||
|
||||
void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) {
|
||||
void BlobCipherKeyCache::resetEncyrptDomainId(const EncryptCipherDomainId domainId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
throw encrypt_key_not_found();
|
||||
|
@ -245,7 +248,7 @@ void BlobCipherKeyCache::cleanup() noexcept {
|
|||
instance.domainCacheMap.clear();
|
||||
}
|
||||
|
||||
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) {
|
||||
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const EncryptCipherDomainId& domainId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
return {};
|
||||
|
@ -255,13 +258,17 @@ std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const Bl
|
|||
return keyIdCache->getAllCipherKeys();
|
||||
}
|
||||
|
||||
// EncryptBlobCipher class methods
|
||||
// EncryptBlobCipherAes265Ctr class methods
|
||||
|
||||
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key,
|
||||
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey,
|
||||
Reference<BlobCipherKey> hCipherKey,
|
||||
const uint8_t* cipherIV,
|
||||
const int ivLen)
|
||||
: ctx(EVP_CIPHER_CTX_new()), cipherKey(key) {
|
||||
ASSERT(ivLen == AES_256_IV_LENGTH);
|
||||
const int ivLen,
|
||||
const EncryptAuthTokenMode mode)
|
||||
: ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode) {
|
||||
ASSERT(isEncryptHeaderAuthTokenModeValid(mode));
|
||||
ASSERT_EQ(ivLen, AES_256_IV_LENGTH);
|
||||
|
||||
memcpy(&iv[0], cipherIV, ivLen);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
|
@ -270,7 +277,7 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey>
|
|||
if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) {
|
||||
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, textCipherKey.getPtr()->data(), cipherIV) != 1) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
}
|
||||
|
@ -281,21 +288,29 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
|
|||
Arena& arena) {
|
||||
TEST(true); // Encrypting data with BlobCipher
|
||||
|
||||
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(plaintextLen + AES_BLOCK_SIZE, arena);
|
||||
memset(reinterpret_cast<uint8_t*>(header), 0, sizeof(BlobCipherEncryptHeader));
|
||||
|
||||
// Alloc buffer computation accounts for 'header authentication' generation scheme. If single-auth-token needs to be
|
||||
// generated, allocate buffer sufficient to append header to the cipherText optimizing memcpy cost.
|
||||
|
||||
const int allocSize = authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
|
||||
? plaintextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader)
|
||||
: plaintextLen + AES_BLOCK_SIZE;
|
||||
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(allocSize, arena);
|
||||
uint8_t* ciphertext = encryptBuf->begin();
|
||||
int bytes{ 0 };
|
||||
if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
|
||||
TraceEvent("Encrypt_UpdateFailed")
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId())
|
||||
.detail("EncryptDomainId", cipherKey->getDomainId());
|
||||
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
|
||||
.detail("EncryptDomainId", textCipherKey->getDomainId());
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
int finalBytes{ 0 };
|
||||
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
|
||||
TraceEvent("Encrypt_FinalFailed")
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId())
|
||||
.detail("EncryptDomainId", cipherKey->getDomainId());
|
||||
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
|
||||
.detail("EncryptDomainId", textCipherKey->getDomainId());
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
|
@ -306,19 +321,57 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
|
|||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
// populate header details for the encrypted blob.
|
||||
// Populate encryption header flags details
|
||||
header->flags.size = sizeof(BlobCipherEncryptHeader);
|
||||
header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION;
|
||||
header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR;
|
||||
header->baseCipherId = cipherKey->getBaseCipherId();
|
||||
header->encryptDomainId = cipherKey->getDomainId();
|
||||
header->salt = cipherKey->getSalt();
|
||||
memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH);
|
||||
header->flags.encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR;
|
||||
header->flags.authTokenMode = authTokenMode;
|
||||
|
||||
// Preserve checksum of encrypted bytes in the header; approach protects against disk induced bit-rot/flip
|
||||
// scenarios. AES CTR mode doesn't generate 'tag' by default as with schemes such as: AES 256 GCM.
|
||||
// Populate cipherText encryption-key details
|
||||
header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId();
|
||||
header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId();
|
||||
header->cipherTextDetails.salt = textCipherKey->getSalt();
|
||||
memcpy(&header->cipherTextDetails.iv[0], &iv[0], AES_256_IV_LENGTH);
|
||||
|
||||
header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena);
|
||||
if (authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
|
||||
// No header 'authToken' generation needed.
|
||||
} else {
|
||||
// Populate header encryption-key details
|
||||
header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId();
|
||||
header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId();
|
||||
|
||||
// Populate header authToken details
|
||||
if (header->flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
|
||||
ASSERT_GE(allocSize, (bytes + finalBytes + sizeof(BlobCipherEncryptHeader)));
|
||||
ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes + sizeof(BlobCipherEncryptHeader)));
|
||||
|
||||
memcpy(&ciphertext[bytes + finalBytes],
|
||||
reinterpret_cast<const uint8_t*>(header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
StringRef authToken = computeAuthToken(ciphertext,
|
||||
bytes + finalBytes + sizeof(BlobCipherEncryptHeader),
|
||||
headerCipherKey->rawCipher(),
|
||||
AES_256_KEY_LENGTH,
|
||||
arena);
|
||||
memcpy(&header->singleAuthToken.authToken[0], authToken.begin(), AUTH_TOKEN_SIZE);
|
||||
} else {
|
||||
ASSERT_EQ(header->flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
|
||||
|
||||
StringRef cipherTextAuthToken =
|
||||
computeAuthToken(ciphertext,
|
||||
bytes + finalBytes,
|
||||
reinterpret_cast<const uint8_t*>(&header->cipherTextDetails.salt),
|
||||
sizeof(EncryptCipherRandomSalt),
|
||||
arena);
|
||||
memcpy(&header->multiAuthTokens.cipherTextAuthToken[0], cipherTextAuthToken.begin(), AUTH_TOKEN_SIZE);
|
||||
StringRef headerAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(header),
|
||||
sizeof(BlobCipherEncryptHeader),
|
||||
headerCipherKey->rawCipher(),
|
||||
AES_256_KEY_LENGTH,
|
||||
arena);
|
||||
memcpy(&header->multiAuthTokens.headerAuthToken[0], headerAuthToken.begin(), AUTH_TOKEN_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
encryptBuf->setLogicalSize(plaintextLen);
|
||||
return encryptBuf;
|
||||
|
@ -330,45 +383,137 @@ EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() {
|
|||
}
|
||||
}
|
||||
|
||||
// DecryptBlobCipher class methods
|
||||
// DecryptBlobCipherAes256Ctr class methods
|
||||
|
||||
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv)
|
||||
: ctx(EVP_CIPHER_CTX_new()) {
|
||||
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey,
|
||||
Reference<BlobCipherKey> hCipherKey,
|
||||
const uint8_t* iv)
|
||||
: ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey),
|
||||
headerAuthTokenValidationDone(false), authTokensValidationDone(false) {
|
||||
if (ctx == nullptr) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) {
|
||||
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, tCipherKey.getPtr()->data(), iv)) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
}
|
||||
|
||||
void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
Arena& arena) {
|
||||
// validate header flag sanity
|
||||
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
|
||||
header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) {
|
||||
TraceEvent("VerifyEncryptBlobHeader")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderMode", header.flags.encryptMode)
|
||||
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
|
||||
.detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
|
||||
throw encrypt_header_metadata_mismatch();
|
||||
void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena) {
|
||||
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) {
|
||||
// NoneAuthToken mode; no authToken is generated; nothing to do
|
||||
// SingleAuthToken mode; verification will happen as part of decryption.
|
||||
return;
|
||||
}
|
||||
|
||||
// encrypted byte checksum sanity; protection against data bit-rot/flip.
|
||||
BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena);
|
||||
if (computed != header.ciphertextChecksum) {
|
||||
TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch")
|
||||
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
|
||||
|
||||
BlobCipherEncryptHeader headerCopy;
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
memset(reinterpret_cast<uint8_t*>(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_SIZE);
|
||||
StringRef computedHeaderAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(&headerCopy),
|
||||
sizeof(BlobCipherEncryptHeader),
|
||||
headerCipherKey->rawCipher(),
|
||||
AES_256_KEY_LENGTH,
|
||||
arena);
|
||||
if (memcmp(&header.multiAuthTokens.headerAuthToken[0], computedHeaderAuthToken.begin(), AUTH_TOKEN_SIZE) != 0) {
|
||||
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderMode", header.flags.encryptMode)
|
||||
.detail("CiphertextChecksum", header.ciphertextChecksum)
|
||||
.detail("ComputedCiphertextChecksum", computed);
|
||||
throw encrypt_header_checksum_mismatch();
|
||||
.detail("MultiAuthHeaderAuthToken",
|
||||
StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_SIZE).toString())
|
||||
.detail("ComputedHeaderAuthToken", computedHeaderAuthToken.toString());
|
||||
throw encrypt_header_authtoken_mismatch();
|
||||
}
|
||||
|
||||
headerAuthTokenValidationDone = true;
|
||||
}
|
||||
|
||||
void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* buff,
|
||||
Arena& arena) {
|
||||
// Header authToken not set for single auth-token mode.
|
||||
ASSERT(!headerAuthTokenValidationDone);
|
||||
|
||||
// prepare the payload {cipherText + encryptionHeader}
|
||||
memcpy(&buff[0], ciphertext, ciphertextLen);
|
||||
memcpy(&buff[ciphertextLen], reinterpret_cast<const uint8_t*>(&header), sizeof(BlobCipherEncryptHeader));
|
||||
// ensure the 'authToken' is reset before computing the 'authentication token'
|
||||
BlobCipherEncryptHeader* eHeader = (BlobCipherEncryptHeader*)(&buff[ciphertextLen]);
|
||||
memset(reinterpret_cast<uint8_t*>(&eHeader->singleAuthToken), 0, 2 * AUTH_TOKEN_SIZE);
|
||||
|
||||
StringRef computed = computeAuthToken(
|
||||
buff, ciphertextLen + sizeof(BlobCipherEncryptHeader), headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena);
|
||||
if (memcmp(&header.singleAuthToken.authToken[0], computed.begin(), AUTH_TOKEN_SIZE) != 0) {
|
||||
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderMode", header.flags.encryptMode)
|
||||
.detail("SingleAuthToken",
|
||||
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString())
|
||||
.detail("ComputedSingleAuthToken", computed.toString());
|
||||
throw encrypt_header_authtoken_mismatch();
|
||||
}
|
||||
}
|
||||
|
||||
void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* buff,
|
||||
Arena& arena) {
|
||||
if (!headerAuthTokenValidationDone) {
|
||||
verifyHeaderAuthToken(header, arena);
|
||||
}
|
||||
StringRef computedCipherTextAuthToken =
|
||||
computeAuthToken(ciphertext,
|
||||
ciphertextLen,
|
||||
reinterpret_cast<const uint8_t*>(&header.cipherTextDetails.salt),
|
||||
sizeof(EncryptCipherRandomSalt),
|
||||
arena);
|
||||
if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], computedCipherTextAuthToken.begin(), AUTH_TOKEN_SIZE) !=
|
||||
0) {
|
||||
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderMode", header.flags.encryptMode)
|
||||
.detail("MultiAuthCipherTextAuthToken",
|
||||
StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_SIZE).toString())
|
||||
.detail("ComputedCipherTextAuthToken", computedCipherTextAuthToken.toString());
|
||||
throw encrypt_header_authtoken_mismatch();
|
||||
}
|
||||
}
|
||||
|
||||
void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* buff,
|
||||
Arena& arena) {
|
||||
if (header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
|
||||
verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, buff, arena);
|
||||
} else {
|
||||
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
|
||||
verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, buff, arena);
|
||||
}
|
||||
|
||||
authTokensValidationDone = true;
|
||||
}
|
||||
|
||||
void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header) {
|
||||
// validate header flag sanity
|
||||
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
|
||||
header.flags.encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR ||
|
||||
!isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) {
|
||||
TraceEvent("VerifyEncryptBlobHeader")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
|
||||
.detail("EncryptCipherMode", header.flags.encryptMode)
|
||||
.detail("ExpectedCipherMode", ENCRYPT_CIPHER_MODE_AES_256_CTR)
|
||||
.detail("EncryptHeaderAuthTokenMode", header.flags.authTokenMode);
|
||||
throw encrypt_header_metadata_mismatch();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -378,23 +523,37 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert
|
|||
Arena& arena) {
|
||||
TEST(true); // Decrypting data with BlobCipher
|
||||
|
||||
verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena);
|
||||
verifyEncryptHeaderMetadata(header);
|
||||
|
||||
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && !headerCipherKey.isValid()) {
|
||||
TraceEvent("Decrypt_InvalidHeaderCipherKey").detail("AuthTokenMode", header.flags.authTokenMode);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
const int allocSize = header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
|
||||
? ciphertextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader)
|
||||
: ciphertextLen + AES_BLOCK_SIZE;
|
||||
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(allocSize, arena);
|
||||
|
||||
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
|
||||
verifyAuthTokens(ciphertext, ciphertextLen, header, decrypted->begin(), arena);
|
||||
ASSERT(authTokensValidationDone);
|
||||
}
|
||||
|
||||
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(ciphertextLen + AES_BLOCK_SIZE, arena);
|
||||
uint8_t* plaintext = decrypted->begin();
|
||||
int bytesDecrypted{ 0 };
|
||||
if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) {
|
||||
TraceEvent("Decrypt_UpdateFailed")
|
||||
.detail("BaseCipherId", header.baseCipherId)
|
||||
.detail("EncryptDomainId", header.encryptDomainId);
|
||||
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
|
||||
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
int finalBlobBytes{ 0 };
|
||||
if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) {
|
||||
TraceEvent("Decrypt_FinalFailed")
|
||||
.detail("BaseCipherId", header.baseCipherId)
|
||||
.detail("EncryptDomainId", header.encryptDomainId);
|
||||
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
|
||||
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
|
@ -443,6 +602,18 @@ StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Are
|
|||
return StringRef(digest, digestLen);
|
||||
}

StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena) {
HmacSha256DigestGen hmacGenerator(key, keyLen);
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);

ASSERT_GE(digest.size(), AUTH_TOKEN_SIZE);
return digest;
}
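computeAuthToken() returns the full HMAC-SHA256 digest (asserted to be at least AUTH_TOKEN_SIZE bytes); the verify* methods above persist and compare only the first AUTH_TOKEN_SIZE bytes of it. A minimal sketch of that derive-then-compare pattern follows; it is not part of this patch, and the helper name and buffer arguments are illustrative only.

// Illustrative sketch (not part of this patch): derive an auth token and compare
// only its first AUTH_TOKEN_SIZE bytes against the token persisted in the header.
void exampleVerifyAuthToken(const uint8_t* payload,
                            const int payloadLen,
                            const uint8_t* key, // e.g. headerCipherKey->rawCipher(), AES_256_KEY_LENGTH bytes
                            const uint8_t* persistedToken, // AUTH_TOKEN_SIZE bytes read from the header
                            Arena& arena) {
	StringRef computed = computeAuthToken(payload, payloadLen, key, AES_256_KEY_LENGTH, arena);
	if (memcmp(persistedToken, computed.begin(), AUTH_TOKEN_SIZE) != 0) {
		throw encrypt_header_authtoken_mismatch();
	}
}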
|
||||
|
||||
// Only used to link unit tests
|
||||
void forceLinkBlobCipherTests() {}
|
||||
|
||||
|
@ -453,41 +624,42 @@ void forceLinkBlobCipherTests() {}
// 4. Inserting of 'non-identical' cipherKey (already cached) more than once works as desired.
// 5. Validation encryption ops (correctness):
// 5.1. Encyrpt a buffer followed by decryption of the buffer, validate the contents.
// 5.2. Simulate anomolies such as: EncyrptionHeader corruption, checkSum mismatch / encryptionMode mismatch etc.
// 5.2. Simulate anomalies such as: EncyrptionHeader corruption, authToken mismatch / encryptionMode mismatch etc.
// 6. Cache cleanup
// 6.1 cleanup cipherKeys by given encryptDomainId
// 6.2. Cleanup all cached cipherKeys
TEST_CASE("flow/BlobCipher") {
|
||||
TraceEvent("BlobCipherTest_Start").log();
|
||||
|
||||
// Construct a dummy External Key Manager representation and populate with some keys
|
||||
class BaseCipher : public ReferenceCounted<BaseCipher>, NonCopyable {
|
||||
public:
|
||||
BlobCipherDomainId domainId;
|
||||
EncryptCipherDomainId domainId;
|
||||
int len;
|
||||
BlobCipherBaseKeyId keyId;
|
||||
EncryptCipherBaseKeyId keyId;
|
||||
std::unique_ptr<uint8_t[]> key;
|
||||
|
||||
BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId)
|
||||
BaseCipher(const EncryptCipherDomainId& dId, const EncryptCipherBaseKeyId& kId)
|
||||
: domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)),
|
||||
keyId(kId), key(std::make_unique<uint8_t[]>(len)) {
|
||||
generateRandomData(key.get(), len);
|
||||
}
|
||||
};
|
||||
|
||||
using BaseKeyMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BaseCipher>>;
|
||||
using DomainKeyMap = std::unordered_map<BlobCipherDomainId, BaseKeyMap>;
|
||||
using BaseKeyMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BaseCipher>>;
|
||||
using DomainKeyMap = std::unordered_map<EncryptCipherDomainId, BaseKeyMap>;
|
||||
DomainKeyMap domainKeyMap;
|
||||
const BlobCipherDomainId minDomainId = 1;
|
||||
const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
|
||||
const BlobCipherBaseKeyId minBaseCipherKeyId = 100;
|
||||
const BlobCipherBaseKeyId maxBaseCipherKeyId =
|
||||
const EncryptCipherDomainId minDomainId = 1;
|
||||
const EncryptCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
|
||||
const EncryptCipherBaseKeyId minBaseCipherKeyId = 100;
|
||||
const EncryptCipherBaseKeyId maxBaseCipherKeyId =
|
||||
deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15;
|
||||
for (int dId = minDomainId; dId <= maxDomainId; dId++) {
|
||||
for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) {
|
||||
domainKeyMap[dId].emplace(kId, makeReference<BaseCipher>(dId, kId));
|
||||
}
|
||||
}
|
||||
ASSERT(domainKeyMap.size() == maxDomainId);
|
||||
ASSERT_EQ(domainKeyMap.size(), maxDomainId);
|
||||
|
||||
// insert BlobCipher keys into BlobCipherKeyCache map and validate
|
||||
TraceEvent("BlobCipherTest_InsertKeys").log();
|
||||
|
@ -500,6 +672,11 @@ TEST_CASE("flow/BlobCipher") {
|
|||
baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
|
||||
}
|
||||
}
|
||||
// insert EncryptHeader BlobCipher key
|
||||
Reference<BaseCipher> headerBaseCipher = makeReference<BaseCipher>(ENCRYPT_HEADER_DOMAIN_ID, 1);
|
||||
cipherKeyCache.insertCipherKey(
|
||||
headerBaseCipher->domainId, headerBaseCipher->keyId, headerBaseCipher->key.get(), headerBaseCipher->len);
|
||||
|
||||
TraceEvent("BlobCipherTest_InsertKeysDone").log();
|
||||
|
||||
// validate the cipherKey lookups work as desired
|
||||
|
@ -509,13 +686,13 @@ TEST_CASE("flow/BlobCipher") {
|
|||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId);
|
||||
ASSERT(cipherKey.isValid());
|
||||
// validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher
|
||||
ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId);
|
||||
ASSERT(cipherKey->getDomainId() == baseCipher->domainId);
|
||||
ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len);
|
||||
ASSERT_EQ(cipherKey->getBaseCipherId(), baseCipher->keyId);
|
||||
ASSERT_EQ(cipherKey->getDomainId(), baseCipher->domainId);
|
||||
ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipher->len);
|
||||
// ensure that baseCipher matches with the cached information
|
||||
ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0);
|
||||
ASSERT_EQ(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
|
||||
// validate the encryption derivation
|
||||
ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0);
|
||||
ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
|
||||
}
|
||||
}
|
||||
TraceEvent("BlobCipherTest_LooksupDone").log();
|
||||
|
@ -548,6 +725,7 @@ TEST_CASE("flow/BlobCipher") {
|
|||
|
||||
// Validate Encyrption ops
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId);
|
||||
Reference<BlobCipherKey> headerCipherKey = cipherKeyCache.getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);
|
||||
const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512;
|
||||
uint8_t orgData[bufLen];
|
||||
generateRandomData(&orgData[0], bufLen);
|
||||
|
@ -556,68 +734,317 @@ TEST_CASE("flow/BlobCipher") {
|
|||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
generateRandomData(&iv[0], AES_256_IV_LENGTH);
|
||||
|
||||
// validate basic encrypt followed by decrypt operation
|
||||
EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH);
|
||||
BlobCipherEncryptHeader headerCopy;
|
||||
// validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE
|
||||
{
|
||||
TraceEvent("NoneAuthMode_Start").log();
|
||||
|
||||
EncryptBlobCipherAes265Ctr encryptor(
|
||||
cipherKey, Reference<BlobCipherKey>(), iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);
|
||||
BlobCipherEncryptHeader header;
|
||||
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
|
||||
ASSERT(encrypted->getLogicalSize() == bufLen);
|
||||
ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0);
|
||||
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
|
||||
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
|
||||
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
|
||||
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
|
||||
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);
|
||||
|
||||
TraceEvent("BlobCipherTest_EncryptDone")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderEncryptMode", header.flags.encryptMode)
|
||||
.detail("DomainId", header.encryptDomainId)
|
||||
.detail("BaseCipherId", header.baseCipherId)
|
||||
.detail("HeaderChecksum", header.ciphertextChecksum);
|
||||
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
|
||||
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId);
|
||||
|
||||
Reference<BlobCipherKey> encyrptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
|
||||
ASSERT(encyrptKey->isEqual(cipherKey));
|
||||
DecryptBlobCipherAes256Ctr decryptor(encyrptKey, &header.iv[0]);
|
||||
Reference<BlobCipherKey> tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
|
||||
header.cipherTextDetails.baseCipherId);
|
||||
ASSERT(tCipherKeyKey->isEqual(cipherKey));
|
||||
DecryptBlobCipherAes256Ctr decryptor(
|
||||
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
|
||||
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
|
||||
ASSERT(decrypted->getLogicalSize() == bufLen);
|
||||
ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0);
|
||||
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
|
||||
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
|
||||
|
||||
TraceEvent("BlobCipherTest_DecryptDone").log();
|
||||
|
||||
// induce encryption header corruption - headerVersion corrupted
|
||||
header.flags.headerVersion += 1;
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
headerCopy.flags.headerVersion += 1;
|
||||
try {
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
DecryptBlobCipherAes256Ctr decryptor(
|
||||
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
header.flags.headerVersion -= 1;
|
||||
}
|
||||
|
||||
// induce encryption header corruption - encryptionMode corrupted
|
||||
header.flags.encryptMode += 1;
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
headerCopy.flags.encryptMode += 1;
|
||||
try {
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
DecryptBlobCipherAes256Ctr decryptor(
|
||||
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
header.flags.encryptMode -= 1;
|
||||
}
|
||||
|
||||
// induce encryption header corruption - checksum mismatch
|
||||
header.ciphertextChecksum += 1;
|
||||
// induce encrypted buffer payload corruption
|
||||
try {
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
uint8_t temp[bufLen];
|
||||
memcpy(encrypted->begin(), &temp[0], bufLen);
|
||||
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
|
||||
temp[tIdx] += 1;
|
||||
DecryptBlobCipherAes256Ctr decryptor(
|
||||
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_checksum_mismatch) {
|
||||
// No authToken, hence, no corruption detection supported
|
||||
ASSERT(false);
|
||||
}
|
||||
|
||||
TraceEvent("NoneAuthMode_Done").log();
|
||||
}
|
||||
|
||||
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE
|
||||
{
|
||||
TraceEvent("SingleAuthMode_Start").log();
|
||||
|
||||
EncryptBlobCipherAes265Ctr encryptor(
|
||||
cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
|
||||
BlobCipherEncryptHeader header;
|
||||
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
|
||||
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
|
||||
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
|
||||
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
|
||||
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
|
||||
|
||||
TraceEvent("BlobCipherTest_EncryptDone")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderEncryptMode", header.flags.encryptMode)
|
||||
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
|
||||
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
|
||||
.detail("HeaderAuthToken",
|
||||
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString());
|
||||
|
||||
Reference<BlobCipherKey> tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
|
||||
header.cipherTextDetails.baseCipherId);
|
||||
Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
|
||||
header.cipherHeaderDetails.baseCipherId);
|
||||
ASSERT(tCipherKeyKey->isEqual(cipherKey));
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
|
||||
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
|
||||
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
|
||||
|
||||
TraceEvent("BlobCipherTest_DecryptDone").log();
|
||||
|
||||
// induce encryption header corruption - headerVersion corrupted
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
headerCopy.flags.headerVersion += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
header.ciphertextChecksum -= 1;
|
||||
}
|
||||
|
||||
// induce encryption header corruption - encryptionMode corrupted
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
headerCopy.flags.encryptMode += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// induce encryption header corruption - authToken mismatch
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
|
||||
headerCopy.singleAuthToken.authToken[hIdx] += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// induce encrypted buffer payload corruption
|
||||
try {
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
uint8_t temp[bufLen];
|
||||
memcpy(encrypted->begin(), &temp[0], bufLen);
|
||||
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
|
||||
temp[tIdx] += 1;
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
TraceEvent("SingleAuthMode_Done").log();
|
||||
}
|
||||
|
||||
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI
|
||||
{
|
||||
TraceEvent("MultiAuthMode_Start").log();
|
||||
|
||||
EncryptBlobCipherAes265Ctr encryptor(
|
||||
cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
|
||||
BlobCipherEncryptHeader header;
|
||||
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
|
||||
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
|
||||
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
|
||||
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
|
||||
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
|
||||
|
||||
TraceEvent("BlobCipherTest_EncryptDone")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderEncryptMode", header.flags.encryptMode)
|
||||
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
|
||||
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
|
||||
.detail("HeaderAuthToken",
|
||||
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString());
|
||||
|
||||
Reference<BlobCipherKey> tCipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
|
||||
header.cipherTextDetails.baseCipherId);
|
||||
Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
|
||||
header.cipherHeaderDetails.baseCipherId);
|
||||
|
||||
ASSERT(tCipherKey->isEqual(cipherKey));
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
|
||||
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
|
||||
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
|
||||
|
||||
TraceEvent("BlobCipherTest_DecryptDone").log();
|
||||
|
||||
// induce encryption header corruption - headerVersion corrupted
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
headerCopy.flags.headerVersion += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// induce encryption header corruption - encryptionMode corrupted
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
headerCopy.flags.encryptMode += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// induce encryption header corruption - cipherText authToken mismatch
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
|
||||
headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// induce encryption header corruption - header authToken mismatch
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
|
||||
reinterpret_cast<const uint8_t*>(&header),
|
||||
sizeof(BlobCipherEncryptHeader));
|
||||
hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
|
||||
headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1;
|
||||
try {
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
|
||||
ASSERT(false); // error expected
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
uint8_t temp[bufLen];
|
||||
memcpy(encrypted->begin(), &temp[0], bufLen);
|
||||
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
|
||||
temp[tIdx] += 1;
|
||||
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
|
||||
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
TraceEvent("MultiAuthMode_Done").log();
|
||||
}
|
||||
|
||||
// Validate dropping encyrptDomainId cached keys
|
||||
const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
|
||||
const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
|
||||
cipherKeyCache.resetEncyrptDomainId(candidate);
|
||||
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(candidate);
|
||||
ASSERT(cachedKeys.empty());
|
||||
|
@ -633,20 +1060,4 @@ TEST_CASE("flow/BlobCipher") {
|
|||
return Void();
|
||||
}
|
||||
|
||||
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
|
||||
const int payloadLen,
|
||||
const BlobCipherRandomSalt& salt,
|
||||
Arena& arena) {
|
||||
// FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate checksum
|
||||
// Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the hmac digest.
|
||||
|
||||
HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt));
|
||||
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
|
||||
ASSERT(digest.size() >= sizeof(BlobCipherChecksum));
|
||||
|
||||
BlobCipherChecksum checksum;
|
||||
memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum));
|
||||
return checksum;
|
||||
}
|
||||
|
||||
#endif // ENCRYPTION_ENABLED
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#if ENCRYPTION_ENABLED
|
||||
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/EncryptUtils.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/xxhash.h"
|
||||
|
@ -45,15 +46,6 @@
|
|||
|
||||
#define AES_256_KEY_LENGTH 32
|
||||
#define AES_256_IV_LENGTH 16
|
||||
#define INVALID_DOMAIN_ID 0
|
||||
#define INVALID_CIPHER_KEY_ID 0
|
||||
|
||||
using BlobCipherDomainId = uint64_t;
|
||||
using BlobCipherRandomSalt = uint64_t;
|
||||
using BlobCipherBaseKeyId = uint64_t;
|
||||
using BlobCipherChecksum = uint64_t;
|
||||
|
||||
typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode;
|
||||
|
||||
// Encryption operations buffer management
|
||||
// Approach limits number of copies needed during encryption or decryption operations.
|
||||
|
@ -89,51 +81,94 @@ private:
|
|||
// This header is persisted along with encrypted buffer, it contains information necessary
|
||||
// to assist decrypting the buffers to serve read requests.
|
||||
//
|
||||
// The total space overhead is 56 bytes.
|
||||
// The total space overhead is 96 bytes.
|
||||
|
||||
#pragma pack(push, 1) // exact fit - no padding
|
||||
typedef struct BlobCipherEncryptHeader {
|
||||
static constexpr int headerSize = 96;
|
||||
union {
|
||||
struct {
|
||||
uint8_t size; // reading first byte is sufficient to determine header
|
||||
// length. ALWAYS THE FIRST HEADER ELEMENT.
|
||||
uint8_t headerVersion{};
|
||||
uint8_t encryptMode{};
|
||||
uint8_t _reserved[5]{};
|
||||
uint8_t authTokenMode{};
|
||||
uint8_t _reserved[4]{};
|
||||
} flags;
|
||||
uint64_t _padding{};
|
||||
};
|
||||
|
||||
// Cipher text encryption information
|
||||
struct {
|
||||
// Encyrption domain boundary identifier.
|
||||
BlobCipherDomainId encryptDomainId{};
|
||||
EncryptCipherDomainId encryptDomainId{};
|
||||
// BaseCipher encryption key identifier
|
||||
BlobCipherBaseKeyId baseCipherId{};
|
||||
EncryptCipherBaseKeyId baseCipherId{};
|
||||
// Random salt
|
||||
BlobCipherRandomSalt salt{};
|
||||
// Checksum of the encrypted buffer. It protects against 'tampering' of ciphertext as well 'bit rots/flips'.
|
||||
BlobCipherChecksum ciphertextChecksum{};
|
||||
EncryptCipherRandomSalt salt{};
|
||||
// Initialization vector used to encrypt the payload.
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
} cipherTextDetails;
|
||||
|
||||
BlobCipherEncryptHeader();
|
||||
struct {
|
||||
// Encryption domainId for the header
|
||||
EncryptCipherDomainId encryptDomainId{};
|
||||
// BaseCipher encryption key identifier.
|
||||
EncryptCipherBaseKeyId baseCipherId{};
|
||||
} cipherHeaderDetails;
|
||||
|
||||
// Encryption header is stored as plaintext on a persistent storage to assist reconstruction of cipher-key(s) for
// reads. FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate 'authentication
// token' (crypto-secure) to protect against malicious tampering and/or bit rot/flip scenarios.

union {
// Encryption header supports two modes of generating 'authentication tokens':
// 1) SingleAuthTokenMode: the scheme generates a single crypto-secure auth token to protect {cipherText +
// header} payload. Scheme is geared towards optimizing cost due to crypto-secure auth-token generation,
// however, on decryption the client needs to read 'header' + 'encrypted-buffer' to validate the 'auth-token'.
// The scheme is ideal for use cases where the payload represented by the encryptionHeader is not large and it is
// desirable to minimize CPU/latency penalty due to crypto-secure ops, such as: CommitProxies encrypted inline
// transactions, StorageServer encrypting pages etc. 2) MultiAuthTokenMode: Scheme generates separate authTokens
// for 'encrypted buffer' & 'encryption-header'. The scheme is ideal where the payload represented by the
// encryptionHeader is large enough such that it is desirable to optimize the cost of upfront reading the full
// 'encrypted buffer', compared to reading only the encryptionHeader and ensuring its sanity; for instance:
// backup-files.

struct {
// Cipher text authentication token
uint8_t cipherTextAuthToken[AUTH_TOKEN_SIZE]{};
uint8_t headerAuthToken[AUTH_TOKEN_SIZE]{};
} multiAuthTokens;
struct {
uint8_t authToken[AUTH_TOKEN_SIZE]{};
uint8_t _reserved[AUTH_TOKEN_SIZE]{};
} singleAuthToken;
};

BlobCipherEncryptHeader() {}
} BlobCipherEncryptHeader;
#pragma pack(pop)

// Ensure no struct-packing issues
static_assert(sizeof(BlobCipherEncryptHeader) == BlobCipherEncryptHeader::headerSize,
"BlobCipherEncryptHeader size mismatch");
|
||||
|
||||
// This interface is in-memory representation of CipherKey used for encryption/decryption information.
|
||||
// It caches base encryption key properties as well as caches the 'derived encryption' key obtained by applying
|
||||
// HMAC-SHA-256 derivation technique.
|
||||
|
||||
class BlobCipherKey : public ReferenceCounted<BlobCipherKey>, NonCopyable {
|
||||
public:
|
||||
BlobCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
BlobCipherKey(const EncryptCipherDomainId& domainId,
|
||||
const EncryptCipherBaseKeyId& baseCiphId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen);
|
||||
|
||||
uint8_t* data() const { return cipher.get(); }
|
||||
uint64_t getCreationTime() const { return creationTime; }
|
||||
BlobCipherDomainId getDomainId() const { return encryptDomainId; }
|
||||
BlobCipherRandomSalt getSalt() const { return randomSalt; }
|
||||
BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
|
||||
EncryptCipherDomainId getDomainId() const { return encryptDomainId; }
|
||||
EncryptCipherRandomSalt getSalt() const { return randomSalt; }
|
||||
EncryptCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
|
||||
int getBaseCipherLen() const { return baseCipherLen; }
|
||||
uint8_t* rawCipher() const { return cipher.get(); }
|
||||
uint8_t* rawBaseCipher() const { return baseCipher.get(); }
|
||||
|
@ -147,23 +182,23 @@ public:
|
|||
|
||||
private:
|
||||
// Encryption domain boundary identifier
|
||||
BlobCipherDomainId encryptDomainId;
|
||||
EncryptCipherDomainId encryptDomainId;
|
||||
// Base encryption cipher key properties
|
||||
std::unique_ptr<uint8_t[]> baseCipher;
|
||||
int baseCipherLen;
|
||||
BlobCipherBaseKeyId baseCipherId;
|
||||
EncryptCipherBaseKeyId baseCipherId;
|
||||
// Random salt used for encryption cipher key derivation
|
||||
BlobCipherRandomSalt randomSalt;
|
||||
EncryptCipherRandomSalt randomSalt;
|
||||
// Creation timestamp for the derived encryption cipher key
|
||||
uint64_t creationTime;
|
||||
// Derived encryption cipher key
|
||||
std::unique_ptr<uint8_t[]> cipher;
|
||||
|
||||
void initKey(const BlobCipherDomainId& domainId,
|
||||
void initKey(const EncryptCipherDomainId& domainId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
const BlobCipherRandomSalt& salt);
|
||||
const EncryptCipherBaseKeyId& baseCiphId,
|
||||
const EncryptCipherRandomSalt& salt);
|
||||
void applyHmacSha256Derivation();
|
||||
};
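The derived key returned by rawCipher() is the HMAC-SHA-256 derivation of the base cipher (keyed by the random salt), so it intentionally differs from the base material cached by rawBaseCipher(). A minimal sketch of that distinction, mirroring the assertions in the flow/BlobCipher unit test earlier in this patch; domainId, baseCipherId, baseCipher and baseCipherLen are assumed to be supplied by the caller.

// Illustrative sketch: base cipher material is cached verbatim, while the
// derived cipher actually used for encryption differs from it.
Reference<BlobCipherKey> key = makeReference<BlobCipherKey>(domainId, baseCipherId, baseCipher, baseCipherLen);
ASSERT_EQ(memcmp(key->rawBaseCipher(), baseCipher, key->getBaseCipherLen()), 0);
ASSERT_NE(memcmp(key->rawCipher(), baseCipher, key->getBaseCipherLen()), 0);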
|
||||
|
||||
|
@ -190,37 +225,45 @@ private:
|
|||
// required encryption key, however, CPs/SSs cache-miss would result in RPC to
|
||||
// EncryptKeyServer to refresh the desired encryption key.
|
||||
|
||||
using BlobCipherKeyIdCacheMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>;
|
||||
using BlobCipherKeyIdCacheMapCItr = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
|
||||
using BlobCipherKeyIdCacheMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>;
|
||||
using BlobCipherKeyIdCacheMapCItr =
|
||||
std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
|
||||
|
||||
struct BlobCipherKeyIdCache : ReferenceCounted<BlobCipherKeyIdCache> {
|
||||
public:
|
||||
BlobCipherKeyIdCache();
|
||||
explicit BlobCipherKeyIdCache(BlobCipherDomainId dId);
|
||||
explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId);
|
||||
|
||||
// API returns the last inserted cipherKey.
|
||||
// If none exists, 'encrypt_key_not_found' is thrown.
|
||||
|
||||
Reference<BlobCipherKey> getLatestCipherKey();
|
||||
|
||||
// API returns cipherKey corresponding to input 'baseCipherKeyId'.
|
||||
// If none exists, 'encrypt_key_not_found' is thrown.
|
||||
Reference<BlobCipherKey> getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId);
|
||||
|
||||
Reference<BlobCipherKey> getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId);
|
||||
|
||||
// API enables inserting base encryption cipher details to the BlobCipherKeyIdCache.
|
||||
// Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey
|
||||
// is treated as a NOP (success), however, an attempt to update cipherKey would throw
|
||||
// 'encrypt_update_cipher' exception.
|
||||
void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
|
||||
|
||||
void insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
|
||||
|
||||
// API cleanup the cache by dropping all cached cipherKeys
|
||||
void cleanup();
|
||||
|
||||
// API returns list of all 'cached' cipherKeys
|
||||
std::vector<Reference<BlobCipherKey>> getAllCipherKeys();
|
||||
|
||||
private:
|
||||
BlobCipherDomainId domainId;
|
||||
EncryptCipherDomainId domainId;
|
||||
BlobCipherKeyIdCacheMap keyIdCache;
|
||||
BlobCipherBaseKeyId latestBaseCipherKeyId;
|
||||
EncryptCipherBaseKeyId latestBaseCipherKeyId;
|
||||
};
|
||||
|
||||
using BlobCipherDomainCacheMap = std::unordered_map<BlobCipherDomainId, Reference<BlobCipherKeyIdCache>>;
|
||||
using BlobCipherDomainCacheMap = std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKeyIdCache>>;
|
||||
|
||||
class BlobCipherKeyCache : NonCopyable {
|
||||
public:
|
||||
|
@ -228,21 +271,28 @@ public:
|
|||
// The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable,
|
||||
// attempting to re-insert same 'identical' cipherKey is treated as a NOP (success),
|
||||
// however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception.
|
||||
void insertCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCipherId,
|
||||
|
||||
void insertCipherKey(const EncryptCipherDomainId& domainId,
|
||||
const EncryptCipherBaseKeyId& baseCipherId,
|
||||
const uint8_t* baseCipher,
|
||||
int baseCipherLen);
|
||||
// API returns the last insert cipherKey for a given encyryption domain Id.
|
||||
// If none exists, it would throw 'encrypt_key_not_found' exception.
|
||||
Reference<BlobCipherKey> getLatestCipherKey(const BlobCipherDomainId& domainId);
|
||||
|
||||
Reference<BlobCipherKey> getLatestCipherKey(const EncryptCipherDomainId& domainId);
|
||||
|
||||
// API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple.
|
||||
// If none exists, it would throw 'encrypt_key_not_found' exception.
|
||||
Reference<BlobCipherKey> getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId);
|
||||
|
||||
Reference<BlobCipherKey> getCipherKey(const EncryptCipherDomainId& domainId,
|
||||
const EncryptCipherBaseKeyId& baseCipherId);
|
||||
// API returns point in time list of all 'cached' cipherKeys for a given encryption domainId.
|
||||
std::vector<Reference<BlobCipherKey>> getAllCiphers(const BlobCipherDomainId& domainId);
|
||||
std::vector<Reference<BlobCipherKey>> getAllCiphers(const EncryptCipherDomainId& domainId);
|
||||
|
||||
// API enables dropping all 'cached' cipherKeys for a given encryption domain Id.
|
||||
// Useful to cleanup cache if an encryption domain gets removed/destroyed etc.
|
||||
void resetEncyrptDomainId(const BlobCipherDomainId domainId);
|
||||
|
||||
void resetEncyrptDomainId(const EncryptCipherDomainId domainId);
|
||||
|
||||
static BlobCipherKeyCache& getInstance() {
|
||||
static BlobCipherKeyCache instance;
|
||||
|
@ -262,14 +312,19 @@ private:
|
|||
// This interface enables data block encryption. An invocation to encrypt() will
|
||||
// do two things:
|
||||
// 1) generate encrypted ciphertext for given plaintext input.
|
||||
// 2) generate BlobCipherEncryptHeader (including the 'header checksum') and persit for decryption on reads.
|
||||
// 2) generate BlobCipherEncryptHeader (including the 'header authTokens') and persit for decryption on reads.
|
||||
|
||||
class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted<EncryptBlobCipherAes265Ctr> {
|
||||
public:
|
||||
static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1;
|
||||
|
||||
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key, const uint8_t* iv, const int ivLen);
|
||||
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey,
|
||||
Reference<BlobCipherKey> hCipherKey,
|
||||
const uint8_t* iv,
|
||||
const int ivLen,
|
||||
const EncryptAuthTokenMode mode);
|
||||
~EncryptBlobCipherAes265Ctr();
|
||||
|
||||
Reference<EncryptBuf> encrypt(const uint8_t* plaintext,
|
||||
const int plaintextLen,
|
||||
BlobCipherEncryptHeader* header,
|
||||
|
@ -277,7 +332,9 @@ public:
|
|||
|
||||
private:
|
||||
EVP_CIPHER_CTX* ctx;
|
||||
Reference<BlobCipherKey> cipherKey;
|
||||
Reference<BlobCipherKey> textCipherKey;
|
||||
Reference<BlobCipherKey> headerCipherKey;
|
||||
EncryptAuthTokenMode authTokenMode;
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
};
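A minimal end-to-end usage sketch for the new constructor signature, mirroring the SingleAuthMode block of the flow/BlobCipher unit test in this patch; domainId, data and dataLen are assumed to be supplied by the caller, and the snippet is not part of the diff itself.

// Illustrative sketch: fetch the latest text/header keys, encrypt, then decrypt and verify.
Arena arena;
BlobCipherKeyCache& cache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> textKey = cache.getLatestCipherKey(domainId);
Reference<BlobCipherKey> headerKey = cache.getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);

uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);

EncryptBlobCipherAes265Ctr encryptor(
    textKey, headerKey, &iv[0], AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(data, dataLen, &header, arena);

DecryptBlobCipherAes256Ctr decryptor(textKey, headerKey, &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), dataLen, header, arena);
ASSERT_EQ(memcmp(decrypted->begin(), data, dataLen), 0);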
|
||||
|
||||
|
@ -286,19 +343,43 @@ private:
|
|||
|
||||
class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted<DecryptBlobCipherAes256Ctr> {
|
||||
public:
|
||||
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv);
|
||||
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey,
|
||||
Reference<BlobCipherKey> hCipherKey,
|
||||
const uint8_t* iv);
|
||||
~DecryptBlobCipherAes256Ctr();
|
||||
|
||||
Reference<EncryptBuf> decrypt(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
Arena&);
|
||||
|
||||
// Enable caller to validate encryption header auth-token (if available) without needing to read the full encyrpted
|
||||
// payload. The call is NOP unless header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI.
|
||||
|
||||
void verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena);
|
||||
|
||||
private:
|
||||
EVP_CIPHER_CTX* ctx;
|
||||
Reference<BlobCipherKey> textCipherKey;
|
||||
Reference<BlobCipherKey> headerCipherKey;
|
||||
bool headerAuthTokenValidationDone;
|
||||
bool authTokensValidationDone;
|
||||
|
||||
void verifyEncryptBlobHeader(const uint8_t* cipherText,
|
||||
void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header);
|
||||
void verifyAuthTokens(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* buff,
|
||||
Arena& arena);
|
||||
void verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* buff,
|
||||
Arena& arena);
|
||||
void verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* buff,
|
||||
Arena& arena);
|
||||
};
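With ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI the persisted header carries its own auth token, so a reader can sanity-check the header before paying for the full payload read (e.g. a large backup file). A minimal sketch, not part of this patch; tCipherKey, hCipherKey, header, arena and the ciphertext buffer are assumed to be available to the caller.

// Illustrative sketch: validate the header auth token first, then fetch and decrypt the payload.
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decryptor.verifyHeaderAuthToken(header, arena); // throws encrypt_header_authtoken_mismatch on tampering
// Only after the header checks out, read the (potentially large) ciphertext and decrypt it.
Reference<EncryptBuf> decrypted = decryptor.decrypt(ciphertext, ciphertextLen, header, arena);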
|
||||
|
||||
|
@ -313,9 +394,10 @@ private:
|
|||
HMAC_CTX* ctx;
|
||||
};
|
||||
|
||||
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
|
||||
StringRef computeAuthToken(const uint8_t* payload,
|
||||
const int payloadLen,
|
||||
const BlobCipherRandomSalt& salt,
|
||||
const uint8_t* key,
|
||||
const int keyLen,
|
||||
Arena& arena);
|
||||
|
||||
#endif // ENCRYPTION_ENABLED
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
/*
|
||||
* EncryptUtils.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ENCRYPT_UTILS_H
|
||||
#define ENCRYPT_UTILS_H
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
#define ENCRYPT_INVALID_DOMAIN_ID 0
|
||||
#define ENCRYPT_INVALID_CIPHER_KEY_ID 0
|
||||
|
||||
#define AUTH_TOKEN_SIZE 16
|
||||
|
||||
#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -1
|
||||
#define ENCRYPT_HEADER_DOMAIN_ID -2
|
||||
|
||||
using EncryptCipherDomainId = int64_t;
|
||||
using EncryptCipherBaseKeyId = uint64_t;
|
||||
using EncryptCipherRandomSalt = uint64_t;
|
||||
|
||||
typedef enum {
|
||||
ENCRYPT_CIPHER_MODE_NONE = 0,
|
||||
ENCRYPT_CIPHER_MODE_AES_256_CTR = 1,
|
||||
ENCRYPT_CIPHER_MODE_LAST = 2
|
||||
} EncryptCipherMode;
|
||||
|
||||
static_assert(EncryptCipherMode::ENCRYPT_CIPHER_MODE_LAST <= std::numeric_limits<uint8_t>::max(),
|
||||
"EncryptCipherMode value overflow");
|
||||
|
||||
// EncryptionHeader authentication modes
// 1. NONE - No 'authentication token' generation needed for EncryptionHeader i.e. no protection against header OR
// cipherText 'tampering' and/or bit rot/flip corruptions.
// 2. Single/Multi - Encryption header would generate one or more 'authentication tokens' to protect the header against
// 'tampering' and/or bit rot/flip corruptions. Refer to BlobCipher.h for detailed usage recommendations.
// 3. LAST - Invalid mode, used for static asserts.

typedef enum {
ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE = 0,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE = 1,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI = 2,
ENCRYPT_HEADER_AUTH_TOKEN_LAST = 3 // Always the last element
} EncryptAuthTokenMode;

static_assert(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_LAST <= std::numeric_limits<uint8_t>::max(),
"EncryptHeaderAuthToken value overflow");

#endif
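BlobCipher.cpp above calls isEncryptHeaderAuthTokenModeValid() when validating header metadata, but that helper is not shown in this diff. A plausible definition consistent with that call site is sketched below; the real helper lives alongside these enums and may differ in detail.

// Sketch only: validity check for a persisted auth-token mode value.
inline bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) {
	return mode >= EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE &&
	       mode < EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_LAST;
}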
|
|
@ -46,7 +46,7 @@ void Hostname::resetToUnresolved() {
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> resolveImpl(Hostname* self) {
|
||||
ACTOR Future<Optional<NetworkAddress>> resolveImpl(Hostname* self) {
|
||||
loop {
|
||||
if (self->status == Hostname::UNRESOLVED) {
|
||||
self->status = Hostname::RESOLVING;
|
||||
|
@ -61,51 +61,52 @@ ACTOR Future<Void> resolveImpl(Hostname* self) {
|
|||
}
|
||||
self->resolvedAddress = address;
|
||||
self->status = Hostname::RESOLVED;
|
||||
break;
|
||||
self->resolveFinish.trigger();
|
||||
return self->resolvedAddress.get();
|
||||
} catch (...) {
|
||||
self->status = Hostname::UNRESOLVED;
|
||||
self->resolveFinish.trigger();
|
||||
self->resolvedAddress = Optional<NetworkAddress>();
|
||||
throw lookup_failed();
|
||||
return Optional<NetworkAddress>();
|
||||
}
|
||||
} else if (self->status == Hostname::RESOLVING) {
|
||||
wait(self->resolveFinish.onTrigger());
|
||||
if (self->status == Hostname::RESOLVED) {
|
||||
break;
|
||||
return self->resolvedAddress.get();
|
||||
}
|
||||
// Otherwise, this means other threads failed on resolve, so here we go back to the loop and try to resolve
|
||||
// again.
|
||||
} else {
|
||||
// status is RESOLVED, nothing to do.
|
||||
break;
|
||||
return self->resolvedAddress.get();
|
||||
}
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> resolveWithRetryImpl(Hostname* self) {
|
||||
ACTOR Future<NetworkAddress> resolveWithRetryImpl(Hostname* self) {
|
||||
loop {
|
||||
try {
|
||||
wait(resolveImpl(self));
|
||||
return Void();
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_actor_cancelled) {
|
||||
throw;
|
||||
Optional<NetworkAddress> address = wait(resolveImpl(self));
|
||||
if (address.present()) {
|
||||
return address.get();
|
||||
}
|
||||
wait(delay(FLOW_KNOBS->HOSTNAME_RESOLVE_DELAY));
|
||||
} catch (Error& e) {
|
||||
ASSERT(e.code() == error_code_actor_cancelled);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||

Future<Void> Hostname::resolve() {
Future<Optional<NetworkAddress>> Hostname::resolve() {
return resolveImpl(this);
}

Future<Void> Hostname::resolveWithRetry() {
Future<NetworkAddress> Hostname::resolveWithRetry() {
return resolveWithRetryImpl(this);
}
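A minimal caller-side sketch of the new return types, not part of this patch: resolve() now yields an empty Optional instead of throwing on failure, while resolveWithRetry() keeps retrying internally and only returns once an address is known (it throws only when cancelled). The actor name is illustrative.

ACTOR Future<Void> exampleResolveHostname(Hostname hostname) {
	state Optional<NetworkAddress> addr = wait(hostname.resolve());
	if (!addr.present()) {
		// Fall back to the retrying variant; it only returns once resolution has succeeded.
		NetworkAddress retried = wait(hostname.resolveWithRetry());
		addr = retried;
	}
	TraceEvent("ExampleHostnameResolved").detail("Address", addr.get());
	return Void();
}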
|
||||
|
||||
void Hostname::resolveBlocking() {
|
||||
Optional<NetworkAddress> Hostname::resolveBlocking() {
|
||||
if (status != RESOLVED) {
|
||||
try {
|
||||
std::vector<NetworkAddress> addresses =
|
||||
|
@ -121,9 +122,9 @@ void Hostname::resolveBlocking() {
|
|||
} catch (...) {
|
||||
status = UNRESOLVED;
|
||||
resolvedAddress = Optional<NetworkAddress>();
|
||||
throw lookup_failed();
|
||||
}
|
||||
}
|
||||
return resolvedAddress;
|
||||
}
|
||||
|
||||
TEST_CASE("/flow/Hostname/hostname") {
|
||||
|
@ -181,50 +182,44 @@ TEST_CASE("/flow/Hostname/hostname") {
|
|||
ASSERT(hn3.status == Hostname::UNRESOLVED && !hn3.resolvedAddress.present());
|
||||
ASSERT(hn4.status == Hostname::UNRESOLVED && !hn4.resolvedAddress.present());
|
||||
|
||||
try {
|
||||
wait(hn2.resolve());
|
||||
} catch (Error& e) {
|
||||
ASSERT(e.code() == error_code_lookup_failed);
|
||||
}
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present());
|
||||
state Optional<NetworkAddress> emptyAddress = wait(hn2.resolve());
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present() && !emptyAddress.present());
|
||||
|
||||
try {
|
||||
wait(timeoutError(hn2.resolveWithRetry(), 1));
|
||||
NetworkAddress _ = wait(timeoutError(hn2.resolveWithRetry(), 1));
|
||||
} catch (Error& e) {
|
||||
ASSERT(e.code() == error_code_timed_out);
|
||||
}
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present());
|
||||
|
||||
try {
|
||||
hn2.resolveBlocking();
|
||||
} catch (Error& e) {
|
||||
ASSERT(e.code() == error_code_lookup_failed);
|
||||
}
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present());
|
||||
emptyAddress = hn2.resolveBlocking();
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present() && !emptyAddress.present());
|
||||
|
||||
state NetworkAddress address = NetworkAddress::parse("127.0.0.0:1234");
|
||||
INetworkConnections::net()->addMockTCPEndpoint("host-name", "1234", { address });
|
||||
state NetworkAddress addressSource = NetworkAddress::parse("127.0.0.0:1234");
|
||||
INetworkConnections::net()->addMockTCPEndpoint("host-name", "1234", { addressSource });
|
||||
|
||||
// Test resolve.
|
||||
wait(hn2.resolve());
|
||||
state Optional<NetworkAddress> optionalAddress = wait(hn2.resolve());
|
||||
ASSERT(hn2.status == Hostname::RESOLVED);
|
||||
ASSERT(hn2.resolvedAddress.present() && hn2.resolvedAddress.get() == address);
|
||||
ASSERT(hn2.resolvedAddress.get() == addressSource && optionalAddress.get() == addressSource);
|
||||
optionalAddress = Optional<NetworkAddress>();
|
||||
|
||||
// Test resolveWithRetry.
|
||||
hn2.resetToUnresolved();
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present());
|
||||
|
||||
wait(hn2.resolveWithRetry());
|
||||
state NetworkAddress address = wait(hn2.resolveWithRetry());
|
||||
ASSERT(hn2.status == Hostname::RESOLVED);
|
||||
ASSERT(hn2.resolvedAddress.present() && hn2.resolvedAddress.get() == address);
|
||||
ASSERT(hn2.resolvedAddress.get() == addressSource && address == addressSource);
|
||||
|
||||
// Test resolveBlocking.
|
||||
hn2.resetToUnresolved();
|
||||
ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present());
|
||||
|
||||
hn2.resolveBlocking();
|
||||
optionalAddress = hn2.resolveBlocking();
|
||||
ASSERT(hn2.status == Hostname::RESOLVED);
|
||||
ASSERT(hn2.resolvedAddress.present() && hn2.resolvedAddress.get() == address);
|
||||
ASSERT(hn2.resolvedAddress.get() == addressSource && optionalAddress.get() == addressSource);
|
||||
optionalAddress = Optional<NetworkAddress>();
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
|
|
@ -74,10 +74,10 @@ struct Hostname {
|
|||
|
||||
Optional<NetworkAddress> resolvedAddress;
|
||||
enum HostnameStatus { UNRESOLVED, RESOLVING, RESOLVED };
|
||||
Future<Void> resolve();
|
||||
Future<Void> resolveWithRetry();
|
||||
void resolveBlocking(); // This one should only be used when resolving asynchronously is impossible.
|
||||
// For all other cases, resolve() should be preferred.
|
||||
Future<Optional<NetworkAddress>> resolve();
|
||||
Future<NetworkAddress> resolveWithRetry();
|
||||
Optional<NetworkAddress> resolveBlocking(); // This one should only be used when resolving asynchronously is
|
||||
// impossible. For all other cases, resolve() should be preferred.
|
||||
void resetToUnresolved();
|
||||
HostnameStatus status = UNRESOLVED;
|
||||
AsyncTrigger resolveFinish;
|
||||
|
|
|
@ -963,7 +963,7 @@ struct DynamicFieldBase {
|
|||
if (getDerivedTypeName() == metricTypeName<T>())
|
||||
return (DynamicField<T>*)this;
|
||||
|
||||
TraceEvent(SevWarnAlways, "ScopeEventFieldTypeMismatch")
|
||||
TraceEvent(g_network->isSimulated() ? SevError : SevWarnAlways, "ScopeEventFieldTypeMismatch")
|
||||
.detail("EventType", eventType.toString())
|
||||
.detail("FieldName", fieldName().toString())
|
||||
.detail("OldType", getDerivedTypeName().toString())
|
||||
|
|
flow/Trace.h
|
@ -35,6 +35,7 @@
|
|||
|
||||
#define TRACE_DEFAULT_ROLL_SIZE (10 << 20)
|
||||
#define TRACE_DEFAULT_MAX_LOGS_SIZE (10 * TRACE_DEFAULT_ROLL_SIZE)
|
||||
#define PRINTABLE_COMPRESS_NULLS 0
|
||||
|
||||
inline int fastrand() {
|
||||
static int g_seed = 0;
|
||||
|
@@ -343,20 +344,37 @@ struct TraceableStringImpl : std::true_type {
        }
        std::string result;
        result.reserve(size - nonPrintables + (nonPrintables * 4) + numBackslashes);
        int numNull = 0;
        for (auto iter = TraceableString<T>::begin(value); !TraceableString<T>::atEnd(value, iter); ++iter) {
            if (*iter == '\\') {
                if (numNull > 0) {
                    result += format("[%d]", numNull);
                    numNull = 0;
                }
                result.push_back('\\');
                result.push_back('\\');
            } else if (isPrintable(*iter)) {
                if (numNull > 0) {
                    result += format("[%d]", numNull);
                    numNull = 0;
                }
                result.push_back(*iter);
            } else {
                const uint8_t byte = *iter;
                if (PRINTABLE_COMPRESS_NULLS && byte == 0) {
                    numNull++;
                } else {
                    result.push_back('\\');
                    result.push_back('x');
                    result.push_back(base16Char(byte / 16));
                    result.push_back(base16Char(byte));
                }
            }
        }
        if (numNull > 0) {
            result += format("[%d]", numNull);
            numNull = 0;
        }
        return result;
    }
};
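With PRINTABLE_COMPRESS_NULLS enabled, the escaping loop above collapses a run of NUL bytes into a single [count] marker instead of emitting \x00 once per byte. A self-contained sketch of the same idea in plain C++, without the codebase's format()/TraceableString helpers; the ASCII range check here is an assumption standing in for the exact isPrintable used above:

#include <cstdio>
#include <string>

// Escape a byte string for logging: printable bytes pass through, backslashes are
// doubled, other bytes become \xNN, and runs of NUL bytes compress to [N].
std::string printableCompressed(const std::string& value) {
    std::string result;
    int numNull = 0;
    auto flushNulls = [&] {
        if (numNull > 0) {
            result += "[" + std::to_string(numNull) + "]";
            numNull = 0;
        }
    };
    for (unsigned char byte : value) {
        if (byte == '\\') {
            flushNulls();
            result += "\\\\";
        } else if (byte >= 32 && byte < 127) {
            flushNulls();
            result.push_back(byte);
        } else if (byte == 0) {
            numNull++;
        } else {
            flushNulls();
            char buf[8];
            std::snprintf(buf, sizeof(buf), "\\x%02x", byte);
            result += buf;
        }
    }
    flushNulls();
    return result;
}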
@@ -73,3 +73,11 @@ T waitNext(const FutureStream<T>&);
#ifdef _MSC_VER
#pragma warning(disable : 4355) // 'this' : used in base member initializer list
#endif

// Currently, #ifdef can't be used inside actors, so define no-op versions of these valgrind
// functions if valgrind is not defined
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
#define VALGRIND_CHECK_MEM_IS_DEFINED(x, y) 0
#endif
@@ -88,6 +88,14 @@ ERROR( blob_granule_transaction_too_old, 1064, "Read version is older than blob
ERROR( blob_manager_replaced, 1065, "This blob manager has been replaced." )
ERROR( change_feed_popped, 1066, "Tried to read a version older than what has been popped from the change feed" )
ERROR( remote_kvs_cancelled, 1067, "The remote key-value store is cancelled" )
ERROR( page_header_wrong_page_id, 1068, "Page header does not match location on disk" )
ERROR( page_header_checksum_failed, 1069, "Page header checksum failed" )
ERROR( page_header_version_not_supported, 1070, "Page header version is not supported" )
ERROR( page_encoding_not_supported, 1071, "Page encoding type is not supported or not valid" )
ERROR( page_decoding_failed, 1072, "Page content decoding failed" )
ERROR( unexpected_encoding_type, 1073, "Page content decoding failed" )
ERROR( encryption_key_not_found, 1074, "Encryption key not found" )
ERROR( stale_version_vector, 1075, "Client version vector is stale" )

ERROR( broken_promise, 1100, "Broken promise" )
ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" )
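The new 106x/107x codes follow the usual ERROR macro expansion in this file, which provides an error_code_<name> constant and a <name>() factory for each entry. A hedged sketch of how a pager read path might react to the new page errors inside an actor body; readPageFromDisk and pageID are hypothetical names used only for illustration:

    try {
        wait(readPageFromDisk(pageID));
    } catch (Error& e) {
        if (e.code() == error_code_page_header_wrong_page_id ||
            e.code() == error_code_page_header_checksum_failed) {
            // Both indicate on-disk corruption of this page; surface it as a decode failure.
            throw page_decoding_failed();
        }
        throw e;
    }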
@@ -290,14 +298,14 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string")

// 3XXX - Encryption operations errors
ERROR( encrypt_ops_error, 3000, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired")
ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch")
ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id")
// 27XX - Encryption operations errors
ERROR( encrypt_ops_error, 2700, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 2701, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 2702, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 2703, "Expected encryption key TTL has expired")
ERROR( encrypt_header_authtoken_mismatch, 2704, "Encryption header authentication token mismatch")
ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 2706, "Invalid encryption domainId or encryption cipher key id")

// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
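This hunk moves the encryption errors from the 3xxx range into 27xx and replaces encrypt_header_checksum_mismatch with encrypt_header_authtoken_mismatch. Because call sites use the generated symbols rather than raw numbers, the renumbering only changes the value carried by the Error object, as this small hypothetical check illustrates:

void encryptionErrorExample() {
    // The symbol stays the same; only its numeric code moves from 3002 to 2702.
    Error err = encrypt_key_not_found();
    ASSERT(err.code() == error_code_encrypt_key_not_found);
}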
@@ -21,6 +21,7 @@
#pragma once

#include <algorithm>
#include <boost/container/flat_map.hpp>
#include <iterator>
#include <cstring>
#include <functional>
@@ -239,6 +240,34 @@ struct vector_like_traits<std::unordered_map<Key, T, Hash, Pred, Allocator>> : s
    }
};

template <class Key, class T, class Compare, class Allocator>
struct vector_like_traits<boost::container::flat_map<Key, T, Compare, Allocator>> : std::true_type {
    using Vec = boost::container::flat_map<Key, T, Compare, Allocator>;
    using value_type = std::pair<Key, T>;
    using iterator = typename Vec::const_iterator;
    using insert_iterator = std::insert_iterator<Vec>;

    template <class Context>
    static size_t num_entries(const Vec& v, Context&) {
        return v.size();
    }

    // Return an insert_iterator starting with an empty vector. |size| is the
    // number of elements to be inserted. Implementations may want to allocate
    // enough memory up front to hold |size| elements.
    template <class Context>
    static insert_iterator insert(Vec& v, size_t size, Context&) {
        v.clear();
        v.reserve(size);
        return std::inserter(v, v.end());
    }

    template <class Context>
    static iterator begin(const Vec& v, Context&) {
        return v.begin();
    }
};

template <class Key, class Compare, class Allocator>
struct vector_like_traits<std::set<Key, Compare, Allocator>> : std::true_type {
    using Vec = std::set<Key, Compare, Allocator>;
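The new vector_like_traits specialization lets the object serializer treat a boost::container::flat_map like any other vector-like container. A hedged sketch of a message struct that could rely on it; the struct, its fields, and the file_identifier value are illustrative and not taken from this diff:

struct ShardAssignments {
    constexpr static FileIdentifier file_identifier = 1234567; // illustrative value
    boost::container::flat_map<UID, int64_t> shardsPerServer;

    template <class Ar>
    void serialize(Ar& ar) {
        // The flat_map member now round-trips through serializer() like a vector.
        serializer(ar, shardsPerServer);
    }
};

Since flat_map keeps its entries sorted by the comparator, iteration (and therefore the serialized layout) is deterministic, which is convenient for wire messages.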
@@ -84,6 +84,7 @@ enum class TaskPriority {
    GetConsistentReadVersion = 8500,
    GetLiveCommittedVersionReply = 8490,
    GetLiveCommittedVersion = 8480,
    GetTLogPrevCommitVersion = 8400,
    UpdateRecoveryTransactionVersion = 8470,
    DefaultPromiseEndpoint = 8000,
    DefaultOnMainThread = 7500,
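The added UpdateRecoveryTransactionVersion priority (8470) slots between GetLiveCommittedVersion (8480) and GetTLogPrevCommitVersion (8400). A minimal sketch of scheduling work at the new priority via flow's delay(); the actor itself is hypothetical:

ACTOR Future<Void> updateRecoveryVersionExample() {
    // Yield and resume at the new priority before doing the version update.
    wait(delay(0, TaskPriority::UpdateRecoveryTransactionVersion));
    return Void();
}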
Some files were not shown because too many files have changed in this diff.