Merge branch 'main' into readaware

This commit is contained in:
Xiaoxi Wang 2022-04-12 16:47:15 -07:00
commit ed97a35dc0
77 changed files with 4455 additions and 1797 deletions

View File

@ -466,6 +466,27 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db
}).extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_database_purge_blob_granules(FDBDatabase* db,
                                                                 uint8_t const* begin_key_name,
                                                                 int begin_key_name_length,
                                                                 uint8_t const* end_key_name,
                                                                 int end_key_name_length,
                                                                 int64_t purge_version,
                                                                 fdb_bool_t force) {
	// Purge blob granule data in [begin, end) up to purge_version. The returned
	// future resolves to a purge key that can be handed to
	// fdb_database_wait_purge_granules_complete to wait for completion.
	KeyRangeRef purgeRange(StringRef(begin_key_name, begin_key_name_length),
	                       StringRef(end_key_name, end_key_name_length));
	return (FDBFuture*)(DB(db)->purgeBlobGranules(purgeRange, purge_version, force).extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDatabase* db,
                                                                          uint8_t const* purge_key_name,
                                                                          int purge_key_name_length) {
	// Wait for the purge identified by this purge key (as returned by
	// fdb_database_purge_blob_granules) to finish.
	StringRef purgeKey(purge_key_name, purge_key_name_length);
	return (FDBFuture*)(DB(db)->waitPurgeGranulesComplete(purgeKey).extractPtr());
}
// Creates a new transaction on the given tenant. On success *out_transaction
// receives the new transaction handle; any thrown error is converted into the
// returned fdb_error_t by CATCH_AND_RETURN.
extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) {
	CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr(););
}
@ -619,23 +640,23 @@ FDBFuture* fdb_transaction_get_range_impl(FDBTransaction* tr,
.extractPtr());
}
FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
fdb_bool_t begin_or_equal,
int begin_offset,
uint8_t const* end_key_name,
int end_key_name_length,
fdb_bool_t end_or_equal,
int end_offset,
uint8_t const* mapper_name,
int mapper_name_length,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
fdb_bool_t snapshot,
fdb_bool_t reverse) {
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
fdb_bool_t begin_or_equal,
int begin_offset,
uint8_t const* end_key_name,
int end_key_name_length,
fdb_bool_t end_or_equal,
int end_offset,
uint8_t const* mapper_name,
int mapper_name_length,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
fdb_bool_t snapshot,
fdb_bool_t reverse) {
FDBFuture* r = validate_and_update_parameters(limit, target_bytes, mode, iteration, reverse);
if (r != nullptr)
return r;
@ -651,25 +672,24 @@ FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr,
.extractPtr());
}
// TODO: Support FDB_API_ADDED in generate_asm.py and then this can be replaced with fdb_api_ptr_unimpl.
FDBFuture* fdb_transaction_get_mapped_range_v699(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
fdb_bool_t begin_or_equal,
int begin_offset,
uint8_t const* end_key_name,
int end_key_name_length,
fdb_bool_t end_or_equal,
int end_offset,
uint8_t const* mapper_name,
int mapper_name_length,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
fdb_bool_t snapshot,
fdb_bool_t reverse) {
fprintf(stderr, "UNIMPLEMENTED FDB API FUNCTION\n");
// Stub for the API removed after 7.0 (registered via FDB_API_REMOVED below).
// Intentionally aborts with an upgrade message; callers must migrate to
// fdb_transaction_get_mapped_range. All parameters are deliberately unused.
FDBFuture* fdb_transaction_get_range_and_flat_map_v709(FDBTransaction* tr,
                                                       uint8_t const* begin_key_name,
                                                       int begin_key_name_length,
                                                       fdb_bool_t begin_or_equal,
                                                       int begin_offset,
                                                       uint8_t const* end_key_name,
                                                       int end_key_name_length,
                                                       fdb_bool_t end_or_equal,
                                                       int end_offset,
                                                       uint8_t const* mapper_name,
                                                       int mapper_name_length,
                                                       int limit,
                                                       int target_bytes,
                                                       FDBStreamingMode mode,
                                                       int iteration,
                                                       fdb_bool_t snapshot,
                                                       fdb_bool_t reverse) {
	fprintf(stderr, "GetRangeAndFlatMap is removed from 7.0. Please upgrade to 7.1 and use GetMappedRange\n");
	abort();
}
@ -900,13 +920,13 @@ extern "C" DLLEXPORT fdb_error_t fdb_select_api_version_impl(int runtime_version
// Versioned API changes -- descending order by version (new changes at top)
// FDB_API_CHANGED( function, ver ) means there is a new implementation as of ver, and a function function_(ver-1)
// is the old implementation FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and
// is the old implementation. FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and
// function_(ver-1) is the old implementation
//
// WARNING: use caution when implementing removed functions by calling public API functions. This can lead to
// undesired behavior when using the multi-version API. Instead, it is better to have both the removed and public
// functions call an internal implementation function. See fdb_create_database_impl for an example.
FDB_API_CHANGED(fdb_transaction_get_mapped_range, 700);
FDB_API_REMOVED(fdb_transaction_get_range_and_flat_map, 710);
FDB_API_REMOVED(fdb_future_get_version, 620);
FDB_API_REMOVED(fdb_create_cluster, 610);
FDB_API_REMOVED(fdb_cluster_create_database, 610);

View File

@ -299,6 +299,18 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_purge_blob_granules(FDBDatabase* db,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t purge_version,
fdb_bool_t force);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDatabase* db,
uint8_t const* purge_key_name,
int purge_key_name_length);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant,
FDBTransaction** out_transaction);

View File

@ -130,6 +130,25 @@ EmptyFuture Database::create_snapshot(FDBDatabase* db,
return EmptyFuture(fdb_database_create_snapshot(db, uid, uid_length, snap_command, snap_command_length));
}
KeyFuture Database::purge_blob_granules(FDBDatabase* db,
                                        std::string_view begin_key,
                                        std::string_view end_key,
                                        int64_t purge_version,
                                        fdb_bool_t force) {
	// Thin wrapper translating the string_views into the pointer/length pairs
	// expected by the C API.
	const uint8_t* beginPtr = (const uint8_t*)begin_key.data();
	const uint8_t* endPtr = (const uint8_t*)end_key.data();
	return KeyFuture(fdb_database_purge_blob_granules(
	    db, beginPtr, begin_key.size(), endPtr, end_key.size(), purge_version, force));
}
EmptyFuture Database::wait_purge_granules_complete(FDBDatabase* db, std::string_view purge_key) {
	// Forward the purge key (as returned by purge_blob_granules) to the C API.
	const uint8_t* keyPtr = (const uint8_t*)purge_key.data();
	return EmptyFuture(fdb_database_wait_purge_granules_complete(db, keyPtr, purge_key.size()));
}
// Tenant
Tenant::Tenant(FDBDatabase* db, const uint8_t* name, int name_length) {
if (fdb_error_t err = fdb_database_open_tenant(db, name, name_length, &tenant)) {

View File

@ -97,6 +97,7 @@ public:
private:
friend class Transaction;
friend class Database;
KeyFuture(FDBFuture* f) : Future(f) {}
};
@ -201,6 +202,14 @@ public:
int uid_length,
const uint8_t* snap_command,
int snap_command_length);
static KeyFuture purge_blob_granules(FDBDatabase* db,
std::string_view begin_key,
std::string_view end_key,
int64_t purge_version,
fdb_bool_t force);
static EmptyFuture wait_purge_granules_complete(FDBDatabase* db, std::string_view purge_key);
};
class Tenant final {

View File

@ -2592,7 +2592,6 @@ TEST_CASE("Blob Granule Functions") {
}
// write some data
insert_data(db, create_data({ { "bg1", "a" }, { "bg2", "b" }, { "bg3", "c" } }));
// because wiring up files is non-trivial, just test the calls complete with the expected no_materialize error
@ -2709,6 +2708,42 @@ TEST_CASE("Blob Granule Functions") {
tr.reset();
break;
}
// do a purge + wait at that version to purge everything before originalReadVersion
fdb::KeyFuture purgeKeyFuture =
fdb::Database::purge_blob_granules(db, key("bg"), key("bh"), originalReadVersion, false);
fdb_check(wait_future(purgeKeyFuture));
const uint8_t* purgeKeyData;
int purgeKeyLen;
fdb_check(purgeKeyFuture.get(&purgeKeyData, &purgeKeyLen));
std::string purgeKey((const char*)purgeKeyData, purgeKeyLen);
fdb::EmptyFuture waitPurgeFuture = fdb::Database::wait_purge_granules_complete(db, purgeKey);
fdb_check(wait_future(waitPurgeFuture));
// re-read again at the purge version to make sure it is still valid
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
fdb::KeyValueArrayResult r =
tr.read_blob_granules(key("bg"), key("bh"), 0, originalReadVersion, granuleContext);
fdb_error_t err = r.get(&out_kv, &out_count, &out_more);
if (err && err != 2037 /* blob_granule_not_materialized */) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
CHECK(err == 2037 /* blob_granule_not_materialized */);
tr.reset();
break;
}
}
int main(int argc, char** argv) {

View File

@ -260,6 +260,45 @@ def suspend(logger):
assert get_value_from_status_json(False, 'client', 'database_status', 'available')
def extract_version_epoch(cli_output):
    # The epoch is the last whitespace-separated token on the last line of the
    # fdbcli output.
    last_line = cli_output.split("\n")[-1]
    return int(last_line.split(" ")[-1])
@enable_logging()
def targetversion(logger):
    # The version epoch should start out unset.
    assert run_fdbcli_command('targetversion getepoch') == "Version epoch is unset"
    initial_version = int(run_fdbcli_command('getversion'))
    logger.debug("read version: {}".format(initial_version))
    assert initial_version >= 0
    # Install the default version epoch.
    logger.debug("setting version epoch to default")
    run_fdbcli_command('targetversion add 0')
    # Read back the epoch that was just installed.
    epoch_after_add = extract_version_epoch(run_fdbcli_command('targetversion getepoch'))
    logger.debug("version epoch: {}".format(epoch_after_add))
    # Read versions must not go backwards after enabling the epoch.
    version_after_add = int(run_fdbcli_command('getversion'))
    logger.debug("read version: {}".format(version_after_add))
    assert version_after_add >= initial_version
    # Bump the epoch forward by one million versions.
    epoch_after_set = extract_version_epoch(
        run_fdbcli_command("targetversion setepoch {}".format(epoch_after_add + 1000000)))
    logger.debug("version epoch: {}".format(epoch_after_set))
    assert epoch_after_set == epoch_after_add + 1000000
    # Shift it back by the same amount; it should land on the original epoch.
    epoch_after_sub = extract_version_epoch(run_fdbcli_command("targetversion add {}".format(-1000000)))
    logger.debug("version epoch: {}".format(epoch_after_sub))
    assert epoch_after_sub == epoch_after_set - 1000000 == epoch_after_add
    # Versions should still be monotonically increasing.
    final_version = int(run_fdbcli_command('getversion'))
    logger.debug("read version: {}".format(final_version))
    assert final_version >= version_after_add
    # Clear the epoch and verify it reads back as unset.
    run_fdbcli_command("targetversion clearepoch")
    assert run_fdbcli_command('targetversion getepoch') == "Version epoch is unset"
def get_value_from_status_json(retry, *args):
while True:
result = json.loads(run_fdbcli_command('status', 'json'))
@ -685,6 +724,9 @@ if __name__ == '__main__':
throttle()
triggerddteaminfolog()
tenants()
# TODO: similar to advanceversion, this seems to cause some issues, so disable for now
# This must go last, otherwise the version advancement can mess with the other tests
# targetversion()
else:
assert args.process_number > 1, "Process number should be positive"
coordinators()

View File

@ -52,7 +52,6 @@ mark_as_advanced(
if (GPERFTOOLS_FOUND)
add_library(gperftools UNKNOWN IMPORTED)
target_compile_definitions(gperftools PUBLIC USE_GPERFTOOLS)
set_target_properties(gperftools PROPERTIES
IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC_AND_PROFILER}
INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")

View File

@ -3,6 +3,40 @@
.. code-block:: javascript
"cluster":{
"storage_wiggler": {
"wiggle_server_ids":["0ccb4e0feddb55"],
"wiggle_server_addresses": ["127.0.0.1"],
"primary": { // primary DC storage wiggler stats
// One StorageServer wiggle round is considered 'complete', when all StorageServers with creationTime < T are wiggled
"last_round_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_round_start_timestamp": 1648857905.123, // when did the latest round start
"last_round_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_round_finish_timestamp": 0, // when did the latest finished round finish
"smoothed_round_seconds": 1, // moving average duration of a wiggle round
"finished_round": 1,
// 1 wiggle step as 1 storage server is wiggled in the current round
"last_wiggle_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_wiggle_start_timestamp": 1648857905.123, // when did the latest wiggle step start
"last_wiggle_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
},
"remote": { // remote DC storage wiggler stats
"last_round_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_round_start_timestamp": 1648857905.123,
"last_round_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_wiggle_start_timestamp": 1648857905.123,
"last_wiggle_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
}
},
"layers":{
"_valid":true,
"_error":"some error description"

View File

@ -14,7 +14,7 @@ Summary
============
Perpetual storage wiggle is a feature that forces the data distributor to constantly build new storage teams when the cluster is healthy. On a high-level note, the process is like this:
Order storage servers by process id. For each storage server n:
Order storage servers by their creation time, from oldest to newest. For each storage server n:
a. Exclude storage server n.
@ -22,7 +22,7 @@ b. Wait until all data has been moved off the storage server.
c. Include storage n
Goto a to wiggle the next storage process with different process id.
Goto step a to wiggle the next storage server.
With a perpetual wiggle, storage migrations will be much less impactful. The wiggler will detect the healthy status based on healthy teams, available disk space and the number of unhealthy relocations. It will pause the wiggle until the cluster is healthy again.
@ -47,7 +47,8 @@ Disable perpetual storage wiggle locality matching filter, which wiggles all the
Monitor
=======
The ``status`` command in the FDB :ref:`command line interface <command-line-interface>` will show the current perpetual_storage_wiggle value.
* The ``status`` command will report the IP address of the Storage Server under wiggling.
* The ``status json`` command in the FDB :ref:`command line interface <command-line-interface>` will show the current `perpetual_storage_wiggle` value. Plus, the ``cluster.storage_wiggler`` field reports storage wiggle details.
Trace Events
----------------------

View File

@ -28,7 +28,6 @@ Features
* Improved the efficiency with which storage servers replicate data between themselves. `(PR #5017) <https://github.com/apple/foundationdb/pull/5017>`_
* Added support to ``exclude command`` to exclude based on locality match. `(PR #5113) <https://github.com/apple/foundationdb/pull/5113>`_
* Add the ``trace_partial_file_suffix`` network option. This option will give unfinished trace files a special suffix to indicate they're not complete yet. When the trace file is complete, it is renamed to remove the suffix. `(PR #5328) <https://github.com/apple/foundationdb/pull/5328>`_
* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layer can push some computations down to FDB, to improve latency and bandwidth when read. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
Performance
-----------
@ -85,8 +84,6 @@ Bindings
* C: Added a function, ``fdb_database_create_snapshot``, to create a snapshot of the database. `(PR #4241) <https://github.com/apple/foundationdb/pull/4241/files>`_
* C: Added ``fdb_database_get_main_thread_busyness`` function to report how busy a client's main thread is. `(PR #4504) <https://github.com/apple/foundationdb/pull/4504>`_
* Java: Added ``Database.getMainThreadBusyness`` function to report how busy a client's main thread is. `(PR #4564) <https://github.com/apple/foundationdb/pull/4564>`_
* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
Other Changes
-------------

View File

@ -10,6 +10,7 @@ Release Notes
Features
--------
* Added ``USE_GRV_CACHE`` transaction option to allow read versions to be locally cached on the client side for latency optimizations. `(PR #5725) <https://github.com/apple/foundationdb/pull/5725>`_ `(PR #6664) <https://github.com/apple/foundationdb/pull/6664>`_
* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layers can push some computations down to FDB to improve latency and bandwidth when reading. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_, `(PR #6181) <https://github.com/apple/foundationdb/pull/6181>`_, etc.
Performance
-----------
@ -22,14 +23,18 @@ Fixes
Status
------
* Added ``cluster.storage_wiggler`` field to report storage wiggle stats `(PR #6219) <https://github.com/apple/foundationdb/pull/6219>`_
Bindings
--------
* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
Other Changes
-------------
* OpenTracing support is now deprecated in favor of OpenTelemetry tracing, which will be enabled in a future release. `(PR #6478) <https://github.com/apple/foundationdb/pull/6478/files>`_
* Changed ``memory`` option to limit resident memory instead of virtual memory. Added a new ``memory_vsize`` option if limiting virtual memory is desired. `(PR #6719) <https://github.com/apple/foundationdb/pull/6719>`_
* Changed ``perpetual storage wiggle`` to wiggle the storage servers based on their creation time. `(PR #6219) <https://github.com/apple/foundationdb/pull/6219>`_
Earlier release notes
---------------------

View File

@ -29,6 +29,7 @@ set(FDBCLI_SRCS
TriggerDDTeamInfoLogCommand.actor.cpp
TssqCommand.actor.cpp
Util.actor.cpp
VersionEpochCommand.actor.cpp
linenoise/linenoise.h)
if(NOT WIN32)

View File

@ -190,6 +190,11 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
case ConfigurationResult::DATABASE_CREATED:
printf("Database created\n");
break;
case ConfigurationResult::DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL:
printf("Database created\n");
fprintf(stderr,
"WARN: RocksDB storage engine type is still in experimental stage, not yet production tested.\n");
break;
case ConfigurationResult::DATABASE_UNAVAILABLE:
fprintf(stderr, "ERROR: The database is unavailable\n");
fprintf(stderr, "Type `configure FORCE <TOKEN...>' to configure without this check\n");
@ -250,6 +255,11 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
"storage_migration_type=gradual' to set the gradual migration type.\n");
ret = false;
break;
case ConfigurationResult::SUCCESS_WARN_ROCKSDB_EXPERIMENTAL:
printf("Configuration changed\n");
fprintf(stderr,
"WARN: RocksDB storage engine type is still in experimental stage, not yet production tested.\n");
break;
default:
ASSERT(false);
ret = false;

View File

@ -0,0 +1,174 @@
/*
* VersionEpochCommand.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "boost/lexical_cast.hpp"
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/IClientApi.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace fdb_cli {
// Special-key-space location used by this command to read and write the
// version epoch (writes require SPECIAL_KEY_SPACE_ENABLE_WRITES; see below).
const KeyRef versionEpochSpecialKey = LiteralStringRef("\xff\xff/management/version_epoch");
// Pairs the cluster's current read version with the version we would expect
// it to have given the configured version epoch and the wall clock.
struct VersionInfo {
	int64_t version; // read version obtained from the cluster
	int64_t expectedVersion; // version implied by the epoch and current time
};
// Reads the cluster's current read version and, when a version epoch is set,
// the version the cluster would be expected to have based on the wall clock.
// Returns an empty Optional when no version epoch is set. Retries via
// onError on transient failures.
ACTOR static Future<Optional<VersionInfo>> getVersionInfo(Reference<IDatabase> db) {
	state Reference<ITransaction> tr = db->createTransaction();
	loop {
		try {
			tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			state Version rv = wait(safeThreadFutureToFuture(tr->getReadVersion()));
			state ThreadFuture<Optional<Value>> versionEpochValFuture = tr->get(versionEpochKey);
			Optional<Value> versionEpochVal = wait(safeThreadFutureToFuture(versionEpochValFuture));
			if (!versionEpochVal.present()) {
				return Optional<VersionInfo>();
			}
			int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochVal.get(), Unversioned());
			// Expected version = seconds since the epoch, scaled to versions per second.
			// NOTE(review): computed with CORE_VERSIONSPERSECOND here but the
			// difference is displayed using VERSIONS_PER_SECOND in the command
			// actor — confirm the two knobs are intended to agree.
			int64_t expected = g_network->timer() * CLIENT_KNOBS->CORE_VERSIONSPERSECOND - versionEpoch;
			return VersionInfo{ rv, expected };
		} catch (Error& e) {
			wait(safeThreadFutureToFuture(tr->onError(e)));
		}
	}
}
// Reads the version epoch through the special key space using the supplied
// transaction. Returns an empty Optional when the epoch is unset; retries via
// onError on transient failures.
ACTOR static Future<Optional<int64_t>> getVersionEpoch(Reference<ITransaction> tr) {
	loop {
		try {
			state ThreadFuture<Optional<Value>> versionEpochValFuture = tr->get(versionEpochSpecialKey);
			Optional<Value> versionEpochVal = wait(safeThreadFutureToFuture(versionEpochValFuture));
			// The special key exposes the epoch as a decimal string.
			return versionEpochVal.present() ? boost::lexical_cast<int64_t>(versionEpochVal.get().toString())
			                                 : Optional<int64_t>();
		} catch (Error& e) {
			wait(safeThreadFutureToFuture(tr->onError(e)));
		}
	}
}
// Implements the fdbcli `versionepoch` command. Subcommands handled here:
//   (none)    - report actual vs expected version and their difference
//   get       - print the current version epoch, if set
//   enable    - set the version epoch to the default (0) if unset
//   set EPOCH - set the version epoch to EPOCH
//   disable   - clear the version epoch
//   commit    - advance the cluster to the expected version
// Returns true on success; prints usage and returns false on malformed input.
ACTOR Future<bool> versionEpochCommandActor(Reference<IDatabase> db, Database cx, std::vector<StringRef> tokens) {
	if (tokens.size() <= 3) {
		state Reference<ITransaction> tr = db->createTransaction();
		if (tokens.size() == 1) {
			// No subcommand: report the offset between the expected and actual version.
			Optional<VersionInfo> versionInfo = wait(getVersionInfo(db));
			if (versionInfo.present()) {
				int64_t diff = versionInfo.get().expectedVersion - versionInfo.get().version;
				printf("Version: %" PRId64 "\n", versionInfo.get().version);
				printf("Expected: %" PRId64 "\n", versionInfo.get().expectedVersion);
				printf("Difference: %" PRId64 " (%.2fs)\n", diff, 1.0 * diff / CLIENT_KNOBS->VERSIONS_PER_SECOND);
			} else {
				printf("Version epoch is unset\n");
			}
			return true;
		} else if (tokens.size() == 2 && tokencmp(tokens[1], "get")) {
			// Print the raw epoch value without computing version expectations.
			Optional<int64_t> versionEpoch = wait(getVersionEpoch(db->createTransaction()));
			if (versionEpoch.present()) {
				printf("Current version epoch is %" PRId64 "\n", versionEpoch.get());
			} else {
				printf("Version epoch is unset\n");
			}
			return true;
		} else if (tokens.size() == 2 && tokencmp(tokens[1], "disable")) {
			// Clearing the version epoch means versions will no longer attempt
			// to advance at the same rate as the clock. The current version
			// will remain unchanged.
			loop {
				try {
					tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
					// NOTE(review): the epoch is read through a freshly created
					// transaction while the clear is issued on `tr` — confirm the
					// read and the clear are not meant to share one transaction.
					Optional<int64_t> versionEpoch = wait(getVersionEpoch(db->createTransaction()));
					if (!versionEpoch.present()) {
						// Already unset; nothing to do.
						return true;
					} else {
						tr->clear(versionEpochSpecialKey);
						wait(safeThreadFutureToFuture(tr->commit()));
					}
				} catch (Error& e) {
					wait(safeThreadFutureToFuture(tr->onError(e)));
				}
			}
		} else if ((tokens.size() == 2 && tokencmp(tokens[1], "enable")) ||
		           (tokens.size() == 3 && tokencmp(tokens[1], "set"))) {
			state int64_t v; // the target epoch value
			if (tokens.size() == 3) {
				int n = 0;
				// Require the whole token to parse as a signed 64-bit integer.
				if (sscanf(tokens[2].toString().c_str(), "%" SCNd64 "%n", &v, &n) != 1 || n != tokens[2].size()) {
					printUsage(tokens[0]);
					return false;
				}
			} else {
				v = 0; // default version epoch
			}
			loop {
				try {
					tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
					Optional<int64_t> versionEpoch = wait(getVersionEpoch(tr));
					// Write only if unset, or if `set` was given a different value;
					// otherwise report that the epoch is already in effect.
					if (!versionEpoch.present() || (versionEpoch.get() != v && tokens.size() == 3)) {
						tr->set(versionEpochSpecialKey, BinaryWriter::toValue(v, Unversioned()));
						wait(safeThreadFutureToFuture(tr->commit()));
					} else {
						printf("Version epoch enabled. Run `versionepoch commit` to irreversibly jump to the target "
						       "version\n");
						return true;
					}
				} catch (Error& e) {
					wait(safeThreadFutureToFuture(tr->onError(e)));
				}
			}
		} else if (tokens.size() == 2 && tokencmp(tokens[1], "commit")) {
			// One-time jump of the cluster version to the expected version.
			Optional<VersionInfo> versionInfo = wait(getVersionInfo(db));
			if (versionInfo.present()) {
				wait(advanceVersion(cx, versionInfo.get().expectedVersion));
			} else {
				printf("Must set the version epoch before committing it (see `versionepoch enable`)\n");
			}
			return true;
		}
	}
	printUsage(tokens[0]);
	return false;
}
// Registers the `versionepoch` command with fdbcli. The usage line lists every
// subcommand handled by versionEpochCommandActor, including `get`.
CommandFactory versionEpochFactory(
    "versionepoch",
    CommandHelp("versionepoch [<enable|commit|get|set|disable> [EPOCH]]",
                "Read or write the version epoch",
                "If no arguments are specified, reports the offset between the expected version "
                "and the actual version. Otherwise, enables, disables, or commits the version epoch. "
                "Setting the version epoch can be irreversible since it can cause a large version jump. "
                "Thus, the version epoch must first be enabled with the enable or set command. This "
                "causes a recovery. Once the version epoch has been set, versions may be given out at "
                "a faster or slower rate to attempt to match the actual version to the expected version, "
                "based on the version epoch. After setting the version, run the commit command to perform "
                "a one time jump to the expected version. This is useful when there is a very large gap "
                "between the current version and the expected version. Note that once a version jump has "
                "occurred, it cannot be undone. Run this command without any arguments to see the current "
                "and expected version."));
} // namespace fdb_cli

View File

@ -1646,6 +1646,13 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "versionepoch")) {
bool _result = wait(makeInterruptable(versionEpochCommandActor(db, localDb, tokens)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "kill")) {
getTransaction(db, managementTenant, tr, options, intrans);
bool _result = wait(makeInterruptable(killCommandActor(db, tr, tokens, &address_interface)));

View File

@ -210,6 +210,10 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
ACTOR Future<bool> triggerddteaminfologCommandActor(Reference<IDatabase> db);
// tssq command
ACTOR Future<bool> tssqCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// versionepoch command
ACTOR Future<bool> versionEpochCommandActor(Reference<IDatabase> db, Database cx, std::vector<StringRef> tokens);
// targetversion command
ACTOR Future<bool> targetVersionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
} // namespace fdb_cli

View File

@ -44,7 +44,18 @@ struct BlobWorkerInterface {
BlobWorkerInterface() {}
explicit BlobWorkerInterface(const struct LocalityData& l, UID id) : locality(l), myId(id) {}
void initEndpoints() {}
void initEndpoints() {
	// TODO: specify endpoint priorities?
	// Register all request streams as one endpoint group. The insertion order
	// here is significant: serialize() reconstructs each stream from
	// waitFailure's endpoint via getAdjustedEndpoint(index), so the indices
	// there must match this ordering exactly.
	std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
	streams.push_back(waitFailure.getReceiver());
	streams.push_back(blobGranuleFileRequest.getReceiver());
	streams.push_back(assignBlobRangeRequest.getReceiver());
	streams.push_back(revokeBlobRangeRequest.getReceiver());
	streams.push_back(granuleAssignmentsRequest.getReceiver());
	streams.push_back(granuleStatusStreamRequest.getReceiver());
	streams.push_back(haltBlobWorker.getReceiver());
	FlowTransport::transport().addEndpoints(streams);
}
UID id() const { return myId; }
NetworkAddress address() const { return blobGranuleFileRequest.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return blobGranuleFileRequest.getEndpoint().getStableAddress(); }
@ -54,16 +65,22 @@ struct BlobWorkerInterface {
template <class Archive>
void serialize(Archive& ar) {
serializer(ar,
waitFailure,
blobGranuleFileRequest,
assignBlobRangeRequest,
revokeBlobRangeRequest,
granuleAssignmentsRequest,
granuleStatusStreamRequest,
haltBlobWorker,
locality,
myId);
// use adjusted endpoints
serializer(ar, myId, locality, waitFailure);
if (Archive::isDeserializing) {
blobGranuleFileRequest =
RequestStream<struct BlobGranuleFileRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(1));
assignBlobRangeRequest =
RequestStream<struct AssignBlobRangeRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(2));
revokeBlobRangeRequest =
RequestStream<struct RevokeBlobRangeRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(3));
granuleAssignmentsRequest =
RequestStream<struct GetGranuleAssignmentsRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(4));
granuleStatusStreamRequest =
RequestStream<struct GranuleStatusStreamRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(5));
haltBlobWorker =
RequestStream<struct HaltBlobWorkerRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(6));
}
}
};

View File

@ -372,6 +372,9 @@ public:
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
Future<Key> purgeBlobGranules(KeyRange keyRange, Version purgeVersion, bool force = false);
Future<Void> waitPurgeGranulesComplete(Key purgeKey);
// private:
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,
Reference<AsyncVar<ClientDBInfo>> clientDBInfo,

View File

@ -22,6 +22,7 @@
#define FDBCLIENT_FDBTYPES_H
#include <algorithm>
#include <cinttypes>
#include <set>
#include <string>
#include <vector>

View File

@ -65,6 +65,8 @@ enum class ConfigurationResult {
LOCKED_NOT_NEW,
SUCCESS_WARN_PPW_GRADUAL,
SUCCESS,
SUCCESS_WARN_ROCKSDB_EXPERIMENTAL,
DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL,
};
enum class CoordinatorsResult {
@ -290,6 +292,7 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
state bool oldReplicationUsesDcId = false;
state bool warnPPWGradual = false;
state bool warnChangeStorageNoMigrate = false;
state bool warnRocksDBIsExperimental = false;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -477,6 +480,9 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
} else if (newConfig.storageMigrationType == StorageMigrationType::GRADUAL &&
newConfig.perpetualStorageWiggleSpeed == 0) {
warnPPWGradual = true;
} else if (newConfig.storageServerStoreType != oldConfig.storageServerStoreType &&
newConfig.storageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) {
warnRocksDBIsExperimental = true;
}
}
}
@ -525,6 +531,9 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
Optional<Value> v = wait(safeThreadFutureToFuture(vF));
if (v != m[initIdKey.toString()])
return ConfigurationResult::DATABASE_ALREADY_CREATED;
else if (m[configKeysPrefix.toString() + "storage_engine"] ==
std::to_string(KeyValueStoreType::SSD_ROCKSDB_V1))
return ConfigurationResult::DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL;
else
return ConfigurationResult::DATABASE_CREATED;
} catch (Error& e2) {
@ -538,6 +547,8 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
if (warnPPWGradual) {
return ConfigurationResult::SUCCESS_WARN_PPW_GRADUAL;
} else if (warnRocksDBIsExperimental) {
return ConfigurationResult::SUCCESS_WARN_ROCKSDB_EXPERIMENTAL;
} else {
return ConfigurationResult::SUCCESS;
}

View File

@ -152,6 +152,11 @@ public:
// Management API, create snapshot
virtual ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) = 0;
// Purge blob granules API. purgeBlobGranules is asynchronous; calling waitPurgeGranulesComplete afterwards
// guarantees completion.
virtual ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0;
virtual ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0;
// Interface to manage shared state across multiple connections to the same Database
virtual ThreadFuture<DatabaseSharedState*> createSharedState() = 0;
virtual void setSharedState(DatabaseSharedState* p) = 0;

View File

@ -516,6 +516,38 @@ ThreadFuture<ProtocolVersion> DLDatabase::getServerProtocol(Optional<ProtocolVer
});
}
// Registers a purge of the given key range at purgeVersion via the loaded C client library
// and returns the purge key to pass to waitPurgeGranulesComplete().
// Fails with unsupported_operation when the loaded library predates this API.
ThreadFuture<Key> DLDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
    if (!api->purgeBlobGranules) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* future = api->purgeBlobGranules(db,
                                                        keyRange.begin.begin(),
                                                        keyRange.begin.size(),
                                                        keyRange.end.begin(),
                                                        keyRange.end.size(),
                                                        purgeVersion,
                                                        force);

    return toThreadFuture<Key>(api, future, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        const uint8_t* keyBytes = nullptr;
        int keyLen = 0;
        FdbCApi::fdb_error_t err = api->futureGetKey(f, &keyBytes, &keyLen);
        ASSERT(!err);
        // The returned bytes are owned by the FDBFuture and freed with it, so copy them
        // into a fresh Arena to keep the Key valid afterwards.
        return Key(KeyRef(keyBytes, keyLen), Arena());
    });
}
// Returns a future that resolves once the purge identified by purgeKey has completed.
// Fails with unsupported_operation when the loaded library predates this API.
ThreadFuture<Void> DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
    if (!api->waitPurgeGranulesComplete) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* future = api->waitPurgeGranulesComplete(db, purgeKey.begin(), purgeKey.size());
    // The future carries no payload; map success to Void.
    return toThreadFuture<Void>(api, future, [](FdbCApi::FDBFuture*, FdbCApi*) { return Void(); });
}
// DLApi
// Loads the specified function from a dynamic library
@ -590,6 +622,15 @@ void DLApi::init() {
loadClientFunction(
&api->databaseCreateSnapshot, lib, fdbCPath, "fdb_database_create_snapshot", headerVersion >= 700);
loadClientFunction(
&api->purgeBlobGranules, lib, fdbCPath, "fdb_database_purge_blob_granules", headerVersion >= 710);
loadClientFunction(&api->waitPurgeGranulesComplete,
lib,
fdbCPath,
"fdb_database_wait_purge_granules_complete",
headerVersion >= 710);
loadClientFunction(
&api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710);
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710);
@ -609,7 +650,7 @@ void DLApi::init() {
headerVersion >= 0);
loadClientFunction(&api->transactionGetRange, lib, fdbCPath, "fdb_transaction_get_range", headerVersion >= 0);
loadClientFunction(
&api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 700);
&api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 710);
loadClientFunction(
&api->transactionGetVersionstamp, lib, fdbCPath, "fdb_transaction_get_versionstamp", headerVersion >= 410);
loadClientFunction(&api->transactionSet, lib, fdbCPath, "fdb_transaction_set", headerVersion >= 0);
@ -667,7 +708,7 @@ void DLApi::init() {
loadClientFunction(
&api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0);
loadClientFunction(
&api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 700);
&api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 710);
loadClientFunction(&api->futureGetSharedState, lib, fdbCPath, "fdb_future_get_shared_state", headerVersion >= 710);
loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0);
loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0);
@ -1442,6 +1483,17 @@ double MultiVersionDatabase::getMainThreadBusyness() {
return localClientBusyness;
}
// Issues a blob granule purge through the currently selected client, if one is connected;
// otherwise the operation stays pending. The result is made abortable on a database state change.
ThreadFuture<Key> MultiVersionDatabase::purgeBlobGranules(const KeyRangeRef& keyRange,
                                                          Version purgeVersion,
                                                          bool force) {
    ThreadFuture<Key> purgeFuture;
    if (dbState->db) {
        purgeFuture = dbState->db->purgeBlobGranules(keyRange, purgeVersion, force);
    } else {
        purgeFuture = ThreadFuture<Key>(Never());
    }
    return abortableFuture(purgeFuture, dbState->dbVar->get().onChange);
}
// Waits for a previously issued purge through the currently selected client, if one is
// connected; otherwise stays pending. Made abortable on a database state change.
ThreadFuture<Void> MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
    ThreadFuture<Void> waitFuture;
    if (dbState->db) {
        waitFuture = dbState->db->waitPurgeGranulesComplete(purgeKey);
    } else {
        waitFuture = ThreadFuture<Void>(Never());
    }
    return abortableFuture(waitFuture, dbState->dbVar->get().onChange);
}
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
@ -1536,7 +1588,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion
.detail("OldProtocolVersion", dbProtocolVersion);
// When the protocol version changes, clear the corresponding entry in the shared state map
// so it can be re-initialized. Only do so if there was a valid previous protocol version.
if (dbProtocolVersion.present()) {
if (dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) {
MultiVersionApi::api->clearClusterSharedStateMapEntry(clusterFilePath);
}
@ -2333,9 +2385,14 @@ ThreadFuture<Void> MultiVersionApi::updateClusterSharedStateMap(std::string clus
// Removes the shared-state entry for the given cluster file path, releasing this client's
// reference on the DatabaseSharedState.
//
// Fix: the merge left both the old and new lookup in place — the old unconditional
// clusterSharedStateMap[clusterFilePath].get() (which default-inserts on a missing key and
// redeclares ssPtr) and the old erase-by-key. Keep only the find-based version, which logs
// an error and returns instead of crashing when the entry is absent.
void MultiVersionApi::clearClusterSharedStateMapEntry(std::string clusterFilePath) {
    MutexHolder holder(lock);
    auto mapEntry = clusterSharedStateMap.find(clusterFilePath);
    // The entry may already have been cleared (e.g. by a concurrent protocol version change).
    if (mapEntry == clusterSharedStateMap.end()) {
        TraceEvent(SevError, "ClusterSharedStateMapEntryNotFound").detail("ClusterFilePath", clusterFilePath);
        return;
    }
    auto ssPtr = mapEntry->second.get();
    ssPtr->delRef(ssPtr);
    clusterSharedStateMap.erase(mapEntry);
}
std::vector<std::string> parseOptionValues(std::string valueStr) {

View File

@ -156,6 +156,16 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
double (*databaseGetMainThreadBusyness)(FDBDatabase* database);
FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion);
FDBFuture* (*purgeBlobGranules)(FDBDatabase* db,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t purge_version,
fdb_bool_t force);
FDBFuture* (*waitPurgeGranulesComplete)(FDBDatabase* db, uint8_t const* purge_key_name, int purge_key_name_length);
// Tenant
fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);
void (*tenantDestroy)(FDBTenant* tenant);
@ -438,6 +448,9 @@ public:
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
void setSharedState(DatabaseSharedState* p) override;
@ -716,6 +729,9 @@ public:
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
void setSharedState(DatabaseSharedState* p) override;

View File

@ -1519,6 +1519,12 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
std::make_unique<AdvanceVersionImpl>(
singleKeyRange(LiteralStringRef("min_required_commit_version"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<VersionEpochImpl>(
singleKeyRange(LiteralStringRef("version_epoch"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
@ -9178,3 +9184,86 @@ Future<Void> DatabaseContext::popChangeFeedMutations(Key rangeID, Version versio
Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
}
// Registers a purge request for `range` at `purgeVersion` in the system keyspace and returns
// the versionstamped purge key. The purge itself happens asynchronously; callers can pass the
// returned key to waitPurgeGranulesCompleteActor to wait for completion.
// NOTE(review): only force==true is implemented — non-forced purge throws unsupported_operation.
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
                                         KeyRange range,
                                         Version purgeVersion,
                                         bool force) {
    state Database cx(db);
    state Transaction tr(cx);
    state Key purgeKey;

    // FIXME: implement force
    if (!force) {
        throw unsupported_operation();
    }

    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            // Record the purge intent under a versionstamped key (so concurrent purge requests
            // get distinct keys), then bump the change key to signal the new request.
            Value purgeValue = blobGranulePurgeValueFor(purgeVersion, range, force);
            tr.atomicOp(
                addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey);
            tr.set(blobGranulePurgeChangeKey, deterministicRandom()->randomUniqueID().toString());
            // The versionstamp future must be obtained before commit; it resolves only after
            // the commit succeeds, yielding the stamp used in the written key.
            state Future<Standalone<StringRef>> fTrVs = tr.getVersionstamp();
            wait(tr.commit());
            Standalone<StringRef> vs = wait(fTrVs);

            // Reconstruct the exact key the atomic op produced: prefix + versionstamp.
            purgeKey = blobGranulePurgeKeys.begin.withSuffix(vs);
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} registered {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           purgeVersion,
                           purgeKey.printable());
            }
            break;
        } catch (Error& e) {
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} encountered error {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           purgeVersion,
                           e.name());
            }
            // Standard retry loop: onError backs off / rethrows non-retryable errors.
            wait(tr.onError(e));
        }
    }
    return purgeKey;
}
// Public entry point for blob granule purges; see purgeBlobGranulesActor for the implementation.
Future<Key> DatabaseContext::purgeBlobGranules(KeyRange range, Version purgeVersion, bool force) {
    Reference<DatabaseContext> self = Reference<DatabaseContext>::addRef(this);
    return purgeBlobGranulesActor(self, range, purgeVersion, force);
}
// Waits until the purge registered under `purgeKey` (see purgeBlobGranulesActor) has
// completed, which is signaled by the purge key being cleared from the system keyspace.
// Uses a watch on the key to avoid busy polling.
ACTOR Future<Void> waitPurgeGranulesCompleteActor(Reference<DatabaseContext> db, Key purgeKey) {
    state Database cx(db);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);

    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

            Optional<Value> purgeVal = wait(tr->get(purgeKey));
            // Key absent => the purge request has been processed and removed.
            if (!purgeVal.present()) {
                if (BG_REQUEST_DEBUG) {
                    fmt::print("purgeBlobGranules for {0} succeeded\n", purgeKey.printable());
                }
                return Void();
            }
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for {0} watching\n", purgeKey.printable());
            }
            // Register the watch, then commit to arm it; only wait on it after a successful
            // commit. The watch fires when the key's value changes (including being cleared).
            state Future<Void> watchFuture = tr->watch(purgeKey);
            wait(tr->commit());
            wait(watchFuture);
            // Start a fresh transaction for the re-check of the key.
            tr->reset();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}
// Public entry point for waiting on a purge; see waitPurgeGranulesCompleteActor.
Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
    Reference<DatabaseContext> self = Reference<DatabaseContext>::addRef(this);
    return waitPurgeGranulesCompleteActor(self, purgeKey);
}

View File

@ -36,6 +36,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
// TLogs
init( TLOG_TIMEOUT, 0.4 ); //cannot buggify because of availability
@ -109,6 +111,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
init( SNAP_CREATE_MAX_TIMEOUT, 300.0 );
init( MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE, 1 );
init( MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE, 1 );
// Data distribution queue
init( HEALTH_POLL_TIME, 1.0 );
@ -529,6 +533,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( CC_HEALTH_TRIGGER_FAILOVER, false );
init( CC_FAILOVER_DUE_TO_HEALTH_MIN_DEGRADATION, 5 );
init( CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION, 10 );
init( CC_ENABLE_ENTIRE_SATELLITE_MONITORING, false );
init( CC_SATELLITE_DEGRADATION_MIN_COMPLAINER, 3 );
init( CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER, 3 );
init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;
init( EXPECTED_MASTER_FITNESS, ProcessClass::UnsetFit );
@ -719,6 +726,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( PEER_LATENCY_CHECK_MIN_POPULATION, 30 );
init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.90 );
init( PEER_LATENCY_DEGRADATION_THRESHOLD, 0.05 );
init( PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE, 0.90 );
init( PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE, 0.1 );
init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 );
init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 1 );
init( WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER, true );
@ -827,6 +836,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_METRICS_INTERVAL, 5.0 );
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
// Server request latency measurement
init( LATENCY_SAMPLE_SIZE, 100000 );

View File

@ -39,6 +39,8 @@ public:
int64_t MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
double MAX_COMMIT_BATCH_INTERVAL; // Each commit proxy generates a CommitTransactionBatchRequest at least this
// often, so that versions always advance smoothly
double MAX_VERSION_RATE_MODIFIER;
int64_t MAX_VERSION_RATE_OFFSET;
// TLogs
bool PEEK_USING_STREAMING;
@ -466,6 +468,14 @@ public:
// failover.
int CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION; // The maximum number of degraded servers that can trigger a
// failover.
bool CC_ENABLE_ENTIRE_SATELLITE_MONITORING; // When enabled, gray failure tries to detect whether the entire
// satellite DC is degraded.
int CC_SATELLITE_DEGRADATION_MIN_COMPLAINER; // When the network between primary and satellite becomes bad, all the
// workers in primary may have bad network talking to the satellite.
// This is the minimum amount of complainer for a satellite worker to
// be determined as degraded worker.
int CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER; // The minimum amount of degraded server in satellite DC to be
// determined as degraded satellite.
// Knobs used to select the best policy (via monte carlo)
int POLICY_RATING_TESTS; // number of tests per policy (in order to compare)
@ -575,6 +585,12 @@ public:
// disk snapshot
int64_t MAX_FORKED_PROCESS_OUTPUT;
double SNAP_CREATE_MAX_TIMEOUT;
// Maximum number of storage servers a snapshot can fail to
// capture while still succeeding
int64_t MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE;
// Maximum number of coordinators a snapshot can fail to
// capture while still succeeding
int64_t MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE;
// Storage Metrics
double STORAGE_METRICS_AVERAGE_INTERVAL;
@ -659,8 +675,12 @@ public:
bool ENABLE_WORKER_HEALTH_MONITOR;
double WORKER_HEALTH_MONITOR_INTERVAL; // Interval between two health monitor health check.
int PEER_LATENCY_CHECK_MIN_POPULATION; // The minimum number of latency samples required to check a peer.
double PEER_LATENCY_DEGRADATION_PERCENTILE; // The percentile latency used to check peer health.
double PEER_LATENCY_DEGRADATION_PERCENTILE; // The percentile latency used to check peer health among workers inside
// primary or remote DC.
double PEER_LATENCY_DEGRADATION_THRESHOLD; // The latency threshold to consider a peer degraded.
double PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE; // The percentile latency used to check peer health between
// primary and primary satellite.
double PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE; // The latency threshold to consider a peer degraded.
double PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD; // The percentage of timeout to consider a peer degraded.
int PEER_DEGRADATION_CONNECTION_FAILURE_COUNT; // The number of connection failures experienced during measurement
// period to consider a peer degraded.
@ -784,6 +804,7 @@ public:
double REDWOOD_METRICS_INTERVAL;
double REDWOOD_HISTOGRAM_INTERVAL;
bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache.
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;

View File

@ -106,6 +106,8 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
{ "advanceversion",
singleKeyRange(LiteralStringRef("min_required_commit_version"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "versionepoch",
singleKeyRange(LiteralStringRef("version_epoch")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "profile",
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
@ -1909,6 +1911,42 @@ Future<Optional<std::string>> AdvanceVersionImpl::commit(ReadYourWritesTransacti
return Optional<std::string>();
}
// Reads \xff/versionEpoch and, when it is set, renders its int64 value as a decimal string
// under the special key kr.begin. Returns an empty RangeResult when no epoch is set.
ACTOR static Future<RangeResult> getVersionEpochActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
    ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
    ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS);
    Optional<Value> val = wait(ryw->getTransaction().get(versionEpochKey));
    RangeResult result;
    if (val.present()) {
        // Stored as a raw (Unversioned) int64; expose it as human-readable decimal text.
        int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(val.get(), Unversioned());
        ValueRef version(result.arena(), boost::lexical_cast<std::string>(versionEpoch));
        result.push_back_deep(result.arena(), KeyValueRef(kr.begin, version));
    }
    return result;
}
// kr is the single-key "version_epoch" range this module is registered for in the
// management special key space.
VersionEpochImpl::VersionEpochImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Serves reads of the version_epoch special key by delegating to getVersionEpochActor.
Future<RangeResult> VersionEpochImpl::getRange(ReadYourWritesTransaction* ryw,
                                               KeyRangeRef kr,
                                               GetRangeLimits limitsHint) const {
    // This impl covers exactly one key, so the requested range must equal the registered range.
    ASSERT(kr == getKeyRange());
    return getVersionEpochActor(ryw, kr);
}
// Applies a buffered write to the version_epoch special key: a present value sets
// \xff/versionEpoch to the given int64 (Unversioned encoding); a clear removes it.
// Returns no error message (commit proceeds normally).
Future<Optional<std::string>> VersionEpochImpl::commit(ReadYourWritesTransaction* ryw) {
    auto versionEpoch =
        ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("versionepoch")].second;
    if (versionEpoch.present()) {
        // Decode then re-encode to store a canonical Unversioned int64 value.
        int64_t epoch = BinaryReader::fromStringRef<int64_t>(versionEpoch.get(), Unversioned());
        ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
        ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS);
        ryw->getTransaction().set(versionEpochKey, BinaryWriter::toValue(epoch, Unversioned()));
    } else {
        // An explicit clear of the special key clears the underlying system key.
        ryw->getTransaction().clear(versionEpochKey);
    }
    return Optional<std::string>();
}
ClientProfilingImpl::ClientProfilingImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
ACTOR static Future<RangeResult> ClientProfilingGetRangeActor(ReadYourWritesTransaction* ryw,

View File

@ -476,6 +476,15 @@ public:
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
// Read/write special-key-space module backing the management key "version_epoch"
// (stored at \xff/versionEpoch): reads render the stored epoch, commits set or clear it.
class VersionEpochImpl : public SpecialKeyRangeRWImpl {
public:
    explicit VersionEpochImpl(KeyRangeRef kr);
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override;
    Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class ClientProfilingImpl : public SpecialKeyRangeRWImpl {
public:
explicit ClientProfilingImpl(KeyRangeRef kr);

View File

@ -838,6 +838,7 @@ std::vector<std::pair<UID, Version>> decodeBackupStartedValue(const ValueRef& va
const KeyRef coordinatorsKey = LiteralStringRef("\xff/coordinators");
const KeyRef logsKey = LiteralStringRef("\xff/logs");
const KeyRef minRequiredCommitVersionKey = LiteralStringRef("\xff/minRequiredCommitVersion");
const KeyRef versionEpochKey = LiteralStringRef("\xff/versionEpoch");
const KeyRef globalKeysPrefix = LiteralStringRef("\xff/globals");
const KeyRef lastEpochEndKey = LiteralStringRef("\xff/globals/lastEpochEnd");
@ -1170,9 +1171,9 @@ const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), Lite
const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0"));
const KeyRangeRef blobGranuleSplitKeys(LiteralStringRef("\xff\x02/bgs/"), LiteralStringRef("\xff\x02/bgs0"));
const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0"));
const KeyRangeRef blobGranulePruneKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0"));
const KeyRef blobGranulePruneChangeKey = LiteralStringRef("\xff\x02/bgpChange");
const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange");
const uint8_t BG_FILE_TYPE_DELTA = 'D';
const uint8_t BG_FILE_TYPE_SNAPSHOT = 'S';
@ -1229,7 +1230,7 @@ std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFi
return std::tuple(filename, offset, length, fullFileLength);
}
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force) {
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << version;
wr << range;
@ -1237,7 +1238,7 @@ const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force
return wr.toValue();
}
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value) {
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value) {
Version version;
KeyRange range;
bool force;

View File

@ -350,6 +350,11 @@ extern const KeyRef logsKey;
// Used during backup/recovery to restrict version requirements
extern const KeyRef minRequiredCommitVersionKey;
// "\xff/versionEpochKey" = "[[uint64_t]]"
// Defines the base epoch representing version 0. The value itself is the
// number of microseconds since the Unix epoch.
extern const KeyRef versionEpochKey;
const Value logsValue(const std::vector<std::pair<UID, NetworkAddress>>& logs,
const std::vector<std::pair<UID, NetworkAddress>>& oldLogs);
std::pair<std::vector<std::pair<UID, NetworkAddress>>, std::vector<std::pair<UID, NetworkAddress>>> decodeLogsValue(
@ -567,9 +572,9 @@ extern const KeyRangeRef blobGranuleSplitKeys;
extern const KeyRangeRef blobGranuleHistoryKeys;
// \xff\x02/bgp/(start,end) = (version, force)
extern const KeyRangeRef blobGranulePruneKeys;
extern const KeyRangeRef blobGranulePurgeKeys;
extern const KeyRangeRef blobGranuleVersionKeys;
extern const KeyRef blobGranulePruneChangeKey;
extern const KeyRef blobGranulePurgeChangeKey;
const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType);
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key);
@ -578,8 +583,8 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID);
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength);
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force);
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value);
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force);
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value);
const Value blobGranuleMappingValueFor(UID const& workerID);
UID decodeBlobGranuleMappingValue(ValueRef const& value);

View File

@ -127,6 +127,20 @@ ThreadFuture<ProtocolVersion> ThreadSafeDatabase::getServerProtocol(Optional<Pro
[db, expectedVersion]() -> Future<ProtocolVersion> { return db->getClusterProtocol(expectedVersion); });
}
// Dispatches a blob granule purge to the network thread and returns the resulting purge key.
ThreadFuture<Key> ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
    DatabaseContext* dbPtr = this->db;
    // Take an owning copy of the range: the caller's KeyRangeRef may not outlive the
    // deferred execution on the main thread.
    KeyRange ownedRange = keyRange;
    return onMainThread([dbPtr, ownedRange, purgeVersion, force]() -> Future<Key> {
        return dbPtr->purgeBlobGranules(ownedRange, purgeVersion, force);
    });
}
// Dispatches a purge-completion wait to the network thread.
ThreadFuture<Void> ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
    DatabaseContext* dbPtr = this->db;
    // Owning copy: the caller's KeyRef may be freed before the lambda runs on the main thread.
    Key ownedKey = purgeKey;
    return onMainThread([dbPtr, ownedKey]() -> Future<Void> { return dbPtr->waitPurgeGranulesComplete(ownedKey); });
}
ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) {
ClusterConnectionFile* connFile =
new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFilename).first);

View File

@ -59,6 +59,9 @@ public:
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
void setSharedState(DatabaseSharedState* p) override;

View File

@ -35,11 +35,10 @@ void HealthMonitor::purgeOutdatedHistory() {
auto& count = peerClosedNum[p.second];
--count;
ASSERT(count >= 0);
peerClosedHistory.pop_front();
if (count == 0) {
peerClosedNum.erase(p.second);
}
peerClosedHistory.pop_front();
} else {
break;
}

View File

@ -326,6 +326,7 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
AcknowledgementReceiver acknowledgements;
Endpoint requestStreamEndpoint;
bool sentError = false;
bool notifiedFailed = false;
Promise<Void> onConnect;
NetNotifiedQueueWithAcknowledgements(int futures, int promises)
@ -402,14 +403,20 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
return res;
}
~NetNotifiedQueueWithAcknowledgements() {
if (acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() && !this->hasError()) {
// Best-effort notification to the server's acknowledgement endpoint that this client no
// longer consumes the stream, sent at most once (guarded by notifiedFailed). Skipped when
// the endpoint was never set up, is local, or the queue already carries an error.
void notifyFailed() {
    if (!notifiedFailed && acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() &&
        !this->hasError()) {
        // Notify the server that a client is not using this ReplyPromiseStream anymore
        FlowTransport::transport().sendUnreliable(
            SerializeSource<ErrorOr<AcknowledgementReply>>(operation_obsolete()),
            acknowledgements.getEndpoint(TaskPriority::ReadSocket),
            false);
        notifiedFailed = true;
    }
}
~NetNotifiedQueueWithAcknowledgements() {
notifyFailed();
if (isRemoteEndpoint() && !sentError && !acknowledgements.failures.isReady()) {
// Notify the client ReplyPromiseStream was cancelled before sending an error, so the storage server must
// have died
@ -505,6 +512,8 @@ public:
return queue->onConnect.getFuture();
}
void notifyFailed() { queue->notifyFailed(); }
~ReplyPromiseStream() {
if (queue)
queue->delPromiseRef();

View File

@ -321,6 +321,8 @@ void endStreamOnDisconnect(Future<Void> signal,
wait(signal || stream.onConnected());
}
}
// Notify BEFORE dropping last reference, causing broken_promise to send on stream before destructor is called
stream.notifyFailed();
}
}

View File

@ -590,6 +590,18 @@ private:
TEST(true); // Recovering at a higher version.
}
// Handles a set of \xff/versionEpoch: mirrors the mutation into txnStateStore (skipped
// during the initial commit, which is already loading the store) and flags a configuration
// change (confChange) unconditionally.
void checkSetVersionEpochKey(MutationRef m) {
    if (m.param1 != versionEpochKey) {
        return;
    }
    // Decoded only for tracing; the raw value is stored as-is.
    int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(m.param2, Unversioned());
    TraceEvent("VersionEpoch", dbgid).detail("Epoch", versionEpoch);
    if (!initialCommit)
        txnStateStore->set(KeyValueRef(m.param1, m.param2));
    confChange = true;
    TEST(true); // Setting version epoch
}
void checkSetWriteRecoverKey(MutationRef m) {
if (m.param1 != writeRecoveryKey) {
return;
@ -957,6 +969,16 @@ private:
}
}
// Handles clear ranges that cover \xff/versionEpoch: removes the key from txnStateStore
// (skipped during the initial commit) and flags a configuration change (confChange).
void checkClearVersionEpochKeys(MutationRef m, KeyRangeRef range) {
    if (!range.contains(versionEpochKey)) {
        return;
    }
    if (!initialCommit)
        txnStateStore->clear(singleKeyRange(versionEpochKey));
    TraceEvent("MutationRequiresRestart", dbgid).detail("M", m);
    confChange = true;
}
void checkClearTenantMapPrefix(KeyRangeRef range) {
if (tenantMapKeys.intersects(range)) {
if (tenantMap) {
@ -1119,6 +1141,7 @@ public:
checkSetGlobalKeys(m);
checkSetWriteRecoverKey(m);
checkSetMinRequiredCommitVersionKey(m);
checkSetVersionEpochKey(m);
checkSetTenantMapPrefix(m);
checkSetOtherKeys(m);
} else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) {
@ -1135,6 +1158,7 @@ public:
checkClearLogRangesRange(range);
checkClearTssMappingKeys(m, range);
checkClearTssQuarantineKeys(m, range);
checkClearVersionEpochKeys(m, range);
checkClearTenantMapPrefix(range);
checkClearMiscRangeKeys(range);
}

View File

@ -216,7 +216,7 @@ struct SplitEvaluation {
struct BlobManagerStats {
CounterCollection cc;
// FIXME: purging stats
Counter granuleSplits;
Counter granuleWriteHotSplits;
@ -226,6 +226,10 @@ struct BlobManagerStats {
Counter ccMismatches;
Counter ccTimeouts;
Counter ccErrors;
Counter purgesProcessed;
Counter granulesFullyPurged;
Counter granulesPartiallyPurged;
Counter filesPurged;
Future<Void> logger;
// Current stats maintained for a given blob worker process
@ -233,7 +237,9 @@ struct BlobManagerStats {
: cc("BlobManagerStats", id.toString()), granuleSplits("GranuleSplits", cc),
granuleWriteHotSplits("GranuleWriteHotSplits", cc), ccGranulesChecked("CCGranulesChecked", cc),
ccRowsChecked("CCRowsChecked", cc), ccBytesChecked("CCBytesChecked", cc), ccMismatches("CCMismatches", cc),
ccTimeouts("CCTimeouts", cc), ccErrors("CCErrors", cc) {
ccTimeouts("CCTimeouts", cc), ccErrors("CCErrors", cc), purgesProcessed("PurgesProcessed", cc),
granulesFullyPurged("GranulesFullyPurged", cc), granulesPartiallyPurged("GranulesPartiallyPurged", cc),
filesPurged("FilesPurged", cc) {
specialCounter(cc, "WorkerCount", [workers]() { return workers->size(); });
logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
}
@ -438,6 +444,7 @@ ACTOR Future<UID> pickWorkerForAssign(Reference<BlobManagerData> bmData) {
ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
RangeAssignment assignment,
Optional<UID> workerID,
int64_t epoch,
int64_t seqNo) {
// WorkerId is set, except in case of assigning to any worker. Then we pick the worker to assign to in here
@ -468,7 +475,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
assignment.isAssign ? "assigning" : "revoking",
assignment.keyRange.begin.printable(),
assignment.keyRange.end.printable(),
bmData->epoch,
epoch,
seqNo,
workerID.get().toString());
}
@ -481,7 +488,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
AssignBlobRangeRequest req;
req.keyRange = KeyRangeRef(StringRef(req.arena, assignment.keyRange.begin),
StringRef(req.arena, assignment.keyRange.end));
req.managerEpoch = bmData->epoch;
req.managerEpoch = epoch;
req.managerSeqno = seqNo;
req.type = assignment.assign.get().type;
@ -497,7 +504,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
RevokeBlobRangeRequest req;
req.keyRange = KeyRangeRef(StringRef(req.arena, assignment.keyRange.begin),
StringRef(req.arena, assignment.keyRange.end));
req.managerEpoch = bmData->epoch;
req.managerEpoch = epoch;
req.managerSeqno = seqNo;
req.dispose = assignment.revoke.get().dispose;
@ -637,10 +644,10 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
}
count++;
}
ASSERT(count == 1);
if (skip) {
continue;
}
ASSERT(count == 1);
if (assignment.worker.present() && assignment.worker.get().isValid()) {
if (BM_DEBUG) {
@ -653,7 +660,7 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
bmData->workerAssignments.insert(assignment.keyRange, workerId);
bmData->assignsInProgress.insert(assignment.keyRange,
doRangeAssignment(bmData, assignment, workerId, seqNo));
doRangeAssignment(bmData, assignment, workerId, bmData->epoch, seqNo));
// If we know about the worker and this is not a continue, then this is a new range for the worker
if (bmData->workerStats.count(workerId) &&
assignment.assign.get().type != AssignRequestType::Continue) {
@ -662,8 +669,8 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
} else {
// Ensure the key boundaries are updated before we pick a worker
bmData->workerAssignments.insert(assignment.keyRange, UID());
bmData->assignsInProgress.insert(assignment.keyRange,
doRangeAssignment(bmData, assignment, Optional<UID>(), seqNo));
bmData->assignsInProgress.insert(
assignment.keyRange, doRangeAssignment(bmData, assignment, Optional<UID>(), bmData->epoch, seqNo));
}
} else {
@ -677,7 +684,8 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
if (existingRange.range() == assignment.keyRange && existingRange.cvalue() == assignment.worker.get()) {
bmData->workerAssignments.insert(assignment.keyRange, UID());
}
bmData->addActor.send(doRangeAssignment(bmData, assignment, assignment.worker.get(), seqNo));
bmData->addActor.send(
doRangeAssignment(bmData, assignment, assignment.worker.get(), bmData->epoch, seqNo));
} else {
auto currentAssignments = bmData->workerAssignments.intersectingRanges(assignment.keyRange);
for (auto& it : currentAssignments) {
@ -693,7 +701,7 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
}
// revoke the range for the worker that owns it, not the worker specified in the revoke
bmData->addActor.send(doRangeAssignment(bmData, assignment, it.value(), seqNo));
bmData->addActor.send(doRangeAssignment(bmData, assignment, it.value(), bmData->epoch, seqNo));
}
bmData->workerAssignments.insert(assignment.keyRange, UID());
}
@ -1356,26 +1364,6 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
// back is to split the range.
ASSERT(rep.doSplit);
// only evaluate for split if this worker currently owns the granule in this blob manager's mapping
auto currGranuleAssignment = bmData->workerAssignments.rangeContaining(rep.granuleRange.begin);
if (!(currGranuleAssignment.begin() == rep.granuleRange.begin &&
currGranuleAssignment.end() == rep.granuleRange.end &&
currGranuleAssignment.cvalue() == bwInterf.id())) {
if (BM_DEBUG) {
fmt::print("Manager {0} ignoring status from BW {1} for granule [{2} - {3}) since BW {4} owns "
"[{5} - {6}).\n",
bmData->epoch,
bwInterf.id().toString().substr(0, 5),
rep.granuleRange.begin.printable(),
rep.granuleRange.end.printable(),
currGranuleAssignment.cvalue().toString().substr(0, 5),
currGranuleAssignment.begin().printable(),
currGranuleAssignment.end().printable());
}
// FIXME: could send revoke request
continue;
}
// FIXME: We will need to go over all splits in the range once we're doing merges, instead of first one
auto lastSplitEval = bmData->splitEvaluations.rangeContaining(rep.granuleRange.begin);
if (rep.granuleRange.begin == lastSplitEval.begin() && rep.granuleRange.end == lastSplitEval.end() &&
@ -1386,46 +1374,67 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
rep.granuleRange.begin.printable(),
rep.granuleRange.end.printable());
}
} else {
ASSERT(lastSplitEval.cvalue().epoch < rep.epoch ||
(lastSplitEval.cvalue().epoch == rep.epoch && lastSplitEval.cvalue().seqno < rep.seqno));
if (lastSplitEval.cvalue().inProgress.isValid() && !lastSplitEval.cvalue().inProgress.isReady()) {
TEST(true); // racing BM splits
// For example, one worker asked BM to split, then died, granule was moved, new worker asks to
// split on recovery. We need to ensure that they are semantically the same split.
// We will just rely on the in-progress split to finish
if (BM_DEBUG) {
fmt::print("Manager {0} got split request for [{1} - {2}) @ ({3}, {4}), but already in "
"progress from [{5} - {6}) @ ({7}, {8})\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno,
lastSplitEval.begin().printable().c_str(),
lastSplitEval.end().printable().c_str(),
lastSplitEval.cvalue().epoch,
lastSplitEval.cvalue().seqno);
}
// ignore the request, they will retry
} else {
if (BM_DEBUG) {
fmt::print("Manager {0} evaluating [{1} - {2}) @ ({3}, {4}) for split\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno);
}
Future<Void> doSplitEval = maybeSplitRange(bmData,
bwInterf.id(),
rep.granuleRange,
rep.granuleID,
rep.startVersion,
rep.writeHotSplit);
bmData->splitEvaluations.insert(rep.granuleRange,
SplitEvaluation(rep.epoch, rep.seqno, doSplitEval));
} else if (!(lastSplitEval.cvalue().epoch < rep.epoch ||
(lastSplitEval.cvalue().epoch == rep.epoch && lastSplitEval.cvalue().seqno < rep.seqno))) {
TEST(true); // BM got out-of-date split request
if (BM_DEBUG) {
fmt::print(
"Manager {0} ignoring status from BW {1} for granule [{2} - {3}) since it already processed"
"[{4} - {5}) @ ({6}, {7}).\n",
bmData->epoch,
bwInterf.id().toString().substr(0, 5),
rep.granuleRange.begin.printable(),
rep.granuleRange.end.printable(),
lastSplitEval.begin().printable(),
lastSplitEval.end().printable(),
lastSplitEval.cvalue().epoch,
lastSplitEval.cvalue().seqno);
}
// revoke range from out-of-date worker, but bypass rangeAssigner and hack (epoch, seqno) to be
// (requesting epoch, requesting seqno + 1) to ensure no race with then reassigning the range to the
// worker at a later version
RangeAssignment revokeOld;
revokeOld.isAssign = false;
revokeOld.worker = bwInterf.id();
revokeOld.keyRange = rep.granuleRange;
revokeOld.revoke = RangeRevokeData(false);
bmData->addActor.send(
doRangeAssignment(bmData, revokeOld, bwInterf.id(), rep.epoch, rep.seqno + 1));
} else if (lastSplitEval.cvalue().inProgress.isValid() &&
!lastSplitEval.cvalue().inProgress.isReady()) {
TEST(true); // racing BM splits
// For example, one worker asked BM to split, then died, granule was moved, new worker asks to
// split on recovery. We need to ensure that they are semantically the same split.
// We will just rely on the in-progress split to finish
if (BM_DEBUG) {
fmt::print("Manager {0} got split request for [{1} - {2}) @ ({3}, {4}), but already in "
"progress from [{5} - {6}) @ ({7}, {8})\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno,
lastSplitEval.begin().printable().c_str(),
lastSplitEval.end().printable().c_str(),
lastSplitEval.cvalue().epoch,
lastSplitEval.cvalue().seqno);
}
// ignore the request, they will retry
} else {
if (BM_DEBUG) {
fmt::print("Manager {0} evaluating [{1} - {2}) @ ({3}, {4}) for split\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno);
}
Future<Void> doSplitEval = maybeSplitRange(
bmData, bwInterf.id(), rep.granuleRange, rep.granuleID, rep.startVersion, rep.writeHotSplit);
bmData->splitEvaluations.insert(rep.granuleRange,
SplitEvaluation(rep.epoch, rep.seqno, doSplitEval));
}
}
} catch (Error& e) {
@ -2160,23 +2169,84 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Reference<BlobManagerData> bmData, U
}
}
// FIXME: trace events for pruning
// FIXME: trace events for purging
// Blocks until it is safe to fully delete granule `granuleId`'s files. While a granule
// is splitting, its last snapshot + deltas are still needed by the child granules to
// produce their own first snapshot (see the comment in fullyDeleteGranule), so this
// actor polls the granule's split-state entries until every child has persisted one.
// NOTE(review): assumes split-state keys for this granule enumerate its children —
// consistent with the decode below, but confirm against the key-space definition.
ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId) {
    state Transaction tr(self->db);
    // Key range holding the split-state entries (parent -> child) for this granule.
    state KeyRange splitRange = blobGranuleSplitKeyRangeFor(granuleId);

    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            // Read one page of split-state entries (bounded by the max split fanout).
            state RangeResult splitState = wait(tr.getRange(splitRange, SERVER_KNOBS->BG_MAX_SPLIT_FANOUT));
            state int i = 0;
            state bool retry = false;
            for (; i < splitState.size(); i++) {
                UID parent, child;
                BlobGranuleSplitState st;
                Version v;
                std::tie(parent, child) = decodeBlobGranuleSplitKey(splitState[i].key);
                std::tie(st, v) = decodeBlobGranuleSplitValue(splitState[i].value);
                // if split state is done, this granule has definitely persisted a snapshot
                if (st >= BlobGranuleSplitState::Done) {
                    continue;
                }
                // if split state isn't even assigned, this granule has definitely not persisted a snapshot
                if (st <= BlobGranuleSplitState::Initialized) {
                    retry = true;
                    break;
                }

                ASSERT(st == BlobGranuleSplitState::Assigned);
                // if assigned, granule may or may not have snapshotted. Check files to confirm. Since a re-snapshot is
                // the first file written for a new granule, any files present mean it has re-snapshotted from this
                // granule
                KeyRange granuleFileRange = blobGranuleFileKeyRangeFor(child);
                RangeResult files = wait(tr.getRange(granuleFileRange, 1));
                if (files.empty()) {
                    retry = true;
                    break;
                }
            }
            if (retry) {
                // Some child hasn't snapshotted yet: back off and re-check with a fresh
                // transaction (reset so the retry reads at a newer version).
                tr.reset();
                wait(delay(1.0));
            } else {
                // No blocker found in this page; if there are no further pages, deletion is safe.
                if (splitState.empty() || !splitState.more) {
                    break;
                }
                // Otherwise continue the scan from just past the last key of this page.
                splitRange = KeyRangeRef(keyAfter(splitState.back().key), splitRange.end);
            }
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
    return Void();
}
/*
* Deletes all files pertaining to the granule with id granuleId and
* also removes the history entry for this granule from the system keyspace
* TODO: ensure cannot fully delete granule that is still splitting!
*/
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Key historyKey) {
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Key historyKey,
Version purgeVersion) {
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
}
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
// delete the granule, since we need to keep the last snapshot and deltas for splitting
wait(canDeleteFullGranule(self, granuleId));
// get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
std::vector<Future<Void>> deletions;
std::vector<std::string> filesToDelete; // TODO: remove, just for debugging
state std::vector<std::string> filesToDelete; // TODO: remove, just for debugging
for (auto snapshotFile : files.snapshotFiles) {
std::string fname = snapshotFile.filename;
@ -2191,7 +2261,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granu
}
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), deletions.size());
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {}\n", filename.c_str());
}
@ -2228,18 +2298,27 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granu
fmt::print("Fully deleting granule {0}: success\n", granuleId.toString());
}
TraceEvent("GranuleFullPurge", self->id)
.detail("Epoch", self->epoch)
.detail("GranuleID", granuleId)
.detail("PurgeVersion", purgeVersion)
.detail("FilesPurged", filesToDelete.size());
++self->stats.granulesFullyPurged;
self->stats.filesPurged += filesToDelete.size();
return Void();
}
/*
* For the granule with id granuleId, finds the first snapshot file at a
* version <= pruneVersion and deletes all files older than it.
* version <= purgeVersion and deletes all files older than it.
*
* Assumption: this granule's startVersion might change because the first snapshot
* file might be deleted. We will need to ensure we don't rely on the granule's startVersion
* (that's persisted as part of the key), but rather use the granule's first snapshot's version when needed
*/
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version pruneVersion) {
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version purgeVersion) {
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
}
@ -2247,7 +2326,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
// get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
// represents the version of the latest snapshot file in this granule with G.version < pruneVersion
// represents the version of the latest snapshot file in this granule with G.version < purgeVersion
Version latestSnapshotVersion = invalidVersion;
state std::vector<Future<Void>> deletions; // deletion work per file
@ -2262,8 +2341,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
deletions.emplace_back(self->bstore->deleteFile(fname));
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, files.snapshotFiles[idx].version, 'S'));
filesToDelete.emplace_back(fname);
} else if (files.snapshotFiles[idx].version <= pruneVersion) {
// otherwise if this is the FIRST snapshot file with version < pruneVersion,
} else if (files.snapshotFiles[idx].version <= purgeVersion) {
// otherwise if this is the FIRST snapshot file with version < purgeVersion,
// then we found our latestSnapshotVersion (FIRST since we are traversing in reverse)
latestSnapshotVersion = files.snapshotFiles[idx].version;
}
@ -2289,19 +2368,19 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
}
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), deletions.size());
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {0}\n", filename);
}
}
// TODO: the following comment relies on the assumption that BWs will not get requests to
// read data that was already pruned. confirm assumption is fine. otherwise, we'd need
// to communicate with BWs here and have them ack the pruneVersion
// read data that was already purged. confirm assumption is fine. otherwise, we'd need
// to communicate with BWs here and have them ack the purgeVersion
// delete the files before the corresponding metadata.
// this could lead to dangling pointers in fdb, but we should never read data older than
// pruneVersion anyways, and we can clean up the keys the next time around.
// purgeVersion anyways, and we can clean up the keys the next time around.
// deleting files before corresponding metadata reduces the # of orphaned files.
wait(waitForAll(deletions));
@ -2329,26 +2408,41 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: success\n", granuleId.toString());
}
TraceEvent("GranulePartialPurge", self->id)
.detail("Epoch", self->epoch)
.detail("GranuleID", granuleId)
.detail("PurgeVersion", purgeVersion)
.detail("FilesPurged", filesToDelete.size());
++self->stats.granulesPartiallyPurged;
self->stats.filesPurged += filesToDelete.size();
return Void();
}
/*
* This method is used to prune the range [startKey, endKey) at (and including) pruneVersion.
* This method is used to purge the range [startKey, endKey) at (and including) purgeVersion.
* To do this, we do a BFS traversal starting at the active granules. Then we classify granules
* in the history as nodes that can be fully deleted (i.e. their files and history can be deleted)
* and nodes that can be partially deleted (i.e. some of their files can be deleted).
* Once all this is done, we finally clear the pruneIntent key, if possible, to indicate we are done
* processing this prune intent.
* Once all this is done, we finally clear the purgeIntent key, if possible, to indicate we are done
* processing this purge intent.
*/
ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range, Version pruneVersion, bool force) {
ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range, Version purgeVersion, bool force) {
if (BM_DEBUG) {
fmt::print("pruneRange starting for range [{0} - {1}) @ pruneVersion={2}, force={3}\n",
fmt::print("purgeRange starting for range [{0} - {1}) @ purgeVersion={2}, force={3}\n",
range.begin.printable(),
range.end.printable(),
pruneVersion,
purgeVersion,
force);
}
TraceEvent("PurgeGranulesBegin", self->id)
.detail("Epoch", self->epoch)
.detail("Range", range)
.detail("PurgeVersion", purgeVersion)
.detail("Force", force);
// queue of <range, startVersion, endVersion> for BFS traversal of history
state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue;
@ -2371,18 +2465,18 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
state KeyRangeMap<UID>::iterator activeRange;
for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) {
if (BM_DEBUG) {
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be pruned\n",
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be purged\n",
activeRange.begin().printable(),
activeRange.end().printable(),
activeRange.value().toString());
}
// assumption: prune boundaries must respect granule boundaries
// assumption: purge boundaries must respect granule boundaries
if (activeRange.begin() < range.begin || activeRange.end() > range.end) {
continue;
}
// TODO: if this is a force prune, then revoke the assignment from the corresponding BW first
// TODO: if this is a force purge, then revoke the assignment from the corresponding BW first
// so that it doesn't try to interact with the granule (i.e. force it to give up gLock).
// we'll need some way to ack that the revoke was successful
@ -2456,17 +2550,17 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
}
// There are three cases this granule can fall into:
// - if the granule's end version is at or before the prune version or this is a force delete,
// - if the granule's end version is at or before the purge version or this is a force delete,
// this granule should be completely deleted
// - else if the startVersion <= pruneVersion, then G.startVersion < pruneVersion < G.endVersion
// - else if the startVersion <= purgeVersion, then G.startVersion < purgeVersion < G.endVersion
// and so this granule should be partially deleted
// - otherwise, this granule is active, so don't schedule it for deletion
if (force || endVersion <= pruneVersion) {
if (force || endVersion <= purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
}
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey });
} else if (startVersion < pruneVersion) {
} else if (startVersion < purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
}
@ -2513,70 +2607,79 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
// we won't run into any issues with trying to "re-delete" a blob file since deleting
// a file that doesn't exist is considered successful
state std::vector<Future<Void>> partialDeletions;
state int i;
if (BM_DEBUG) {
fmt::print("{0} granules to fully delete\n", toFullyDelete.size());
}
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
UID granuleId;
state UID granuleId;
Key historyKey;
std::tie(granuleId, historyKey) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_DEBUG) {
fmt::print("About to fully delete granule {0}\n", granuleId.toString());
}
wait(fullyDeleteGranule(self, granuleId, historyKey));
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion));
}
if (BM_DEBUG) {
fmt::print("{0} granules to partially delete\n", toPartiallyDelete.size());
}
std::vector<Future<Void>> partialDeletions;
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId = toPartiallyDelete[i];
if (BM_DEBUG) {
fmt::print("About to partially delete granule {0}\n", granuleId.toString());
}
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, pruneVersion));
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion));
}
wait(waitForAll(partialDeletions));
// Now that all the necessary granules and their files have been deleted, we can
// clear the pruneIntent key to signify that the work is done. However, there could have been
// another pruneIntent that got written for this table while we were processing this one.
// clear the purgeIntent key to signify that the work is done. However, there could have been
// another purgeIntent that got written for this table while we were processing this one.
// If that is the case, we should not clear the key. Otherwise, we can just clear the key.
if (BM_DEBUG) {
fmt::print("Successfully pruned range [{0} - {1}) at pruneVersion={2}\n",
fmt::print("Successfully purged range [{0} - {1}) at purgeVersion={2}\n",
range.begin.printable(),
range.end.printable(),
pruneVersion);
purgeVersion);
}
TraceEvent("PurgeGranulesComplete", self->id)
.detail("Epoch", self->epoch)
.detail("Range", range)
.detail("PurgeVersion", purgeVersion)
.detail("Force", force);
++self->stats.purgesProcessed;
return Void();
}
/*
* This monitor watches for changes to a key K that gets updated whenever there is a new prune intent.
* On this change, we scan through all blobGranulePruneKeys (which look like <startKey, endKey>=<prune_version,
* force>) and prune any intents.
* This monitor watches for changes to a key K that gets updated whenever there is a new purge intent.
* On this change, we scan through all blobGranulePurgeKeys (which look like <startKey, endKey>=<purge_version,
* force>) and purge any intents.
*
* Once the prune has succeeded, we clear the key IF the version is still the same one that was pruned.
* That way, if another prune intent arrived for the same range while we were working on an older one,
* Once the purge has succeeded, we clear the key IF the version is still the same one that was purged.
* That way, if another purge intent arrived for the same range while we were working on an older one,
* we wouldn't end up clearing the intent.
*
* When watching for changes, we might end up in scenarios where we failed to do the work
* for a prune intent even though the watch was triggered (maybe the BM had a blip). This is problematic
* if the intent is a force and there isn't another prune intent for quite some time. To remedy this,
* if we don't see a watch change in X (configurable) seconds, we will just sweep through the prune intents,
* for a purge intent even though the watch was triggered (maybe the BM had a blip). This is problematic
* if the intent is a force and there isn't another purge intent for quite some time. To remedy this,
* if we don't see a watch change in X (configurable) seconds, we will just sweep through the purge intents,
* consolidating any work we might have missed before.
*
* Note: we could potentially use a changefeed here to get the exact pruneIntent that was added
* Note: we could potentially use a changefeed here to get the exact purgeIntent that was added
* rather than iterating through all of them, but this might have too much overhead for latency
* improvements we don't really need here (also we need to go over all prune intents anyways in the
* case that the timer is up before any new prune intents arrive).
* improvements we don't really need here (also we need to go over all purge intents anyways in the
* case that the timer is up before any new purge intents arrive).
*/
ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
self->initBStore();
loop {
@ -2585,35 +2688,35 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
// Wait for the watch to change, or some time to expire (whichever comes first)
// before checking through the prune intents. We write a UID into the change key value
// before checking through the purge intents. We write a UID into the change key value
// so that we can still recognize when the watch key has been changed while we weren't
// monitoring it
state Key lastPruneKey = blobGranulePruneKeys.begin;
state Key lastPurgeKey = blobGranulePurgeKeys.begin;
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state std::vector<Future<Void>> prunes;
state CoalescedKeyRangeMap<std::pair<Version, bool>> pruneMap;
pruneMap.insert(allKeys, std::make_pair<Version, bool>(0, false));
state std::vector<Future<Void>> purges;
state CoalescedKeyRangeMap<std::pair<Version, bool>> purgeMap;
purgeMap.insert(allKeys, std::make_pair<Version, bool>(0, false));
try {
// TODO: replace 10000 with a knob
state RangeResult pruneIntents = wait(tr->getRange(blobGranulePruneKeys, BUGGIFY ? 1 : 10000));
if (pruneIntents.size()) {
state RangeResult purgeIntents = wait(tr->getRange(blobGranulePurgeKeys, BUGGIFY ? 1 : 10000));
if (purgeIntents.size()) {
int rangeIdx = 0;
for (; rangeIdx < pruneIntents.size(); ++rangeIdx) {
Version pruneVersion;
for (; rangeIdx < purgeIntents.size(); ++rangeIdx) {
Version purgeVersion;
KeyRange range;
bool force;
std::tie(pruneVersion, range, force) =
decodeBlobGranulePruneValue(pruneIntents[rangeIdx].value);
auto ranges = pruneMap.intersectingRanges(range);
std::tie(purgeVersion, range, force) =
decodeBlobGranulePurgeValue(purgeIntents[rangeIdx].value);
auto ranges = purgeMap.intersectingRanges(range);
bool foundConflict = false;
for (auto it : ranges) {
if ((it.value().second && !force && it.value().first < pruneVersion) ||
(!it.value().second && force && pruneVersion < it.value().first)) {
if ((it.value().second && !force && it.value().first < purgeVersion) ||
(!it.value().second && force && purgeVersion < it.value().first)) {
foundConflict = true;
break;
}
@ -2621,39 +2724,41 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
if (foundConflict) {
break;
}
pruneMap.insert(range, std::make_pair(pruneVersion, force));
purgeMap.insert(range, std::make_pair(purgeVersion, force));
fmt::print("about to prune range [{0} - {1}) @ {2}, force={3}\n",
range.begin.printable(),
range.end.printable(),
pruneVersion,
force ? "T" : "F");
if (BM_DEBUG) {
fmt::print("about to purge range [{0} - {1}) @ {2}, force={3}\n",
range.begin.printable(),
range.end.printable(),
purgeVersion,
force ? "T" : "F");
}
}
lastPruneKey = pruneIntents[rangeIdx - 1].key;
lastPurgeKey = purgeIntents[rangeIdx - 1].key;
for (auto it : pruneMap.ranges()) {
for (auto it : purgeMap.ranges()) {
if (it.value().first > 0) {
prunes.emplace_back(pruneRange(self, it.range(), it.value().first, it.value().second));
purges.emplace_back(purgeRange(self, it.range(), it.value().first, it.value().second));
}
}
// wait for this set of prunes to complete before starting the next ones since if we
// prune a range R at version V and while we are doing that, the time expires, we will
// end up trying to prune the same range again since the work isn't finished and the
// prunes will race
// wait for this set of purges to complete before starting the next ones since if we
// purge a range R at version V and while we are doing that, the time expires, we will
// end up trying to purge the same range again since the work isn't finished and the
// purges will race
//
// TODO: this isn't that efficient though. Instead we could keep metadata as part of the
// BM's memory that tracks which prunes are active. Once done, we can mark that work as
// done. If the BM fails then all prunes will fail and so the next BM will have a clear
// BM's memory that tracks which purges are active. Once done, we can mark that work as
// done. If the BM fails then all purges will fail and so the next BM will have a clear
// set of metadata (i.e. no work in progress) so we will end up doing the work in the
// new BM
wait(waitForAll(prunes));
wait(waitForAll(purges));
break;
} else {
state Future<Void> watchPruneIntentsChange = tr->watch(blobGranulePruneChangeKey);
state Future<Void> watchPurgeIntentsChange = tr->watch(blobGranulePurgeChangeKey);
wait(tr->commit());
wait(watchPruneIntentsChange);
wait(watchPurgeIntentsChange);
tr->reset();
}
} catch (Error& e) {
@ -2666,7 +2771,7 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->clear(KeyRangeRef(blobGranulePruneKeys.begin, keyAfter(lastPruneKey)));
tr->clear(KeyRangeRef(blobGranulePurgeKeys.begin, keyAfter(lastPurgeKey)));
wait(tr->commit());
break;
} catch (Error& e) {
@ -2675,7 +2780,7 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
}
if (BM_DEBUG) {
printf("Done pruning current set of prune intents.\n");
printf("Done clearing current set of purge intents.\n");
}
}
}
@ -2876,7 +2981,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
self->addActor.send(doLockChecks(self));
self->addActor.send(monitorClientRanges(self));
self->addActor.send(monitorPruneKeys(self));
self->addActor.send(monitorPurgeKeys(self));
if (SERVER_KNOBS->BG_CONSISTENCY_CHECK_ENABLED) {
self->addActor.send(bgConsistencyCheck(self));
}

View File

@ -86,6 +86,7 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
NotifiedVersion durableSnapshotVersion; // same as delta vars, except for snapshots
Version pendingSnapshotVersion = 0;
Version initialSnapshotVersion = invalidVersion;
Version historyVersion = invalidVersion;
Version knownCommittedVersion;
int64_t originalEpoch;
@ -756,7 +757,11 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
bytesRead);
}
state Error err = e;
wait(tr->onError(e));
if (e.code() == error_code_server_overloaded) {
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
} else {
wait(tr->onError(e));
}
retries++;
TEST(true); // Granule initial snapshot failed
// FIXME: why can't we supress error event?
@ -935,13 +940,8 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
break;
}
bwData->currentManagerStatusStream.get().send(GranuleStatusReply(metadata->keyRange,
true,
writeHot,
statusEpoch,
statusSeqno,
granuleID,
metadata->initialSnapshotVersion));
bwData->currentManagerStatusStream.get().send(GranuleStatusReply(
metadata->keyRange, true, writeHot, statusEpoch, statusSeqno, granuleID, metadata->historyVersion));
break;
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
@ -1037,10 +1037,14 @@ static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
// if we get an i/o error updating files, or a rollback, reassign the granule to ourselves and start fresh
static bool granuleCanRetry(const Error& e) {
switch (e.code()) {
case error_code_please_reboot:
case error_code_io_error:
case error_code_io_timeout:
// FIXME: handle connection errors in tighter retry loop around individual files.
// FIXME: if these requests fail at a high enough rate, the whole worker should be marked as unhealthy and its
// granules should be moved away, as there may be some problem with this host contacting blob storage
case error_code_http_request_failed:
case error_code_connection_failed:
case error_code_lookup_failed: // dns
return true;
default:
return false;
@ -1119,10 +1123,15 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
}
metadata->pendingDeltaVersion = cfRollbackVersion;
if (BW_DEBUG) {
fmt::print("[{0} - {1}) rollback discarding all {2} in-memory mutations\n",
fmt::print("[{0} - {1}) rollback discarding all {2} in-memory mutations",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
metadata->currentDeltas.size());
if (metadata->currentDeltas.size()) {
fmt::print(
" {0} - {1}", metadata->currentDeltas.front().version, metadata->currentDeltas.back().version);
}
fmt::print("\n");
}
// discard all in-memory mutations
@ -1150,6 +1159,8 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
// FIXME: could binary search?
int mIdx = metadata->currentDeltas.size() - 1;
Version firstDiscarded = invalidVersion;
Version lastDiscarded = invalidVersion;
while (mIdx >= 0) {
if (metadata->currentDeltas[mIdx].version <= rollbackVersion) {
break;
@ -1157,19 +1168,37 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
for (auto& m : metadata->currentDeltas[mIdx].mutations) {
metadata->bufferedDeltaBytes -= m.totalSize();
}
if (firstDiscarded == invalidVersion) {
firstDiscarded = metadata->currentDeltas[mIdx].version;
}
lastDiscarded = metadata->currentDeltas[mIdx].version;
mIdx--;
}
mIdx++;
if (BW_DEBUG) {
fmt::print("[{0} - {1}) rollback discarding {2} in-memory mutations, {3} mutations and {4} bytes left\n",
fmt::print("[{0} - {1}) rollback discarding {2} in-memory mutations",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
metadata->currentDeltas.size() - mIdx,
mIdx,
metadata->bufferedDeltaBytes);
metadata->currentDeltas.size() - mIdx - 1);
if (firstDiscarded != invalidVersion) {
fmt::print(" {0} - {1}", lastDiscarded, firstDiscarded);
}
fmt::print(", {0} mutations", mIdx);
if (mIdx >= 0) {
fmt::print(
" ({0} - {1})", metadata->currentDeltas.front().version, metadata->currentDeltas[mIdx].version);
}
fmt::print(" and {0} bytes left\n", metadata->bufferedDeltaBytes);
}
metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx);
if (mIdx < 0) {
metadata->currentDeltas = Standalone<GranuleDeltas>();
metadata->bufferedDeltaBytes = 0;
} else {
metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx + 1);
}
// delete all deltas in rollback range, but we can optimize here to just skip the uncommitted mutations
// directly and immediately pop the rollback out of inProgress to completed
@ -1328,6 +1357,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->pendingSnapshotVersion = metadata->files.snapshotFiles.back().version;
metadata->durableSnapshotVersion.set(metadata->pendingSnapshotVersion);
metadata->initialSnapshotVersion = metadata->files.snapshotFiles.front().version;
metadata->historyVersion = startState.history.get().version;
} else {
if (startState.blobFilesToSnapshot.present()) {
startVersion = startState.previousDurableVersion;
@ -1350,6 +1380,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
metadata->initialSnapshotVersion = startVersion;
metadata->pendingSnapshotVersion = startVersion;
metadata->historyVersion = startState.history.present() ? startState.history.get().version : startVersion;
}
metadata->durableDeltaVersion.set(startVersion);
@ -1459,8 +1490,16 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
ASSERT(mutations.front().version > metadata->bufferedDeltaVersion);
// If this assert trips we should have gotten change_feed_popped from SS and didn't
ASSERT(mutations.front().version >= metadata->activeCFData.get()->popVersion);
// Rare race from merge cursor where no individual server detected popped in their response
if (mutations.front().version < metadata->activeCFData.get()->popVersion) {
TEST(true); // Blob Worker detected popped instead of change feed
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("GranuleID", startState.granuleID)
.detail("MutationVersion", mutations.front().version)
.detail("PopVersion", metadata->activeCFData.get()->popVersion);
throw change_feed_popped();
}
}
when(wait(inFlightFiles.empty() ? Never() : success(inFlightFiles.front().future))) {}
}
@ -1623,6 +1662,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->activeCFData.set(cfData);
justDidRollback = true;
lastDeltaVersion = cfRollbackVersion;
break;
}
}
@ -1841,6 +1881,12 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
}
} catch (Error& e) {
if (BW_DEBUG) {
fmt::print("Granule file updater for [{0} - {1}) got error {2}, exiting\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
e.name());
}
// Free last change feed data
metadata->activeCFData.set(Reference<ChangeFeedData>());
@ -1871,12 +1917,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
return Void();
}
++bwData->stats.granuleUpdateErrors;
if (BW_DEBUG) {
fmt::print("Granule file updater for [{0} - {1}) got error {2}, exiting\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
e.name());
}
if (granuleCanRetry(e)) {
TEST(true); // Granule close and re-open on error
@ -2002,6 +2042,14 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
int skipped = historyEntryStack.size() - 1 - i;
while (i >= 0) {
auto intersectingRanges = bwData->granuleHistory.intersectingRanges(historyEntryStack[i]->range);
std::vector<std::pair<KeyRange, Reference<GranuleHistoryEntry>>> newerHistory;
for (auto& r : intersectingRanges) {
if (r.value().isValid() && r.value()->endVersion >= historyEntryStack[i]->endVersion) {
newerHistory.push_back(std::make_pair(r.range(), r.value()));
}
}
auto prevRanges = bwData->granuleHistory.rangeContaining(historyEntryStack[i]->range.begin);
if (prevRanges.value().isValid() &&
@ -2012,6 +2060,9 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
}
bwData->granuleHistory.insert(historyEntryStack[i]->range, historyEntryStack[i]);
for (auto& it : newerHistory) {
bwData->granuleHistory.insert(it.first, it.second);
}
i--;
}
@ -2137,7 +2188,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
if (req.beginVersion > 0) {
fmt::print("{0} - {1}\n", req.beginVersion, req.readVersion);
} else {
fmt::print("{}", req.readVersion);
fmt::print("{}\n", req.readVersion);
}
}
@ -2210,7 +2261,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state KeyRange chunkRange;
state GranuleFiles chunkFiles;
if (metadata->initialSnapshotVersion > req.readVersion) {
if (req.readVersion < metadata->historyVersion) {
TEST(true); // Granule Time Travel Read
// this is a time travel query, find previous granule
if (metadata->historyLoaded.canBeSet()) {
@ -2226,7 +2277,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
Reference<GranuleHistoryEntry> cur = bwData->granuleHistory.rangeContaining(historySearchKey).value();
// FIXME: use skip pointers here
Version expectedEndVersion = metadata->initialSnapshotVersion;
Version expectedEndVersion = metadata->historyVersion;
if (cur.isValid()) {
ASSERT(cur->endVersion == expectedEndVersion);
}
@ -2269,17 +2320,22 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
if (chunkFiles.snapshotFiles.empty()) {
// a snapshot file must have been pruned
// a snapshot file must have been purged
throw blob_granule_transaction_too_old();
}
ASSERT(!chunkFiles.deltaFiles.empty());
ASSERT(chunkFiles.deltaFiles.back().version > req.readVersion);
if (chunkFiles.snapshotFiles.front().version > req.readVersion) {
// a snapshot file must have been pruned
// a snapshot file must have been purged
throw blob_granule_transaction_too_old();
}
} else {
if (req.readVersion < metadata->initialSnapshotVersion) {
// a snapshot file must have been pruned
throw blob_granule_transaction_too_old();
}
TEST(true); // Granule Active Read
// this is an active granule query
loop {
@ -2287,7 +2343,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
throw wrong_shard_server();
}
Future<Void> waitForVersionFuture = waitForVersion(metadata, req.readVersion);
if (waitForVersionFuture.isReady()) {
if (waitForVersionFuture.isReady() && !waitForVersionFuture.isError()) {
// didn't wait, so no need to check rollback stuff
break;
}

View File

@ -2410,24 +2410,26 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
wait(lowPriorityDelay(SERVER_KNOBS->CC_WORKER_HEALTH_CHECKING_INTERVAL));
}
self->degradedServers = self->getServersWithDegradedLink();
self->degradationInfo = self->getDegradationInfo();
// Compare `self->degradedServers` with `self->excludedDegradedServers` and remove those that have
// recovered.
for (auto it = self->excludedDegradedServers.begin(); it != self->excludedDegradedServers.end();) {
if (self->degradedServers.find(*it) == self->degradedServers.end()) {
if (self->degradationInfo.degradedServers.find(*it) == self->degradationInfo.degradedServers.end()) {
self->excludedDegradedServers.erase(it++);
} else {
++it;
}
}
if (!self->degradedServers.empty()) {
if (!self->degradationInfo.degradedServers.empty() || self->degradationInfo.degradedSatellite) {
std::string degradedServerString;
for (const auto& server : self->degradedServers) {
for (const auto& server : self->degradationInfo.degradedServers) {
degradedServerString += server.toString() + " ";
}
TraceEvent("ClusterControllerHealthMonitor").detail("DegradedServers", degradedServerString);
TraceEvent("ClusterControllerHealthMonitor")
.detail("DegradedServers", degradedServerString)
.detail("DegradedSatellite", self->degradationInfo.degradedSatellite);
// Check if the cluster controller should trigger a recovery to exclude any degraded servers from
// the transaction system.
@ -2435,7 +2437,7 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
if (SERVER_KNOBS->CC_HEALTH_TRIGGER_RECOVERY) {
if (self->recentRecoveryCountDueToHealth() < SERVER_KNOBS->CC_MAX_HEALTH_RECOVERY_COUNT) {
self->recentHealthTriggeredRecoveryTime.push(now());
self->excludedDegradedServers = self->degradedServers;
self->excludedDegradedServers = self->degradationInfo.degradedServers;
TraceEvent("DegradedServerDetectedAndTriggerRecovery")
.detail("RecentRecoveryCountDueToHealth", self->recentRecoveryCountDueToHealth());
self->db.forceMasterFailure.trigger();
@ -2784,7 +2786,7 @@ TEST_CASE("/fdbserver/clustercontroller/updateRecoveredWorkers") {
return Void();
}
TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
// Create a testing ClusterControllerData. Most of the internal states do not matter in this test.
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
@ -2800,18 +2802,18 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
// cluster controller.
{
data.workerHealth[worker].degradedPeers[badPeer1] = { now(), now() };
ASSERT(data.getServersWithDegradedLink().empty());
ASSERT(data.getDegradationInfo().degradedServers.empty());
data.workerHealth.clear();
}
// Test that when there is only one reported degraded link, getServersWithDegradedLink can return correct
// Test that when there is only one reported degraded link, getDegradationInfo can return correct
// degraded server.
{
data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
auto degradedServers = data.getServersWithDegradedLink();
ASSERT(degradedServers.size() == 1);
ASSERT(degradedServers.find(badPeer1) != degradedServers.end());
auto degradationInfo = data.getDegradationInfo();
ASSERT(degradationInfo.degradedServers.size() == 1);
ASSERT(degradationInfo.degradedServers.find(badPeer1) != degradationInfo.degradedServers.end());
data.workerHealth.clear();
}
@ -2821,10 +2823,10 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer1].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
auto degradedServers = data.getServersWithDegradedLink();
ASSERT(degradedServers.size() == 1);
ASSERT(degradedServers.find(worker) != degradedServers.end() ||
degradedServers.find(badPeer1) != degradedServers.end());
auto degradationInfo = data.getDegradationInfo();
ASSERT(degradationInfo.degradedServers.size() == 1);
ASSERT(degradationInfo.degradedServers.find(worker) != degradationInfo.degradedServers.end() ||
degradationInfo.degradedServers.find(badPeer1) != degradationInfo.degradedServers.end());
data.workerHealth.clear();
}
@ -2839,9 +2841,9 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer2].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
auto degradedServers = data.getServersWithDegradedLink();
ASSERT(degradedServers.size() == 1);
ASSERT(degradedServers.find(worker) != degradedServers.end());
auto degradationInfo = data.getDegradationInfo();
ASSERT(degradationInfo.degradedServers.size() == 1);
ASSERT(degradationInfo.degradedServers.find(worker) != degradationInfo.degradedServers.end());
data.workerHealth.clear();
}
@ -2856,7 +2858,7 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer4].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
ASSERT(data.getServersWithDegradedLink().empty());
ASSERT(data.getDegradationInfo().degradedServers.empty());
data.workerHealth.clear();
}
@ -2880,7 +2882,7 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer4].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
ASSERT(data.getServersWithDegradedLink().empty());
ASSERT(data.getDegradationInfo().degradedServers.empty());
data.workerHealth.clear();
}
@ -2977,42 +2979,42 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerRecoveryDueToDegradedServer
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
// Trigger recovery when master is degraded.
data.degradedServers.insert(master);
data.degradationInfo.degradedServers.insert(master);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger recovery when primary TLog is degraded.
data.degradedServers.insert(tlog);
data.degradationInfo.degradedServers.insert(tlog);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when satellite Tlog is degraded.
data.degradedServers.insert(satelliteTlog);
data.degradationInfo.degradedServers.insert(satelliteTlog);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when remote tlog is degraded.
data.degradedServers.insert(remoteTlog);
data.degradationInfo.degradedServers.insert(remoteTlog);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when log router is degraded.
data.degradedServers.insert(logRouter);
data.degradationInfo.degradedServers.insert(logRouter);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when backup worker is degraded.
data.degradedServers.insert(backup);
data.degradationInfo.degradedServers.insert(backup);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger recovery when proxy is degraded.
data.degradedServers.insert(proxy);
data.degradationInfo.degradedServers.insert(proxy);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger recovery when resolver is degraded.
data.degradedServers.insert(resolver);
data.degradationInfo.degradedServers.insert(resolver);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
return Void();
@ -3090,16 +3092,16 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
// No failover when small number of degraded servers
data.degradedServers.insert(master);
data.degradationInfo.degradedServers.insert(master);
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger failover when enough servers in the txn system are degraded.
data.degradedServers.insert(master);
data.degradedServers.insert(tlog);
data.degradedServers.insert(proxy);
data.degradedServers.insert(proxy2);
data.degradedServers.insert(resolver);
data.degradationInfo.degradedServers.insert(master);
data.degradationInfo.degradedServers.insert(tlog);
data.degradationInfo.degradedServers.insert(proxy);
data.degradationInfo.degradedServers.insert(proxy2);
data.degradationInfo.degradedServers.insert(resolver);
ASSERT(data.shouldTriggerFailoverDueToDegradedServers());
// No failover when usable region is 1.
@ -3108,18 +3110,29 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
data.db.config.usableRegions = 2;
// No failover when remote is also degraded.
data.degradedServers.insert(remoteTlog);
data.degradationInfo.degradedServers.insert(remoteTlog);
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No failover when some are not from transaction system
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 1));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 2));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 3));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 4));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 5));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 1));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 2));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 3));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 4));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 5));
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger failover when satellite is degraded.
data.degradationInfo.degradedSatellite = true;
ASSERT(data.shouldTriggerFailoverDueToDegradedServers());
data.degradationInfo.degradedServers.clear();
// No failover when satellite is degraded, but remote is not healthy.
data.degradationInfo.degradedSatellite = true;
data.degradationInfo.degradedServers.insert(remoteTlog);
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradationInfo.degradedServers.clear();
return Void();
}

View File

@ -2981,9 +2981,16 @@ public:
}
}
struct DegradationInfo {
std::unordered_set<NetworkAddress>
degradedServers; // The servers that the cluster controller is considered as degraded. The servers in this
// list are not excluded unless they are added to `excludedDegradedServers`.
bool degradedSatellite = false; // Indicates that the entire satellite DC is degraded.
};
// Returns a list of servers who are experiencing degraded links. These are candidates to perform exclusion. Note
// that only one endpoint of a bad link will be included in this list.
std::unordered_set<NetworkAddress> getServersWithDegradedLink() {
DegradationInfo getDegradationInfo() {
updateRecoveredWorkers();
// Build a map keyed by measured degraded peer. This map gives the info that who complains a particular server.
@ -3014,7 +3021,11 @@ public:
//
// For example, if server A is already considered as a degraded server, and A complains B, we won't add B as
// degraded since A is already considered as degraded.
//
// In the meantime, we also count the number of satellite workers got complained. If enough number of satellite
// workers are degraded, this may indicates that the whole network between primary and satellite is bad.
std::unordered_set<NetworkAddress> currentDegradedServers;
int satelliteBadServerCount = 0;
for (const auto& [complainerCount, badServer] : count2DegradedPeer) {
for (const auto& complainer : degradedLinkDst2Src[badServer]) {
if (currentDegradedServers.find(complainer) == currentDegradedServers.end()) {
@ -3022,23 +3033,36 @@ public:
break;
}
}
if (SERVER_KNOBS->CC_ENABLE_ENTIRE_SATELLITE_MONITORING &&
addressInDbAndPrimarySatelliteDc(badServer, db.serverInfo) &&
complainerCount >= SERVER_KNOBS->CC_SATELLITE_DEGRADATION_MIN_COMPLAINER) {
++satelliteBadServerCount;
}
}
// For degraded server that are complained by more than SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE, we
// don't know if it is a hot server, or the network is bad. We remove from the returned degraded server list.
std::unordered_set<NetworkAddress> currentDegradedServersWithinLimit;
DegradationInfo currentDegradationInfo;
for (const auto& badServer : currentDegradedServers) {
if (degradedLinkDst2Src[badServer].size() <= SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE) {
currentDegradedServersWithinLimit.insert(badServer);
currentDegradationInfo.degradedServers.insert(badServer);
}
}
return currentDegradedServersWithinLimit;
// If enough number of satellite workers are bad, we mark the entire satellite is bad. Note that this needs to
// be used with caution (controlled by CC_ENABLE_ENTIRE_SATELLITE_MONITORING knob), since the slow workers may
// also be caused by workload.
if (satelliteBadServerCount >= SERVER_KNOBS->CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER) {
currentDegradationInfo.degradedSatellite = true;
}
return currentDegradationInfo;
}
// Whether the transaction system (in primary DC if in HA setting) contains degraded servers.
bool transactionSystemContainsDegradedServers() {
const ServerDBInfo dbi = db.serverInfo->get();
for (const auto& excludedServer : degradedServers) {
for (const auto& excludedServer : degradationInfo.degradedServers) {
if (dbi.master.addresses().contains(excludedServer)) {
return true;
}
@ -3083,7 +3107,7 @@ public:
return false;
}
for (const auto& excludedServer : degradedServers) {
for (const auto& excludedServer : degradationInfo.degradedServers) {
if (addressInDbAndRemoteDc(excludedServer, db.serverInfo)) {
return true;
}
@ -3121,7 +3145,7 @@ public:
// Returns true when the cluster controller should trigger a recovery due to degraded servers used in the
// transaction system in the primary data center.
bool shouldTriggerRecoveryDueToDegradedServers() {
if (degradedServers.size() > SERVER_KNOBS->CC_MAX_EXCLUSION_DUE_TO_HEALTH) {
if (degradationInfo.degradedServers.size() > SERVER_KNOBS->CC_MAX_EXCLUSION_DUE_TO_HEALTH) {
return false;
}
@ -3154,8 +3178,14 @@ public:
return false;
}
if (degradedServers.size() < SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MIN_DEGRADATION ||
degradedServers.size() > SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION) {
bool remoteIsHealthy = !remoteTransactionSystemContainsDegradedServers();
if (degradationInfo.degradedSatellite && remoteIsHealthy) {
// If the satellite DC is bad, a failover is desired despite the number of degraded servers.
return true;
}
if (degradationInfo.degradedServers.size() < SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MIN_DEGRADATION ||
degradationInfo.degradedServers.size() > SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION) {
return false;
}
@ -3165,7 +3195,7 @@ public:
return false;
}
return transactionSystemContainsDegradedServers() && !remoteTransactionSystemContainsDegradedServers();
return transactionSystemContainsDegradedServers() && remoteIsHealthy;
}
int recentRecoveryCountDueToHealth() {
@ -3248,9 +3278,7 @@ public:
// TODO(zhewu): Include disk and CPU signals.
};
std::unordered_map<NetworkAddress, WorkerHealth> workerHealth;
std::unordered_set<NetworkAddress>
degradedServers; // The servers that the cluster controller is considered as degraded. The servers in this list
// are not excluded unless they are added to `excludedDegradedServers`.
DegradationInfo degradationInfo;
std::unordered_set<NetworkAddress>
excludedDegradedServers; // The degraded servers to be excluded when assigning workers to roles.
std::queue<double> recentHealthTriggeredRecoveryTime;

View File

@ -342,6 +342,7 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = deterministicRandom()->randomUniqueID();
isr.clusterId = self->clusterId;
isr.initialClusterVersion = self->recoveryTransactionVersion;
ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));
@ -989,8 +990,12 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
newTLogServers(self, recruits, oldLogSystem, &confChanges));
// Update recovery related information to the newly elected sequencer (master) process.
wait(brokenPromiseToNever(self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(
self->recoveryTransactionVersion, self->lastEpochEnd, self->commitProxies, self->resolvers))));
wait(brokenPromiseToNever(
self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(self->recoveryTransactionVersion,
self->lastEpochEnd,
self->commitProxies,
self->resolvers,
self->versionEpoch))));
return confChanges;
}
@ -1036,6 +1041,14 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
self->txnStateStore =
keyValueStoreLogSystem(self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true);
// Version 0 occurs at the version epoch. The version epoch is the number
// of microseconds since the Unix epoch. It can be set through fdbcli.
self->versionEpoch.reset();
Optional<Standalone<StringRef>> versionEpochValue = wait(self->txnStateStore->readValue(versionEpochKey));
if (versionEpochValue.present()) {
self->versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
}
// Versionstamped operations (particularly those applied from DR) define a minimum commit version
// that we may recover to, as they embed the version in user-readable data and require that no
// transactions will be committed at a lower version.
@ -1046,6 +1059,11 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
if (requiredCommitVersion.present()) {
minRequiredCommitVersion = BinaryReader::fromStringRef<Version>(requiredCommitVersion.get(), Unversioned());
}
if (g_network->isSimulated() && self->versionEpoch.present()) {
minRequiredCommitVersion = std::max(
minRequiredCommitVersion,
static_cast<Version>(g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->versionEpoch.get()));
}
// Recover version info
self->lastEpochEnd = oldLogSystem->getEnd() - 1;
@ -1058,14 +1076,14 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
}
if (BUGGIFY) {
self->recoveryTransactionVersion +=
deterministicRandom()->randomInt64(0, SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT);
}
if (self->recoveryTransactionVersion < minRequiredCommitVersion)
self->recoveryTransactionVersion = minRequiredCommitVersion;
}
if (BUGGIFY) {
self->recoveryTransactionVersion += deterministicRandom()->randomInt64(0, 10000000);
}
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME).c_str(),
self->dbgid)
.detail("LastEpochEnd", self->lastEpochEnd)

View File

@ -169,6 +169,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
AsyncTrigger registrationTrigger;
Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery
recoveryTransactionVersion; // The first version in this epoch
Optional<int64_t> versionEpoch; // The epoch which all versions are based off of
double lastCommitTime;
Version liveCommittedVersion; // The largest live committed version reported by commit proxies.
@ -209,6 +210,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;
UID clusterId;
Version initialClusterVersion = -1;
Standalone<StringRef> dbId;
MasterInterface masterInterface;

View File

@ -42,6 +42,7 @@
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/BooleanParam.h"
#include "flow/genericactors.actor.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
@ -754,7 +755,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
shardsAffectedByTeamFailure,
lock,
getAverageShardBytes,
getUnhealthyRelocationCount,
getUnhealthyRelocationCount.getFuture(),
self->ddId,
storageTeamSize,
configuration.storageTeamSize,
@ -902,8 +903,39 @@ Future<Void> sendSnapReq(RequestStream<Req> stream, Req req, Error e) {
return Void();
}
ACTOR template <class Req>
Future<ErrorOr<Void>> trySendSnapReq(RequestStream<Req> stream, Req req) {
ErrorOr<REPLY_TYPE(Req)> reply = wait(stream.tryGetReply(req));
if (reply.isError()) {
TraceEvent("SnapDataDistributor_ReqError")
.errorUnsuppressed(reply.getError())
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
return ErrorOr<Void>(reply.getError());
}
return ErrorOr<Void>(Void());
}
ACTOR static Future<Void> waitForMost(std::vector<Future<ErrorOr<Void>>> futures,
int faultTolerance,
Error e,
double waitMultiplierForSlowFutures = 1.0) {
state std::vector<Future<bool>> successFutures;
state double startTime = now();
successFutures.reserve(futures.size());
for (const auto& future : futures) {
successFutures.push_back(fmap([](auto const& result) { return result.present(); }, future));
}
bool success = wait(quorumEqualsTrue(successFutures, successFutures.size() - faultTolerance));
if (!success) {
throw e;
}
wait(delay((now() - startTime) * waitMultiplierForSlowFutures) || waitForAll(successFutures));
return Void();
}
ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<AsyncVar<ServerDBInfo> const> db) {
state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
state ReadYourWritesTransaction tr(cx);
loop {
try {
@ -938,19 +970,29 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap local storage nodes
std::vector<WorkerInterface> storageWorkers =
// TODO: Atomically read configuration and storage worker list in a single transaction
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
std::pair<std::vector<WorkerInterface>, int> storageWorkersAndFailures =
wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
const auto& [storageWorkers, storageFailures] = storageWorkersAndFailures;
auto const storageFaultTolerance =
std::min(static_cast<int>(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE),
configuration.storageTeamSize - 1) -
storageFailures;
if (storageFaultTolerance < 0) {
TEST(true); // Too many failed storage servers to complete snapshot
throw snap_storage_failed();
}
TraceEvent("SnapDataDistributor_GotStorageWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> storageSnapReqs;
std::vector<Future<ErrorOr<Void>>> storageSnapReqs;
storageSnapReqs.reserve(storageWorkers.size());
for (const auto& worker : storageWorkers) {
storageSnapReqs.push_back(sendSnapReq(worker.workerSnapReq,
WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr),
snap_storage_failed()));
storageSnapReqs.push_back(trySendSnapReq(
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
}
wait(waitForAll(storageSnapReqs));
wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
TraceEvent("SnapDataDistributor_AfterSnapStorage")
.detail("SnapPayload", snapReq.snapPayload)
@ -985,14 +1027,15 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
TraceEvent("SnapDataDistributor_GotCoordWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> coordSnapReqs;
std::vector<Future<ErrorOr<Void>>> coordSnapReqs;
coordSnapReqs.reserve(coordWorkers.size());
for (const auto& worker : coordWorkers) {
coordSnapReqs.push_back(sendSnapReq(worker.workerSnapReq,
WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr),
snap_coord_failed()));
coordSnapReqs.push_back(trySendSnapReq(
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr)));
}
wait(waitForAll(coordSnapReqs));
auto const coordFaultTolerance = std::min<int>(std::max<int>(0, coordSnapReqs.size() / 2 - 1),
SERVER_KNOBS->MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE);
wait(waitForMost(coordSnapReqs, coordFaultTolerance, snap_coord_failed()));
TraceEvent("SnapDataDistributor_AfterSnapCoords")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
@ -1262,3 +1305,44 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
return Void();
}
// Test helper: a future that becomes ready after `duration` seconds holding a
// successful ErrorOr<Void> result.
static Future<ErrorOr<Void>> goodTestFuture(double duration) {
return tag(delay(duration), ErrorOr<Void>(Void()));
}
// Test helper: a future that becomes ready after `duration` seconds holding the
// error `e` inside its ErrorOr<Void> result (the future itself does not throw).
static Future<ErrorOr<Void>> badTestFuture(double duration, Error e) {
return tag(delay(duration), ErrorOr<Void>(e));
}
// Unit test for waitForMost(): checks that the tolerance parameter controls how many
// of the input futures must complete, that the final float parameter controls whether
// stragglers are waited on, and that errors beyond the tolerance are rethrown as the
// supplied error.
TEST_CASE("/DataDistribution/WaitForMost") {
state std::vector<Future<ErrorOr<Void>>> futures;
{
// Tolerance 1 of 3: may return before the slowest future is ready.
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
wait(waitForMost(futures, 1, operation_failed(), 0.0)); // Don't wait for slowest future
ASSERT(!futures[2].isReady());
}
{
// Tolerance 0: every future must be ready before returning.
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
wait(waitForMost(futures, 0, operation_failed(), 0.0)); // Wait for all futures
ASSERT(futures[2].isReady());
}
{
// With a nonzero final parameter the slowest future is waited on even with tolerance 1.
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Wait for slowest future
ASSERT(futures[2].isReady());
}
{
// A single failing future is within tolerance 1, so no error is thrown.
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Error ignored
}
{
// A single failing future exceeds tolerance 0: waitForMost must throw the supplied
// error (operation_failed), not the error held by the failing future.
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
try {
wait(waitForMost(futures, 0, operation_failed(), 1.0));
ASSERT(false);
} catch (Error& e) {
ASSERT_EQ(e.code(), error_code_operation_failed);
}
}
return Void();
}

View File

@ -308,7 +308,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
FutureStream<Promise<int>> getUnhealthyRelocationCount,
UID distributorId,
int teamSize,
int singleRegionTeamSize,

View File

@ -1059,6 +1059,16 @@ struct DDQueueData {
validate();
}
// Returns the highest priority among relocations that are currently in progress
// (i.e. have a nonzero count), or 0 if no relocations are active.
int getHighestPriorityRelocation() const {
	int best = 0;
	for (auto it = priority_relocations.begin(); it != priority_relocations.end(); ++it) {
		if (it->second > 0 && it->first > best) {
			best = it->first;
		}
	}
	return best;
}
};
// return -1 if a.readload > b.readload
@ -1987,7 +1997,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
FutureStream<Promise<int>> getUnhealthyRelocationCount,
UID distributorId,
int teamSize,
int singleRegionTeamSize,
@ -2090,12 +2100,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL, TaskPriority::FlushTrace);
int highestPriorityRelocation = 0;
for (auto it = self.priority_relocations.begin(); it != self.priority_relocations.end(); ++it) {
if (it->second) {
highestPriorityRelocation = std::max(highestPriorityRelocation, it->first);
}
}
auto const highestPriorityRelocation = self.getHighestPriorityRelocation();
TraceEvent("MovingData", distributorId)
.detail("InFlight", self.activeRelocations)
@ -2135,9 +2140,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(balancingFutures))) {}
when(Promise<int> r = waitNext(getUnhealthyRelocationCount.getFuture())) {
r.send(self.unhealthyRelocations);
}
when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); }
}
}
} catch (Error& e) {

View File

@ -1177,15 +1177,16 @@ public:
struct Cursor {
Cursor() : cache(nullptr), nodeIndex(-1) {}
Cursor(DecodeCache* cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {}
Cursor(Reference<DecodeCache> cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {}
Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : tree(tree), cache(cache), nodeIndex(nodeIndex) {}
Cursor(Reference<DecodeCache> cache, DeltaTree2* tree, int nodeIndex)
: tree(tree), cache(cache), nodeIndex(nodeIndex) {}
// Copy constructor does not copy item because normally a copied cursor will be immediately moved.
Cursor(const Cursor& c) : tree(c.tree), cache(c.cache), nodeIndex(c.nodeIndex) {}
~Cursor() {
if (cache != nullptr) {
if (cache.isValid()) {
cache->updateUsedMemory();
}
}
@ -1212,7 +1213,7 @@ public:
}
DeltaTree2* tree;
DecodeCache* cache;
Reference<DecodeCache> cache;
int nodeIndex;
mutable Optional<T> item;
@ -1274,6 +1275,7 @@ public:
return item.get();
}
// Switch the cursor to point to a new DeltaTree
void switchTree(DeltaTree2* newTree) {
tree = newTree;
// Reset item because it may point into tree memory
@ -1709,7 +1711,13 @@ public:
} else {
nodeBytesUsed = 0;
}
ASSERT(size() <= spaceAvailable);
nodeBytesFree = spaceAvailable - size();
// Zero unused available space
memset((uint8_t*)this + size(), 0, nodeBytesFree);
return size();
}
@ -1782,8 +1790,15 @@ private:
node.setLeftChildOffset(largeNodes, leftChildOffset);
node.setRightChildOffset(largeNodes, rightChildOffset);
deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str());
int written = wptr - (uint8_t*)&node;
deltatree_printf("Built subtree tree=%p subtreeRoot=%p written=%d end=%p serialized subtreeRoot %s as %s \n",
this,
&node,
written,
(uint8_t*)&node + written,
item.toString().c_str(),
node.toString(this).c_str());
return wptr - (uint8_t*)&node;
return written;
}
};

View File

@ -20,22 +20,24 @@
#ifndef FDBSERVER_IPAGER_H
#define FDBSERVER_IPAGER_H
#include "flow/Error.h"
#include "flow/FastAlloc.h"
#include "flow/ProtocolVersion.h"
#include <cstddef>
#include <stdint.h>
#pragma once
#include "fdbserver/IKeyValueStore.h"
#include "flow/flow.h"
#include "fdbclient/FDBTypes.h"
#define XXH_INLINE_ALL
#include "flow/xxhash.h"
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
#endif
typedef uint32_t LogicalPageID;
typedef uint32_t PhysicalPageID;
#define invalidLogicalPageID std::numeric_limits<LogicalPageID>::max()
#define invalidPhysicalPageID std::numeric_limits<PhysicalPageID>::max()
typedef uint32_t QueueID;
#define invalidQueueID std::numeric_limits<QueueID>::max()
@ -76,90 +78,509 @@ static const std::vector<std::pair<PagerEvents, PagerEventReasons>> L0PossibleEv
{ PagerEvents::PageWrite, PagerEventReasons::MetaData },
};
// Represents a block of memory in a 4096-byte aligned location held by an Arena.
enum EncodingType : uint8_t {
XXHash64 = 0,
// For testing purposes
XOREncryption = 1
};
enum PageType : uint8_t {
HeaderPage = 0,
BackupHeaderPage = 1,
BTreeNode = 2,
BTreeSuperNode = 3,
QueuePageStandalone = 4,
QueuePageInExtent = 5
};
// Encryption key ID
typedef uint64_t KeyID;
// EncryptionKeyRef is somewhat multi-variant: it contains members representing the
// union of all fields relevant to any implemented encryption scheme. They generally
// fall into
//   Page Fields   - fields which come from or are stored in the Page
//   Secret Fields - fields which are only known by the Key Provider
// but it is up to each encoding and provider which fields are which and which are used.
struct EncryptionKeyRef {
	EncryptionKeyRef() = default;
	EncryptionKeyRef(Arena& arena, const EncryptionKeyRef& toCopy) : secret(arena, toCopy.secret), id(toCopy.id) {}

	// Size of the out-of-line data referenced by this struct.
	int expectedSize() const { return secret.size(); }

	StringRef secret;
	Optional<KeyID> id;
};
typedef Standalone<EncryptionKeyRef> EncryptionKey;
// Interface used by pager to get encryption keys by ID when reading pages from disk
// and by the BTree to get encryption keys to use for new pages
class IEncryptionKeyProvider {
public:
virtual ~IEncryptionKeyProvider() {}

// Get an EncryptionKey with Secret Fields populated based on the given Page Fields.
// It is up to the implementation which fields those are.
// The output Page Fields must match the input Page Fields.
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) = 0;

// Get the encryption key that should be used for a given user Key-Value range
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) = 0;
};
// This is a hacky way to attach an additional object of an arbitrary type at runtime to another object.
// It stores an arbitrary void pointer and a void pointer function to call when the ArbitraryObject
// is destroyed.
// It has helper operator= methods for storing heap-allocated T's or Reference<T>'s in into it via
// x = thing;
// Examples:
// ArbitraryObject x;
// x.set(new Widget()); // x owns the new object
// x.set(Reference<SomeClass>(new SomeClass()); // x holds a reference now too
// x.setReference(new SomeReferenceCountedType()); //
struct ArbitraryObject {
	ArbitraryObject() : ptr(nullptr), onDestruct(nullptr) {}
	ArbitraryObject(const ArbitraryObject&) = delete;

	// BUGFIX: The implicitly-declared copy assignment operator is still generated even
	// though the copy constructor is deleted (the templated operator= overloads below do
	// not suppress it). It would shallow-copy ptr/onDestruct and cause the held object
	// to be destructed twice, so delete it explicitly.
	ArbitraryObject& operator=(const ArbitraryObject&) = delete;

	~ArbitraryObject() { destructOnly(); }

	// True if an object is currently attached.
	bool valid() const { return ptr != nullptr; }

	// Take ownership of a heap-allocated T; it is deleted on reset() or destruction.
	template <typename T>
	void operator=(T* p) {
		destructOnly();
		ptr = p;
		onDestruct = [](void* ptr) { delete (T*)ptr; };
	}

	// Hold an additional reference to r's target; released on reset() or destruction.
	template <typename T>
	void operator=(Reference<T>& r) {
		destructOnly();
		ptr = r.getPtr();
		r.getPtr()->addref();
		onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
	}

	// Steal the reference from a temporary r; released on reset() or destruction.
	template <typename T>
	void operator=(Reference<T>&& r) {
		destructOnly();
		ptr = r.extractPtr();
		onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
	}

	// Unchecked access to the stored pointer as a T*.
	template <typename T>
	T* getPtr() {
		return (T*)ptr;
	}

	// Get an additional Reference to the stored object (caller shares ownership).
	template <typename T>
	Reference<T> getReference() {
		return Reference<T>::addRef((T*)ptr);
	}

	// Destroy/release the held object (if any) and return to the empty state.
	void reset() {
		destructOnly();
		ptr = nullptr;
		onDestruct = nullptr;
	}

	// ptr can be set to any arbitrary thing. If it is not null at destruct time then
	// onDestruct(ptr) will be called if onDestruct is not null.
	void* ptr = nullptr;
	void (*onDestruct)(void*) = nullptr;

private:
	// Call onDestruct(ptr) if needed but don't reset any state
	void destructOnly() {
		if (ptr != nullptr && onDestruct != nullptr) {
			onDestruct(ptr);
		}
	}
};
// ArenaPage represents a data page meant to be stored on disk, located in a block of
// 4k-aligned memory held by an Arena
//
// Page Format:
// PageHeader - describes main header version, encoding type, and offsets of subheaders and payload.
// MainHeader - structure based on header version. It is responsible for protecting all bytes
// of PageHeader, MainHeader, and EncodingHeader with some sort of checksum.
// EncodingHeader - structure based on encoding type. It is responsible for protecting and
// possibly encrypting all payload bytes.
// Payload - User accessible bytes, protected and possibly encrypted based on the encoding
//
// preWrite() must be called before writing a page to disk to update checksums and encrypt as needed
// After reading a page from disk,
// postReadHeader() must be called to verify the version, main, and encoding headers
// postReadPayload() must be called, after potentially setting encryption secret, to verify and possibly
// decrypt the payload
class ArenaPage : public ReferenceCounted<ArenaPage>, public FastAllocated<ArenaPage> {
public:
// The page's logical size includes an opaque checksum, use size() to get usable size
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) {
// This is the header version that new page init() calls will use.
// It is not necessarily the latest header version, as read/modify support for
// a new header version may be added prior to using that version as the default
// for new pages as part of downgrade support.
static constexpr uint8_t HEADER_WRITE_VERSION = 1;
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), pPayload(nullptr) {
if (bufferSize > 0) {
buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize);
// Mark any unused page portion defined
VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
// Zero unused region
memset(buffer + logicalSize, 0, bufferSize - logicalSize);
} else {
buffer = nullptr;
}
};
~ArenaPage() {
if (userData != nullptr && userDataDestructor != nullptr) {
userDataDestructor(userData);
~ArenaPage() {}
// Before using these, either init() or postReadHeader and postReadPayload() must be called
const uint8_t* data() const { return pPayload; }
uint8_t* mutateData() const { return (uint8_t*)pPayload; }
int dataSize() const { return payloadSize; }
StringRef dataAsStringRef() const { return StringRef((uint8_t*)pPayload, payloadSize); }
const uint8_t* rawData() const { return buffer; }
uint8_t* rawData() { return buffer; }
int rawSize() const { return bufferSize; }
#pragma pack(push, 1)
// The next few structs describe the byte-packed physical structure. The fields of Page
// cannot change, but new header versions and encoding types can be added and existing
// header versions and encoding type headers could change size as offset information
// is stored to enable efficient jumping to the encoding header or payload.
// Page members are only initialized in init()
struct PageHeader {
// Layout version of the headers; selects which MainHeader struct follows.
uint8_t headerVersion;
EncodingType encodingType;

// Encoding header comes after main header
uint8_t encodingHeaderOffset;

// Payload comes after encoding header
uint8_t payloadOffset;

// Get main header pointer, casting to its type. The main header is located
// immediately after this struct.
template <typename T>
T* getMainHeader() const {
return (T*)(this + 1);
}

// Get encoding header pointer, casting to its type
template <typename T>
T* getEncodingHeader() const {
return (T*)((uint8_t*)this + encodingHeaderOffset);
}

// Get payload pointer
uint8_t* getPayload() const { return (uint8_t*)this + payloadOffset; }
};
// Redwood header version 1
// Protects all headers with a 64-bit XXHash checksum
// Most other fields are forensic in nature and are not required to be set for correct
// behavior but they can facilitate forensic investigation of data on disk. Some of them
// could be used for sanity checks at runtime.
struct RedwoodHeaderV1 {
PageType pageType;

// The meaning of pageSubType is based on pageType
// For Queue pages, pageSubType is the QueueID
// For BTree nodes, pageSubType is Height (also stored in BTreeNode)
uint8_t pageSubType;

// Format identifier, normally specific to the page Type and SubType
uint8_t pageFormat;

// XXHash64 checksum over the header bytes passed to updateChecksum()/verifyChecksum().
XXH64_hash_t checksum;

// Physical page ID of first block on disk of the ArenaPage
PhysicalPageID firstPhysicalPageID;

// The first logical page ID the ArenaPage was referenced by when last written
LogicalPageID lastKnownLogicalPageID;

// The first logical page ID of the parent of this ArenaPage when last written
LogicalPageID lastKnownParentLogicalPageID;

// Time and write version as of the last update to this page.
// Note that for relocated pages, writeVersion should not be updated.
double writeTime;
Version writeVersion;

// Recompute and store the checksum over headerBytes (which must include this struct;
// the checksum field is zeroed before hashing so it does not affect its own value).
void updateChecksum(uint8_t* headerBytes, int len) {
// Checksum is within the checksum input so clear it first
checksum = 0;
checksum = XXH3_64bits(headerBytes, len);
}

// Verify headerBytes against the stored checksum; the stored value is saved and
// restored around hashing. Throws page_header_checksum_failed() on mismatch.
void verifyChecksum(uint8_t* headerBytes, int len) {
// Checksum is within the checksum input so save it and restore it afterwards
XXH64_hash_t saved = checksum;
checksum = 0;
XXH64_hash_t calculated = XXH3_64bits(headerBytes, len);
checksum = saved;
if (saved != calculated) {
throw page_header_checksum_failed();
}
}
};
// An encoding that validates the payload with an XXHash checksum
struct XXHashEncodingHeader {
// XXHash64 checksum of the payload, seeded with the page's physical page ID.
XXH64_hash_t checksum;

// Compute and store the checksum of payload using the physical page ID as the seed.
void encode(uint8_t* payload, int len, PhysicalPageID seed) {
checksum = XXH3_64bits_withSeed(payload, len, seed);
}

// Verify payload against the stored checksum using the same seed;
// throws page_decoding_failed() on mismatch.
void decode(uint8_t* payload, int len, PhysicalPageID seed) {
if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
throw page_decoding_failed();
}
}
};
// A dummy "encrypting" encoding which uses XOR with a 1 byte secret key on
// the payload to obfuscate it and protects the payload with an XXHash checksum.
struct XOREncryptionEncodingHeader {
// Checksum is on unencrypted payload
XXH64_hash_t checksum;
// ID of the 1-byte XOR secret used to obfuscate the payload.
uint8_t keyID;

// Checksum the plaintext payload, then obfuscate it by XORing each byte with secret.
void encode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
checksum = XXH3_64bits_withSeed(payload, len, seed);
for (int i = 0; i < len; ++i) {
payload[i] ^= secret;
}
}

// De-obfuscate the payload with secret, then verify the plaintext against the stored
// checksum; throws page_decoding_failed() on mismatch.
void decode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
for (int i = 0; i < len; ++i) {
payload[i] ^= secret;
}
if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
throw page_decoding_failed();
}
}
};
#pragma pack(pop)
// Get the size of the encoding header based on type
// Note that this is only to be used in operations involving new pages to calculate the payload offset. For
// existing pages, the payload offset is stored in the page.
static int encodingHeaderSize(EncodingType t) {
	// Dispatch on the encoding type; any unrecognized type is an error.
	switch (t) {
	case EncodingType::XXHash64:
		return sizeof(XXHashEncodingHeader);
	case EncodingType::XOREncryption:
		return sizeof(XOREncryptionEncodingHeader);
	default:
		throw page_encoding_not_supported();
	}
}
uint8_t const* begin() const { return (uint8_t*)buffer; }
// Get the usable (payload) size for a new page of pageSize using HEADER_WRITE_VERSION
// with encoding type t: the page size minus the fixed PageHeader, the v1 main header,
// and the encoding-specific header.
static int getUsableSize(int pageSize, EncodingType t) {
return pageSize - sizeof(PageHeader) - sizeof(RedwoodHeaderV1) - encodingHeaderSize(t);
}
uint8_t* mutate() { return (uint8_t*)buffer; }
// Initialize the header for a new page so that the payload can be written to
// Pre: Buffer is allocated and logical size is set
// Post: Page header is initialized and space is reserved for subheaders for
// HEADER_WRITE_VERSION main header and the given encoding type.
// Payload can be written to with mutateData() and dataSize()
void init(EncodingType t, PageType pageType, uint8_t pageSubType, uint8_t pageFormat = 0) {
// Carefully cast away constness to modify page header
PageHeader* p = const_cast<PageHeader*>(page);
p->headerVersion = HEADER_WRITE_VERSION;
p->encodingHeaderOffset = sizeof(PageHeader) + sizeof(RedwoodHeaderV1);
p->encodingType = t;
p->payloadOffset = page->encodingHeaderOffset + encodingHeaderSize(t);
typedef XXH64_hash_t Checksum;
pPayload = page->getPayload();
payloadSize = logicalSize - (pPayload - buffer);
// Usable size, without checksum
int size() const { return logicalSize - sizeof(Checksum); }
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->pageType = pageType;
h->pageSubType = pageSubType;
h->pageFormat = pageFormat;
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(begin(), size()), arena); }
// Write dummy values for these in new pages. They should be updated when possible before calling preWrite()
// when modifying existing pages
h->lastKnownLogicalPageID = invalidLogicalPageID;
h->lastKnownParentLogicalPageID = invalidLogicalPageID;
h->writeVersion = invalidVersion;
}
// Get an ArenaPage which is a copy of this page, in its own Arena
Reference<ArenaPage> cloneContents() const {
// Get the logical page buffer as a StringRef
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(buffer, logicalSize)); }
// Get a new ArenaPage that contains a copy of this page's data.
// extra is not copied to the returned page
Reference<ArenaPage> clone() const {
ArenaPage* p = new ArenaPage(logicalSize, bufferSize);
memcpy(p->buffer, buffer, logicalSize);
// Non-verifying header parse just to initialize members
p->postReadHeader(invalidPhysicalPageID, false);
p->encryptionKey = encryptionKey;
return Reference<ArenaPage>(p);
}
// Get an ArenaPage which depends on this page's Arena and references some of its memory
Reference<ArenaPage> subPage(int offset, int len) const {
Reference<ArenaPage> getSubPage(int offset, int len) const {
ASSERT(offset + len <= logicalSize);
ArenaPage* p = new ArenaPage(len, 0);
p->buffer = buffer + offset;
p->arena.dependsOn(arena);
// Non-verifying header parse just to initialize component pointers
p->postReadHeader(invalidPhysicalPageID, false);
p->encryptionKey = encryptionKey;
return Reference<ArenaPage>(p);
}
// Given a vector of pages with the same ->size(), create a new ArenaPage with a ->size() that is
// equivalent to all of the input pages and has all of their contents copied into it.
static Reference<ArenaPage> concatPages(const std::vector<Reference<const ArenaPage>>& pages) {
int usableSize = pages.front()->size();
int totalUsableSize = pages.size() * usableSize;
int totalBufferSize = pages.front()->bufferSize * pages.size();
ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize);
uint8_t* wptr = superpage->mutate();
for (auto& p : pages) {
ASSERT(p->size() == usableSize);
memcpy(wptr, p->begin(), usableSize);
wptr += usableSize;
// The next two functions set mostly forensic info that may help in an investigation to identify data on disk. The
// exception is pageID which must be set to the physical page ID on disk where the page is written or post-read
// verification will fail.
void setWriteInfo(PhysicalPageID pageID, Version writeVersion) {
if (page->headerVersion == 1) {
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->firstPhysicalPageID = pageID;
h->writeVersion = writeVersion;
h->writeTime = now();
}
return Reference<ArenaPage>(superpage);
}
Checksum& getChecksum() { return *(Checksum*)(buffer + size()); }
// These should be updated before writing a BTree page. Note that the logical ID that refers to a page can change
// after the page is written, if its parent is updated to point directly to its physical page ID. Therefore, the
// last known logical page ID should always be updated before writing an updated version of a BTree page.
void setLogicalPageInfo(LogicalPageID lastKnownLogicalPageID, LogicalPageID lastKnownParentLogicalPageID) {
if (page->headerVersion == 1) {
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->lastKnownLogicalPageID = lastKnownLogicalPageID;
h->lastKnownParentLogicalPageID = lastKnownParentLogicalPageID;
}
}
Checksum calculateChecksum(LogicalPageID pageID) { return XXH3_64bits_withSeed(buffer, size(), pageID); }
// Must be called before writing to disk to update headers and encrypt page
// Pre: Encoding-specific header fields are set if needed
// Secret is set if needed
// Post: Main and Encoding subheaders are updated
// Payload is possibly encrypted
void preWrite(PhysicalPageID pageID) const {
// Explicitly check payload definedness to make the source of valgrind errors more clear.
// Without this check, calculating a checksum on a payload with undefined bytes does not
// cause a valgrind error but the resulting checksum is undefined which causes errors later.
ASSERT(VALGRIND_CHECK_MEM_IS_DEFINED(pPayload, payloadSize) == 0);
void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); }
if (page->encodingType == EncodingType::XXHash64) {
page->getEncodingHeader<XXHashEncodingHeader>()->encode(pPayload, payloadSize, pageID);
} else if (page->encodingType == EncodingType::XOREncryption) {
ASSERT(encryptionKey.secret.size() == 1);
XOREncryptionEncodingHeader* xh = page->getEncodingHeader<XOREncryptionEncodingHeader>();
xh->keyID = encryptionKey.id.orDefault(0);
xh->encode(encryptionKey.secret[0], pPayload, payloadSize, pageID);
} else {
throw page_encoding_not_supported();
}
bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); }
if (page->headerVersion == 1) {
page->getMainHeader<RedwoodHeaderV1>()->updateChecksum(buffer, pPayload - buffer);
} else {
throw page_header_version_not_supported();
}
}
// Must be called after reading from disk to verify all non-payload bytes
// Pre: Bytes from storage medium copied into raw buffer space
// Post: Page headers outside of payload are verified (unless verify is false)
// encryptionKey is updated with information from encoding header if needed
// Payload is accessible via data(), dataSize(), etc.
//
// Exceptions are thrown for unknown header types or pages which fail verification
void postReadHeader(PhysicalPageID pageID, bool verify = true) {
pPayload = page->getPayload();
payloadSize = logicalSize - (pPayload - buffer);
// Populate encryption key with relevant fields from page
if (page->encodingType == EncodingType::XOREncryption) {
encryptionKey.id = page->getEncodingHeader<XOREncryptionEncodingHeader>()->keyID;
}
if (page->headerVersion == 1) {
if (verify) {
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->verifyChecksum(buffer, pPayload - buffer);
if (pageID != h->firstPhysicalPageID) {
throw page_header_wrong_page_id();
}
}
} else {
throw page_header_version_not_supported();
}
}
// Pre: postReadHeader has been called, encoding-specific parameters (such as the encryption secret) have been set
// Post: Payload has been verified and decrypted if necessary
void postReadPayload(PhysicalPageID pageID) {
if (page->encodingType == EncodingType::XXHash64) {
page->getEncodingHeader<XXHashEncodingHeader>()->decode(pPayload, payloadSize, pageID);
} else if (page->encodingType == EncodingType::XOREncryption) {
ASSERT(encryptionKey.secret.size() == 1);
page->getEncodingHeader<XOREncryptionEncodingHeader>()->decode(
encryptionKey.secret[0], pPayload, payloadSize, pageID);
} else {
throw page_encoding_not_supported();
}
}
const Arena& getArena() const { return arena; }
static bool isEncodingTypeEncrypted(EncodingType t) { return t == EncodingType::XOREncryption; }
// Returns true if the page's encoding type employs encryption
bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); }
private:
Arena arena;
// The logical size of the page, which can be smaller than bufferSize, which is only of
// practical purpose in simulation to use arbitrarily small page sizes to test edge cases
// with shorter execution time
int logicalSize;
// The 4k-aligned physical size of allocated memory for the page which also represents the
// block size to be written to disk
int bufferSize;
uint8_t* buffer;
// buffer is a pointer to the page's memory
// For convenience, it is unioned with a Page pointer which defines the page structure
union {
uint8_t* buffer;
const PageHeader* page;
};
// Pointer and length of page space available to the user
// These are accessed very often so they are stored directly
uint8_t* pPayload;
int payloadSize;
public:
mutable void* userData;
mutable void (*userDataDestructor)(void*);
EncodingType getEncodingType() const { return page->encodingType; }
PhysicalPageID getPhysicalPageID() const {
if (page->headerVersion == 1) {
return page->getMainHeader<RedwoodHeaderV1>()->firstPhysicalPageID;
} else {
throw page_header_version_not_supported();
}
}
// Used by encodings that do encryption
EncryptionKey encryptionKey;
mutable ArbitraryObject extra;
};
class IPagerSnapshot {
@ -184,18 +605,21 @@ public:
virtual void addref() = 0;
virtual void delref() = 0;
ArbitraryObject extra;
};
// This API is probably too customized to the behavior of DWALPager and probably needs some changes to be more generic.
class IPager2 : public IClosable {
public:
virtual std::string getName() const = 0;
// Returns an ArenaPage that can be passed to writePage. The data in the returned ArenaPage might not be zeroed.
virtual Reference<ArenaPage> newPageBuffer(size_t size = 1) = 0;
virtual Reference<ArenaPage> newPageBuffer(size_t blocks = 1) = 0;
// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
// For a given pager instance, separate calls to this function must return the same value.
// Only valid to call after recovery is complete.
virtual int getUsablePageSize() const = 0;
virtual int getPhysicalPageSize() const = 0;
virtual int getLogicalPageSize() const = 0;
virtual int getPagesPerExtent() const = 0;
@ -251,7 +675,7 @@ public:
bool noHit) = 0;
virtual Future<Reference<ArenaPage>> readMultiPage(PagerEventReasons reason,
unsigned int level,
Standalone<VectorRef<PhysicalPageID>> pageIDs,
VectorRef<PhysicalPageID> pageIDs,
int priority,
bool cacheable,
bool noHit) = 0;
@ -271,16 +695,13 @@ public:
// The snapshot shall be usable until setOldVersion() is called with a version > v.
virtual Reference<IPagerSnapshot> getReadSnapshot(Version v) = 0;
// Atomically make durable all pending page writes, page frees, and update the metadata string,
// setting the committed version to v
// v must be >= the highest versioned page write.
virtual Future<Void> commit(Version v) = 0;
// Atomically make durable all pending page writes, page frees, and update the user commit
// record at version v
// v must be higher than the highest committed version
virtual Future<Void> commit(Version v, Value commitRecord) = 0;
// Get the latest meta key set or committed
virtual Key getMetaKey() const = 0;
// Set the metakey which will be stored in the next commit
virtual void setMetaKey(KeyRef metaKey) = 0;
// Get the latest committed user commit record
virtual Value getCommitRecord() const = 0;
virtual StorageBytes getStorageBytes() const = 0;
@ -318,4 +739,52 @@ protected:
~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface
};
// A key provider that never provides keys: any request for key information throws
// encryption_key_not_found(). Exists to simplify page decoding paths that are known
// not to require encryption.
class NullKeyProvider : public IEncryptionKeyProvider {
public:
	virtual ~NullKeyProvider() {}
	Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
		throw encryption_key_not_found();
	}
	Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
		throw encryption_key_not_found();
	}
};
// Key provider for dummy XOR encryption scheme
class XOREncryptionKeyProvider : public IEncryptionKeyProvider {
public:
// Derives a single deterministic XOR byte from the file name. Simulation-only.
XOREncryptionKeyProvider(std::string filename) {
ASSERT(g_network->isSimulated());

// Choose a deterministic random filename (without path) byte for secret generation
// Remove any leading directory names
// NOTE(review): erase(0, lastSlash) keeps the trailing path separator at the front of
// filename — confirm whether lastSlash + 1 was intended by "without path".
size_t lastSlash = filename.find_last_of("\\/");
if (lastSlash != filename.npos) {
filename.erase(0, lastSlash);
}
xorWith = filename.empty() ? 0x5e
: (uint8_t)filename[XXH3_64bits(filename.data(), filename.size()) % filename.size()];
}

virtual ~XOREncryptionKeyProvider() {}

// Returns a key whose 1-byte secret is derived from the key's ID combined with
// xorWith. Throws encryption_key_not_found() if the key has no ID.
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
if (!key.id.present()) {
throw encryption_key_not_found();
}
EncryptionKey s = key;
uint8_t secret = ~(uint8_t)key.id.get() ^ xorWith;
s.secret = StringRef(s.arena(), &secret, 1);
return s;
}

// Chooses a key ID from the last byte of the range's end key (0 for an empty end key)
// and returns the corresponding secret.
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
EncryptionKeyRef k;
k.id = end.empty() ? 0 : *(end.end() - 1);
return getSecrets(k);
}

// The single obfuscation byte derived from the filename.
uint8_t xorWith;
};
#endif

View File

@ -790,6 +790,7 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
{ "EstPendCompactBytes", rocksdb::DB::Properties::kEstimatePendingCompactionBytes },
{ "BlockCacheUsage", rocksdb::DB::Properties::kBlockCacheUsage },
{ "BlockCachePinnedUsage", rocksdb::DB::Properties::kBlockCachePinnedUsage },
{ "LiveSstFilesSize", rocksdb::DB::Properties::kLiveSstFilesSize },
};
state std::unordered_map<std::string, uint64_t> readIteratorPoolStats = {
@ -811,7 +812,8 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
for (auto& p : propertyStats) {
auto& [name, property] = p;
stat = 0;
ASSERT(db->GetIntProperty(property, &stat));
// GetAggregatedIntProperty gets the aggregated int property from all column families.
ASSERT(db->GetAggregatedIntProperty(property, &stat));
e.detail(name, stat);
}
@ -1933,7 +1935,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
StorageBytes getStorageBytes() const override {
uint64_t live = 0;
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
int64_t free;
int64_t total;

View File

@ -157,19 +157,21 @@ struct UpdateRecoveryDataRequest {
Version lastEpochEnd;
std::vector<CommitProxyInterface> commitProxies;
std::vector<ResolverInterface> resolvers;
Optional<int64_t> versionEpoch;
ReplyPromise<Void> reply;
UpdateRecoveryDataRequest() = default;
UpdateRecoveryDataRequest(Version recoveryTransactionVersion,
Version lastEpochEnd,
const std::vector<CommitProxyInterface>& commitProxies,
const std::vector<ResolverInterface>& resolvers)
const std::vector<ResolverInterface>& resolvers,
Optional<int64_t> versionEpoch)
: recoveryTransactionVersion(recoveryTransactionVersion), lastEpochEnd(lastEpochEnd),
commitProxies(commitProxies), resolvers(resolvers) {}
commitProxies(commitProxies), resolvers(resolvers), versionEpoch(versionEpoch) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, resolvers, reply);
serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, resolvers, versionEpoch, reply);
}
};

View File

@ -277,9 +277,8 @@ ACTOR Future<std::vector<StorageServerInterface>> getStorageServers(Database cx,
}
}
ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
bool localOnly) {
ACTOR Future<std::pair<std::vector<WorkerInterface>, int>>
getStorageWorkers(Database cx, Reference<AsyncVar<ServerDBInfo> const> dbInfo, bool localOnly) {
state std::vector<StorageServerInterface> servers = wait(getStorageServers(cx));
state std::map<NetworkAddress, WorkerInterface> workersMap;
std::vector<WorkerDetails> workers = wait(getWorkers(dbInfo));
@ -299,7 +298,9 @@ ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
}
auto masterDcId = dbInfo->get().master.locality.dcId();
std::vector<WorkerInterface> result;
std::pair<std::vector<WorkerInterface>, int> result;
auto& [workerInterfaces, failures] = result;
failures = 0;
for (const auto& server : servers) {
TraceEvent(SevDebug, "DcIdInfo")
.detail("ServerLocalityID", server.locality.dcId())
@ -310,9 +311,10 @@ ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
TraceEvent(SevWarn, "GetStorageWorkers")
.detail("Reason", "Could not find worker for storage server")
.detail("SS", server.id());
throw operation_failed();
++failures;
} else {
workerInterfaces.push_back(itr->second);
}
result.push_back(itr->second);
}
}
return result;
@ -598,6 +600,31 @@ ACTOR Future<bool> getStorageServersRecruiting(Database cx, WorkerInterface dist
}
}
// Gets the difference between the expected version (based on the version
// epoch) and the actual version.
ACTOR Future<int64_t> getVersionOffset(Database cx,
WorkerInterface distributorWorker,
Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
loop {
state Transaction tr(cx);
try {
TraceEvent("GetVersionOffset").detail("Stage", "ReadingVersionEpoch");
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state Version rv = wait(tr.getReadVersion());
Optional<Standalone<StringRef>> versionEpochValue = wait(tr.get(versionEpochKey));
if (!versionEpochValue.present()) {
return 0;
}
int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
int64_t versionOffset = abs(rv - (g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - versionEpoch));
return versionOffset;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> repairDeadDatacenter(Database cx,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
std::string context) {
@ -652,7 +679,8 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
int64_t maxTLogQueueGate = 5e6,
int64_t maxStorageServerQueueGate = 5e6,
int64_t maxDataDistributionQueueSize = 0,
int64_t maxPoppedVersionLag = 30e6) {
int64_t maxPoppedVersionLag = 30e6,
int64_t maxVersionOffset = 1e6) {
state Future<Void> reconfig =
reconfigureAfter(cx, 100 + (deterministicRandom()->random01() * 100), dbInfo, "QuietDatabase");
state Future<int64_t> dataInFlight;
@ -662,6 +690,7 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
state Future<int64_t> storageQueueSize;
state Future<bool> dataDistributionActive;
state Future<bool> storageServersRecruiting;
state Future<int64_t> versionOffset;
auto traceMessage = "QuietDatabase" + phase + "Begin";
TraceEvent(traceMessage.c_str()).log();
@ -698,10 +727,11 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
storageQueueSize = getMaxStorageServerQueueSize(cx, dbInfo);
dataDistributionActive = getDataDistributionActive(cx, distributorWorker);
storageServersRecruiting = getStorageServersRecruiting(cx, distributorWorker, distributorUID);
versionOffset = getVersionOffset(cx, distributorWorker, dbInfo);
wait(success(dataInFlight) && success(tLogQueueInfo) && success(dataDistributionQueueSize) &&
success(teamCollectionValid) && success(storageQueueSize) && success(dataDistributionActive) &&
success(storageServersRecruiting));
success(storageServersRecruiting) && success(versionOffset));
TraceEvent(("QuietDatabase" + phase).c_str())
.detail("DataInFlight", dataInFlight.get())
@ -717,13 +747,17 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
.detail("MaxStorageServerQueueGate", maxStorageServerQueueGate)
.detail("DataDistributionActive", dataDistributionActive.get())
.detail("StorageServersRecruiting", storageServersRecruiting.get())
.detail("RecoveryCount", dbInfo->get().recoveryCount)
.detail("VersionOffset", versionOffset.get())
.detail("NumSuccesses", numSuccesses);
maxVersionOffset += dbInfo->get().recoveryCount * SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
if (dataInFlight.get() > dataInFlightGate || tLogQueueInfo.get().first > maxTLogQueueGate ||
tLogQueueInfo.get().second > maxPoppedVersionLag ||
dataDistributionQueueSize.get() > maxDataDistributionQueueSize ||
storageQueueSize.get() > maxStorageServerQueueGate || !dataDistributionActive.get() ||
storageServersRecruiting.get() || !teamCollectionValid.get()) {
storageServersRecruiting.get() || versionOffset.get() > maxVersionOffset ||
!teamCollectionValid.get()) {
wait(delay(1.0));
numSuccesses = 0;
@ -779,6 +813,10 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
auto key = "NotReady" + std::to_string(notReadyCount++);
evt.detail(key.c_str(), "storageServersRecruiting");
}
if (versionOffset.isReady() && versionOffset.isError()) {
auto key = "NotReady" + std::to_string(notReadyCount++);
evt.detail(key.c_str(), "versionOffset");
}
wait(delay(1.0));
numSuccesses = 0;
}
@ -794,7 +832,8 @@ Future<Void> quietDatabase(Database const& cx,
int64_t maxTLogQueueGate,
int64_t maxStorageServerQueueGate,
int64_t maxDataDistributionQueueSize,
int64_t maxPoppedVersionLag) {
int64_t maxPoppedVersionLag,
int64_t maxVersionOffset) {
return waitForQuietDatabase(cx,
dbInfo,
phase,
@ -802,5 +841,6 @@ Future<Void> quietDatabase(Database const& cx,
maxTLogQueueGate,
maxStorageServerQueueGate,
maxDataDistributionQueueSize,
maxPoppedVersionLag);
maxPoppedVersionLag,
maxVersionOffset);
}

View File

@ -46,9 +46,11 @@ Future<WorkerInterface> getMasterWorker(Database const& cx, Reference<AsyncVar<S
Future<Void> repairDeadDatacenter(Database const& cx,
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
std::string const& context);
Future<std::vector<WorkerInterface>> getStorageWorkers(Database const& cx,
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
bool const& localOnly);
// Returns list of worker interfaces for available storage servers and the number of unavailable
// storage servers
Future<std::pair<std::vector<WorkerInterface>, int>>
getStorageWorkers(Database const& cx, Reference<AsyncVar<ServerDBInfo> const> const& dbInfo, bool const& localOnly);
Future<std::vector<WorkerInterface>> getCoordWorkers(Database const& cx,
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo);

File diff suppressed because it is too large Load Diff

View File

@ -767,11 +767,13 @@ struct InitializeStorageRequest {
Optional<std::pair<UID, Version>>
tssPairIDAndVersion; // Only set if recruiting a tss. Will be the UID and Version of its SS pair.
UID clusterId; // Unique cluster identifier. Only needed at recruitment, will be read from txnStateStore on recovery
Version initialClusterVersion;
ReplyPromise<InitializeStorageReply> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion, clusterId);
serializer(
ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion, clusterId, initialClusterVersion);
}
};
@ -1086,6 +1088,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Tag seedTag,
UID clusterId,
Version startVersion,
Version tssSeedVersion,
ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo> const> db,
@ -1142,6 +1145,10 @@ ACTOR Future<Void> backupWorker(BackupInterface bi,
void registerThreadForProfiling();
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's primary
// satellite DC.
bool addressInDbAndPrimarySatelliteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo);
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's remote DC.
bool addressInDbAndRemoteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo);

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include <algorithm>
#include <iterator>
#include "fdbrpc/sim_validation.h"
@ -47,6 +48,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
Version version; // The last version assigned to a proxy by getVersion()
double lastVersionTime;
Optional<Version> referenceVersion;
std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;
@ -125,12 +127,36 @@ ACTOR Future<Void> getVersion(Reference<MasterData> self, GetCommitVersionReques
if (BUGGIFY) {
t1 = self->lastVersionTime;
}
rep.prevVersion = self->version;
self->version +=
// Versions should roughly follow wall-clock time, based on the
// system clock of the current machine and an FDB-specific epoch.
// Calculate the expected version and determine whether we need to
// hand out versions faster or slower to stay in sync with the
// clock.
Version toAdd =
std::max<Version>(1,
std::min<Version>(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS,
SERVER_KNOBS->VERSIONS_PER_SECOND * (t1 - self->lastVersionTime)));
rep.prevVersion = self->version;
if (self->referenceVersion.present()) {
Version expected =
g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->referenceVersion.get();
// Attempt to jump directly to the expected version. But make
// sure that versions are still being handed out at a rate
// around VERSIONS_PER_SECOND. This rate is scaled depending on
// how far off the calculated version is from the expected
// version.
int64_t maxOffset = std::min(static_cast<int64_t>(toAdd * SERVER_KNOBS->MAX_VERSION_RATE_MODIFIER),
SERVER_KNOBS->MAX_VERSION_RATE_OFFSET);
self->version =
std::clamp(expected, self->version + toAdd - maxOffset, self->version + toAdd + maxOffset);
ASSERT_GT(self->version, rep.prevVersion);
} else {
self->version = self->version + toAdd;
}
TEST(self->version - rep.prevVersion == 1); // Minimum possible version gap
bool maxVersionGap = self->version - rep.prevVersion == SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS;
@ -214,7 +240,8 @@ ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
TraceEvent("UpdateRecoveryData", self->dbgid)
.detail("RecoveryTxnVersion", req.recoveryTransactionVersion)
.detail("LastEpochEnd", req.lastEpochEnd)
.detail("NumCommitProxies", req.commitProxies.size());
.detail("NumCommitProxies", req.commitProxies.size())
.detail("VersionEpoch", req.versionEpoch);
if (self->recoveryTransactionVersion == invalidVersion ||
req.recoveryTransactionVersion > self->recoveryTransactionVersion) {
@ -230,6 +257,16 @@ ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
self->lastCommitProxyVersionReplies[p.id()] = CommitProxyVersionReplies();
}
}
if (req.versionEpoch.present()) {
self->referenceVersion = req.versionEpoch.get();
} else if (BUGGIFY) {
// Cannot use a positive version epoch in simulation because of the
// clock starting at 0. A positive version epoch would mean the initial
// cluster version was negative.
// TODO: Increase the size of this interval after fixing the issue
// with restoring ranges with large version gaps.
self->referenceVersion = deterministicRandom()->randomInt64(-1e6, 0);
}
self->resolutionBalancer.setCommitProxies(req.commitProxies);
self->resolutionBalancer.setResolvers(req.resolvers);

View File

@ -440,6 +440,8 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
Version metadataCreateVersion = invalidVersion;
bool removing = false;
bool destroyed = false;
bool possiblyDestroyed = false;
KeyRangeMap<std::unordered_map<UID, Promise<Void>>> moveTriggers;
@ -472,6 +474,13 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
}
// TODO: may be more cleanup possible here
}
void destroy(Version destroyVersion) {
removing = true;
destroyed = true;
moved(range);
newMutations.trigger();
}
};
class ServerWatchMetadata : public ReferenceCounted<ServerWatchMetadata> {
@ -795,6 +804,9 @@ public:
Reference<ILogSystem::IPeekCursor> logCursor;
Promise<UID> clusterId;
// The version the cluster starts on. This value is not persisted and may
// not be valid after a recovery.
Version initialClusterVersion = invalidVersion;
UID thisServerID;
Optional<UID> tssPairID; // if this server is a tss, this is the id of its (ss) pair
Optional<UID> ssPairID; // if this server is an ss, this is the id of its (tss) pair
@ -1908,6 +1920,12 @@ ACTOR Future<Void> overlappingChangeFeedsQ(StorageServer* data, OverlappingChang
for (auto& it : rangeIds) {
reply.rangeIds.push_back(OverlappingChangeFeedEntry(
it.first, std::get<0>(it.second), std::get<1>(it.second), std::get<2>(it.second)));
TraceEvent(SevDebug, "OverlappingChangeFeedEntry", data->thisServerID)
.detail("MinVersion", req.minVersion)
.detail("FeedID", it.first)
.detail("Range", std::get<0>(it.second))
.detail("EmptyVersion", std::get<1>(it.second))
.detail("StopVersion", std::get<2>(it.second));
}
// Make sure all of the metadata we are sending won't get rolled back
@ -4699,6 +4717,9 @@ ACTOR Future<Void> tryGetRange(PromiseStream<RangeResult> results, Transaction*
}
}
// global validation that missing refreshed feeds were previously destroyed
static std::unordered_set<Key> allDestroyedChangeFeeds;
// We have to store the version the change feed was stopped at in the SS instead of just the stopped status
// In addition to simplifying stopping logic, it enables communicating stopped status when fetching change feeds
// from other SS correctly
@ -4739,33 +4760,35 @@ ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req)
.detail("RangeID", req.rangeID.printable())
.detail("Version", req.version)
.detail("SSVersion", self->version.get())
.detail("Range", req.range.toString());
.detail("Range", req.range);
if (req.version - 1 > feed->second->emptyVersion) {
feed->second->emptyVersion = req.version - 1;
while (!feed->second->mutations.empty() && feed->second->mutations.front().version < req.version) {
feed->second->mutations.pop_front();
}
Version durableVersion = self->data().getLatestVersion();
auto& mLV = self->addVersionToMutationLog(durableVersion);
self->addMutationToMutationLog(
mLV,
MutationRef(
MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + feed->second->id.toString(),
changeFeedSSValue(feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion)));
if (feed->second->storageVersion != invalidVersion) {
++self->counters.kvSystemClearRanges;
self->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feed->second->id, 0),
changeFeedDurableKey(feed->second->id, req.version)));
if (req.version > feed->second->storageVersion) {
feed->second->storageVersion = invalidVersion;
feed->second->durableVersion = invalidVersion;
if (!feed->second->destroyed) {
Version durableVersion = self->data().getLatestVersion();
auto& mLV = self->addVersionToMutationLog(durableVersion);
self->addMutationToMutationLog(
mLV,
MutationRef(
MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + feed->second->id.toString(),
changeFeedSSValue(feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion)));
if (feed->second->storageVersion != invalidVersion) {
++self->counters.kvSystemClearRanges;
self->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feed->second->id, 0),
changeFeedDurableKey(feed->second->id, req.version)));
if (req.version > feed->second->storageVersion) {
feed->second->storageVersion = invalidVersion;
feed->second->durableVersion = invalidVersion;
}
}
wait(self->durableVersion.whenAtLeast(durableVersion));
}
wait(self->durableVersion.whenAtLeast(durableVersion));
}
req.reply.send(Void());
return Void();
@ -4944,7 +4967,9 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
.errorUnsuppressed(e)
.detail("RangeID", rangeId.printable())
.detail("Range", range.toString())
.detail("EndVersion", endVersion);
.detail("EndVersion", endVersion)
.detail("Removing", changeFeedInfo->removing)
.detail("Destroyed", changeFeedInfo->destroyed);
throw;
}
}
@ -5041,6 +5066,7 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
}
}
state bool seenNotRegistered = false;
loop {
try {
Version maxFetched = wait(fetchChangeFeedApplier(data,
@ -5057,19 +5083,110 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
throw;
}
}
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
// TODO REMOVE
fmt::print("DBG: SS {} Feed {} possibly destroyed {}, {} metadata create, {} desired committed\n",
data->thisServerID.toString().substr(0, 4),
changeFeedInfo->id.printable(),
changeFeedInfo->possiblyDestroyed,
changeFeedInfo->metadataCreateVersion,
data->desiredOldestVersion.get());
// There are two reasons for change_feed_not_registered:
// 1. The feed was just created, but the ss mutation stream is ahead of the GRV that fetchChangeFeedApplier
// uses to read the change feed data from the database. In this case we need to wait and retry
// 2. The feed was destroyed, but we missed a metadata update telling us this. In this case we need to destroy
// the feed
// endVersion >= the metadata create version, so we can safely use it as a proxy
if (beginVersion != 0 || seenNotRegistered || endVersion <= data->desiredOldestVersion.get()) {
// If any of these are true, the feed must be destroyed.
Version cleanupVersion = data->data().getLatestVersion();
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetch", data->thisServerID)
.detail("RangeID", changeFeedInfo->id.printable())
.detail("Range", changeFeedInfo->range.toString())
.detail("Version", cleanupVersion);
if (g_network->isSimulated()) {
ASSERT(allDestroyedChangeFeeds.count(changeFeedInfo->id));
}
Key beginClearKey = changeFeedInfo->id.withPrefix(persistChangeFeedKeys.begin);
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
data->addMutationToMutationLog(
mLV, MutationRef(MutationRef::ClearRange, beginClearKey, keyAfter(beginClearKey)));
++data->counters.kvSystemClearRanges;
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(changeFeedInfo->id, 0),
changeFeedDurableKey(changeFeedInfo->id, cleanupVersion)));
++data->counters.kvSystemClearRanges;
changeFeedInfo->destroy(cleanupVersion);
data->changeFeedCleanupDurable[changeFeedInfo->id] = cleanupVersion;
for (auto& it : data->changeFeedRemovals) {
it.second.send(changeFeedInfo->id);
}
return invalidVersion;
}
// otherwise assume the feed just hasn't been created on the SS we tried to read it from yet, wait for it to
// definitely be committed and retry
seenNotRegistered = true;
wait(data->desiredOldestVersion.whenAtLeast(endVersion));
}
}
ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
KeyRange keys,
Version fetchVersion,
PromiseStream<Key> removals) {
PromiseStream<Key> removals,
UID fetchKeysID) {
// Wait for current TLog batch to finish to ensure that we're fetching metadata at a version >= the version of the
// ChangeServerKeys mutation. This guarantees we don't miss any metadata between the previous batch's version
// (data->version) and the mutation version.
wait(data->version.whenAtLeast(data->version.get() + 1));
state Version fetchVersion = data->version.get();
TraceEvent(SevDebug, "FetchChangeFeedMetadata", data->thisServerID)
.detail("Range", keys.toString())
.detail("FetchVersion", fetchVersion);
state std::vector<OverlappingChangeFeedEntry> feeds =
wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion + 1));
.detail("Range", keys)
.detail("FetchVersion", fetchVersion)
.detail("FKID", fetchKeysID);
state std::set<Key> refreshedFeedIds;
state std::set<Key> destroyedFeedIds;
// before fetching feeds from other SS's, refresh any feeds we already have that are being marked as removed
auto ranges = data->keyChangeFeed.intersectingRanges(keys);
for (auto& r : ranges) {
for (auto& cfInfo : r.value()) {
auto feedCleanup = data->changeFeedCleanupDurable.find(cfInfo->id);
if (feedCleanup != data->changeFeedCleanupDurable.end() && cfInfo->removing && !cfInfo->destroyed) {
TEST(true); // re-fetching feed scheduled for deletion! Un-mark it as removing
destroyedFeedIds.insert(cfInfo->id);
cfInfo->removing = false;
// because we now have a gap in the metadata, it's possible this feed was destroyed
cfInfo->possiblyDestroyed = true;
// reset fetch versions because everything previously fetched was cleaned up
cfInfo->fetchVersion = invalidVersion;
cfInfo->durableFetchVersion = NotifiedVersion();
TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", cfInfo->id.printable())
.detail("Range", cfInfo->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", cfInfo->emptyVersion)
.detail("StopVersion", cfInfo->stopVersion)
.detail("FKID", fetchKeysID);
}
}
}
state std::vector<OverlappingChangeFeedEntry> feeds = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion));
// handle change feeds removed while fetching overlapping
while (removals.getFuture().isReady()) {
Key remove = waitNext(removals.getFuture());
for (int i = 0; i < feeds.size(); i++) {
@ -5078,6 +5195,7 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
}
}
}
std::vector<Key> feedIds;
feedIds.reserve(feeds.size());
// create change feed metadata if it does not exist
@ -5090,16 +5208,23 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
TraceEvent(SevDebug, "FetchedChangeFeedInfo", data->thisServerID)
.detail("RangeID", cfEntry.rangeId.printable())
.detail("Range", cfEntry.range.toString())
.detail("Range", cfEntry.range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", cfEntry.emptyVersion)
.detail("StopVersion", cfEntry.stopVersion)
.detail("Existing", existing)
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion);
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion)
.detail("FKID", fetchKeysID);
bool addMutationToLog = false;
Reference<ChangeFeedInfo> changeFeedInfo;
auto fid = destroyedFeedIds.find(cfEntry.rangeId);
if (fid != destroyedFeedIds.end()) {
refreshedFeedIds.insert(cfEntry.rangeId);
destroyedFeedIds.erase(fid);
}
if (!existing) {
TEST(cleanupPending); // Fetch change feed which is cleanup pending. This means there was a move away and a
// move back, this will remake the metadata
@ -5120,30 +5245,26 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
addMutationToLog = true;
} else {
changeFeedInfo = existingEntry->second;
auto feedCleanup = data->changeFeedCleanupDurable.find(cfEntry.rangeId);
if (changeFeedInfo->destroyed) {
// race where multiple feeds fetched overlapping change feed, one realized feed was missing and marked
// it removed+destroyed, then this one fetched the same info
continue;
}
// we checked all feeds we already owned in this range at the start to reset them if they were removing, and
// this actor would have been cancelled if a later remove happened
ASSERT(!changeFeedInfo->removing);
if (cfEntry.stopVersion < changeFeedInfo->stopVersion) {
TEST(true); // Change feed updated stop version from fetch metadata
changeFeedInfo->stopVersion = cfEntry.stopVersion;
addMutationToLog = true;
}
if (feedCleanup != data->changeFeedCleanupDurable.end() && changeFeedInfo->removing) {
TEST(true); // re-fetching feed scheduled for deletion! Un-mark it as removing
if (cfEntry.emptyVersion < data->version.get()) {
changeFeedInfo->emptyVersion = cfEntry.emptyVersion;
}
changeFeedInfo->removing = false;
// reset fetch versions because everything previously fetched was cleaned up
changeFeedInfo->fetchVersion = invalidVersion;
changeFeedInfo->durableFetchVersion = NotifiedVersion();
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
// it
// We may just want to refactor this so updateStorage does explicit deletes based on
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
// Then we wouldn't have to reset anything here
// don't update empty version past SS version if SS is behind, it can cause issues
if (cfEntry.emptyVersion < data->version.get() && cfEntry.emptyVersion > changeFeedInfo->emptyVersion) {
TEST(true); // Change feed updated empty version from fetch metadata
changeFeedInfo->emptyVersion = cfEntry.emptyVersion;
addMutationToLog = true;
}
}
@ -5163,6 +5284,84 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
}
}
}
TEST(!refreshedFeedIds.empty()); // Feed refreshed between move away and move back
TEST(!destroyedFeedIds.empty()); // Feed destroyed between move away and move back
for (auto& feedId : refreshedFeedIds) {
auto existingEntry = data->uidChangeFeed.find(feedId);
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
TEST(true); // feed refreshed
continue;
}
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
// it
// We may just want to refactor this so updateStorage does explicit deletes based on
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
// Then we wouldn't have to reset anything here or above
// Do the mutation log update here instead of above to ensure we only add it back to the mutation log if we're
// sure it wasn't deleted in the metadata gap
Version metadataVersion = data->data().getLatestVersion();
auto& mLV = data->addVersionToMutationLog(metadataVersion);
data->addMutationToMutationLog(
mLV,
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + existingEntry->second->id.toString(),
changeFeedSSValue(existingEntry->second->range,
existingEntry->second->emptyVersion + 1,
existingEntry->second->stopVersion)));
TraceEvent(SevDebug, "PersistingResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", existingEntry->second->id.printable())
.detail("Range", existingEntry->second->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", existingEntry->second->emptyVersion)
.detail("StopVersion", existingEntry->second->stopVersion)
.detail("FKID", fetchKeysID)
.detail("MetadataVersion", metadataVersion);
}
for (auto& feedId : destroyedFeedIds) {
auto existingEntry = data->uidChangeFeed.find(feedId);
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
TEST(true); // feed refreshed but then destroyed elsewhere
continue;
}
// TODO REMOVE print
fmt::print("DBG: SS {} fetching feed {} was refreshed but not present!! assuming destroyed\n",
data->thisServerID.toString().substr(0, 4),
feedId.printable());
Version cleanupVersion = data->data().getLatestVersion();
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetchMetadata", data->thisServerID)
.detail("RangeID", feedId.printable())
.detail("Range", existingEntry->second->range)
.detail("Version", cleanupVersion)
.detail("FKID", fetchKeysID);
if (g_network->isSimulated()) {
ASSERT(allDestroyedChangeFeeds.count(feedId));
}
Key beginClearKey = feedId.withPrefix(persistChangeFeedKeys.begin);
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange, beginClearKey, keyAfter(beginClearKey)));
++data->counters.kvSystemClearRanges;
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feedId, 0),
changeFeedDurableKey(feedId, cleanupVersion)));
++data->counters.kvSystemClearRanges;
existingEntry->second->destroy(cleanupVersion);
data->changeFeedCleanupDurable[feedId] = cleanupVersion;
for (auto& it : data->changeFeedRemovals) {
it.second.send(feedId);
}
}
return feedIds;
}
@ -5218,7 +5417,6 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
}
}
if (done) {
data->changeFeedRemovals.erase(fetchKeysID);
return feedMaxFetched;
}
}
@ -5283,8 +5481,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
state PromiseStream<Key> removals;
data->changeFeedRemovals[fetchKeysID] = removals;
state Future<std::vector<Key>> fetchCFMetadata =
fetchChangeFeedMetadata(data, keys, data->version.get(), removals);
state Future<std::vector<Key>> fetchCFMetadata = fetchChangeFeedMetadata(data, keys, removals, fetchKeysID);
validate(data);
@ -5629,6 +5826,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
}
}
data->changeFeedRemovals.erase(fetchKeysID);
shard->phase = AddingShard::Waiting;
// Similar to transferred version, but wait for all feed data and
@ -5849,7 +6048,8 @@ void changeServerKeys(StorageServer* data,
data->watches.triggerRange(range.begin, range.end);
} else if (!dataAvailable) {
// SOMEDAY: Avoid restarting adding/transferred shards
if (version == 0) { // bypass fetchkeys; shard is known empty at version 0
// bypass fetchkeys; shard is known empty at initial cluster version
if (version == data->initialClusterVersion - 1) {
TraceEvent("ChangeServerKeysInitialRange", data->thisServerID)
.detail("Begin", range.begin)
.detail("End", range.end);
@ -5940,7 +6140,6 @@ void changeServerKeys(StorageServer* data,
auto feed = data->uidChangeFeed.find(f.first);
if (feed != data->uidChangeFeed.end()) {
feed->second->emptyVersion = version - 1;
feed->second->removing = true;
feed->second->moved(feed->second->range);
feed->second->newMutations.trigger();
@ -6242,7 +6441,10 @@ private:
feed->second->durableVersion = invalidVersion;
}
}
addMutationToLog = true;
if (!feed->second->destroyed) {
// if feed is destroyed, adding an extra mutation here would re-create it if SS restarted
addMutationToLog = true;
}
}
} else if (status == ChangeFeedStatus::CHANGE_FEED_CREATE && createdFeed) {
@ -6278,13 +6480,12 @@ private:
changeFeedDurableKey(feed->second->id, currentVersion)));
++data->counters.kvSystemClearRanges;
feed->second->emptyVersion = currentVersion - 1;
feed->second->stopVersion = currentVersion;
feed->second->removing = true;
feed->second->moved(feed->second->range);
feed->second->newMutations.trigger();
feed->second->destroy(currentVersion);
data->changeFeedCleanupDurable[feed->first] = cleanupVersion;
if (g_network->isSimulated()) {
allDestroyedChangeFeeds.insert(changeFeedId);
}
}
if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) {
@ -6734,7 +6935,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
.detail("Version", cloneCursor2->version().toString());
} else if (ver != invalidVersion) { // This change belongs to a version < minVersion
DEBUG_MUTATION("SSPeek", ver, msg, data->thisServerID);
if (ver == 1) {
if (ver == data->initialClusterVersion) {
//TraceEvent("SSPeekMutation", data->thisServerID).log();
// The following trace event may produce a value with special characters
TraceEvent("SSPeekMutation", data->thisServerID)
@ -6850,6 +7051,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
proposedOldestVersion = std::min(proposedOldestVersion, data->version.get() - 1);
proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get());
proposedOldestVersion = std::max(proposedOldestVersion, data->desiredOldestVersion.get());
proposedOldestVersion = std::max(proposedOldestVersion, data->initialClusterVersion);
//TraceEvent("StorageServerUpdated", data->thisServerID).detail("Ver", ver).detail("DataVersion", data->version.get())
// .detail("LastTLogVersion", data->lastTLogVersion).detail("NewOldest",
@ -8715,6 +8917,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Tag seedTag,
UID clusterId,
Version startVersion,
Version tssSeedVersion,
ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo> const> db,
@ -8722,6 +8925,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
state StorageServer self(persistentData, db, ssi);
state Future<Void> ssCore;
self.clusterId.send(clusterId);
self.initialClusterVersion = startVersion;
if (ssi.isTss()) {
self.setTssPair(ssi.tssPairID.get());
ASSERT(self.isTss());

View File

@ -778,6 +778,82 @@ TEST_CASE("/fdbserver/worker/addressInDbAndPrimaryDc") {
} // namespace
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's primary
// satellite DC.
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and belongs to
// the db's primary satellite DC, i.e. it is the address of a tlog in a local log set with satellite locality.
bool addressInDbAndPrimarySatelliteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
    for (const auto& tLogSet : dbInfo->get().logSystemConfig.tLogs) {
        // Only local log sets tagged with satellite locality are relevant here.
        if (!tLogSet.isLocal || tLogSet.locality != tagLocalitySatellite) {
            continue;
        }
        for (const auto& tLogInterf : tLogSet.tLogs) {
            if (!tLogInterf.present()) {
                continue;
            }
            if (tLogInterf.interf().addresses().contains(address)) {
                return true;
            }
        }
    }
    return false;
}
// Returns true if any address in `addresses` (primary, or the optional secondary) belongs to the
// db's primary satellite DC transaction system.
bool addressesInDbAndPrimarySatelliteDc(const NetworkAddressList& addresses,
                                        Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
    // Check the primary address first; fall back to the secondary address when present.
    if (addressInDbAndPrimarySatelliteDc(addresses.address, dbInfo)) {
        return true;
    }
    if (!addresses.secondaryAddress.present()) {
        return false;
    }
    return addressInDbAndPrimarySatelliteDc(addresses.secondaryAddress.get(), dbInfo);
}
namespace {
// Unit test for addressInDbAndPrimarySatelliteDc: builds a synthetic ServerDBInfo and checks that
// only tlog addresses registered in a local, satellite-locality log set are reported as being in
// the primary satellite DC.
TEST_CASE("/fdbserver/worker/addressInDbAndPrimarySatelliteDc") {
    // Setup a ServerDBInfo for test.
    ServerDBInfo testDbInfo;
    LocalityData testLocal;
    testLocal.set(LiteralStringRef("dcid"), StringRef(std::to_string(1)));
    testDbInfo.master.locality = testLocal;

    // First, create an empty TLogInterface, and check that it shouldn't be considered as in satellite DC.
    testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
    testDbInfo.logSystemConfig.tLogs.back().isLocal = true;
    testDbInfo.logSystemConfig.tLogs.back().locality = tagLocalitySatellite;
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface<TLogInterface>());
    ASSERT(!addressInDbAndPrimarySatelliteDc(g_network->getLocalAddress(),
                                             makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));

    // Create a satellite tlog, and it should be considered as in primary satellite DC.
    NetworkAddress satelliteTLogAddress(IPAddress(0x13131313), 1);
    TLogInterface satelliteTLog(testLocal);
    satelliteTLog.initEndpoints();
    satelliteTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ satelliteTLogAddress }, UID(1, 2)));
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(satelliteTLog));
    ASSERT(addressInDbAndPrimarySatelliteDc(satelliteTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));

    // Create a primary TLog (local log set without satellite locality), and it shouldn't be considered
    // as in primary Satellite DC.
    NetworkAddress primaryTLogAddress(IPAddress(0x26262626), 1);
    testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
    testDbInfo.logSystemConfig.tLogs.back().isLocal = true;
    TLogInterface primaryTLog(testLocal);
    primaryTLog.initEndpoints();
    primaryTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ primaryTLogAddress }, UID(1, 2)));
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(primaryTLog));
    ASSERT(!addressInDbAndPrimarySatelliteDc(primaryTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));

    // Create a remote TLog (non-local log set), which is in the remote DC, so it should not be treated
    // as in the primary satellite DC.
    NetworkAddress remoteTLogAddress(IPAddress(0x37373737), 1);
    LocalityData fakeRemote;
    fakeRemote.set(LiteralStringRef("dcid"), StringRef(std::to_string(2)));
    TLogInterface remoteTLog(fakeRemote);
    remoteTLog.initEndpoints();
    remoteTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ remoteTLogAddress }, UID(1, 2)));
    testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
    testDbInfo.logSystemConfig.tLogs.back().isLocal = false;
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(remoteTLog));
    ASSERT(!addressInDbAndPrimarySatelliteDc(remoteTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
    return Void();
}
} // namespace
bool addressInDbAndRemoteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
const auto& dbi = dbInfo->get();
@ -872,17 +948,15 @@ ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFu
const auto& allPeers = FlowTransport::transport().getAllPeers();
UpdateWorkerHealthRequest req;
bool workerInDb = false;
bool workerInPrimary = false;
enum WorkerLocation { None, Primary, Remote };
WorkerLocation workerLocation = None;
if (addressesInDbAndPrimaryDc(interf.addresses(), dbInfo)) {
workerInDb = true;
workerInPrimary = true;
workerLocation = Primary;
} else if (addressesInDbAndRemoteDc(interf.addresses(), dbInfo)) {
workerInDb = true;
workerInPrimary = false;
workerLocation = Remote;
}
if (workerInDb) {
if (workerLocation != None) {
for (const auto& [address, peer] : allPeers) {
if (peer->connectFailedCount == 0 &&
peer->pingLatencies.getPopulationSize() < SERVER_KNOBS->PEER_LATENCY_CHECK_MIN_POPULATION) {
@ -895,37 +969,50 @@ ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFu
// last ping latencies logged.
continue;
}
if ((workerInPrimary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(!workerInPrimary && addressInDbAndRemoteDc(address, dbInfo))) {
                    // Only monitor the servers that are in the primary or remote DC's transaction systems.
                    // Note that we currently do not monitor storage servers, since lagging storage servers
                    // today can already trigger server exclusion by the data distributor.
bool degradedPeer = false;
if ((workerLocation == Primary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(workerLocation == Remote && addressInDbAndRemoteDc(address, dbInfo))) {
                    // Monitors intra-DC latencies between servers that are in the primary or remote DC's transaction
                    // systems. Note that we currently do not monitor storage servers, since lagging storage
                    // servers today can already trigger server exclusion by the data distributor.
if (peer->connectFailedCount >= SERVER_KNOBS->PEER_DEGRADATION_CONNECTION_FAILURE_COUNT ||
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE) >
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_THRESHOLD ||
peer->timeoutCount / (double)(peer->pingLatencies.getPopulationSize()) >
SERVER_KNOBS->PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD) {
// This is a degraded peer.
TraceEvent("HealthMonitorDetectDegradedPeer")
.suppressFor(30)
.detail("Peer", address)
.detail("Elapsed", now() - peer->lastLoggedTime)
.detail("MinLatency", peer->pingLatencies.min())
.detail("MaxLatency", peer->pingLatencies.max())
.detail("MeanLatency", peer->pingLatencies.mean())
.detail("MedianLatency", peer->pingLatencies.median())
.detail("CheckedPercentile", SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE)
.detail(
"CheckedPercentileLatency",
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE))
.detail("PingCount", peer->pingLatencies.getPopulationSize())
.detail("PingTimeoutCount", peer->timeoutCount)
.detail("ConnectionFailureCount", peer->connectFailedCount);
req.degradedPeers.push_back(address);
degradedPeer = true;
}
} else if (workerLocation == Primary && addressInDbAndPrimarySatelliteDc(address, dbInfo)) {
// Monitors inter DC latencies between servers in primary and primary satellite DC. Note that
// TLog workers in primary satellite DC are on the critical path of serving a commit.
if (peer->connectFailedCount >= SERVER_KNOBS->PEER_DEGRADATION_CONNECTION_FAILURE_COUNT ||
peer->pingLatencies.percentile(
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE) >
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE ||
peer->timeoutCount / (double)(peer->pingLatencies.getPopulationSize()) >
SERVER_KNOBS->PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD) {
degradedPeer = true;
}
}
if (degradedPeer) {
TraceEvent("HealthMonitorDetectDegradedPeer")
.suppressFor(30)
.detail("Peer", address)
.detail("Elapsed", now() - peer->lastLoggedTime)
.detail("MinLatency", peer->pingLatencies.min())
.detail("MaxLatency", peer->pingLatencies.max())
.detail("MeanLatency", peer->pingLatencies.mean())
.detail("MedianLatency", peer->pingLatencies.median())
.detail("CheckedPercentile", SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE)
.detail("CheckedPercentileLatency",
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE))
.detail("PingCount", peer->pingLatencies.getPopulationSize())
.detail("PingTimeoutCount", peer->timeoutCount)
.detail("ConnectionFailureCount", peer->connectFailedCount);
req.degradedPeers.push_back(address);
}
}
@ -941,8 +1028,9 @@ ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFu
continue;
}
if ((workerInPrimary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(!workerInPrimary && addressInDbAndRemoteDc(address, dbInfo))) {
if ((workerLocation == Primary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(workerLocation == Remote && addressInDbAndRemoteDc(address, dbInfo)) ||
(workerLocation == Primary && addressInDbAndPrimarySatelliteDc(address, dbInfo))) {
TraceEvent("HealthMonitorDetectRecentClosedPeer").suppressFor(30).detail("Peer", address);
req.degradedPeers.push_back(address);
}
@ -2095,6 +2183,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
[&req](const auto& p) { return p.second != req.storeType; }) ||
req.seedTag != invalidTag)) {
ASSERT(req.clusterId.isValid());
ASSERT(req.initialClusterVersion >= 0);
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage;
bool isTss = req.tssPairIDAndVersion.present();
@ -2156,6 +2245,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
recruited,
req.seedTag,
req.clusterId,
req.initialClusterVersion,
isTss ? req.tssPairIDAndVersion.get().second : 0,
storageReady,
dbInfo,

View File

@ -62,8 +62,9 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
int64_t timeTravelTooOld = 0;
int64_t rowsRead = 0;
int64_t bytesRead = 0;
int64_t purges = 0;
std::vector<Future<Void>> clients;
bool enablePruning;
bool enablePurging;
DatabaseConfiguration config;
@ -79,7 +80,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
timeTravelLimit = getOption(options, LiteralStringRef("timeTravelLimit"), testDuration);
timeTravelBufferSize = getOption(options, LiteralStringRef("timeTravelBufferSize"), 100000000);
threads = getOption(options, LiteralStringRef("threads"), 1);
enablePruning = getOption(options, LiteralStringRef("enablePruning"), false /*sharedRandomNumber % 2 == 0*/);
enablePurging = getOption(options, LiteralStringRef("enablePurging"), false /*sharedRandomNumber % 2 == 0*/);
ASSERT(threads >= 1);
if (BGV_DEBUG) {
@ -177,60 +178,6 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
OldRead(KeyRange range, Version v, RangeResult oldResult) : range(range), v(v), oldResult(oldResult) {}
};
// utility to prune <range> at pruneVersion=<version> with the <force> flag
// Requests a prune of <range> at pruneVersion=<version> with the <force> flag by writing the
// versionstamped prune request into the blob granule prune system keyspace, then blocks until the
// request key disappears (i.e. the prune has been processed).
ACTOR Future<Void> pruneAtVersion(Database cx, KeyRange range, Version version, bool force) {
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
    state Key pruneKey;
    // Phase 1: register the prune request. Retried via tr->onError until the commit succeeds.
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Value pruneValue = blobGranulePruneValueFor(version, range, force);
            // Versionstamped key makes each prune request unique and commit-ordered.
            tr->atomicOp(
                addVersionStampAtEnd(blobGranulePruneKeys.begin), pruneValue, MutationRef::SetVersionstampedKey);
            // Touch the change key so the consumer notices a new prune request.
            tr->set(blobGranulePruneChangeKey, deterministicRandom()->randomUniqueID().toString());
            // Must request the versionstamp before commit; it resolves after the commit succeeds.
            state Future<Standalone<StringRef>> fTrVs = tr->getVersionstamp();
            wait(tr->commit());
            Standalone<StringRef> vs = wait(fTrVs);
            // Reconstruct the exact key our request was written under, to watch it below.
            pruneKey = blobGranulePruneKeys.begin.withSuffix(vs);
            if (BGV_DEBUG) {
                fmt::print("pruneAtVersion for range [{0} - {1}) at version {2} succeeded\n",
                           range.begin.printable(),
                           range.end.printable(),
                           version);
            }
            break;
        } catch (Error& e) {
            if (BGV_DEBUG) {
                fmt::print("pruneAtVersion for range [{0} - {1}) at version {2} encountered error {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           version,
                           e.name());
            }
            wait(tr->onError(e));
        }
    }
    tr->reset();
    // Phase 2: wait for the prune request key to be cleared, signalling completion. Uses a watch to
    // avoid polling; loops because the watch may fire spuriously or the transaction may need retry.
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Optional<Value> pruneVal = wait(tr->get(pruneKey));
            if (!pruneVal.present()) {
                // Request key removed => prune completed.
                return Void();
            }
            state Future<Void> watchFuture = tr->watch(pruneKey);
            wait(tr->commit());
            wait(watchFuture);
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}
ACTOR Future<Void> killBlobWorkers(Database cx, BlobGranuleVerifierWorkload* self) {
state Transaction tr(cx);
state std::set<UID> knownWorkers;
@ -272,12 +219,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
}
ACTOR Future<Void> verifyGranules(Database cx, BlobGranuleVerifierWorkload* self, bool allowPruning) {
ACTOR Future<Void> verifyGranules(Database cx, BlobGranuleVerifierWorkload* self, bool allowPurging) {
state double last = now();
state double endTime = last + self->testDuration;
state std::map<double, OldRead> timeTravelChecks;
state int64_t timeTravelChecksMemory = 0;
state Version prevPruneVersion = -1;
state Version prevPurgeVersion = -1;
state UID dbgId = debugRandom()->randomUniqueID();
TraceEvent("BlobGranuleVerifierStart");
@ -300,25 +247,27 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
state OldRead oldRead = timeTravelIt->second;
timeTravelChecksMemory -= oldRead.oldResult.expectedSize();
timeTravelIt = timeTravelChecks.erase(timeTravelIt);
if (prevPruneVersion == -1) {
prevPruneVersion = oldRead.v;
if (prevPurgeVersion == -1) {
prevPurgeVersion = oldRead.v;
}
// advance iterator before doing read, so if it gets error we don't retry it
try {
state Version newPruneVersion = 0;
state bool doPruning = allowPruning && deterministicRandom()->random01() < 0.5;
if (doPruning) {
Version maxPruneVersion = oldRead.v;
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPruneVersion = std::min(it.second.v, maxPruneVersion);
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPruneVersion < maxPruneVersion) {
newPruneVersion = deterministicRandom()->randomInt64(prevPruneVersion, maxPruneVersion);
prevPruneVersion = std::max(prevPruneVersion, newPruneVersion);
wait(self->pruneAtVersion(cx, normalKeys, newPruneVersion, false));
if (prevPurgeVersion < maxPurgeVersion) {
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
self->purges++;
} else {
doPruning = false;
doPurging = false;
}
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> reReadResult =
@ -328,12 +277,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
self->timeTravelReads++;
if (doPruning) {
if (doPurging) {
wait(self->killBlobWorkers(cx, self));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPruneVersion));
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
try {
Version minSnapshotVersion = newPruneVersion;
Version minSnapshotVersion = newPurgeVersion;
for (auto& it : versionRead.second) {
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
}
@ -395,10 +344,10 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
Future<Void> start(Database const& cx) override {
clients.reserve(threads + 1);
clients.push_back(timeout(findGranules(cx, this), testDuration, Void()));
if (enablePruning && clientId == 0) {
if (enablePurging && clientId == 0) {
clients.push_back(
timeout(reportErrors(verifyGranules(cx, this, true), "BlobGranuleVerifier"), testDuration, Void()));
} else if (!enablePruning) {
} else if (!enablePurging) {
for (int i = 0; i < threads; i++) {
clients.push_back(timeout(
reportErrors(verifyGranules(cx, this, false), "BlobGranuleVerifier"), testDuration, Void()));
@ -518,6 +467,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
fmt::print(" {} time travel reads\n", self->timeTravelReads);
fmt::print(" {} rows\n", self->rowsRead);
fmt::print(" {} bytes\n", self->bytesRead);
fmt::print(" {} purges\n", self->purges);
// FIXME: add above as details to trace event
TraceEvent("BlobGranuleVerifierChecked").detail("Result", result);

View File

@ -227,7 +227,8 @@ struct ConfigureDatabaseWorkload : TestWorkload {
double testDuration;
int additionalDBs;
bool allowDescriptorChange;
bool allowTestStorageMigration;
bool allowTestStorageMigration; // allow change storage migration and perpetual wiggle conf
bool storageMigrationCompatibleConf; // only allow generating configuration suitable for storage migration test
bool waitStoreTypeCheck;
bool downgradeTest1; // if this is true, don't pick up downgrade incompatible config
std::vector<Future<Void>> clients;
@ -239,6 +240,7 @@ struct ConfigureDatabaseWorkload : TestWorkload {
getOption(options, LiteralStringRef("allowDescriptorChange"), SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT);
allowTestStorageMigration =
getOption(options, "allowTestStorageMigration"_sr, false) && g_simulator.allowStorageMigrationTypeChange;
storageMigrationCompatibleConf = getOption(options, "storageMigrationCompatibleConf"_sr, false);
waitStoreTypeCheck = getOption(options, "waitStoreTypeCheck"_sr, false);
downgradeTest1 = getOption(options, "downgradeTest1"_sr, false);
g_simulator.usableRegions = 1;
@ -349,7 +351,11 @@ struct ConfigureDatabaseWorkload : TestWorkload {
}
state int randomChoice;
if (self->allowTestStorageMigration) {
randomChoice = deterministicRandom()->randomInt(4, 9);
randomChoice = (deterministicRandom()->random01() < 0.375) ? deterministicRandom()->randomInt(0, 3)
: deterministicRandom()->randomInt(4, 9);
} else if (self->storageMigrationCompatibleConf) {
randomChoice = (deterministicRandom()->random01() < 3.0 / 7) ? deterministicRandom()->randomInt(0, 3)
: deterministicRandom()->randomInt(5, 9);
} else {
randomChoice = deterministicRandom()->randomInt(0, 8);
}

View File

@ -154,9 +154,13 @@ struct DiskFailureInjectionWorkload : TestWorkload {
loop {
wait(poisson(&lastTime, 1));
try {
wait(store(machines, getStorageWorkers(cx, self->dbInfo, false)));
std::pair<std::vector<W>, int> m = wait(getStorageWorkers(cx, self->dbInfo, false));
if (m.second > 0) {
throw operation_failed();
}
machines = std::move(m.first);
} catch (Error& e) {
// If we failed to get a list of storage servers, we can't inject failure events
// If we failed to get a complete list of storage servers, we can't inject failure events
// But don't throw the error in that case
continue;
}

View File

@ -20,6 +20,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/NativeAPI.actor.h"
#include "flow/EncryptUtils.h"
#include "flow/IRandom.h"
#include "flow/BlobCipher.h"
#include "fdbserver/workloads/workloads.actor.h"
@ -116,9 +117,10 @@ struct EncryptionOpsWorkload : TestWorkload {
Arena arena;
std::unique_ptr<WorkloadMetrics> metrics;
BlobCipherDomainId minDomainId;
BlobCipherDomainId maxDomainId;
BlobCipherBaseKeyId minBaseCipherId;
EncryptCipherDomainId minDomainId;
EncryptCipherDomainId maxDomainId;
EncryptCipherBaseKeyId minBaseCipherId;
EncryptCipherBaseKeyId headerBaseCipherId;
EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
mode = getOption(options, LiteralStringRef("fixedSize"), 1);
@ -131,6 +133,7 @@ struct EncryptionOpsWorkload : TestWorkload {
minDomainId = wcx.clientId * 100 + mode * 30 + 1;
maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
minBaseCipherId = 100;
headerBaseCipherId = wcx.clientId * 100 + 1;
metrics = std::make_unique<WorkloadMetrics>();
@ -167,17 +170,21 @@ struct EncryptionOpsWorkload : TestWorkload {
uint8_t buff[AES_256_KEY_LENGTH];
std::vector<Reference<BlobCipherKey>> cipherKeys;
for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
int cipherLen = 0;
int cipherLen = 0;
for (EncryptCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen);
ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH);
cipherKeys = cipherKeyCache.getAllCiphers(id);
ASSERT(cipherKeys.size() == 1);
ASSERT_EQ(cipherKeys.size(), 1);
}
// insert the Encrypt Header cipherKey
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
cipherKeyCache.insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId, buff, cipherLen);
TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
}
@ -188,10 +195,10 @@ struct EncryptionOpsWorkload : TestWorkload {
TraceEvent("ResetCipherEssentials_Done").log();
}
void updateLatestBaseCipher(const BlobCipherDomainId encryptDomainId,
void updateLatestBaseCipher(const EncryptCipherDomainId encryptDomainId,
uint8_t* baseCipher,
int* baseCipherLen,
BlobCipherBaseKeyId* nextBaseCipherId) {
EncryptCipherBaseKeyId* nextBaseCipherId) {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
*nextBaseCipherId = cipherKey->getBaseCipherId() + 1;
@ -202,22 +209,24 @@ struct EncryptionOpsWorkload : TestWorkload {
TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId);
}
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> key,
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> textCipherKey,
Reference<BlobCipherKey> headerCipherKey,
uint8_t* payload,
int len,
const EncryptAuthTokenMode authMode,
BlobCipherEncryptHeader* header) {
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH);
EncryptBlobCipherAes265Ctr encryptor(textCipherKey, headerCipherKey, &iv[0], AES_256_IV_LENGTH, authMode);
auto start = std::chrono::high_resolution_clock::now();
Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, len, header, arena);
auto end = std::chrono::high_resolution_clock::now();
// validate encrypted buffer size and contents (not matching with plaintext)
ASSERT(encrypted->getLogicalSize() == len);
ASSERT(memcmp(encrypted->begin(), payload, len) != 0);
ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(encrypted->getLogicalSize(), len);
ASSERT_NE(memcmp(encrypted->begin(), payload, len), 0);
ASSERT_EQ(header->flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
metrics->updateEncryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
return encrypted;
@ -228,23 +237,30 @@ struct EncryptionOpsWorkload : TestWorkload {
const BlobCipherEncryptHeader& header,
uint8_t* originalPayload,
Reference<BlobCipherKey> orgCipherKey) {
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
Reference<BlobCipherKey> headerCipherKey = cipherKeyCache.getCipherKey(
header.cipherHeaderDetails.encryptDomainId, header.cipherHeaderDetails.baseCipherId);
ASSERT(cipherKey.isValid());
ASSERT(cipherKey->isEqual(orgCipherKey));
DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]);
DecryptBlobCipherAes256Ctr decryptor(cipherKey, headerCipherKey, &header.cipherTextDetails.iv[0]);
const bool validateHeaderAuthToken = deterministicRandom()->randomInt(0, 100) < 65;
auto start = std::chrono::high_resolution_clock::now();
if (validateHeaderAuthToken) {
decryptor.verifyHeaderAuthToken(header, arena);
}
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena);
auto end = std::chrono::high_resolution_clock::now();
// validate decrypted buffer size and contents (matching with original plaintext)
ASSERT(decrypted->getLogicalSize() == len);
ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0);
ASSERT_EQ(decrypted->getLogicalSize(), len);
ASSERT_EQ(memcmp(decrypted->begin(), originalPayload, len), 0);
metrics->updateDecryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
}
@ -256,7 +272,7 @@ struct EncryptionOpsWorkload : TestWorkload {
Future<Void> start(Database const& cx) override {
uint8_t baseCipher[AES_256_KEY_LENGTH];
int baseCipherLen = 0;
BlobCipherBaseKeyId nextBaseCipherId;
EncryptCipherBaseKeyId nextBaseCipherId;
// Setup encryptDomainIds and corresponding baseCipher details
setupCipherEssentials();
@ -268,7 +284,7 @@ struct EncryptionOpsWorkload : TestWorkload {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
// randomly select a domainId
const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
const EncryptCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId);
if (updateBaseCipher) {
@ -279,14 +295,17 @@ struct EncryptionOpsWorkload : TestWorkload {
auto start = std::chrono::high_resolution_clock::now();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
// Each client working with their own version of encryptHeaderCipherKey, avoid using getLatest()
Reference<BlobCipherKey> headerCipherKey =
cipherKeyCache.getCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId);
auto end = std::chrono::high_resolution_clock::now();
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
// Validate sanity of "getLatestCipher", especially when baseCipher gets updated
if (updateBaseCipher) {
ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId);
ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen);
ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0);
ASSERT_EQ(cipherKey->getBaseCipherId(), nextBaseCipherId);
ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipherLen);
ASSERT_EQ(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen), 0);
}
int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize);
@ -294,8 +313,12 @@ struct EncryptionOpsWorkload : TestWorkload {
// Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later
BlobCipherEncryptHeader header;
const EncryptAuthTokenMode authMode = deterministicRandom()->randomInt(0, 100) < 50
? ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
: ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI;
try {
Reference<EncryptBuf> encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header);
Reference<EncryptBuf> encrypted =
doEncryption(cipherKey, headerCipherKey, buff.get(), dataLen, authMode, &header);
// Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and
// decrypt
@ -303,7 +326,8 @@ struct EncryptionOpsWorkload : TestWorkload {
} catch (Error& e) {
TraceEvent("Failed")
.detail("DomainId", encryptDomainId)
.detail("BaseCipherId", cipherKey->getBaseCipherId());
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("AuthMode", authMode);
throw;
}

View File

@ -236,7 +236,8 @@ Future<Void> quietDatabase(Database const& cx,
int64_t maxTLogQueueGate = 5e6,
int64_t maxStorageServerQueueGate = 5e6,
int64_t maxDataDistributionQueueSize = 0,
int64_t maxPoppedVersionLag = 30e6);
int64_t maxPoppedVersionLag = 30e6,
int64_t maxVersionOffset = 1e6);
/**
* A utility function for testing error situations. It succeeds if the given test

View File

@ -19,6 +19,7 @@
*/
#include "flow/BlobCipher.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
@ -29,21 +30,23 @@
#include <cstring>
#include <memory>
#include <string>
#if ENCRYPTION_ENABLED
// BlobCipherEncryptHeader
BlobCipherEncryptHeader::BlobCipherEncryptHeader() {
flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE;
namespace {
// Validates that `mode` is a defined EncryptAuthTokenMode value, i.e. lies in the half-open range
// [ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, ENCRYPT_HEADER_AUTH_TOKEN_LAST).
bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) {
    if (mode < ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
        return false;
    }
    return mode < ENCRYPT_HEADER_AUTH_TOKEN_LAST;
}
} // namespace
// BlobCipherKey class methods
BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCiphId,
BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCiphId,
const uint8_t* baseCiph,
int baseCiphLen) {
BlobCipherRandomSalt salt;
EncryptCipherRandomSalt salt;
if (g_network->isSimulated()) {
salt = deterministicRandom()->randomUInt64();
} else {
@ -58,11 +61,11 @@ BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
.detail("CreationTime", creationTime);*/
}
void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId,
const uint8_t* baseCiph,
int baseCiphLen,
const BlobCipherBaseKeyId& baseCiphId,
const BlobCipherRandomSalt& salt) {
const EncryptCipherBaseKeyId& baseCiphId,
const EncryptCipherRandomSalt& salt) {
// Set the base encryption key properties
baseCipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
memset(baseCipher.get(), 0, AES_256_KEY_LENGTH);
@ -82,11 +85,11 @@ void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
void BlobCipherKey::applyHmacSha256Derivation() {
Arena arena;
uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)];
uint8_t buf[baseCipherLen + sizeof(EncryptCipherRandomSalt)];
memcpy(&buf[0], baseCipher.get(), baseCipherLen);
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt));
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt));
HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen);
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena);
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), arena);
std::copy(digest.begin(), digest.end(), cipher.get());
if (digest.size() < AES_256_KEY_LENGTH) {
memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size());
@ -101,10 +104,10 @@ void BlobCipherKey::reset() {
// BlobKeyIdCache class methods
BlobCipherKeyIdCache::BlobCipherKeyIdCache()
: domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {}
: domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {}
BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId)
: domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {
BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId)
: domainId(dId), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {
TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId);
}
@ -112,7 +115,7 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::getLatestCipherKey() {
return getCipherByBaseCipherId(latestBaseCipherKeyId);
}
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) {
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId) {
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId);
if (itr == keyIdCache.end()) {
throw encrypt_key_not_found();
@ -120,10 +123,10 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCiphe
return itr->second;
}
void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId,
void BlobCipherKeyIdCache::insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen) {
ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID);
ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID);
// BaseCipherKeys are immutable, ensure that cached value doesn't get updated.
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId);
@ -165,11 +168,11 @@ std::vector<Reference<BlobCipherKey>> BlobCipherKeyIdCache::getAllCipherKeys() {
// BlobCipherKeyCache class methods
void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId,
void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen) {
if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) {
if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID) {
throw encrypt_invalid_id();
}
@ -193,7 +196,7 @@ void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
}
}
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) {
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId);
@ -212,8 +215,8 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipher
return cipherKey;
}
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId) {
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
throw encrypt_key_not_found();
@ -223,7 +226,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomain
return keyIdCache->getCipherByBaseCipherId(baseCipherId);
}
void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) {
void BlobCipherKeyCache::resetEncyrptDomainId(const EncryptCipherDomainId domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
throw encrypt_key_not_found();
@ -245,7 +248,7 @@ void BlobCipherKeyCache::cleanup() noexcept {
instance.domainCacheMap.clear();
}
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) {
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const EncryptCipherDomainId& domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
return {};
@ -255,13 +258,17 @@ std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const Bl
return keyIdCache->getAllCipherKeys();
}
// EncryptBlobCipher class methods
// EncryptBlobCipherAes265Ctr class methods
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key,
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* cipherIV,
const int ivLen)
: ctx(EVP_CIPHER_CTX_new()), cipherKey(key) {
ASSERT(ivLen == AES_256_IV_LENGTH);
const int ivLen,
const EncryptAuthTokenMode mode)
: ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode) {
ASSERT(isEncryptHeaderAuthTokenModeValid(mode));
ASSERT_EQ(ivLen, AES_256_IV_LENGTH);
memcpy(&iv[0], cipherIV, ivLen);
if (ctx == nullptr) {
@ -270,7 +277,7 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey>
if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) {
throw encrypt_ops_error();
}
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) {
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, textCipherKey.getPtr()->data(), cipherIV) != 1) {
throw encrypt_ops_error();
}
}
@ -281,21 +288,29 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
Arena& arena) {
TEST(true); // Encrypting data with BlobCipher
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(plaintextLen + AES_BLOCK_SIZE, arena);
memset(reinterpret_cast<uint8_t*>(header), 0, sizeof(BlobCipherEncryptHeader));
// Alloc buffer computation accounts for 'header authentication' generation scheme. If single-auth-token needs to be
// generated, allocate buffer sufficient to append header to the cipherText optimizing memcpy cost.
const int allocSize = authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
? plaintextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader)
: plaintextLen + AES_BLOCK_SIZE;
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(allocSize, arena);
uint8_t* ciphertext = encryptBuf->begin();
int bytes{ 0 };
if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
TraceEvent("Encrypt_UpdateFailed")
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("EncryptDomainId", cipherKey->getDomainId());
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}
int finalBytes{ 0 };
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
TraceEvent("Encrypt_FinalFailed")
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("EncryptDomainId", cipherKey->getDomainId());
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}
@ -306,19 +321,57 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
throw encrypt_ops_error();
}
// populate header details for the encrypted blob.
// Populate encryption header flags details
header->flags.size = sizeof(BlobCipherEncryptHeader);
header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION;
header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR;
header->baseCipherId = cipherKey->getBaseCipherId();
header->encryptDomainId = cipherKey->getDomainId();
header->salt = cipherKey->getSalt();
memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH);
header->flags.encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR;
header->flags.authTokenMode = authTokenMode;
// Preserve checksum of encrypted bytes in the header; approach protects against disk induced bit-rot/flip
// scenarios. AES CTR mode doesn't generate 'tag' by default as with schemes such as: AES 256 GCM.
// Populate cipherText encryption-key details
header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId();
header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId();
header->cipherTextDetails.salt = textCipherKey->getSalt();
memcpy(&header->cipherTextDetails.iv[0], &iv[0], AES_256_IV_LENGTH);
header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena);
if (authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
// No header 'authToken' generation needed.
} else {
// Populate header encryption-key details
header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId();
header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId();
// Populate header authToken details
if (header->flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
ASSERT_GE(allocSize, (bytes + finalBytes + sizeof(BlobCipherEncryptHeader)));
ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes + sizeof(BlobCipherEncryptHeader)));
memcpy(&ciphertext[bytes + finalBytes],
reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader));
StringRef authToken = computeAuthToken(ciphertext,
bytes + finalBytes + sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
memcpy(&header->singleAuthToken.authToken[0], authToken.begin(), AUTH_TOKEN_SIZE);
} else {
ASSERT_EQ(header->flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
StringRef cipherTextAuthToken =
computeAuthToken(ciphertext,
bytes + finalBytes,
reinterpret_cast<const uint8_t*>(&header->cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
arena);
memcpy(&header->multiAuthTokens.cipherTextAuthToken[0], cipherTextAuthToken.begin(), AUTH_TOKEN_SIZE);
StringRef headerAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
memcpy(&header->multiAuthTokens.headerAuthToken[0], headerAuthToken.begin(), AUTH_TOKEN_SIZE);
}
}
encryptBuf->setLogicalSize(plaintextLen);
return encryptBuf;
@ -330,45 +383,137 @@ EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() {
}
}
// DecryptBlobCipher class methods
// DecryptBlobCipherAes256Ctr class methods
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv)
: ctx(EVP_CIPHER_CTX_new()) {
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* iv)
: ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey),
headerAuthTokenValidationDone(false), authTokensValidationDone(false) {
if (ctx == nullptr) {
throw encrypt_ops_error();
}
if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) {
throw encrypt_ops_error();
}
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) {
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, tCipherKey.getPtr()->data(), iv)) {
throw encrypt_ops_error();
}
}
void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
// validate header flag sanity
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) {
TraceEvent("VerifyEncryptBlobHeader")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
.detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
throw encrypt_header_metadata_mismatch();
void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena) {
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) {
// NoneAuthToken mode; no authToken is generated; nothing to do
// SingleAuthToken mode; verification will happen as part of decryption.
return;
}
// encrypted byte checksum sanity; protection against data bit-rot/flip.
BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena);
if (computed != header.ciphertextChecksum) {
TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch")
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
BlobCipherEncryptHeader headerCopy;
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
memset(reinterpret_cast<uint8_t*>(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_SIZE);
StringRef computedHeaderAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(&headerCopy),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
if (memcmp(&header.multiAuthTokens.headerAuthToken[0], computedHeaderAuthToken.begin(), AUTH_TOKEN_SIZE) != 0) {
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("CiphertextChecksum", header.ciphertextChecksum)
.detail("ComputedCiphertextChecksum", computed);
throw encrypt_header_checksum_mismatch();
.detail("MultiAuthHeaderAuthToken",
StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedHeaderAuthToken", computedHeaderAuthToken.toString());
throw encrypt_header_authtoken_mismatch();
}
headerAuthTokenValidationDone = true;
}
void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena) {
// Header authToken not set for single auth-token mode.
ASSERT(!headerAuthTokenValidationDone);
// prepare the payload {cipherText + encryptionHeader}
memcpy(&buff[0], ciphertext, ciphertextLen);
memcpy(&buff[ciphertextLen], reinterpret_cast<const uint8_t*>(&header), sizeof(BlobCipherEncryptHeader));
// ensure the 'authToken' is reset before computing the 'authentication token'
BlobCipherEncryptHeader* eHeader = (BlobCipherEncryptHeader*)(&buff[ciphertextLen]);
memset(reinterpret_cast<uint8_t*>(&eHeader->singleAuthToken), 0, 2 * AUTH_TOKEN_SIZE);
StringRef computed = computeAuthToken(
buff, ciphertextLen + sizeof(BlobCipherEncryptHeader), headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena);
if (memcmp(&header.singleAuthToken.authToken[0], computed.begin(), AUTH_TOKEN_SIZE) != 0) {
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("SingleAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedSingleAuthToken", computed.toString());
throw encrypt_header_authtoken_mismatch();
}
}
void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena) {
if (!headerAuthTokenValidationDone) {
verifyHeaderAuthToken(header, arena);
}
StringRef computedCipherTextAuthToken =
computeAuthToken(ciphertext,
ciphertextLen,
reinterpret_cast<const uint8_t*>(&header.cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
arena);
if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], computedCipherTextAuthToken.begin(), AUTH_TOKEN_SIZE) !=
0) {
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("MultiAuthCipherTextAuthToken",
StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedCipherTextAuthToken", computedCipherTextAuthToken.toString());
throw encrypt_header_authtoken_mismatch();
}
}
void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena) {
if (header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, buff, arena);
} else {
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, buff, arena);
}
authTokensValidationDone = true;
}
void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header) {
// validate header flag sanity
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
header.flags.encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR ||
!isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) {
TraceEvent("VerifyEncryptBlobHeader")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
.detail("EncryptCipherMode", header.flags.encryptMode)
.detail("ExpectedCipherMode", ENCRYPT_CIPHER_MODE_AES_256_CTR)
.detail("EncryptHeaderAuthTokenMode", header.flags.authTokenMode);
throw encrypt_header_metadata_mismatch();
}
}
@ -378,23 +523,37 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert
Arena& arena) {
TEST(true); // Decrypting data with BlobCipher
verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena);
verifyEncryptHeaderMetadata(header);
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && !headerCipherKey.isValid()) {
TraceEvent("Decrypt_InvalidHeaderCipherKey").detail("AuthTokenMode", header.flags.authTokenMode);
throw encrypt_ops_error();
}
const int allocSize = header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
? ciphertextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader)
: ciphertextLen + AES_BLOCK_SIZE;
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(allocSize, arena);
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
verifyAuthTokens(ciphertext, ciphertextLen, header, decrypted->begin(), arena);
ASSERT(authTokensValidationDone);
}
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(ciphertextLen + AES_BLOCK_SIZE, arena);
uint8_t* plaintext = decrypted->begin();
int bytesDecrypted{ 0 };
if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) {
TraceEvent("Decrypt_UpdateFailed")
.detail("BaseCipherId", header.baseCipherId)
.detail("EncryptDomainId", header.encryptDomainId);
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
throw encrypt_ops_error();
}
int finalBlobBytes{ 0 };
if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) {
TraceEvent("Decrypt_FinalFailed")
.detail("BaseCipherId", header.baseCipherId)
.detail("EncryptDomainId", header.encryptDomainId);
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
throw encrypt_ops_error();
}
@ -443,6 +602,18 @@ StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Are
return StringRef(digest, digestLen);
}
StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena) {
HmacSha256DigestGen hmacGenerator(key, keyLen);
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
ASSERT_GE(digest.size(), AUTH_TOKEN_SIZE);
return digest;
}
// Only used to link unit tests
void forceLinkBlobCipherTests() {}
@ -453,41 +624,42 @@ void forceLinkBlobCipherTests() {}
// 4. Inserting of 'non-identical' cipherKey (already cached) more than once works as desired.
// 5. Validation encryption ops (correctness):
// 5.1. Encyrpt a buffer followed by decryption of the buffer, validate the contents.
// 5.2. Simulate anomolies such as: EncyrptionHeader corruption, checkSum mismatch / encryptionMode mismatch etc.
// 5.2. Simulate anomalies such as: EncyrptionHeader corruption, authToken mismatch / encryptionMode mismatch etc.
// 6. Cache cleanup
// 6.1 cleanup cipherKeys by given encryptDomainId
// 6.2. Cleanup all cached cipherKeys
TEST_CASE("flow/BlobCipher") {
TraceEvent("BlobCipherTest_Start").log();
// Construct a dummy External Key Manager representation and populate with some keys
class BaseCipher : public ReferenceCounted<BaseCipher>, NonCopyable {
public:
BlobCipherDomainId domainId;
EncryptCipherDomainId domainId;
int len;
BlobCipherBaseKeyId keyId;
EncryptCipherBaseKeyId keyId;
std::unique_ptr<uint8_t[]> key;
BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId)
BaseCipher(const EncryptCipherDomainId& dId, const EncryptCipherBaseKeyId& kId)
: domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)),
keyId(kId), key(std::make_unique<uint8_t[]>(len)) {
generateRandomData(key.get(), len);
}
};
using BaseKeyMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BaseCipher>>;
using DomainKeyMap = std::unordered_map<BlobCipherDomainId, BaseKeyMap>;
using BaseKeyMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BaseCipher>>;
using DomainKeyMap = std::unordered_map<EncryptCipherDomainId, BaseKeyMap>;
DomainKeyMap domainKeyMap;
const BlobCipherDomainId minDomainId = 1;
const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
const BlobCipherBaseKeyId minBaseCipherKeyId = 100;
const BlobCipherBaseKeyId maxBaseCipherKeyId =
const EncryptCipherDomainId minDomainId = 1;
const EncryptCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
const EncryptCipherBaseKeyId minBaseCipherKeyId = 100;
const EncryptCipherBaseKeyId maxBaseCipherKeyId =
deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15;
for (int dId = minDomainId; dId <= maxDomainId; dId++) {
for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) {
domainKeyMap[dId].emplace(kId, makeReference<BaseCipher>(dId, kId));
}
}
ASSERT(domainKeyMap.size() == maxDomainId);
ASSERT_EQ(domainKeyMap.size(), maxDomainId);
// insert BlobCipher keys into BlobCipherKeyCache map and validate
TraceEvent("BlobCipherTest_InsertKeys").log();
@ -500,6 +672,11 @@ TEST_CASE("flow/BlobCipher") {
baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
}
}
// insert EncryptHeader BlobCipher key
Reference<BaseCipher> headerBaseCipher = makeReference<BaseCipher>(ENCRYPT_HEADER_DOMAIN_ID, 1);
cipherKeyCache.insertCipherKey(
headerBaseCipher->domainId, headerBaseCipher->keyId, headerBaseCipher->key.get(), headerBaseCipher->len);
TraceEvent("BlobCipherTest_InsertKeysDone").log();
// validate the cipherKey lookups work as desired
@ -509,13 +686,13 @@ TEST_CASE("flow/BlobCipher") {
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId);
ASSERT(cipherKey.isValid());
// validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher
ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId);
ASSERT(cipherKey->getDomainId() == baseCipher->domainId);
ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len);
ASSERT_EQ(cipherKey->getBaseCipherId(), baseCipher->keyId);
ASSERT_EQ(cipherKey->getDomainId(), baseCipher->domainId);
ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipher->len);
// ensure that baseCipher matches with the cached information
ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0);
ASSERT_EQ(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
// validate the encryption derivation
ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0);
ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
}
}
TraceEvent("BlobCipherTest_LooksupDone").log();
@ -548,6 +725,7 @@ TEST_CASE("flow/BlobCipher") {
// Validate Encyrption ops
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId);
Reference<BlobCipherKey> headerCipherKey = cipherKeyCache.getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);
const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512;
uint8_t orgData[bufLen];
generateRandomData(&orgData[0], bufLen);
@ -556,68 +734,317 @@ TEST_CASE("flow/BlobCipher") {
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
// validate basic encrypt followed by decrypt operation
EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
BlobCipherEncryptHeader headerCopy;
// validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE
{
TraceEvent("NoneAuthMode_Start").log();
ASSERT(encrypted->getLogicalSize() == bufLen);
ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0);
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
EncryptBlobCipherAes265Ctr encryptor(
cipherKey, Reference<BlobCipherKey>(), iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.encryptDomainId)
.detail("BaseCipherId", header.baseCipherId)
.detail("HeaderChecksum", header.ciphertextChecksum);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);
Reference<BlobCipherKey> encyrptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
ASSERT(encyrptKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(encyrptKey, &header.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId);
ASSERT(decrypted->getLogicalSize() == bufLen);
ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0);
Reference<BlobCipherKey> tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
ASSERT(tCipherKeyKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
TraceEvent("BlobCipherTest_DecryptDone").log();
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
// induce encryption header corruption - headerVersion corrupted
header.flags.headerVersion += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
header.flags.headerVersion -= 1;
// induce encryption header corruption - encryptionMode corrupted
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encrypted buffer payload corruption
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
// No authToken, hence, no corruption detection supported
ASSERT(false);
}
TraceEvent("NoneAuthMode_Done").log();
}
// induce encryption header corruption - encryptionMode corrupted
header.flags.encryptMode += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE
{
TraceEvent("SingleAuthMode_Start").log();
EncryptBlobCipherAes265Ctr encryptor(
cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("HeaderAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString());
Reference<BlobCipherKey> tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
header.cipherHeaderDetails.baseCipherId);
ASSERT(tCipherKeyKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
header.flags.encryptMode -= 1;
// induce encryption header corruption - encryptionMode corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
headerCopy.singleAuthToken.authToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
// induce encrypted buffer payload corruption
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
TraceEvent("SingleAuthMode_Done").log();
}
// induce encryption header corruption - checksum mismatch
header.ciphertextChecksum += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_checksum_mismatch) {
throw;
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI
{
TraceEvent("MultiAuthMode_Start").log();
EncryptBlobCipherAes265Ctr encryptor(
cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("HeaderAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString());
Reference<BlobCipherKey> tCipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
header.cipherHeaderDetails.baseCipherId);
ASSERT(tCipherKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
header.ciphertextChecksum -= 1;
// induce encryption header corruption - encryptionMode corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - cipherText authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
// induce encryption header corruption - header authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
TraceEvent("MultiAuthMode_Done").log();
}
// Validate dropping encyrptDomainId cached keys
const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
cipherKeyCache.resetEncyrptDomainId(candidate);
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(candidate);
ASSERT(cachedKeys.empty());
@ -633,20 +1060,4 @@ TEST_CASE("flow/BlobCipher") {
return Void();
}
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
const int payloadLen,
const BlobCipherRandomSalt& salt,
Arena& arena) {
// FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate checksum
// Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the hmac digest.
HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt));
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
ASSERT(digest.size() >= sizeof(BlobCipherChecksum));
BlobCipherChecksum checksum;
memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum));
return checksum;
}
#endif // ENCRYPTION_ENABLED

View File

@ -33,6 +33,7 @@
#if ENCRYPTION_ENABLED
#include "flow/Arena.h"
#include "flow/EncryptUtils.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "flow/xxhash.h"
@ -45,15 +46,6 @@
#define AES_256_KEY_LENGTH 32
#define AES_256_IV_LENGTH 16
#define INVALID_DOMAIN_ID 0
#define INVALID_CIPHER_KEY_ID 0
using BlobCipherDomainId = uint64_t;
using BlobCipherRandomSalt = uint64_t;
using BlobCipherBaseKeyId = uint64_t;
using BlobCipherChecksum = uint64_t;
typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode;
// Encryption operations buffer management
// Approach limits number of copies needed during encryption or decryption operations.
@ -89,51 +81,94 @@ private:
// This header is persisted along with encrypted buffer, it contains information necessary
// to assist decrypting the buffers to serve read requests.
//
// The total space overhead is 56 bytes.
// The total space overhead is 96 bytes.
#pragma pack(push, 1) // exact fit - no padding
typedef struct BlobCipherEncryptHeader {
static constexpr int headerSize = 96;
union {
struct {
uint8_t size; // reading first byte is sufficient to determine header
// length. ALWAYS THE FIRST HEADER ELEMENT.
uint8_t headerVersion{};
uint8_t encryptMode{};
uint8_t _reserved[5]{};
uint8_t authTokenMode{};
uint8_t _reserved[4]{};
} flags;
uint64_t _padding{};
};
// Encyrption domain boundary identifier.
BlobCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier
BlobCipherBaseKeyId baseCipherId{};
// Random salt
BlobCipherRandomSalt salt{};
// Checksum of the encrypted buffer. It protects against 'tampering' of ciphertext as well 'bit rots/flips'.
BlobCipherChecksum ciphertextChecksum{};
// Initialization vector used to encrypt the payload.
uint8_t iv[AES_256_IV_LENGTH];
BlobCipherEncryptHeader();
// Cipher text encryption information
struct {
// Encyrption domain boundary identifier.
EncryptCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier
EncryptCipherBaseKeyId baseCipherId{};
// Random salt
EncryptCipherRandomSalt salt{};
// Initialization vector used to encrypt the payload.
uint8_t iv[AES_256_IV_LENGTH];
} cipherTextDetails;
struct {
// Encryption domainId for the header
EncryptCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier.
EncryptCipherBaseKeyId baseCipherId{};
} cipherHeaderDetails;
// Encryption header is stored as plaintext on a persistent storage to assist reconstruction of cipher-key(s) for
// reads. FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate 'authentication
// token' (crypto-secure) to protect against malicious tampering and/or bit rot/flip scenarios.
union {
// Encryption header support two modes of generation 'authentication tokens':
// 1) SingleAuthTokenMode: the scheme generates single crypto-secrure auth token to protect {cipherText +
// header} payload. Scheme is geared towards optimizing cost due to crypto-secure auth-token generation,
// however, on decryption client needs to be read 'header' + 'encrypted-buffer' to validate the 'auth-token'.
// The scheme is ideal for usecases where payload represented by the encryptionHeader is not large and it is
// desirable to minimize CPU/latency penalty due to crypto-secure ops, such as: CommitProxies encrypted inline
// transactions, StorageServer encrypting pages etc. 2) MultiAuthTokenMode: Scheme generates separate authTokens
// for 'encrypted buffer' & 'encryption-header'. The scheme is ideal where payload represented by
// encryptionHeader is large enough such that it is desirable to optimize cost of upfront reading full
// 'encrypted buffer', compared to reading only encryptionHeader and ensuring its sanity; for instance:
// backup-files.
struct {
// Cipher text authentication token
uint8_t cipherTextAuthToken[AUTH_TOKEN_SIZE]{};
uint8_t headerAuthToken[AUTH_TOKEN_SIZE]{};
} multiAuthTokens;
struct {
uint8_t authToken[AUTH_TOKEN_SIZE]{};
uint8_t _reserved[AUTH_TOKEN_SIZE]{};
} singleAuthToken;
};
BlobCipherEncryptHeader() {}
} BlobCipherEncryptHeader;
#pragma pack(pop)
// Ensure no struct-packing issues
static_assert(sizeof(BlobCipherEncryptHeader) == BlobCipherEncryptHeader::headerSize,
"BlobCipherEncryptHeader size mismatch");
// This interface is in-memory representation of CipherKey used for encryption/decryption information.
// It caches base encryption key properties as well as caches the 'derived encryption' key obtained by applying
// HMAC-SHA-256 derivation technique.
class BlobCipherKey : public ReferenceCounted<BlobCipherKey>, NonCopyable {
public:
BlobCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCiphId,
BlobCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCiphId,
const uint8_t* baseCiph,
int baseCiphLen);
uint8_t* data() const { return cipher.get(); }
uint64_t getCreationTime() const { return creationTime; }
BlobCipherDomainId getDomainId() const { return encryptDomainId; }
BlobCipherRandomSalt getSalt() const { return randomSalt; }
BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
EncryptCipherDomainId getDomainId() const { return encryptDomainId; }
EncryptCipherRandomSalt getSalt() const { return randomSalt; }
EncryptCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
int getBaseCipherLen() const { return baseCipherLen; }
uint8_t* rawCipher() const { return cipher.get(); }
uint8_t* rawBaseCipher() const { return baseCipher.get(); }
@ -147,23 +182,23 @@ public:
private:
// Encryption domain boundary identifier
BlobCipherDomainId encryptDomainId;
EncryptCipherDomainId encryptDomainId;
// Base encryption cipher key properties
std::unique_ptr<uint8_t[]> baseCipher;
int baseCipherLen;
BlobCipherBaseKeyId baseCipherId;
EncryptCipherBaseKeyId baseCipherId;
// Random salt used for encryption cipher key derivation
BlobCipherRandomSalt randomSalt;
EncryptCipherRandomSalt randomSalt;
// Creation timestamp for the derived encryption cipher key
uint64_t creationTime;
// Derived encryption cipher key
std::unique_ptr<uint8_t[]> cipher;
void initKey(const BlobCipherDomainId& domainId,
void initKey(const EncryptCipherDomainId& domainId,
const uint8_t* baseCiph,
int baseCiphLen,
const BlobCipherBaseKeyId& baseCiphId,
const BlobCipherRandomSalt& salt);
const EncryptCipherBaseKeyId& baseCiphId,
const EncryptCipherRandomSalt& salt);
void applyHmacSha256Derivation();
};
@ -190,37 +225,45 @@ private:
// required encryption key, however, CPs/SSs cache-miss would result in RPC to
// EncryptKeyServer to refresh the desired encryption key.
using BlobCipherKeyIdCacheMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>;
using BlobCipherKeyIdCacheMapCItr = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
using BlobCipherKeyIdCacheMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>;
using BlobCipherKeyIdCacheMapCItr =
std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
struct BlobCipherKeyIdCache : ReferenceCounted<BlobCipherKeyIdCache> {
public:
BlobCipherKeyIdCache();
explicit BlobCipherKeyIdCache(BlobCipherDomainId dId);
explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId);
// API returns the last inserted cipherKey.
// If none exists, 'encrypt_key_not_found' is thrown.
Reference<BlobCipherKey> getLatestCipherKey();
// API returns cipherKey corresponding to input 'baseCipherKeyId'.
// If none exists, 'encrypt_key_not_found' is thrown.
Reference<BlobCipherKey> getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId);
Reference<BlobCipherKey> getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId);
// API enables inserting base encryption cipher details to the BlobCipherKeyIdCache.
// Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey
// is treated as a NOP (success), however, an attempt to update cipherKey would throw
// 'encrypt_update_cipher' exception.
void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
void insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
// API cleanup the cache by dropping all cached cipherKeys
void cleanup();
// API returns list of all 'cached' cipherKeys
std::vector<Reference<BlobCipherKey>> getAllCipherKeys();
private:
BlobCipherDomainId domainId;
EncryptCipherDomainId domainId;
BlobCipherKeyIdCacheMap keyIdCache;
BlobCipherBaseKeyId latestBaseCipherKeyId;
EncryptCipherBaseKeyId latestBaseCipherKeyId;
};
using BlobCipherDomainCacheMap = std::unordered_map<BlobCipherDomainId, Reference<BlobCipherKeyIdCache>>;
using BlobCipherDomainCacheMap = std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKeyIdCache>>;
class BlobCipherKeyCache : NonCopyable {
public:
@ -228,21 +271,28 @@ public:
// The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable,
// attempting to re-insert same 'identical' cipherKey is treated as a NOP (success),
// however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception.
void insertCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId,
void insertCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen);
// API returns the last insert cipherKey for a given encyryption domain Id.
// If none exists, it would throw 'encrypt_key_not_found' exception.
Reference<BlobCipherKey> getLatestCipherKey(const BlobCipherDomainId& domainId);
Reference<BlobCipherKey> getLatestCipherKey(const EncryptCipherDomainId& domainId);
// API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple.
// If none exists, it would throw 'encrypt_key_not_found' exception.
Reference<BlobCipherKey> getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId);
Reference<BlobCipherKey> getCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId);
// API returns point in time list of all 'cached' cipherKeys for a given encryption domainId.
std::vector<Reference<BlobCipherKey>> getAllCiphers(const BlobCipherDomainId& domainId);
std::vector<Reference<BlobCipherKey>> getAllCiphers(const EncryptCipherDomainId& domainId);
// API enables dropping all 'cached' cipherKeys for a given encryption domain Id.
// Useful to cleanup cache if an encryption domain gets removed/destroyed etc.
void resetEncyrptDomainId(const BlobCipherDomainId domainId);
void resetEncyrptDomainId(const EncryptCipherDomainId domainId);
static BlobCipherKeyCache& getInstance() {
static BlobCipherKeyCache instance;
@ -262,14 +312,19 @@ private:
// This interface enables data block encryption. An invocation to encrypt() will
// do two things:
// 1) generate encrypted ciphertext for given plaintext input.
// 2) generate BlobCipherEncryptHeader (including the 'header checksum') and persit for decryption on reads.
// 2) generate BlobCipherEncryptHeader (including the 'header authTokens') and persit for decryption on reads.
class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted<EncryptBlobCipherAes265Ctr> {
public:
static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1;
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key, const uint8_t* iv, const int ivLen);
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* iv,
const int ivLen,
const EncryptAuthTokenMode mode);
~EncryptBlobCipherAes265Ctr();
Reference<EncryptBuf> encrypt(const uint8_t* plaintext,
const int plaintextLen,
BlobCipherEncryptHeader* header,
@ -277,7 +332,9 @@ public:
private:
EVP_CIPHER_CTX* ctx;
Reference<BlobCipherKey> cipherKey;
Reference<BlobCipherKey> textCipherKey;
Reference<BlobCipherKey> headerCipherKey;
EncryptAuthTokenMode authTokenMode;
uint8_t iv[AES_256_IV_LENGTH];
};
@ -286,20 +343,44 @@ private:
class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted<DecryptBlobCipherAes256Ctr> {
public:
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv);
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* iv);
~DecryptBlobCipherAes256Ctr();
Reference<EncryptBuf> decrypt(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena&);
// Enable caller to validate encryption header auth-token (if available) without needing to read the full encyrpted
// payload. The call is NOP unless header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI.
void verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena);
private:
EVP_CIPHER_CTX* ctx;
Reference<BlobCipherKey> textCipherKey;
Reference<BlobCipherKey> headerCipherKey;
bool headerAuthTokenValidationDone;
bool authTokensValidationDone;
void verifyEncryptBlobHeader(const uint8_t* cipherText,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena);
void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header);
void verifyAuthTokens(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena);
void verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena);
void verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena);
};
class HmacSha256DigestGen final : NonCopyable {
@ -313,9 +394,10 @@ private:
HMAC_CTX* ctx;
};
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
const int payloadLen,
const BlobCipherRandomSalt& salt,
Arena& arena);
StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena);
#endif // ENCRYPTION_ENABLED

66
flow/EncryptUtils.h Normal file
View File

@ -0,0 +1,66 @@
/*
* EncryptUtils.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ENCRYPT_UTILS_H
#define ENCRYPT_UTILS_H
#pragma once
#include <cstdint>
#include <limits>
#define ENCRYPT_INVALID_DOMAIN_ID 0
#define ENCRYPT_INVALID_CIPHER_KEY_ID 0
#define AUTH_TOKEN_SIZE 16
#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -1
#define ENCRYPT_HEADER_DOMAIN_ID -2
using EncryptCipherDomainId = int64_t;
using EncryptCipherBaseKeyId = uint64_t;
using EncryptCipherRandomSalt = uint64_t;
typedef enum {
ENCRYPT_CIPHER_MODE_NONE = 0,
ENCRYPT_CIPHER_MODE_AES_256_CTR = 1,
ENCRYPT_CIPHER_MODE_LAST = 2
} EncryptCipherMode;
static_assert(EncryptCipherMode::ENCRYPT_CIPHER_MODE_LAST <= std::numeric_limits<uint8_t>::max(),
"EncryptCipherMode value overflow");
// EncryptionHeader authentication modes
// 1. NONE - No 'authentication token' generation needed for EncryptionHeader i.e. no protection against header OR
// cipherText 'tampering' and/or bit rot/flip corruptions.
// 2. Single/Multi - Encyrption header would generate one or more 'authentication tokens' to protect the header against
// 'tempering' and/or bit rot/flip corruptions. Refer to BlobCipher.h for detailed usage recommendations.
// 3. LAST - Invalid mode, used for static asserts.
typedef enum {
ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE = 0,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE = 1,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI = 2,
ENCRYPT_HEADER_AUTH_TOKEN_LAST = 3 // Always the last element
} EncryptAuthTokenMode;
static_assert(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_LAST <= std::numeric_limits<uint8_t>::max(),
"EncryptHeaderAuthToken value overflow");
#endif

View File

@ -963,7 +963,7 @@ struct DynamicFieldBase {
if (getDerivedTypeName() == metricTypeName<T>())
return (DynamicField<T>*)this;
TraceEvent(SevWarnAlways, "ScopeEventFieldTypeMismatch")
TraceEvent(g_network->isSimulated() ? SevError : SevWarnAlways, "ScopeEventFieldTypeMismatch")
.detail("EventType", eventType.toString())
.detail("FieldName", fieldName().toString())
.detail("OldType", getDerivedTypeName().toString())

View File

@ -35,6 +35,7 @@
#define TRACE_DEFAULT_ROLL_SIZE (10 << 20)
#define TRACE_DEFAULT_MAX_LOGS_SIZE (10 * TRACE_DEFAULT_ROLL_SIZE)
#define PRINTABLE_COMPRESS_NULLS 0
inline int fastrand() {
static int g_seed = 0;
@ -343,20 +344,37 @@ struct TraceableStringImpl : std::true_type {
}
std::string result;
result.reserve(size - nonPrintables + (nonPrintables * 4) + numBackslashes);
int numNull = 0;
for (auto iter = TraceableString<T>::begin(value); !TraceableString<T>::atEnd(value, iter); ++iter) {
if (*iter == '\\') {
if (numNull > 0) {
result += format("[%d]", numNull);
numNull = 0;
}
result.push_back('\\');
result.push_back('\\');
} else if (isPrintable(*iter)) {
if (numNull > 0) {
result += format("[%d]", numNull);
numNull = 0;
}
result.push_back(*iter);
} else {
const uint8_t byte = *iter;
result.push_back('\\');
result.push_back('x');
result.push_back(base16Char(byte / 16));
result.push_back(base16Char(byte));
if (PRINTABLE_COMPRESS_NULLS && byte == 0) {
numNull++;
} else {
result.push_back('\\');
result.push_back('x');
result.push_back(base16Char(byte / 16));
result.push_back(base16Char(byte));
}
}
}
if (numNull > 0) {
result += format("[%d]", numNull);
numNull = 0;
}
return result;
}
};

View File

@ -690,7 +690,7 @@ TEST_CASE("/flow/Tracing/AddLinks") {
return Void();
};
uint64_t swapUint16BE(uint8_t* index) {
uint16_t swapUint16BE(uint8_t* index) {
uint16_t value;
memcpy(&value, index, sizeof(value));
return fromBigEndian16(value);
@ -718,6 +718,26 @@ std::string readMPString(uint8_t* index, int len) {
return reinterpret_cast<char*>(data);
}
std::string readMPString(uint8_t* index) {
auto len = 0;
switch (*index) {
case 0xda:
index++; // read the size in the next 2 bytes
len = swapUint16BE(index);
index += 2; // move index past the size bytes
break;
default:
// We & out the bits here that contain the length the initial 3 higher order bits are
// to signify this is a string of len <= 31 chars.
len = static_cast<uint8_t>(*index & 0b00011111);
index++;
}
uint8_t data[len + 1];
std::copy(index, index + len, data);
data[len] = '\0';
return reinterpret_cast<char*>(data);
}
// Windows doesn't like lack of header and declaration of constructor for FastUDPTracer
#ifndef WIN32
TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
@ -754,9 +774,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
ASSERT(data[46] == 0xcf);
ASSERT(swapUint64BE(&data[47]) == 1);
// Read and verify span name
ASSERT(data[55] == (0b10100000 | strlen("encoded_span")));
ASSERT(strncmp(readMPString(&data[56], strlen("encoded_span")).c_str(), "encoded_span", strlen("encoded_span")) ==
0);
ASSERT(readMPString(&data[55]) == "encoded_span");
// Verify begin/end is encoded, we don't care about the values
ASSERT(data[68] == 0xcb);
ASSERT(data[77] == 0xcb);
@ -795,10 +813,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
ASSERT(data[0] == 0b10011110); // 14 element array.
// We don't care about the next 54 bytes as there is no parent and a randomly assigned Trace and SpanID
// Read and verify span name
ASSERT(data[55] == (0b10100000 | strlen("encoded_span_3")));
ASSERT(strncmp(readMPString(&data[56], strlen("encoded_span_3")).c_str(),
"encoded_span_3",
strlen("encoded_span_3")) == 0);
ASSERT(readMPString(&data[55]) == "encoded_span_3");
// Verify begin/end is encoded, we don't care about the values
ASSERT(data[70] == 0xcb);
ASSERT(data[79] == 0xcb);
@ -818,43 +833,32 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
ASSERT(swapUint64BE(&data[112]) == 400);
// Events
ASSERT(data[120] == 0b10010001); // empty
ASSERT(data[121] == (0b10100000 | strlen("event1")));
ASSERT(strncmp(readMPString(&data[122], strlen("event1")).c_str(), "event1", strlen("event1")) == 0);
ASSERT(readMPString(&data[121]) == "event1");
ASSERT(data[128] == 0xcb);
ASSERT(swapDoubleBE(&data[129]) == 100.101);
// Events Attributes
ASSERT(data[137] == 0b10000001); // single k/v pair
ASSERT(data[138] == 0b10100011); // length of key string "foo" == 3
ASSERT(strncmp(readMPString(&data[139], strlen("foo")).c_str(), "foo", strlen("foo")) == 0);
ASSERT(data[142] == 0b10100011); // length of key string "bar" == 3
ASSERT(strncmp(readMPString(&data[143], strlen("bar")).c_str(), "bar", strlen("bar")) == 0);
ASSERT(readMPString(&data[138]) == "foo");
ASSERT(readMPString(&data[142]) == "bar");
// Attributes
ASSERT(data[146] == 0b10000010); // two k/v pair
// Reconstruct map from MessagePack wire format data and verify.
std::unordered_map<std::string, std::string> attributes;
auto index = 147;
// We & out the bits here that contain the length the initial 4 higher order bits are
// to signify this is a string of len <= 31 chars.
auto firstKeyLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto firstKey = readMPString(&data[index], firstKeyLength);
index += firstKeyLength;
auto firstValueLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto firstValue = readMPString(&data[index], firstValueLength);
index += firstValueLength;
auto firstKey = readMPString(&data[index]);
index += firstKey.length() + 1; // +1 for control byte
auto firstValue = readMPString(&data[index]);
index += firstValue.length() + 1; // +1 for control byte
attributes[firstKey] = firstValue;
auto secondKeyLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto secondKey = readMPString(&data[index], secondKeyLength);
index += secondKeyLength;
auto secondValueLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto secondValue = readMPString(&data[index], secondValueLength);
auto secondKey = readMPString(&data[index]);
index += secondKey.length() + 1; // +1 for control byte
auto secondValue = readMPString(&data[index]);
attributes[secondKey] = secondValue;
// We don't know what the value for address will be, so just verify it is in the map.
ASSERT(attributes.find("address") != attributes.end());
ASSERT(strncmp(attributes["operation"].c_str(), "grv", strlen("grv")) == 0);
ASSERT(attributes["operation"] == "grv");
request.reset();
@ -876,9 +880,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
// We don't care about the next 54 bytes as there is no parent and a randomly assigned Trace and SpanID
// Read and verify span name
ASSERT(data[55] == 0xda);
auto locationLength = swapUint16BE(&data[56]);
ASSERT(locationLength == strlen(longString));
ASSERT(strncmp(readMPString(&data[58], locationLength).c_str(), longString, strlen(longString)) == 0);
ASSERT(readMPString(&data[55]) == longString);
return Void();
};
#endif

View File

@@ -73,3 +73,11 @@ T waitNext(const FutureStream<T>&);
#ifdef _MSC_VER
#pragma warning(disable : 4355) // 'this' : used in base member initializer list
#endif
// Currently, #ifdef can't be used inside actors, so define no-op versions of these valgrind
// functions if valgrind is not defined
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
#define VALGRIND_CHECK_MEM_IS_DEFINED(x, y) 0
#endif

View File

@@ -88,6 +88,13 @@ ERROR( blob_granule_transaction_too_old, 1064, "Read version is older than blob
ERROR( blob_manager_replaced, 1065, "This blob manager has been replaced." )
ERROR( change_feed_popped, 1066, "Tried to read a version older than what has been popped from the change feed" )
ERROR( remote_kvs_cancelled, 1067, "The remote key-value store is cancelled" )
ERROR( page_header_wrong_page_id, 1068, "Page header does not match location on disk" )
ERROR( page_header_checksum_failed, 1069, "Page header checksum failed" )
ERROR( page_header_version_not_supported, 1070, "Page header version is not supported" )
ERROR( page_encoding_not_supported, 1071, "Page encoding type is not supported or not valid" )
ERROR( page_decoding_failed, 1072, "Page content decoding failed" )
ERROR( unexpected_encoding_type, 1073, "Page content decoding failed" )
ERROR( encryption_key_not_found, 1074, "Encryption key not found" )
ERROR( broken_promise, 1100, "Broken promise" )
ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" )
@@ -290,14 +297,14 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string")
// 3XXX - Encryption operations errors
ERROR( encrypt_ops_error, 3000, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired")
ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch")
ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id")
// 27XX - Encryption operations errors
ERROR( encrypt_ops_error, 2700, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 2701, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 2702, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 2703, "Expected encryption key TTL has expired")
ERROR( encrypt_header_authtoken_mismatch, 2704, "Encryption header authentication token mismatch")
ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 2706, "Invalid encryption domainId or encryption cipher key id")
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
logAntiQuorum = 0
testTitle=SubmitBackup

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
;write 1000 Keys ending with even numbers
testTitle=SnapTestPre
clearAfterTest=false

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
;write 1000 Keys ending with even numbers
testTitle=SnapTestPre
clearAfterTest=false

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
;write 1000 Keys ending with even number
testTitle=SnapSimplePre
clearAfterTest=false

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
[[test]]
testTitle = 'SubmitBackup'
simBackupAgents= 'BackupToFile'

View File

@@ -10,6 +10,7 @@ waitForQuiescenceBegin=false
testName = 'ConfigureDatabase'
testDuration = 300.0
waitStoreTypeCheck = true
storageMigrationCompatibleConf = true
[[test.workload]]
testName = 'RandomClogging'

View File

@@ -1,4 +1,4 @@
storageEngineExcludeTypes=-1,-2
storageEngineExcludeTypes=-1,-2,3
maxTLogVersion=6
disableTss=true
disableHostname=true

View File

@@ -10,6 +10,7 @@ waitForQuiescenceBegin=false
testName = 'ConfigureDatabase'
testDuration = 300.0
waitStoreTypeCheck = true
storageMigrationCompatibleConf = true
[[test.workload]]
testName = 'RandomClogging'