Merge branch 'main' into granule_merging_batch
This commit is contained in:
commit
78b6a96006
|
@ -467,7 +467,9 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only
|
|||
add_test(NAME fdb_c_shim_library_tests
|
||||
COMMAND $<TARGET_FILE:Python::Interpreter> ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py
|
||||
--build-dir ${CMAKE_BINARY_DIR}
|
||||
--source-dir ${CMAKE_SOURCE_DIR}
|
||||
--unit-tests-bin $<TARGET_FILE:fdb_c_shim_unit_tests>
|
||||
--api-tester-bin $<TARGET_FILE:fdb_c_shim_api_tester>
|
||||
--api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
|
||||
)
|
||||
|
||||
endif() # End Linux only
|
||||
|
|
|
@ -87,14 +87,12 @@ class FdbCShimTests:
|
|||
self.build_dir = Path(args.build_dir).resolve()
|
||||
assert self.build_dir.exists(), "{} does not exist".format(args.build_dir)
|
||||
assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir)
|
||||
self.source_dir = Path(args.source_dir).resolve()
|
||||
assert self.source_dir.exists(), "{} does not exist".format(args.source_dir)
|
||||
assert self.source_dir.is_dir(), "{} is not a directory".format(args.source_dir)
|
||||
self.api_tester_bin = self.build_dir.joinpath("bin", "fdb_c_shim_api_tester")
|
||||
assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tester_bin)
|
||||
self.unit_tests_bin = self.build_dir.joinpath("bin", "fdb_c_shim_unit_tests")
|
||||
self.unit_tests_bin = Path(args.unit_tests_bin).resolve()
|
||||
assert self.unit_tests_bin.exists(), "{} does not exist".format(self.unit_tests_bin)
|
||||
self.api_test_dir = self.source_dir.joinpath("bindings", "c", "test", "apitester", "tests")
|
||||
self.api_tester_bin = Path(args.api_tester_bin).resolve()
|
||||
assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tests_bin)
|
||||
self.api_test_dir = Path(args.api_test_dir).resolve()
|
||||
assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir)
|
||||
self.downloader = FdbBinaryDownloader(args.build_dir)
|
||||
# binary downloads are currently available only for x86_64
|
||||
self.platform = platform.machine()
|
||||
|
@ -196,13 +194,12 @@ if __name__ == "__main__":
|
|||
help="FDB build directory",
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--source-dir",
|
||||
"-s",
|
||||
metavar="SOURCE_DIRECTORY",
|
||||
help="FDB source directory",
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument('--unit-tests-bin', type=str,
|
||||
help='Path to the fdb_c_shim_unit_tests executable.')
|
||||
parser.add_argument('--api-tester-bin', type=str,
|
||||
help='Path to the fdb_c_shim_api_tester executable.')
|
||||
parser.add_argument('--api-test-dir', type=str,
|
||||
help='Path to a directory with api test definitions.')
|
||||
args = parser.parse_args()
|
||||
test = FdbCShimTests(args)
|
||||
test.run_tests()
|
||||
|
|
|
@ -628,6 +628,9 @@ def tenants(logger):
|
|||
assert(len(json_output['tenant']) == 2)
|
||||
assert('id' in json_output['tenant'])
|
||||
assert('prefix' in json_output['tenant'])
|
||||
assert(len(json_output['tenant']['prefix']) == 2)
|
||||
assert('base64' in json_output['tenant']['prefix'])
|
||||
assert('printable' in json_output['tenant']['prefix'])
|
||||
|
||||
output = run_fdbcli_command('usetenant')
|
||||
assert output == 'Using the default tenant'
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
import fdb
|
||||
import sys
|
||||
import json
|
||||
import base64
|
||||
from fdb.tuple import pack
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -65,11 +66,11 @@ def test_tenant_operations(db):
|
|||
|
||||
t1_entry = tenant_list[0].value
|
||||
t1_json = json.loads(t1_entry)
|
||||
p1 = t1_json['prefix'].encode('utf8')
|
||||
p1 = base64.b64decode(t1_json['prefix']['base64'])
|
||||
|
||||
t2_entry = tenant_list[1].value
|
||||
t2_json = json.loads(t2_entry)
|
||||
p2 = t2_json['prefix'].encode('utf8')
|
||||
p2 = base64.b64decode(t2_json['prefix']['base64'])
|
||||
|
||||
tenant1 = db.open_tenant(b'tenant1')
|
||||
tenant2 = db.open_tenant(b'tenant2')
|
||||
|
@ -80,12 +81,12 @@ def test_tenant_operations(db):
|
|||
|
||||
tenant1_entry = db[b'\xff\xff/management/tenant/map/tenant1']
|
||||
tenant1_json = json.loads(tenant1_entry)
|
||||
prefix1 = tenant1_json['prefix'].encode('utf8')
|
||||
prefix1 = base64.b64decode(tenant1_json['prefix']['base64'])
|
||||
assert prefix1 == p1
|
||||
|
||||
tenant2_entry = db[b'\xff\xff/management/tenant/map/tenant2']
|
||||
tenant2_json = json.loads(tenant2_entry)
|
||||
prefix2 = tenant2_json['prefix'].encode('utf8')
|
||||
prefix2 = base64.b64decode(tenant2_json['prefix']['base64'])
|
||||
assert prefix2 == p2
|
||||
|
||||
assert tenant1[b'tenant_test_key'] == b'tenant1'
|
||||
|
|
|
@ -9,7 +9,7 @@ function(compile_boost)
|
|||
|
||||
# Configure bootstrap command
|
||||
set(BOOTSTRAP_COMMAND "./bootstrap.sh")
|
||||
set(BOOTSTRAP_LIBRARIES "context,filesystem")
|
||||
set(BOOTSTRAP_LIBRARIES "context,filesystem,iostreams")
|
||||
|
||||
set(BOOST_CXX_COMPILER "${CMAKE_CXX_COMPILER}")
|
||||
# Can't build Boost with Intel compiler, use clang instead.
|
||||
|
@ -65,7 +65,8 @@ function(compile_boost)
|
|||
UPDATE_COMMAND ""
|
||||
BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp"
|
||||
"${BOOST_INSTALL_DIR}/lib/libboost_context.a"
|
||||
"${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a")
|
||||
"${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a"
|
||||
"${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a")
|
||||
|
||||
add_library(${COMPILE_BOOST_TARGET}_context STATIC IMPORTED)
|
||||
add_dependencies(${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}Project)
|
||||
|
@ -75,9 +76,13 @@ function(compile_boost)
|
|||
add_dependencies(${COMPILE_BOOST_TARGET}_filesystem ${COMPILE_BOOST_TARGET}Project)
|
||||
set_target_properties(${COMPILE_BOOST_TARGET}_filesystem PROPERTIES IMPORTED_LOCATION "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a")
|
||||
|
||||
add_library(${COMPILE_BOOST_TARGET}_iostreams STATIC IMPORTED)
|
||||
add_dependencies(${COMPILE_BOOST_TARGET}_iostreams ${COMPILE_BOOST_TARGET}Project)
|
||||
set_target_properties(${COMPILE_BOOST_TARGET}_iostreams PROPERTIES IMPORTED_LOCATION "${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a")
|
||||
|
||||
add_library(${COMPILE_BOOST_TARGET} INTERFACE)
|
||||
target_include_directories(${COMPILE_BOOST_TARGET} SYSTEM INTERFACE ${BOOST_INSTALL_DIR}/include)
|
||||
target_link_libraries(${COMPILE_BOOST_TARGET} INTERFACE ${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}_filesystem)
|
||||
target_link_libraries(${COMPILE_BOOST_TARGET} INTERFACE ${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}_filesystem ${COMPILE_BOOST_TARGET}_iostreams)
|
||||
|
||||
endfunction(compile_boost)
|
||||
|
||||
|
@ -103,11 +108,11 @@ set(Boost_USE_STATIC_LIBS ON)
|
|||
if (UNIX AND CMAKE_CXX_COMPILER_ID MATCHES "Clang$")
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0_clang)
|
||||
set(BOOST_HINT_PATHS /opt/boost_1_78_0_clang)
|
||||
message(STATUS "Using Clang version of boost::context and boost::filesystem")
|
||||
message(STATUS "Using Clang version of boost::context boost::filesystem and boost::iostreams")
|
||||
else ()
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0)
|
||||
set(BOOST_HINT_PATHS /opt/boost_1_78_0)
|
||||
message(STATUS "Using g++ version of boost::context and boost::filesystem")
|
||||
message(STATUS "Using g++ version of boost::context boost::filesystem and boost::iostreams")
|
||||
endif ()
|
||||
|
||||
if(BOOST_ROOT)
|
||||
|
@ -119,18 +124,18 @@ if(WIN32)
|
|||
# properly for config mode. So we use the old way on Windows
|
||||
# find_package(Boost 1.72.0 EXACT QUIET REQUIRED CONFIG PATHS ${BOOST_HINT_PATHS})
|
||||
# I think depending on the cmake version this will cause weird warnings
|
||||
find_package(Boost 1.72 COMPONENTS filesystem)
|
||||
find_package(Boost 1.72 COMPONENTS filesystem iostreams)
|
||||
add_library(boost_target INTERFACE)
|
||||
target_link_libraries(boost_target INTERFACE Boost::boost Boost::filesystem)
|
||||
target_link_libraries(boost_target INTERFACE Boost::boost Boost::filesystem Boost::iostreams)
|
||||
return()
|
||||
endif()
|
||||
|
||||
find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem CONFIG PATHS ${BOOST_HINT_PATHS})
|
||||
set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores any installed boost")
|
||||
|
||||
if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND NOT FORCE_BOOST_BUILD)
|
||||
if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND Boost_iostreams_FOUND AND NOT FORCE_BOOST_BUILD)
|
||||
add_library(boost_target INTERFACE)
|
||||
target_link_libraries(boost_target INTERFACE Boost::boost Boost::context Boost::filesystem)
|
||||
target_link_libraries(boost_target INTERFACE Boost::boost Boost::context Boost::filesystem Boost::iostreams)
|
||||
elseif(WIN32)
|
||||
message(FATAL_ERROR "Could not find Boost")
|
||||
else()
|
||||
|
|
|
@ -9,6 +9,14 @@ define_property(TARGET PROPERTY COVERAGE_FILTERS
|
|||
expression in this list will be ignored when the coverage.target.xml file is \
|
||||
generated. This property is set through the add_flow_target function.")
|
||||
|
||||
if(WIN32)
|
||||
set(compilation_unit_macro_default OFF)
|
||||
else()
|
||||
set(compilation_unit_macro_default ON)
|
||||
endif()
|
||||
|
||||
set(PASS_COMPILATION_UNIT "${compilation_unit_macro_default}" CACHE BOOL
|
||||
"Pass path to compilation unit as macro to each compilation unit (useful for code probes)")
|
||||
|
||||
function(generate_coverage_xml)
|
||||
if(NOT (${ARGC} EQUAL "1"))
|
||||
|
@ -259,6 +267,11 @@ function(add_flow_target)
|
|||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
if(PASS_COMPILATION_UNIT)
|
||||
foreach(s IN LISTS sources)
|
||||
set_source_files_properties("${s}" PROPERTIES COMPILE_DEFINITIONS "COMPILATION_UNIT=${s}")
|
||||
endforeach()
|
||||
endif()
|
||||
if(AFT_EXECUTABLE)
|
||||
set(strip_target ON)
|
||||
set(target_type exec)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from argparse import ArgumentParser
|
||||
import glob
|
||||
import io
|
||||
|
@ -31,11 +33,15 @@ class JoshuaBuilder:
|
|||
if os.path.exists(arg):
|
||||
if not os.path.relpath(arg, self.build_dir).startswith(".."):
|
||||
relpath = "build/" + os.path.relpath(arg, self.build_dir)
|
||||
self.files[arg] = relpath
|
||||
# Avoid packaging the full build directory.
|
||||
if relpath != "build/.":
|
||||
self.files[arg] = relpath
|
||||
return relpath
|
||||
elif not os.path.relpath(arg, self.src_dir).startswith(".."):
|
||||
relpath = "src/" + os.path.relpath(arg, self.src_dir)
|
||||
self.files[arg] = relpath
|
||||
# Avoid packaging the full source directory.
|
||||
if relpath != "src/.":
|
||||
self.files[arg] = relpath
|
||||
return relpath
|
||||
elif os.access(arg, os.X_OK):
|
||||
# Hope it's on the path
|
||||
|
@ -61,8 +67,7 @@ class JoshuaBuilder:
|
|||
def write_tarball(self, output, joshua_test):
|
||||
with tarfile.open(output, "w:gz") as tar:
|
||||
for file, arcfile in self.files.items():
|
||||
if not os.path.isdir(file):
|
||||
self._add_file(tar, file, arcfile)
|
||||
self._add_file(tar, file, arcfile)
|
||||
tarinfo = tarfile.TarInfo("joshua_test")
|
||||
tarinfo.mode = 0o755
|
||||
joshua_bytes = joshua_test.encode("utf-8")
|
||||
|
@ -114,6 +119,7 @@ Unknown arguments are forwarded to ctest, so you may use -R to filter tests e.g.
|
|||
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbcli"))
|
||||
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbmonitor"))
|
||||
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbserver"))
|
||||
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/mkcert"))
|
||||
if platform.system() == "Darwin":
|
||||
joshua_builder.add_arg(os.path.join(args.build_dir, "lib/libfdb_c.dylib"))
|
||||
else:
|
||||
|
|
|
@ -87009,7 +87009,7 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin(
|
|||
}
|
||||
sqlite3_query_plan[nQPlan] = 0;
|
||||
nQPlan = 0;
|
||||
#endif /* SQLITE_TEST // Testing and debugging use only */
|
||||
#endif /* SQLITE_TEST // Testing and debugging use only */");
|
||||
|
||||
/* Record the continuation address in the WhereInfo structure. Then
|
||||
** clean up and return.
|
||||
|
|
|
@ -0,0 +1,227 @@
|
|||
# Load Balancing in FoundationDB
|
||||
|
||||
## Introduction
|
||||
|
||||
FoundationDB is a distributed key-value database. A FoundationDB cluster is constituted by one or more processes over one or more physical machines, where each process is a *worker* and takes certain *role*s, such as coordinator, proxy, TLog, storage server, etc., in the system.
|
||||
|
||||
The interprocess communications (IPC) between the processes are supported by the [`flow`](https://github.com/apple/foundationdb/tree/main/flow) infrastructure. In the `flow` context, each process will expose one or more *interface*(s). Each interface is able to accept a given type of *request*, and *reply* with `Void`, requested data, or an error. The interfaces and the corresponding request/reply pairs form the IPC protocol of FoundationDB.
|
||||
|
||||
In many cases, the same request can be processed by multiple processes, e.g. all commit proxies can accept commit requests, and multiple storage server processes can provide values for a given key in double/triple redundancy mode. A load balancer (LB) can be used to distribute the requests over the possible interfaces, preventing one or a few processes from getting overloaded. The LB is also able to react when one or more interfaces are (temporarily) unavailable by retrying, or by re-routing the request to other candidates. The interface candidates are also known as *alternative*s.
|
||||
|
||||
Two LBs are provided in FoundationDB: `basicLoadBalance` and `loadBalance`, both defined in [`LoadBalance.actor.h`](https://github.com/apple/foundationdb/blob/main/fdbrpc/include/fdbrpc/LoadBalance.actor.h). The `basicLoadBalance` is a simple load balancer in which each interface is equally likely to be chosen; while the `loadBalance` accepts a model object, which provides [datacenter](https://apple.github.io/foundationdb/configuration.html#configuring-regions) (DC) aware balancing algorithms, allowing requests to be sent to interfaces in the same DC.
|
||||
|
||||
In the following sections, the two LBs will be discussed in detail.
|
||||
|
||||
## `basicLoadBalance`
|
||||
|
||||
`basicLoadBalance` implements a simple load balancing algorithm. It applies to
|
||||
|
||||
* Commit proxy interface
|
||||
* GetReadVersion proxy interface
|
||||
* ConfigFollower interface
|
||||
|
||||
Here, the interfaces are assumed to be always *fresh*, i.e. the list of the servers is fixed.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
H0{Has alternatives?}
|
||||
H1[Pick an alternative]
|
||||
H2[Backoff]
|
||||
H3[Request]
|
||||
H4([Reply])
|
||||
H5([Error])
|
||||
H6([Never])
|
||||
H((Start)) --> H0
|
||||
H0 --No--> H6
|
||||
H0 --Yes--> H1
|
||||
H1 --No healthy alternatives--> H2 --Retry--> H1
|
||||
H1 --Has alternative--> H3 --Success--> H4
|
||||
H3 --Exception--> H5
|
||||
H3 --Broken Promise --> H2
|
||||
```
|
||||
|
||||
### Alternative pick algorithm
|
||||
|
||||
In `basicLoadBalance`, a *best* alternative is picked and used at the beginning. At this stage, this alternative is randomly picked among all alternatives. If the best alternative does not work, it will iteratively try other interfaces, see [here](#picking-up-an-alternative-in-basic-load-balancing-algorithm).
|
||||
|
||||
## `loadBalance`
|
||||
|
||||
`loadBalance` provides a more sophisticated implementation of load balancing. In addition to the basic load balancing, it also provides a variety of features:
|
||||
|
||||
* Support for Test Storage Server ([TSS](https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/tss.rst))
|
||||
* Datacenter-aware alternative election
|
||||
* Recording the latency and penalty from interfaces, and [prioritizing the interfaces based on previously stored data](#with-queuemodel).
|
||||
* Able to handle timeouts and SS exceptions with retries.
|
||||
|
||||
Currently it is used for
|
||||
|
||||
* Storage Server interface
|
||||
* BlobWorker interface
|
||||
|
||||
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
H((Start))
|
||||
H0{Has alternatives?}
|
||||
H1[Choose initial candidates]
|
||||
H4([Never])
|
||||
H5[pick an alternative]
|
||||
H6[Send request]
|
||||
H7[Wait for available alternative]
|
||||
H8([Response])
|
||||
H9([All alternatives failed])
|
||||
|
||||
H --> H0 --No--> H4
|
||||
H0 --Yes--> H1
|
||||
H1 --> H5
|
||||
H5 --Has alternative--> H6
|
||||
H5 --No alternative-->H7
|
||||
H6 --Success--> H8
|
||||
H6 --Failure--> H5
|
||||
H7 --At least one alternative--> H5
|
||||
H7 --> H9
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
* The response could be either a reply, or an `Error`, e.g. `process_behind` or `request_maybe_delivered`.
|
||||
|
||||
### Choose initial candidates
|
||||
|
||||
Two initial candidates will be picked before the requests start. They will be selected as the first two alternatives for the load balancer. If both of them fail, other alternatives are used in a round-robin way.
|
||||
|
||||
#### No `QueueModel`
|
||||
|
||||
If no `QueueModel` is provided, the initial candidates are picked randomly. The first candidate, or the *best* alternative, will be the one in the same DC, if possible.
|
||||
|
||||
#### With `QueueModel`
|
||||
|
||||
`QueueModel` holds information about each candidate related to future version, latency and penalty.
|
||||
|
||||
* If the storage server is returning a future version error, it is marked as not available until some certain time.
|
||||
* Penalty is reported by storage server in each response (see `storageserver.actor.cpp:StorageServer::getPenalty`). It is determined by the write queue length and the durability lagging.
|
||||
|
||||
If `QueueModel` exists, the candidates will be picked based on the penalty. Workers with high penalties will be avoided when picking the first two candidates.
|
||||
|
||||
### Pick an alternative
|
||||
|
||||
The alternatives are chosen in a round-robin way when the first two candidates fail. If all alternatives failed, a flag is set, and if the next request fails with `process_behind`, the caller will receive the `process_behind` error.
|
||||
|
||||
### Send requests to workers
|
||||
|
||||
Here it is assumed that there is at least one alternative available. If no alternative is available, the LB will wait.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
H((start))
|
||||
H0{Is first request}
|
||||
H1[Send first request]
|
||||
H2([Response])
|
||||
H3[Pick up next alternative]
|
||||
H4[Send additional request]
|
||||
|
||||
H --> H3
|
||||
H3 -->H0
|
||||
H0 --Yes--> H1
|
||||
H1 --Success--> H2
|
||||
H1 --Timeout--> H3
|
||||
H0 --No--> H4
|
||||
H4 --First request succeed--> H2
|
||||
H4 --Second request succeed--> H2
|
||||
H4 --Additional request failed--> H3
|
||||
```
|
||||
|
||||
The first request has a timeout option. If the LB is not able to retrieve the response within the timeout, more requests will be sent to secondary and other available interfaces. If the first request failed, it is reset and the next request will be considered as the first request. Certain types of errors can also be returned as a response, e.g. `request_may_be_delivered` or `process_behind`, which may not trigger a load-balancer retry.
|
||||
|
||||
### Wait for available alternative
|
||||
|
||||
When there are no alternatives available, the load balancer may wait until at least one interface is up.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
H0((start))
|
||||
H1{Is first request in-flight}
|
||||
H2[Wait for the first request]
|
||||
H3([Response])
|
||||
H4([Retry])
|
||||
H5[Wait for alternatives]
|
||||
H6([all_alternatives_failed])
|
||||
|
||||
H0 --> H1
|
||||
H1 --Yes--> H2
|
||||
H1 --No--> H5
|
||||
H5 --Timeout-->H6
|
||||
H5 --Success-->H4
|
||||
H2 --Success-->H3
|
||||
H2 --Failed-->H4
|
||||
```
|
||||
|
||||
Note that "Wait for alternatives" will only time out if the alternatives are always not fresh, i.e. this only happens when accessing storage servers. The LB will throw `all_alternatives_failed` upon timeout in this case.
|
||||
|
||||
#### Requests
|
||||
|
||||
Original requests in `loadBalancer` are wrapped by `LoadBalance.actor.h:RequestData`. It provides the following additional operations besides the original `flow` request:
|
||||
|
||||
* TSS support if `QueueModel` is available
|
||||
* Translate some errors into `maybe_delivered`, `process_behind` or retries
|
||||
* Update the `QueueModel` information including latency, penalty, etc.
|
||||
|
||||
## Appendix
|
||||
|
||||
### Picking an alternative in basic load balancing algorithm
|
||||
|
||||
The following script simulates the alternative-picking algorithm. The chosen alternatives will be printed out one by one. The `loadBalance` function uses a similar approach, though the interfaces in the same DC are used first.
|
||||
|
||||
```python
|
||||
#! /usr/bin/env python3
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
class Alternatives:
|
||||
|
||||
def __init__(self, num_alternatives):
|
||||
self._size = num_alternatives
|
||||
|
||||
def size(self):
|
||||
return self._size
|
||||
|
||||
def get_best(self):
|
||||
return random.randint(0, self._size - 1)
|
||||
|
||||
|
||||
# Entry
|
||||
NUM_ALTERNATIVES = 10
|
||||
alts = Alternatives(NUM_ALTERNATIVES)
|
||||
|
||||
best_alt = alts.get_best()
|
||||
next_alt = random.randint(0, alts.size() - 2)
|
||||
if next_alt >= best_alt:
|
||||
next_alt += 1
|
||||
start_alt = next_alt
|
||||
start_distance = (best_alt + alts.size() - start_alt) % alts.size()
|
||||
use_alt = None
|
||||
|
||||
print("best_alt = {}".format(best_alt))
|
||||
print("start_alt = {}".format(start_alt))
|
||||
print("start_distance = {}".format(start_distance))
|
||||
|
||||
while True:
|
||||
for alt_num in range(0, alts.size()):
|
||||
use_alt = next_alt
|
||||
if next_alt == start_alt:
|
||||
print(" Going back to the start_alt")
|
||||
use_alt = best_alt
|
||||
elif (next_alt + alts.size() - start_alt) % alts.size() <= start_distance:
|
||||
print(" Entering start_distance")
|
||||
use_alt = (next_alt + alts.size() - 1) % alts.size()
|
||||
|
||||
print("Attempting alt: {}".format(use_alt))
|
||||
|
||||
# Next loop
|
||||
next_alt = (next_alt + 1) % alts.size()
|
||||
time.sleep(.2)
|
||||
```
|
||||
|
Binary file not shown.
|
@ -69,10 +69,11 @@ When a data distribution role is created, it recovers the states of the previous
|
|||
### When to move keys?
|
||||
|
||||
Keys can be moved from a server to another for several reasons:
|
||||
(1) DD moves keys from overutilized servers to underutilized servers, where a server’s utilization is defined as the server’s disk usage;
|
||||
(2) DD splits or merges shards in order to rebalance the disk usage of servers;
|
||||
(3) DD removes redundant teams when the team number is larger than the desired number;
|
||||
(4) DD repairs the replication factor by duplicate shards from a server to another when servers in a team fail.
|
||||
(1) DD moves keys from disk-overutilized servers to disk-underutilized servers, where a server’s disk-utilization is defined as the server’s disk space usage;
|
||||
(2) DD moves keys from read-busy servers to read-cold servers if read-aware data distribution is enabled;
|
||||
(3) DD splits or merges shards in order to rebalance the disk usage of servers;
|
||||
(4) DD removes redundant teams when the team number is larger than the desired number;
|
||||
(5) DD repairs the replication factor by duplicate shards from a server to another when servers in a team fail.
|
||||
|
||||
Actors are created to monitor the reasons of key movement:
|
||||
(1) `MountainChopper` and `ValleyFiller` actors periodically measure a random server team’s utilization and rebalance the server’s keys among other servers;
|
||||
|
@ -93,3 +94,62 @@ The data movement from one server (called source server) to another (called dest
|
|||
(2) The destination server will issue transactions to read the shard range and write the key-value pairs back. The key-value will be routed to the destination server and saved in the server’s storage engine;
|
||||
(3) DD removes the source server from the shard’s ownership by modifying the system keyspace;
|
||||
(4) DD removes the shard’s information owned by the source server from the server’s team information (i.e., *shardsAffectedByTeamFailure*).
|
||||
|
||||
# Read-aware Data Distribution
|
||||
|
||||
## Motivation
|
||||
Before FDB 7.2, when the data distributor wants to rebalance shards, it only considers write bandwidth when choosing source and destination teams, and the moved shard is chosen randomly. There are several cases where uneven read distribution from users causes a small subset of servers to be busy with read requests. This motivates the data distributor to consider read busyness to minimize read load unevenness.
|
||||
|
||||
## When does read rebalance happen
|
||||
The data distributor will periodically check whether the read rebalance is needed. The conditions of rebalancing are
|
||||
* the **worst CPU usage of source team >= 0.15** , which means the source team is somewhat busy;
|
||||
* the ongoing relocation is less than the parallelism budget. `queuedRelocation[ priority ] < countLimit (default 50)`;
|
||||
* the source team is not throttled to be a data movement source team. `( now() - The last time the source team was selected ) * time volume (default 20) > read sample interval (2 min default)`;
|
||||
* the read load difference between source team and destination team is larger than 30% of the source team load;
|
||||
|
||||
## Metrics definition
|
||||
* READ_LOAD = ceil(READ_BYTES_PER_KSECOND / PAGE_SIZE)
|
||||
* READ_IMBALANCE = ( MAX READ_LOAD / AVG READ_LOAD )
|
||||
* MOVE_SCORE = READ_DENSITY = READ_BYTES_PER_KSECOND / SHARD_BYTE
|
||||
|
||||
The aim for read-aware data distributor is to minimize the IMBALANCE while not harm the disk utilization balance.
|
||||
|
||||
## Which shard to move
|
||||
Basically, the MountainChopper will handle read-hot shards distribution with following steps:
|
||||
1. The MountainChopper chooses **the source team** with the largest READ_LOAD while it satisfies HARD_CONSTRAINT, then check whether rebalance is needed;
|
||||
* Hard constraint:
|
||||
* Team is healthy
|
||||
* The last time this team was source team is larger than (READ_SAMPLE_INTERVAL / MOVEMENT_PER_SAMPLE)
|
||||
* The worst CPU usage of source team >= 0.15
|
||||
2. Choose the destination team for moving
|
||||
* Hard constraint:
|
||||
* Team is healthy
|
||||
* The team’s available space is larger than the median free space
|
||||
* Goals
|
||||
* The destination team has the least LOAD in a random team set while it satisfies HARD_CONSTRAINT;
|
||||
3. Select K shards on the source team of which
|
||||
a. `LOAD(shard) < (LOAD(src) - LOAD(dest)) * READ_REBALANCE_MAX_SHARD_FRAC `;
|
||||
b. `LOAD(shard) > AVG(SourceShardLoad)`;
|
||||
c. with the highest top-K `MOVE_SCORE`;
|
||||
|
||||
We use 3.a and 3.b to set an eligible shard bandwidth for read rebalance moving. If the upper bound is too large, it'll just make the hot shard shift to another team but not even out the read load. If the upper bound is small, we'll just move some cold shards to other servers, which is also not helpful. The default value of READ_REBALANCE_MAX_SHARD_FRAC is 0.2 (up to 0.5), which is decided based on skewed workload tests.
|
||||
4. Issue relocation request to move a random shard in the top k set. If the maximum limit of read-balance movement is reached, give up this relocation.
|
||||
|
||||
Note: The ValleyFiller chooses a source team from a random set with the largest LOAD, and a destination team with the least LOAD.
|
||||
|
||||
## Performance Test and Summary
|
||||
### Metrics to measure
|
||||
1. StorageMetrics trace event report “FinishedQueries” which means the current storage server finishes how many read operations. The rate of FinishedQueries is what we measure first. The better the load balance is, the more similar the FinishedQueries rate across all storage servers.
|
||||
CPU utilization. This metric is in a positive relationship with “FinishedQueries rate”. An even “FinishedQueries” generally means even CPU utilization in the read-only scenario.
|
||||
2. Data movement size. We want to achieve load balance with as little movement as possible;
|
||||
3. StandardDeviation(FinishedQueries). It indicates how much difference read load each storage server has.
|
||||
|
||||
### Typical Test Setup
|
||||
120GB data, key=32B, value=200B; Single replica; 8 SS (20%) serve 80% of reads; 8 SS serve 60% of writes; 4 servers are both read- and write-hot; TPS=100000, 7 read/txn + 1 write/txn;
|
||||
|
||||
### Test Result Summary and Recommendation
|
||||
* With intersected sets of read-hot and write-hot servers, read-aware DD even out the read + write load on the double-hot (be both read and write hot) server, which means the converged write load is similar to disk rebalance only algorithm.
|
||||
* Read-aware DD will balance the read workload under the read-skew scenario. Starting from an imbalance `STD(FinishedQueries per minute)=16k`,the best result it can achieve is `STD(FinishedQueries per minute) = 2k`.
|
||||
* The typical movement size under a read-skew scenario is 100M ~ 600M under default KNOB value `READ_REBALANCE_MAX_SHARD_FRAC=0.2, READ_REBALANCE_SRC_PARALLELISM = 20`. Increasing those knobs may accelerate the converge speed with the risk of data movement churn, which overwhelms the destination and over-cold the source.
|
||||
* The upper bound of `READ_REBALANCE_MAX_SHARD_FRAC` is 0.5. Any value larger than 0.5 can result in hot server switching.
|
||||
* When needing a deeper diagnosis of the read aware DD, `BgDDMountainChopper_New`, and `BgDDValleyFiller_New` trace events are where to go.
|
||||
|
|
|
@ -241,7 +241,10 @@ Included in the output of this command are the ``id`` and ``prefix`` assigned to
|
|||
{
|
||||
"tenant": {
|
||||
"id": 0,
|
||||
"prefix": "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
|
||||
"prefix": {
|
||||
"base64": "AAAAAAAAAAU=",
|
||||
"printable": "\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x05",
|
||||
}
|
||||
},
|
||||
"type": "success"
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "flow/Arena.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/ThreadHelper.actor.h"
|
||||
#include "flow/CodeProbe.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
namespace fdb_cli {
|
||||
|
|
|
@ -210,7 +210,7 @@ CommandFactory listTenantsFactory(
|
|||
"The number of tenants to print can be specified using the [LIMIT] parameter, which defaults to 100."));
|
||||
|
||||
// gettenant command
|
||||
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
|
||||
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion) {
|
||||
if (tokens.size() < 2 || tokens.size() > 3 || (tokens.size() == 3 && tokens[2] != "JSON"_sr)) {
|
||||
printUsage(tokens[0]);
|
||||
return false;
|
||||
|
@ -243,11 +243,16 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
|
|||
|
||||
int64_t id;
|
||||
std::string prefix;
|
||||
|
||||
doc.get("id", id);
|
||||
doc.get("prefix", prefix);
|
||||
if (apiVersion >= 720) {
|
||||
doc.get("prefix.printable", prefix);
|
||||
} else {
|
||||
doc.get("prefix", prefix);
|
||||
}
|
||||
|
||||
printf(" id: %" PRId64 "\n", id);
|
||||
printf(" prefix: %s\n", printable(prefix).c_str());
|
||||
printf(" prefix: %s\n", prefix.c_str());
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -49,6 +49,7 @@
|
|||
#include "flow/FastRef.h"
|
||||
#include "flow/Platform.h"
|
||||
#include "flow/SystemMonitor.h"
|
||||
#include "flow/CodeProbe.h"
|
||||
|
||||
#include "flow/TLSConfig.actor.h"
|
||||
#include "flow/ThreadHelper.actor.h"
|
||||
|
@ -882,7 +883,7 @@ struct CLIOptions {
|
|||
std::vector<std::pair<std::string, std::string>> knobs;
|
||||
|
||||
// api version, using the latest version by default
|
||||
int api_version = FDB_API_VERSION;
|
||||
int apiVersion = FDB_API_VERSION;
|
||||
|
||||
CLIOptions(int argc, char* argv[]) {
|
||||
program_name = argv[0];
|
||||
|
@ -927,11 +928,11 @@ struct CLIOptions {
|
|||
break;
|
||||
case OPT_API_VERSION: {
|
||||
char* endptr;
|
||||
api_version = strtoul((char*)args.OptionArg(), &endptr, 10);
|
||||
apiVersion = strtoul((char*)args.OptionArg(), &endptr, 10);
|
||||
if (*endptr != '\0') {
|
||||
fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg());
|
||||
return 1;
|
||||
} else if (api_version < 700 || api_version > FDB_API_VERSION) {
|
||||
} else if (apiVersion < 700 || apiVersion > FDB_API_VERSION) {
|
||||
// multi-version fdbcli only available after 7.0
|
||||
fprintf(stderr,
|
||||
"ERROR: api version %s is not supported. (Min: 700, Max: %d)\n",
|
||||
|
@ -1113,7 +1114,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
|
|||
TraceEvent::setNetworkThread();
|
||||
|
||||
try {
|
||||
localDb = Database::createDatabase(ccf, opt.api_version, IsInternal::False);
|
||||
localDb = Database::createDatabase(ccf, opt.apiVersion, IsInternal::False);
|
||||
if (!opt.exec.present()) {
|
||||
printf("Using cluster file `%s'.\n", ccf->getLocation().c_str());
|
||||
}
|
||||
|
@ -1934,7 +1935,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
|
|||
}
|
||||
|
||||
if (tokencmp(tokens[0], "gettenant")) {
|
||||
bool _result = wait(makeInterruptable(getTenantCommandActor(db, tokens)));
|
||||
bool _result = wait(makeInterruptable(getTenantCommandActor(db, tokens, opt.apiVersion)));
|
||||
if (!_result)
|
||||
is_error = true;
|
||||
continue;
|
||||
|
@ -2171,7 +2172,7 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
try {
|
||||
API->selectApiVersion(opt.api_version);
|
||||
API->selectApiVersion(opt.apiVersion);
|
||||
API->setupNetwork();
|
||||
opt.setupKnobs();
|
||||
if (opt.exit_code != -1) {
|
||||
|
|
|
@ -185,7 +185,7 @@ ACTOR Future<bool> fileConfigureCommandActor(Reference<IDatabase> db,
|
|||
// force_recovery_with_data_loss command
|
||||
ACTOR Future<bool> forceRecoveryWithDataLossCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
|
||||
// gettenant command
|
||||
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
|
||||
ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens, int apiVersion);
|
||||
// include command
|
||||
ACTOR Future<bool> includeCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
|
||||
// kill command
|
||||
|
|
|
@ -18,16 +18,26 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#include "fmt/format.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/serialize.h"
|
||||
#include "fdbclient/BlobGranuleFiles.h"
|
||||
|
||||
#include "fdbclient/BlobGranuleCommon.h"
|
||||
#include "fdbclient/ClientKnobs.h"
|
||||
#include "fdbclient/Knobs.h"
|
||||
#include "fdbclient/SystemData.h" // for allKeys unit test - could remove
|
||||
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "flow/CompressionUtils.h"
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/serialize.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/xxhash.h"
|
||||
|
||||
#include "fmt/format.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#define BG_READ_DEBUG false
|
||||
|
||||
|
@ -73,7 +83,67 @@ struct ChildBlockPointerRef {
|
|||
};
|
||||
};
|
||||
|
||||
struct IndexBlockRef {
|
||||
namespace {
|
||||
BlobGranuleFileEncryptionKeys getEncryptBlobCipherKey(const BlobGranuleCipherKeysCtx cipherKeysCtx) {
|
||||
BlobGranuleFileEncryptionKeys eKeys;
|
||||
|
||||
eKeys.textCipherKey = makeReference<BlobCipherKey>(cipherKeysCtx.textCipherKey.encryptDomainId,
|
||||
cipherKeysCtx.textCipherKey.baseCipherId,
|
||||
cipherKeysCtx.textCipherKey.baseCipher.begin(),
|
||||
cipherKeysCtx.textCipherKey.baseCipher.size(),
|
||||
cipherKeysCtx.textCipherKey.salt);
|
||||
eKeys.headerCipherKey = makeReference<BlobCipherKey>(cipherKeysCtx.headerCipherKey.encryptDomainId,
|
||||
cipherKeysCtx.headerCipherKey.baseCipherId,
|
||||
cipherKeysCtx.headerCipherKey.baseCipher.begin(),
|
||||
cipherKeysCtx.headerCipherKey.baseCipher.size(),
|
||||
cipherKeysCtx.headerCipherKey.salt);
|
||||
|
||||
return eKeys;
|
||||
}
|
||||
|
||||
void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
const StringRef& ivRef) {
|
||||
// Validate encryption header 'cipherHeader' details sanity
|
||||
if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() &&
|
||||
header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() &&
|
||||
header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) {
|
||||
TraceEvent(SevError, "EncryptionHeader_CipherHeaderMismatch")
|
||||
.detail("HeaderDomainId", eKeys.headerCipherKey->getDomainId())
|
||||
.detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId)
|
||||
.detail("HeaderBaseCipherId", eKeys.headerCipherKey->getBaseCipherId())
|
||||
.detail("ExpectedHeaderBaseCipherId", header.cipherHeaderDetails.baseCipherId)
|
||||
.detail("HeaderSalt", eKeys.headerCipherKey->getSalt())
|
||||
.detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt);
|
||||
throw encrypt_header_metadata_mismatch();
|
||||
}
|
||||
// Validate encryption header 'cipherHeader' details sanity
|
||||
if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() &&
|
||||
header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() &&
|
||||
header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) {
|
||||
TraceEvent(SevError, "EncryptionHeader_CipherTextMismatch")
|
||||
.detail("TextDomainId", eKeys.textCipherKey->getDomainId())
|
||||
.detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId)
|
||||
.detail("TextBaseCipherId", eKeys.textCipherKey->getBaseCipherId())
|
||||
.detail("ExpectedTextBaseCipherId", header.cipherTextDetails.baseCipherId)
|
||||
.detail("TextSalt", eKeys.textCipherKey->getSalt())
|
||||
.detail("ExpectedTextSalt", header.cipherTextDetails.salt);
|
||||
throw encrypt_header_metadata_mismatch();
|
||||
}
|
||||
// Validate 'Initialization Vector' sanity
|
||||
if (memcmp(ivRef.begin(), &header.iv[0], AES_256_IV_LENGTH) != 0) {
|
||||
TraceEvent(SevError, "EncryptionHeader_IVMismatch")
|
||||
.detail("IVChecksum", XXH3_64bits(ivRef.begin(), ivRef.size()))
|
||||
.detail("ExpectedIVChecksum", XXH3_64bits(&header.iv[0], AES_256_IV_LENGTH));
|
||||
throw encrypt_header_metadata_mismatch();
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
struct IndexBlock {
|
||||
constexpr static FileIdentifier file_identifier = 6525412;
|
||||
|
||||
// Serializable fields
|
||||
VectorRef<ChildBlockPointerRef> children;
|
||||
|
||||
template <class Ar>
|
||||
|
@ -82,9 +152,281 @@ struct IndexBlockRef {
|
|||
}
|
||||
};
|
||||
|
||||
struct IndexBlockRef {
|
||||
constexpr static FileIdentifier file_identifier = 1945731;
|
||||
|
||||
// Serialized fields
|
||||
Optional<StringRef> encryptHeaderRef;
|
||||
// Encrypted/unencrypted IndexBlock
|
||||
StringRef buffer;
|
||||
|
||||
// Non-serializable fields
|
||||
IndexBlock block;
|
||||
|
||||
void encrypt(const BlobGranuleCipherKeysCtx cipherKeysCtx, Arena& arena) {
|
||||
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
|
||||
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(buffer.begin(), buffer.size());
|
||||
TraceEvent(SevDebug, "IndexBlockEncrypt_Before").detail("Chksum", chksum);
|
||||
}
|
||||
|
||||
EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey,
|
||||
eKeys.headerCipherKey,
|
||||
cipherKeysCtx.ivRef.begin(),
|
||||
AES_256_IV_LENGTH,
|
||||
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
|
||||
Value serializedBuff = ObjectWriter::toValue(block, Unversioned());
|
||||
BlobCipherEncryptHeader header;
|
||||
buffer = encryptor.encrypt(serializedBuff.contents().begin(), serializedBuff.contents().size(), &header, arena)
|
||||
->toStringRef();
|
||||
encryptHeaderRef = BlobCipherEncryptHeader::toStringRef(header, arena);
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(buffer.begin(), buffer.size());
|
||||
TraceEvent(SevDebug, "IndexBlockEncrypt_After").detail("Chksum", chksum);
|
||||
}
|
||||
}
|
||||
|
||||
static void decrypt(const BlobGranuleCipherKeysCtx cipherKeysCtx, IndexBlockRef& idxRef, Arena& arena) {
|
||||
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
|
||||
|
||||
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
|
||||
ASSERT(idxRef.encryptHeaderRef.present());
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(idxRef.buffer.begin(), idxRef.buffer.size());
|
||||
TraceEvent(SevDebug, "IndexBlockEncrypt_Before").detail("Chksum", chksum);
|
||||
}
|
||||
|
||||
BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(idxRef.encryptHeaderRef.get());
|
||||
|
||||
validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef);
|
||||
|
||||
DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin());
|
||||
StringRef decrypted =
|
||||
decryptor.decrypt(idxRef.buffer.begin(), idxRef.buffer.size(), header, arena)->toStringRef();
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(decrypted.begin(), decrypted.size());
|
||||
TraceEvent(SevDebug, "IndexBlockEncrypt_After").detail("Chksum", chksum);
|
||||
}
|
||||
|
||||
// TODO: Add version?
|
||||
ObjectReader dataReader(decrypted.begin(), Unversioned());
|
||||
dataReader.deserialize(FileIdentifierFor<IndexBlock>::value, idxRef.block, arena);
|
||||
}
|
||||
|
||||
void init(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx, Arena& arena) {
|
||||
if (encryptHeaderRef.present()) {
|
||||
ASSERT(cipherKeysCtx.present());
|
||||
decrypt(cipherKeysCtx.get(), *this, arena);
|
||||
} else {
|
||||
TraceEvent("IndexBlockSize").detail("Sz", buffer.size());
|
||||
|
||||
// TODO: Add version?
|
||||
ObjectReader dataReader(buffer.begin(), Unversioned());
|
||||
dataReader.deserialize(FileIdentifierFor<IndexBlock>::value, block, arena);
|
||||
}
|
||||
}
|
||||
|
||||
void finalize(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx, Arena& arena) {
|
||||
if (cipherKeysCtx.present()) {
|
||||
// IndexBlock childBlock pointers offsets are relative to IndexBlock endOffset instead of file start offset.
|
||||
// Compressing indexBlock will need offset recalculation (circular depedency). IndexBlock size is bounded by
|
||||
// number of chunks and sizeof(KeyPrefix), 'not' compressing IndexBlock shouldn't cause significant file
|
||||
// size bloat.
|
||||
|
||||
ASSERT(cipherKeysCtx.present());
|
||||
encrypt(cipherKeysCtx.get(), arena);
|
||||
} else {
|
||||
encryptHeaderRef.reset();
|
||||
buffer = StringRef(arena, ObjectWriter::toValue(block, Unversioned()).contents());
|
||||
}
|
||||
|
||||
TraceEvent(SevDebug, "IndexBlockSize").detail("Sz", buffer.size()).detail("Encrypted", cipherKeysCtx.present());
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, encryptHeaderRef, buffer);
|
||||
}
|
||||
};
|
||||
|
||||
// On-disk and/or in-memory representation of a IndexBlobGranuleFile 'chunk'.
|
||||
//
|
||||
// Encryption: A 'chunk' gets encrypted before getting persisted if enabled. Encryption header is persisted along with
|
||||
// the chunk data to assist decryption on reads.
|
||||
//
|
||||
// Compression: A 'chunk' gets compressed before getting persisted if enabled. Compression filter (algoritm) infomration
|
||||
// is persisted as part of 'chunk metadata' to assist decompression on reads.
|
||||
|
||||
struct IndexBlobGranuleFileChunkRef {
|
||||
constexpr static FileIdentifier file_identifier = 2814019;
|
||||
|
||||
// Serialized fields
|
||||
Optional<CompressionFilter> compressionFilter;
|
||||
Optional<StringRef> encryptHeaderRef;
|
||||
// encrypted and/or compressed chunk;
|
||||
StringRef buffer;
|
||||
|
||||
// Non-serialized
|
||||
Optional<StringRef> chunkBytes;
|
||||
|
||||
static void encrypt(const BlobGranuleCipherKeysCtx& cipherKeysCtx,
|
||||
IndexBlobGranuleFileChunkRef& chunkRef,
|
||||
Arena& arena) {
|
||||
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
|
||||
|
||||
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
|
||||
TraceEvent(SevDebug, "BlobChunkEncrypt_Before").detail("Chksum", chksum);
|
||||
}
|
||||
|
||||
EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey,
|
||||
eKeys.headerCipherKey,
|
||||
cipherKeysCtx.ivRef.begin(),
|
||||
AES_256_IV_LENGTH,
|
||||
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
|
||||
BlobCipherEncryptHeader header;
|
||||
chunkRef.buffer =
|
||||
encryptor.encrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), &header, arena)->toStringRef();
|
||||
chunkRef.encryptHeaderRef = BlobCipherEncryptHeader::toStringRef(header, arena);
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
|
||||
TraceEvent(SevDebug, "BlobChunkEncrypt_After").detail("Chksum", chksum);
|
||||
}
|
||||
}
|
||||
|
||||
static StringRef decrypt(const BlobGranuleCipherKeysCtx& cipherKeysCtx,
|
||||
const IndexBlobGranuleFileChunkRef& chunkRef,
|
||||
Arena& arena) {
|
||||
BlobGranuleFileEncryptionKeys eKeys = getEncryptBlobCipherKey(cipherKeysCtx);
|
||||
|
||||
ASSERT(eKeys.headerCipherKey.isValid() && eKeys.textCipherKey.isValid());
|
||||
ASSERT(chunkRef.encryptHeaderRef.present());
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
|
||||
TraceEvent(SevDebug, "BlobChunkDecrypt_Before").detail("Chksum", chksum);
|
||||
}
|
||||
|
||||
BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(chunkRef.encryptHeaderRef.get());
|
||||
|
||||
validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef);
|
||||
|
||||
DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin());
|
||||
StringRef decrypted =
|
||||
decryptor.decrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), header, arena)->toStringRef();
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chksum = XXH3_64bits(decrypted.begin(), decrypted.size());
|
||||
TraceEvent(SevDebug, "BlobChunkDecrypt_After").detail("Chksum", chksum);
|
||||
}
|
||||
|
||||
return decrypted;
|
||||
}
|
||||
|
||||
static void compress(IndexBlobGranuleFileChunkRef& chunkRef,
|
||||
const Value& chunk,
|
||||
const CompressionFilter compFilter,
|
||||
Arena& arena) {
|
||||
chunkRef.compressionFilter = compFilter;
|
||||
chunkRef.buffer = CompressionUtils::compress(chunkRef.compressionFilter.get(), chunk.contents(), arena);
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
XXH64_hash_t chunkChksum = XXH3_64bits(chunk.contents().begin(), chunk.contents().size());
|
||||
XXH64_hash_t chksum = XXH3_64bits(chunkRef.buffer.begin(), chunkRef.buffer.size());
|
||||
TraceEvent("CompressBlobChunk")
|
||||
.detail("Filter", CompressionUtils::toString(chunkRef.compressionFilter.get()))
|
||||
.detail("ChkSumBefore", chunkChksum)
|
||||
.detail("ChkSumAfter", chksum);
|
||||
}
|
||||
}
|
||||
|
||||
static StringRef decompress(const IndexBlobGranuleFileChunkRef& chunkRef, Arena& arena) {
|
||||
ASSERT(chunkRef.compressionFilter.present());
|
||||
return CompressionUtils::decompress(chunkRef.compressionFilter.get(), chunkRef.chunkBytes.get(), arena);
|
||||
}
|
||||
|
||||
static Value toBytes(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
|
||||
Optional<CompressionFilter> compFilter,
|
||||
const Value& chunk,
|
||||
Arena& arena) {
|
||||
IndexBlobGranuleFileChunkRef chunkRef;
|
||||
|
||||
if (compFilter.present()) {
|
||||
IndexBlobGranuleFileChunkRef::compress(chunkRef, chunk, compFilter.get(), arena);
|
||||
} else {
|
||||
chunkRef.buffer = StringRef(arena, chunk.contents());
|
||||
}
|
||||
|
||||
if (cipherKeysCtx.present()) {
|
||||
IndexBlobGranuleFileChunkRef::encrypt(cipherKeysCtx.get(), chunkRef, arena);
|
||||
}
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "GenerateBlobGranuleFileChunk")
|
||||
.detail("Encrypt", cipherKeysCtx.present())
|
||||
.detail("Compress", compFilter.present())
|
||||
.detail("CompFilter",
|
||||
compFilter.present() ? CompressionUtils::toString(compFilter.get())
|
||||
: CompressionUtils::toString(CompressionFilter::NONE));
|
||||
}
|
||||
|
||||
// TODO: Add version?
|
||||
return ObjectWriter::toValue(chunkRef, Unversioned());
|
||||
}
|
||||
|
||||
static IndexBlobGranuleFileChunkRef fromBytes(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
|
||||
StringRef buffer,
|
||||
Arena& arena) {
|
||||
IndexBlobGranuleFileChunkRef chunkRef;
|
||||
// TODO: Add version?
|
||||
ObjectReader dataReader(buffer.begin(), Unversioned());
|
||||
dataReader.deserialize(FileIdentifierFor<IndexBlobGranuleFileChunkRef>::value, chunkRef, arena);
|
||||
|
||||
if (chunkRef.encryptHeaderRef.present()) {
|
||||
ASSERT(cipherKeysCtx.present());
|
||||
chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decrypt(cipherKeysCtx.get(), chunkRef, arena);
|
||||
} else {
|
||||
chunkRef.chunkBytes = chunkRef.buffer;
|
||||
}
|
||||
|
||||
if (chunkRef.compressionFilter.present()) {
|
||||
chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decompress(chunkRef, arena);
|
||||
} else if (!chunkRef.chunkBytes.present()) {
|
||||
// 'Encryption' & 'Compression' aren't enabled.
|
||||
chunkRef.chunkBytes = chunkRef.buffer;
|
||||
}
|
||||
|
||||
ASSERT(chunkRef.chunkBytes.present());
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "ParseBlobGranuleFileChunk")
|
||||
.detail("Encrypted", chunkRef.encryptHeaderRef.present())
|
||||
.detail("Compressed", chunkRef.compressionFilter.present())
|
||||
.detail("CompFilter",
|
||||
chunkRef.compressionFilter.present()
|
||||
? CompressionUtils::toString(chunkRef.compressionFilter.get())
|
||||
: CompressionUtils::toString(CompressionFilter::NONE));
|
||||
}
|
||||
|
||||
return chunkRef;
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, compressionFilter, encryptHeaderRef, buffer);
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* A file header for a key-ordered file that is chunked on disk, where each chunk is a disjoint key range of data.
|
||||
* FIXME: encryption and compression support
|
||||
*/
|
||||
struct IndexedBlobGranuleFile {
|
||||
constexpr static FileIdentifier file_identifier = 3828201;
|
||||
|
@ -93,16 +435,27 @@ struct IndexedBlobGranuleFile {
|
|||
uint8_t fileType;
|
||||
Optional<StringRef> filter; // not used currently
|
||||
|
||||
// TODO: add encrypted/compressed versions of index block
|
||||
IndexBlockRef indexBlock;
|
||||
IndexBlockRef indexBlockRef;
|
||||
int chunkStartOffset;
|
||||
|
||||
// Non-serialized member fields
|
||||
// TODO: add encryption and compression metadata for whole file
|
||||
StringRef fileBytes;
|
||||
|
||||
static Standalone<IndexedBlobGranuleFile> fromFileBytes(const StringRef& fileBytes) {
|
||||
// TODO: decrypt/decompress index block here if necessary first
|
||||
void init(const Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
formatVersion = LATEST_BG_FORMAT_VERSION;
|
||||
fileType = SNAPSHOT_FILE_TYPE;
|
||||
chunkStartOffset = -1;
|
||||
}
|
||||
|
||||
void init(const StringRef& fBytes, Arena& arena, const Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
ASSERT(chunkStartOffset > 0);
|
||||
|
||||
fileBytes = fBytes;
|
||||
indexBlockRef.init(cipherKeysCtx, arena);
|
||||
}
|
||||
|
||||
static Standalone<IndexedBlobGranuleFile> fromFileBytes(const StringRef& fileBytes,
|
||||
const Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
// parse index block at head of file
|
||||
Arena arena;
|
||||
IndexedBlobGranuleFile file;
|
||||
|
@ -110,7 +463,7 @@ struct IndexedBlobGranuleFile {
|
|||
ObjectReader dataReader(fileBytes.begin(), Unversioned());
|
||||
dataReader.deserialize(FileIdentifierFor<IndexedBlobGranuleFile>::value, file, arena);
|
||||
|
||||
file.fileBytes = fileBytes;
|
||||
file.init(fileBytes, arena, cipherKeysCtx);
|
||||
|
||||
// do sanity checks
|
||||
if (file.formatVersion > LATEST_BG_FORMAT_VERSION || file.formatVersion < MIN_SUPPORTED_BG_FORMAT_VERSION) {
|
||||
|
@ -128,13 +481,15 @@ struct IndexedBlobGranuleFile {
|
|||
|
||||
ChildBlockPointerRef* findStartBlock(const KeyRef& beginKey) const {
|
||||
ChildBlockPointerRef searchKey(beginKey, 0);
|
||||
ChildBlockPointerRef* startBlock = (ChildBlockPointerRef*)std::lower_bound(
|
||||
indexBlock.children.begin(), indexBlock.children.end(), searchKey, ChildBlockPointerRef::OrderByKey());
|
||||
ChildBlockPointerRef* startBlock = (ChildBlockPointerRef*)std::lower_bound(indexBlockRef.block.children.begin(),
|
||||
indexBlockRef.block.children.end(),
|
||||
searchKey,
|
||||
ChildBlockPointerRef::OrderByKey());
|
||||
|
||||
if (startBlock != indexBlock.children.end() && startBlock != indexBlock.children.begin() &&
|
||||
if (startBlock != indexBlockRef.block.children.end() && startBlock != indexBlockRef.block.children.begin() &&
|
||||
beginKey < startBlock->key) {
|
||||
startBlock--;
|
||||
} else if (startBlock == indexBlock.children.end()) {
|
||||
} else if (startBlock == indexBlockRef.block.children.end()) {
|
||||
startBlock--;
|
||||
}
|
||||
|
||||
|
@ -143,19 +498,31 @@ struct IndexedBlobGranuleFile {
|
|||
|
||||
// FIXME: implement some sort of iterator type interface?
|
||||
template <class ChildType>
|
||||
Standalone<ChildType> getChild(const ChildBlockPointerRef* childPointer) {
|
||||
// TODO decrypt/decompress if necessary
|
||||
ASSERT(childPointer != indexBlock.children.end());
|
||||
Standalone<ChildType> getChild(const ChildBlockPointerRef* childPointer,
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
|
||||
int startOffset) {
|
||||
ASSERT(childPointer != indexBlockRef.block.children.end());
|
||||
const ChildBlockPointerRef* nextPointer = childPointer + 1;
|
||||
ASSERT(nextPointer != indexBlock.children.end());
|
||||
ASSERT(nextPointer != indexBlockRef.block.children.end());
|
||||
|
||||
size_t blockSize = nextPointer->offset - childPointer->offset;
|
||||
StringRef childData(fileBytes.begin() + childPointer->offset, blockSize);
|
||||
// Account for IndexBlockRef size for chunk offset computation
|
||||
StringRef childData(fileBytes.begin() + childPointer->offset + startOffset, blockSize);
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "GetChild")
|
||||
.detail("BlkSize", blockSize)
|
||||
.detail("Offset", childPointer->offset)
|
||||
.detail("StartOffset", chunkStartOffset);
|
||||
}
|
||||
|
||||
Arena childArena;
|
||||
IndexBlobGranuleFileChunkRef chunkRef =
|
||||
IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena);
|
||||
|
||||
ChildType child;
|
||||
// TODO: version?
|
||||
ObjectReader dataReader(childData.begin(), Unversioned());
|
||||
ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), Unversioned());
|
||||
dataReader.deserialize(FileIdentifierFor<ChildType>::value, child, childArena);
|
||||
|
||||
// TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused?
|
||||
|
@ -164,7 +531,7 @@ struct IndexedBlobGranuleFile {
|
|||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, formatVersion, fileType, filter, indexBlock);
|
||||
serializer(ar, formatVersion, fileType, filter, indexBlockRef, chunkStartOffset);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -172,22 +539,30 @@ struct IndexedBlobGranuleFile {
|
|||
// serializing once, adding the serialized size to each offset, and serializing again. This relies on the fact that
|
||||
// ObjectWriter/flatbuffers uses fixed size integers instead of variable size.
|
||||
|
||||
Value serializeIndexBlock(Standalone<IndexedBlobGranuleFile>& file) {
|
||||
Value serializeIndexBlock(Standalone<IndexedBlobGranuleFile>& file, Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
file.indexBlockRef.finalize(cipherKeysCtx, file.arena());
|
||||
|
||||
// TODO: version?
|
||||
Value indexBlock = ObjectWriter::toValue(file, Unversioned());
|
||||
for (auto& it : file.indexBlock.children) {
|
||||
it.offset += indexBlock.size();
|
||||
Value serialized = ObjectWriter::toValue(file, Unversioned());
|
||||
file.chunkStartOffset = serialized.contents().size();
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "SerializeIndexBlock").detail("StartOffset", file.chunkStartOffset);
|
||||
}
|
||||
|
||||
return ObjectWriter::toValue(file, Unversioned());
|
||||
}
|
||||
|
||||
// TODO: this should probably be in actor file with yields?
|
||||
// TODO: optimize memory copying
|
||||
// TODO: sanity check no oversized files
|
||||
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCount) {
|
||||
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot,
|
||||
int chunkCount,
|
||||
Optional<CompressionFilter> compressFilter,
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
Standalone<IndexedBlobGranuleFile> file;
|
||||
file.formatVersion = LATEST_BG_FORMAT_VERSION;
|
||||
file.fileType = SNAPSHOT_FILE_TYPE;
|
||||
|
||||
file.init(cipherKeysCtx);
|
||||
|
||||
size_t targetChunkBytes = snapshot.expectedSize() / chunkCount;
|
||||
size_t currentChunkBytesEstimate = 0;
|
||||
|
@ -208,17 +583,25 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
|
|||
currentChunkBytesEstimate += snapshot[i].expectedSize();
|
||||
|
||||
if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) {
|
||||
// TODO: add encryption/compression for each chunk
|
||||
// TODO: protocol version
|
||||
Value serialized = ObjectWriter::toValue(currentChunk, Unversioned());
|
||||
chunks.push_back(serialized);
|
||||
Value chunkBytes =
|
||||
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
|
||||
chunks.push_back(chunkBytes);
|
||||
// TODO remove validation
|
||||
if (!file.indexBlock.children.empty()) {
|
||||
ASSERT(file.indexBlock.children.back().key < currentChunk.begin()->key);
|
||||
if (!file.indexBlockRef.block.children.empty()) {
|
||||
ASSERT(file.indexBlockRef.block.children.back().key < currentChunk.begin()->key);
|
||||
}
|
||||
file.indexBlock.children.emplace_back_deep(file.arena(), currentChunk.begin()->key, previousChunkBytes);
|
||||
file.indexBlockRef.block.children.emplace_back_deep(
|
||||
file.arena(), currentChunk.begin()->key, previousChunkBytes);
|
||||
|
||||
previousChunkBytes += serialized.size();
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "ChunkSize")
|
||||
.detail("ChunkBytes", chunkBytes.size())
|
||||
.detail("PrvChunkBytes", previousChunkBytes);
|
||||
}
|
||||
|
||||
previousChunkBytes += chunkBytes.size();
|
||||
currentChunkBytesEstimate = 0;
|
||||
currentChunk = Standalone<GranuleSnapshot>();
|
||||
}
|
||||
|
@ -226,12 +609,13 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
|
|||
ASSERT(currentChunk.empty());
|
||||
// push back dummy last chunk to get last chunk size, and to know last key in last block without having to read it
|
||||
if (!snapshot.empty()) {
|
||||
file.indexBlock.children.emplace_back_deep(file.arena(), keyAfter(snapshot.back().key), previousChunkBytes);
|
||||
file.indexBlockRef.block.children.emplace_back_deep(
|
||||
file.arena(), keyAfter(snapshot.back().key), previousChunkBytes);
|
||||
}
|
||||
|
||||
Value indexBlock = serializeIndexBlock(file);
|
||||
int32_t indexSize = indexBlock.size();
|
||||
chunks[0] = indexBlock;
|
||||
Value indexBlockBytes = serializeIndexBlock(file, cipherKeysCtx);
|
||||
int32_t indexSize = indexBlockBytes.size();
|
||||
chunks[0] = indexBlockBytes;
|
||||
|
||||
// TODO: write this directly to stream to avoid extra copy?
|
||||
Arena ret;
|
||||
|
@ -240,7 +624,15 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
|
|||
uint8_t* buffer = new (ret) uint8_t[size];
|
||||
|
||||
previousChunkBytes = 0;
|
||||
int idx = 0;
|
||||
for (auto& it : chunks) {
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "SerializeSnapshot")
|
||||
.detail("ChunkIdx", idx++)
|
||||
.detail("Size", it.size())
|
||||
.detail("Offset", previousChunkBytes);
|
||||
}
|
||||
|
||||
memcpy(buffer + previousChunkBytes, it.begin(), it.size());
|
||||
previousChunkBytes += it.size();
|
||||
}
|
||||
|
@ -252,19 +644,21 @@ Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunkCo
|
|||
// TODO: use redwood prefix trick to optimize cpu comparison
|
||||
static Arena loadSnapshotFile(const StringRef& snapshotData,
|
||||
KeyRangeRef keyRange,
|
||||
std::map<KeyRef, ValueRef>& dataMap) {
|
||||
std::map<KeyRef, ValueRef>& dataMap,
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
Arena rootArena;
|
||||
|
||||
Standalone<IndexedBlobGranuleFile> file = IndexedBlobGranuleFile::fromFileBytes(snapshotData);
|
||||
Standalone<IndexedBlobGranuleFile> file = IndexedBlobGranuleFile::fromFileBytes(snapshotData, cipherKeysCtx);
|
||||
|
||||
ASSERT(file.fileType == SNAPSHOT_FILE_TYPE);
|
||||
ASSERT(file.chunkStartOffset > 0);
|
||||
|
||||
// empty snapshot file
|
||||
if (file.indexBlock.children.empty()) {
|
||||
if (file.indexBlockRef.block.children.empty()) {
|
||||
return rootArena;
|
||||
}
|
||||
|
||||
ASSERT(file.indexBlock.children.size() >= 2);
|
||||
ASSERT(file.indexBlockRef.block.children.size() >= 2);
|
||||
|
||||
// TODO: refactor this out of delta tree
|
||||
// int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first,
|
||||
|
@ -275,8 +669,9 @@ static Arena loadSnapshotFile(const StringRef& snapshotData,
|
|||
|
||||
// FIXME: optimize cpu comparisons here in first/last partial blocks, doing entire blocks at once based on
|
||||
// comparison, and using shared prefix for key comparison
|
||||
while (currentBlock != (file.indexBlock.children.end() - 1) && keyRange.end > currentBlock->key) {
|
||||
Standalone<GranuleSnapshot> dataBlock = file.getChild<GranuleSnapshot>(currentBlock);
|
||||
while (currentBlock != (file.indexBlockRef.block.children.end() - 1) && keyRange.end > currentBlock->key) {
|
||||
Standalone<GranuleSnapshot> dataBlock =
|
||||
file.getChild<GranuleSnapshot>(currentBlock, cipherKeysCtx, file.chunkStartOffset);
|
||||
ASSERT(!dataBlock.empty());
|
||||
ASSERT(currentBlock->key == dataBlock.front().key);
|
||||
|
||||
|
@ -426,7 +821,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
|
|||
}
|
||||
|
||||
if (snapshotData.present()) {
|
||||
Arena snapshotArena = loadSnapshotFile(snapshotData.get(), requestRange, dataMap);
|
||||
Arena snapshotArena = loadSnapshotFile(snapshotData.get(), requestRange, dataMap, chunk.cipherKeysCtx);
|
||||
arena.dependsOn(snapshotArena);
|
||||
}
|
||||
|
||||
|
@ -574,6 +969,40 @@ std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, s
|
|||
suffix;
|
||||
}
|
||||
|
||||
namespace {
|
||||
const EncryptCipherDomainId encryptDomainId = deterministicRandom()->randomInt64(786, 7860);
|
||||
const EncryptCipherBaseKeyId encryptBaseCipherId = deterministicRandom()->randomUInt64();
|
||||
const EncryptCipherRandomSalt encryptSalt = deterministicRandom()->randomUInt64();
|
||||
|
||||
Standalone<StringRef> getBaseCipher() {
|
||||
Standalone<StringRef> baseCipher = makeString(AES_256_KEY_LENGTH);
|
||||
generateRandomData(mutateString(baseCipher), baseCipher.size());
|
||||
return baseCipher;
|
||||
}
|
||||
|
||||
Standalone<StringRef> encryptBaseCipher = getBaseCipher();
|
||||
|
||||
BlobGranuleCipherKeysCtx getCipherKeysCtx(Arena& arena) {
|
||||
BlobGranuleCipherKeysCtx cipherKeysCtx;
|
||||
|
||||
cipherKeysCtx.textCipherKey.encryptDomainId = encryptDomainId;
|
||||
cipherKeysCtx.textCipherKey.baseCipherId = encryptBaseCipherId;
|
||||
cipherKeysCtx.textCipherKey.salt = encryptSalt;
|
||||
cipherKeysCtx.textCipherKey.baseCipher = StringRef(arena, encryptBaseCipher);
|
||||
|
||||
cipherKeysCtx.headerCipherKey.encryptDomainId = SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
|
||||
cipherKeysCtx.headerCipherKey.baseCipherId = encryptBaseCipherId;
|
||||
cipherKeysCtx.headerCipherKey.salt = encryptSalt;
|
||||
cipherKeysCtx.headerCipherKey.baseCipher = StringRef(arena, encryptBaseCipher);
|
||||
|
||||
cipherKeysCtx.ivRef = makeString(AES_256_IV_LENGTH, arena);
|
||||
generateRandomData(mutateString(cipherKeysCtx.ivRef), AES_256_IV_LENGTH);
|
||||
|
||||
return cipherKeysCtx;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_CASE("/blobgranule/files/applyDelta") {
|
||||
printf("Testing blob granule delta applying\n");
|
||||
Arena a;
|
||||
|
@ -731,14 +1160,18 @@ int randomExp(int minExp, int maxExp) {
|
|||
return deterministicRandom()->randomInt(val, val * 2);
|
||||
}
|
||||
|
||||
void checkEmpty(const Value& serialized, Key begin, Key end) {
|
||||
void checkEmpty(const Value& serialized, Key begin, Key end, Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
std::map<KeyRef, ValueRef> result;
|
||||
Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result);
|
||||
Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result, cipherKeysCtx);
|
||||
ASSERT(result.empty());
|
||||
}
|
||||
|
||||
// endIdx is exclusive
|
||||
void checkRead(const Standalone<GranuleSnapshot>& snapshot, const Value& serialized, int beginIdx, int endIdx) {
|
||||
void checkRead(const Standalone<GranuleSnapshot>& snapshot,
|
||||
const Value& serialized,
|
||||
int beginIdx,
|
||||
int endIdx,
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx) {
|
||||
ASSERT(beginIdx < endIdx);
|
||||
ASSERT(endIdx <= snapshot.size());
|
||||
std::map<KeyRef, ValueRef> result;
|
||||
|
@ -746,7 +1179,7 @@ void checkRead(const Standalone<GranuleSnapshot>& snapshot, const Value& seriali
|
|||
Key endKey = endIdx == snapshot.size() ? keyAfter(snapshot.back().key) : snapshot[endIdx].key;
|
||||
KeyRangeRef range(beginKey, endKey);
|
||||
|
||||
Arena ar = loadSnapshotFile(serialized, range, result);
|
||||
Arena ar = loadSnapshotFile(serialized, range, result, cipherKeysCtx);
|
||||
|
||||
if (result.size() != endIdx - beginIdx) {
|
||||
fmt::print("Read {0} rows != {1}\n", result.size(), endIdx - beginIdx);
|
||||
|
@ -818,7 +1251,21 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
|
|||
fmt::print(
|
||||
"Constructing snapshot with {0} rows, {1} bytes, and {2} chunks\n", data.size(), totalDataBytes, targetChunks);
|
||||
|
||||
Value serialized = serializeChunkedSnapshot(data, targetChunks);
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = Optional<BlobGranuleCipherKeysCtx>();
|
||||
Arena arena;
|
||||
if (deterministicRandom()->coinflip()) {
|
||||
cipherKeysCtx = getCipherKeysCtx(arena);
|
||||
}
|
||||
|
||||
Optional<CompressionFilter> compressFilter;
|
||||
if (deterministicRandom()->coinflip()) {
|
||||
#ifdef ZLIB_LIB_SUPPORTED
|
||||
compressFilter = CompressionFilter::GZIP;
|
||||
#else
|
||||
compressFilter = CompressionFilter::NONE;
|
||||
#endif
|
||||
}
|
||||
Value serialized = serializeChunkedSnapshot(data, targetChunks, compressFilter, cipherKeysCtx);
|
||||
|
||||
fmt::print("Snapshot serialized! {0} bytes\n", serialized.size());
|
||||
|
||||
|
@ -829,7 +1276,7 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
|
|||
|
||||
fmt::print("Initial read starting\n");
|
||||
|
||||
checkRead(data, serialized, 0, data.size());
|
||||
checkRead(data, serialized, 0, data.size(), cipherKeysCtx);
|
||||
|
||||
fmt::print("Initial read complete\n");
|
||||
|
||||
|
@ -838,22 +1285,22 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") {
|
|||
int width = randomExp(0, maxExp);
|
||||
ASSERT(width <= data.size());
|
||||
int start = deterministicRandom()->randomInt(0, data.size() - width);
|
||||
checkRead(data, serialized, start, start + width);
|
||||
checkRead(data, serialized, start, start + width, cipherKeysCtx);
|
||||
}
|
||||
|
||||
fmt::print("Doing empty checks\n");
|
||||
int randomIdx = deterministicRandom()->randomInt(0, data.size() - 1);
|
||||
checkEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key);
|
||||
checkEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key, cipherKeysCtx);
|
||||
} else {
|
||||
fmt::print("Doing empty checks\n");
|
||||
}
|
||||
|
||||
checkEmpty(serialized, normalKeys.begin, data.front().key);
|
||||
checkEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"));
|
||||
checkEmpty(serialized, keyAfter(data.back().key), normalKeys.end);
|
||||
checkEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end);
|
||||
checkEmpty(serialized, normalKeys.begin, data.front().key, cipherKeysCtx);
|
||||
checkEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"), cipherKeysCtx);
|
||||
checkEmpty(serialized, keyAfter(data.back().key), normalKeys.end, cipherKeysCtx);
|
||||
checkEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end, cipherKeysCtx);
|
||||
|
||||
fmt::print("Snapshot format test done!\n");
|
||||
|
||||
return Void();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,6 +71,13 @@ if(WITH_AWS_BACKUP)
|
|||
include(awssdk)
|
||||
endif()
|
||||
|
||||
find_package(ZLIB)
|
||||
if(ZLIB_FOUND)
|
||||
add_compile_definitions(ZLIB_LIB_SUPPORTED)
|
||||
else()
|
||||
message(STATUS "ZLIB package not found")
|
||||
endif()
|
||||
|
||||
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
|
||||
target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h)
|
||||
|
@ -89,7 +96,7 @@ if(WIN32)
|
|||
add_dependencies(fdbclient_sampling_actors fdbclient_actors)
|
||||
endif()
|
||||
|
||||
add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS ${FDBCLIENT_SRCS} LinkTest.cpp ADDL_SRCS ${options_srcs})
|
||||
add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS LinkTest.cpp)
|
||||
target_link_libraries(fdbclientlinktest PRIVATE fdbclient rapidxml) # re-link rapidxml due to private link interface
|
||||
|
||||
if(BUILD_AZURE_BACKUP)
|
||||
|
@ -101,3 +108,4 @@ if(BUILD_AWS_BACKUP)
|
|||
target_link_libraries(fdbclient PUBLIC awssdk_target)
|
||||
target_link_libraries(fdbclient_sampling PUBLIC awssdk_target)
|
||||
endif()
|
||||
|
||||
|
|
|
@ -363,7 +363,7 @@ struct BackupRangeTaskFunc : TaskFuncBase {
|
|||
if ((!prevAdjacent || !nextAdjacent) &&
|
||||
rangeCount > ((prevAdjacent || nextAdjacent) ? CLIENT_KNOBS->BACKUP_MAP_KEY_UPPER_LIMIT
|
||||
: CLIENT_KNOBS->BACKUP_MAP_KEY_LOWER_LIMIT)) {
|
||||
TEST(true); // range insert delayed because too versionMap is too large
|
||||
CODE_PROBE(true, "range insert delayed because too versionMap is too large");
|
||||
|
||||
if (rangeCount > CLIENT_KNOBS->BACKUP_MAP_KEY_UPPER_LIMIT)
|
||||
TraceEvent(SevWarnAlways, "DBA_KeyRangeMapTooLarge").log();
|
||||
|
@ -2780,7 +2780,7 @@ public:
|
|||
Version destVersion = wait(tr3.getReadVersion());
|
||||
TraceEvent("DBA_SwitchoverVersionUpgrade").detail("Src", commitVersion).detail("Dest", destVersion);
|
||||
if (destVersion <= commitVersion) {
|
||||
TEST(true); // Forcing dest backup cluster to higher version
|
||||
CODE_PROBE(true, "Forcing dest backup cluster to higher version");
|
||||
tr3.set(minRequiredCommitVersionKey, BinaryWriter::toValue(commitVersion + 1, Unversioned()));
|
||||
wait(tr3.commit());
|
||||
} else {
|
||||
|
@ -2933,7 +2933,7 @@ public:
|
|||
Version applied = BinaryReader::fromStringRef<Version>(lastApplied.get(), Unversioned());
|
||||
TraceEvent("DBA_AbortVersionUpgrade").detail("Src", applied).detail("Dest", current);
|
||||
if (current <= applied) {
|
||||
TEST(true); // Upgrading version of local database.
|
||||
CODE_PROBE(true, "Upgrading version of local database.");
|
||||
// The +1 is because we want to make sure that a versionstamped operation can't reuse
|
||||
// the same version as an already-applied transaction.
|
||||
tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(applied + 1, Unversioned()));
|
||||
|
|
|
@ -822,7 +822,7 @@ struct AbortFiveZeroBackupTask : TaskFuncBase {
|
|||
state FileBackupAgent backupAgent;
|
||||
state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString();
|
||||
|
||||
TEST(true); // Canceling old backup task
|
||||
CODE_PROBE(true, "Canceling old backup task");
|
||||
|
||||
TraceEvent(SevInfo, "FileBackupCancelOldTask")
|
||||
.detail("Task", task->params[Task::reservedTaskParamKeyType])
|
||||
|
@ -908,7 +908,7 @@ struct AbortFiveOneBackupTask : TaskFuncBase {
|
|||
state BackupConfig config(task);
|
||||
state std::string tagName = wait(config.tag().getOrThrow(tr));
|
||||
|
||||
TEST(true); // Canceling 5.1 backup task
|
||||
CODE_PROBE(true, "Canceling 5.1 backup task");
|
||||
|
||||
TraceEvent(SevInfo, "FileBackupCancelFiveOneTask")
|
||||
.detail("Task", task->params[Task::reservedTaskParamKeyType])
|
||||
|
@ -1245,7 +1245,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
|
|||
// If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish it.
|
||||
if (values.second != outVersion || done) {
|
||||
if (outFile) {
|
||||
TEST(outVersion != invalidVersion); // Backup range task wrote multiple versions
|
||||
CODE_PROBE(outVersion != invalidVersion, "Backup range task wrote multiple versions");
|
||||
state Key nextKey = done ? endKey : keyAfter(lastKey);
|
||||
wait(rangeFile.writeKey(nextKey));
|
||||
|
||||
|
@ -4098,7 +4098,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
|
|||
.detail("RestoreVersion", restoreVersion)
|
||||
.detail("Dest", destVersion);
|
||||
if (destVersion <= restoreVersion) {
|
||||
TEST(true); // Forcing restored cluster to higher version
|
||||
CODE_PROBE(true, "Forcing restored cluster to higher version");
|
||||
tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion + 1, Unversioned()));
|
||||
wait(tr->commit());
|
||||
} else {
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "fdbclient/GenericManagementAPI.actor.h"
|
||||
#include "fmt/format.h"
|
||||
#include "fdbclient/Knobs.h"
|
||||
#include "flow/Arena.h"
|
||||
|
@ -1002,8 +1003,8 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
|
|||
TraceEvent("AttemptingQuorumChange")
|
||||
.detail("FromCS", oldClusterConnectionString.toString())
|
||||
.detail("ToCS", newClusterConnectionString.toString());
|
||||
TEST(oldClusterKeyName != newClusterKeyName); // Quorum change with new name
|
||||
TEST(oldClusterKeyName == newClusterKeyName); // Quorum change with unchanged name
|
||||
CODE_PROBE(oldClusterKeyName != newClusterKeyName, "Quorum change with new name");
|
||||
CODE_PROBE(oldClusterKeyName == newClusterKeyName, "Quorum change with unchanged name");
|
||||
|
||||
state std::vector<Future<Optional<LeaderInfo>>> leaderServers;
|
||||
state ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(
|
||||
|
@ -2461,6 +2462,21 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
|
|||
}
|
||||
}
|
||||
|
||||
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
|
||||
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
auto key = storageQuotaKey(tenantName);
|
||||
tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned()));
|
||||
}
|
||||
|
||||
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
|
||||
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
|
||||
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
|
||||
if (!v.present()) {
|
||||
return Optional<uint64_t>();
|
||||
}
|
||||
return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned());
|
||||
}
|
||||
|
||||
std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {
|
||||
// Note: the error message here should not be changed if possible
|
||||
// If you do change the message here,
|
||||
|
|
|
@ -987,8 +987,9 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
|
|||
successIndex = index;
|
||||
allConnectionsFailed = false;
|
||||
} else {
|
||||
TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator cant talk to cluster controller
|
||||
TEST(rep.getError().code() == error_code_lookup_failed); // Coordinator hostname resolving failure
|
||||
CODE_PROBE(rep.getError().code() == error_code_failed_to_progress,
|
||||
"Coordinator cant talk to cluster controller");
|
||||
CODE_PROBE(rep.getError().code() == error_code_lookup_failed, "Coordinator hostname resolving failure");
|
||||
TraceEvent("MonitorProxiesConnectFailed")
|
||||
.detail("Error", rep.getError().name())
|
||||
.detail("Coordinator", clientLeaderServer.getAddressString());
|
||||
|
|
|
@ -170,7 +170,7 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe
|
|||
if (result->second.id() == tssi.id()) {
|
||||
metrics = tssMetrics[tssi.id()];
|
||||
} else {
|
||||
TEST(true); // SS now maps to new TSS! This will probably never happen in practice
|
||||
CODE_PROBE(true, "SS now maps to new TSS! This will probably never happen in practice");
|
||||
tssMetrics.erase(result->second.id());
|
||||
metrics = makeReference<TSSMetrics>();
|
||||
tssMetrics[tssi.id()] = metrics;
|
||||
|
@ -444,7 +444,7 @@ void DatabaseContext::validateVersion(Version version) const {
|
|||
throw client_invalid_operation();
|
||||
}
|
||||
if (switchable && version < minAcceptableReadVersion) {
|
||||
TEST(true); // Attempted to read a version lower than any this client has seen from the current cluster
|
||||
CODE_PROBE(true, "Attempted to read a version lower than any this client has seen from the current cluster");
|
||||
throw transaction_too_old();
|
||||
}
|
||||
|
||||
|
@ -1114,8 +1114,8 @@ ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
|
|||
state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH;
|
||||
TraceEvent(SevWarnAlways, quarantine ? "TSS_QuarantineMismatch" : "TSS_KillMismatch")
|
||||
.detail("TSSID", data.first.toString());
|
||||
TEST(quarantine); // Quarantining TSS because it got mismatch
|
||||
TEST(!quarantine); // Killing TSS because it got mismatch
|
||||
CODE_PROBE(quarantine, "Quarantining TSS because it got mismatch");
|
||||
CODE_PROBE(!quarantine, "Killing TSS because it got mismatch");
|
||||
|
||||
tr = makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(cx)));
|
||||
state int tries = 0;
|
||||
|
@ -1154,7 +1154,7 @@ ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
|
|||
// clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx
|
||||
tr = makeReference<ReadYourWritesTransaction>();
|
||||
} else {
|
||||
TEST(true); // Not handling TSS with mismatch because it's already gone
|
||||
CODE_PROBE(true, "Not handling TSS with mismatch because it's already gone");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1860,7 +1860,7 @@ bool DatabaseContext::getCachedLocations(const Optional<TenantName>& tenantName,
|
|||
loop {
|
||||
auto r = reverse ? end : begin;
|
||||
if (!r->value()) {
|
||||
TEST(result.size()); // had some but not all cached locations
|
||||
CODE_PROBE(result.size(), "had some but not all cached locations");
|
||||
result.clear();
|
||||
return false;
|
||||
}
|
||||
|
@ -1907,7 +1907,7 @@ Reference<LocationInfo> DatabaseContext::setCachedLocation(const Optional<Tenant
|
|||
int maxEvictionAttempts = 100, attempts = 0;
|
||||
auto loc = makeReference<LocationInfo>(serverRefs);
|
||||
while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) {
|
||||
TEST(true); // NativeAPI storage server locationCache entry evicted
|
||||
CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted");
|
||||
attempts++;
|
||||
auto r = locationCache.randomRange();
|
||||
Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it
|
||||
|
@ -2091,7 +2091,7 @@ Future<Void> DatabaseContext::onConnected() {
|
|||
|
||||
ACTOR static Future<Void> switchConnectionRecordImpl(Reference<IClusterConnectionRecord> connRecord,
|
||||
DatabaseContext* self) {
|
||||
TEST(true); // Switch connection file
|
||||
CODE_PROBE(true, "Switch connection file");
|
||||
TraceEvent("SwitchConnectionRecord")
|
||||
.detail("ClusterFile", connRecord->toString())
|
||||
.detail("ConnectionString", connRecord->getConnectionString().toString());
|
||||
|
@ -2152,7 +2152,7 @@ void DatabaseContext::expireThrottles() {
|
|||
for (auto& priorityItr : throttledTags) {
|
||||
for (auto tagItr = priorityItr.second.begin(); tagItr != priorityItr.second.end();) {
|
||||
if (tagItr->second.expired()) {
|
||||
TEST(true); // Expiring client throttle
|
||||
CODE_PROBE(true, "Expiring client throttle");
|
||||
tagItr = priorityItr.second.erase(tagItr);
|
||||
} else {
|
||||
++tagItr;
|
||||
|
@ -2638,7 +2638,7 @@ bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const {
|
|||
if (proxy.id() == proxyId)
|
||||
return true;
|
||||
}
|
||||
TEST(true); // stale GRV proxy detected
|
||||
CODE_PROBE(true, "stale GRV proxy detected");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -2875,6 +2875,7 @@ ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,
|
|||
auto locationInfo =
|
||||
cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second);
|
||||
updateTssMappings(cx, rep);
|
||||
updateTagMappings(cx, rep);
|
||||
|
||||
return KeyRangeLocationInfo(
|
||||
rep.tenantEntry,
|
||||
|
@ -3629,13 +3630,13 @@ ACTOR Future<Version> watchValue(Database cx, Reference<const WatchParameters> p
|
|||
wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, parameters->taskID));
|
||||
} else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) {
|
||||
// clang-format off
|
||||
TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead
|
||||
TEST(e.code() == error_code_process_behind); // The storage servers are all behind
|
||||
CODE_PROBE(e.code() == error_code_watch_cancelled, "Too many watches on the storage server, poll for changes instead");
|
||||
CODE_PROBE(e.code() == error_code_process_behind, "The storage servers are all behind");
|
||||
// clang-format on
|
||||
wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, parameters->taskID));
|
||||
} else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case
|
||||
// it was cancelled
|
||||
TEST(true); // A watch timed out
|
||||
CODE_PROBE(true, "A watch timed out");
|
||||
wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID));
|
||||
} else {
|
||||
state Error err = e;
|
||||
|
@ -3667,7 +3668,8 @@ ACTOR Future<Void> watchStorageServerResp(int64_t tenantId, Key key, Database cx
|
|||
}
|
||||
// ABA happens
|
||||
else {
|
||||
TEST(true); // ABA issue where the version returned from the server is less than the version in the map
|
||||
CODE_PROBE(true,
|
||||
"ABA issue where the version returned from the server is less than the version in the map");
|
||||
|
||||
// case 2: version_1 < version_2 and future_count == 1
|
||||
if (metadata->watchPromise.getFutureReferenceCount() == 1) {
|
||||
|
@ -3758,7 +3760,8 @@ Future<Void> getWatchFuture(Database cx, Reference<WatchParameters> parameters)
|
|||
// case 3: val_1 != val_2 && version_2 > version_1 (received watch with different value and a higher version so
|
||||
// recreate in SS)
|
||||
else if (parameters->version > metadata->parameters->version) {
|
||||
TEST(true); // Setting a watch that has a different value than the one in the map but a higher version (newer)
|
||||
CODE_PROBE(true,
|
||||
"Setting a watch that has a different value than the one in the map but a higher version (newer)");
|
||||
cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key);
|
||||
|
||||
metadata->watchPromise.send(parameters->version);
|
||||
|
@ -3773,10 +3776,10 @@ Future<Void> getWatchFuture(Database cx, Reference<WatchParameters> parameters)
|
|||
}
|
||||
// case 5: val_1 != val_2 && version_1 == version_2 (received watch with different value but same version)
|
||||
else if (metadata->parameters->version == parameters->version) {
|
||||
TEST(true); // Setting a watch which has a different value than the one in the map but the same version
|
||||
CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but the same version");
|
||||
return sameVersionDiffValue(cx, parameters);
|
||||
}
|
||||
TEST(true); // Setting a watch which has a different value than the one in the map but a lower version (older)
|
||||
CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but a lower version (older)");
|
||||
|
||||
// case 4: val_1 != val_2 && version_2 < version_1
|
||||
return Void();
|
||||
|
@ -3970,7 +3973,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
|
|||
.detail("BlockBytes", rep.data.expectedSize());
|
||||
ASSERT(false);
|
||||
}
|
||||
TEST(true); // GetKeyValuesFamilyReply.more in getExactRange
|
||||
CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getExactRange");
|
||||
// Make next request to the same shard with a beginning key just after the last key returned
|
||||
if (reverse)
|
||||
locations[shard].range =
|
||||
|
@ -3981,7 +3984,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
|
|||
}
|
||||
|
||||
if (!more || locations[shard].range.empty()) {
|
||||
TEST(true); // getExactrange (!more || locations[shard].first.empty())
|
||||
CODE_PROBE(true, "getExactrange (!more || locations[shard].first.empty())");
|
||||
if (shard == locations.size() - 1) {
|
||||
const KeyRangeRef& range = locations[shard].range;
|
||||
KeyRef begin = reverse ? keys.begin : range.end;
|
||||
|
@ -3991,7 +3994,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
|
|||
output.more = false;
|
||||
return output;
|
||||
}
|
||||
TEST(true); // Multiple requests of key locations
|
||||
CODE_PROBE(true, "Multiple requests of key locations");
|
||||
|
||||
keys = KeyRangeRef(begin, end);
|
||||
break;
|
||||
|
@ -4431,7 +4434,7 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
|
|||
|
||||
if (!rep.more) {
|
||||
ASSERT(modifiedSelectors);
|
||||
TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange
|
||||
CODE_PROBE(true, "!GetKeyValuesFamilyReply.more and modifiedSelectors in getRange");
|
||||
|
||||
if (!rep.data.size()) {
|
||||
RangeResultFamily result = wait(
|
||||
|
@ -4455,7 +4458,7 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
|
|||
else
|
||||
begin = firstGreaterOrEqual(shard.end);
|
||||
} else {
|
||||
TEST(true); // GetKeyValuesFamilyReply.more in getRange
|
||||
CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getRange");
|
||||
if (reverse)
|
||||
end = firstGreaterOrEqual(output[output.size() - 1].key);
|
||||
else
|
||||
|
@ -4574,7 +4577,7 @@ static Future<Void> tssStreamComparison(Request request,
|
|||
} else {
|
||||
tssData.metrics->ssError(e.code());
|
||||
}
|
||||
TEST(e.code() != error_code_end_of_stream); // SS got error in TSS stream comparison
|
||||
CODE_PROBE(e.code() != error_code_end_of_stream, "SS got error in TSS stream comparison");
|
||||
}
|
||||
|
||||
state double sleepTime = std::max(startTime + FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT - now(), 0.0);
|
||||
|
@ -4586,7 +4589,7 @@ static Future<Void> tssStreamComparison(Request request,
|
|||
}
|
||||
when(wait(delay(sleepTime))) {
|
||||
++tssData.metrics->tssTimeouts;
|
||||
TEST(true); // Got TSS timeout in stream comparison
|
||||
CODE_PROBE(true, "Got TSS timeout in stream comparison");
|
||||
}
|
||||
}
|
||||
} catch (Error& e) {
|
||||
|
@ -4601,7 +4604,7 @@ static Future<Void> tssStreamComparison(Request request,
|
|||
} else {
|
||||
tssData.metrics->tssError(e.code());
|
||||
}
|
||||
TEST(e.code() != error_code_end_of_stream); // TSS got error in TSS stream comparison
|
||||
CODE_PROBE(e.code() != error_code_end_of_stream, "TSS got error in TSS stream comparison");
|
||||
}
|
||||
|
||||
if (!ssEndOfStream || !tssEndOfStream) {
|
||||
|
@ -4614,11 +4617,11 @@ static Future<Void> tssStreamComparison(Request request,
|
|||
// FIXME: this code is pretty much identical to LoadBalance.h
|
||||
// TODO could add team check logic in if we added synchronous way to turn this into a fixed getRange request
|
||||
// and send it to the whole team and compare? I think it's fine to skip that for streaming though
|
||||
TEST(ssEndOfStream != tssEndOfStream); // SS or TSS stream finished early!
|
||||
CODE_PROBE(ssEndOfStream != tssEndOfStream, "SS or TSS stream finished early!");
|
||||
|
||||
// skip tss comparison if both are end of stream
|
||||
if ((!ssEndOfStream || !tssEndOfStream) && !TSS_doCompare(ssReply.get(), tssReply.get())) {
|
||||
TEST(true); // TSS mismatch in stream comparison
|
||||
CODE_PROBE(true, "TSS mismatch in stream comparison");
|
||||
TraceEvent mismatchEvent(
|
||||
(g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations)
|
||||
? SevWarnAlways
|
||||
|
@ -4630,10 +4633,10 @@ static Future<Void> tssStreamComparison(Request request,
|
|||
if (tssData.metrics->shouldRecordDetailedMismatch()) {
|
||||
TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get());
|
||||
|
||||
TEST(FLOW_KNOBS
|
||||
->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Full TSS Mismatch in stream comparison
|
||||
TEST(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Partial TSS Mismatch in stream
|
||||
// comparison and storing the rest in FDB
|
||||
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
|
||||
"Tracing Full TSS Mismatch in stream comparison");
|
||||
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
|
||||
"Tracing Partial TSS Mismatch in stream comparison and storing the rest in FDB");
|
||||
|
||||
if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) {
|
||||
mismatchEvent.disable();
|
||||
|
@ -4673,7 +4676,7 @@ maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream<R
|
|||
Optional<TSSEndpointData> tssData = model->getTssData(ssStream->getEndpoint().token.first());
|
||||
|
||||
if (tssData.present()) {
|
||||
TEST(true); // duplicating stream to TSS
|
||||
CODE_PROBE(true, "duplicating stream to TSS");
|
||||
resetReply(req);
|
||||
// FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication
|
||||
RequestStream<Request> tssRequestStream(tssData.get().endpoint);
|
||||
|
@ -4873,7 +4876,7 @@ ACTOR Future<Void> getRangeStreamFragment(Reference<TransactionState> trState,
|
|||
.detail("BlockBytes", rep.data.expectedSize());
|
||||
ASSERT(false);
|
||||
}
|
||||
TEST(true); // GetKeyValuesStreamReply.more in getRangeStream
|
||||
CODE_PROBE(true, "GetKeyValuesStreamReply.more in getRangeStream");
|
||||
// Make next request to the same shard with a beginning key just after the last key returned
|
||||
if (reverse)
|
||||
locations[shard].range =
|
||||
|
@ -5271,7 +5274,7 @@ ACTOR Future<Void> watch(Reference<Watch> watch,
|
|||
when(wait(watch->watchFuture)) { break; }
|
||||
|
||||
when(wait(cx->connectionFileChanged())) {
|
||||
TEST(true); // Recreated a watch after switch
|
||||
CODE_PROBE(true, "Recreated a watch after switch");
|
||||
cx->clearWatchMetadata();
|
||||
watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion,
|
||||
tenantInfo,
|
||||
|
@ -5444,18 +5447,18 @@ Future<RangeResultFamily> Transaction::getRangeInternal(const KeySelector& begin
|
|||
|
||||
KeySelector b = begin;
|
||||
if (b.orEqual) {
|
||||
TEST(true); // Native begin orEqual==true
|
||||
CODE_PROBE(true, "Native begin orEqual==true");
|
||||
b.removeOrEqual(b.arena());
|
||||
}
|
||||
|
||||
KeySelector e = end;
|
||||
if (e.orEqual) {
|
||||
TEST(true); // Native end orEqual==true
|
||||
CODE_PROBE(true, "Native end orEqual==true");
|
||||
e.removeOrEqual(e.arena());
|
||||
}
|
||||
|
||||
if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
|
||||
TEST(true); // Native range inverted
|
||||
CODE_PROBE(true, "Native range inverted");
|
||||
return RangeResultFamily();
|
||||
}
|
||||
|
||||
|
@ -5518,18 +5521,18 @@ Future<Void> Transaction::getRangeStream(const PromiseStream<RangeResult>& resul
|
|||
|
||||
KeySelector b = begin;
|
||||
if (b.orEqual) {
|
||||
TEST(true); // Native stream begin orEqual==true
|
||||
CODE_PROBE(true, "Native stream begin orEqual==true");
|
||||
b.removeOrEqual(b.arena());
|
||||
}
|
||||
|
||||
KeySelector e = end;
|
||||
if (e.orEqual) {
|
||||
TEST(true); // Native stream end orEqual==true
|
||||
CODE_PROBE(true, "Native stream end orEqual==true");
|
||||
e.removeOrEqual(e.arena());
|
||||
}
|
||||
|
||||
if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
|
||||
TEST(true); // Native stream range inverted
|
||||
CODE_PROBE(true, "Native stream range inverted");
|
||||
results.sendError(end_of_stream());
|
||||
return Void();
|
||||
}
|
||||
|
@ -5632,7 +5635,7 @@ void Transaction::atomicOp(const KeyRef& key,
|
|||
if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
|
||||
t.write_conflict_ranges.push_back(req.arena, r);
|
||||
|
||||
TEST(true); // NativeAPI atomic operation
|
||||
CODE_PROBE(true, "NativeAPI atomic operation");
|
||||
}
|
||||
|
||||
void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRange) {
|
||||
|
@ -5718,7 +5721,7 @@ double Transaction::getBackoff(int errCode) {
|
|||
if (priorityItr != trState->cx->throttledTags.end()) {
|
||||
auto tagItr = priorityItr->second.find(tag);
|
||||
if (tagItr != priorityItr->second.end()) {
|
||||
TEST(true); // Returning throttle backoff
|
||||
CODE_PROBE(true, "Returning throttle backoff");
|
||||
returnedBackoff = std::max(
|
||||
returnedBackoff,
|
||||
std::min(CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL, tagItr->second.throttleDuration()));
|
||||
|
@ -6249,7 +6252,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
|
|||
KeyRangeRef selfConflictingRange =
|
||||
intersects(req.transaction.write_conflict_ranges, req.transaction.read_conflict_ranges).get();
|
||||
|
||||
TEST(true); // Waiting for dummy transaction to report commit_unknown_result
|
||||
CODE_PROBE(true, "Waiting for dummy transaction to report commit_unknown_result");
|
||||
|
||||
wait(commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin)));
|
||||
}
|
||||
|
@ -6587,7 +6590,7 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
|
|||
if (value.get().size() != 33) {
|
||||
throw invalid_option_value();
|
||||
}
|
||||
TEST(true); // Adding link in FDBTransactionOptions::SPAN_PARENT
|
||||
CODE_PROBE(true, "Adding link in FDBTransactionOptions::SPAN_PARENT");
|
||||
span.setParent(BinaryReader::fromStringRef<SpanContext>(value.get(), IncludeVersion()));
|
||||
break;
|
||||
|
||||
|
@ -6667,10 +6670,10 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
|
|||
for (auto& tag : tags) {
|
||||
auto itr = v.tagThrottleInfo.find(tag.first);
|
||||
if (itr == v.tagThrottleInfo.end()) {
|
||||
TEST(true); // Removing client throttle
|
||||
CODE_PROBE(true, "Removing client throttle");
|
||||
priorityThrottledTags.erase(tag.first);
|
||||
} else {
|
||||
TEST(true); // Setting client throttle
|
||||
CODE_PROBE(true, "Setting client throttle");
|
||||
auto result = priorityThrottledTags.try_emplace(tag.first, itr->second);
|
||||
if (!result.second) {
|
||||
result.first->second.update(itr->second);
|
||||
|
@ -6853,7 +6856,7 @@ ACTOR Future<Version> extractReadVersion(Reference<TransactionState> trState,
|
|||
if (itr->second.expired()) {
|
||||
priorityThrottledTags.erase(itr);
|
||||
} else if (itr->second.throttleDuration() > 0) {
|
||||
TEST(true); // throttling transaction after getting read version
|
||||
CODE_PROBE(true, "throttling transaction after getting read version");
|
||||
++trState->cx->transactionReadVersionsThrottled;
|
||||
throw tag_throttled();
|
||||
}
|
||||
|
@ -6959,12 +6962,12 @@ Future<Version> Transaction::getReadVersion(uint32_t flags) {
|
|||
}
|
||||
|
||||
if (maxThrottleDelay > 0.0 && !canRecheck) { // TODO: allow delaying?
|
||||
TEST(true); // Throttling tag before GRV request
|
||||
CODE_PROBE(true, "Throttling tag before GRV request");
|
||||
++trState->cx->transactionReadVersionsThrottled;
|
||||
readVersion = tag_throttled();
|
||||
return readVersion;
|
||||
} else {
|
||||
TEST(maxThrottleDelay > 0.0); // Rechecking throttle
|
||||
CODE_PROBE(maxThrottleDelay > 0.0, "Rechecking throttle");
|
||||
}
|
||||
|
||||
for (auto& tag : trState->options.tags) {
|
||||
|
@ -7343,10 +7346,10 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
|
|||
wait(waitForAll(fReplies));
|
||||
|
||||
if (nLocs == 1) {
|
||||
TEST(true); // Single-shard read hot range request
|
||||
CODE_PROBE(true, "Single-shard read hot range request");
|
||||
return fReplies[0].get().readHotRanges;
|
||||
} else {
|
||||
TEST(true); // Multi-shard read hot range request
|
||||
CODE_PROBE(true, "Multi-shard read hot range request");
|
||||
Standalone<VectorRef<ReadHotRangeWithMetrics>> results;
|
||||
for (int i = 0; i < nLocs; i++) {
|
||||
results.append(results.arena(),
|
||||
|
@ -7855,7 +7858,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
|
|||
if (!results.empty() && results.back().keyRange.end != chunk.keyRange.begin) {
|
||||
ASSERT(results.back().keyRange.end > chunk.keyRange.begin);
|
||||
ASSERT(results.back().keyRange.end <= chunk.keyRange.end);
|
||||
TEST(true); // Merge while reading granule range
|
||||
CODE_PROBE(true, "Merge while reading granule range");
|
||||
while (!results.empty() && results.back().keyRange.begin >= chunk.keyRange.begin) {
|
||||
// TODO: we can't easily un-depend the arenas for these guys, but that's ok as this
|
||||
// should be rare
|
||||
|
@ -8980,8 +8983,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
|
|||
state std::vector<Future<Void>> onErrors(interfs.size());
|
||||
state std::vector<MutationAndVersionStream> streams(interfs.size());
|
||||
|
||||
TEST(interfs.size() > 10); // Large change feed merge cursor
|
||||
TEST(interfs.size() > 100); // Very large change feed merge cursor
|
||||
CODE_PROBE(interfs.size() > 10, "Large change feed merge cursor");
|
||||
CODE_PROBE(interfs.size() > 100, "Very large change feed merge cursor");
|
||||
|
||||
state UID mergeCursorUID = UID();
|
||||
state std::vector<UID> debugUIDs;
|
||||
|
@ -9305,13 +9308,13 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
|
|||
interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]),
|
||||
locations[i].range & range);
|
||||
}
|
||||
TEST(true); // Change feed merge cursor
|
||||
CODE_PROBE(true, "Change feed merge cursor");
|
||||
// TODO (jslocum): validate connectionFileChanged behavior
|
||||
wait(
|
||||
mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
|
||||
cx->connectionFileChanged());
|
||||
} else {
|
||||
TEST(true); // Change feed single cursor
|
||||
CODE_PROBE(true, "Change feed single cursor");
|
||||
StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
|
||||
wait(singleChangeFeedStream(
|
||||
db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
|
||||
|
@ -9327,7 +9330,7 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
|
|||
results->streams.clear();
|
||||
results->storageData.clear();
|
||||
if (e.code() == error_code_change_feed_popped) {
|
||||
TEST(true); // getChangeFeedStreamActor got popped
|
||||
CODE_PROBE(true, "getChangeFeedStreamActor got popped");
|
||||
results->mutations.sendError(e);
|
||||
results->refresh.sendError(e);
|
||||
} else {
|
||||
|
|
|
@ -199,7 +199,7 @@ class GetGenerationQuorum {
|
|||
}
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_failed_to_reach_quorum) {
|
||||
TEST(true); // Failed to reach quorum getting generation
|
||||
CODE_PROBE(true, "Failed to reach quorum getting generation");
|
||||
wait(delayJittered(
|
||||
std::clamp(0.005 * (1 << retries), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND)));
|
||||
++retries;
|
||||
|
|
|
@ -1213,7 +1213,7 @@ public:
|
|||
// isolation support. But it is not default and is rarely used. So we disallow it until we have thorough test
|
||||
// coverage for it.)
|
||||
if (snapshot) {
|
||||
TEST(true); // getMappedRange not supported for snapshot.
|
||||
CODE_PROBE(true, "getMappedRange not supported for snapshot.");
|
||||
throw unsupported_operation();
|
||||
}
|
||||
// For now, getMappedRange requires read-your-writes being NOT disabled. But the support of RYW is limited
|
||||
|
@ -1222,7 +1222,7 @@ public:
|
|||
// which returns the written value transparently. In another word, it makes sure not break RYW semantics without
|
||||
// actually implementing reading from the writes.
|
||||
if (ryw->options.readYourWritesDisabled) {
|
||||
TEST(true); // getMappedRange not supported for read-your-writes disabled.
|
||||
CODE_PROBE(true, "getMappedRange not supported for read-your-writes disabled.");
|
||||
throw unsupported_operation();
|
||||
}
|
||||
|
||||
|
@ -1242,7 +1242,7 @@ public:
|
|||
++it;
|
||||
|
||||
ASSERT(itCopy->value.size());
|
||||
TEST(itCopy->value.size() > 1); // Multiple watches on the same key triggered by RYOW
|
||||
CODE_PROBE(itCopy->value.size() > 1, "Multiple watches on the same key triggered by RYOW");
|
||||
|
||||
for (int i = 0; i < itCopy->value.size(); i++) {
|
||||
if (itCopy->value[i]->onChangeTrigger.isSet()) {
|
||||
|
@ -1535,11 +1535,11 @@ ACTOR Future<RangeResult> getWorkerInterfaces(Reference<IClusterConnectionRecord
|
|||
}
|
||||
|
||||
Future<Optional<Value>> ReadYourWritesTransaction::get(const Key& key, Snapshot snapshot) {
|
||||
TEST(true); // ReadYourWritesTransaction::get
|
||||
CODE_PROBE(true, "ReadYourWritesTransaction::get");
|
||||
|
||||
if (getDatabase()->apiVersionAtLeast(630)) {
|
||||
if (specialKeys.contains(key)) {
|
||||
TEST(true); // Special keys get
|
||||
CODE_PROBE(true, "Special keys get");
|
||||
return getDatabase()->specialKeySpace->get(this, key);
|
||||
}
|
||||
} else {
|
||||
|
@ -1622,7 +1622,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
|
|||
if (getDatabase()->apiVersionAtLeast(630)) {
|
||||
if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
|
||||
end.getKey() <= specialKeys.end) {
|
||||
TEST(true); // Special key space get range
|
||||
CODE_PROBE(true, "Special key space get range");
|
||||
return getDatabase()->specialKeySpace->getRange(this, begin, end, limits, reverse);
|
||||
}
|
||||
} else {
|
||||
|
@ -1648,7 +1648,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
|
|||
|
||||
// This optimization prevents nullptr operations from being added to the conflict range
|
||||
if (limits.isReached()) {
|
||||
TEST(true); // RYW range read limit 0
|
||||
CODE_PROBE(true, "RYW range read limit 0");
|
||||
return RangeResult();
|
||||
}
|
||||
|
||||
|
@ -1662,7 +1662,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
|
|||
end.removeOrEqual(end.arena());
|
||||
|
||||
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
|
||||
TEST(true); // RYW range inverted
|
||||
CODE_PROBE(true, "RYW range inverted");
|
||||
return RangeResult();
|
||||
}
|
||||
|
||||
|
@ -1692,7 +1692,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
|
|||
if (getDatabase()->apiVersionAtLeast(630)) {
|
||||
if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
|
||||
end.getKey() <= specialKeys.end) {
|
||||
TEST(true); // Special key space get range (getMappedRange)
|
||||
CODE_PROBE(true, "Special key space get range (getMappedRange)");
|
||||
throw client_invalid_operation(); // Not support special keys.
|
||||
}
|
||||
} else {
|
||||
|
@ -1714,7 +1714,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
|
|||
|
||||
// This optimization prevents nullptr operations from being added to the conflict range
|
||||
if (limits.isReached()) {
|
||||
TEST(true); // RYW range read limit 0 (getMappedRange)
|
||||
CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)");
|
||||
return MappedRangeResult();
|
||||
}
|
||||
|
||||
|
@ -1728,7 +1728,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
|
|||
end.removeOrEqual(end.arena());
|
||||
|
||||
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
|
||||
TEST(true); // RYW range inverted (getMappedRange)
|
||||
CODE_PROBE(true, "RYW range inverted (getMappedRange)");
|
||||
return MappedRangeResult();
|
||||
}
|
||||
|
||||
|
@ -1998,7 +1998,7 @@ void ReadYourWritesTransaction::setToken(uint64_t token) {
|
|||
}
|
||||
|
||||
RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRangeRef kr) {
|
||||
TEST(true); // Special keys read conflict range
|
||||
CODE_PROBE(true, "Special keys read conflict range");
|
||||
ASSERT(readConflictRangeKeysRange.contains(kr));
|
||||
ASSERT(!tr.trState->options.checkWritesEnabled);
|
||||
RangeResult result;
|
||||
|
@ -2040,7 +2040,7 @@ RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRange
|
|||
}
|
||||
|
||||
RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRangeRef kr) {
|
||||
TEST(true); // Special keys write conflict range
|
||||
CODE_PROBE(true, "Special keys write conflict range");
|
||||
ASSERT(writeConflictRangeKeysRange.contains(kr));
|
||||
RangeResult result;
|
||||
|
||||
|
@ -2145,7 +2145,7 @@ void ReadYourWritesTransaction::atomicOp(const KeyRef& key, const ValueRef& oper
|
|||
}
|
||||
|
||||
if (operationType == MutationRef::SetVersionstampedKey) {
|
||||
TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled
|
||||
CODE_PROBE(options.readYourWritesDisabled, "SetVersionstampedKey without ryw enabled");
|
||||
// this does validation of the key and needs to be performed before the readYourWritesDisabled path
|
||||
KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
|
||||
versionStampKeys.push_back(arena, k);
|
||||
|
|
|
@ -881,12 +881,18 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
// Cluster recovery
|
||||
init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master" );
|
||||
|
||||
// encrypt key proxy
|
||||
// Encryption
|
||||
init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) { ENABLE_ENCRYPTION = deterministicRandom()->coinflip(); }
|
||||
init( ENCRYPTION_MODE, "AES-256-CTR" );
|
||||
init( SIM_KMS_MAX_KEYS, 4096 );
|
||||
init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 );
|
||||
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); }
|
||||
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); }
|
||||
init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_ENCRYPTION = (ENABLE_ENCRYPTION && deterministicRandom()->coinflip()); }
|
||||
|
||||
// encrypt key proxy
|
||||
init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); }
|
||||
init( BLOB_GRANULE_COMPRESSION_FILTER, "GZIP" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = "NONE"; }
|
||||
|
||||
|
||||
// KMS connector type
|
||||
init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" );
|
||||
|
|
|
@ -364,12 +364,12 @@ ACTOR Future<RangeResult> SpecialKeySpace::getRangeAggregationActor(SpecialKeySp
|
|||
// Handle all corner cases like what RYW does
|
||||
// return if range inverted
|
||||
if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
|
||||
TEST(true); // inverted range
|
||||
CODE_PROBE(true, "inverted range");
|
||||
return RangeResultRef(false, false);
|
||||
}
|
||||
// If touches begin or end, return with readToBegin and readThroughEnd flags
|
||||
if (begin.getKey() == moduleBoundary.end || end.getKey() == moduleBoundary.begin) {
|
||||
TEST(true); // query touches begin or end
|
||||
CODE_PROBE(true, "query touches begin or end");
|
||||
return result;
|
||||
}
|
||||
state RangeMap<Key, SpecialKeyRangeReadImpl*, KeyRangeRef>::Ranges ranges =
|
||||
|
@ -453,7 +453,7 @@ Future<RangeResult> SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw,
|
|||
if (!limits.isValid())
|
||||
return range_limits_invalid();
|
||||
if (limits.isReached()) {
|
||||
TEST(true); // read limit 0
|
||||
CODE_PROBE(true, "read limit 0");
|
||||
return RangeResult();
|
||||
}
|
||||
// make sure orEqual == false
|
||||
|
@ -461,7 +461,7 @@ Future<RangeResult> SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw,
|
|||
end.removeOrEqual(end.arena());
|
||||
|
||||
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
|
||||
TEST(true); // range inverted
|
||||
CODE_PROBE(true, "range inverted");
|
||||
return RangeResult();
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbclient/BlobGranuleCommon.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
|
@ -1370,26 +1371,35 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID) {
|
|||
return KeyRangeRef(startKey, strinc(startKey));
|
||||
}
|
||||
|
||||
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength) {
|
||||
const Value blobGranuleFileValueFor(StringRef const& filename,
|
||||
int64_t offset,
|
||||
int64_t length,
|
||||
int64_t fullFileLength,
|
||||
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta) {
|
||||
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
|
||||
wr << filename;
|
||||
wr << offset;
|
||||
wr << length;
|
||||
wr << fullFileLength;
|
||||
wr << cipherKeysMeta;
|
||||
return wr.toValue();
|
||||
}
|
||||
|
||||
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value) {
|
||||
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t, Optional<BlobGranuleCipherKeysMeta>>
|
||||
decodeBlobGranuleFileValue(ValueRef const& value) {
|
||||
StringRef filename;
|
||||
int64_t offset;
|
||||
int64_t length;
|
||||
int64_t fullFileLength;
|
||||
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
|
||||
|
||||
BinaryReader reader(value, IncludeVersion());
|
||||
reader >> filename;
|
||||
reader >> offset;
|
||||
reader >> length;
|
||||
reader >> fullFileLength;
|
||||
return std::tuple(filename, offset, length, fullFileLength);
|
||||
reader >> cipherKeysMeta;
|
||||
return std::tuple(filename, offset, length, fullFileLength, cipherKeysMeta);
|
||||
}
|
||||
|
||||
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force) {
|
||||
|
@ -1620,6 +1630,13 @@ const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr;
|
|||
const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr;
|
||||
const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr;
|
||||
|
||||
const KeyRangeRef storageQuotaKeys(LiteralStringRef("\xff/storageQuota/"), LiteralStringRef("\xff/storageQuota0"));
|
||||
const KeyRef storageQuotaPrefix = storageQuotaKeys.begin;
|
||||
|
||||
Key storageQuotaKey(StringRef tenantName) {
|
||||
return tenantName.withPrefix(storageQuotaPrefix);
|
||||
}
|
||||
|
||||
// for tests
|
||||
void testSSISerdes(StorageServerInterface const& ssi) {
|
||||
printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\nacceptingRequests=%s\naddress=%s\ngetValue=%s\n\n\n",
|
||||
|
|
|
@ -199,7 +199,7 @@ public:
|
|||
// many other new tasks get added so that the timed out tasks never get chances to re-run
|
||||
if (deterministicRandom()->random01() < CLIENT_KNOBS->TASKBUCKET_CHECK_TIMEOUT_CHANCE) {
|
||||
bool anyTimeouts = wait(requeueTimedOutTasks(tr, taskBucket));
|
||||
TEST(anyTimeouts); // Found a task that timed out
|
||||
CODE_PROBE(anyTimeouts, "Found a task that timed out");
|
||||
}
|
||||
|
||||
state std::vector<Future<Optional<Key>>> taskKeyFutures(CLIENT_KNOBS->TASKBUCKET_MAX_PRIORITY + 1);
|
||||
|
@ -233,7 +233,7 @@ public:
|
|||
bool anyTimeouts = wait(requeueTimedOutTasks(tr, taskBucket));
|
||||
// If there were timeouts, try to get a task since there should now be one in one of the available spaces.
|
||||
if (anyTimeouts) {
|
||||
TEST(true); // Try to get one task from timeouts subspace
|
||||
CODE_PROBE(true, "Try to get one task from timeouts subspace");
|
||||
Reference<Task> task = wait(getOne(tr, taskBucket));
|
||||
return task;
|
||||
}
|
||||
|
@ -707,7 +707,7 @@ public:
|
|||
wait(delay(CLIENT_KNOBS->TASKBUCKET_CHECK_ACTIVE_DELAY));
|
||||
bool isActiveKey = wait(getActiveKey(tr, taskBucket, startingValue));
|
||||
if (isActiveKey) {
|
||||
TEST(true); // checkActive return true
|
||||
CODE_PROBE(true, "checkActive return true");
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
|
@ -717,7 +717,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
TEST(true); // checkActive return false
|
||||
CODE_PROBE(true, "checkActive return false");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -742,7 +742,7 @@ public:
|
|||
// Returns True if any tasks were affected.
|
||||
ACTOR static Future<bool> requeueTimedOutTasks(Reference<ReadYourWritesTransaction> tr,
|
||||
Reference<TaskBucket> taskBucket) {
|
||||
TEST(true); // Looks for tasks that have timed out and returns them to be available tasks.
|
||||
CODE_PROBE(true, "Looks for tasks that have timed out and returns them to be available tasks.");
|
||||
Version end = wait(tr->getReadVersion());
|
||||
state KeyRange range(
|
||||
KeyRangeRef(taskBucket->timeouts.get(0).range().begin, taskBucket->timeouts.get(end).range().end));
|
||||
|
@ -849,12 +849,12 @@ public:
|
|||
|
||||
// If we're updating the task params the clear the old space and write params to the new space
|
||||
if (updateParams) {
|
||||
TEST(true); // Extended a task while updating parameters
|
||||
CODE_PROBE(true, "Extended a task while updating parameters");
|
||||
for (auto& p : task->params) {
|
||||
tr->set(newTimeoutSpace.pack(p.key), p.value);
|
||||
}
|
||||
} else {
|
||||
TEST(true); // Extended a task without updating parameters
|
||||
CODE_PROBE(true, "Extended a task without updating parameters");
|
||||
// Otherwise, read and transplant the params from the old to new timeout spaces
|
||||
RangeResult params = wait(tr->getRange(oldTimeoutSpace.range(), CLIENT_KNOBS->TOO_MANY));
|
||||
for (auto& kv : params) {
|
||||
|
@ -1138,10 +1138,10 @@ public:
|
|||
bool is_set = wait(isSet(tr, taskFuture));
|
||||
|
||||
if (is_set) {
|
||||
TEST(true); // is_set == true
|
||||
CODE_PROBE(true, "is_set == true");
|
||||
wait(performAction(tr, taskBucket, taskFuture, task));
|
||||
} else {
|
||||
TEST(true); // is_set == false
|
||||
CODE_PROBE(true, "is_set == false");
|
||||
Subspace callbackSpace =
|
||||
taskFuture->callbacks.get(StringRef(deterministicRandom()->randomUniqueID().toString()));
|
||||
for (auto& v : task->params) {
|
||||
|
|
|
@ -567,7 +567,7 @@ void WriteMap::clearNoConflict(KeyRangeRef keys) {
|
|||
bool end_conflict = it.is_conflict_range();
|
||||
bool end_unreadable = it.is_unreadable();
|
||||
|
||||
TEST(it.is_conflict_range() != lastConflicted); // not last conflicted
|
||||
CODE_PROBE(it.is_conflict_range() != lastConflicted, "not last conflicted");
|
||||
|
||||
it.tree.clear();
|
||||
|
||||
|
|
|
@ -120,7 +120,7 @@ inline ValueRef doAppendIfFits(const Optional<ValueRef>& existingValueOptional,
|
|||
if (!otherOperand.size())
|
||||
return existingValue;
|
||||
if (existingValue.size() + otherOperand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) {
|
||||
TEST(true) // AppendIfFIts resulted in truncation
|
||||
CODE_PROBE(true, "AppendIfFIts resulted in truncation");
|
||||
return existingValue;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,11 +22,18 @@
|
|||
#define FDBCLIENT_BLOBGRANULECOMMON_H
|
||||
#pragma once
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "flow/EncryptUtils.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/serialize.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#define BG_ENCRYPT_COMPRESS_DEBUG false
|
||||
|
||||
// file format of actual blob files
|
||||
// FIXME: use VecSerStrategy::String serialization for this
|
||||
struct GranuleSnapshot : VectorRef<KeyValueRef> {
|
||||
|
@ -48,33 +55,165 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
|
|||
}
|
||||
};
|
||||
|
||||
struct BlobGranuleCipherKeysMeta {
|
||||
EncryptCipherDomainId textDomainId;
|
||||
EncryptCipherBaseKeyId textBaseCipherId;
|
||||
EncryptCipherRandomSalt textSalt;
|
||||
EncryptCipherDomainId headerDomainId;
|
||||
EncryptCipherBaseKeyId headerBaseCipherId;
|
||||
EncryptCipherRandomSalt headerSalt;
|
||||
std::string ivStr;
|
||||
|
||||
BlobGranuleCipherKeysMeta() {}
|
||||
BlobGranuleCipherKeysMeta(const EncryptCipherDomainId tDomainId,
|
||||
const EncryptCipherBaseKeyId tBaseCipherId,
|
||||
const EncryptCipherRandomSalt tSalt,
|
||||
const EncryptCipherDomainId hDomainId,
|
||||
const EncryptCipherBaseKeyId hBaseCipherId,
|
||||
const EncryptCipherRandomSalt hSalt,
|
||||
const std::string& iv)
|
||||
: textDomainId(tDomainId), textBaseCipherId(tBaseCipherId), textSalt(tSalt), headerDomainId(hDomainId),
|
||||
headerBaseCipherId(hBaseCipherId), headerSalt(hSalt), ivStr(iv) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, textDomainId, textBaseCipherId, textSalt, headerDomainId, headerBaseCipherId, headerSalt, ivStr);
|
||||
}
|
||||
};
|
||||
|
||||
struct BlobGranuleCipherKey {
|
||||
constexpr static FileIdentifier file_identifier = 7274734;
|
||||
EncryptCipherDomainId encryptDomainId;
|
||||
EncryptCipherBaseKeyId baseCipherId;
|
||||
EncryptCipherRandomSalt salt;
|
||||
StringRef baseCipher;
|
||||
|
||||
static BlobGranuleCipherKey fromBlobCipherKey(Reference<BlobCipherKey> keyRef, Arena& arena) {
|
||||
BlobGranuleCipherKey cipherKey;
|
||||
cipherKey.encryptDomainId = keyRef->getDomainId();
|
||||
cipherKey.baseCipherId = keyRef->getBaseCipherId();
|
||||
cipherKey.salt = keyRef->getSalt();
|
||||
cipherKey.baseCipher = makeString(keyRef->getBaseCipherLen(), arena);
|
||||
memcpy(mutateString(cipherKey.baseCipher), keyRef->rawBaseCipher(), keyRef->getBaseCipherLen());
|
||||
|
||||
return cipherKey;
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, encryptDomainId, baseCipherId, salt, baseCipher);
|
||||
}
|
||||
};
|
||||
|
||||
struct BlobGranuleCipherKeysCtx {
|
||||
constexpr static FileIdentifier file_identifier = 1278718;
|
||||
BlobGranuleCipherKey textCipherKey;
|
||||
BlobGranuleCipherKey headerCipherKey;
|
||||
StringRef ivRef;
|
||||
|
||||
static BlobGranuleCipherKeysMeta toCipherKeysMeta(const BlobGranuleCipherKeysCtx& ctx) {
|
||||
return BlobGranuleCipherKeysMeta(ctx.textCipherKey.encryptDomainId,
|
||||
ctx.textCipherKey.baseCipherId,
|
||||
ctx.textCipherKey.salt,
|
||||
ctx.headerCipherKey.encryptDomainId,
|
||||
ctx.headerCipherKey.baseCipherId,
|
||||
ctx.headerCipherKey.salt,
|
||||
ctx.ivRef.toString());
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, textCipherKey, headerCipherKey, ivRef);
|
||||
}
|
||||
};
|
||||
|
||||
struct BlobGranuleFileEncryptionKeys {
|
||||
Reference<BlobCipherKey> textCipherKey;
|
||||
Reference<BlobCipherKey> headerCipherKey;
|
||||
};
|
||||
|
||||
struct BlobGranuleCipherKeysMetaRef {
|
||||
EncryptCipherDomainId textDomainId;
|
||||
EncryptCipherBaseKeyId textBaseCipherId;
|
||||
EncryptCipherRandomSalt textSalt;
|
||||
EncryptCipherDomainId headerDomainId;
|
||||
EncryptCipherBaseKeyId headerBaseCipherId;
|
||||
EncryptCipherRandomSalt headerSalt;
|
||||
StringRef ivRef;
|
||||
|
||||
BlobGranuleCipherKeysMetaRef() {}
|
||||
BlobGranuleCipherKeysMetaRef(Arena& to,
|
||||
const EncryptCipherDomainId tDomainId,
|
||||
const EncryptCipherBaseKeyId tBaseCipherId,
|
||||
const EncryptCipherRandomSalt tSalt,
|
||||
const EncryptCipherDomainId hDomainId,
|
||||
const EncryptCipherBaseKeyId hBaseCipherId,
|
||||
const EncryptCipherRandomSalt hSalt,
|
||||
const std::string& ivStr)
|
||||
: textDomainId(tDomainId), textBaseCipherId(tBaseCipherId), textSalt(tSalt), headerDomainId(hDomainId),
|
||||
headerBaseCipherId(hBaseCipherId), headerSalt(hSalt), ivRef(StringRef(to, ivStr)) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, textDomainId, textBaseCipherId, textSalt, headerDomainId, headerBaseCipherId, headerSalt, ivRef);
|
||||
}
|
||||
};
|
||||
|
||||
struct BlobFilePointerRef {
|
||||
constexpr static FileIdentifier file_identifier = 5253554;
|
||||
StringRef filename;
|
||||
int64_t offset;
|
||||
int64_t length;
|
||||
int64_t fullFileLength;
|
||||
Optional<BlobGranuleCipherKeysMetaRef> cipherKeysMetaRef;
|
||||
|
||||
BlobFilePointerRef() {}
|
||||
BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length, int64_t fullFileLength)
|
||||
: filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {}
|
||||
|
||||
BlobFilePointerRef(Arena& to,
|
||||
const std::string& filename,
|
||||
int64_t offset,
|
||||
int64_t length,
|
||||
int64_t fullFileLength,
|
||||
Optional<BlobGranuleCipherKeysMeta> ciphKeysMeta)
|
||||
: filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {
|
||||
if (ciphKeysMeta.present()) {
|
||||
cipherKeysMetaRef = BlobGranuleCipherKeysMetaRef(to,
|
||||
ciphKeysMeta.get().textDomainId,
|
||||
ciphKeysMeta.get().textBaseCipherId,
|
||||
ciphKeysMeta.get().textSalt,
|
||||
ciphKeysMeta.get().headerDomainId,
|
||||
ciphKeysMeta.get().headerBaseCipherId,
|
||||
ciphKeysMeta.get().headerSalt,
|
||||
ciphKeysMeta.get().ivStr);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, filename, offset, length, fullFileLength);
|
||||
serializer(ar, filename, offset, length, fullFileLength, cipherKeysMetaRef);
|
||||
}
|
||||
|
||||
std::string toString() const {
|
||||
std::stringstream ss;
|
||||
ss << filename.toString() << ":" << offset << ":" << length << ":" << fullFileLength;
|
||||
if (cipherKeysMetaRef.present()) {
|
||||
ss << ":CipherKeysMeta:TextCipher:" << cipherKeysMetaRef.get().textDomainId << ":"
|
||||
<< cipherKeysMetaRef.get().textBaseCipherId << ":" << cipherKeysMetaRef.get().textSalt
|
||||
<< ":HeaderCipher:" << cipherKeysMetaRef.get().headerDomainId << ":"
|
||||
<< cipherKeysMetaRef.get().headerBaseCipherId << ":" << cipherKeysMetaRef.get().headerSalt;
|
||||
}
|
||||
return std::move(ss).str();
|
||||
}
|
||||
};
|
||||
|
||||
// the assumption of this response is that the client will deserialize the files and apply the mutations themselves
|
||||
// TODO could filter out delta files that don't intersect the key range being requested?
|
||||
// TODO since client request passes version, we don't need to include the version of each mutation in the response if we
|
||||
// pruned it there
|
||||
// the assumption of this response is that the client will deserialize the files
|
||||
// and apply the mutations themselves
|
||||
// TODO could filter out delta files that don't intersect the key range being
|
||||
// requested?
|
||||
// TODO since client request passes version, we don't need to include the
|
||||
// version of each mutation in the response if we pruned it there
|
||||
struct BlobGranuleChunkRef {
|
||||
constexpr static FileIdentifier file_identifier = 865198;
|
||||
KeyRangeRef keyRange;
|
||||
|
@ -84,10 +223,19 @@ struct BlobGranuleChunkRef {
|
|||
VectorRef<BlobFilePointerRef> deltaFiles;
|
||||
GranuleDeltas newDeltas;
|
||||
Optional<KeyRef> tenantPrefix;
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx;
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, keyRange, includedVersion, snapshotVersion, snapshotFile, deltaFiles, newDeltas, tenantPrefix);
|
||||
serializer(ar,
|
||||
keyRange,
|
||||
includedVersion,
|
||||
snapshotVersion,
|
||||
snapshotFile,
|
||||
deltaFiles,
|
||||
newDeltas,
|
||||
tenantPrefix,
|
||||
cipherKeysCtx);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -24,8 +24,12 @@
|
|||
// This file contains functions for readers who want to materialize blob granules from the underlying files
|
||||
|
||||
#include "fdbclient/BlobGranuleCommon.h"
|
||||
#include "flow/CompressionUtils.h"
|
||||
|
||||
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot, int chunks);
|
||||
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot,
|
||||
int chunks,
|
||||
Optional<CompressionFilter> compressFilter,
|
||||
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = Optional<BlobGranuleCipherKeysCtx>());
|
||||
|
||||
// FIXME: support sorted and chunked delta files
|
||||
|
||||
|
|
|
@ -159,5 +159,9 @@ bool schemaMatch(json_spirit::mValue const& schema,
|
|||
// storage nodes
|
||||
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
|
||||
|
||||
// Set and get the storage quota per tenant
|
||||
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota);
|
||||
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
|
||||
|
||||
#include "flow/unactorcompiler.h"
|
||||
#endif
|
||||
|
|
|
@ -39,4 +39,6 @@ public:
|
|||
T const& operator*() const { return *impl; }
|
||||
T* operator->() { return impl.get(); }
|
||||
T const* operator->() const { return impl.get(); }
|
||||
T* get() { return impl.get(); }
|
||||
T const* get() const { return impl.get(); }
|
||||
};
|
||||
|
|
|
@ -863,6 +863,11 @@ public:
|
|||
int SIM_KMS_MAX_KEYS;
|
||||
int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH;
|
||||
bool ENABLE_TLOG_ENCRYPTION;
|
||||
bool ENABLE_BLOB_GRANULE_ENCRYPTION;
|
||||
|
||||
// Compression
|
||||
bool ENABLE_BLOB_GRANULE_COMPRESSION;
|
||||
std::string BLOB_GRANULE_COMPRESSION_FILTER;
|
||||
|
||||
// Key Management Service (KMS) Connector
|
||||
std::string KMS_CONNECTOR_TYPE;
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
// Functions and constants documenting the organization of the reserved keyspace in the database beginning with "\xFF"
|
||||
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/BlobGranuleCommon.h"
|
||||
#include "fdbclient/BlobWorkerInterface.h" // TODO move the functions that depend on this out of here and into BlobWorkerInterface.h to remove this depdendency
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
#include "Tenant.h"
|
||||
|
@ -624,8 +625,14 @@ const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t file
|
|||
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key);
|
||||
const KeyRange blobGranuleFileKeyRangeFor(UID granuleID);
|
||||
|
||||
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength);
|
||||
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
|
||||
const Value blobGranuleFileValueFor(
|
||||
StringRef const& filename,
|
||||
int64_t offset,
|
||||
int64_t length,
|
||||
int64_t fullFileLength,
|
||||
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta = Optional<BlobGranuleCipherKeysMeta>());
|
||||
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t, Optional<BlobGranuleCipherKeysMeta>>
|
||||
decodeBlobGranuleFileValue(ValueRef const& value);
|
||||
|
||||
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force);
|
||||
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value);
|
||||
|
@ -679,6 +686,12 @@ extern const KeyRef tenantMapPrivatePrefix;
|
|||
extern const KeyRef tenantLastIdKey;
|
||||
extern const KeyRef tenantDataPrefixKey;
|
||||
|
||||
// Storage quota per tenant
|
||||
// "\xff/storageQuota/[[tenantName]]" := "[[quota]]"
|
||||
extern const KeyRangeRef storageQuotaKeys;
|
||||
extern const KeyRef storageQuotaPrefix;
|
||||
Key storageQuotaKey(StringRef tenantName);
|
||||
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
#endif
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#pragma once
|
||||
#include "flow/IRandom.h"
|
||||
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H)
|
||||
#define FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H
|
||||
#include "fdbclient/TenantManagement.actor.g.h"
|
||||
|
@ -135,6 +136,17 @@ Future<std::pair<TenantMapEntry, bool>> createTenantTransaction(Transaction tr,
|
|||
return std::make_pair(newTenant, true);
|
||||
}
|
||||
|
||||
ACTOR template <class Transaction>
|
||||
Future<int64_t> getNextTenantId(Transaction tr) {
|
||||
state typename transaction_future_type<Transaction, Optional<Value>>::type lastIdFuture = tr->get(tenantLastIdKey);
|
||||
Optional<Value> lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture));
|
||||
int64_t tenantId = lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0;
|
||||
if (BUGGIFY) {
|
||||
tenantId += deterministicRandom()->randomSkewedUInt32(1, 1e9);
|
||||
}
|
||||
return tenantId;
|
||||
}
|
||||
|
||||
ACTOR template <class DB>
|
||||
Future<TenantMapEntry> createTenant(Reference<DB> db, TenantName name) {
|
||||
state Reference<typename DB::TransactionT> tr = db->createTransaction();
|
||||
|
@ -144,7 +156,8 @@ Future<TenantMapEntry> createTenant(Reference<DB> db, TenantName name) {
|
|||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
state typename DB::TransactionT::template FutureT<Optional<Value>> lastIdFuture = tr->get(tenantLastIdKey);
|
||||
|
||||
state Future<int64_t> tenantIdFuture = getNextTenantId(tr);
|
||||
|
||||
if (firstTry) {
|
||||
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
|
||||
|
@ -155,8 +168,7 @@ Future<TenantMapEntry> createTenant(Reference<DB> db, TenantName name) {
|
|||
firstTry = false;
|
||||
}
|
||||
|
||||
Optional<Value> lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture));
|
||||
int64_t tenantId = lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0;
|
||||
int64_t tenantId = wait(tenantIdFuture);
|
||||
tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(tenantId));
|
||||
state std::pair<TenantMapEntry, bool> newTenant = wait(createTenantTransaction(tr, name, tenantId));
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#include "fdbclient/DatabaseContext.h"
|
||||
#include "fdbclient/SpecialKeySpace.actor.h"
|
||||
#include "fdbclient/TenantManagement.actor.h"
|
||||
#include "fdbclient/libb64/encode.h"
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
@ -76,7 +77,19 @@ private:
|
|||
for (auto tenant : tenants) {
|
||||
json_spirit::mObject tenantEntry;
|
||||
tenantEntry["id"] = tenant.second.id;
|
||||
tenantEntry["prefix"] = tenant.second.prefix.toString();
|
||||
if (ryw->getDatabase()->apiVersionAtLeast(720)) {
|
||||
json_spirit::mObject prefixObject;
|
||||
std::string encodedPrefix = base64::encoder::from_string(tenant.second.prefix.toString());
|
||||
// Remove trailing newline
|
||||
encodedPrefix.resize(encodedPrefix.size() - 1);
|
||||
|
||||
prefixObject["base64"] = encodedPrefix;
|
||||
prefixObject["printable"] = printable(tenant.second.prefix);
|
||||
tenantEntry["prefix"] = prefixObject;
|
||||
} else {
|
||||
// This is not a standard encoding in JSON, and some libraries may not be able to easily decode it
|
||||
tenantEntry["prefix"] = tenant.second.prefix.toString();
|
||||
}
|
||||
std::string tenantEntryString = json_spirit::write_string(json_spirit::mValue(tenantEntry));
|
||||
ValueRef tenantEntryBytes(results->arena(), tenantEntryString);
|
||||
results->push_back(results->arena(),
|
||||
|
@ -108,16 +121,16 @@ private:
|
|||
}
|
||||
|
||||
ACTOR static Future<Void> createTenants(ReadYourWritesTransaction* ryw, std::vector<TenantNameRef> tenants) {
|
||||
Optional<Value> lastIdVal = wait(ryw->getTransaction().get(tenantLastIdKey));
|
||||
int64_t previousId = lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) : -1;
|
||||
int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction()));
|
||||
int64_t nextId = _nextId;
|
||||
|
||||
std::vector<Future<Void>> createFutures;
|
||||
for (auto tenant : tenants) {
|
||||
createFutures.push_back(
|
||||
success(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenant, ++previousId)));
|
||||
success(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenant, nextId++)));
|
||||
}
|
||||
|
||||
ryw->getTransaction().set(tenantLastIdKey, TenantMapEntry::idToPrefix(previousId));
|
||||
ryw->getTransaction().set(tenantLastIdKey, TenantMapEntry::idToPrefix(nextId - 1));
|
||||
wait(waitForAll(createFutures));
|
||||
return Void();
|
||||
}
|
||||
|
|
|
@ -48,6 +48,14 @@ struct decoder {
|
|||
delete[] code;
|
||||
delete[] plaintext;
|
||||
}
|
||||
|
||||
static std::string from_string(std::string s) {
|
||||
std::stringstream in(s);
|
||||
std::stringstream out;
|
||||
decoder dec;
|
||||
dec.decode(in, out);
|
||||
return out.str();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace base64
|
||||
|
|
|
@ -193,14 +193,14 @@ Future<Void> AsyncFileCached::changeFileSize(int64_t size) {
|
|||
prevLength = size;
|
||||
|
||||
if (offsetInPage) {
|
||||
TEST(true); // Truncating to the middle of a page
|
||||
CODE_PROBE(true, "Truncating to the middle of a page");
|
||||
auto p = pages.find(pageOffset);
|
||||
if (p != pages.end()) {
|
||||
auto f = p->second->flush();
|
||||
if (!f.isReady() || f.isError())
|
||||
actors.push_back(f);
|
||||
} else {
|
||||
TEST(true); // Truncating to the middle of a page that isn't in cache
|
||||
CODE_PROBE(true, "Truncating to the middle of a page that isn't in cache");
|
||||
}
|
||||
|
||||
pageOffset += pageCache->pageSize;
|
||||
|
|
|
@ -25,7 +25,7 @@ add_flow_target(STATIC_LIBRARY NAME fdbrpc_sampling
|
|||
SRCS ${FDBRPC_SRCS}
|
||||
DISABLE_ACTOR_DIAGNOSTICS ${FDBRPC_SRCS_DISABLE_ACTOR_DIAGNOSTICS})
|
||||
|
||||
add_flow_target(LINK_TEST NAME fdbrpclinktest SRCS ${FDBRPC_SRCS} LinkTest.cpp DISABLE_ACTOR_DIAGNOSTICS ${FDBRPC_SRCS_DISABLE_ACTOR_DIAGNOSTICS})
|
||||
add_flow_target(LINK_TEST NAME fdbrpclinktest SRCS LinkTest.cpp)
|
||||
target_link_libraries(fdbrpclinktest PRIVATE fdbrpc rapidjson)
|
||||
target_include_directories(fdbrpclinktest PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio)
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ TEST_CASE("/flow/buggifiedDelay") {
|
|||
});
|
||||
wait(f1 && f2);
|
||||
if (last == 1) {
|
||||
TEST(true); // Delays can become ready out of order
|
||||
CODE_PROBE(true, "Delays can become ready out of order");
|
||||
return Void();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -615,8 +615,8 @@ ACTOR Future<Void> connectionWriter(Reference<Peer> self, Reference<IConnection>
|
|||
break;
|
||||
}
|
||||
|
||||
TEST(true); // We didn't write everything, so apparently the write buffer is full. Wait for it to be
|
||||
// nonfull.
|
||||
CODE_PROBE(
|
||||
true, "We didn't write everything, so apparently the write buffer is full. Wait for it to be nonfull");
|
||||
wait(conn->onWritable());
|
||||
wait(yield(TaskPriority::WriteSocket));
|
||||
}
|
||||
|
@ -1462,7 +1462,7 @@ ACTOR static Future<Void> connectionIncoming(TransportData* self, Reference<ICon
|
|||
}
|
||||
when(Reference<Peer> p = wait(onConnected.getFuture())) { p->onIncomingConnection(p, conn, reader); }
|
||||
when(wait(delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT))) {
|
||||
TEST(true); // Incoming connection timed out
|
||||
CODE_PROBE(true, "Incoming connection timed out");
|
||||
throw timed_out();
|
||||
}
|
||||
}
|
||||
|
@ -1703,7 +1703,7 @@ void FlowTransport::addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageRecei
|
|||
}
|
||||
|
||||
static void sendLocal(TransportData* self, ISerializeSource const& what, const Endpoint& destination) {
|
||||
TEST(true); // "Loopback" delivery
|
||||
CODE_PROBE(true, "\"Loopback\" delivery");
|
||||
// SOMEDAY: Would it be better to avoid (de)serialization by doing this check in flow?
|
||||
|
||||
Standalone<StringRef> copy;
|
||||
|
@ -1742,7 +1742,7 @@ static ReliablePacket* sendPacket(TransportData* self,
|
|||
// If there isn't an open connection, a public address, or the peer isn't compatible, we can't send
|
||||
if (!peer || (peer->outgoingConnectionIdle && !destination.getPrimaryAddress().isPublic()) ||
|
||||
(!peer->compatible && destination.token != Endpoint::wellKnownToken(WLTOKEN_PING_PACKET))) {
|
||||
TEST(true); // Can't send to private address without a compatible open connection
|
||||
CODE_PROBE(true, "Can't send to private address without a compatible open connection");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ public:
|
|||
static Future<Void> deleteFile(std::string filename, bool mustBeDurable) {
|
||||
::deleteFile(filename);
|
||||
if (mustBeDurable) {
|
||||
TEST(true); // deleteFile and fsync parent dir
|
||||
CODE_PROBE(true, "deleteFile and fsync parent dir");
|
||||
return async_fsync_parent(filename);
|
||||
} else
|
||||
return Void();
|
||||
|
|
|
@ -360,7 +360,7 @@ public:
|
|||
//(e.g. to simulate power failure)
|
||||
Future<Void> kill() {
|
||||
TraceEvent("AsyncFileNonDurable_Kill", id).detail("Filename", filename);
|
||||
TEST(true); // AsyncFileNonDurable was killed
|
||||
CODE_PROBE(true, "AsyncFileNonDurable was killed");
|
||||
return sync(this, false);
|
||||
}
|
||||
|
||||
|
@ -404,7 +404,7 @@ private:
|
|||
TraceEvent("AsyncFileNonDurable_KilledFileOperation", self->id)
|
||||
.detail("In", context)
|
||||
.detail("Filename", self->filename);
|
||||
TEST(true); // AsyncFileNonDurable operation killed
|
||||
CODE_PROBE(true, "AsyncFileNonDurable operation killed");
|
||||
throw io_error().asInjectedFault();
|
||||
}
|
||||
|
||||
|
@ -603,13 +603,13 @@ private:
|
|||
.detail("HasGarbage", garbage)
|
||||
.detail("Side", side)
|
||||
.detail("Filename", self->filename);
|
||||
TEST(true); // AsyncFileNonDurable bad write
|
||||
CODE_PROBE(true, "AsyncFileNonDurable bad write");
|
||||
} else {
|
||||
TraceEvent("AsyncFileNonDurable_DroppedWrite", self->id)
|
||||
.detail("Offset", offset + writeOffset + pageOffset)
|
||||
.detail("Length", sectorLength)
|
||||
.detail("Filename", self->filename);
|
||||
TEST(true); // AsyncFileNonDurable dropped write
|
||||
CODE_PROBE(true, "AsyncFileNonDurable dropped write");
|
||||
}
|
||||
|
||||
pageOffset += sectorLength;
|
||||
|
@ -689,7 +689,7 @@ private:
|
|||
wait(self->file->truncate(size));
|
||||
else {
|
||||
TraceEvent("AsyncFileNonDurable_DroppedTruncate", self->id).detail("Size", size);
|
||||
TEST(true); // AsyncFileNonDurable dropped truncate
|
||||
CODE_PROBE(true, "AsyncFileNonDurable dropped truncate");
|
||||
}
|
||||
|
||||
return Void();
|
||||
|
@ -753,7 +753,7 @@ private:
|
|||
// temporary file and then renamed to the correct location once sync is called. By not calling sync, we
|
||||
// simulate a failure to fsync the directory storing the file
|
||||
if (self->hasBeenSynced && writeDurable && deterministicRandom()->random01() < 0.5) {
|
||||
TEST(true); // AsyncFileNonDurable kill was durable and synced
|
||||
CODE_PROBE(true, "AsyncFileNonDurable kill was durable and synced");
|
||||
wait(success(errorOr(self->file->sync())));
|
||||
}
|
||||
|
||||
|
|
|
@ -140,7 +140,7 @@ Future<Void> tssComparison(Req req,
|
|||
tssData.metrics->recordLatency(req, srcEndTime - startTime, tssEndTime - startTime);
|
||||
|
||||
if (!TSS_doCompare(src.get(), tss.get().get())) {
|
||||
TEST(true); // TSS Mismatch
|
||||
CODE_PROBE(true, "TSS Mismatch");
|
||||
state TraceEvent mismatchEvent(
|
||||
(g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations)
|
||||
? SevWarnAlways
|
||||
|
@ -150,7 +150,7 @@ Future<Void> tssComparison(Req req,
|
|||
mismatchEvent.detail("TSSID", tssData.tssId);
|
||||
|
||||
if (FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_VERIFY_SS && ssTeam->size() > 1) {
|
||||
TEST(true); // checking TSS mismatch against rest of storage team
|
||||
CODE_PROBE(true, "checking TSS mismatch against rest of storage team");
|
||||
|
||||
// if there is more than 1 SS in the team, attempt to verify that the other SS servers have the same
|
||||
// data
|
||||
|
@ -195,9 +195,9 @@ Future<Void> tssComparison(Req req,
|
|||
if (tssData.metrics->shouldRecordDetailedMismatch()) {
|
||||
TSS_traceMismatch(mismatchEvent, req, src.get(), tss.get().get());
|
||||
|
||||
TEST(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Full TSS Mismatch
|
||||
TEST(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Partial TSS Mismatch and storing
|
||||
// the rest in FDB
|
||||
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, "Tracing Full TSS Mismatch");
|
||||
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
|
||||
"Tracing Partial TSS Mismatch and storing the rest in FDB");
|
||||
|
||||
if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) {
|
||||
mismatchEvent.disable();
|
||||
|
@ -268,7 +268,7 @@ struct RequestData : NonCopyable {
|
|||
Optional<TSSEndpointData> tssData = model->getTssData(stream->getEndpoint().token.first());
|
||||
|
||||
if (tssData.present()) {
|
||||
TEST(true); // duplicating request to TSS
|
||||
CODE_PROBE(true, "duplicating request to TSS");
|
||||
resetReply(request);
|
||||
// FIXME: optimize to avoid creating new netNotifiedQueue for each message
|
||||
RequestStream<Request, P> tssRequestStream(tssData.get().endpoint);
|
||||
|
|
|
@ -47,7 +47,7 @@ Future<REPLY_TYPE(Req)> retryBrokenPromise(RequestStream<Req, P> to, Req request
|
|||
throw;
|
||||
resetReply(request);
|
||||
wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
|
||||
TEST(true); // retryBrokenPromise
|
||||
CODE_PROBE(true, "retryBrokenPromise");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ Future<REPLY_TYPE(Req)> retryBrokenPromise(RequestStream<Req, P> to, Req request
|
|||
throw;
|
||||
resetReply(request);
|
||||
wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, taskID));
|
||||
TEST(true); // retryBrokenPromise with taskID
|
||||
CODE_PROBE(true, "retryBrokenPromise with taskID");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,10 +75,19 @@ bool simulator_should_inject_fault(const char* context, const char* file, int li
|
|||
uint32_t h1 = line + (p->fault_injection_r >> 32);
|
||||
|
||||
if (h1 < p->fault_injection_p1 * std::numeric_limits<uint32_t>::max()) {
|
||||
TEST(true); // A fault was injected
|
||||
TEST(error_code == error_code_io_timeout); // An io timeout was injected
|
||||
TEST(error_code == error_code_io_error); // An io error was injected
|
||||
TEST(error_code == error_code_platform_error); // A platform error was injected.
|
||||
CODE_PROBE(true, "A fault was injected", probe::assert::simOnly, probe::context::sim2);
|
||||
CODE_PROBE(error_code == error_code_io_timeout,
|
||||
"An io timeout was injected",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
CODE_PROBE(error_code == error_code_io_error,
|
||||
"An io error was injected",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
CODE_PROBE(error_code == error_code_platform_error,
|
||||
"A platform error was injected.",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
TraceEvent(SevWarn, "FaultInjected")
|
||||
.detail("Context", context)
|
||||
.detail("File", file)
|
||||
|
@ -426,7 +435,7 @@ private:
|
|||
deterministicRandom()->random01() < .00001) {
|
||||
g_simulator.lastConnectionFailure = now();
|
||||
double a = deterministicRandom()->random01(), b = deterministicRandom()->random01();
|
||||
TEST(true); // Simulated connection failure
|
||||
CODE_PROBE(true, "Simulated connection failure", probe::context::sim2, probe::assert::simOnly);
|
||||
TraceEvent("ConnectionFailure", dbgid)
|
||||
.detail("MyAddr", process->address)
|
||||
.detail("PeerAddr", peerProcess->address)
|
||||
|
@ -1178,7 +1187,7 @@ public:
|
|||
auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false);
|
||||
ASSERT(f.isReady());
|
||||
wait(::delay(0.05 * deterministicRandom()->random01()));
|
||||
TEST(true); // Simulated durable delete
|
||||
CODE_PROBE(true, "Simulated durable delete", probe::context::sim2, probe::assert::simOnly);
|
||||
}
|
||||
wait(g_simulator.onProcess(currentProcess, currentTaskID));
|
||||
return Void();
|
||||
|
@ -1191,7 +1200,7 @@ public:
|
|||
TraceEvent(SevDebug, "Sim2DeleteFileImplNonDurable")
|
||||
.detail("Filename", filename)
|
||||
.detail("Durable", mustBeDurable);
|
||||
TEST(true); // Simulated non-durable delete
|
||||
CODE_PROBE(true, "Simulated non-durable delete", probe::context::sim2, probe::assert::simOnly);
|
||||
return Void();
|
||||
}
|
||||
}
|
||||
|
@ -1587,10 +1596,20 @@ public:
|
|||
killProcess_internal(p, KillInstantly);
|
||||
}
|
||||
void killProcess_internal(ProcessInfo* machine, KillType kt) {
|
||||
TEST(true); // Simulated machine was killed with any kill type
|
||||
TEST(kt == KillInstantly); // Simulated machine was killed instantly
|
||||
TEST(kt == InjectFaults); // Simulated machine was killed with faults
|
||||
TEST(kt == FailDisk); // Simulated machine was killed with a failed disk
|
||||
CODE_PROBE(
|
||||
true, "Simulated machine was killed with any kill type", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE(kt == KillInstantly,
|
||||
"Simulated machine was killed instantly",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE(kt == InjectFaults,
|
||||
"Simulated machine was killed with faults",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE(kt == FailDisk,
|
||||
"Simulated machine was killed with a failed disk",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
|
||||
if (kt == KillInstantly) {
|
||||
TraceEvent(SevWarn, "FailMachine")
|
||||
|
@ -1715,9 +1734,10 @@ public:
|
|||
KillType* ktFinal) override {
|
||||
auto ktOrig = kt;
|
||||
|
||||
TEST(true); // Trying to killing a machine
|
||||
TEST(kt == KillInstantly); // Trying to kill instantly
|
||||
TEST(kt == InjectFaults); // Trying to kill by injecting faults
|
||||
CODE_PROBE(true, "Trying to killing a machine", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE(kt == KillInstantly, "Trying to kill instantly", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE(
|
||||
kt == InjectFaults, "Trying to kill by injecting faults", probe::context::sim2, probe::assert::simOnly);
|
||||
|
||||
if (speedUpSimulation && !forceKill) {
|
||||
TraceEvent(SevWarn, "AbortedKill")
|
||||
|
@ -1851,11 +1871,17 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
TEST(originalKt != kt); // Kill type was changed from requested to reboot.
|
||||
CODE_PROBE(originalKt != kt,
|
||||
"Kill type was changed from requested to reboot.",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
|
||||
// Check if any processes on machine are rebooting
|
||||
if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) {
|
||||
TEST(true); // Attempted reboot, but the target did not have all of its processes running
|
||||
CODE_PROBE(true,
|
||||
"Attempted reboot, but the target did not have all of its processes running",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
TraceEvent(SevWarn, "AbortedKill")
|
||||
.detail("KillType", kt)
|
||||
.detail("MachineId", machineId)
|
||||
|
@ -1870,7 +1896,10 @@ public:
|
|||
|
||||
// Check if any processes on machine are rebooting
|
||||
if (processesOnMachine != processesPerMachine) {
|
||||
TEST(true); // Attempted reboot and kill, but the target did not have all of its processes running
|
||||
CODE_PROBE(true,
|
||||
"Attempted reboot and kill, but the target did not have all of its processes running",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
TraceEvent(SevWarn, "AbortedKill")
|
||||
.detail("KillType", kt)
|
||||
.detail("MachineId", machineId)
|
||||
|
@ -1920,10 +1949,12 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
TEST(kt == RebootAndDelete); // Resulted in a reboot and delete
|
||||
TEST(kt == Reboot); // Resulted in a reboot
|
||||
TEST(kt == KillInstantly); // Resulted in an instant kill
|
||||
TEST(kt == InjectFaults); // Resulted in a kill by injecting faults
|
||||
CODE_PROBE(
|
||||
kt == RebootAndDelete, "Resulted in a reboot and delete", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE(kt == Reboot, "Resulted in a reboot", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE(kt == KillInstantly, "Resulted in an instant kill", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE(
|
||||
kt == InjectFaults, "Resulted in a kill by injecting faults", probe::context::sim2, probe::assert::simOnly);
|
||||
|
||||
if (ktFinal)
|
||||
*ktFinal = kt;
|
||||
|
@ -2037,13 +2068,32 @@ public:
|
|||
.detail("KillTypeMin", ktMin)
|
||||
.detail("KilledDC", kt == ktMin);
|
||||
|
||||
TEST(kt != ktMin); // DataCenter kill was rejected by killMachine
|
||||
TEST((kt == ktMin) && (kt == RebootAndDelete)); // Datacenter kill Resulted in a reboot and delete
|
||||
TEST((kt == ktMin) && (kt == Reboot)); // Datacenter kill Resulted in a reboot
|
||||
TEST((kt == ktMin) && (kt == KillInstantly)); // Datacenter kill Resulted in an instant kill
|
||||
TEST((kt == ktMin) && (kt == InjectFaults)); // Datacenter kill Resulted in a kill by injecting faults
|
||||
TEST((kt == ktMin) && (kt != ktOrig)); // Datacenter Kill request was downgraded
|
||||
TEST((kt == ktMin) && (kt == ktOrig)); // Datacenter kill - Requested kill was done
|
||||
CODE_PROBE(
|
||||
kt != ktMin, "DataCenter kill was rejected by killMachine", probe::context::sim2, probe::assert::simOnly);
|
||||
CODE_PROBE((kt == ktMin) && (kt == RebootAndDelete),
|
||||
"Datacenter kill Resulted in a reboot and delete",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE((kt == ktMin) && (kt == Reboot),
|
||||
"Datacenter kill Resulted in a reboot",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE((kt == ktMin) && (kt == KillInstantly),
|
||||
"Datacenter kill Resulted in an instant kill",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE((kt == ktMin) && (kt == InjectFaults),
|
||||
"Datacenter kill Resulted in a kill by injecting faults",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE((kt == ktMin) && (kt != ktOrig),
|
||||
"Datacenter Kill request was downgraded",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
CODE_PROBE((kt == ktMin) && (kt == ktOrig),
|
||||
"Datacenter kill - Requested kill was done",
|
||||
probe::context::sim2,
|
||||
probe::assert::simOnly);
|
||||
|
||||
if (ktFinal)
|
||||
*ktFinal = ktMin;
|
||||
|
@ -2276,7 +2326,7 @@ class UDPSimSocket : public IUDPSocket, ReferenceCounted<UDPSimSocket> {
|
|||
NetworkAddress _localAddress;
|
||||
bool randomDropPacket() {
|
||||
auto res = deterministicRandom()->random01() < .000001;
|
||||
TEST(res); // UDP packet drop
|
||||
CODE_PROBE(res, "UDP packet drop", probe::context::sim2, probe::assert::simOnly);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -2485,12 +2535,20 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
|
|||
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
|
||||
kt == ISimulator::RebootProcessAndDelete);
|
||||
|
||||
TEST(kt == ISimulator::RebootProcess); // Simulated process rebooted
|
||||
TEST(kt == ISimulator::Reboot); // Simulated machine rebooted
|
||||
TEST(kt == ISimulator::RebootAndDelete); // Simulated machine rebooted with data and coordination state deletion
|
||||
TEST(
|
||||
kt ==
|
||||
ISimulator::RebootProcessAndDelete); // Simulated process rebooted with data and coordination state deletion
|
||||
CODE_PROBE(kt == ISimulator::RebootProcess,
|
||||
"Simulated process rebooted",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
CODE_PROBE(
|
||||
kt == ISimulator::Reboot, "Simulated machine rebooted", probe::assert::simOnly, probe::context::sim2);
|
||||
CODE_PROBE(kt == ISimulator::RebootAndDelete,
|
||||
"Simulated machine rebooted with data and coordination state deletion",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
CODE_PROBE(kt == ISimulator::RebootProcessAndDelete,
|
||||
"Simulated process rebooted with data and coordination state deletion",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
|
||||
if (p->rebooting || !p->isReliable()) {
|
||||
TraceEvent(SevDebug, "DoRebootFailed")
|
||||
|
|
|
@ -624,7 +624,7 @@ private:
|
|||
if (!initialCommit)
|
||||
txnStateStore->set(KeyValueRef(m.param1, m.param2));
|
||||
confChange = true;
|
||||
TEST(true); // Recovering at a higher version.
|
||||
CODE_PROBE(true, "Recovering at a higher version.");
|
||||
}
|
||||
|
||||
void checkSetVersionEpochKey(MutationRef m) {
|
||||
|
@ -636,7 +636,7 @@ private:
|
|||
if (!initialCommit)
|
||||
txnStateStore->set(KeyValueRef(m.param1, m.param2));
|
||||
confChange = true;
|
||||
TEST(true); // Setting version epoch
|
||||
CODE_PROBE(true, "Setting version epoch");
|
||||
}
|
||||
|
||||
void checkSetWriteRecoverKey(MutationRef m) {
|
||||
|
@ -646,7 +646,7 @@ private:
|
|||
TraceEvent("WriteRecoveryKeySet", dbgid).log();
|
||||
if (!initialCommit)
|
||||
txnStateStore->set(KeyValueRef(m.param1, m.param2));
|
||||
TEST(true); // Snapshot created, setting writeRecoveryKey in txnStateStore
|
||||
CODE_PROBE(true, "Snapshot created, setting writeRecoveryKey in txnStateStore");
|
||||
}
|
||||
|
||||
void checkSetTenantMapPrefix(MutationRef m) {
|
||||
|
@ -680,7 +680,7 @@ private:
|
|||
writeMutation(privatized);
|
||||
}
|
||||
|
||||
TEST(true); // Tenant added to map
|
||||
CODE_PROBE(true, "Tenant added to map");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1068,7 +1068,7 @@ private:
|
|||
writeMutation(privatized);
|
||||
}
|
||||
|
||||
TEST(true); // Tenant cleared from map
|
||||
CODE_PROBE(true, "Tenant cleared from map");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -72,7 +72,7 @@ struct VersionedMessage {
|
|||
if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader))
|
||||
return false;
|
||||
if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) {
|
||||
TEST(true); // Returning false for OTELSpanContextMessage
|
||||
CODE_PROBE(true, "Returning false for OTELSpanContextMessage");
|
||||
return false;
|
||||
}
|
||||
if (EncryptedMutationMessage::isNextIn(reader)) {
|
||||
|
|
|
@ -30,6 +30,18 @@
|
|||
#include "flow/UnitTest.h"
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
// serialize change feed key as UID bytes, to use 16 bytes on disk
|
||||
Key granuleIDToCFKey(UID granuleID) {
|
||||
BinaryWriter wr(Unversioned());
|
||||
wr << granuleID;
|
||||
return wr.toValue();
|
||||
}
|
||||
|
||||
// parse change feed key back to UID, to be human-readable
|
||||
UID cfKeyToGranuleID(Key cfKey) {
|
||||
return BinaryReader::fromStringRef<UID>(cfKey, Unversioned());
|
||||
}
|
||||
|
||||
// Gets the latest granule history node for range that was persisted
|
||||
ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr, KeyRange range) {
|
||||
state KeyRange historyRange = blobGranuleHistoryKeyRangeFor(range);
|
||||
|
@ -62,13 +74,14 @@ ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey,
|
|||
int64_t offset;
|
||||
int64_t length;
|
||||
int64_t fullFileLength;
|
||||
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
|
||||
|
||||
std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(it.key);
|
||||
ASSERT(gid == granuleID);
|
||||
|
||||
std::tie(filename, offset, length, fullFileLength) = decodeBlobGranuleFileValue(it.value);
|
||||
std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) = decodeBlobGranuleFileValue(it.value);
|
||||
|
||||
BlobFileIndex idx(version, filename.toString(), offset, length, fullFileLength);
|
||||
BlobFileIndex idx(version, filename.toString(), offset, length, fullFileLength, cipherKeysMeta);
|
||||
if (fileType == 'S') {
|
||||
ASSERT(files->snapshotFiles.empty() || files->snapshotFiles.back().version < idx.version);
|
||||
files->snapshotFiles.push_back(idx);
|
||||
|
@ -170,8 +183,12 @@ void GranuleFiles::getFiles(Version beginVersion,
|
|||
Version lastIncluded = invalidVersion;
|
||||
if (snapshotF != snapshotFiles.end()) {
|
||||
chunk.snapshotVersion = snapshotF->version;
|
||||
chunk.snapshotFile = BlobFilePointerRef(
|
||||
replyArena, snapshotF->filename, snapshotF->offset, snapshotF->length, snapshotF->fullFileLength);
|
||||
chunk.snapshotFile = BlobFilePointerRef(replyArena,
|
||||
snapshotF->filename,
|
||||
snapshotF->offset,
|
||||
snapshotF->length,
|
||||
snapshotF->fullFileLength,
|
||||
snapshotF->cipherKeysMeta);
|
||||
lastIncluded = chunk.snapshotVersion;
|
||||
} else {
|
||||
chunk.snapshotVersion = invalidVersion;
|
||||
|
|
|
@ -176,7 +176,7 @@ ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
|
|||
if (ranges.size() == 1) {
|
||||
return Void();
|
||||
}
|
||||
TEST(true); // clearAndAwaitMerge doing clear
|
||||
CODE_PROBE(true, "ClearAndAwaitMerge doing clear");
|
||||
tr.clear(range);
|
||||
wait(tr.commit());
|
||||
|
||||
|
|
|
@ -459,7 +459,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
|
|||
if (writeHot) {
|
||||
splitThreshold /= 3;
|
||||
}
|
||||
TEST(writeHot); // Change feed write hot split
|
||||
CODE_PROBE(writeHot, "Change feed write hot split");
|
||||
if (estimated.bytes > splitThreshold) {
|
||||
// only split on bytes and write rate
|
||||
state StorageMetrics splitMetrics;
|
||||
|
@ -495,7 +495,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
|
|||
ASSERT(keys.back() == range.end);
|
||||
return keys;
|
||||
} else {
|
||||
TEST(writeHot); // Not splitting write-hot because granules would be too small
|
||||
CODE_PROBE(writeHot, "Not splitting write-hot because granules would be too small");
|
||||
if (BM_DEBUG) {
|
||||
printf("Not splitting range\n");
|
||||
}
|
||||
|
@ -527,7 +527,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
|
|||
ACTOR Future<UID> pickWorkerForAssign(Reference<BlobManagerData> bmData) {
|
||||
// wait until there are BWs to pick from
|
||||
while (bmData->workerStats.size() == 0) {
|
||||
TEST(true); // BM wants to assign range, but no workers available
|
||||
CODE_PROBE(true, "BM wants to assign range, but no workers available");
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("BM {0} waiting for blob workers before assigning granules\n", bmData->epoch);
|
||||
}
|
||||
|
@ -685,7 +685,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
|
|||
throw;
|
||||
}
|
||||
|
||||
TEST(true); // BM retrying range assign
|
||||
CODE_PROBE(true, "BM retrying range assign");
|
||||
|
||||
// We use reliable delivery (getReply), so the broken_promise means the worker is dead, and we may need to retry
|
||||
// somewhere else
|
||||
|
@ -749,7 +749,7 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
|
|||
if (assignment.assign.get().type == AssignRequestType::Continue) {
|
||||
ASSERT(assignment.worker.present());
|
||||
if (i.range() != assignment.keyRange || i.cvalue() != assignment.worker.get()) {
|
||||
TEST(true); // BM assignment out of date
|
||||
CODE_PROBE(true, "BM assignment out of date");
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Out of date re-assign for ({0}, {1}). Assignment must have changed while "
|
||||
"checking split.\n Reassign: [{2} - {3}): {4}\n Existing: [{5} - {6}): {7}\n",
|
||||
|
@ -880,7 +880,7 @@ ACTOR Future<Void> writeInitialGranuleMapping(Reference<BlobManagerData> bmData,
|
|||
state int i = 0;
|
||||
state int transactionChunkSize = BUGGIFY ? deterministicRandom()->randomInt(2, 5) : 1000;
|
||||
while (i < boundaries.size() - 1) {
|
||||
TEST(i > 0); // multiple transactions for large granule split
|
||||
CODE_PROBE(i > 0, "multiple transactions for large granule split");
|
||||
tr->reset();
|
||||
state int j = 0;
|
||||
loop {
|
||||
|
@ -1176,7 +1176,7 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
|
|||
// Enforce max split fanout for performance reasons. This mainly happens when a blob worker is behind.
|
||||
if (newRanges.size() >=
|
||||
SERVER_KNOBS->BG_MAX_SPLIT_FANOUT + 2) { // +2 because this is boundaries, so N keys would have N+1 bounaries.
|
||||
TEST(true); // downsampling granule split because fanout too high
|
||||
CODE_PROBE(true, "downsampling granule split because fanout too high");
|
||||
Standalone<VectorRef<KeyRef>> coalescedRanges;
|
||||
coalescedRanges.arena().dependsOn(newRanges.arena());
|
||||
coalescedRanges.push_back(coalescedRanges.arena(), newRanges.front());
|
||||
|
@ -1250,7 +1250,7 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
|
|||
if (!existingState.empty()) {
|
||||
// Something was previously committed, we must go with that decision.
|
||||
// Read its boundaries and override our planned split boundaries
|
||||
TEST(true); // Overriding split ranges with existing ones from DB
|
||||
CODE_PROBE(true, "Overriding split ranges with existing ones from DB");
|
||||
RangeResult existingBoundaries =
|
||||
wait(tr->getRange(KeyRangeRef(granuleRange.begin.withPrefix(blobGranuleMappingKeys.begin),
|
||||
keyAfter(granuleRange.end).withPrefix(blobGranuleMappingKeys.begin)),
|
||||
|
@ -1628,7 +1628,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
|
|||
}
|
||||
}
|
||||
if (tmpWorkerId == UID()) {
|
||||
TEST(true); // All workers dead right now
|
||||
CODE_PROBE(true, "All workers dead right now");
|
||||
while (bmData->workersById.empty()) {
|
||||
wait(bmData->recruitingStream.onChange() || bmData->foundBlobWorkers.getFuture());
|
||||
}
|
||||
|
@ -1699,7 +1699,7 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
|
|||
mergeVersion,
|
||||
tr->getCommittedVersion());
|
||||
}
|
||||
TEST(true); // Granule merge complete
|
||||
CODE_PROBE(true, "Granule merge complete");
|
||||
return Void();
|
||||
} catch (Error& e) {
|
||||
wait(tr->onError(e));
|
||||
|
@ -1807,7 +1807,7 @@ static void attemptStartMerge(Reference<BlobManagerData> bmData,
|
|||
auto reCheckMergeCandidates = bmData->mergeCandidates.intersectingRanges(mergeRange);
|
||||
for (auto it : reCheckMergeCandidates) {
|
||||
if (!it->cvalue().canMergeNow()) {
|
||||
TEST(true); // granule no longer merge candidate after checking metrics, aborting merge
|
||||
CODE_PROBE(true, " granule no longer merge candidate after checking metrics, aborting merge");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1819,7 +1819,7 @@ static void attemptStartMerge(Reference<BlobManagerData> bmData,
|
|||
mergeRange.end.printable(),
|
||||
toMerge.size());
|
||||
}
|
||||
TEST(true); // Doing granule merge!
|
||||
CODE_PROBE(true, "Doing granule merge");
|
||||
bmData->activeGranuleMerges.insert(mergeRange, 0);
|
||||
bmData->clearMergeCandidate(mergeRange, MergeCandidateMerging);
|
||||
// Now, after setting activeGranuleMerges, we have committed to doing the merge, so any subsequent split eval for
|
||||
|
@ -1836,7 +1836,7 @@ ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
|
|||
for (int i = 0; i < candidates.size() - 1; i++) {
|
||||
ASSERT(std::get<1>(candidates[i]).end == std::get<1>(candidates[i + 1]).begin);
|
||||
}
|
||||
TEST(true); // Candidate ranges to merge
|
||||
CODE_PROBE(true, "Candidate ranges to merge");
|
||||
wait(bmData->concurrentMergeChecks.take());
|
||||
state FlowLock::Releaser holdingDVL(bmData->concurrentMergeChecks);
|
||||
|
||||
|
@ -1868,7 +1868,7 @@ ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
|
|||
currentBytes + metrics.bytes > SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
|
||||
currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2) {
|
||||
ASSERT(currentBytes <= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
|
||||
TEST(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2); // merge early because of key size
|
||||
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2, "merge early because of key size");
|
||||
attemptStartMerge(bmData, currentCandidates);
|
||||
currentCandidates.clear();
|
||||
currentBytes = 0;
|
||||
|
@ -1935,7 +1935,7 @@ ACTOR Future<Void> granuleMergeChecker(Reference<BlobManagerData> bmData) {
|
|||
mergeChecks.push_back(attemptMerges(bmData, currentCandidates));
|
||||
}
|
||||
|
||||
TEST(mergeChecks.size() > 1); // parallel merge checks
|
||||
CODE_PROBE(mergeChecks.size() > 1, "parallel merge checks");
|
||||
wait(waitForAll(mergeChecks));
|
||||
// if the calculation took longer than the desired interval, still wait a bit
|
||||
wait(intervalDelay && delay(5.0));
|
||||
|
@ -2130,7 +2130,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
}
|
||||
ignore = true;
|
||||
} else if (newEval < lastBoundaryEval.cvalue()) {
|
||||
TEST(true); // BM got out-of-date split request
|
||||
CODE_PROBE(true, "BM got out-of-date split request");
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("BM {0} ignoring status from BW {1} for granule [{2} - {3}) {4} since it "
|
||||
"already processed [{5} - {6}) {7}.\n",
|
||||
|
@ -2206,7 +2206,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
// suddenly gets a burst of writes after a decision to merge is made
|
||||
if (inProgressMergeVersion != invalidVersion) {
|
||||
if (rep.blockedVersion < inProgressMergeVersion) {
|
||||
TEST(true); // merge blocking re-snapshot
|
||||
CODE_PROBE(true, "merge blocking re-snapshot");
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("DBG: BM {0} MERGE @ {1} blocking re-snapshot [{2} - {3}) @ {4}, "
|
||||
"continuing snapshot\n",
|
||||
|
@ -2272,6 +2272,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
if (rep.mergeCandidate && !ignore) {
|
||||
// mark granule as merge candidate
|
||||
ASSERT(!rep.doSplit);
|
||||
CODE_PROBE(true, "Granule merge candidate");
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("Manager {0} merge candidate granule [{1} - {2}) {3}\n",
|
||||
bmData->epoch,
|
||||
|
@ -2307,7 +2308,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
|
|||
// if it is permanent, the failure monitor will eventually trip.
|
||||
ASSERT(e.code() != error_code_end_of_stream);
|
||||
if (e.code() == error_code_request_maybe_delivered || e.code() == error_code_connection_failed) {
|
||||
TEST(true); // BM retrying BW monitoring
|
||||
CODE_PROBE(true, "BM retrying BW monitoring");
|
||||
wait(delay(backoff));
|
||||
backoff = std::min(backoff * SERVER_KNOBS->BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT,
|
||||
SERVER_KNOBS->BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX);
|
||||
|
@ -2445,7 +2446,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
|
|||
if (oldEpoch > newEpoch || (oldEpoch == newEpoch && oldSeqno > newSeqno)) {
|
||||
newer.push_back(std::pair(old.range(), std::tuple(oldWorker, oldEpoch, oldSeqno)));
|
||||
if (old.range() != newRange) {
|
||||
TEST(true); // BM Recovery: BWs disagree on range boundaries
|
||||
CODE_PROBE(true, "BM Recovery: BWs disagree on range boundaries");
|
||||
anyConflicts = true;
|
||||
}
|
||||
} else {
|
||||
|
@ -2455,7 +2456,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
|
|||
ASSERT(oldEpoch != newEpoch || oldSeqno != newSeqno);
|
||||
}
|
||||
if (newEpoch == std::numeric_limits<int64_t>::max() && (oldWorker != newId || old.range() != newRange)) {
|
||||
TEST(true); // BM Recovery: DB disagrees with workers
|
||||
CODE_PROBE(true, "BM Recovery: DB disagrees with workers");
|
||||
// new one is from DB (source of truth on boundaries) and existing mapping disagrees on boundary or
|
||||
// assignment, do explicit revoke and re-assign to converge
|
||||
anyConflicts = true;
|
||||
|
@ -2479,7 +2480,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
|
|||
std::get<0>(old.value()) = UID();
|
||||
}
|
||||
if (outOfDate.empty() || outOfDate.back() != std::pair(oldWorker, KeyRange(old.range()))) {
|
||||
TEST(true); // BM Recovery: Two workers claim ownership of same granule
|
||||
CODE_PROBE(true, "BM Recovery: Two workers claim ownership of same granule");
|
||||
outOfDate.push_back(std::pair(oldWorker, old.range()));
|
||||
}
|
||||
}
|
||||
|
@ -2519,7 +2520,7 @@ ACTOR Future<Void> resumeActiveMerges(Reference<BlobManagerData> bmData) {
|
|||
|
||||
RangeResult result = wait(tr->getRange(currentRange, rowLimit));
|
||||
for (auto& it : result) {
|
||||
TEST(true); // Blob Manager Recovery found merging granule
|
||||
CODE_PROBE(true, "Blob Manager Recovery found merging granule");
|
||||
UID mergeGranuleID = decodeBlobGranuleMergeKey(it.key);
|
||||
KeyRange mergeRange;
|
||||
std::vector<UID> parentGranuleIDs;
|
||||
|
@ -2586,7 +2587,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
|
|||
|
||||
state Future<Void> resumeMergesFuture = resumeActiveMerges(bmData);
|
||||
|
||||
TEST(true); // BM doing recovery
|
||||
CODE_PROBE(true, "BM doing recovery");
|
||||
|
||||
wait(delay(0));
|
||||
|
||||
|
@ -2667,7 +2668,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
|
|||
bmData->workerStats[workerId].numGranulesAssigned = reply.get().assignments.size();
|
||||
}
|
||||
} else {
|
||||
TEST(true); // BM Recovery: BW didn't respond to assignments request
|
||||
CODE_PROBE(true, "BM Recovery: BW didn't respond to assignments request");
|
||||
// SOMEDAY: mark as failed and kill it
|
||||
if (BM_DEBUG) {
|
||||
fmt::print(" Worker {}: failed\n", workerId.toString().substr(0, 5));
|
||||
|
@ -2771,7 +2772,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
|
|||
}
|
||||
|
||||
// revoke assignments that are old and incorrect
|
||||
TEST(!outOfDateAssignments.empty()); // BM resolved conflicting assignments on recovery
|
||||
CODE_PROBE(!outOfDateAssignments.empty(), "BM resolved conflicting assignments on recovery");
|
||||
for (auto& it : outOfDateAssignments) {
|
||||
if (BM_DEBUG) {
|
||||
fmt::print("BM {0} revoking out of date assignment [{1} - {2}): {3}:\n",
|
||||
|
@ -2841,7 +2842,7 @@ ACTOR Future<Void> chaosRangeMover(Reference<BlobManagerData> bmData) {
|
|||
// KeyRange isn't hashable and this is only for simulation, so just use toString of range
|
||||
state std::unordered_set<std::string> alreadyMoved;
|
||||
ASSERT(g_network->isSimulated());
|
||||
TEST(true); // BM chaos range mover enabled
|
||||
CODE_PROBE(true, "BM chaos range mover enabled");
|
||||
loop {
|
||||
wait(delay(30.0));
|
||||
|
||||
|
@ -2945,7 +2946,7 @@ ACTOR Future<Void> initializeBlobWorker(Reference<BlobManagerData> self, Recruit
|
|||
// if it failed in an expected way, add some delay before we try to recruit again
|
||||
// on this worker
|
||||
if (newBlobWorker.isError()) {
|
||||
TEST(true); // BM got error recruiting BW
|
||||
CODE_PROBE(true, "BM got error recruiting BW");
|
||||
TraceEvent(SevWarn, "BMRecruitmentError", self->id)
|
||||
.error(newBlobWorker.getError())
|
||||
.detail("Epoch", self->epoch);
|
||||
|
@ -3049,7 +3050,7 @@ ACTOR Future<Void> blobWorkerRecruiter(
|
|||
if (e.code() != error_code_timed_out) {
|
||||
throw;
|
||||
}
|
||||
TEST(true); // Blob worker recruitment timed out
|
||||
CODE_PROBE(true, "Blob worker recruitment timed out");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3737,7 +3738,7 @@ ACTOR Future<Void> doLockChecks(Reference<BlobManagerData> bmData) {
|
|||
wait(check.getFuture());
|
||||
wait(delay(0.5)); // don't do this too often if a lot of conflict
|
||||
|
||||
TEST(true); // BM doing lock checks after getting conflicts
|
||||
CODE_PROBE(true, "BM doing lock checks after getting conflicts");
|
||||
|
||||
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bmData->db);
|
||||
|
||||
|
|
|
@ -18,12 +18,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "fmt/format.h"
|
||||
#include "fdbclient/ClientBooleanParams.h"
|
||||
#include "fdbclient/BlobGranuleFiles.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/KeyRangeMap.h"
|
||||
|
@ -40,18 +35,30 @@
|
|||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbclient/Notified.h"
|
||||
|
||||
#include "fdbserver/Knobs.h"
|
||||
#include "fdbserver/BlobGranuleServerCommon.actor.h"
|
||||
|
||||
#include "fdbserver/GetEncryptCipherKeys.h"
|
||||
#include "fdbserver/Knobs.h"
|
||||
#include "fdbserver/MutationTracking.h"
|
||||
#include "fdbserver/WaitFailure.h"
|
||||
#include "fdbserver/ServerDBInfo.h"
|
||||
#include "fdbserver/WaitFailure.h"
|
||||
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "flow/CompressionUtils.h"
|
||||
#include "flow/EncryptUtils.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/network.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/xxhash.h"
|
||||
|
||||
#include "fmt/format.h"
|
||||
|
||||
#include <limits>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
|
@ -61,8 +68,8 @@
|
|||
|
||||
/*
|
||||
* The Blob Worker is a stateless role assigned a set of granules by the Blob Manager.
|
||||
* It is responsible for managing the change feeds for those granules, and for consuming the mutations from those change
|
||||
* feeds and writing them out as files to blob storage.
|
||||
* It is responsible for managing the change feeds for those granules, and for consuming the mutations from
|
||||
* those change feeds and writing them out as files to blob storage.
|
||||
*/
|
||||
|
||||
struct GranuleStartState {
|
||||
|
@ -182,6 +189,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
|
|||
Reference<BlobConnectionProvider> bstore;
|
||||
KeyRangeMap<GranuleRangeMetadata> granuleMetadata;
|
||||
BGTenantMap tenantData;
|
||||
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
|
||||
|
||||
// contains the history of completed granules before the existing ones. Maps to the latest one, and has
|
||||
// back-pointers to earlier granules
|
||||
|
@ -199,8 +207,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
|
|||
|
||||
int changeFeedStreamReplyBufferSize = SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES / 2;
|
||||
|
||||
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
|
||||
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), tenantData(BGTenantMap(dbInfo)),
|
||||
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInf, Database db)
|
||||
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), tenantData(BGTenantMap(dbInf)), dbInfo(dbInf),
|
||||
initialSnapshotLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM) {}
|
||||
|
||||
bool managerEpochOk(int64_t epoch) {
|
||||
|
@ -225,8 +233,23 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
|
|||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
bool isBlobFileEncryptionSupported() {
|
||||
bool supported = SERVER_KNOBS->ENABLE_BLOB_GRANULE_ENCRYPTION && SERVER_KNOBS->BG_RANGE_SOURCE == "tenant";
|
||||
ASSERT((supported && SERVER_KNOBS->ENABLE_ENCRYPTION) || !supported);
|
||||
return supported;
|
||||
}
|
||||
|
||||
Optional<CompressionFilter> getBlobFileCompressFilter() {
|
||||
Optional<CompressionFilter> compFilter;
|
||||
if (SERVER_KNOBS->ENABLE_BLOB_GRANULE_COMPRESSION) {
|
||||
compFilter = CompressionUtils::fromFilterString(SERVER_KNOBS->BLOB_GRANULE_COMPRESSION_FILTER);
|
||||
}
|
||||
return compFilter;
|
||||
}
|
||||
|
||||
// returns true if we can acquire it
|
||||
static void acquireGranuleLock(int64_t epoch, int64_t seqno, int64_t prevOwnerEpoch, int64_t prevOwnerSeqno) {
|
||||
void acquireGranuleLock(int64_t epoch, int64_t seqno, int64_t prevOwnerEpoch, int64_t prevOwnerSeqno) {
|
||||
// returns true if our lock (E, S) >= (Eprev, Sprev)
|
||||
if (epoch < prevOwnerEpoch || (epoch == prevOwnerEpoch && seqno < prevOwnerSeqno)) {
|
||||
if (BW_DEBUG) {
|
||||
|
@ -240,7 +263,7 @@ static void acquireGranuleLock(int64_t epoch, int64_t seqno, int64_t prevOwnerEp
|
|||
}
|
||||
}
|
||||
|
||||
static void checkGranuleLock(int64_t epoch, int64_t seqno, int64_t ownerEpoch, int64_t ownerSeqno) {
|
||||
void checkGranuleLock(int64_t epoch, int64_t seqno, int64_t ownerEpoch, int64_t ownerSeqno) {
|
||||
// sanity check - lock value should never go backwards because of acquireGranuleLock
|
||||
ASSERT(epoch <= ownerEpoch);
|
||||
ASSERT(epoch < ownerEpoch || (epoch == ownerEpoch && seqno <= ownerSeqno));
|
||||
|
@ -257,6 +280,112 @@ static void checkGranuleLock(int64_t epoch, int64_t seqno, int64_t ownerEpoch, i
|
|||
throw granule_assignment_conflict();
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Below actors asssit in fetching/lookup desired encryption keys. Following steps are done for an encryption key
|
||||
// lookup:
|
||||
// 1. Lookup proccess local in-memory cache `BlobCipherKeyCache` to check if desired EK is 'present' and 'valid'. Given
|
||||
// FDB supports 'revocable' & 'non-revocable' EKs; a cached EK can also be 'invalid'.
|
||||
// 2. Local cache miss will follow with a RPC call to EncryptKeyProxy process (EKP), EKP maintain an in-memory cache of
|
||||
// KMS BaseCipher details with KMS defined TTL if applicable. The lookup call can either to serviced by EKP or would
|
||||
// lead to desired KMS endpoint invocation.
|
||||
//
|
||||
// In most of the cases, the EK lookup should be satisfied by process local in-memory cache and/or EKP in-memory cache,
|
||||
// unless cluster and/or a process crash/restart.
|
||||
|
||||
ACTOR Future<BlobGranuleCipherKeysCtx> getLatestGranuleCipherKeys(Reference<BlobWorkerData> bwData,
|
||||
KeyRange keyRange,
|
||||
Arena* arena) {
|
||||
state BlobGranuleCipherKeysCtx cipherKeysCtx;
|
||||
state Reference<GranuleTenantData> tenantData = bwData->tenantData.getDataForGranule(keyRange);
|
||||
|
||||
ASSERT(tenantData.isValid());
|
||||
|
||||
std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> domains;
|
||||
domains.emplace(tenantData->entry.id, StringRef(*arena, tenantData->name));
|
||||
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> domainKeyMap =
|
||||
wait(getLatestEncryptCipherKeys(bwData->dbInfo, domains));
|
||||
|
||||
auto domainKeyItr = domainKeyMap.find(tenantData->entry.id);
|
||||
ASSERT(domainKeyItr != domainKeyMap.end());
|
||||
cipherKeysCtx.textCipherKey = BlobGranuleCipherKey::fromBlobCipherKey(domainKeyItr->second, *arena);
|
||||
|
||||
TextAndHeaderCipherKeys systemCipherKeys = wait(getLatestSystemEncryptCipherKeys(bwData->dbInfo));
|
||||
cipherKeysCtx.headerCipherKey = BlobGranuleCipherKey::fromBlobCipherKey(systemCipherKeys.cipherHeaderKey, *arena);
|
||||
|
||||
cipherKeysCtx.ivRef = makeString(AES_256_IV_LENGTH, *arena);
|
||||
generateRandomData(mutateString(cipherKeysCtx.ivRef), AES_256_IV_LENGTH);
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent(SevDebug, "GetLatestGranuleCipherKey")
|
||||
.detail("TextDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
|
||||
.detail("TextBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
|
||||
.detail("TextSalt", cipherKeysCtx.textCipherKey.salt)
|
||||
.detail("HeaderDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
|
||||
.detail("HeaderBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
|
||||
.detail("HeaderSalt", cipherKeysCtx.textCipherKey.salt)
|
||||
.detail("IVChksum", XXH3_64bits(cipherKeysCtx.ivRef.begin(), cipherKeysCtx.ivRef.size()));
|
||||
}
|
||||
|
||||
return cipherKeysCtx;
|
||||
}
|
||||
|
||||
ACTOR Future<BlobGranuleCipherKey> lookupCipherKey(Reference<BlobWorkerData> bwData,
|
||||
BlobCipherDetails cipherDetails,
|
||||
Arena* arena) {
|
||||
std::unordered_set<BlobCipherDetails> cipherDetailsSet;
|
||||
cipherDetailsSet.emplace(cipherDetails);
|
||||
state std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> cipherKeyMap =
|
||||
wait(getEncryptCipherKeys(bwData->dbInfo, cipherDetailsSet));
|
||||
|
||||
ASSERT(cipherKeyMap.size() == 1);
|
||||
|
||||
auto cipherKeyMapItr = cipherKeyMap.find(cipherDetails);
|
||||
if (cipherKeyMapItr == cipherKeyMap.end()) {
|
||||
TraceEvent(SevError, "CipherKeyLookup_Failure")
|
||||
.detail("EncryptDomainId", cipherDetails.encryptDomainId)
|
||||
.detail("BaseCipherId", cipherDetails.baseCipherId)
|
||||
.detail("Salt", cipherDetails.salt);
|
||||
throw encrypt_keys_fetch_failed();
|
||||
}
|
||||
|
||||
return BlobGranuleCipherKey::fromBlobCipherKey(cipherKeyMapItr->second, *arena);
|
||||
}
|
||||
|
||||
ACTOR Future<BlobGranuleCipherKeysCtx> getGranuleCipherKeys(Reference<BlobWorkerData> bwData,
|
||||
BlobGranuleCipherKeysMetaRef cipherKeysMetaRef,
|
||||
Arena* arena) {
|
||||
state BlobGranuleCipherKeysCtx cipherKeysCtx;
|
||||
|
||||
// Fetch 'textCipher' key
|
||||
state BlobCipherDetails textCipherDetails(
|
||||
cipherKeysMetaRef.textDomainId, cipherKeysMetaRef.textBaseCipherId, cipherKeysMetaRef.textSalt);
|
||||
BlobGranuleCipherKey textCipherKey = wait(lookupCipherKey(bwData, textCipherDetails, arena));
|
||||
cipherKeysCtx.textCipherKey = textCipherKey;
|
||||
|
||||
// Fetch 'headerCipher' key
|
||||
state BlobCipherDetails headerCipherDetails(
|
||||
cipherKeysMetaRef.headerDomainId, cipherKeysMetaRef.headerBaseCipherId, cipherKeysMetaRef.headerSalt);
|
||||
BlobGranuleCipherKey headerCipherKey = wait(lookupCipherKey(bwData, headerCipherDetails, arena));
|
||||
cipherKeysCtx.headerCipherKey = headerCipherKey;
|
||||
|
||||
// Populate 'Intialization Vector'
|
||||
ASSERT_EQ(cipherKeysMetaRef.ivRef.size(), AES_256_IV_LENGTH);
|
||||
cipherKeysCtx.ivRef = StringRef(*arena, cipherKeysMetaRef.ivRef);
|
||||
|
||||
if (BG_ENCRYPT_COMPRESS_DEBUG) {
|
||||
TraceEvent("GetGranuleCipherKey")
|
||||
.detail("TextDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
|
||||
.detail("TextBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
|
||||
.detail("TextSalt", cipherKeysCtx.textCipherKey.salt)
|
||||
.detail("HeaderDomainId", cipherKeysCtx.textCipherKey.encryptDomainId)
|
||||
.detail("HeaderBaseCipherId", cipherKeysCtx.textCipherKey.baseCipherId)
|
||||
.detail("HeaderSalt", cipherKeysCtx.textCipherKey.salt)
|
||||
.detail("IVChksum", XXH3_64bits(cipherKeysCtx.ivRef.begin(), cipherKeysCtx.ivRef.size()));
|
||||
}
|
||||
|
||||
return cipherKeysCtx;
|
||||
}
|
||||
|
||||
ACTOR Future<Void> readAndCheckGranuleLock(Reference<ReadYourWritesTransaction> tr,
|
||||
KeyRange granuleRange,
|
||||
|
@ -410,7 +539,7 @@ ACTOR Future<Void> updateGranuleSplitState(Transaction* tr,
|
|||
|
||||
// tr->clear(singleKeyRange(oldGranuleLockKey));
|
||||
tr->clear(currentRange);
|
||||
TEST(true); // Granule split cleanup on last delta file persisted
|
||||
CODE_PROBE(true, "Granule split cleanup on last delta file persisted");
|
||||
} else {
|
||||
tr->atomicOp(myStateKey, blobGranuleSplitValueFor(newState), MutationRef::SetVersionstampedValue);
|
||||
if (newState == BlobGranuleSplitState::Assigned && currentState == BlobGranuleSplitState::Initialized &&
|
||||
|
@ -425,10 +554,10 @@ ACTOR Future<Void> updateGranuleSplitState(Transaction* tr,
|
|||
wait(updateChangeFeed(
|
||||
tr, KeyRef(granuleIDToCFKey(parentGranuleID)), ChangeFeedStatus::CHANGE_FEED_STOP));
|
||||
}
|
||||
TEST(true); // Granule split stopping change feed
|
||||
CODE_PROBE(true, "Granule split stopping change feed");
|
||||
}
|
||||
} else if (BW_DEBUG) {
|
||||
TEST(true); // Out of order granule split state updates ignored
|
||||
CODE_PROBE(true, "Out of order granule split state updates ignored");
|
||||
fmt::print("Ignoring granule {0} split state from {1} {2} -> {3}\n",
|
||||
currentGranuleID.toString(),
|
||||
parentGranuleID.toString(),
|
||||
|
@ -549,13 +678,13 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
|
|||
// commit a transaction, we can and want to safely delete the file we wrote. Otherwise, we may have updated FDB
|
||||
// with file and cannot safely delete it.
|
||||
if (numIterations > 0) {
|
||||
TEST(true); // Granule potentially leaving orphaned delta file
|
||||
CODE_PROBE(true, "Granule potentially leaving orphaned delta file");
|
||||
throw e;
|
||||
}
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("deleting delta file {0} after error {1}\n", fname, e.name());
|
||||
}
|
||||
TEST(true); // Granule cleaning up delta file after error
|
||||
CODE_PROBE(true, "Granule cleaning up delta file after error");
|
||||
++bwData->stats.s3DeleteReqs;
|
||||
bwData->addActor.send(writeBStore->deleteFile(fname));
|
||||
throw e;
|
||||
|
@ -614,7 +743,18 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
}
|
||||
|
||||
state Value serialized = serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS);
|
||||
state Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx;
|
||||
state Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
|
||||
state Arena arena;
|
||||
if (isBlobFileEncryptionSupported()) {
|
||||
BlobGranuleCipherKeysCtx ciphKeysCtx = wait(getLatestGranuleCipherKeys(bwData, keyRange, &arena));
|
||||
cipherKeysCtx = ciphKeysCtx;
|
||||
cipherKeysMeta = BlobGranuleCipherKeysCtx::toCipherKeysMeta(cipherKeysCtx.get());
|
||||
}
|
||||
|
||||
Optional<CompressionFilter> compressFilter = getBlobFileCompressFilter();
|
||||
state Value serialized =
|
||||
serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS, compressFilter, cipherKeysCtx);
|
||||
state size_t serializedSize = serialized.size();
|
||||
|
||||
// free snapshot to reduce memory
|
||||
|
@ -650,7 +790,8 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
|
|||
numIterations++;
|
||||
Key snapshotFileKey = blobGranuleFileKeyFor(granuleID, version, 'S');
|
||||
// TODO change once we support file multiplexing
|
||||
Key snapshotFileValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize);
|
||||
Key snapshotFileValue =
|
||||
blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize, cipherKeysMeta);
|
||||
tr->set(snapshotFileKey, snapshotFileValue);
|
||||
// create granule history at version if this is a new granule with the initial dump from FDB
|
||||
if (createGranuleHistory) {
|
||||
|
@ -670,13 +811,13 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
|
|||
// commit a transaction, we can and want to safely delete the file we wrote. Otherwise, we may have updated FDB
|
||||
// with file and cannot safely delete it.
|
||||
if (numIterations > 0) {
|
||||
TEST(true); // Granule potentially leaving orphaned snapshot file
|
||||
CODE_PROBE(true, "Granule potentially leaving orphaned snapshot file");
|
||||
throw e;
|
||||
}
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("deleting snapshot file {0} after error {1}\n", fname, e.name());
|
||||
}
|
||||
TEST(true); // Granule deleting snapshot file after error
|
||||
CODE_PROBE(true, "Granule deleting snapshot file after error");
|
||||
++bwData->stats.s3DeleteReqs;
|
||||
bwData->addActor.send(writeBStore->deleteFile(fname));
|
||||
throw e;
|
||||
|
@ -695,7 +836,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
|
||||
// FIXME: change when we implement multiplexing
|
||||
return BlobFileIndex(version, fname, 0, serializedSize, serializedSize);
|
||||
return BlobFileIndex(version, fname, 0, serializedSize, serializedSize, cipherKeysMeta);
|
||||
}
|
||||
|
||||
ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData> bwData,
|
||||
|
@ -766,7 +907,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
|
|||
wait(tr->onError(e));
|
||||
}
|
||||
retries++;
|
||||
TEST(true); // Granule initial snapshot failed
|
||||
CODE_PROBE(true, "Granule initial snapshot failed");
|
||||
// FIXME: why can't we supress error event?
|
||||
TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id)
|
||||
.error(err)
|
||||
|
@ -803,14 +944,14 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
|
|||
state Arena filenameArena;
|
||||
state std::vector<Future<RangeResult>> chunksToRead;
|
||||
state int64_t compactBytesRead = 0;
|
||||
for (auto& files : fileSet) {
|
||||
ASSERT(!files.snapshotFiles.empty());
|
||||
ASSERT(!files.deltaFiles.empty());
|
||||
for (auto& f : fileSet) {
|
||||
ASSERT(!f.snapshotFiles.empty());
|
||||
ASSERT(!f.deltaFiles.empty());
|
||||
|
||||
state BlobGranuleChunkRef chunk;
|
||||
|
||||
state GranuleFiles files = f;
|
||||
state Version snapshotVersion = files.snapshotFiles.back().version;
|
||||
BlobFileIndex snapshotF = files.snapshotFiles.back();
|
||||
state BlobFileIndex snapshotF = files.snapshotFiles.back();
|
||||
|
||||
if (snapshotVersion >= version) {
|
||||
fmt::print("Chunk snapshot version [{0} - {1}) @ {2} >= compact version {3}\n",
|
||||
|
@ -821,8 +962,21 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
ASSERT(snapshotVersion < version);
|
||||
|
||||
chunk.snapshotFile = BlobFilePointerRef(
|
||||
filenameArena, snapshotF.filename, snapshotF.offset, snapshotF.length, snapshotF.fullFileLength);
|
||||
chunk.snapshotFile = BlobFilePointerRef(filenameArena,
|
||||
snapshotF.filename,
|
||||
snapshotF.offset,
|
||||
snapshotF.length,
|
||||
snapshotF.fullFileLength,
|
||||
snapshotF.cipherKeysMeta);
|
||||
|
||||
// TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known
|
||||
if (chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
|
||||
ASSERT(isBlobFileEncryptionSupported());
|
||||
BlobGranuleCipherKeysCtx cipherKeysCtx =
|
||||
wait(getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &filenameArena));
|
||||
chunk.cipherKeysCtx = cipherKeysCtx;
|
||||
}
|
||||
|
||||
compactBytesRead += snapshotF.length;
|
||||
int deltaIdx = files.deltaFiles.size() - 1;
|
||||
while (deltaIdx >= 0 && files.deltaFiles[deltaIdx].version > snapshotVersion) {
|
||||
|
@ -975,8 +1129,7 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
|
|||
if (e.code() == error_code_operation_cancelled) {
|
||||
throw e;
|
||||
}
|
||||
TEST(true); // Blob worker re-sending split evaluation to manager after not error/not hearing
|
||||
// back
|
||||
CODE_PROBE(true, "Blob worker re-sending split evaluation to manager after not error/not hearing back");
|
||||
// if we got broken promise while waiting, the old stream was killed, so we don't need to wait
|
||||
// on change, just retry
|
||||
if (e.code() == error_code_broken_promise) {
|
||||
|
@ -1047,11 +1200,11 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
|
|||
if (currentMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2 ||
|
||||
currentMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) {
|
||||
wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 2.0));
|
||||
TEST(true); // wait and check later to see if granule got smaller or colder
|
||||
CODE_PROBE(true, "wait and check later to see if granule got smaller or colder");
|
||||
continue;
|
||||
}
|
||||
|
||||
TEST(true); // Blob Worker identified merge candidate granule
|
||||
CODE_PROBE(true, "Blob Worker identified merge candidate granule");
|
||||
|
||||
// if we are a merge candidate, send a message to the BM. Once successful, this actor is complete
|
||||
while (!bwData->statusStreamInitialized) {
|
||||
|
@ -1072,7 +1225,7 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
|
||||
if (now() >= sendTimeGiveUp) {
|
||||
TEST(true); // Blob worker could not send merge candidate in time, re-checking status
|
||||
CODE_PROBE(true, "Blob worker could not send merge candidate in time, re-checking status");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1093,13 +1246,13 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
|
|||
wait(bwData->currentManagerStatusStream.onChange());
|
||||
wait(delay(0));
|
||||
}
|
||||
TEST(true); // Blob worker re-sending merge candidate to new manager
|
||||
CODE_PROBE(true, "Blob worker re-sending merge candidate to new manager");
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_operation_cancelled) {
|
||||
throw e;
|
||||
}
|
||||
|
||||
TEST(true); // Blob worker re-sending merge candidate to manager after not error/not hearing back
|
||||
CODE_PROBE(true, "Blob worker re-sending merge candidate to manager after not error/not hearing back");
|
||||
|
||||
// if we got broken promise while waiting, the old stream was killed, so we don't need to wait
|
||||
// on change, just retry
|
||||
|
@ -1113,13 +1266,14 @@ ACTOR Future<Void> granuleCheckMergeCandidate(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
}
|
||||
|
||||
static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
|
||||
Reference<GranuleMetadata> metadata,
|
||||
BlobFileIndex completedDeltaFile,
|
||||
Key cfKey,
|
||||
Version cfStartVersion,
|
||||
std::deque<std::pair<Version, Version>>* rollbacksCompleted,
|
||||
std::deque<Future<Void>>& inFlightPops) {
|
||||
namespace {
|
||||
void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
|
||||
Reference<GranuleMetadata> metadata,
|
||||
BlobFileIndex completedDeltaFile,
|
||||
Key cfKey,
|
||||
Version cfStartVersion,
|
||||
std::deque<std::pair<Version, Version>>* rollbacksCompleted,
|
||||
std::deque<Future<Void>>& inFlightPops) {
|
||||
metadata->files.deltaFiles.push_back(completedDeltaFile);
|
||||
ASSERT(metadata->durableDeltaVersion.get() < completedDeltaFile.version);
|
||||
metadata->durableDeltaVersion.set(completedDeltaFile.version);
|
||||
|
@ -1155,7 +1309,7 @@ static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
|
||||
// if we get an i/o error updating files, or a rollback, reassign the granule to ourselves and start fresh
|
||||
static bool granuleCanRetry(const Error& e) {
|
||||
bool granuleCanRetry(const Error& e) {
|
||||
switch (e.code()) {
|
||||
case error_code_io_error:
|
||||
case error_code_io_timeout:
|
||||
|
@ -1170,6 +1324,7 @@ static bool granuleCanRetry(const Error& e) {
|
|||
return false;
|
||||
};
|
||||
}
|
||||
} // namespace
|
||||
|
||||
struct InFlightFile {
|
||||
Future<BlobFileIndex> future;
|
||||
|
@ -1181,12 +1336,13 @@ struct InFlightFile {
|
|||
: future(future), version(version), bytes(bytes), snapshot(snapshot) {}
|
||||
};
|
||||
|
||||
static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
||||
Version mutationVersion,
|
||||
Version rollbackVersion,
|
||||
std::deque<InFlightFile>& inFlightFiles,
|
||||
std::deque<std::pair<Version, Version>>& rollbacksInProgress,
|
||||
std::deque<std::pair<Version, Version>>& rollbacksCompleted) {
|
||||
namespace {
|
||||
Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
||||
Version mutationVersion,
|
||||
Version rollbackVersion,
|
||||
std::deque<InFlightFile>& inFlightFiles,
|
||||
std::deque<std::pair<Version, Version>>& rollbacksInProgress,
|
||||
std::deque<std::pair<Version, Version>>& rollbacksCompleted) {
|
||||
Version cfRollbackVersion;
|
||||
if (metadata->pendingDeltaVersion > rollbackVersion) {
|
||||
// if we already started writing mutations to a delta or snapshot file with version > rollbackVersion,
|
||||
|
@ -1199,7 +1355,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
for (auto& f : inFlightFiles) {
|
||||
if (f.snapshot) {
|
||||
if (f.version > rollbackVersion) {
|
||||
TEST(true); // Granule rollback cancelling snapshot file
|
||||
CODE_PROBE(true, "Granule rollback cancelling snapshot file");
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("[{0} - {1}) rollback cancelling snapshot file @ {2}\n",
|
||||
metadata->keyRange.begin.printable(),
|
||||
|
@ -1220,7 +1376,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
metadata->bytesInNewDeltaFiles -= f.bytes;
|
||||
}
|
||||
toPop++;
|
||||
TEST(true); // Granule rollback cancelling delta file
|
||||
CODE_PROBE(true, "Granule rollback cancelling delta file");
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("[{0} - {1}) rollback cancelling delta file @ {2}\n",
|
||||
metadata->keyRange.begin.printable(),
|
||||
|
@ -1275,7 +1431,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
|
||||
} else {
|
||||
// No pending delta files to discard, just in-memory mutations
|
||||
TEST(true); // Granule rollback discarding in memory mutations
|
||||
CODE_PROBE(true, "Granule rollback discarding in memory mutations");
|
||||
|
||||
// FIXME: could binary search?
|
||||
int mIdx = metadata->currentDeltas.size() - 1;
|
||||
|
@ -1337,6 +1493,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
|
|||
|
||||
return cfRollbackVersion;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ACTOR Future<Void> waitOnCFVersion(Reference<GranuleMetadata> metadata, Version waitVersion) {
|
||||
loop {
|
||||
|
@ -1479,7 +1636,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
|
||||
if (!startState.doSnapshot) {
|
||||
TEST(true); // Granule moved without split
|
||||
CODE_PROBE(true, "Granule moved without split");
|
||||
startVersion = startState.previousDurableVersion;
|
||||
ASSERT(!metadata->files.snapshotFiles.empty());
|
||||
metadata->pendingSnapshotVersion = metadata->files.snapshotFiles.back().version;
|
||||
|
@ -1640,7 +1797,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
// popped up to V+1 is ok. Or in other words, if the last delta @ V, we only missed data
|
||||
// at V+1 onward if popVersion >= V+2
|
||||
if (metadata->bufferedDeltaVersion < metadata->activeCFData.get()->popVersion - 1) {
|
||||
TEST(true); // Blob Worker detected popped
|
||||
CODE_PROBE(true, "Blob Worker detected popped");
|
||||
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
|
||||
.detail("Granule", metadata->keyRange)
|
||||
.detail("GranuleID", startState.granuleID)
|
||||
|
@ -1738,7 +1895,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
if (metadata->pendingDeltaVersion <= rollbackVersion &&
|
||||
(metadata->currentDeltas.empty() ||
|
||||
metadata->currentDeltas.back().version <= rollbackVersion)) {
|
||||
TEST(true); // Granule ignoring rollback
|
||||
CODE_PROBE(true, "Granule ignoring rollback");
|
||||
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("Granule [{0} - {1}) on BW {2} skipping rollback {3} -> {4} "
|
||||
|
@ -1755,7 +1912,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
// rollbackInProgress when we restart the stream.
|
||||
rollbacksCompleted.push_back(std::pair(rollbackVersion, deltas.version));
|
||||
} else {
|
||||
TEST(true); // Granule processing rollback
|
||||
CODE_PROBE(true, "Granule processing rollback");
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("[{0} - {1}) on BW {2} ROLLBACK @ {3} -> {4}\n",
|
||||
metadata->keyRange.begin.printable(),
|
||||
|
@ -1786,7 +1943,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
// change feed
|
||||
ASSERT(cfRollbackVersion >= startState.previousDurableVersion);
|
||||
ASSERT(cfRollbackVersion >= metadata->durableDeltaVersion.get());
|
||||
TEST(true); // rollback crossed change feed boundaries
|
||||
CODE_PROBE(true, "rollback crossed change feed boundaries");
|
||||
readOldChangeFeed = true;
|
||||
oldChangeFeedDataComplete.reset();
|
||||
}
|
||||
|
@ -1832,7 +1989,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
}
|
||||
} else if (!rollbacksInProgress.empty() && rollbacksInProgress.front().first < deltas.version &&
|
||||
rollbacksInProgress.front().second > deltas.version) {
|
||||
TEST(true); // Granule skipping mutations b/c prior rollback
|
||||
CODE_PROBE(true, "Granule skipping mutations b/c prior rollback");
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("Skipping mutations @ {} b/c prior rollback\n", deltas.version);
|
||||
}
|
||||
|
@ -1875,7 +2032,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
// The force flush contract is a version cannot be put in forceFlushVersion unless the change feed
|
||||
// is already whenAtLeast that version
|
||||
bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion;
|
||||
TEST(forceFlush); // Force flushing granule
|
||||
CODE_PROBE(forceFlush, "Force flushing granule");
|
||||
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) {
|
||||
TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id)
|
||||
.detail("Granule", metadata->keyRange)
|
||||
|
@ -1914,7 +2071,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
// write/read a bunch of empty blob files
|
||||
ASSERT(forceFlush);
|
||||
ASSERT(!forceFlushVersions.empty());
|
||||
TEST(true); // Force flushing empty delta file!
|
||||
CODE_PROBE(true, "Force flushing empty delta file!");
|
||||
}
|
||||
|
||||
if (BW_DEBUG) {
|
||||
|
@ -2042,7 +2199,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
idx++;
|
||||
}
|
||||
while (waitIdx > 0) {
|
||||
TEST(true); // Granule blocking on previous snapshot
|
||||
CODE_PROBE(true, "Granule blocking on previous snapshot");
|
||||
// TODO don't duplicate code
|
||||
BlobFileIndex completedFile = wait(inFlightFiles.front().future);
|
||||
if (inFlightFiles.front().snapshot) {
|
||||
|
@ -2083,7 +2240,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
// queue too many files in parallel, and slow down change feed consuming to let file writing
|
||||
// catch up
|
||||
|
||||
TEST(true); // Granule processing long tail of old change feed
|
||||
CODE_PROBE(true, "Granule processing long tail of old change feed");
|
||||
if (inFlightFiles.size() > 10 && inFlightFiles.front().version <= metadata->knownCommittedVersion) {
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("[{0} - {1}) Waiting on delta file b/c old change feed\n",
|
||||
|
@ -2137,7 +2294,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
|
|||
++bwData->stats.granuleUpdateErrors;
|
||||
|
||||
if (granuleCanRetry(e)) {
|
||||
TEST(true); // Granule close and re-open on error
|
||||
CODE_PROBE(true, "Granule close and re-open on error");
|
||||
TraceEvent("GranuleFileUpdaterRetriableError", bwData->id)
|
||||
.error(e)
|
||||
.detail("Granule", metadata->keyRange)
|
||||
|
@ -2313,7 +2470,7 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
|
|||
next.version);
|
||||
}
|
||||
} else {
|
||||
TEST(true); // duplicate parent in granule history (split then merge)
|
||||
CODE_PROBE(true, "duplicate parent in granule history (split then merge)");
|
||||
if (BW_HISTORY_DEBUG) {
|
||||
fmt::print("HL {0} {1}) [{2} - {3}) @ {4}: duplicate parent [{5} - "
|
||||
"{6}) @ {7}\n",
|
||||
|
@ -2533,11 +2690,12 @@ struct sort_result_chunks {
|
|||
}
|
||||
};
|
||||
|
||||
static int64_t nextHistoryQueryId = 0;
|
||||
static std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(Reference<BlobWorkerData> bwData,
|
||||
Version expectedEndVersion,
|
||||
KeyRange keyRange,
|
||||
Version readVersion) {
|
||||
namespace {
|
||||
int64_t nextHistoryQueryId = 0;
|
||||
std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(Reference<BlobWorkerData> bwData,
|
||||
Version expectedEndVersion,
|
||||
KeyRange keyRange,
|
||||
Version readVersion) {
|
||||
std::unordered_set<UID> visited;
|
||||
std::deque<Reference<GranuleHistoryEntry>> queue;
|
||||
std::vector<std::pair<KeyRange, Future<GranuleFiles>>> resultChunks;
|
||||
|
@ -2660,7 +2818,7 @@ static std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(
|
|||
|
||||
ASSERT(!resultChunks.empty());
|
||||
if (resultChunks.size() >= 2) {
|
||||
TEST(true); // Multiple history chunks for time travel query
|
||||
CODE_PROBE(true, "Multiple history chunks for time travel query");
|
||||
std::sort(resultChunks.begin(), resultChunks.end(), sort_result_chunks());
|
||||
// Assert contiguous
|
||||
for (int i = 0; i < resultChunks.size() - 1; i++) {
|
||||
|
@ -2698,7 +2856,6 @@ static std::vector<std::pair<KeyRange, Future<GranuleFiles>>> loadHistoryChunks(
|
|||
|
||||
// TODO might want to separate this out for valid values for range assignments vs read requests. Assignment
|
||||
// conflict isn't valid for read requests but is for assignments
|
||||
namespace {
|
||||
bool canReplyWith(Error e) {
|
||||
switch (e.code()) {
|
||||
case error_code_blob_granule_transaction_too_old:
|
||||
|
@ -2735,7 +2892,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
|
|||
metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion) &&
|
||||
(v <= metadata->durableSnapshotVersion.get() ||
|
||||
metadata->durableSnapshotVersion.get() == metadata->pendingSnapshotVersion)) {
|
||||
TEST(true); // Granule read not waiting
|
||||
CODE_PROBE(true, "Granule read not waiting");
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
@ -2752,7 +2909,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
|
|||
// If there are mutations that are no longer buffered but have not been
|
||||
// persisted to a delta file that are necessary for the query, wait for them
|
||||
if (pendingDeltaV > metadata->durableDeltaVersion.get() && v > metadata->durableDeltaVersion.get()) {
|
||||
TEST(true); // Granule read waiting for pending delta
|
||||
CODE_PROBE(true, "Granule read waiting for pending delta");
|
||||
wait(metadata->durableDeltaVersion.whenAtLeast(pendingDeltaV));
|
||||
ASSERT(metadata->durableDeltaVersion.get() >= pendingDeltaV);
|
||||
}
|
||||
|
@ -2760,7 +2917,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
|
|||
// This isn't strictly needed, but if we're in the process of re-snapshotting, we'd likely rather
|
||||
// return that snapshot file than the previous snapshot file and all its delta files.
|
||||
if (pendingSnapshotV > metadata->durableSnapshotVersion.get() && v > metadata->durableSnapshotVersion.get()) {
|
||||
TEST(true); // Granule read waiting for pending snapshot
|
||||
CODE_PROBE(true, "Granule read waiting for pending snapshot");
|
||||
wait(metadata->durableSnapshotVersion.whenAtLeast(pendingSnapshotV));
|
||||
ASSERT(metadata->durableSnapshotVersion.get() >= pendingSnapshotV);
|
||||
}
|
||||
|
@ -2770,7 +2927,7 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
|
|||
// file instead of in memory mutations, so we wait for that delta file to complete
|
||||
|
||||
while (v > metadata->durableDeltaVersion.get() && metadata->pendingDeltaVersion > pendingDeltaV) {
|
||||
TEST(true); // Granule mutations flushed while waiting for files to complete
|
||||
CODE_PROBE(true, "Granule mutations flushed while waiting for files to complete");
|
||||
Version waitVersion = std::min(v, metadata->pendingDeltaVersion);
|
||||
pendingDeltaV = metadata->pendingDeltaVersion;
|
||||
wait(metadata->durableDeltaVersion.whenAtLeast(waitVersion));
|
||||
|
@ -2793,6 +2950,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
}
|
||||
|
||||
state Optional<Key> tenantPrefix;
|
||||
state Arena arena;
|
||||
if (req.tenantInfo.name.present()) {
|
||||
ASSERT(req.tenantInfo.tenantId != TenantInfo::INVALID_TENANT);
|
||||
Optional<TenantMapEntry> tenantEntry = bwData->tenantData.getTenantById(req.tenantInfo.tenantId);
|
||||
|
@ -2800,7 +2958,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
ASSERT(tenantEntry.get().id == req.tenantInfo.tenantId);
|
||||
tenantPrefix = tenantEntry.get().prefix;
|
||||
} else {
|
||||
TEST(true); // Blob worker unknown tenant
|
||||
CODE_PROBE(true, "Blob worker unknown tenant");
|
||||
// FIXME - better way. Wait on retry here, or just have better model for tenant metadata?
|
||||
// Just throw wrong_shard_server and make the client retry and assume we load it later
|
||||
TraceEvent(SevDebug, "BlobWorkerRequestUnknownTenant", bwData->id)
|
||||
|
@ -2869,6 +3027,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
continue;
|
||||
}
|
||||
state Reference<GranuleMetadata> metadata = m;
|
||||
state Version granuleBeginVersion = req.beginVersion;
|
||||
|
||||
choose {
|
||||
when(wait(metadata->readable.getFuture())) {}
|
||||
|
@ -2880,10 +3039,10 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
throw wrong_shard_server();
|
||||
}
|
||||
|
||||
state std::vector<std::pair<KeyRange, GranuleFiles>> chunks;
|
||||
state std::vector<std::pair<KeyRange, GranuleFiles>> rangeGranulePair;
|
||||
|
||||
if (req.readVersion < metadata->historyVersion) {
|
||||
TEST(true); // Granule Time Travel Read
|
||||
CODE_PROBE(true, "Granule Time Travel Read");
|
||||
// this is a time travel query, find previous granule
|
||||
if (metadata->historyLoaded.canBeSet()) {
|
||||
choose {
|
||||
|
@ -2898,19 +3057,19 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
for (chunkIdx = 0; chunkIdx < finalChunks.size(); chunkIdx++) {
|
||||
choose {
|
||||
when(GranuleFiles f = wait(finalChunks[chunkIdx].second)) {
|
||||
chunks.push_back(std::pair(finalChunks[chunkIdx].first, f));
|
||||
rangeGranulePair.push_back(std::pair(finalChunks[chunkIdx].first, f));
|
||||
}
|
||||
when(wait(metadata->cancelled.getFuture())) { throw wrong_shard_server(); }
|
||||
}
|
||||
|
||||
if (chunks.back().second.snapshotFiles.empty()) {
|
||||
if (rangeGranulePair.back().second.snapshotFiles.empty()) {
|
||||
// a snapshot file must have been purged
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
|
||||
ASSERT(!chunks.back().second.deltaFiles.empty());
|
||||
ASSERT(chunks.back().second.deltaFiles.back().version > req.readVersion);
|
||||
if (chunks.back().second.snapshotFiles.front().version > req.readVersion) {
|
||||
ASSERT(!rangeGranulePair.back().second.deltaFiles.empty());
|
||||
ASSERT(rangeGranulePair.back().second.deltaFiles.back().version > req.readVersion);
|
||||
if (rangeGranulePair.back().second.snapshotFiles.front().version > req.readVersion) {
|
||||
// a snapshot file must have been purged
|
||||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
|
@ -2922,7 +3081,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
throw blob_granule_transaction_too_old();
|
||||
}
|
||||
|
||||
TEST(true); // Granule Active Read
|
||||
CODE_PROBE(true, "Granule Active Read");
|
||||
// this is an active granule query
|
||||
loop {
|
||||
if (!metadata->activeCFData.get().isValid() || !metadata->cancelled.canBeSet()) {
|
||||
|
@ -2945,13 +3104,13 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
// We can get change feed cancelled from whenAtLeast. This means the change feed may
|
||||
// retry, or may be cancelled. Wait a bit and try again to see
|
||||
if (e.code() == error_code_change_feed_popped) {
|
||||
TEST(true); // Change feed popped while read waiting
|
||||
CODE_PROBE(true, "Change feed popped while read waiting");
|
||||
throw wrong_shard_server();
|
||||
}
|
||||
if (e.code() != error_code_change_feed_cancelled) {
|
||||
throw e;
|
||||
}
|
||||
TEST(true); // Change feed switched while read waiting
|
||||
CODE_PROBE(true, "Change feed switched while read waiting");
|
||||
// wait 1ms and try again
|
||||
wait(delay(0.001));
|
||||
}
|
||||
|
@ -2962,7 +3121,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
req.readVersion);
|
||||
}
|
||||
}
|
||||
chunks.push_back(std::pair(metadata->keyRange, metadata->files));
|
||||
rangeGranulePair.push_back(std::pair(metadata->keyRange, metadata->files));
|
||||
}
|
||||
|
||||
if (!metadata->cancelled.canBeSet()) {
|
||||
|
@ -2978,31 +3137,40 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
// granule is up to date, do read
|
||||
ASSERT(metadata->cancelled.canBeSet());
|
||||
|
||||
for (auto& c : chunks) {
|
||||
for (auto& item : rangeGranulePair) {
|
||||
Version granuleBeginVersion = req.beginVersion;
|
||||
// Right now we force a collapse if the version range crosses granule boundaries, for simplicity
|
||||
if (granuleBeginVersion > 0 && granuleBeginVersion <= c.second.snapshotFiles.front().version) {
|
||||
TEST(true); // collapsed begin version request because of boundaries
|
||||
if (granuleBeginVersion > 0 && granuleBeginVersion <= item.second.snapshotFiles.front().version) {
|
||||
CODE_PROBE(true, "collapsed begin version request because of boundaries");
|
||||
didCollapse = true;
|
||||
granuleBeginVersion = 0;
|
||||
}
|
||||
BlobGranuleChunkRef chunk;
|
||||
state BlobGranuleChunkRef chunk;
|
||||
// TODO change with early reply
|
||||
chunk.includedVersion = req.readVersion;
|
||||
chunk.keyRange = KeyRangeRef(StringRef(rep.arena, c.first.begin), StringRef(rep.arena, c.first.end));
|
||||
chunk.keyRange =
|
||||
KeyRangeRef(StringRef(rep.arena, item.first.begin), StringRef(rep.arena, item.first.end));
|
||||
if (tenantPrefix.present()) {
|
||||
chunk.tenantPrefix = Optional<StringRef>(tenantPrefix.get());
|
||||
}
|
||||
|
||||
int64_t deltaBytes = 0;
|
||||
c.second.getFiles(
|
||||
item.second.getFiles(
|
||||
granuleBeginVersion, req.readVersion, req.canCollapseBegin, chunk, rep.arena, deltaBytes);
|
||||
bwData->stats.readReqDeltaBytesReturned += deltaBytes;
|
||||
if (granuleBeginVersion > 0 && chunk.snapshotFile.present()) {
|
||||
TEST(true); // collapsed begin version request for efficiency
|
||||
CODE_PROBE(true, "collapsed begin version request for efficiency");
|
||||
didCollapse = true;
|
||||
}
|
||||
|
||||
// TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known
|
||||
state Future<BlobGranuleCipherKeysCtx> cipherKeysCtx;
|
||||
if (chunk.snapshotFile.present() && chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
|
||||
ASSERT(isBlobFileEncryptionSupported());
|
||||
cipherKeysCtx =
|
||||
getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &rep.arena);
|
||||
}
|
||||
|
||||
// new deltas (if version is larger than version of last delta file)
|
||||
// FIXME: do trivial key bounds here if key range is not fully contained in request key
|
||||
// range
|
||||
|
@ -3023,11 +3191,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
rep.arena.dependsOn(metadata->currentDeltas.arena());
|
||||
MutationsAndVersionRef* mutationIt = metadata->currentDeltas.begin();
|
||||
if (granuleBeginVersion > metadata->currentDeltas.back().version) {
|
||||
TEST(true); // beginVersion pruning all in-memory mutations
|
||||
CODE_PROBE(true, "beginVersion pruning all in-memory mutations");
|
||||
mutationIt = metadata->currentDeltas.end();
|
||||
} else if (granuleBeginVersion > metadata->currentDeltas.front().version) {
|
||||
// binary search for beginVersion
|
||||
TEST(true); // beginVersion pruning some in-memory mutations
|
||||
CODE_PROBE(true, "beginVersion pruning some in-memory mutations");
|
||||
mutationIt = std::lower_bound(metadata->currentDeltas.begin(),
|
||||
metadata->currentDeltas.end(),
|
||||
MutationsAndVersionRef(granuleBeginVersion, 0),
|
||||
|
@ -3037,7 +3205,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
// add mutations to response
|
||||
while (mutationIt != metadata->currentDeltas.end()) {
|
||||
if (mutationIt->version > req.readVersion) {
|
||||
TEST(true); // readVersion pruning some in-memory mutations
|
||||
CODE_PROBE(true, "readVersion pruning some in-memory mutations");
|
||||
break;
|
||||
}
|
||||
chunk.newDeltas.push_back_deep(rep.arena, *mutationIt);
|
||||
|
@ -3045,6 +3213,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
}
|
||||
}
|
||||
|
||||
if (chunk.snapshotFile.present() && chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
|
||||
BlobGranuleCipherKeysCtx ctx = wait(cipherKeysCtx);
|
||||
chunk.cipherKeysCtx = std::move(ctx);
|
||||
}
|
||||
|
||||
rep.chunks.push_back(rep.arena, chunk);
|
||||
|
||||
bwData->stats.readReqTotalFilesReturned += chunk.deltaFiles.size() + int(chunk.snapshotFile.present());
|
||||
|
@ -3087,7 +3260,7 @@ ACTOR Future<Void> handleBlobGranuleFileRequest(Reference<BlobWorkerData> bwData
|
|||
when(wait(doBlobGranuleFileRequest(bwData, req))) {}
|
||||
when(wait(delay(SERVER_KNOBS->BLOB_WORKER_REQUEST_TIMEOUT))) {
|
||||
if (!req.reply.isSet()) {
|
||||
TEST(true); // Blob Worker request timeout hit
|
||||
CODE_PROBE(true, "Blob Worker request timeout hit");
|
||||
if (BW_DEBUG) {
|
||||
fmt::print("BW {0} request [{1} - {2}) @ {3} timed out, sending WSS\n",
|
||||
bwData->id.toString().substr(0, 5),
|
||||
|
@ -3151,7 +3324,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
|
|||
state bool hasPrevOwner = prevLockValue.present();
|
||||
state bool createChangeFeed = false;
|
||||
if (hasPrevOwner) {
|
||||
TEST(true); // Granule open found previous owner
|
||||
CODE_PROBE(true, "Granule open found previous owner");
|
||||
std::tuple<int64_t, int64_t, UID> prevOwner = decodeBlobGranuleLockValue(prevLockValue.get());
|
||||
|
||||
info.granuleID = std::get<2>(prevOwner);
|
||||
|
@ -3160,7 +3333,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
|
|||
// if it's the first snapshot of a new granule, history won't be present
|
||||
if (info.history.present()) {
|
||||
if (info.granuleID != info.history.get().value.granuleID) {
|
||||
TEST(true); // Blob Worker re-opening granule after merge+resplit
|
||||
CODE_PROBE(true, "Blob Worker re-opening granule after merge+resplit");
|
||||
// The only case this can happen is when a granule was merged into a larger granule,
|
||||
// then split back out to the same one. Validate that this is a new granule that was
|
||||
// split previously. Just check lock based on epoch, since seqno is intentionally
|
||||
|
@ -3237,7 +3410,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
|
|||
// ret.previousChangeFeedId, and the previous durable version will come from the previous
|
||||
// granules
|
||||
if (info.history.present() && info.history.get().value.parentVersions.size() > 0) {
|
||||
TEST(true); // Granule open found parent
|
||||
CODE_PROBE(true, "Granule open found parent");
|
||||
if (info.history.get().value.parentVersions.size() == 1) { // split
|
||||
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],
|
||||
info.history.get().value.parentBoundaries[1]);
|
||||
|
@ -3262,12 +3435,12 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
|
|||
}
|
||||
|
||||
if (granuleSplitState.first == BlobGranuleSplitState::Assigned) {
|
||||
TEST(true); // Granule open found granule in assign state
|
||||
CODE_PROBE(true, "Granule open found granule in assign state");
|
||||
// was already assigned, use change feed start version
|
||||
ASSERT(granuleSplitState.second > 0);
|
||||
info.changeFeedStartVersion = granuleSplitState.second;
|
||||
} else if (granuleSplitState.first == BlobGranuleSplitState::Initialized) {
|
||||
TEST(true); // Granule open found granule in initialized state
|
||||
CODE_PROBE(true, "Granule open found granule in initialized state");
|
||||
wait(updateGranuleSplitState(&tr,
|
||||
info.splitParentGranule.get().first,
|
||||
info.splitParentGranule.get().second,
|
||||
|
@ -3276,7 +3449,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
|
|||
// change feed was created as part of this transaction, changeFeedStartVersion
|
||||
// will be set later
|
||||
} else {
|
||||
TEST(true); // Granule open found granule in done state
|
||||
CODE_PROBE(true, "Granule open found granule in done state");
|
||||
// this sub-granule is done splitting, no need for split logic.
|
||||
info.splitParentGranule.reset();
|
||||
}
|
||||
|
@ -3295,7 +3468,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
|
|||
: info.blobFilesToSnapshot[0].deltaFiles.back().version;
|
||||
}
|
||||
} else if (info.doSnapshot) {
|
||||
TEST(true); // merge needs to snapshot at start
|
||||
CODE_PROBE(true, "merge needs to snapshot at start");
|
||||
state std::vector<Future<GranuleFiles>> parentGranulesToSnapshot;
|
||||
ASSERT(info.previousDurableVersion == invalidVersion);
|
||||
// need first snapshot to be at history version so this granule can serve the full range
|
||||
|
@ -3359,7 +3532,7 @@ ACTOR Future<Reference<BlobConnectionProvider>> loadBStoreForTenant(Reference<Bl
|
|||
wait(delay(0));
|
||||
return data->bstore;
|
||||
} else {
|
||||
TEST(true); // bstore for unknown tenant
|
||||
CODE_PROBE(true, "bstore for unknown tenant");
|
||||
// Assume not loaded yet, just wait a bit. Could do sophisticated mechanism but will redo tenant
|
||||
// loading to be versioned anyway. 10 retries means it's likely not a transient race with
|
||||
// loading tenants, and instead a persistent issue.
|
||||
|
@ -3389,10 +3562,11 @@ ACTOR Future<Void> start(Reference<BlobWorkerData> bwData, GranuleRangeMetadata*
|
|||
return Void();
|
||||
}
|
||||
|
||||
static GranuleRangeMetadata constructActiveBlobRange(Reference<BlobWorkerData> bwData,
|
||||
KeyRange keyRange,
|
||||
int64_t epoch,
|
||||
int64_t seqno) {
|
||||
namespace {
|
||||
GranuleRangeMetadata constructActiveBlobRange(Reference<BlobWorkerData> bwData,
|
||||
KeyRange keyRange,
|
||||
int64_t epoch,
|
||||
int64_t seqno) {
|
||||
|
||||
Reference<GranuleMetadata> newMetadata = makeReference<GranuleMetadata>();
|
||||
newMetadata->keyRange = keyRange;
|
||||
|
@ -3405,12 +3579,12 @@ static GranuleRangeMetadata constructActiveBlobRange(Reference<BlobWorkerData> b
|
|||
return GranuleRangeMetadata(epoch, seqno, newMetadata);
|
||||
}
|
||||
|
||||
static GranuleRangeMetadata constructInactiveBlobRange(int64_t epoch, int64_t seqno) {
|
||||
GranuleRangeMetadata constructInactiveBlobRange(int64_t epoch, int64_t seqno) {
|
||||
return GranuleRangeMetadata(epoch, seqno, Reference<GranuleMetadata>());
|
||||
}
|
||||
|
||||
// ignore stale assignments and make repeating the same one idempotent
|
||||
static bool newerRangeAssignment(GranuleRangeMetadata oldMetadata, int64_t epoch, int64_t seqno) {
|
||||
bool newerRangeAssignment(GranuleRangeMetadata oldMetadata, int64_t epoch, int64_t seqno) {
|
||||
return epoch > oldMetadata.lastEpoch || (epoch == oldMetadata.lastEpoch && seqno > oldMetadata.lastSeqno);
|
||||
}
|
||||
|
||||
|
@ -3435,15 +3609,15 @@ static bool newerRangeAssignment(GranuleRangeMetadata oldMetadata, int64_t epoch
|
|||
// state.
|
||||
|
||||
// Not an actor because we need to guarantee it changes the synchronously as part of the request
|
||||
static bool changeBlobRange(Reference<BlobWorkerData> bwData,
|
||||
KeyRange keyRange,
|
||||
int64_t epoch,
|
||||
int64_t seqno,
|
||||
bool active,
|
||||
bool disposeOnCleanup,
|
||||
bool selfReassign,
|
||||
std::vector<Future<Void>>& toWaitOut,
|
||||
Optional<AssignRequestType> assignType = Optional<AssignRequestType>()) {
|
||||
bool changeBlobRange(Reference<BlobWorkerData> bwData,
|
||||
KeyRange keyRange,
|
||||
int64_t epoch,
|
||||
int64_t seqno,
|
||||
bool active,
|
||||
bool disposeOnCleanup,
|
||||
bool selfReassign,
|
||||
std::vector<Future<Void>>& toWaitOut,
|
||||
Optional<AssignRequestType> assignType = Optional<AssignRequestType>()) {
|
||||
ASSERT(active == assignType.present());
|
||||
|
||||
if (BW_DEBUG) {
|
||||
|
@ -3548,7 +3722,7 @@ static bool changeBlobRange(Reference<BlobWorkerData> bwData,
|
|||
return newerRanges.size() == 0;
|
||||
}
|
||||
|
||||
static bool resumeBlobRange(Reference<BlobWorkerData> bwData, KeyRange keyRange, int64_t epoch, int64_t seqno) {
|
||||
bool resumeBlobRange(Reference<BlobWorkerData> bwData, KeyRange keyRange, int64_t epoch, int64_t seqno) {
|
||||
auto existingRange = bwData->granuleMetadata.rangeContaining(keyRange.begin);
|
||||
// if range boundaries don't match, or this (epoch, seqno) is old or the granule is inactive, ignore
|
||||
if (keyRange.begin != existingRange.begin() || keyRange.end != existingRange.end() ||
|
||||
|
@ -3585,6 +3759,7 @@ static bool resumeBlobRange(Reference<BlobWorkerData> bwData, KeyRange keyRange,
|
|||
// else we already processed this continue, do nothing
|
||||
return true;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// the contract of handleRangeAssign and handleRangeRevoke is that they change the mapping before doing any
|
||||
// waiting. This ensures GetGranuleAssignment returns an up-to-date set of ranges
|
||||
|
@ -3736,7 +3911,7 @@ ACTOR Future<Void> monitorRemoval(Reference<BlobWorkerData> bwData) {
|
|||
|
||||
Optional<Value> val = wait(tr.get(blobWorkerListKey));
|
||||
if (!val.present()) {
|
||||
TEST(true); // Blob worker found out BM killed it from reading DB
|
||||
CODE_PROBE(true, "Blob worker found out BM killed it from reading DB");
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
@ -3823,8 +3998,8 @@ ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
|
|||
}
|
||||
}
|
||||
|
||||
static void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self,
|
||||
const GetGranuleAssignmentsRequest& req) {
|
||||
namespace {
|
||||
void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self, const GetGranuleAssignmentsRequest& req) {
|
||||
GetGranuleAssignmentsReply reply;
|
||||
auto allRanges = self->granuleMetadata.intersectingRanges(normalKeys);
|
||||
for (auto& it : allRanges) {
|
||||
|
@ -3845,6 +4020,7 @@ static void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self,
|
|||
}
|
||||
req.reply.send(reply);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ACTOR Future<Void> handleFlushGranuleReq(Reference<BlobWorkerData> self, FlushGranuleRequest req) {
|
||||
++self->stats.flushGranuleReqs;
|
||||
|
|
|
@ -23,6 +23,13 @@ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/workloads)
|
|||
|
||||
add_flow_target(EXECUTABLE NAME fdbserver SRCS ${FDBSERVER_SRCS})
|
||||
|
||||
find_package(ZLIB)
|
||||
if(ZLIB_FOUND)
|
||||
add_compile_definitions(ZLIB_LIB_SUPPORTED)
|
||||
else()
|
||||
message(STATUS "ZLIB package not found")
|
||||
endif()
|
||||
|
||||
target_include_directories(fdbserver PRIVATE
|
||||
${CMAKE_SOURCE_DIR}/bindings/c
|
||||
${CMAKE_BINARY_DIR}/bindings/c
|
||||
|
|
|
@ -316,7 +316,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
|
|||
|
||||
wait(spinDelay);
|
||||
|
||||
TEST(true); // clusterWatchDatabase() master failed
|
||||
CODE_PROBE(true, "clusterWatchDatabase() master failed");
|
||||
TraceEvent(SevWarn, "DetectedFailedRecovery", cluster->id).detail("OldMaster", iMaster.id());
|
||||
} catch (Error& e) {
|
||||
state Error err = e;
|
||||
|
@ -328,13 +328,14 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
|
|||
wait(cleanupRecoveryActorCollection(recoveryData, true /* exThrown */));
|
||||
ASSERT(addActor.isEmpty());
|
||||
|
||||
TEST(err.code() == error_code_tlog_failed); // Terminated due to tLog failure
|
||||
TEST(err.code() == error_code_commit_proxy_failed); // Terminated due to commit proxy failure
|
||||
TEST(err.code() == error_code_grv_proxy_failed); // Terminated due to GRV proxy failure
|
||||
TEST(err.code() == error_code_resolver_failed); // Terminated due to resolver failure
|
||||
TEST(err.code() == error_code_backup_worker_failed); // Terminated due to backup worker failure
|
||||
TEST(err.code() == error_code_operation_failed); // Terminated due to failed operation
|
||||
TEST(err.code() == error_code_restart_cluster_controller); // Terminated due to cluster-controller restart.
|
||||
CODE_PROBE(err.code() == error_code_tlog_failed, "Terminated due to tLog failure");
|
||||
CODE_PROBE(err.code() == error_code_commit_proxy_failed, "Terminated due to commit proxy failure");
|
||||
CODE_PROBE(err.code() == error_code_grv_proxy_failed, "Terminated due to GRV proxy failure");
|
||||
CODE_PROBE(err.code() == error_code_resolver_failed, "Terminated due to resolver failure");
|
||||
CODE_PROBE(err.code() == error_code_backup_worker_failed, "Terminated due to backup worker failure");
|
||||
CODE_PROBE(err.code() == error_code_operation_failed, "Terminated due to failed operation");
|
||||
CODE_PROBE(err.code() == error_code_restart_cluster_controller,
|
||||
"Terminated due to cluster-controller restart.");
|
||||
|
||||
if (cluster->shouldCommitSuicide || err.code() == error_code_coordinators_changed) {
|
||||
TraceEvent("ClusterControllerTerminate", cluster->id).errorUnsuppressed(err);
|
||||
|
@ -622,7 +623,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
|
|||
}
|
||||
|
||||
WorkerDetails newEKPWorker;
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
newEKPWorker = findNewProcessForSingleton(self, ProcessClass::EncryptKeyProxy, id_used);
|
||||
}
|
||||
|
||||
|
@ -636,7 +637,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
|
|||
}
|
||||
|
||||
ProcessClass::Fitness bestFitnessForEKP;
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
bestFitnessForEKP = findBestFitnessForSingleton(self, newEKPWorker, ProcessClass::EncryptKeyProxy);
|
||||
}
|
||||
|
||||
|
@ -661,7 +662,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
|
|||
}
|
||||
|
||||
bool ekpHealthy = true;
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
ekpHealthy = isHealthySingleton<EncryptKeyProxyInterface>(
|
||||
self, newEKPWorker, ekpSingleton, bestFitnessForEKP, self->recruitingEncryptKeyProxyID);
|
||||
}
|
||||
|
@ -685,7 +686,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
|
|||
}
|
||||
|
||||
Optional<Standalone<StringRef>> currEKPProcessId, newEKPProcessId;
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
currEKPProcessId = ekpSingleton.interface.get().locality.processId();
|
||||
newEKPProcessId = newEKPWorker.interf.locality.processId();
|
||||
}
|
||||
|
@ -697,7 +698,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
|
|||
newPids.emplace_back(newBMProcessId);
|
||||
}
|
||||
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
currPids.emplace_back(currEKPProcessId);
|
||||
newPids.emplace_back(newEKPProcessId);
|
||||
}
|
||||
|
@ -712,7 +713,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
|
|||
}
|
||||
|
||||
// if the knob is disabled, the EKP coloc counts should have no affect on the coloc counts check below
|
||||
if (!SERVER_KNOBS->ENABLE_ENCRYPTION && !g_network->isSimulated()) {
|
||||
if (!SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
ASSERT(currColocMap[currEKPProcessId] == 0);
|
||||
ASSERT(newColocMap[newEKPProcessId] == 0);
|
||||
}
|
||||
|
@ -1244,7 +1245,7 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
|
|||
}
|
||||
checkOutstandingRequests(self);
|
||||
} else {
|
||||
TEST(true); // Received an old worker registration request.
|
||||
CODE_PROBE(true, "Received an old worker registration request.");
|
||||
}
|
||||
|
||||
// For each singleton
|
||||
|
@ -1271,7 +1272,7 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
|
|||
self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID);
|
||||
}
|
||||
|
||||
if ((SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) && req.encryptKeyProxyInterf.present()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION && req.encryptKeyProxyInterf.present()) {
|
||||
auto currSingleton = EncryptKeyProxySingleton(self->db.serverInfo->get().encryptKeyProxy);
|
||||
auto registeringSingleton = EncryptKeyProxySingleton(req.encryptKeyProxyInterf);
|
||||
haltRegisteringOrCurrentSingleton<EncryptKeyProxyInterface>(
|
||||
|
@ -2525,7 +2526,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
|
|||
state Future<ErrorOr<Void>> error = errorOr(actorCollection(self.addActor.getFuture()));
|
||||
|
||||
// EncryptKeyProxy is necessary for TLog recovery, recruit it as the first process
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
|
||||
self.addActor.send(monitorEncryptKeyProxy(&self));
|
||||
}
|
||||
self.addActor.send(clusterWatchDatabase(
|
||||
|
|
|
@ -58,7 +58,7 @@ ACTOR Future<Void> recoveryTerminateOnConflict(UID dbgid,
|
|||
when(wait(onConflict)) {
|
||||
if (!fullyRecovered.isSet()) {
|
||||
TraceEvent("RecoveryTerminated", dbgid).detail("Reason", "Conflict");
|
||||
TEST(true); // Coordinated state conflict, recovery terminating
|
||||
CODE_PROBE(true, "Coordinated state conflict, recovery terminating");
|
||||
throw worker_removed();
|
||||
}
|
||||
return Void();
|
||||
|
@ -110,7 +110,7 @@ ACTOR Future<Void> recruitNewMaster(ClusterControllerData* cluster,
|
|||
|
||||
return Void();
|
||||
} else {
|
||||
TEST(true); // clusterWatchDatabase() !newMaster.present()
|
||||
CODE_PROBE(true, "clusterWatchDatabase() !newMaster.present()");
|
||||
wait(delay(SERVER_KNOBS->MASTER_SPIN_DELAY));
|
||||
}
|
||||
}
|
||||
|
@ -118,7 +118,7 @@ ACTOR Future<Void> recruitNewMaster(ClusterControllerData* cluster,
|
|||
|
||||
ACTOR Future<Void> clusterRecruitFromConfiguration(ClusterControllerData* self, Reference<RecruitWorkersInfo> req) {
|
||||
// At the moment this doesn't really need to be an actor (it always completes immediately)
|
||||
TEST(true); // ClusterController RecruitTLogsRequest
|
||||
CODE_PROBE(true, "ClusterController RecruitTLogsRequest");
|
||||
loop {
|
||||
try {
|
||||
req->rep = self->findWorkersForConfiguration(req->req);
|
||||
|
@ -150,7 +150,7 @@ ACTOR Future<RecruitRemoteFromConfigurationReply> clusterRecruitRemoteFromConfig
|
|||
ClusterControllerData* self,
|
||||
Reference<RecruitRemoteWorkersInfo> req) {
|
||||
// At the moment this doesn't really need to be an actor (it always completes immediately)
|
||||
TEST(true); // ClusterController RecruitTLogsRequest Remote
|
||||
CODE_PROBE(true, "ClusterController RecruitTLogsRequest Remote");
|
||||
loop {
|
||||
try {
|
||||
auto rep = self->findRemoteWorkersForConfiguration(req->req);
|
||||
|
@ -355,7 +355,7 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
|
|||
!newServer.isError(error_code_request_maybe_delivered))
|
||||
throw newServer.getError();
|
||||
|
||||
TEST(true); // initial storage recuitment loop failed to get new server
|
||||
CODE_PROBE(true, "initial storage recuitment loop failed to get new server");
|
||||
wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY));
|
||||
} else {
|
||||
if (!dcId_tags.count(recruits.storageServers[idx].locality.dcId())) {
|
||||
|
@ -736,7 +736,7 @@ ACTOR Future<Void> updateLogsValue(Reference<ClusterRecoveryData> self, Database
|
|||
}
|
||||
|
||||
if (!found) {
|
||||
TEST(true); // old master attempted to change logsKey
|
||||
CODE_PROBE(true, "old master attempted to change logsKey");
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
@ -815,7 +815,7 @@ ACTOR Future<Void> updateRegistration(Reference<ClusterRecoveryData> self, Refer
|
|||
std::vector<UID>()));
|
||||
} else {
|
||||
// The cluster should enter the accepting commits phase soon, and then we will register again
|
||||
TEST(true); // cstate is updated but we aren't accepting commits yet
|
||||
CODE_PROBE(true, "cstate is updated but we aren't accepting commits yet");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1357,7 +1357,7 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
|
|||
}
|
||||
when(Standalone<CommitTransactionRef> _req = wait(provisional)) {
|
||||
state Standalone<CommitTransactionRef> req = _req; // mutable
|
||||
TEST(true); // Emergency transaction processing during recovery
|
||||
CODE_PROBE(true, "Emergency transaction processing during recovery");
|
||||
TraceEvent("EmergencyTransaction", self->dbgid).log();
|
||||
for (auto m = req.mutations.begin(); m != req.mutations.end(); ++m)
|
||||
TraceEvent("EmergencyTransactionMutation", self->dbgid)
|
||||
|
@ -1559,7 +1559,7 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
|
|||
.detail("SnapRecoveryFlag", snapRecoveryFlag.present() ? snapRecoveryFlag.get().toString() : "N/A")
|
||||
.detail("LastEpochEnd", self->lastEpochEnd);
|
||||
if (snapRecoveryFlag.present()) {
|
||||
TEST(true); // Recovering from snapshot, writing to snapShotEndVersionKey
|
||||
CODE_PROBE(true, "Recovering from snapshot, writing to snapShotEndVersionKey");
|
||||
BinaryWriter bw(Unversioned());
|
||||
tr.set(recoveryCommitRequest.arena, snapshotEndVersionKey, (bw << self->lastEpochEnd).toValue());
|
||||
// Pause the backups that got restored in this snapshot to avoid data corruption
|
||||
|
@ -1659,7 +1659,7 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
|
|||
// unless we want to change TLogs
|
||||
wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)));
|
||||
if (recoveryCommit.isReady() && recoveryCommit.get().isError()) {
|
||||
TEST(true); // Cluster recovery failed because of the initial commit failed
|
||||
CODE_PROBE(true, "Cluster recovery failed because of the initial commit failed");
|
||||
throw cluster_recovery_failed();
|
||||
}
|
||||
|
||||
|
|
|
@ -789,7 +789,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
|
|||
}
|
||||
|
||||
// Pre-resolution the commits
|
||||
TEST(pProxyCommitData->latestLocalCommitBatchResolving.get() < localBatchNumber - 1); // Wait for local batch
|
||||
CODE_PROBE(pProxyCommitData->latestLocalCommitBatchResolving.get() < localBatchNumber - 1, "Wait for local batch");
|
||||
wait(pProxyCommitData->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber - 1));
|
||||
pProxyCommitData->stats.computeLatency.addMeasurement(now() - timeStart);
|
||||
double queuingDelay = g_network->now() - timeStart;
|
||||
|
@ -798,7 +798,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
|
|||
(g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01))) &&
|
||||
SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) {
|
||||
// Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries.
|
||||
TEST(true); // Reject transactions in the batch
|
||||
CODE_PROBE(true, "Reject transactions in the batch");
|
||||
TraceEvent(SevWarnAlways, "ProxyReject", pProxyCommitData->dbgid)
|
||||
.suppressFor(0.1)
|
||||
.detail("QDelay", queuingDelay)
|
||||
|
@ -1152,7 +1152,7 @@ void writeMutation(CommitBatchContext* self, int64_t tenantId, const MutationRef
|
|||
bool isRawAccess = tenantId == TenantInfo::INVALID_TENANT && !isSystemKey(mutation.param1) &&
|
||||
!(mutation.type == MutationRef::ClearRange && isSystemKey(mutation.param2)) &&
|
||||
self->pProxyCommitData->db->get().client.tenantMode == TenantMode::REQUIRED;
|
||||
TEST(isRawAccess); // Raw access to tenant key space
|
||||
CODE_PROBE(isRawAccess, "Raw access to tenant key space");
|
||||
self->toCommit.writeTypedMessage(mutation);
|
||||
} else {
|
||||
Arena arena;
|
||||
|
@ -1259,7 +1259,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
|
|||
trCost->get().clearIdxCosts.pop_front();
|
||||
}
|
||||
} else {
|
||||
TEST(true); // A clear range extends past a shard boundary
|
||||
CODE_PROBE(true, "A clear range extends past a shard boundary");
|
||||
std::set<Tag> allSources;
|
||||
for (auto r : ranges) {
|
||||
r.value().populateTags();
|
||||
|
@ -1347,7 +1347,7 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
|
|||
state Span span("MP:postResolution"_loc, self->span.context);
|
||||
|
||||
bool queuedCommits = pProxyCommitData->latestLocalCommitBatchLogging.get() < localBatchNumber - 1;
|
||||
TEST(queuedCommits); // Queuing post-resolution commit processing
|
||||
CODE_PROBE(queuedCommits, "Queuing post-resolution commit processing");
|
||||
wait(pProxyCommitData->latestLocalCommitBatchLogging.whenAtLeast(localBatchNumber - 1));
|
||||
state double postResolutionQueuing = now();
|
||||
pProxyCommitData->stats.postResolutionDist->sampleSeconds(postResolutionQueuing - postResolutionStart);
|
||||
|
@ -1424,7 +1424,7 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
|
|||
self->commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
|
||||
// This should be *extremely* rare in the real world, but knob buggification should make it happen in
|
||||
// simulation
|
||||
TEST(true); // Semi-committed pipeline limited by MVCC window
|
||||
CODE_PROBE(true, "Semi-committed pipeline limited by MVCC window");
|
||||
//TraceEvent("ProxyWaitingForCommitted", pProxyCommitData->dbgid).detail("CommittedVersion", pProxyCommitData->committedVersion.get()).detail("NeedToCommit", commitVersion);
|
||||
waitVersionSpan = Span("MP:overMaxReadTransactionLifeVersions"_loc, span.context);
|
||||
choose {
|
||||
|
@ -1617,7 +1617,8 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
|
|||
// client may get a commit version that the master is not aware of, and next GRV request may get a version less than
|
||||
// self->committedVersion.
|
||||
|
||||
TEST(pProxyCommitData->committedVersion.get() > self->commitVersion); // later version was reported committed first
|
||||
CODE_PROBE(pProxyCommitData->committedVersion.get() > self->commitVersion,
|
||||
"later version was reported committed first");
|
||||
|
||||
if (self->commitVersion >= pProxyCommitData->committedVersion.get()) {
|
||||
state Optional<std::set<Tag>> writtenTags;
|
||||
|
@ -2603,7 +2604,7 @@ ACTOR Future<Void> commitProxyServer(CommitProxyInterface proxy,
|
|||
e.code() != error_code_failed_to_progress) {
|
||||
throw;
|
||||
}
|
||||
TEST(e.code() == error_code_failed_to_progress); // Commit proxy failed to progress
|
||||
CODE_PROBE(e.code() == error_code_failed_to_progress, "Commit proxy failed to progress");
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
|
|
@ -206,7 +206,7 @@ class ConfigNodeImpl {
|
|||
// Handle a very rare case where a ConfigNode loses data between
|
||||
// responding with a committed version and responding to the
|
||||
// subsequent get changes request.
|
||||
TEST(true); // ConfigNode data loss occurred on a minority of coordinators
|
||||
CODE_PROBE(true, "ConfigNode data loss occurred on a minority of coordinators");
|
||||
req.reply.sendError(process_behind()); // Reuse the process_behind error
|
||||
return Void();
|
||||
}
|
||||
|
@ -230,7 +230,8 @@ class ConfigNodeImpl {
|
|||
state ConfigGeneration generation = wait(getGeneration(self));
|
||||
++generation.liveVersion;
|
||||
if (req.lastSeenLiveVersion.present()) {
|
||||
TEST(req.lastSeenLiveVersion.get() >= generation.liveVersion); // Node is lagging behind some other node
|
||||
CODE_PROBE(req.lastSeenLiveVersion.get() >= generation.liveVersion,
|
||||
"Node is lagging behind some other node");
|
||||
generation.liveVersion = std::max(generation.liveVersion, req.lastSeenLiveVersion.get() + 1);
|
||||
}
|
||||
self->kvStore->set(KeyValueRef(currentGenerationKey, BinaryWriter::toValue(generation, IncludeVersion())));
|
||||
|
|
|
@ -79,13 +79,13 @@ struct CoordinatedStateImpl {
|
|||
|
||||
CoordinatedStateImpl(ServerCoordinators const& c)
|
||||
: coordinators(c), stage(0), conflictGen(0), doomed(false), ac(false), initial(false) {}
|
||||
uint64_t getConflict() { return conflictGen; }
|
||||
uint64_t getConflict() const { return conflictGen; }
|
||||
|
||||
bool isDoomed(GenerationRegReadReply const& rep) {
|
||||
return rep.gen > gen // setExclusive is doomed, because there was a write at least started at a higher
|
||||
// generation, which means a read completed at that higher generation
|
||||
// || rep.rgen > gen // setExclusive isn't absolutely doomed, but it may/probably will fail
|
||||
;
|
||||
bool isDoomed(GenerationRegReadReply const& rep) const {
|
||||
return rep.gen > gen;
|
||||
// setExclusive is doomed, because there was a write at least started at a higher
|
||||
// generation, which means a read completed at that higher generation
|
||||
// || rep.rgen > gen // setExclusive isn't absolutely doomed, but it may/probably will fail
|
||||
}
|
||||
|
||||
ACTOR static Future<Value> read(CoordinatedStateImpl* self) {
|
||||
|
@ -216,7 +216,7 @@ struct CoordinatedStateImpl {
|
|||
};
|
||||
|
||||
CoordinatedState::CoordinatedState(ServerCoordinators const& coord)
|
||||
: impl(std::make_unique<CoordinatedStateImpl>(coord)) {}
|
||||
: impl(PImpl<CoordinatedStateImpl>::create(coord)) {}
|
||||
CoordinatedState::~CoordinatedState() = default;
|
||||
Future<Value> CoordinatedState::read() {
|
||||
return CoordinatedStateImpl::read(impl.get());
|
||||
|
@ -227,7 +227,7 @@ Future<Void> CoordinatedState::onConflict() {
|
|||
Future<Void> CoordinatedState::setExclusive(Value v) {
|
||||
return CoordinatedStateImpl::setExclusive(impl.get(), v);
|
||||
}
|
||||
uint64_t CoordinatedState::getConflict() {
|
||||
uint64_t CoordinatedState::getConflict() const {
|
||||
return impl->getConflict();
|
||||
}
|
||||
|
||||
|
@ -273,7 +273,7 @@ struct MovableCoordinatedStateImpl {
|
|||
// SOMEDAY: If moveState.mode == MovingFrom, read (without locking) old state and assert that it corresponds
|
||||
// with our state and is ReallyTo(coordinators)
|
||||
if (moveState.mode == MovableValue::MaybeTo) {
|
||||
TEST(true); // Maybe moveto state
|
||||
CODE_PROBE(true, "Maybe moveto state");
|
||||
ASSERT(moveState.other.present());
|
||||
wait(self->moveTo(
|
||||
self, &self->cs, ClusterConnectionString(moveState.other.get().toString()), moveState.value));
|
||||
|
@ -322,7 +322,7 @@ struct MovableCoordinatedStateImpl {
|
|||
|
||||
Value oldQuorumState = wait(cs.read());
|
||||
if (oldQuorumState != self->lastCSValue.get()) {
|
||||
TEST(true); // Quorum change aborted by concurrent write to old coordination state
|
||||
CODE_PROBE(true, "Quorum change aborted by concurrent write to old coordination state");
|
||||
TraceEvent("QuorumChangeAbortedByConcurrency").log();
|
||||
throw coordinated_state_conflict();
|
||||
}
|
||||
|
@ -354,7 +354,7 @@ struct MovableCoordinatedStateImpl {
|
|||
|
||||
MovableCoordinatedState& MovableCoordinatedState::operator=(MovableCoordinatedState&&) = default;
|
||||
MovableCoordinatedState::MovableCoordinatedState(class ServerCoordinators const& coord)
|
||||
: impl(std::make_unique<MovableCoordinatedStateImpl>(coord)) {}
|
||||
: impl(PImpl<MovableCoordinatedStateImpl>::create(coord)) {}
|
||||
MovableCoordinatedState::~MovableCoordinatedState() = default;
|
||||
Future<Value> MovableCoordinatedState::read() {
|
||||
return MovableCoordinatedStateImpl::read(impl.get());
|
||||
|
|
|
@ -942,8 +942,9 @@ public:
|
|||
: SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY);
|
||||
}
|
||||
} else {
|
||||
TEST(true); // A removed server is still associated with a team in
|
||||
// ShardsAffectedByTeamFailure
|
||||
CODE_PROBE(true,
|
||||
"A removed server is still associated with a team in "
|
||||
"ShardsAffectedByTeamFailure");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1253,7 +1254,7 @@ public:
|
|||
|
||||
server->updateLastKnown(newInterface.first, newInterface.second);
|
||||
if (localityChanged && !isTss) {
|
||||
TEST(true); // Server locality changed
|
||||
CODE_PROBE(true, "Server locality changed");
|
||||
|
||||
// The locality change of a server will affect machine teams related to the server if
|
||||
// the server's machine locality is changed
|
||||
|
@ -1320,7 +1321,7 @@ public:
|
|||
}
|
||||
}
|
||||
if (addedNewBadTeam && self->badTeamRemover.isReady()) {
|
||||
TEST(true); // Server locality change created bad teams
|
||||
CODE_PROBE(true, "Server locality change created bad teams");
|
||||
self->doBuildTeams = true;
|
||||
self->badTeamRemover = removeBadTeams(self);
|
||||
self->addActor.send(self->badTeamRemover);
|
||||
|
@ -1724,7 +1725,7 @@ public:
|
|||
// in the serverTeams vector in the machine team.
|
||||
--teamIndex;
|
||||
self->addTeam(team->getServers(), IsInitialTeam::True, IsRedundantTeam::True);
|
||||
TEST(true); // Removed machine team
|
||||
CODE_PROBE(true, "Removed machine team");
|
||||
}
|
||||
|
||||
self->doBuildTeams = true;
|
||||
|
@ -1808,7 +1809,7 @@ public:
|
|||
bool foundTeam = self->removeTeam(st);
|
||||
ASSERT(foundTeam);
|
||||
self->addTeam(st->getServers(), IsInitialTeam::True, IsRedundantTeam::True);
|
||||
TEST(true); // Marked team as a bad team
|
||||
CODE_PROBE(true, "Marked team as a bad team");
|
||||
|
||||
self->doBuildTeams = true;
|
||||
|
||||
|
@ -2052,7 +2053,7 @@ public:
|
|||
if (self->wigglingId.present()) {
|
||||
state UID id = self->wigglingId.get();
|
||||
if (self->pauseWiggle->get()) {
|
||||
TEST(true); // paused because cluster is unhealthy
|
||||
CODE_PROBE(true, "paused because cluster is unhealthy");
|
||||
moveFinishFuture = Never();
|
||||
self->includeStorageServersForWiggle();
|
||||
self->storageWiggler->setWiggleState(StorageWiggler::PAUSE);
|
||||
|
@ -2068,7 +2069,7 @@ public:
|
|||
} else {
|
||||
choose {
|
||||
when(wait(self->waitUntilHealthy())) {
|
||||
TEST(true); // start wiggling
|
||||
CODE_PROBE(true, "start wiggling");
|
||||
wait(self->storageWiggler->startWiggle());
|
||||
auto fv = self->excludeStorageServersForWiggle(id);
|
||||
moveFinishFuture = fv;
|
||||
|
@ -2431,10 +2432,10 @@ public:
|
|||
// SS and/or TSS recruitment failed at this point, update tssState
|
||||
if (recruitTss && tssState->tssRecruitFailed()) {
|
||||
tssState->markComplete();
|
||||
TEST(true); // TSS recruitment failed for some reason
|
||||
CODE_PROBE(true, "TSS recruitment failed for some reason");
|
||||
}
|
||||
if (!recruitTss && tssState->ssRecruitFailed()) {
|
||||
TEST(true); // SS with pair TSS recruitment failed for some reason
|
||||
CODE_PROBE(true, "SS with pair TSS recruitment failed for some reason");
|
||||
}
|
||||
|
||||
self->recruitingStream.set(self->recruitingStream.get() - 1);
|
||||
|
@ -2575,7 +2576,7 @@ public:
|
|||
.detail("Addr", candidateSSAddr.toString())
|
||||
.detail("Locality", candidateWorker.worker.locality.toString());
|
||||
|
||||
TEST(true); // Starting TSS recruitment
|
||||
CODE_PROBE(true, "Starting TSS recruitment");
|
||||
self->isTssRecruiting = true;
|
||||
tssState = makeReference<TSSPairState>(candidateWorker.worker.locality);
|
||||
|
||||
|
@ -2585,7 +2586,7 @@ public:
|
|||
checkTss = self->initialFailureReactionDelay;
|
||||
} else {
|
||||
if (tssState->active && tssState->inDataZone(candidateWorker.worker.locality)) {
|
||||
TEST(true); // TSS recruits pair in same dc/datahall
|
||||
CODE_PROBE(true, "TSS recruits pair in same dc/datahall");
|
||||
self->isTssRecruiting = false;
|
||||
TraceEvent("TSS_Recruit", self->distributorId)
|
||||
.detail("Stage", "PairSS")
|
||||
|
@ -2596,8 +2597,9 @@ public:
|
|||
// successfully started recruitment of pair, reset tss recruitment state
|
||||
tssState = makeReference<TSSPairState>();
|
||||
} else {
|
||||
TEST(tssState->active); // TSS recruitment skipped potential pair because it's in a
|
||||
// different dc/datahall
|
||||
CODE_PROBE(
|
||||
tssState->active,
|
||||
"TSS recruitment skipped potential pair because it's in a different dc/datahall");
|
||||
self->addActor.send(initializeStorage(
|
||||
self, candidateWorker, ddEnabledState, false, makeReference<TSSPairState>()));
|
||||
}
|
||||
|
@ -2617,8 +2619,9 @@ public:
|
|||
int tssToKill = std::min((int)self->tss_info_by_pair.size(),
|
||||
std::max(-tssToRecruit, self->zeroHealthyTeams->get() ? 1 : 0));
|
||||
if (cancelTss) {
|
||||
TEST(tssToRecruit < 0); // tss recruitment cancelled due to too many TSS
|
||||
TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams
|
||||
CODE_PROBE(tssToRecruit < 0, "tss recruitment cancelled due to too many TSS");
|
||||
CODE_PROBE(self->zeroHealthyTeams->get(),
|
||||
"tss recruitment cancelled due zero healthy teams");
|
||||
|
||||
TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId)
|
||||
.detail("Reason", tssToRecruit <= 0 ? "TooMany" : "ZeroHealthyTeams");
|
||||
|
@ -2637,8 +2640,8 @@ public:
|
|||
if (self->shouldHandleServer(tssi) && self->server_and_tss_info.count(tssId)) {
|
||||
Promise<Void> killPromise = itr->second->killTss;
|
||||
if (killPromise.canBeSet()) {
|
||||
TEST(tssToRecruit < 0); // Killing TSS due to too many TSS
|
||||
TEST(self->zeroHealthyTeams->get()); // Killing TSS due zero healthy teams
|
||||
CODE_PROBE(tssToRecruit < 0, "Killing TSS due to too many TSS");
|
||||
CODE_PROBE(self->zeroHealthyTeams->get(), "Killing TSS due zero healthy teams");
|
||||
TraceEvent(SevWarn, "TSS_DDKill", self->distributorId)
|
||||
.detail("TSSID", tssId)
|
||||
.detail("Reason",
|
||||
|
@ -2672,7 +2675,7 @@ public:
|
|||
if (e.code() != error_code_timed_out) {
|
||||
throw;
|
||||
}
|
||||
TEST(true); // Storage recruitment timed out
|
||||
CODE_PROBE(true, "Storage recruitment timed out");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2992,14 +2995,14 @@ public:
|
|||
|
||||
loop choose {
|
||||
when(UID removedServer = waitNext(self->removedServers.getFuture())) {
|
||||
TEST(true); // Storage server removed from database
|
||||
CODE_PROBE(true, "Storage server removed from database");
|
||||
self->removeServer(removedServer);
|
||||
serverRemoved.send(Void());
|
||||
|
||||
self->restartRecruiting.trigger();
|
||||
}
|
||||
when(UID removedTSS = waitNext(self->removedTSS.getFuture())) {
|
||||
TEST(true); // TSS removed from database
|
||||
CODE_PROBE(true, "TSS removed from database");
|
||||
self->removeTSS(removedTSS);
|
||||
serverRemoved.send(Void());
|
||||
|
||||
|
@ -4808,7 +4811,7 @@ Reference<TCMachineInfo> DDTeamCollection::checkAndCreateMachine(Reference<TCSer
|
|||
Reference<TCMachineInfo> machineInfo;
|
||||
if (machine_info.find(machine_id) == machine_info.end()) {
|
||||
// uid is the first storage server process on the machine
|
||||
TEST(true); // First storage server in process on the machine
|
||||
CODE_PROBE(true, "First storage server in process on the machine");
|
||||
// For each machine, store the first server's localityEntry into machineInfo for later use.
|
||||
LocalityEntry localityEntry = machineLocalityMap.add(locality, &server->getId());
|
||||
machineInfo = makeReference<TCMachineInfo>(server, localityEntry);
|
||||
|
|
|
@ -250,7 +250,7 @@ class DDTxnProcessorImpl {
|
|||
// If keyServers is too large to read in a single transaction, then we will have to break this process up into
|
||||
// multiple transactions. In that case, each iteration should begin where the previous left off
|
||||
while (beginKey < allKeys.end) {
|
||||
TEST(beginKey > allKeys.begin); // Multi-transactional getInitialDataDistribution
|
||||
CODE_PROBE(beginKey > allKeys.begin, "Multi-transactional getInitialDataDistribution");
|
||||
loop {
|
||||
succeeded = false;
|
||||
try {
|
||||
|
@ -430,4 +430,4 @@ Future<Reference<InitialDataDistribution>> DDTxnProcessor::getInitialDataDistrib
|
|||
|
||||
Future<Void> DDTxnProcessor::waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const {
|
||||
return DDTxnProcessorImpl::waitForDataDistributionEnabled(cx, ddEnabledState);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -310,6 +310,8 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
|
|||
// Optional components that can be set after ::init(). They're optional when test, but required for DD being
|
||||
// fully-functional.
|
||||
DDTeamCollection* teamCollection;
|
||||
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
|
||||
PromiseStream<RelocateShard> relocationProducer, relocationConsumer; // comsumer is a yield stream from producer
|
||||
|
||||
DataDistributor(Reference<AsyncVar<ServerDBInfo> const> const& db, UID id)
|
||||
: dbInfo(db), ddId(id), txnProcessor(nullptr), initialDDEventHolder(makeReference<EventCacheHolder>("InitialDD")),
|
||||
|
@ -433,6 +435,88 @@ struct DataDistributor : NonCopyable, ReferenceCounted<DataDistributor> {
|
|||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
// Resume inflight relocations from the previous DD
|
||||
// TODO: add a test to verify the inflight relocation correctness and measure the memory usage with 4 million shards
|
||||
ACTOR static Future<Void> resumeRelocations(Reference<DataDistributor> self) {
|
||||
ASSERT(self->shardsAffectedByTeamFailure); // has to be allocated
|
||||
|
||||
state int shard = 0;
|
||||
for (; shard < self->initData->shards.size() - 1; shard++) {
|
||||
const DDShardInfo& iShard = self->initData->shards[shard];
|
||||
KeyRangeRef keys = KeyRangeRef(iShard.key, self->initData->shards[shard + 1].key);
|
||||
|
||||
self->shardsAffectedByTeamFailure->defineShard(keys);
|
||||
std::vector<ShardsAffectedByTeamFailure::Team> teams;
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.primarySrc, true));
|
||||
if (self->configuration.usableRegions > 1) {
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.remoteSrc, false));
|
||||
}
|
||||
if (g_network->isSimulated()) {
|
||||
TraceEvent("DDInitShard")
|
||||
.detail("Keys", keys)
|
||||
.detail("PrimarySrc", describe(iShard.primarySrc))
|
||||
.detail("RemoteSrc", describe(iShard.remoteSrc))
|
||||
.detail("PrimaryDest", describe(iShard.primaryDest))
|
||||
.detail("RemoteDest", describe(iShard.remoteDest))
|
||||
.detail("SrcID", iShard.srcId)
|
||||
.detail("DestID", iShard.destId);
|
||||
}
|
||||
|
||||
self->shardsAffectedByTeamFailure->moveShard(keys, teams);
|
||||
if (iShard.hasDest && iShard.destId == anonymousShardId) {
|
||||
// This shard is already in flight. Ideally we should use dest in ShardsAffectedByTeamFailure and
|
||||
// generate a dataDistributionRelocator directly in DataDistributionQueue to track it, but it's
|
||||
// easier to just (with low priority) schedule it for movement.
|
||||
bool unhealthy = iShard.primarySrc.size() != self->configuration.storageTeamSize;
|
||||
if (!unhealthy && self->configuration.usableRegions > 1) {
|
||||
unhealthy = iShard.remoteSrc.size() != self->configuration.storageTeamSize;
|
||||
}
|
||||
self->relocationProducer.send(RelocateShard(keys,
|
||||
unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY
|
||||
: SERVER_KNOBS->PRIORITY_RECOVER_MOVE,
|
||||
RelocateReason::OTHER));
|
||||
}
|
||||
|
||||
wait(yield(TaskPriority::DataDistribution));
|
||||
}
|
||||
|
||||
state KeyRangeMap<std::shared_ptr<DataMove>>::iterator it = self->initData->dataMoveMap.ranges().begin();
|
||||
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
|
||||
const DataMoveMetaData& meta = it.value()->meta;
|
||||
if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
|
||||
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
|
||||
rs.dataMoveId = meta.id;
|
||||
rs.cancelled = true;
|
||||
self->relocationProducer.send(rs);
|
||||
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
|
||||
} else if (it.value()->valid) {
|
||||
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
|
||||
ASSERT(meta.range == it.range());
|
||||
// TODO: Persist priority in DataMoveMetaData.
|
||||
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
|
||||
rs.dataMoveId = meta.id;
|
||||
rs.dataMove = it.value();
|
||||
std::vector<ShardsAffectedByTeamFailure::Team> teams;
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->primaryDest, true));
|
||||
if (!rs.dataMove->remoteDest.empty()) {
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->remoteDest, false));
|
||||
}
|
||||
|
||||
// Since a DataMove could cover more than one keyrange, e.g., during merge, we need to define
|
||||
// the target shard and restart the shard tracker.
|
||||
self->shardsAffectedByTeamFailure->restartShardTracker.send(rs.keys);
|
||||
self->shardsAffectedByTeamFailure->defineShard(rs.keys);
|
||||
|
||||
// When restoring a DataMove, the destination team is determined, and hence we need to register
|
||||
// the data move now, so that team failures can be captured.
|
||||
self->shardsAffectedByTeamFailure->moveShard(rs.keys, teams);
|
||||
self->relocationProducer.send(rs);
|
||||
wait(yield(TaskPriority::DataDistribution));
|
||||
}
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the data distribution algorithm for FDB, including the DD Queue, DD tracker, and DD team collection
|
||||
|
@ -473,8 +557,6 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
// When/If this assertion fails, Evan owes Ben a pat on the back for his foresight
|
||||
ASSERT(self->configuration.storageTeamSize > 0);
|
||||
|
||||
state PromiseStream<RelocateShard> output;
|
||||
state PromiseStream<RelocateShard> input;
|
||||
state PromiseStream<Promise<int64_t>> getAverageShardBytes;
|
||||
state PromiseStream<Promise<int>> getUnhealthyRelocationCount;
|
||||
state PromiseStream<GetMetricsRequest> getShardMetrics;
|
||||
|
@ -482,82 +564,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
|
||||
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
|
||||
state Promise<Void> readyToStart;
|
||||
state Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure(new ShardsAffectedByTeamFailure);
|
||||
|
||||
state int shard = 0;
|
||||
for (; shard < self->initData->shards.size() - 1; shard++) {
|
||||
const DDShardInfo& iShard = self->initData->shards[shard];
|
||||
KeyRangeRef keys = KeyRangeRef(iShard.key, self->initData->shards[shard + 1].key);
|
||||
|
||||
shardsAffectedByTeamFailure->defineShard(keys);
|
||||
std::vector<ShardsAffectedByTeamFailure::Team> teams;
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.primarySrc, true));
|
||||
if (self->configuration.usableRegions > 1) {
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(iShard.remoteSrc, false));
|
||||
}
|
||||
if (g_network->isSimulated()) {
|
||||
TraceEvent("DDInitShard")
|
||||
.detail("Keys", keys)
|
||||
.detail("PrimarySrc", describe(iShard.primarySrc))
|
||||
.detail("RemoteSrc", describe(iShard.remoteSrc))
|
||||
.detail("PrimaryDest", describe(iShard.primaryDest))
|
||||
.detail("RemoteDest", describe(iShard.remoteDest))
|
||||
.detail("SrcID", iShard.srcId)
|
||||
.detail("DestID", iShard.destId);
|
||||
}
|
||||
|
||||
shardsAffectedByTeamFailure->moveShard(keys, teams);
|
||||
if (iShard.hasDest && iShard.destId == anonymousShardId) {
|
||||
// This shard is already in flight. Ideally we should use dest in ShardsAffectedByTeamFailure and
|
||||
// generate a dataDistributionRelocator directly in DataDistributionQueue to track it, but it's
|
||||
// easier to just (with low priority) schedule it for movement.
|
||||
bool unhealthy = iShard.primarySrc.size() != self->configuration.storageTeamSize;
|
||||
if (!unhealthy && self->configuration.usableRegions > 1) {
|
||||
unhealthy = iShard.remoteSrc.size() != self->configuration.storageTeamSize;
|
||||
}
|
||||
output.send(RelocateShard(keys,
|
||||
unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY
|
||||
: SERVER_KNOBS->PRIORITY_RECOVER_MOVE,
|
||||
RelocateReason::OTHER));
|
||||
}
|
||||
|
||||
wait(yield(TaskPriority::DataDistribution));
|
||||
}
|
||||
|
||||
state KeyRangeMap<std::shared_ptr<DataMove>>::iterator it = self->initData->dataMoveMap.ranges().begin();
|
||||
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
|
||||
const DataMoveMetaData& meta = it.value()->meta;
|
||||
if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
|
||||
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
|
||||
rs.dataMoveId = meta.id;
|
||||
rs.cancelled = true;
|
||||
output.send(rs);
|
||||
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
|
||||
} else if (it.value()->valid) {
|
||||
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
|
||||
ASSERT(meta.range == it.range());
|
||||
// TODO: Persist priority in DataMoveMetaData.
|
||||
RelocateShard rs(meta.range, SERVER_KNOBS->PRIORITY_RECOVER_MOVE, RelocateReason::OTHER);
|
||||
rs.dataMoveId = meta.id;
|
||||
rs.dataMove = it.value();
|
||||
std::vector<ShardsAffectedByTeamFailure::Team> teams;
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->primaryDest, true));
|
||||
if (!rs.dataMove->remoteDest.empty()) {
|
||||
teams.push_back(ShardsAffectedByTeamFailure::Team(rs.dataMove->remoteDest, false));
|
||||
}
|
||||
|
||||
// Since a DataMove could cover more than one keyrange, e.g., during merge, we need to define
|
||||
// the target shard and restart the shard tracker.
|
||||
shardsAffectedByTeamFailure->restartShardTracker.send(rs.keys);
|
||||
shardsAffectedByTeamFailure->defineShard(rs.keys);
|
||||
|
||||
// When restoring a DataMove, the destination team is determined, and hence we need to register
|
||||
// the data move now, so that team failures can be captured.
|
||||
shardsAffectedByTeamFailure->moveShard(rs.keys, teams);
|
||||
output.send(rs);
|
||||
wait(yield(TaskPriority::DataDistribution));
|
||||
}
|
||||
}
|
||||
self->shardsAffectedByTeamFailure = makeReference<ShardsAffectedByTeamFailure>();
|
||||
wait(DataDistributor::resumeRelocations(self));
|
||||
|
||||
std::vector<TeamCollectionInterface> tcis;
|
||||
|
||||
|
@ -586,8 +594,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
actors.push_back(pollMoveKeysLock(cx, self->lock, ddEnabledState));
|
||||
actors.push_back(reportErrorsExcept(dataDistributionTracker(self->initData,
|
||||
cx,
|
||||
output,
|
||||
shardsAffectedByTeamFailure,
|
||||
self->relocationProducer,
|
||||
self->shardsAffectedByTeamFailure,
|
||||
getShardMetrics,
|
||||
getTopKShardMetrics.getFuture(),
|
||||
getShardMetricsList,
|
||||
|
@ -601,14 +609,14 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
self->ddId,
|
||||
&normalDDQueueErrors()));
|
||||
actors.push_back(reportErrorsExcept(dataDistributionQueue(cx,
|
||||
output,
|
||||
input.getFuture(),
|
||||
self->relocationProducer,
|
||||
self->relocationConsumer.getFuture(),
|
||||
getShardMetrics,
|
||||
getTopKShardMetrics,
|
||||
processingUnhealthy,
|
||||
processingWiggle,
|
||||
tcis,
|
||||
shardsAffectedByTeamFailure,
|
||||
self->shardsAffectedByTeamFailure,
|
||||
self->lock,
|
||||
getAverageShardBytes,
|
||||
getUnhealthyRelocationCount.getFuture(),
|
||||
|
@ -625,8 +633,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
cx,
|
||||
self->ddId,
|
||||
self->lock,
|
||||
output,
|
||||
shardsAffectedByTeamFailure,
|
||||
self->relocationProducer,
|
||||
self->shardsAffectedByTeamFailure,
|
||||
self->configuration,
|
||||
self->primaryDcId,
|
||||
self->configuration.usableRegions > 1 ? self->remoteDcIds : std::vector<Optional<Key>>(),
|
||||
|
@ -646,8 +654,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
makeReference<DDTeamCollection>(cx,
|
||||
self->ddId,
|
||||
self->lock,
|
||||
output,
|
||||
shardsAffectedByTeamFailure,
|
||||
self->relocationProducer,
|
||||
self->shardsAffectedByTeamFailure,
|
||||
self->configuration,
|
||||
self->remoteDcIds,
|
||||
Optional<std::vector<Optional<Key>>>(),
|
||||
|
@ -678,7 +686,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
|
|||
&normalDDQueueErrors()));
|
||||
|
||||
actors.push_back(DDTeamCollection::printSnapshotTeamsInfo(primaryTeamCollection));
|
||||
actors.push_back(yieldPromiseStream(output.getFuture(), input));
|
||||
actors.push_back(yieldPromiseStream(self->relocationProducer.getFuture(), self->relocationConsumer));
|
||||
|
||||
wait(waitForAll(actors));
|
||||
return Void();
|
||||
|
@ -873,7 +881,7 @@ ACTOR Future<std::map<NetworkAddress, std::pair<WorkerInterface, std::string>>>
|
|||
configuration.storageTeamSize - 1) -
|
||||
storageFailures;
|
||||
if (*storageFaultTolerance < 0) {
|
||||
TEST(true); // Too many failed storage servers to complete snapshot
|
||||
CODE_PROBE(true, "Too many failed storage servers to complete snapshot");
|
||||
throw snap_storage_failed();
|
||||
}
|
||||
// tlogs
|
||||
|
@ -1319,14 +1327,14 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
|
|||
when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
|
||||
auto& snapUID = snapReq.snapUID;
|
||||
if (ddSnapReqResultMap.count(snapUID)) {
|
||||
TEST(true); // Data distributor received a duplicate finished snap request
|
||||
CODE_PROBE(true, "Data distributor received a duplicate finished snap request");
|
||||
auto result = ddSnapReqResultMap[snapUID];
|
||||
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
|
||||
TraceEvent("RetryFinishedDistributorSnapRequest")
|
||||
.detail("SnapUID", snapUID)
|
||||
.detail("Result", result.isError() ? result.getError().code() : 0);
|
||||
} else if (ddSnapReqMap.count(snapReq.snapUID)) {
|
||||
TEST(true); // Data distributor received a duplicate ongoing snap request
|
||||
CODE_PROBE(true, "Data distributor received a duplicate ongoing snap request");
|
||||
TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
|
||||
ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
|
||||
ddSnapReqMap[snapUID] = snapReq;
|
||||
|
@ -1361,6 +1369,8 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
|
|||
return Void();
|
||||
}
|
||||
|
||||
namespace data_distribution_test {
|
||||
|
||||
static Future<ErrorOr<Void>> goodTestFuture(double duration) {
|
||||
return tag(delay(duration), ErrorOr<Void>(Void()));
|
||||
}
|
||||
|
@ -1369,29 +1379,41 @@ static Future<ErrorOr<Void>> badTestFuture(double duration, Error e) {
|
|||
return tag(delay(duration), ErrorOr<Void>(e));
|
||||
}
|
||||
|
||||
} // namespace data_distribution_test
|
||||
|
||||
TEST_CASE("/DataDistribution/WaitForMost") {
|
||||
state std::vector<Future<ErrorOr<Void>>> futures;
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
|
||||
futures = { data_distribution_test::goodTestFuture(1),
|
||||
data_distribution_test::goodTestFuture(2),
|
||||
data_distribution_test::goodTestFuture(3) };
|
||||
wait(waitForMost(futures, 1, operation_failed(), 0.0)); // Don't wait for slowest future
|
||||
ASSERT(!futures[2].isReady());
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
|
||||
futures = { data_distribution_test::goodTestFuture(1),
|
||||
data_distribution_test::goodTestFuture(2),
|
||||
data_distribution_test::goodTestFuture(3) };
|
||||
wait(waitForMost(futures, 0, operation_failed(), 0.0)); // Wait for all futures
|
||||
ASSERT(futures[2].isReady());
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
|
||||
futures = { data_distribution_test::goodTestFuture(1),
|
||||
data_distribution_test::goodTestFuture(2),
|
||||
data_distribution_test::goodTestFuture(3) };
|
||||
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Wait for slowest future
|
||||
ASSERT(futures[2].isReady());
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
|
||||
futures = { data_distribution_test::goodTestFuture(1),
|
||||
data_distribution_test::goodTestFuture(2),
|
||||
data_distribution_test::badTestFuture(1, success()) };
|
||||
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Error ignored
|
||||
}
|
||||
{
|
||||
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
|
||||
futures = { data_distribution_test::goodTestFuture(1),
|
||||
data_distribution_test::goodTestFuture(2),
|
||||
data_distribution_test::badTestFuture(1, success()) };
|
||||
try {
|
||||
wait(waitForMost(futures, 0, operation_failed(), 1.0));
|
||||
ASSERT(false);
|
||||
|
|
|
@ -41,20 +41,100 @@
|
|||
typedef Reference<IDataDistributionTeam> ITeamRef;
|
||||
typedef std::pair<ITeamRef, ITeamRef> SrcDestTeamPair;
|
||||
|
||||
// TODO: add guard to guarantee the priority is not equal for each purpose?
|
||||
// FIXME: Always use DataMovementReason to invoke these functions.
|
||||
inline bool isDiskRebalancePriority(int priority) {
|
||||
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
|
||||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
|
||||
}
|
||||
|
||||
inline bool isDataMovementForDiskBalancing(DataMovementReason reason) {
|
||||
return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
|
||||
reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM;
|
||||
}
|
||||
|
||||
inline bool isDataMovementForReadBalancing(DataMovementReason reason) {
|
||||
return reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM ||
|
||||
reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
|
||||
}
|
||||
|
||||
inline bool isMountainChopperPriority(int priority) {
|
||||
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
|
||||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM;
|
||||
}
|
||||
|
||||
inline bool isDataMovementForMountainChopper(DataMovementReason reason) {
|
||||
return reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM ||
|
||||
reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM;
|
||||
}
|
||||
|
||||
inline bool isValleyFillerPriority(int priority) {
|
||||
return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
|
||||
priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
|
||||
}
|
||||
|
||||
inline bool isDataMovementForValleyFiller(DataMovementReason reason) {
|
||||
return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
|
||||
reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
|
||||
}
|
||||
|
||||
int dataMovementPriority(DataMovementReason reason) {
|
||||
int priority;
|
||||
switch (reason) {
|
||||
case DataMovementReason::RECOVER_MOVE:
|
||||
priority = SERVER_KNOBS->PRIORITY_RECOVER_MOVE;
|
||||
break;
|
||||
case DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM:
|
||||
priority = SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM;
|
||||
break;
|
||||
case DataMovementReason::REBALANCE_OVERUTILIZED_TEAM:
|
||||
priority = SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
|
||||
break;
|
||||
case DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM:
|
||||
priority = SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM;
|
||||
break;
|
||||
case DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM:
|
||||
priority = SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
|
||||
break;
|
||||
case DataMovementReason::PERPETUAL_STORAGE_WIGGLE:
|
||||
priority = SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE;
|
||||
break;
|
||||
case DataMovementReason::TEAM_HEALTHY:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_HEALTHY;
|
||||
break;
|
||||
case DataMovementReason::TEAM_CONTAINS_UNDESIRED_SERVER:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER;
|
||||
break;
|
||||
case DataMovementReason::TEAM_REDUNDANT:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT;
|
||||
break;
|
||||
case DataMovementReason::MERGE_SHARD:
|
||||
priority = SERVER_KNOBS->PRIORITY_MERGE_SHARD;
|
||||
break;
|
||||
case DataMovementReason::POPULATE_REGION:
|
||||
priority = SERVER_KNOBS->PRIORITY_POPULATE_REGION;
|
||||
break;
|
||||
case DataMovementReason::TEAM_UNHEALTHY:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY;
|
||||
break;
|
||||
case DataMovementReason::TEAM_2_LEFT:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_2_LEFT;
|
||||
break;
|
||||
case DataMovementReason::TEAM_1_LEFT:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_1_LEFT;
|
||||
break;
|
||||
case DataMovementReason::TEAM_FAILED:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_FAILED;
|
||||
break;
|
||||
case DataMovementReason::TEAM_0_LEFT:
|
||||
priority = SERVER_KNOBS->PRIORITY_TEAM_0_LEFT;
|
||||
break;
|
||||
case DataMovementReason::SPLIT_SHARD:
|
||||
priority = SERVER_KNOBS->PRIORITY_SPLIT_SHARD;
|
||||
break;
|
||||
}
|
||||
return priority;
|
||||
}
|
||||
|
||||
struct RelocateData {
|
||||
KeyRange keys;
|
||||
int priority;
|
||||
|
@ -1349,7 +1429,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
|
|||
}
|
||||
|
||||
if (anyDestOverloaded) {
|
||||
TEST(true); // Destination overloaded throttled move
|
||||
CODE_PROBE(true, "Destination overloaded throttled move");
|
||||
destOverloadedCount++;
|
||||
TraceEvent(destOverloadedCount > 50 ? SevInfo : SevDebug, "DestSSBusy", distributorId)
|
||||
.suppressFor(1.0)
|
||||
|
@ -1361,7 +1441,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
|
|||
.detail("Servers", destServersString(bestTeams));
|
||||
wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch));
|
||||
} else {
|
||||
TEST(true); // did not find a healthy destination team on the first attempt
|
||||
CODE_PROBE(true, "did not find a healthy destination team on the first attempt");
|
||||
stuckCount++;
|
||||
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId)
|
||||
.suppressFor(1.0)
|
||||
|
@ -1594,7 +1674,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self,
|
|||
throw error;
|
||||
}
|
||||
} else {
|
||||
TEST(true); // move to removed server
|
||||
CODE_PROBE(true, "move to removed server");
|
||||
healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
|
||||
auto readLoad = metrics.bytesReadPerKSecond;
|
||||
auto& destinationRef = healthyDestinations;
|
||||
|
@ -1842,16 +1922,16 @@ ACTOR Future<SrcDestTeamPair> getSrcDestTeams(DDQueueData* self,
|
|||
return {};
|
||||
}
|
||||
|
||||
ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, int ddPriority) {
|
||||
ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, DataMovementReason reason) {
|
||||
state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
|
||||
state Transaction tr(self->cx);
|
||||
state double lastRead = 0;
|
||||
state bool skipCurrentLoop = false;
|
||||
state Future<Void> delayF = Never();
|
||||
state const bool readRebalance = !isDiskRebalancePriority(ddPriority);
|
||||
state const bool readRebalance = isDataMovementForReadBalancing(reason);
|
||||
state const char* eventName =
|
||||
isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New";
|
||||
|
||||
isDataMovementForMountainChopper(reason) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New";
|
||||
state int ddPriority = dataMovementPriority(reason);
|
||||
loop {
|
||||
state bool moved = false;
|
||||
state Reference<IDataDistributionTeam> sourceTeam;
|
||||
|
@ -1899,7 +1979,7 @@ ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex,
|
|||
traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]);
|
||||
|
||||
if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
|
||||
if (isMountainChopperPriority(ddPriority)) {
|
||||
if (isDataMovementForMountainChopper(reason)) {
|
||||
srcReq = GetTeamRequest(WantNewServers::True,
|
||||
WantTrueBest::True,
|
||||
PreferLowerDiskUtil::False,
|
||||
|
@ -2197,10 +2277,8 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
|
|||
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM));
|
||||
// balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM));
|
||||
if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
|
||||
balancingFutures.push_back(
|
||||
BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM));
|
||||
balancingFutures.push_back(
|
||||
BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM));
|
||||
balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM));
|
||||
balancingFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM));
|
||||
}
|
||||
balancingFutures.push_back(BgDDMountainChopper(&self, i));
|
||||
balancingFutures.push_back(BgDDValleyFiller(&self, i));
|
||||
|
|
|
@ -110,7 +110,7 @@ struct DataDistributionTracker {
|
|||
|
||||
DataDistributionTracker* operator()() {
|
||||
if (trackerCancelled) {
|
||||
TEST(true); // Trying to access DataDistributionTracker after tracker has been cancelled
|
||||
CODE_PROBE(true, "Trying to access DataDistributionTracker after tracker has been cancelled");
|
||||
throw dd_tracker_cancelled();
|
||||
}
|
||||
return &tracker;
|
||||
|
@ -482,7 +482,7 @@ ACTOR Future<Void> shardSplitter(DataDistributionTracker* self,
|
|||
state BandwidthStatus bandwidthStatus = getBandwidthStatus(metrics);
|
||||
|
||||
// Split
|
||||
TEST(true); // shard to be split
|
||||
CODE_PROBE(true, "shard to be split");
|
||||
|
||||
StorageMetrics splitMetrics;
|
||||
splitMetrics.bytes = shardBounds.max.bytes / 2;
|
||||
|
@ -559,7 +559,7 @@ Future<Void> shardMerger(DataDistributionTracker* self,
|
|||
auto prevIter = self->shards.rangeContaining(keys.begin);
|
||||
auto nextIter = self->shards.rangeContaining(keys.begin);
|
||||
|
||||
TEST(true); // shard to be merged
|
||||
CODE_PROBE(true, "shard to be merged");
|
||||
ASSERT(keys.begin > allKeys.begin);
|
||||
|
||||
// This will merge shards both before and after "this" shard in keyspace.
|
||||
|
@ -604,7 +604,7 @@ Future<Void> shardMerger(DataDistributionTracker* self,
|
|||
// on the previous shard changing "size".
|
||||
if (!newMetrics.present() || shardCount + newMetrics.get().shardCount >= CLIENT_KNOBS->SHARD_COUNT_LIMIT) {
|
||||
if (shardsMerged == 1) {
|
||||
TEST(true); // shardMerger cannot merge anything
|
||||
CODE_PROBE(true, "shardMerger cannot merge anything");
|
||||
return brokenPromiseToReady(prevIter->value().stats->onChange());
|
||||
}
|
||||
|
||||
|
@ -797,7 +797,7 @@ void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optio
|
|||
.detail("Keys", keys)
|
||||
.detail("Size", startingMetrics.get().metrics.bytes)
|
||||
.detail("Merges", startingMetrics.get().merges);*/
|
||||
TEST(true); // shardTracker started with trackedBytes already set
|
||||
CODE_PROBE(true, "shardTracker started with trackedBytes already set");
|
||||
shardMetrics->set(startingMetrics);
|
||||
}
|
||||
|
||||
|
@ -903,7 +903,7 @@ ACTOR Future<Void> fetchTopKShardMetrics(DataDistributionTracker* self, GetTopKM
|
|||
choose {
|
||||
when(wait(fetchTopKShardMetrics_impl(self, req))) {}
|
||||
when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) {
|
||||
TEST(true); // TopK DD_SHARD_METRICS_TIMEOUT
|
||||
CODE_PROBE(true, "TopK DD_SHARD_METRICS_TIMEOUT");
|
||||
req.reply.send(GetTopKMetricsReply());
|
||||
}
|
||||
}
|
||||
|
@ -942,7 +942,7 @@ ACTOR Future<Void> fetchShardMetrics(DataDistributionTracker* self, GetMetricsRe
|
|||
choose {
|
||||
when(wait(fetchShardMetrics_impl(self, req))) {}
|
||||
when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT, TaskPriority::DataDistribution))) {
|
||||
TEST(true); // DD_SHARD_METRICS_TIMEOUT
|
||||
CODE_PROBE(true, "DD_SHARD_METRICS_TIMEOUT");
|
||||
StorageMetrics largeMetrics;
|
||||
largeMetrics.bytes = getMaxShardSize(self->dbSizeEstimate->get());
|
||||
req.reply.send(largeMetrics);
|
||||
|
|
|
@ -379,7 +379,7 @@ public:
|
|||
pageFloor(std::max(self->files[1].size - desiredMaxFileSize, self->fileShrinkBytes));
|
||||
if ((maxShrink > SERVER_KNOBS->DISK_QUEUE_MAX_TRUNCATE_BYTES) ||
|
||||
(frivolouslyTruncate && deterministicRandom()->random01() < 0.3)) {
|
||||
TEST(true); // Replacing DiskQueue file
|
||||
CODE_PROBE(true, "Replacing DiskQueue file");
|
||||
TraceEvent("DiskQueueReplaceFile", self->dbgid)
|
||||
.detail("Filename", self->files[1].f->getFilename())
|
||||
.detail("OldFileSize", self->files[1].size)
|
||||
|
@ -389,7 +389,7 @@ public:
|
|||
waitfor.push_back(self->files[1].f->truncate(self->fileExtensionBytes));
|
||||
self->files[1].size = self->fileExtensionBytes;
|
||||
} else {
|
||||
TEST(true); // Truncating DiskQueue file
|
||||
CODE_PROBE(true, "Truncating DiskQueue file");
|
||||
const int64_t startingSize = self->files[1].size;
|
||||
self->files[1].size -= std::min(maxShrink, self->files[1].size);
|
||||
self->files[1].size = std::max(self->files[1].size, self->fileExtensionBytes);
|
||||
|
@ -460,12 +460,12 @@ public:
|
|||
|
||||
wait(ready);
|
||||
|
||||
TEST(pageData.size() > sizeof(Page)); // push more than one page of data
|
||||
CODE_PROBE(pageData.size() > sizeof(Page), "push more than one page of data");
|
||||
|
||||
Future<Void> pushed = wait(self->push(pageData, &syncFiles));
|
||||
pushing.send(Void());
|
||||
ASSERT(syncFiles.size() >= 1 && syncFiles.size() <= 2);
|
||||
TEST(2 == syncFiles.size()); // push spans both files
|
||||
CODE_PROBE(2 == syncFiles.size(), "push spans both files");
|
||||
wait(pushed);
|
||||
|
||||
delete pageMem;
|
||||
|
@ -491,8 +491,8 @@ public:
|
|||
committed.send(Void());
|
||||
} catch (Error& e) {
|
||||
delete pageMem;
|
||||
TEST(true); // push error
|
||||
TEST(2 == syncFiles.size()); // push spanning both files error
|
||||
CODE_PROBE(true, "push error");
|
||||
CODE_PROBE(2 == syncFiles.size(), "push spanning both files error");
|
||||
TraceEvent(SevError, "RDQPushAndCommitError", dbgid)
|
||||
.errorUnsuppressed(e)
|
||||
.detail("InitialFilename0", filename);
|
||||
|
@ -805,7 +805,7 @@ public:
|
|||
Standalone<StringRef> result = self->readingBuffer.pop_front(sizeof(Page));
|
||||
return result;
|
||||
} catch (Error& e) {
|
||||
TEST(true); // Read next page error
|
||||
CODE_PROBE(true, "Read next page error");
|
||||
TraceEvent(SevError, "RDQReadNextPageError", self->dbgid)
|
||||
.errorUnsuppressed(e)
|
||||
.detail("File0Name", self->files[0].dbgFilename);
|
||||
|
@ -840,8 +840,8 @@ public:
|
|||
state std::vector<Future<Void>> commits;
|
||||
state bool swap = file == 0;
|
||||
|
||||
TEST(file == 0); // truncate before last read page on file 0
|
||||
TEST(file == 1 && pos != self->files[1].size); // truncate before last read page on file 1
|
||||
CODE_PROBE(file == 0, "truncate before last read page on file 0");
|
||||
CODE_PROBE(file == 1 && pos != self->files[1].size, "truncate before last read page on file 1");
|
||||
|
||||
self->readingFile = 2;
|
||||
self->readingBuffer.clear();
|
||||
|
@ -890,10 +890,10 @@ public:
|
|||
ASSERT(recovered);
|
||||
uint8_t const* begin = contents.begin();
|
||||
uint8_t const* end = contents.end();
|
||||
TEST(contents.size() && pushedPageCount()); // More than one push between commits
|
||||
CODE_PROBE(contents.size() && pushedPageCount(), "More than one push between commits");
|
||||
|
||||
bool pushAtEndOfPage = contents.size() >= 4 && pushedPageCount() && backPage().remainingCapacity() < 4;
|
||||
TEST(pushAtEndOfPage); // Push right at the end of a page, possibly splitting size
|
||||
CODE_PROBE(pushAtEndOfPage, "Push right at the end of a page, possibly splitting size");
|
||||
while (begin != end) {
|
||||
if (!pushedPageCount() || !backPage().remainingCapacity())
|
||||
addEmptyPage();
|
||||
|
@ -1391,7 +1391,7 @@ private:
|
|||
int f;
|
||||
int64_t p;
|
||||
bool poppedNotDurable = self->lastPoppedSeq / sizeof(Page) != self->poppedSeq / sizeof(Page);
|
||||
TEST(poppedNotDurable); // DiskQueue: Recovery popped position not fully durable
|
||||
CODE_PROBE(poppedNotDurable, "DiskQueue: Recovery popped position not fully durable");
|
||||
self->findPhysicalLocation(self->lastPoppedSeq, &f, &p, "lastPoppedSeq");
|
||||
wait(self->rawQueue->setPoppedPage(f, p, pageFloor(self->lastPoppedSeq)));
|
||||
|
||||
|
@ -1408,8 +1408,8 @@ private:
|
|||
self->recovered = true;
|
||||
ASSERT(self->poppedSeq <= self->endLocation());
|
||||
|
||||
TEST(result.size() == 0); // End of queue at border between reads
|
||||
TEST(result.size() != 0); // Partial read at end of queue
|
||||
CODE_PROBE(result.size() == 0, "End of queue at border between reads");
|
||||
CODE_PROBE(result.size() != 0, "Partial read at end of queue");
|
||||
|
||||
// The next read location isn't necessarily the end of the last commit, but this is sufficient for helping us
|
||||
// check an ASSERTion
|
||||
|
@ -1628,8 +1628,9 @@ public:
|
|||
// totally finished
|
||||
pop(popLocation);
|
||||
commitFuture = commitFuture && queue->commit();
|
||||
} else
|
||||
TEST(true); // No uncommitted data was popped
|
||||
} else {
|
||||
CODE_PROBE(true, "No uncommitted data was popped");
|
||||
}
|
||||
|
||||
return commitFuture;
|
||||
}
|
||||
|
|
|
@ -166,7 +166,7 @@ class GlobalTagThrottlerImpl {
|
|||
// wait(tr.watch(tagThrottleSignalKey));
|
||||
wait(delay(5.0));
|
||||
TraceEvent("GlobalTagThrottler_ChangeSignaled");
|
||||
TEST(true); // Global tag throttler detected quota changes
|
||||
CODE_PROBE(true, "Global tag throttler detected quota changes");
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
TraceEvent("GlobalTagThrottlerMonitoringChangesError", self->id).error(e);
|
||||
|
|
|
@ -661,14 +661,14 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
|
|||
if (tagItr != priorityThrottledTags.end()) {
|
||||
if (tagItr->second.expiration > now()) {
|
||||
if (tagItr->second.tpsRate == std::numeric_limits<double>::max()) {
|
||||
TEST(true); // Auto TPS rate is unlimited
|
||||
CODE_PROBE(true, "Auto TPS rate is unlimited");
|
||||
} else {
|
||||
TEST(true); // GRV proxy returning tag throttle
|
||||
CODE_PROBE(true, "GRV proxy returning tag throttle");
|
||||
reply.tagThrottleInfo[tag.first] = tagItr->second;
|
||||
}
|
||||
} else {
|
||||
// This isn't required, but we might as well
|
||||
TEST(true); // GRV proxy expiring tag throttle
|
||||
CODE_PROBE(true, "GRV proxy expiring tag throttle");
|
||||
priorityThrottledTags.erase(tagItr);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -104,9 +104,9 @@ public:
|
|||
TraceEvent("KVSMemSwitchingToLargeTransactionMode", id)
|
||||
.detail("TransactionSize", transactionSize)
|
||||
.detail("DataSize", committedDataSize);
|
||||
TEST(true); // KeyValueStoreMemory switching to large transaction mode
|
||||
TEST(committedDataSize >
|
||||
1e3); // KeyValueStoreMemory switching to large transaction mode with committed data
|
||||
CODE_PROBE(true, "KeyValueStoreMemory switching to large transaction mode");
|
||||
CODE_PROBE(committedDataSize > 1e3,
|
||||
"KeyValueStoreMemory switching to large transaction mode with committed data");
|
||||
}
|
||||
|
||||
int64_t bytesWritten = commit_queue(queue, true);
|
||||
|
@ -506,6 +506,12 @@ private:
|
|||
OpHeader* h,
|
||||
bool* isZeroFilled,
|
||||
int* zeroFillSize) {
|
||||
// Metadata op types to be excluded from encryption.
|
||||
static std::unordered_set<OpType> metaOps = { OpSnapshotEnd, OpSnapshotAbort, OpCommit, OpRollback };
|
||||
if (metaOps.count((OpType)h->op) == 0) {
|
||||
// It is not supported to open an encrypted store as unencrypted, or vice-versa.
|
||||
ASSERT_EQ(h->op == OpEncrypted, self->enableEncryption);
|
||||
}
|
||||
state int remainingBytes = h->len1 + h->len2 + 1;
|
||||
if (h->op == OpEncrypted) {
|
||||
// encryption header, plus the real (encrypted) op type
|
||||
|
@ -568,7 +574,7 @@ private:
|
|||
Standalone<StringRef> data = wait(self->log->readNext(sizeof(OpHeader)));
|
||||
if (data.size() != sizeof(OpHeader)) {
|
||||
if (data.size()) {
|
||||
TEST(true); // zero fill partial header in KeyValueStoreMemory
|
||||
CODE_PROBE(true, "zero fill partial header in KeyValueStoreMemory");
|
||||
memset(&h, 0, sizeof(OpHeader));
|
||||
memcpy(&h, data.begin(), data.size());
|
||||
zeroFillSize = sizeof(OpHeader) - data.size() + h.len1 + h.len2 + 1;
|
||||
|
@ -699,7 +705,7 @@ private:
|
|||
ASSERT(false);
|
||||
}
|
||||
|
||||
TEST(true); // Fixing a partial commit at the end of the KeyValueStoreMemory log
|
||||
CODE_PROBE(true, "Fixing a partial commit at the end of the KeyValueStoreMemory log");
|
||||
for (int i = 0; i < zeroFillSize; i++)
|
||||
self->log->push(StringRef((const uint8_t*)"", 1));
|
||||
}
|
||||
|
|
|
@ -741,7 +741,8 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
|
|||
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
|
||||
rocksdb::DB* db,
|
||||
std::shared_ptr<ReadIteratorPool> readIterPool,
|
||||
Counters* counters) {
|
||||
Counters* counters,
|
||||
CF cf) {
|
||||
state std::vector<std::tuple<const char*, uint32_t, uint64_t>> tickerStats = {
|
||||
{ "StallMicros", rocksdb::STALL_MICROS, 0 },
|
||||
{ "BytesRead", rocksdb::BYTES_READ, 0 },
|
||||
|
@ -779,7 +780,7 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
|
|||
{ "CountIterSkippedKeys", rocksdb::NUMBER_ITER_SKIP, 0 },
|
||||
|
||||
};
|
||||
state std::vector<std::pair<const char*, std::string>> propertyStats = {
|
||||
state std::vector<std::pair<const char*, std::string>> intPropertyStats = {
|
||||
{ "NumImmutableMemtables", rocksdb::DB::Properties::kNumImmutableMemTable },
|
||||
{ "NumImmutableMemtablesFlushed", rocksdb::DB::Properties::kNumImmutableMemTableFlushed },
|
||||
{ "IsMemtableFlushPending", rocksdb::DB::Properties::kMemTableFlushPending },
|
||||
|
@ -807,6 +808,14 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
|
|||
{ "LiveSstFilesSize", rocksdb::DB::Properties::kLiveSstFilesSize },
|
||||
};
|
||||
|
||||
state std::vector<std::pair<const char*, std::string>> strPropertyStats = {
|
||||
{ "LevelStats", rocksdb::DB::Properties::kLevelStats },
|
||||
};
|
||||
|
||||
state std::vector<std::pair<const char*, std::string>> levelStrPropertyStats = {
|
||||
{ "CompressionRatioAtLevel", rocksdb::DB::Properties::kCompressionRatioAtLevelPrefix },
|
||||
};
|
||||
|
||||
state std::unordered_map<std::string, uint64_t> readIteratorPoolStats = {
|
||||
{ "NumReadIteratorsCreated", 0 },
|
||||
{ "NumTimesReadIteratorsReused", 0 },
|
||||
|
@ -816,21 +825,40 @@ ACTOR Future<Void> rocksDBMetricLogger(UID id,
|
|||
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
|
||||
TraceEvent e("RocksDBMetrics", id);
|
||||
uint64_t stat;
|
||||
for (auto& t : tickerStats) {
|
||||
auto& [name, ticker, cum] = t;
|
||||
for (auto& [name, ticker, cum] : tickerStats) {
|
||||
stat = statistics->getTickerCount(ticker);
|
||||
e.detail(name, stat - cum);
|
||||
cum = stat;
|
||||
}
|
||||
|
||||
for (auto& p : propertyStats) {
|
||||
auto& [name, property] = p;
|
||||
for (const auto& [name, property] : intPropertyStats) {
|
||||
stat = 0;
|
||||
// GetAggregatedIntProperty gets the aggregated int property from all column families.
|
||||
ASSERT(db->GetAggregatedIntProperty(property, &stat));
|
||||
e.detail(name, stat);
|
||||
}
|
||||
|
||||
std::string propValue;
|
||||
for (const auto& [name, property] : strPropertyStats) {
|
||||
propValue = "";
|
||||
ASSERT(db->GetProperty(cf, property, &propValue));
|
||||
e.detail(name, propValue);
|
||||
}
|
||||
|
||||
rocksdb::ColumnFamilyMetaData cf_meta_data;
|
||||
db->GetColumnFamilyMetaData(cf, &cf_meta_data);
|
||||
int numLevels = static_cast<int>(cf_meta_data.levels.size());
|
||||
std::string levelProp;
|
||||
for (const auto& [name, property] : levelStrPropertyStats) {
|
||||
levelProp = "";
|
||||
for (int level = 0; level < numLevels; level++) {
|
||||
propValue = "";
|
||||
ASSERT(db->GetProperty(cf, property + std::to_string(level), &propValue));
|
||||
levelProp += std::to_string(level) + ":" + propValue + (level == numLevels - 1 ? "" : ",");
|
||||
}
|
||||
e.detail(name, levelProp);
|
||||
}
|
||||
|
||||
stat = readIterPool->numReadIteratorsCreated();
|
||||
e.detail("NumReadIteratorsCreated", stat - readIteratorPoolStats["NumReadIteratorsCreated"]);
|
||||
readIteratorPoolStats["NumReadIteratorsCreated"] = stat;
|
||||
|
@ -1009,13 +1037,13 @@ struct RocksDBKeyValueStore : IKeyValueStore {
|
|||
// The current thread and main thread are same when the code runs in simulation.
|
||||
// blockUntilReady() is getting the thread into deadlock state, so directly calling
|
||||
// the metricsLogger.
|
||||
a.metrics =
|
||||
rocksDBMetricLogger(id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters) &&
|
||||
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
|
||||
a.metrics = rocksDBMetricLogger(
|
||||
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
|
||||
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
|
||||
} else {
|
||||
onMainThread([&] {
|
||||
a.metrics = rocksDBMetricLogger(
|
||||
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters) &&
|
||||
id, options.statistics, perfContextMetrics, db, readIterPool, &a.counters, cf) &&
|
||||
flowLockLogger(id, a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
|
||||
return Future<bool>(true);
|
||||
}).blockUntilReady();
|
||||
|
|
|
@ -117,7 +117,7 @@ struct PageChecksumCodec {
|
|||
crc32Sum.part1 = 0;
|
||||
crc32Sum.part2 = crc32c_append(0xfdbeefdb, static_cast<uint8_t*>(data), dataLen);
|
||||
if (crc32Sum == *pSumInPage) {
|
||||
TEST(true); // Read CRC32 checksum
|
||||
CODE_PROBE(true, "Read CRC32 checksum");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -133,7 +133,7 @@ struct PageChecksumCodec {
|
|||
xxHash3Sum.part1 = static_cast<uint32_t>((xxHash3 >> 32) & 0x00ffffff);
|
||||
xxHash3Sum.part2 = static_cast<uint32_t>(xxHash3 & 0xffffffff);
|
||||
if (xxHash3Sum == *pSumInPage) {
|
||||
TEST(true); // Read xxHash3 checksum
|
||||
CODE_PROBE(true, "Read xxHash3 checksum");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -144,7 +144,7 @@ struct PageChecksumCodec {
|
|||
hashLittle2Sum.part2 = 0x5ca1ab1e;
|
||||
hashlittle2(pData, dataLen, &hashLittle2Sum.part1, &hashLittle2Sum.part2);
|
||||
if (hashLittle2Sum == *pSumInPage) {
|
||||
TEST(true); // Read HashLittle2 checksum
|
||||
CODE_PROBE(true, "Read HashLittle2 checksum");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -357,7 +357,7 @@ struct SQLiteDB : NonCopyable {
|
|||
lineStart = lineEnd;
|
||||
}
|
||||
}
|
||||
TEST(true); // BTree integrity checked
|
||||
CODE_PROBE(true, "BTree integrity checked");
|
||||
}
|
||||
if (e)
|
||||
sqlite3_free(e);
|
||||
|
@ -1423,7 +1423,7 @@ void SQLiteDB::open(bool writable) {
|
|||
renameFile(walpath, walpath + "-old-" + deterministicRandom()->randomUniqueID().toString());
|
||||
ASSERT_WE_THINK(false); //< This code should not be hit in FoundationDB at the moment, because worker looks
|
||||
// for databases to open by listing .fdb files, not .fdb-wal files
|
||||
// TEST(true); // Replace a partially constructed or destructed DB
|
||||
// CODE_PROBE(true, "Replace a partially constructed or destructed DB");
|
||||
}
|
||||
|
||||
if (dbFile.isError() && walFile.isError() && writable &&
|
||||
|
@ -1942,8 +1942,8 @@ private:
|
|||
}
|
||||
|
||||
if (canDelete && (!canVacuum || deterministicRandom()->random01() < lazyDeleteBatchProbability)) {
|
||||
TEST(canVacuum); // SQLite lazy deletion when vacuuming is active
|
||||
TEST(!canVacuum); // SQLite lazy deletion when vacuuming is inactive
|
||||
CODE_PROBE(canVacuum, "SQLite lazy deletion when vacuuming is active");
|
||||
CODE_PROBE(!canVacuum, "SQLite lazy deletion when vacuuming is inactive");
|
||||
|
||||
int pagesToDelete = std::max(
|
||||
1,
|
||||
|
@ -1955,10 +1955,10 @@ private:
|
|||
lazyDeleteTime += now() - begin;
|
||||
} else {
|
||||
ASSERT(canVacuum);
|
||||
TEST(canDelete); // SQLite vacuuming when lazy delete is active
|
||||
TEST(!canDelete); // SQLite vacuuming when lazy delete is inactive
|
||||
TEST(SERVER_KNOBS->SPRING_CLEANING_VACUUMS_PER_LAZY_DELETE_PAGE !=
|
||||
0); // SQLite vacuuming with nonzero vacuums_per_lazy_delete_page
|
||||
CODE_PROBE(canDelete, "SQLite vacuuming when lazy delete is active");
|
||||
CODE_PROBE(!canDelete, "SQLite vacuuming when lazy delete is inactive");
|
||||
CODE_PROBE(SERVER_KNOBS->SPRING_CLEANING_VACUUMS_PER_LAZY_DELETE_PAGE != 0,
|
||||
"SQLite vacuuming with nonzero vacuums_per_lazy_delete_page");
|
||||
|
||||
vacuumFinished = conn.vacuum();
|
||||
if (!vacuumFinished) {
|
||||
|
@ -1973,10 +1973,10 @@ private:
|
|||
|
||||
freeListPages = conn.freePages();
|
||||
|
||||
TEST(workPerformed.lazyDeletePages > 0); // Pages lazily deleted
|
||||
TEST(workPerformed.vacuumedPages > 0); // Pages vacuumed
|
||||
TEST(vacuumTime > 0); // Time spent vacuuming
|
||||
TEST(lazyDeleteTime > 0); // Time spent lazy deleting
|
||||
CODE_PROBE(workPerformed.lazyDeletePages > 0, "Pages lazily deleted");
|
||||
CODE_PROBE(workPerformed.vacuumedPages > 0, "Pages vacuumed");
|
||||
CODE_PROBE(vacuumTime > 0, "Time spent vacuuming");
|
||||
CODE_PROBE(lazyDeleteTime > 0, "Time spent lazy deleting");
|
||||
|
||||
++springCleaningStats.springCleaningCount;
|
||||
springCleaningStats.lazyDeletePages += workPerformed.lazyDeletePages;
|
||||
|
|
|
@ -206,7 +206,7 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
|
|||
choose {
|
||||
when(wait(nomineeChange.onTrigger())) {}
|
||||
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
|
||||
TEST(true); // Bad candidate timeout
|
||||
CODE_PROBE(true, "Bad candidate timeout");
|
||||
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -65,7 +65,7 @@ class LocalConfigurationImpl {
|
|||
configClassToKnobToValue[configPath.back()] = {};
|
||||
}
|
||||
} else {
|
||||
TEST(true); // Invalid configuration path
|
||||
CODE_PROBE(true, "Invalid configuration path");
|
||||
if (!g_network->isSimulated()) {
|
||||
fprintf(stderr, "WARNING: Invalid configuration path: `%s'\n", paramString.c_str());
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ class LocalConfigurationImpl {
|
|||
knobCollection.setKnob(knobName.toString(), knobValue);
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_invalid_option_value) {
|
||||
TEST(true); // invalid knob in configuration database
|
||||
CODE_PROBE(true, "invalid knob in configuration database");
|
||||
TraceEvent(SevWarnAlways, "InvalidKnobOptionValue")
|
||||
.detail("KnobName", knobName)
|
||||
.detail("KnobValue", knobValue.toString());
|
||||
|
@ -126,10 +126,10 @@ class LocalConfigurationImpl {
|
|||
this->overrides[stringToKeyRef(knobName)] = knobValue;
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_invalid_option) {
|
||||
TEST(true); // Attempted to manually set invalid knob option
|
||||
CODE_PROBE(true, "Attempted to manually set invalid knob option");
|
||||
TraceEvent(SevWarnAlways, "UnrecognizedKnobOption").detail("Knob", printable(knobName));
|
||||
} else if (e.code() == error_code_invalid_option_value) {
|
||||
TEST(true); // Invalid manually set knob value
|
||||
CODE_PROBE(true, "Invalid manually set knob value");
|
||||
TraceEvent(SevWarnAlways, "InvalidKnobValue")
|
||||
.detail("Knob", printable(knobName))
|
||||
.detail("Value", printable(knobValueString));
|
||||
|
@ -198,7 +198,7 @@ class LocalConfigurationImpl {
|
|||
state ConfigKnobOverrides storedConfigPath =
|
||||
BinaryReader::fromStringRef<ConfigKnobOverrides>(storedConfigPathValue.get(), IncludeVersion());
|
||||
if (!storedConfigPath.hasSameConfigPath(self->configKnobOverrides)) {
|
||||
TEST(true); // All local information is outdated
|
||||
CODE_PROBE(true, "All local information is outdated");
|
||||
wait(clearKVStore(self));
|
||||
wait(saveConfigPath(self));
|
||||
self->updateInMemoryState(lastSeenVersion);
|
||||
|
|
|
@ -592,7 +592,7 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
|
|||
}
|
||||
if (sequenceData.isSet()) {
|
||||
if (sequenceData.getFuture().get().first != reply.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
|
|
@ -290,7 +290,7 @@ void LogPushData::addTxsTag() {
|
|||
}
|
||||
|
||||
void LogPushData::addTransactionInfo(SpanContext const& context) {
|
||||
TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanContext
|
||||
CODE_PROBE(!spanContext.isValid(), "addTransactionInfo with invalid SpanContext");
|
||||
spanContext = context;
|
||||
writtenLocations.clear();
|
||||
}
|
||||
|
@ -352,7 +352,7 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) {
|
|||
return false;
|
||||
}
|
||||
|
||||
TEST(true); // Wrote SpanContextMessage to a transaction log
|
||||
CODE_PROBE(true, "Wrote SpanContextMessage to a transaction log");
|
||||
writtenLocations.insert(location);
|
||||
|
||||
BinaryWriter& wr = messagesWriter[location];
|
||||
|
@ -375,10 +375,10 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) {
|
|||
// parent->child.
|
||||
SpanContextMessage contextMessage;
|
||||
if (spanContext.isSampled()) {
|
||||
TEST(true); // Converting OTELSpanContextMessage to traced SpanContextMessage
|
||||
CODE_PROBE(true, "Converting OTELSpanContextMessage to traced SpanContextMessage");
|
||||
contextMessage = SpanContextMessage(UID(spanContext.traceID.first(), spanContext.traceID.second()));
|
||||
} else {
|
||||
TEST(true); // Converting OTELSpanContextMessage to untraced SpanContextMessage
|
||||
CODE_PROBE(true, "Converting OTELSpanContextMessage to untraced SpanContextMessage");
|
||||
contextMessage = SpanContextMessage(UID(0, 0));
|
||||
}
|
||||
wr << contextMessage;
|
||||
|
|
|
@ -101,7 +101,7 @@ public:
|
|||
TraceEvent(SevWarnAlways, "DiskQueueAdapterReset")
|
||||
.detail("Version", self->cursor->popped())
|
||||
.detail("PeekTypeSwitch", self->peekTypeSwitches % 3);
|
||||
TEST(true); // disk adapter reset
|
||||
CODE_PROBE(true, "disk adapter reset");
|
||||
if (self->cursor->popped() != 0) {
|
||||
self->recoveryLoc = self->cursor->popped();
|
||||
} else {
|
||||
|
|
|
@ -317,7 +317,7 @@ ACTOR Future<Void> serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self,
|
|||
//
|
||||
// A cursor for a log router can be delayed indefinitely during a network partition, so only fail
|
||||
// simulation tests sufficiently far after we finish simulating network partitions.
|
||||
TEST(e.code() == error_code_timed_out); // peek cursor timed out
|
||||
CODE_PROBE(e.code() == error_code_timed_out, "peek cursor timed out");
|
||||
if (now() >= FLOW_KNOBS->SIM_SPEEDUP_AFTER_SECONDS + SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME) {
|
||||
ASSERT_WE_THINK(e.code() == error_code_operation_obsolete ||
|
||||
SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
|
||||
|
@ -653,7 +653,7 @@ void ILogSystem::MergedPeekCursor::updateMessage(bool usePolicy) {
|
|||
c->advanceTo(messageVersion);
|
||||
if (start <= messageVersion && messageVersion < c->version()) {
|
||||
advancedPast = true;
|
||||
TEST(true); // Merge peek cursor advanced past desired sequence
|
||||
CODE_PROBE(true, "Merge peek cursor advanced past desired sequence");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -965,7 +965,7 @@ void ILogSystem::SetPeekCursor::updateMessage(int logIdx, bool usePolicy) {
|
|||
c->advanceTo(messageVersion);
|
||||
if (start <= messageVersion && messageVersion < c->version()) {
|
||||
advancedPast = true;
|
||||
TEST(true); // Merge peek cursor with logIdx advanced past desired sequence
|
||||
CODE_PROBE(true, "Merge peek cursor with logIdx advanced past desired sequence");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -217,7 +217,7 @@ ACTOR Future<MoveKeysLock> takeMoveKeysLock(Database cx, UID ddId) {
|
|||
return lock;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
TEST(true); // takeMoveKeysLock retry
|
||||
CODE_PROBE(true, "takeMoveKeysLock retry");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -239,7 +239,7 @@ ACTOR static Future<Void> checkMoveKeysLock(Transaction* tr,
|
|||
Optional<Value> readVal = wait(tr->get(moveKeysLockWriteKey));
|
||||
UID lastWrite = readVal.present() ? BinaryReader::fromStringRef<UID>(readVal.get(), Unversioned()) : UID();
|
||||
if (lastWrite != lock.prevWrite) {
|
||||
TEST(true); // checkMoveKeysLock: Conflict with previous owner
|
||||
CODE_PROBE(true, "checkMoveKeysLock: Conflict with previous owner");
|
||||
throw movekeys_conflict();
|
||||
}
|
||||
|
||||
|
@ -272,7 +272,7 @@ ACTOR static Future<Void> checkMoveKeysLock(Transaction* tr,
|
|||
|
||||
return Void();
|
||||
} else {
|
||||
TEST(true); // checkMoveKeysLock: Conflict with new owner
|
||||
CODE_PROBE(true, "checkMoveKeysLock: Conflict with new owner");
|
||||
throw movekeys_conflict();
|
||||
}
|
||||
}
|
||||
|
@ -591,7 +591,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
|
|||
// This process can be split up into multiple transactions if there are too many existing overlapping shards
|
||||
// In that case, each iteration of this loop will have begin set to the end of the last processed shard
|
||||
while (begin < keys.end) {
|
||||
TEST(begin > keys.begin); // Multi-transactional startMoveKeys
|
||||
CODE_PROBE(begin > keys.begin, "Multi-transactional startMoveKeys");
|
||||
batches++;
|
||||
|
||||
// RYW to optimize re-reading the same key ranges
|
||||
|
@ -631,7 +631,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
|
|||
// Attempt to move onto a server that isn't in serverList (removed or never added to the
|
||||
// database) This can happen (why?) and is handled by the data distribution algorithm
|
||||
// FIXME: Answer why this can happen?
|
||||
TEST(true); // start move keys moving to a removed server
|
||||
CODE_PROBE(true, "start move keys moving to a removed server");
|
||||
throw move_to_removed_server();
|
||||
}
|
||||
}
|
||||
|
@ -825,7 +825,7 @@ ACTOR Future<Void> checkFetchingState(Database cx,
|
|||
for (int s = 0; s < serverListValues.size(); s++) {
|
||||
if (!serverListValues[s].present()) {
|
||||
// FIXME: Is this the right behavior? dataMovementComplete will never be sent!
|
||||
TEST(true); // check fetching state moved to removed server
|
||||
CODE_PROBE(true, "check fetching state moved to removed server");
|
||||
throw move_to_removed_server();
|
||||
}
|
||||
auto si = decodeServerListValue(serverListValues[s].get());
|
||||
|
@ -897,7 +897,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
|
|||
// This process can be split up into multiple transactions if there are too many existing overlapping shards
|
||||
// In that case, each iteration of this loop will have begin set to the end of the last processed shard
|
||||
while (begin < keys.end) {
|
||||
TEST(begin > keys.begin); // Multi-transactional finishMoveKeys
|
||||
CODE_PROBE(begin > keys.begin, "Multi-transactional finishMoveKeys");
|
||||
|
||||
state Transaction tr(occ);
|
||||
|
||||
|
@ -994,7 +994,8 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
|
|||
} else if (alreadyMoved) {
|
||||
dest.clear();
|
||||
src.clear();
|
||||
TEST(true); // FinishMoveKeys first key in iteration sub-range has already been processed
|
||||
CODE_PROBE(true,
|
||||
"FinishMoveKeys first key in iteration sub-range has already been processed");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1029,8 +1030,9 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
|
|||
}
|
||||
}
|
||||
if (!dest.size()) {
|
||||
TEST(true); // A previous finishMoveKeys for this range committed just as it was cancelled to
|
||||
// start this one?
|
||||
CODE_PROBE(true,
|
||||
"A previous finishMoveKeys for this range committed just as it was cancelled to "
|
||||
"start this one?");
|
||||
TraceEvent("FinishMoveKeysNothingToDo", relocationIntervalId)
|
||||
.detail("KeyBegin", keys.begin)
|
||||
.detail("KeyEnd", keys.end)
|
||||
|
@ -1394,7 +1396,6 @@ ACTOR static Future<Void> startMoveShards(Database occ,
|
|||
physicalShardMap[ssId].emplace_back(rangeIntersectKeys, srcId);
|
||||
}
|
||||
|
||||
const UID checkpontId = deterministicRandom()->randomUniqueID();
|
||||
for (const UID& ssId : src) {
|
||||
dataMove.src.insert(ssId);
|
||||
// TODO(psm): Create checkpoint for the range.
|
||||
|
@ -2021,8 +2022,9 @@ ACTOR Future<Void> removeStorageServer(Database cx,
|
|||
|
||||
state bool canRemove = wait(canRemoveStorageServer(tr, serverID));
|
||||
if (!canRemove) {
|
||||
TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to
|
||||
// reverse its mistake.
|
||||
CODE_PROBE(true,
|
||||
"The caller had a transaction in flight that assigned keys to the server. Wait for it to "
|
||||
"reverse its mistake.");
|
||||
TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID);
|
||||
wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch));
|
||||
tr->reset();
|
||||
|
@ -2039,7 +2041,7 @@ ACTOR Future<Void> removeStorageServer(Database cx,
|
|||
|
||||
if (!fListKey.get().present()) {
|
||||
if (retry) {
|
||||
TEST(true); // Storage server already removed after retrying transaction
|
||||
CODE_PROBE(true, "Storage server already removed after retrying transaction");
|
||||
return Void();
|
||||
}
|
||||
TraceEvent(SevError, "RemoveInvalidServer").detail("ServerID", serverID);
|
||||
|
|
|
@ -99,7 +99,7 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
|
|||
SpanContextMessage scm;
|
||||
br >> scm;
|
||||
} else if (OTELSpanContextMessage::startsOTELSpanContextMessage(mutationType)) {
|
||||
TEST(true); // MutationTracking reading OTELSpanContextMessage
|
||||
CODE_PROBE(true, "MutationTracking reading OTELSpanContextMessage");
|
||||
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
|
||||
OTELSpanContextMessage scm;
|
||||
br >> scm;
|
||||
|
|
|
@ -182,7 +182,7 @@ private:
|
|||
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
|
||||
if (h.size() != sizeof(uint32_t)) {
|
||||
if (h.size()) {
|
||||
TEST(true); // Zero fill within size field
|
||||
CODE_PROBE(true, "Zero fill within size field");
|
||||
int payloadSize = 0;
|
||||
memcpy(&payloadSize, h.begin(), h.size());
|
||||
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
|
||||
|
@ -196,7 +196,7 @@ private:
|
|||
|
||||
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
|
||||
if (e.size() != payloadSize + 1) {
|
||||
TEST(true); // Zero fill within payload
|
||||
CODE_PROBE(true, "Zero fill within payload");
|
||||
zeroFillSize = payloadSize + 1 - e.size();
|
||||
break;
|
||||
}
|
||||
|
@ -210,7 +210,7 @@ private:
|
|||
}
|
||||
}
|
||||
if (zeroFillSize) {
|
||||
TEST(true); // Fixing a partial commit at the end of the tlog queue
|
||||
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
|
||||
for (int i = 0; i < zeroFillSize; i++)
|
||||
self->queue->push(StringRef((const uint8_t*)"", 1));
|
||||
}
|
||||
|
@ -507,9 +507,9 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
|
||||
state Version stopVersion = logData->version.get();
|
||||
|
||||
TEST(true); // TLog stopped by recovering master
|
||||
TEST(logData->stopped); // LogData already stopped
|
||||
TEST(!logData->stopped); // LogData not yet stopped
|
||||
CODE_PROBE(true, "TLog stopped by recovering master");
|
||||
CODE_PROBE(logData->stopped, "LogData already stopped");
|
||||
CODE_PROBE(!logData->stopped, "LogData not yet stopped");
|
||||
|
||||
TraceEvent("TLogStop", logData->logId)
|
||||
.detail("Ver", stopVersion)
|
||||
|
@ -611,7 +611,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
|
|||
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
|
||||
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
|
||||
|
||||
TEST(anyData); // TLog moved data to persistentData
|
||||
CODE_PROBE(anyData, "TLog moved data to persistentData");
|
||||
logData->persistentDataDurableVersion = newPersistentDataVersion;
|
||||
|
||||
for (tag = logData->tag_data.begin(); tag != logData->tag_data.end(); ++tag) {
|
||||
|
@ -834,7 +834,7 @@ void commitMessages(Reference<LogData> self,
|
|||
// Fill up the rest of this block
|
||||
int bytes = (uint8_t*)r.getLengthPtr() - messages.begin();
|
||||
if (bytes) {
|
||||
TEST(true); // Splitting commit messages across multiple blocks
|
||||
CODE_PROBE(true, "Splitting commit messages across multiple blocks");
|
||||
messages1 = StringRef(block.end(), bytes);
|
||||
block.append(block.arena(), messages.begin(), bytes);
|
||||
self->messageBlocks.emplace_back(version, block);
|
||||
|
@ -1047,7 +1047,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
}
|
||||
if (sequenceData.isSet()) {
|
||||
if (sequenceData.getFuture().get() != rep.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -1120,7 +1120,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
auto& sequenceData = trackerData.sequence_version[sequence + 1];
|
||||
if (sequenceData.isSet()) {
|
||||
if (sequenceData.getFuture().get() != reply.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version (2)
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -1467,7 +1467,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
|
|||
if (!fFormat.get().present()) {
|
||||
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
|
||||
if (!v.size()) {
|
||||
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
|
||||
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
|
||||
throw worker_removed();
|
||||
} else {
|
||||
// This should never happen
|
||||
|
@ -1553,7 +1553,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
|
|||
try {
|
||||
loop {
|
||||
if (allRemoved.isReady()) {
|
||||
TEST(true); // all tlogs removed during queue recovery
|
||||
CODE_PROBE(true, "all tlogs removed during queue recovery");
|
||||
throw worker_removed();
|
||||
}
|
||||
choose {
|
||||
|
@ -1586,7 +1586,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
|
|||
logData->queueCommittedVersion.set(qe.version);
|
||||
|
||||
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
|
||||
TEST(true); // Flush excess data during TLog queue recovery
|
||||
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
|
||||
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
|
||||
.detail("BytesInput", self->bytesInput)
|
||||
.detail("BytesDurable", self->bytesDurable)
|
||||
|
@ -1610,7 +1610,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
|
|||
}
|
||||
|
||||
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
|
||||
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
|
||||
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
|
||||
|
||||
for (auto it : self->id_data) {
|
||||
if (it.second->queueCommittedVersion.get() == 0) {
|
||||
|
|
|
@ -148,7 +148,7 @@ private:
|
|||
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
|
||||
if (h.size() != sizeof(uint32_t)) {
|
||||
if (h.size()) {
|
||||
TEST(true); // Zero fill within size field
|
||||
CODE_PROBE(true, "Zero fill within size field");
|
||||
int payloadSize = 0;
|
||||
memcpy(&payloadSize, h.begin(), h.size());
|
||||
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
|
||||
|
@ -162,7 +162,7 @@ private:
|
|||
|
||||
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
|
||||
if (e.size() != payloadSize + 1) {
|
||||
TEST(true); // Zero fill within payload
|
||||
CODE_PROBE(true, "Zero fill within payload");
|
||||
zeroFillSize = payloadSize + 1 - e.size();
|
||||
break;
|
||||
}
|
||||
|
@ -176,7 +176,7 @@ private:
|
|||
}
|
||||
}
|
||||
if (zeroFillSize) {
|
||||
TEST(true); // Fixing a partial commit at the end of the tlog queue
|
||||
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
|
||||
for (int i = 0; i < zeroFillSize; i++)
|
||||
self->queue->push(StringRef((const uint8_t*)"", 1));
|
||||
}
|
||||
|
@ -653,9 +653,9 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result, TLogData* tLog)
|
|||
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
|
||||
state Version stopVersion = logData->version.get();
|
||||
|
||||
TEST(true); // TLog stopped by recovering master
|
||||
TEST(logData->stopped); // logData already stopped
|
||||
TEST(!logData->stopped); // logData not yet stopped
|
||||
CODE_PROBE(true, "TLog stopped by recovering master");
|
||||
CODE_PROBE(logData->stopped, "logData already stopped");
|
||||
CODE_PROBE(!logData->stopped, "logData not yet stopped");
|
||||
|
||||
TraceEvent("TLogStop", logData->logId)
|
||||
.detail("Ver", stopVersion)
|
||||
|
@ -769,7 +769,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
|
|||
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
|
||||
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
|
||||
|
||||
TEST(anyData); // TLog moved data to persistentData
|
||||
CODE_PROBE(anyData, "TLog moved data to persistentData");
|
||||
logData->persistentDataDurableVersion = newPersistentDataVersion;
|
||||
|
||||
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
||||
|
@ -1341,7 +1341,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
}
|
||||
if (sequenceData.isSet()) {
|
||||
if (sequenceData.getFuture().get().first != rep.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -1439,7 +1439,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
if (sequenceData.isSet()) {
|
||||
trackerData.duplicatePeeks++;
|
||||
if (sequenceData.getFuture().get().first != reply.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version (2)
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -1546,7 +1546,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
|
|||
.detail("LogId", logData->logId)
|
||||
.detail("Version", it->version.get())
|
||||
.detail("QueueVer", it->queueCommittedVersion.get());
|
||||
TEST(true); // A TLog was replaced before having a chance to commit its queue
|
||||
CODE_PROBE(true, "A TLog was replaced before having a chance to commit its queue");
|
||||
it->queueCommittedVersion.set(it->version.get());
|
||||
}
|
||||
return Void();
|
||||
|
@ -2007,7 +2007,7 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
|
|||
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
|
||||
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
|
||||
ASSERT(logData->isPrimary);
|
||||
TEST(logData->stopped); // TLogCommitRequest while stopped
|
||||
CODE_PROBE(logData->stopped, "TLogCommitRequest while stopped");
|
||||
if (!logData->stopped)
|
||||
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
|
||||
else
|
||||
|
@ -2333,7 +2333,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
if (!fFormat.get().present()) {
|
||||
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
|
||||
if (!v.size()) {
|
||||
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
|
||||
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
|
||||
throw worker_removed();
|
||||
} else {
|
||||
// This should never happen
|
||||
|
@ -2473,7 +2473,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
try {
|
||||
loop {
|
||||
if (allRemoved.isReady()) {
|
||||
TEST(true); // all tlogs removed during queue recovery
|
||||
CODE_PROBE(true, "all tlogs removed during queue recovery");
|
||||
throw worker_removed();
|
||||
}
|
||||
choose {
|
||||
|
@ -2503,7 +2503,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
logData->queueCommittedVersion.set(qe.version);
|
||||
|
||||
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
|
||||
TEST(true); // Flush excess data during TLog queue recovery
|
||||
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
|
||||
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
|
||||
.detail("BytesInput", self->bytesInput)
|
||||
.detail("BytesDurable", self->bytesDurable)
|
||||
|
@ -2527,7 +2527,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
}
|
||||
|
||||
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
|
||||
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
|
||||
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
|
||||
|
||||
for (auto it : self->id_data) {
|
||||
if (it.second->queueCommittedVersion.get() == 0) {
|
||||
|
|
|
@ -156,7 +156,7 @@ private:
|
|||
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
|
||||
if (h.size() != sizeof(uint32_t)) {
|
||||
if (h.size()) {
|
||||
TEST(true); // Zero fill within size field
|
||||
CODE_PROBE(true, "Zero fill within size field");
|
||||
int payloadSize = 0;
|
||||
memcpy(&payloadSize, h.begin(), h.size());
|
||||
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
|
||||
|
@ -170,7 +170,7 @@ private:
|
|||
|
||||
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
|
||||
if (e.size() != payloadSize + 1) {
|
||||
TEST(true); // Zero fill within payload
|
||||
CODE_PROBE(true, "Zero fill within payload");
|
||||
zeroFillSize = payloadSize + 1 - e.size();
|
||||
break;
|
||||
}
|
||||
|
@ -186,7 +186,7 @@ private:
|
|||
}
|
||||
}
|
||||
if (zeroFillSize) {
|
||||
TEST(true); // Fixing a partial commit at the end of the tlog queue
|
||||
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
|
||||
for (int i = 0; i < zeroFillSize; i++)
|
||||
self->queue->push(StringRef((const uint8_t*)"", 1));
|
||||
}
|
||||
|
@ -756,9 +756,9 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result,
|
|||
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
|
||||
state Version stopVersion = logData->version.get();
|
||||
|
||||
TEST(true); // TLog stopped by recovering master
|
||||
TEST(logData->stopped); // logData already stopped
|
||||
TEST(!logData->stopped); // logData not yet stopped
|
||||
CODE_PROBE(true, "TLog stopped by recovering master");
|
||||
CODE_PROBE(logData->stopped, "logData already stopped");
|
||||
CODE_PROBE(!logData->stopped, "logData not yet stopped");
|
||||
|
||||
TraceEvent("TLogStop", logData->logId)
|
||||
.detail("Ver", stopVersion)
|
||||
|
@ -1042,7 +1042,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
|
|||
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
|
||||
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
|
||||
|
||||
TEST(anyData); // TLog moved data to persistentData
|
||||
CODE_PROBE(anyData, "TLog moved data to persistentData");
|
||||
logData->persistentDataDurableVersion = newPersistentDataVersion;
|
||||
|
||||
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
||||
|
@ -1680,7 +1680,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
}
|
||||
if (sequenceData.isSet()) {
|
||||
if (sequenceData.getFuture().get().first != rep.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -1868,7 +1868,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
if (sequenceData.isSet()) {
|
||||
trackerData.duplicatePeeks++;
|
||||
if (sequenceData.getFuture().get().first != reply.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version (2)
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -1930,7 +1930,7 @@ ACTOR Future<Void> watchDegraded(TLogData* self) {
|
|||
wait(lowPriorityDelay(SERVER_KNOBS->TLOG_DEGRADED_DURATION));
|
||||
|
||||
TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid).log();
|
||||
TEST(true); // TLog degraded
|
||||
CODE_PROBE(true, "TLog degraded");
|
||||
self->degraded->set(true);
|
||||
return Void();
|
||||
}
|
||||
|
@ -1988,7 +1988,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
|
|||
.detail("LogId", logData->logId)
|
||||
.detail("Version", it->version.get())
|
||||
.detail("QueueVer", it->queueCommittedVersion.get());
|
||||
TEST(true); // A TLog was replaced before having a chance to commit its queue
|
||||
CODE_PROBE(true, "A TLog was replaced before having a chance to commit its queue");
|
||||
it->queueCommittedVersion.set(it->version.get());
|
||||
}
|
||||
return Void();
|
||||
|
@ -2452,7 +2452,7 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
|
|||
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
|
||||
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
|
||||
ASSERT(logData->isPrimary);
|
||||
TEST(logData->stopped); // TLogCommitRequest while stopped
|
||||
CODE_PROBE(logData->stopped, "TLogCommitRequest while stopped");
|
||||
if (!logData->stopped)
|
||||
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
|
||||
else
|
||||
|
@ -2801,7 +2801,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
if (!fFormat.get().present()) {
|
||||
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
|
||||
if (!v.size()) {
|
||||
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
|
||||
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
|
||||
throw worker_removed();
|
||||
} else {
|
||||
// This should never happen
|
||||
|
@ -2949,7 +2949,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
throw end_of_stream();
|
||||
loop {
|
||||
if (allRemoved.isReady()) {
|
||||
TEST(true); // all tlogs removed during queue recovery
|
||||
CODE_PROBE(true, "all tlogs removed during queue recovery");
|
||||
throw worker_removed();
|
||||
}
|
||||
choose {
|
||||
|
@ -2980,7 +2980,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
logData->queueCommittedVersion.set(qe.version);
|
||||
|
||||
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
|
||||
TEST(true); // Flush excess data during TLog queue recovery
|
||||
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
|
||||
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
|
||||
.detail("LogId", logData->logId)
|
||||
.detail("BytesInput", self->bytesInput)
|
||||
|
@ -3010,7 +3010,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
}
|
||||
|
||||
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
|
||||
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
|
||||
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
|
||||
|
||||
for (auto it : self->id_data) {
|
||||
if (it.second->queueCommittedVersion.get() == 0) {
|
||||
|
|
|
@ -449,7 +449,7 @@ class PaxosConfigConsumerImpl {
|
|||
if (e.code() == error_code_version_already_compacted || e.code() == error_code_timed_out ||
|
||||
e.code() == error_code_failed_to_reach_quorum || e.code() == error_code_version_already_compacted ||
|
||||
e.code() == error_code_process_behind) {
|
||||
TEST(true); // PaxosConfigConsumer get version_already_compacted error
|
||||
CODE_PROBE(true, "PaxosConfigConsumer get version_already_compacted error");
|
||||
if (e.code() == error_code_failed_to_reach_quorum) {
|
||||
try {
|
||||
wait(self->getCommittedVersionQuorum.complete());
|
||||
|
|
|
@ -325,7 +325,7 @@ public:
|
|||
reply.throttledTags = self.tagThrottler->getClientRates();
|
||||
bool returningTagsToProxy =
|
||||
reply.throttledTags.present() && reply.throttledTags.get().size() > 0;
|
||||
TEST(returningTagsToProxy); // Returning tag throttles to a proxy
|
||||
CODE_PROBE(returningTagsToProxy, "Returning tag throttles to a proxy");
|
||||
}
|
||||
|
||||
reply.healthMetrics.update(self.healthMetrics, true, req.detailed);
|
||||
|
|
|
@ -39,7 +39,7 @@ void ResolutionBalancer::setChangesInReply(UID requestingProxy, GetCommitVersion
|
|||
rep.resolverChangesVersion = resolverChangesVersion;
|
||||
resolverNeedingChanges.erase(requestingProxy);
|
||||
|
||||
TEST(!rep.resolverChanges.empty()); // resolution balancing moves keyranges
|
||||
CODE_PROBE(!rep.resolverChanges.empty(), "resolution balancing moves keyranges");
|
||||
if (resolverNeedingChanges.empty())
|
||||
resolverChanges.set(Standalone<VectorRef<ResolverMoveRef>>());
|
||||
}
|
||||
|
|
|
@ -350,7 +350,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
|
||||
applyMetadataMutations(spanContext, *resolverData, req.transactions[t].mutations);
|
||||
}
|
||||
TEST(self->forceRecovery); // Resolver detects forced recovery
|
||||
CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery");
|
||||
}
|
||||
|
||||
self->resolvedStateTransactions += req.txnStateTransactions.size();
|
||||
|
@ -362,7 +362,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
ASSERT(req.version >= firstUnseenVersion);
|
||||
ASSERT(firstUnseenVersion >= self->debugMinRecentStateVersion);
|
||||
|
||||
TEST(firstUnseenVersion == req.version); // Resolver first unseen version is current version
|
||||
CODE_PROBE(firstUnseenVersion == req.version, "Resolver first unseen version is current version");
|
||||
|
||||
// If shardChanged at or before this commit version, the proxy may have computed
|
||||
// the wrong set of groups. Then we need to broadcast to all groups below.
|
||||
|
@ -400,13 +400,14 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
}
|
||||
}
|
||||
|
||||
TEST(oldestProxyVersion == req.version); // The proxy that sent this request has the oldest current version
|
||||
TEST(oldestProxyVersion !=
|
||||
req.version); // The proxy that sent this request does not have the oldest current version
|
||||
CODE_PROBE(oldestProxyVersion == req.version,
|
||||
"The proxy that sent this request has the oldest current version");
|
||||
CODE_PROBE(oldestProxyVersion != req.version,
|
||||
"The proxy that sent this request does not have the oldest current version");
|
||||
|
||||
bool anyPopped = false;
|
||||
if (firstUnseenVersion <= oldestProxyVersion && self->proxyInfoMap.size() == self->commitProxyCount + 1) {
|
||||
TEST(true); // Deleting old state transactions
|
||||
CODE_PROBE(true, "Deleting old state transactions");
|
||||
int64_t erasedBytes = self->recentStateTransactionsInfo.eraseUpTo(oldestProxyVersion);
|
||||
self->debugMinRecentStateVersion = oldestProxyVersion + 1;
|
||||
anyPopped = erasedBytes > 0;
|
||||
|
@ -445,7 +446,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
if (req.debugID.present())
|
||||
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After");
|
||||
} else {
|
||||
TEST(true); // Duplicate resolve batch request
|
||||
CODE_PROBE(true, "Duplicate resolve batch request");
|
||||
//TraceEvent("DupResolveBatchReq", self->dbgid).detail("From", proxyAddress);
|
||||
}
|
||||
|
||||
|
@ -456,13 +457,13 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
|
|||
if (batchItr != proxyInfoItr->second.outstandingBatches.end()) {
|
||||
req.reply.send(batchItr->second);
|
||||
} else {
|
||||
TEST(true); // No outstanding batches for version on proxy
|
||||
CODE_PROBE(true, "No outstanding batches for version on proxy");
|
||||
req.reply.send(Never());
|
||||
}
|
||||
} else {
|
||||
ASSERT_WE_THINK(false); // The first non-duplicate request with this proxyAddress, including this one, should
|
||||
// have inserted this item in the map!
|
||||
// TEST(true); // No prior proxy requests
|
||||
// CODE_PROBE(true, "No prior proxy requests");
|
||||
req.reply.send(Never());
|
||||
}
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ Optional<double> RkTagThrottleCollection::RkTagThrottleData::updateAndGetClientR
|
|||
ASSERT_GE(rate, 0);
|
||||
return rate;
|
||||
} else {
|
||||
TEST(true); // Get throttle rate for expired throttle
|
||||
CODE_PROBE(true, "Get throttle rate for expired throttle");
|
||||
rateSet = false;
|
||||
return Optional<double>();
|
||||
}
|
||||
|
@ -92,14 +92,14 @@ Optional<double> RkTagThrottleCollection::autoThrottleTag(UID id,
|
|||
bool present = (itr != autoThrottledTags.end());
|
||||
if (!present) {
|
||||
if (autoThrottledTags.size() >= SERVER_KNOBS->MAX_AUTO_THROTTLED_TRANSACTION_TAGS) {
|
||||
TEST(true); // Reached auto-throttle limit
|
||||
CODE_PROBE(true, "Reached auto-throttle limit");
|
||||
return Optional<double>();
|
||||
}
|
||||
|
||||
itr = autoThrottledTags.try_emplace(tag).first;
|
||||
initializeTag(tag);
|
||||
} else if (itr->second.limits.expiration <= now()) {
|
||||
TEST(true); // Re-throttling expired tag that hasn't been cleaned up
|
||||
CODE_PROBE(true, "Re-throttling expired tag that hasn't been cleaned up");
|
||||
present = false;
|
||||
itr->second = RkTagThrottleData();
|
||||
}
|
||||
|
@ -113,7 +113,7 @@ Optional<double> RkTagThrottleCollection::autoThrottleTag(UID id,
|
|||
return Optional<double>();
|
||||
}
|
||||
} else if (now() <= throttle.lastUpdated + SERVER_KNOBS->AUTO_TAG_THROTTLE_UPDATE_FREQUENCY) {
|
||||
TEST(true); // Tag auto-throttled too quickly
|
||||
CODE_PROBE(true, "Tag auto-throttled too quickly");
|
||||
return Optional<double>();
|
||||
} else {
|
||||
tpsRate = computeTargetTpsRate(fractionalBusyness,
|
||||
|
@ -121,7 +121,7 @@ Optional<double> RkTagThrottleCollection::autoThrottleTag(UID id,
|
|||
tagData[tag].requestRate.smoothRate());
|
||||
|
||||
if (throttle.limits.expiration > now() && tpsRate.get() >= throttle.limits.tpsRate) {
|
||||
TEST(true); // Tag auto-throttle rate increase attempt while active
|
||||
CODE_PROBE(true, "Tag auto-throttle rate increase attempt while active");
|
||||
return Optional<double>();
|
||||
}
|
||||
|
||||
|
@ -176,14 +176,14 @@ void RkTagThrottleCollection::manualThrottleTag(UID id,
|
|||
result.first->second.limits.expiration = expiration;
|
||||
|
||||
if (!oldLimits.present()) {
|
||||
TEST(true); // Transaction tag manually throttled
|
||||
CODE_PROBE(true, "Transaction tag manually throttled");
|
||||
TraceEvent("RatekeeperAddingManualThrottle", id)
|
||||
.detail("Tag", tag)
|
||||
.detail("Rate", tpsRate)
|
||||
.detail("Priority", transactionPriorityToString(priority))
|
||||
.detail("SecondsToExpiration", expiration - now());
|
||||
} else if (oldLimits.get().tpsRate != tpsRate || oldLimits.get().expiration != expiration) {
|
||||
TEST(true); // Manual transaction tag throttle updated
|
||||
CODE_PROBE(true, "Manual transaction tag throttle updated");
|
||||
TraceEvent("RatekeeperUpdatingManualThrottle", id)
|
||||
.detail("Tag", tag)
|
||||
.detail("Rate", tpsRate)
|
||||
|
@ -225,14 +225,14 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
|
|||
if (priorityItr != manualItr->second.end()) {
|
||||
Optional<double> priorityClientRate = priorityItr->second.updateAndGetClientRate(requestRate);
|
||||
if (!priorityClientRate.present()) {
|
||||
TEST(true); // Manual priority throttle expired
|
||||
CODE_PROBE(true, "Manual priority throttle expired");
|
||||
priorityItr = manualItr->second.erase(priorityItr);
|
||||
} else {
|
||||
if (!manualClientRate.present() || manualClientRate.get().tpsRate > priorityClientRate.get()) {
|
||||
manualClientRate = ClientTagThrottleLimits(priorityClientRate.get(),
|
||||
priorityItr->second.limits.expiration);
|
||||
} else {
|
||||
TEST(true); // Manual throttle overriden by higher priority
|
||||
CODE_PROBE(true, "Manual throttle overriden by higher priority");
|
||||
}
|
||||
|
||||
++priorityItr;
|
||||
|
@ -241,13 +241,13 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
|
|||
|
||||
if (manualClientRate.present()) {
|
||||
tagPresent = true;
|
||||
TEST(true); // Using manual throttle
|
||||
CODE_PROBE(true, "Using manual throttle");
|
||||
clientRates[*priority][tagItr->first] = manualClientRate.get();
|
||||
}
|
||||
}
|
||||
|
||||
if (manualItr->second.empty()) {
|
||||
TEST(true); // All manual throttles expired
|
||||
CODE_PROBE(true, "All manual throttles expired");
|
||||
manualThrottledTags.erase(manualItr);
|
||||
break;
|
||||
}
|
||||
|
@ -261,7 +261,7 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
|
|||
double rampStartTime = autoItr->second.lastReduced + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION -
|
||||
SERVER_KNOBS->AUTO_TAG_THROTTLE_RAMP_UP_TIME;
|
||||
if (now() >= rampStartTime && adjustedRate != std::numeric_limits<double>::max()) {
|
||||
TEST(true); // Tag auto-throttle ramping up
|
||||
CODE_PROBE(true, "Tag auto-throttle ramping up");
|
||||
|
||||
double targetBusyness = SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS;
|
||||
if (targetBusyness == 0) {
|
||||
|
@ -280,14 +280,14 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
|
|||
if (!result.second && result.first->second.tpsRate > adjustedRate) {
|
||||
result.first->second = ClientTagThrottleLimits(adjustedRate, autoItr->second.limits.expiration);
|
||||
} else {
|
||||
TEST(true); // Auto throttle overriden by manual throttle
|
||||
CODE_PROBE(true, "Auto throttle overriden by manual throttle");
|
||||
}
|
||||
clientRates[TransactionPriority::BATCH][tagItr->first] =
|
||||
ClientTagThrottleLimits(0, autoItr->second.limits.expiration);
|
||||
}
|
||||
} else {
|
||||
ASSERT(autoItr->second.limits.expiration <= now());
|
||||
TEST(true); // Auto throttle expired
|
||||
CODE_PROBE(true, "Auto throttle expired");
|
||||
if (BUGGIFY) { // Temporarily extend the window between expiration and cleanup
|
||||
tagPresent = true;
|
||||
} else {
|
||||
|
@ -297,7 +297,7 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
|
|||
}
|
||||
|
||||
if (!tagPresent) {
|
||||
TEST(true); // All tag throttles expired
|
||||
CODE_PROBE(true, "All tag throttles expired");
|
||||
tagItr = tagData.erase(tagItr);
|
||||
} else {
|
||||
++tagItr;
|
||||
|
@ -309,7 +309,7 @@ PrioritizedTransactionTagMap<ClientTagThrottleLimits> RkTagThrottleCollection::g
|
|||
|
||||
void RkTagThrottleCollection::addRequests(TransactionTag const& tag, int requests) {
|
||||
if (requests > 0) {
|
||||
TEST(true); // Requests reported for throttled tag
|
||||
CODE_PROBE(true, "Requests reported for throttled tag");
|
||||
|
||||
auto tagItr = tagData.try_emplace(tag);
|
||||
tagItr.first->second.requestRate.addDelta(requests);
|
||||
|
|
|
@ -109,7 +109,7 @@ class SimpleConfigConsumerImpl {
|
|||
} catch (Error& e) {
|
||||
++self->failedChangeRequest;
|
||||
if (e.code() == error_code_version_already_compacted) {
|
||||
TEST(true); // SimpleConfigConsumer get version_already_compacted error
|
||||
CODE_PROBE(true, "SimpleConfigConsumer get version_already_compacted error");
|
||||
wait(getSnapshotAndChanges(self, broadcaster));
|
||||
} else {
|
||||
throw e;
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
#include "flow/network.h"
|
||||
#include "flow/TypeTraits.h"
|
||||
#include "flow/FaultInjection.h"
|
||||
#include "flow/CodeProbeUtils.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
#undef max
|
||||
|
@ -260,6 +261,9 @@ class TestConfig {
|
|||
if (attrib == "disableRemoteKVS") {
|
||||
disableRemoteKVS = strcmp(value.c_str(), "true") == 0;
|
||||
}
|
||||
if (attrib == "disableEncryption") {
|
||||
disableEncryption = strcmp(value.c_str(), "true") == 0;
|
||||
}
|
||||
if (attrib == "restartInfoLocation") {
|
||||
isFirstTestInRestart = true;
|
||||
}
|
||||
|
@ -297,6 +301,8 @@ public:
|
|||
bool disableHostname = false;
|
||||
// remote key value store is a child process spawned by the SS process to run the storage engine
|
||||
bool disableRemoteKVS = false;
|
||||
// 7.2 cannot be downgraded to 7.1 or below after enabling encryption-at-rest.
|
||||
bool disableEncryption = false;
|
||||
// Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig
|
||||
// 0 = "ssd"
|
||||
// 1 = "memory"
|
||||
|
@ -358,6 +364,7 @@ public:
|
|||
.add("disableTss", &disableTss)
|
||||
.add("disableHostname", &disableHostname)
|
||||
.add("disableRemoteKVS", &disableRemoteKVS)
|
||||
.add("disableEncryption", &disableEncryption)
|
||||
.add("simpleConfig", &simpleConfig)
|
||||
.add("generateFearless", &generateFearless)
|
||||
.add("datacenters", &datacenters)
|
||||
|
@ -839,9 +846,9 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
|||
.detail("Folder", myFolders[i]);
|
||||
}
|
||||
|
||||
TEST(bootCount >= 1); // Simulated machine rebooted
|
||||
TEST(bootCount >= 2); // Simulated machine rebooted twice
|
||||
TEST(bootCount >= 3); // Simulated machine rebooted three times
|
||||
CODE_PROBE(bootCount >= 1, "Simulated machine rebooted");
|
||||
CODE_PROBE(bootCount >= 2, "Simulated machine rebooted twice");
|
||||
CODE_PROBE(bootCount >= 3, "Simulated machine rebooted three times");
|
||||
++bootCount;
|
||||
|
||||
TraceEvent("SimulatedMachineStart", randomId)
|
||||
|
@ -961,7 +968,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
|||
for (int i = 1; i < ips.size(); i++)
|
||||
killType = std::max(processes[i].get(), killType);
|
||||
|
||||
TEST(true); // Simulated machine has been rebooted
|
||||
CODE_PROBE(true, "Simulated machine has been rebooted");
|
||||
|
||||
state bool swap = killType == ISimulator::Reboot && BUGGIFY_WITH_PROB(0.75) &&
|
||||
g_simulator.canSwapToMachine(localities.zoneId());
|
||||
|
@ -989,7 +996,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
|||
avail.pop_back();
|
||||
|
||||
if (myFolders != toRebootFrom) {
|
||||
TEST(true); // Simulated machine swapped data folders
|
||||
CODE_PROBE(true, "Simulated machine swapped data folders");
|
||||
TraceEvent("SimulatedMachineFolderSwap", randomId)
|
||||
.detail("OldFolder0", myFolders[0])
|
||||
.detail("NewFolder0", toRebootFrom[0])
|
||||
|
@ -1014,7 +1021,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
|||
}
|
||||
}
|
||||
|
||||
TEST(true); // Simulated machine rebooted with data loss
|
||||
CODE_PROBE(true, "Simulated machine rebooted with data loss");
|
||||
}
|
||||
|
||||
// this machine is rebooting = false;
|
||||
|
@ -1061,7 +1068,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
|
|||
// Randomly change data center id names to test that localities
|
||||
// can be modified on cluster restart
|
||||
bool renameZoneIds = testConfig.randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false;
|
||||
TEST(renameZoneIds); // Zone ID names altered in restart test
|
||||
CODE_PROBE(renameZoneIds, "Zone ID names altered in restart test");
|
||||
|
||||
// allows multiple ipAddr entries
|
||||
ini.SetMultiKey();
|
||||
|
@ -1091,10 +1098,15 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
|
|||
INetworkConnections::net()->parseMockDNSFromString(mockDNSStr);
|
||||
}
|
||||
}
|
||||
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
|
||||
if (testConfig.disableRemoteKVS) {
|
||||
IKnobCollection::getMutableGlobalKnobCollection().setKnob("remote_kv_store",
|
||||
KnobValueRef::create(bool{ false }));
|
||||
TraceEvent(SevDebug, "DisaableRemoteKVS").log();
|
||||
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
|
||||
TraceEvent(SevDebug, "DisableRemoteKVS");
|
||||
}
|
||||
if (testConfig.disableEncryption) {
|
||||
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
|
||||
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
|
||||
TraceEvent(SevDebug, "DisableEncryption");
|
||||
}
|
||||
*pConnString = conn;
|
||||
*pTesterCount = testerCount;
|
||||
|
@ -1386,27 +1398,27 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
|
|||
|
||||
switch (storage_engine_type) {
|
||||
case 0: {
|
||||
TEST(true); // Simulated cluster using ssd storage engine
|
||||
CODE_PROBE(true, "Simulated cluster using ssd storage engine");
|
||||
set_config("ssd");
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
TEST(true); // Simulated cluster using default memory storage engine
|
||||
CODE_PROBE(true, "Simulated cluster using default memory storage engine");
|
||||
set_config("memory");
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
TEST(true); // Simulated cluster using radix-tree storage engine
|
||||
CODE_PROBE(true, "Simulated cluster using radix-tree storage engine");
|
||||
set_config("memory-radixtree-beta");
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
TEST(true); // Simulated cluster using redwood storage engine
|
||||
CODE_PROBE(true, "Simulated cluster using redwood storage engine");
|
||||
set_config("ssd-redwood-1-experimental");
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
TEST(true); // Simulated cluster using RocksDB storage engine
|
||||
CODE_PROBE(true, "Simulated cluster using RocksDB storage engine");
|
||||
set_config("ssd-rocksdb-v1");
|
||||
// Tests using the RocksDB engine are necessarily non-deterministic because of RocksDB
|
||||
// background threads.
|
||||
|
@ -1416,7 +1428,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
|
|||
break;
|
||||
}
|
||||
case 5: {
|
||||
TEST(true); // Simulated cluster using Sharded RocksDB storage engine
|
||||
CODE_PROBE(true, "Simulated cluster using Sharded RocksDB storage engine");
|
||||
set_config("ssd-sharded-rocksdb");
|
||||
// Tests using the RocksDB engine are necessarily non-deterministic because of RocksDB
|
||||
// background threads.
|
||||
|
@ -1442,7 +1454,7 @@ void SimulationConfig::setReplicationType(const TestConfig& testConfig) {
|
|||
} else {
|
||||
switch (replication_type) {
|
||||
case 0: {
|
||||
TEST(true); // Simulated cluster using custom redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using custom redundancy mode");
|
||||
int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5);
|
||||
// FIXME: log replicas must be more than storage replicas because otherwise better master exists will not
|
||||
// recognize it needs to change dcs
|
||||
|
@ -1461,21 +1473,21 @@ void SimulationConfig::setReplicationType(const TestConfig& testConfig) {
|
|||
break;
|
||||
}
|
||||
case 1: {
|
||||
TEST(true); // Simulated cluster running in single redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster running in single redundancy mode");
|
||||
set_config("single");
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
TEST(true); // Simulated cluster running in double redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster running in double redundancy mode");
|
||||
set_config("double");
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
if (datacenters <= 2 || generateFearless) {
|
||||
TEST(true); // Simulated cluster running in triple redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster running in triple redundancy mode");
|
||||
set_config("triple");
|
||||
} else if (datacenters == 3) {
|
||||
TEST(true); // Simulated cluster running in 3 data-hall mode
|
||||
CODE_PROBE(true, "Simulated cluster running in 3 data-hall mode");
|
||||
set_config("three_data_hall");
|
||||
} else {
|
||||
ASSERT(false);
|
||||
|
@ -1526,17 +1538,17 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
|
|||
int satellite_replication_type = deterministicRandom()->randomInt(0, 3);
|
||||
switch (satellite_replication_type) {
|
||||
case 0: {
|
||||
TEST(true); // Simulated cluster using no satellite redundancy mode (>4 datacenters)
|
||||
CODE_PROBE(true, "Simulated cluster using no satellite redundancy mode (>4 datacenters)");
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
TEST(true); // Simulated cluster using two satellite fast redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using two satellite fast redundancy mode");
|
||||
primaryObj["satellite_redundancy_mode"] = "two_satellite_fast";
|
||||
remoteObj["satellite_redundancy_mode"] = "two_satellite_fast";
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
TEST(true); // Simulated cluster using two satellite safe redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using two satellite safe redundancy mode");
|
||||
primaryObj["satellite_redundancy_mode"] = "two_satellite_safe";
|
||||
remoteObj["satellite_redundancy_mode"] = "two_satellite_safe";
|
||||
break;
|
||||
|
@ -1549,27 +1561,27 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
|
|||
switch (satellite_replication_type) {
|
||||
case 0: {
|
||||
// FIXME: implement
|
||||
TEST(true); // Simulated cluster using custom satellite redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using custom satellite redundancy mode");
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
TEST(true); // Simulated cluster using no satellite redundancy mode (<4 datacenters)
|
||||
CODE_PROBE(true, "Simulated cluster using no satellite redundancy mode (<4 datacenters)");
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
TEST(true); // Simulated cluster using single satellite redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using single satellite redundancy mode");
|
||||
primaryObj["satellite_redundancy_mode"] = "one_satellite_single";
|
||||
remoteObj["satellite_redundancy_mode"] = "one_satellite_single";
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
TEST(true); // Simulated cluster using double satellite redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using double satellite redundancy mode");
|
||||
primaryObj["satellite_redundancy_mode"] = "one_satellite_double";
|
||||
remoteObj["satellite_redundancy_mode"] = "one_satellite_double";
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
TEST(true); // Simulated cluster using triple satellite redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using triple satellite redundancy mode");
|
||||
primaryObj["satellite_redundancy_mode"] = "one_satellite_triple";
|
||||
remoteObj["satellite_redundancy_mode"] = "one_satellite_triple";
|
||||
break;
|
||||
|
@ -1589,10 +1601,10 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
|
|||
if (testConfig.minimumRegions <= 1 &&
|
||||
(deterministicRandom()->random01() < 0.25 ||
|
||||
SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < SERVER_KNOBS->VERSIONS_PER_SECOND)) {
|
||||
TEST(true); // Simulated cluster using one region
|
||||
CODE_PROBE(true, "Simulated cluster using one region");
|
||||
needsRemote = false;
|
||||
} else {
|
||||
TEST(true); // Simulated cluster using two regions
|
||||
CODE_PROBE(true, "Simulated cluster using two regions");
|
||||
db.usableRegions = 2;
|
||||
}
|
||||
|
||||
|
@ -1600,25 +1612,25 @@ void SimulationConfig::setRegions(const TestConfig& testConfig) {
|
|||
switch (remote_replication_type) {
|
||||
case 0: {
|
||||
// FIXME: implement
|
||||
TEST(true); // Simulated cluster using custom remote redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using custom remote redundancy mode");
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
TEST(true); // Simulated cluster using default remote redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using default remote redundancy mode");
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
TEST(true); // Simulated cluster using single remote redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using single remote redundancy mode");
|
||||
set_config("remote_single");
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
TEST(true); // Simulated cluster using double remote redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using double remote redundancy mode");
|
||||
set_config("remote_double");
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
TEST(true); // Simulated cluster using triple remote redundancy mode
|
||||
CODE_PROBE(true, "Simulated cluster using triple remote redundancy mode");
|
||||
set_config("remote_triple");
|
||||
break;
|
||||
}
|
||||
|
@ -1860,10 +1872,15 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
|
|||
if (testConfig.configureLocked) {
|
||||
startingConfigString += " locked";
|
||||
}
|
||||
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
|
||||
if (testConfig.disableRemoteKVS) {
|
||||
IKnobCollection::getMutableGlobalKnobCollection().setKnob("remote_kv_store",
|
||||
KnobValueRef::create(bool{ false }));
|
||||
TraceEvent(SevDebug, "DisaableRemoteKVS").log();
|
||||
g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false }));
|
||||
TraceEvent(SevDebug, "DisableRemoteKVS");
|
||||
}
|
||||
if (testConfig.disableEncryption) {
|
||||
g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false }));
|
||||
g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false }));
|
||||
TraceEvent(SevDebug, "DisableEncryption");
|
||||
}
|
||||
auto configDBType = testConfig.getConfigDBType();
|
||||
for (auto kv : startingConfigJSON) {
|
||||
|
@ -1967,18 +1984,18 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
|
|||
bool sslOnly = sslEnabled && deterministicRandom()->coinflip();
|
||||
bool isTLS = sslEnabled && sslOnly;
|
||||
g_simulator.listenersPerProcess = sslEnabled && !sslOnly ? 2 : 1;
|
||||
TEST(sslEnabled); // SSL enabled
|
||||
TEST(!sslEnabled); // SSL disabled
|
||||
CODE_PROBE(sslEnabled, "SSL enabled");
|
||||
CODE_PROBE(!sslEnabled, "SSL disabled");
|
||||
|
||||
// Use IPv6 25% of the time
|
||||
bool useIPv6 = deterministicRandom()->random01() < 0.25;
|
||||
TEST(useIPv6); // Use IPv6
|
||||
TEST(!useIPv6); // Use IPv4
|
||||
CODE_PROBE(useIPv6, "Use IPv6");
|
||||
CODE_PROBE(!useIPv6, "Use IPv4");
|
||||
|
||||
// Use hostname 25% of the time, unless it is disabled
|
||||
bool useHostname = !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
|
||||
TEST(useHostname); // Use hostname
|
||||
TEST(!useHostname); // Use IP address
|
||||
CODE_PROBE(useHostname, "Use hostname");
|
||||
CODE_PROBE(!useHostname, "Use IP address");
|
||||
NetworkAddressFromHostname fromHostname =
|
||||
useHostname ? NetworkAddressFromHostname::True : NetworkAddressFromHostname::False;
|
||||
|
||||
|
@ -2414,7 +2431,7 @@ ACTOR void setupAndRun(std::string dataFolder,
|
|||
wait(g_simulator.onProcess(testSystem, TaskPriority::DefaultYield));
|
||||
Sim2FileSystem::newFileSystem();
|
||||
FlowTransport::createInstance(true, 1, WLTOKEN_RESERVED_COUNT, &allowList);
|
||||
TEST(true); // Simulation start
|
||||
CODE_PROBE(true, "Simulation start");
|
||||
|
||||
state Optional<TenantName> defaultTenant;
|
||||
state Standalone<VectorRef<TenantNameRef>> tenantsToCreate;
|
||||
|
@ -2491,6 +2508,8 @@ ACTOR void setupAndRun(std::string dataFolder,
|
|||
TraceEvent(SevError, "SetupAndRunError").error(e);
|
||||
}
|
||||
|
||||
TraceEvent("TracingMissingCodeProbes").log();
|
||||
probe::traceMissedProbes(probe::ExecutionContext::Simulation);
|
||||
TraceEvent("SimulatedSystemDestruct").log();
|
||||
g_simulator.stop();
|
||||
destructed = true;
|
||||
|
|
|
@ -817,7 +817,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
|
|||
roles.addRole("blob_manager", db->get().blobManager.get());
|
||||
}
|
||||
|
||||
if ((SERVER_KNOBS->ENABLE_ENCRYPTION || g_network->isSimulated()) && db->get().encryptKeyProxy.present()) {
|
||||
if (SERVER_KNOBS->ENABLE_ENCRYPTION && db->get().encryptKeyProxy.present()) {
|
||||
roles.addRole("encrypt_key_proxy", db->get().encryptKeyProxy.get());
|
||||
}
|
||||
|
||||
|
|
|
@ -282,7 +282,7 @@ public:
|
|||
void checkChangeCounter(uint64_t oldCacheRangeChangeCounter, KeyRef const& key) {
|
||||
if (oldCacheRangeChangeCounter != cacheRangeChangeCounter &&
|
||||
cachedRangeMap[key]->changeCounter > oldCacheRangeChangeCounter) {
|
||||
TEST(true); // CacheRange change during getValueQ
|
||||
CODE_PROBE(true, "CacheRange change during getValueQ");
|
||||
// TODO: should we throw the cold_cache_server() error here instead?
|
||||
throw wrong_shard_server();
|
||||
}
|
||||
|
@ -293,7 +293,7 @@ public:
|
|||
auto sh = cachedRangeMap.intersectingRanges(keys);
|
||||
for (auto i = sh.begin(); i != sh.end(); ++i)
|
||||
if (i->value()->changeCounter > oldCacheRangeChangeCounter) {
|
||||
TEST(true); // CacheRange change during range operation
|
||||
CODE_PROBE(true, "CacheRange change during range operation");
|
||||
// TODO: should we throw the cold_cache_server() error here instead?
|
||||
throw wrong_shard_server();
|
||||
}
|
||||
|
@ -472,7 +472,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
|
|||
try {
|
||||
++data->counters.getValueQueries;
|
||||
++data->counters.allQueries;
|
||||
//++data->readQueueSizeMetric;
|
||||
// TODO later
|
||||
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
|
||||
// data->counters.finishedQueries.getValue());
|
||||
|
@ -544,7 +543,6 @@ ACTOR Future<Void> getValueQ(StorageCacheData* data, GetValueRequest req) {
|
|||
}
|
||||
|
||||
++data->counters.finishedQueries;
|
||||
//--data->readQueueSizeMetric;
|
||||
// if(data->latencyBandConfig.present()) {
|
||||
// int maxReadBytes =
|
||||
// data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
|
||||
|
@ -665,7 +663,7 @@ Key findKey(StorageCacheData* data, KeySelectorRef sel, Version version, KeyRang
|
|||
// If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in
|
||||
// a loop
|
||||
if (more && !forward && rep.data.size() == 1) {
|
||||
TEST(true); // Reverse key selector returned only one result in range read
|
||||
CODE_PROBE(true, "Reverse key selector returned only one result in range read");
|
||||
maxBytes = std::numeric_limits<int>::max();
|
||||
GetKeyValuesReply rep2 =
|
||||
readRange(data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes);
|
||||
|
@ -688,7 +686,7 @@ Key findKey(StorageCacheData* data, KeySelectorRef sel, Version version, KeyRang
|
|||
*pOffset = -*pOffset;
|
||||
|
||||
if (more) {
|
||||
TEST(true); // Key selector read range had more results
|
||||
CODE_PROBE(true, "Key selector read range had more results");
|
||||
|
||||
ASSERT(rep.data.size());
|
||||
Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key;
|
||||
|
@ -728,7 +726,6 @@ ACTOR Future<Void> getKeyValues(StorageCacheData* data, GetKeyValuesRequest req)
|
|||
++data->counters.getRangeQueries;
|
||||
++data->counters.allQueries;
|
||||
// printf("\nSCGetKeyValues\n");
|
||||
//++data->readQueueSizeMetric;
|
||||
// data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() -
|
||||
// data->counters.finishedQueries.getValue());
|
||||
|
||||
|
@ -781,7 +778,7 @@ ACTOR Future<Void> getKeyValues(StorageCacheData* data, GetKeyValuesRequest req)
|
|||
// cachedKeyRange is the end the last actual key returned must be from this cachedKeyRange. A begin offset of 1
|
||||
// is also OK because then either begin is past end or equal to end (so the result is definitely empty)
|
||||
if ((offset1 && offset1 != 1) || (offset2 && offset2 != 1)) {
|
||||
TEST(true); // wrong_cache_server due to offset
|
||||
CODE_PROBE(true, "wrong_cache_server due to offset");
|
||||
// We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end,
|
||||
// and return a clipped range rather than an error (since that is what the NativeAPI.getRange will do anyway
|
||||
// via its "slow path"), but we would have to add some flags to the response to encode whether we went off
|
||||
|
@ -943,7 +940,7 @@ bool expandMutation(MutationRef& m, StorageCacheData::VersionedData const& data,
|
|||
if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1)
|
||||
oldVal = it->getValue();
|
||||
else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) {
|
||||
TEST(true); // Atomic op right after a clear.
|
||||
CODE_PROBE(true, "Atomic op right after a clear.");
|
||||
}
|
||||
|
||||
switch (m.type) {
|
||||
|
@ -1073,7 +1070,7 @@ void splitMutation(StorageCacheData* data, KeyRangeMap<T>& map, MutationRef cons
|
|||
}
|
||||
|
||||
void rollback(StorageCacheData* data, Version rollbackVersion, Version nextVersion) {
|
||||
TEST(true); // call to cacheRange rollback
|
||||
CODE_PROBE(true, "call to cacheRange rollback");
|
||||
// FIXME: enable when debugKeyRange is active
|
||||
// debugKeyRange("Rollback", rollbackVersion, allKeys);
|
||||
|
||||
|
@ -1279,7 +1276,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
|
|||
lastAvailable = std::max(lastAvailable, r->value());
|
||||
|
||||
if (lastAvailable != invalidVersion && lastAvailable >= data->oldestVersion.get()) {
|
||||
TEST(true); // wait for oldest version
|
||||
CODE_PROBE(true, "wait for oldest version");
|
||||
wait(data->oldestVersion.whenAtLeast(lastAvailable + 1));
|
||||
}
|
||||
|
||||
|
@ -1318,7 +1315,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
|
|||
|
||||
loop {
|
||||
try {
|
||||
TEST(true); // Fetching keys for transferred cacheRange
|
||||
CODE_PROBE(true, "Fetching keys for transferred cacheRange");
|
||||
|
||||
state RangeResult this_block =
|
||||
wait(tryFetchRange(data->cx,
|
||||
|
@ -1382,7 +1379,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
|
|||
.suppressFor(1.0)
|
||||
.detail("FKID", interval.pairID);
|
||||
if (e.code() == error_code_transaction_too_old) {
|
||||
TEST(true); // A storage server has forgotten the history data we are fetching
|
||||
CODE_PROBE(true, "A storage server has forgotten the history data we are fetching");
|
||||
Version lastFV = fetchVersion;
|
||||
fetchVersion = data->version.get();
|
||||
isTooOld = false;
|
||||
|
@ -1409,8 +1406,9 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
|
|||
.detail("E", data->version.get());
|
||||
}
|
||||
} else if (e.code() == error_code_future_version || e.code() == error_code_process_behind) {
|
||||
TEST(true); // fetchKeys got future_version or process_behind, so there must be a huge storage lag
|
||||
// somewhere. Keep trying.
|
||||
CODE_PROBE(true,
|
||||
"fetchKeys got future_version or process_behind, so there must be a huge storage lag "
|
||||
"somewhere. Keep trying.");
|
||||
} else {
|
||||
throw;
|
||||
}
|
||||
|
@ -1470,7 +1468,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
|
|||
}
|
||||
|
||||
int startSize = batch->changes.size();
|
||||
TEST(startSize); // Adding fetch data to a batch which already has changes
|
||||
CODE_PROBE(startSize, "Adding fetch data to a batch which already has changes");
|
||||
batch->changes.resize(batch->changes.size() + cacheRange->updates.size());
|
||||
|
||||
// FIXME: pass the deque back rather than copy the data
|
||||
|
@ -1633,7 +1631,7 @@ void cacheWarmup(StorageCacheData* data, const KeyRangeRef& keys, bool nowAssign
|
|||
else {
|
||||
ASSERT(ranges[i].value->adding);
|
||||
data->addCacheRange(CacheRangeInfo::newAdding(data, ranges[i]));
|
||||
TEST(true); // cacheWarmup reFetchKeys
|
||||
CODE_PROBE(true, "cacheWarmup reFetchKeys");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1772,7 +1770,7 @@ private:
|
|||
br >> rollbackVersion;
|
||||
|
||||
if (rollbackVersion < fromVersion && rollbackVersion > data->oldestVersion.get()) {
|
||||
TEST(true); // CacheRangeApplyPrivateData cacheRange rollback
|
||||
CODE_PROBE(true, "CacheRangeApplyPrivateData cacheRange rollback");
|
||||
TraceEvent(SevWarn, "Rollback", data->thisServerID)
|
||||
.detail("FromVersion", fromVersion)
|
||||
.detail("ToVersion", rollbackVersion)
|
||||
|
@ -1962,8 +1960,10 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
|
|||
}
|
||||
if (data->cacheRangeChangeCounter == changeCounter)
|
||||
break;
|
||||
// TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read
|
||||
// it again.
|
||||
// CODE_PROBE(
|
||||
// true,
|
||||
// "A fetchKeys completed while we were doing this, so eager might be outdated. Read it
|
||||
// again.");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2014,7 +2014,7 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
|
|||
SpanContextMessage scm;
|
||||
reader >> scm;
|
||||
} else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) {
|
||||
TEST(true); // StorageCache reading OTELSpanContextMessage
|
||||
CODE_PROBE(true, "StorageCache reading OTELSpanContextMessage");
|
||||
OTELSpanContextMessage oscm;
|
||||
reader >> oscm;
|
||||
} else {
|
||||
|
|
|
@ -157,7 +157,7 @@ private:
|
|||
Standalone<StringRef> h = wait(self->queue->readNext(sizeof(uint32_t)));
|
||||
if (h.size() != sizeof(uint32_t)) {
|
||||
if (h.size()) {
|
||||
TEST(true); // Zero fill within size field
|
||||
CODE_PROBE(true, "Zero fill within size field");
|
||||
int payloadSize = 0;
|
||||
memcpy(&payloadSize, h.begin(), h.size());
|
||||
zeroFillSize = sizeof(uint32_t) - h.size(); // zero fill the size itself
|
||||
|
@ -171,7 +171,7 @@ private:
|
|||
|
||||
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
|
||||
if (e.size() != payloadSize + 1) {
|
||||
TEST(true); // Zero fill within payload
|
||||
CODE_PROBE(true, "Zero fill within payload");
|
||||
zeroFillSize = payloadSize + 1 - e.size();
|
||||
break;
|
||||
}
|
||||
|
@ -187,7 +187,7 @@ private:
|
|||
}
|
||||
}
|
||||
if (zeroFillSize) {
|
||||
TEST(true); // Fixing a partial commit at the end of the tlog queue
|
||||
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
|
||||
for (int i = 0; i < zeroFillSize; i++)
|
||||
self->queue->push(StringRef((const uint8_t*)"", 1));
|
||||
}
|
||||
|
@ -805,9 +805,9 @@ void TLogQueue::updateVersionSizes(const TLogQueueEntry& result,
|
|||
ACTOR Future<Void> tLogLock(TLogData* self, ReplyPromise<TLogLockResult> reply, Reference<LogData> logData) {
|
||||
state Version stopVersion = logData->version.get();
|
||||
|
||||
TEST(true); // TLog stopped by recovering cluster-controller
|
||||
TEST(logData->stopped); // logData already stopped
|
||||
TEST(!logData->stopped); // logData not yet stopped
|
||||
CODE_PROBE(true, "TLog stopped by recovering cluster-controller");
|
||||
CODE_PROBE(logData->stopped, "logData already stopped");
|
||||
CODE_PROBE(!logData->stopped, "logData not yet stopped");
|
||||
|
||||
TraceEvent("TLogStop", logData->logId)
|
||||
.detail("Ver", stopVersion)
|
||||
|
@ -1097,7 +1097,7 @@ ACTOR Future<Void> updatePersistentData(TLogData* self, Reference<LogData> logDa
|
|||
// Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue,
|
||||
// increase bytesDurable accordingly, and update persistentDataDurableVersion.
|
||||
|
||||
TEST(anyData); // TLog moved data to persistentData
|
||||
CODE_PROBE(anyData, "TLog moved data to persistentData");
|
||||
logData->persistentDataDurableVersion = newPersistentDataVersion;
|
||||
for (tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) {
|
||||
for (tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) {
|
||||
|
@ -1248,7 +1248,7 @@ ACTOR Future<Void> processPopRequests(TLogData* self, Reference<LogData> logData
|
|||
TraceEvent("PlayIgnoredPop", logData->logId).detail("Tag", tag.toString()).detail("Version", version);
|
||||
ignoredPops.push_back(tLogPopCore(self, tag, version, logData));
|
||||
if (++ignoredPopsPlayed % SERVER_KNOBS->TLOG_POP_BATCH_SIZE == 0) {
|
||||
TEST(true); // Yielding while processing pop requests
|
||||
CODE_PROBE(true, "Yielding while processing pop requests");
|
||||
wait(yield());
|
||||
}
|
||||
}
|
||||
|
@ -1836,7 +1836,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
}
|
||||
if (sequenceData.isSet()) {
|
||||
if (sequenceData.getFuture().get().first != rep.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -2069,7 +2069,7 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
|
|||
if (sequenceData.isSet()) {
|
||||
trackerData.duplicatePeeks++;
|
||||
if (sequenceData.getFuture().get().first != reply.end) {
|
||||
TEST(true); // tlog peek second attempt ended at a different version (2)
|
||||
CODE_PROBE(true, "tlog peek second attempt ended at a different version (2)");
|
||||
replyPromise.sendError(operation_obsolete());
|
||||
return Void();
|
||||
}
|
||||
|
@ -2177,7 +2177,7 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
|
|||
.detail("LogId", logData->logId)
|
||||
.detail("Version", it->version.get())
|
||||
.detail("QueueVer", it->queueCommittedVersion.get());
|
||||
TEST(true); // A TLog was replaced before having a chance to commit its queue
|
||||
CODE_PROBE(true, "A TLog was replaced before having a chance to commit its queue");
|
||||
it->queueCommittedVersion.set(it->version.get());
|
||||
}
|
||||
return Void();
|
||||
|
@ -2655,7 +2655,7 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
|
|||
when(TLogCommitRequest req = waitNext(tli.commit.getFuture())) {
|
||||
//TraceEvent("TLogCommitReq", logData->logId).detail("Ver", req.version).detail("PrevVer", req.prevVersion).detail("LogVer", logData->version.get());
|
||||
ASSERT(logData->isPrimary);
|
||||
TEST(logData->stopped); // TLogCommitRequest while stopped
|
||||
CODE_PROBE(logData->stopped, "TLogCommitRequest while stopped");
|
||||
if (!logData->stopped)
|
||||
logData->addActor.send(tLogCommit(self, req, logData, warningCollectorInput));
|
||||
else
|
||||
|
@ -3026,7 +3026,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
if (!fFormat.get().present()) {
|
||||
RangeResult v = wait(self->persistentData->readRange(KeyRangeRef(StringRef(), LiteralStringRef("\xff")), 1));
|
||||
if (!v.size()) {
|
||||
TEST(true); // The DB is completely empty, so it was never initialized. Delete it.
|
||||
CODE_PROBE(true, "The DB is completely empty, so it was never initialized. Delete it.");
|
||||
throw worker_removed();
|
||||
} else {
|
||||
// This should never happen
|
||||
|
@ -3183,7 +3183,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
throw end_of_stream();
|
||||
loop {
|
||||
if (allRemoved.isReady()) {
|
||||
TEST(true); // all tlogs removed during queue recovery
|
||||
CODE_PROBE(true, "all tlogs removed during queue recovery");
|
||||
throw worker_removed();
|
||||
}
|
||||
choose {
|
||||
|
@ -3214,7 +3214,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
logData->queueCommittedVersion.set(qe.version);
|
||||
|
||||
while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
|
||||
TEST(true); // Flush excess data during TLog queue recovery
|
||||
CODE_PROBE(true, "Flush excess data during TLog queue recovery");
|
||||
TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid)
|
||||
.detail("LogId", logData->logId)
|
||||
.detail("BytesInput", self->bytesInput)
|
||||
|
@ -3244,7 +3244,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
|
|||
}
|
||||
|
||||
TraceEvent("TLogRestorePersistentStateDone", self->dbgid).detail("Took", now() - startt);
|
||||
TEST(now() - startt >= 1.0); // TLog recovery took more than 1 second
|
||||
CODE_PROBE(now() - startt >= 1.0, "TLog recovery took more than 1 second");
|
||||
|
||||
for (auto it : self->id_data) {
|
||||
if (it.second->queueCommittedVersion.get() == 0) {
|
||||
|
|
|
@ -2167,7 +2167,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
}
|
||||
}
|
||||
|
||||
TEST(true); // Master recovery from pre-existing database
|
||||
CODE_PROBE(true, "Master recovery from pre-existing database");
|
||||
|
||||
// trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their
|
||||
// TLogInterfaces
|
||||
|
@ -2228,7 +2228,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
}
|
||||
if (!lockedLocalities.count(log->locality)) {
|
||||
TraceEvent("EpochEndLockExtra").detail("Locality", log->locality);
|
||||
TEST(true); // locking old generations for version information
|
||||
CODE_PROBE(true, "locking old generations for version information");
|
||||
lockedLocalities.insert(log->locality);
|
||||
LogLockInfo lockResult;
|
||||
lockResult.epochEnd = old.epochEnd;
|
||||
|
@ -2312,7 +2312,7 @@ ACTOR Future<Void> TagPartitionedLogSystem::epochEnd(Reference<AsyncVar<Referenc
|
|||
changes.push_back(TagPartitionedLogSystem::getDurableVersionChanged(lockResults[log], logFailed[log]));
|
||||
}
|
||||
if (maxEnd > 0 && (!lastEnd.present() || maxEnd < lastEnd.get())) {
|
||||
TEST(lastEnd.present()); // Restarting recovery at an earlier point
|
||||
CODE_PROBE(lastEnd.present(), "Restarting recovery at an earlier point");
|
||||
|
||||
auto logSystem = makeReference<TagPartitionedLogSystem>(dbgid, locality, prevState.recoveryCount);
|
||||
|
||||
|
|
|
@ -54,20 +54,20 @@ class TagThrottlerImpl {
|
|||
|
||||
if (autoThrottlingEnabled.get().present() &&
|
||||
autoThrottlingEnabled.get().get() == LiteralStringRef("0")) {
|
||||
TEST(true); // Auto-throttling disabled
|
||||
CODE_PROBE(true, "Auto-throttling disabled");
|
||||
if (self->autoThrottlingEnabled) {
|
||||
TraceEvent("AutoTagThrottlingDisabled", self->id).log();
|
||||
}
|
||||
self->autoThrottlingEnabled = false;
|
||||
} else if (autoThrottlingEnabled.get().present() &&
|
||||
autoThrottlingEnabled.get().get() == LiteralStringRef("1")) {
|
||||
TEST(true); // Auto-throttling enabled
|
||||
CODE_PROBE(true, "Auto-throttling enabled");
|
||||
if (!self->autoThrottlingEnabled) {
|
||||
TraceEvent("AutoTagThrottlingEnabled", self->id).log();
|
||||
}
|
||||
self->autoThrottlingEnabled = true;
|
||||
} else {
|
||||
TEST(true); // Auto-throttling unspecified
|
||||
CODE_PROBE(true, "Auto-throttling unspecified");
|
||||
if (autoThrottlingEnabled.get().present()) {
|
||||
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", self->id)
|
||||
.detail("Value", autoThrottlingEnabled.get().get());
|
||||
|
@ -90,7 +90,7 @@ class TagThrottlerImpl {
|
|||
|
||||
if (tagValue.expirationTime == 0 ||
|
||||
tagValue.expirationTime > now() + tagValue.initialDuration) {
|
||||
TEST(true); // Converting tag throttle duration to absolute time
|
||||
CODE_PROBE(true, "Converting tag throttle duration to absolute time");
|
||||
tagValue.expirationTime = now() + tagValue.initialDuration;
|
||||
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
|
||||
wr << tagValue;
|
||||
|
@ -128,7 +128,7 @@ class TagThrottlerImpl {
|
|||
|
||||
wait(watchFuture);
|
||||
TraceEvent("RatekeeperThrottleSignaled", self->id).log();
|
||||
TEST(true); // Tag throttle changes detected
|
||||
CODE_PROBE(true, "Tag throttle changes detected");
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
TraceEvent("RatekeeperMonitorThrottlingChangesError", self->id).error(e);
|
||||
|
@ -142,7 +142,7 @@ class TagThrottlerImpl {
|
|||
// NOTE: before the comparison with MIN_TAG_COST, the busiest tag rate also compares with MIN_TAG_PAGES_RATE
|
||||
// currently MIN_TAG_PAGES_RATE > MIN_TAG_COST in our default knobs.
|
||||
if (busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && rate > SERVER_KNOBS->MIN_TAG_COST) {
|
||||
TEST(true); // Transaction tag auto-throttled
|
||||
CODE_PROBE(true, "Transaction tag auto-throttled");
|
||||
Optional<double> clientRate = throttledTags.autoThrottleTag(id, tag, busyness);
|
||||
// TODO: Increment tag throttle counts here?
|
||||
if (clientRate.present()) {
|
||||
|
|
|
@ -99,7 +99,7 @@ public:
|
|||
|
||||
void addRequest(Optional<TagSet> const& tags, int64_t bytes) {
|
||||
if (tags.present()) {
|
||||
TEST(true); // Tracking transaction tag in counter
|
||||
CODE_PROBE(true, "Tracking transaction tag in counter");
|
||||
double cost = costFunction(bytes);
|
||||
for (auto& tag : tags.get()) {
|
||||
int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())];
|
||||
|
|
|
@ -7819,10 +7819,10 @@ public:
|
|||
if (rowLimit > 0) {
|
||||
f = cur.seekGTE(keys.begin);
|
||||
if (f.isReady()) {
|
||||
TEST(true); // Cached forward range read seek
|
||||
CODE_PROBE(true, "Cached forward range read seek");
|
||||
f.get();
|
||||
} else {
|
||||
TEST(true); // Uncached forward range read seek
|
||||
CODE_PROBE(true, "Uncached forward range read seek");
|
||||
wait(store(lock, self->m_concurrentReads.lock()));
|
||||
wait(f);
|
||||
}
|
||||
|
@ -7875,10 +7875,10 @@ public:
|
|||
} else {
|
||||
f = cur.seekLT(keys.end);
|
||||
if (f.isReady()) {
|
||||
TEST(true); // Cached reverse range read seek
|
||||
CODE_PROBE(true, "Cached reverse range read seek");
|
||||
f.get();
|
||||
} else {
|
||||
TEST(true); // Uncached reverse range read seek
|
||||
CODE_PROBE(true, "Uncached reverse range read seek");
|
||||
wait(store(lock, self->m_concurrentReads.lock()));
|
||||
wait(f);
|
||||
}
|
||||
|
|
|
@ -30,7 +30,7 @@ ACTOR Future<Void> waitFailureServer(FutureStream<ReplyPromise<Void>> waitFailur
|
|||
ReplyPromise<Void> P = waitNext(waitFailure);
|
||||
queue.push_back(P);
|
||||
if (queue.size() > SERVER_KNOBS->MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS) {
|
||||
TEST(true); // wait server queue full
|
||||
CODE_PROBE(true, "wait server queue full");
|
||||
queue.front().send(Void());
|
||||
queue.pop_front();
|
||||
}
|
||||
|
|
|
@ -108,7 +108,7 @@ enum {
|
|||
OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_TRACER, OPT_NEWCONSOLE,
|
||||
OPT_NOBOX, OPT_TESTFILE, OPT_RESTARTING, OPT_RESTORING, OPT_RANDOMSEED, OPT_KEY, OPT_MEMLIMIT, OPT_VMEMLIMIT, OPT_STORAGEMEMLIMIT, OPT_CACHEMEMLIMIT, OPT_MACHINEID,
|
||||
OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_BUILD_FLAGS, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR,
|
||||
OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE,
|
||||
OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_PRINT_CODE_PROBES, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE,
|
||||
OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE,
|
||||
OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, OPT_FAULT_INJECTION, OPT_PROFILER, OPT_PRINT_SIMTIME,
|
||||
OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK, OPT_KMS_CONN_DISCOVERY_URL_FILE, OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT
|
||||
|
@ -183,6 +183,7 @@ CSimpleOpt::SOption g_rgOptions[] = {
|
|||
{ OPT_HELP, "-h", SO_NONE },
|
||||
{ OPT_HELP, "--help", SO_NONE },
|
||||
{ OPT_DEVHELP, "--dev-help", SO_NONE },
|
||||
{ OPT_PRINT_CODE_PROBES, "--code-probes", SO_REQ_SEP },
|
||||
{ OPT_KNOB, "--knob-", SO_REQ_SEP },
|
||||
{ OPT_UNITTESTPARAM, "--test-", SO_REQ_SEP },
|
||||
{ OPT_LOCALITY, "--locality-", SO_REQ_SEP },
|
||||
|
@ -1144,6 +1145,10 @@ private:
|
|||
printUsage(argv[0], true);
|
||||
flushAndExit(FDB_EXIT_SUCCESS);
|
||||
break;
|
||||
case OPT_PRINT_CODE_PROBES:
|
||||
probe::ICodeProbe::printProbesJSON({ std::string(args.OptionArg()) });
|
||||
flushAndExit(FDB_EXIT_SUCCESS);
|
||||
break;
|
||||
case OPT_KNOB: {
|
||||
Optional<std::string> knobName = extractPrefixedArgument("--knob", args.OptionSyntax());
|
||||
if (!knobName.present()) {
|
||||
|
@ -2121,6 +2126,14 @@ int main(int argc, char* argv[]) {
|
|||
}
|
||||
}
|
||||
}
|
||||
g_knobs.setKnob("enable_encryption",
|
||||
KnobValue::create(ini.GetBoolValue("META", "enableEncryption", false)));
|
||||
g_knobs.setKnob("enable_tlog_encryption",
|
||||
KnobValue::create(ini.GetBoolValue("META", "enableTLogEncryption", false)));
|
||||
g_knobs.setKnob("enable_blob_granule_encryption",
|
||||
KnobValue::create(ini.GetBoolValue("META", "enableBlobGranuleEncryption", false)));
|
||||
g_knobs.setKnob("enable_blob_granule_compression",
|
||||
KnobValue::create(ini.GetBoolValue("META", "enableBlobGranuleEncryption", false)));
|
||||
}
|
||||
setupAndRun(dataFolder, opts.testFile, opts.restarting, (isRestoring >= 1), opts.whitelistBinPaths);
|
||||
g_simulator.run();
|
||||
|
|
|
@ -26,14 +26,14 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "flow/flow.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/BlobConnectionProvider.h"
|
||||
#include "fdbclient/BlobGranuleCommon.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/Tenant.h"
|
||||
#include "fdbserver/ServerDBInfo.h"
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
#include "flow/flow.h"
|
||||
|
||||
struct GranuleHistory {
|
||||
KeyRange range;
|
||||
|
@ -53,18 +53,28 @@ struct BlobFileIndex {
|
|||
int64_t offset;
|
||||
int64_t length;
|
||||
int64_t fullFileLength;
|
||||
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
|
||||
|
||||
BlobFileIndex() {}
|
||||
|
||||
BlobFileIndex(Version version, std::string filename, int64_t offset, int64_t length, int64_t fullFileLength)
|
||||
: version(version), filename(filename), offset(offset), length(length), fullFileLength(fullFileLength) {}
|
||||
|
||||
BlobFileIndex(Version version,
|
||||
std::string filename,
|
||||
int64_t offset,
|
||||
int64_t length,
|
||||
int64_t fullFileLength,
|
||||
Optional<BlobGranuleCipherKeysMeta> ciphKeysMeta)
|
||||
: version(version), filename(filename), offset(offset), length(length), fullFileLength(fullFileLength),
|
||||
cipherKeysMeta(ciphKeysMeta) {}
|
||||
|
||||
// compare on version
|
||||
bool operator<(const BlobFileIndex& r) const { return version < r.version; }
|
||||
};
|
||||
|
||||
// FIXME: initialize these to smaller default sizes to save a bit of memory, particularly snapshotFiles
|
||||
// Stores the files that comprise a blob granule
|
||||
// FIXME: initialize these to smaller default sizes to save a bit of memory,
|
||||
// particularly snapshotFiles Stores the files that comprise a blob granule
|
||||
struct GranuleFiles {
|
||||
std::vector<BlobFileIndex> snapshotFiles;
|
||||
std::vector<BlobFileIndex> deltaFiles;
|
||||
|
@ -78,16 +88,10 @@ struct GranuleFiles {
|
|||
};
|
||||
|
||||
// serialize change feed key as UID bytes, to use 16 bytes on disk
|
||||
static Key granuleIDToCFKey(UID granuleID) {
|
||||
BinaryWriter wr(Unversioned());
|
||||
wr << granuleID;
|
||||
return wr.toValue();
|
||||
}
|
||||
Key granuleIDToCFKey(UID granuleID);
|
||||
|
||||
// parse change feed key back to UID, to be human-readable
|
||||
static UID cfKeyToGranuleID(Key cfKey) {
|
||||
return BinaryReader::fromStringRef<UID>(cfKey, Unversioned());
|
||||
}
|
||||
UID cfKeyToGranuleID(Key cfKey);
|
||||
|
||||
class Transaction;
|
||||
ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr, KeyRange range);
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue