Merge branch 'main' of github.com:apple/foundationdb

Ankita Kejriwal 2023-02-06 16:24:47 -08:00
commit 68f3e29a47
238 changed files with 7329 additions and 2453 deletions

View File

@ -17,4 +17,5 @@ if(WITH_RUBY_BINDING)
endif()
if(NOT WIN32 AND NOT OPEN_FOR_IDE)
package_bindingtester()
package_bindingtester2()
endif()

View File

@ -22,10 +22,8 @@
import sys
import subprocess
import struct
import random
import argparse
import math
import os
import copy
import traceback

View File

@ -32,7 +32,9 @@ The tenant API introduces some new operations:
#### TENANT_SET_ACTIVE
Pops the top item off of the stack as TENANT_NAME. Opens the tenant with
name TENANT_NAME and stores it as the active tenant.
name TENANT_NAME and stores it as the active tenant. Then, waits on a future
that initializes the tenant ID. When complete, pushes the string
"SET_ACTIVE_TENANT" onto the stack.
#### TENANT_CLEAR_ACTIVE
@ -46,6 +48,12 @@ The tenant API introduces some new operations:
packed into a tuple as [t1,t2,t3,...,tn], and this single packed value
is pushed onto the stack.
#### TENANT_GET_ID
Attempts to resolve the active tenant's ID. Pushes the string "GOT_TENANT_ID" onto
the stack if an ID was successfully read after waiting on the ID future. Pushes the string
"NO_ACTIVE_TENANT" if there is no active tenant.
Updates to Existing Instructions
--------------------------------

View File

@ -175,7 +175,7 @@ class ApiTest(Test):
write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT']
txn_sizes = ['GET_APPROXIMATE_SIZE']
storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS']
tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST']
tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST', 'TENANT_GET_ID']
op_choices += reads
op_choices += mutations
@ -610,6 +610,7 @@ class ApiTest(Test):
tenant_name = self.choose_tenant(0.8)
instructions.push_args(tenant_name)
instructions.append(op)
self.add_strings(1)
elif op == 'TENANT_CLEAR_ACTIVE':
instructions.append(op)
elif op == 'TENANT_LIST':
@ -619,6 +620,9 @@ class ApiTest(Test):
test_util.to_front(instructions, 2)
instructions.append(op)
self.add_strings(1)
elif op == "TENANT_GET_ID":
instructions.append(op)
self.add_strings(1)
else:
assert False, 'Unknown operation: ' + op

View File

@ -217,8 +217,8 @@ if(NOT WIN32)
target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads doctest)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow doctest)
target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads doctest)
target_link_libraries(fdb_c_client_config_tester PRIVATE SimpleOpt fdb_cpp fdb_c Threads::Threads fmt::fmt)
target_include_directories(fdb_c_client_config_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
target_link_libraries(fdb_c_client_config_tester PRIVATE SimpleOpt fdb_cpp fdb_c fdbclient Threads::Threads fmt::fmt)
target_include_directories(fdb_c_client_config_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
@ -423,18 +423,18 @@ if(OPEN_FOR_IDE)
target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads)
target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer only
elseif(NOT WIN32 AND NOT APPLE) # Linux Only
set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(SHIM_LIB_GEN_SRC
${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.init.c
${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.init.cpp
${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.tramp.S)
set(IMPLIBSO_SRC_DIR ${CMAKE_SOURCE_DIR}/contrib/Implib.so)
set(IMPLIBSO_SRC
${IMPLIBSO_SRC_DIR}/implib-gen.py
${IMPLIBSO_SRC_DIR}/arch/common/init.c.tpl
${IMPLIBSO_SRC_DIR}/arch/common/init.cpp.tpl
${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/config.ini
${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/table.S.tpl
${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/trampoline.S.tpl
@ -467,6 +467,11 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer
target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads)
target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
set(SHIM_LIB_TEST_EXTRA_OPTIONS "")
if(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR USE_SANITIZER)
list(APPEND SHIM_LIB_TEST_EXTRA_OPTIONS --disable-prev-version-tests)
endif()
add_python_venv_test(NAME fdb_c_shim_library_tests
COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py
--build-dir ${CMAKE_BINARY_DIR}
@ -474,6 +479,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer
--api-tester-bin $<TARGET_FILE:fdb_c_shim_api_tester>
--shim-lib-tester-bin $<TARGET_FILE:fdb_c_shim_lib_tester>
--api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
${SHIM_LIB_TEST_EXTRA_OPTIONS}
)
endif() # End Linux only, non-sanitizer only

View File

@ -18,6 +18,8 @@
* limitations under the License.
*/
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/FDBTypes.h"
#include "flow/ProtocolVersion.h"
#include <cstdint>
@ -61,6 +63,11 @@ int g_api_version = 0;
/* This must be true so that we can return the data pointer of a
Standalone<RangeResultRef> as an array of FDBKeyValue. */
static_assert(sizeof(FDBKeyValue) == sizeof(KeyValueRef), "FDBKeyValue / KeyValueRef size mismatch");
static_assert(sizeof(FDBBGMutation) == sizeof(GranuleMutationRef), "FDBBGMutation / GranuleMutationRef size mismatch");
static_assert(static_cast<int>(FDB_BG_MUTATION_TYPE_SET_VALUE) == static_cast<int>(MutationRef::Type::SetValue),
"FDB_BG_MUTATION_TYPE_SET_VALUE enum value mismatch");
static_assert(static_cast<int>(FDB_BG_MUTATION_TYPE_CLEAR_RANGE) == static_cast<int>(MutationRef::Type::ClearRange),
"FDB_BG_MUTATION_TYPE_CLEAR_RANGE enum value mismatch");
#define TSAV_ERROR(type, error) ((FDBFuture*)(ThreadFuture<type>(error())).extractPtr())
@ -333,6 +340,99 @@ extern "C" DLLEXPORT fdb_error_t fdb_future_get_granule_summary_array(FDBFuture*
*out_count = na.size(););
}
namespace {
void setBlobFilePointer(FDBBGFilePointer* dest, const BlobFilePointerRef& source) {
dest->filename_ptr = source.filename.begin();
dest->filename_length = source.filename.size();
dest->file_offset = source.offset;
dest->file_length = source.length;
dest->full_file_length = source.fullFileLength;
}
void setBGMutation(FDBBGMutation* dest, int64_t version, const MutationRef& source) {
dest->version = version;
dest->type = source.type;
dest->param1_ptr = source.param1.begin();
dest->param1_length = source.param1.size();
dest->param2_ptr = source.param2.begin();
dest->param2_length = source.param2.size();
}
void setBGMutations(FDBBGMutation** mutationsOut, int* mutationCountOut, Arena& ar, const GranuleDeltas& deltas) {
// convert mutations from MutationsAndVersionRef to single mutations
int mutationCount = 0;
for (auto& it : deltas) {
mutationCount += it.mutations.size();
}
*mutationCountOut = mutationCount;
if (mutationCount > 0) {
*mutationsOut = new (ar) FDBBGMutation[mutationCount];
mutationCount = 0;
for (auto& it : deltas) {
for (auto& m : it.mutations) {
setBGMutation(&((*mutationsOut)[mutationCount]), it.version, m);
mutationCount++;
}
}
ASSERT(mutationCount == *mutationCountOut);
}
}
} // namespace
extern "C" DLLEXPORT fdb_error_t fdb_future_readbg_get_descriptions(FDBFuture* f,
FDBBGFileDescription** out,
int* desc_count) {
CATCH_AND_RETURN(Standalone<VectorRef<BlobGranuleChunkRef>> results =
TSAV(Standalone<VectorRef<BlobGranuleChunkRef>>, f)->get();
*desc_count = results.size();
Arena ar;
*out = new (ar) FDBBGFileDescription[results.size()];
for (int chunkIdx = 0; chunkIdx < results.size(); chunkIdx++) {
BlobGranuleChunkRef& chunk = results[chunkIdx];
FDBBGFileDescription& desc = (*out)[chunkIdx];
// set key range
desc.key_range.begin_key = chunk.keyRange.begin.begin();
desc.key_range.begin_key_length = chunk.keyRange.begin.size();
desc.key_range.end_key = chunk.keyRange.end.begin();
desc.key_range.end_key_length = chunk.keyRange.end.size();
// snapshot file
desc.snapshot_present = chunk.snapshotFile.present();
if (desc.snapshot_present) {
setBlobFilePointer(&desc.snapshot_file_pointer, chunk.snapshotFile.get());
}
// delta files
desc.delta_file_count = chunk.deltaFiles.size();
if (chunk.deltaFiles.size()) {
desc.delta_files = new (ar) FDBBGFilePointer[chunk.deltaFiles.size()];
for (int d = 0; d < chunk.deltaFiles.size(); d++) {
setBlobFilePointer(&desc.delta_files[d], chunk.deltaFiles[d]);
}
}
setBGMutations(&desc.memory_mutations, &desc.memory_mutation_count, ar, chunk.newDeltas);
}
// make this memory owned by the arena of the object stored in the future
results.arena()
.dependsOn(ar););
}
extern "C" DLLEXPORT FDBResult* fdb_readbg_parse_snapshot_file(const uint8_t* file_data, int file_len) {
RETURN_RESULT_ON_ERROR(RangeResult,
RangeResult parsedSnapshotData = bgReadSnapshotFile(StringRef(file_data, file_len));
return ((FDBResult*)(ThreadResult<RangeResult>(parsedSnapshotData)).extractPtr()););
}
extern "C" DLLEXPORT FDBResult* fdb_readbg_parse_delta_file(const uint8_t* file_data, int file_len) {
RETURN_RESULT_ON_ERROR(
Standalone<VectorRef<GranuleMutationRef>>,
Standalone<VectorRef<GranuleMutationRef>> parsedDeltaData = bgReadDeltaFile(StringRef(file_data, file_len));
return ((FDBResult*)(ThreadResult<Standalone<VectorRef<GranuleMutationRef>>>(parsedDeltaData)).extractPtr()););
}
extern "C" DLLEXPORT void fdb_result_destroy(FDBResult* r) {
CATCH_AND_DIE(TSAVB(r)->cancel(););
}
@ -346,6 +446,13 @@ fdb_error_t fdb_result_get_keyvalue_array(FDBResult* r,
*out_more = rr.more;);
}
fdb_error_t fdb_result_get_bg_mutations_array(FDBResult* r, FDBBGMutation const** out_mutations, int* out_count) {
CATCH_AND_RETURN(Standalone<VectorRef<GranuleMutationRef>> mutations =
TSAV(Standalone<VectorRef<GranuleMutationRef>>, r)->get();
*out_mutations = (FDBBGMutation*)mutations.begin();
*out_count = mutations.size(););
}
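Together, the parse functions and the result accessor give a synchronous path from raw file bytes to mutations. A hedged usage sketch based only on the signatures introduced here (the header path and API version are assumptions, and error handling is reduced to a bare check):

#define FDB_API_VERSION 720            // assumption: a version that exposes these functions
#include "foundationdb/fdb_c.h"
#include <vector>

// Parse an already-loaded delta file and walk the mutations it contains.
void walkDeltaFile(const std::vector<uint8_t>& bytes) {
    FDBResult* r = fdb_readbg_parse_delta_file(bytes.data(), (int)bytes.size());
    const FDBBGMutation* mutations = nullptr;
    int count = 0;
    if (fdb_result_get_bg_mutations_array(r, &mutations, &count) == 0) {
        for (int i = 0; i < count; i++) {
            // mutations[i].type, param1_ptr/param1_length and param2_ptr/param2_length
            // point into memory owned by r, so consume them before destroying the result.
        }
    }
    fdb_result_destroy(r); // releases the parsed data
}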
FDBFuture* fdb_create_cluster_v609(const char* cluster_file_path) {
char* path;
if (cluster_file_path) {
@ -1088,6 +1195,28 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_summarize_blob_granules(FDBTrans
return (FDBFuture*)(TXN(tr)->summarizeBlobGranules(range, sv, rangeLimit).extractPtr()););
}
// copied from read_blob_granules_start
extern "C" DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_description(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t begin_version,
int64_t read_version,
int64_t* read_version_out) {
Optional<Version> rv;
if (read_version != latestVersion) {
rv = read_version;
}
return (FDBFuture*)(TXN(tr)
->readBlobGranulesStart(KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length),
KeyRef(end_key_name, end_key_name_length)),
begin_version,
rv,
read_version_out)
.extractPtr());
}
#include "fdb_c_function_pointers.g.h"
#define FDB_API_CHANGED(func, ver) \

View File

@ -214,6 +214,39 @@ typedef struct readgranulecontext {
int granuleParallelism;
} FDBReadBlobGranuleContext;
typedef struct bgfilepointer {
const uint8_t* filename_ptr;
int filename_length;
int64_t file_offset;
int64_t file_length;
int64_t full_file_length;
/* TODO: encryption keys would go here */
} FDBBGFilePointer;
typedef enum { FDB_BG_MUTATION_TYPE_SET_VALUE = 0, FDB_BG_MUTATION_TYPE_CLEAR_RANGE = 1 } FDBBGMutationType;
#pragma pack(push, 4)
typedef struct bgmutation {
/* FDBBGMutationType */ uint8_t type;
int64_t version;
const uint8_t* param1_ptr;
int param1_length;
const uint8_t* param2_ptr;
int param2_length;
} FDBBGMutation;
typedef struct bgfiledescription {
FDBKeyRange key_range;
fdb_bool_t snapshot_present;
FDBBGFilePointer snapshot_file_pointer;
int delta_file_count;
FDBBGFilePointer* delta_files;
int memory_mutation_count;
FDBBGMutation* memory_mutations;
/* TODO: tenant info would go here */
} FDBBGFileDescription;
#pragma pack(pop)
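A small sketch of how a caller might walk one FDBBGFileDescription using the fields defined above (illustrative only; it assumes this header has been included with FDB_API_VERSION defined, and keys are binary, so printing them as text is purely for demonstration):

#include <cstdio>

// Report the key range, snapshot file, delta files, and in-memory mutations of one description.
void printDescription(const FDBBGFileDescription& d) {
    std::printf("range: %.*s - %.*s\n",
                d.key_range.begin_key_length, (const char*)d.key_range.begin_key,
                d.key_range.end_key_length, (const char*)d.key_range.end_key);
    if (d.snapshot_present) {
        std::printf("snapshot file: %.*s, %lld bytes\n",
                    d.snapshot_file_pointer.filename_length,
                    (const char*)d.snapshot_file_pointer.filename_ptr,
                    (long long)d.snapshot_file_pointer.file_length);
    }
    for (int i = 0; i < d.delta_file_count; i++) {
        std::printf("delta file %d: %lld bytes\n", i, (long long)d.delta_files[i].file_length);
    }
    std::printf("%d in-memory mutations\n", d.memory_mutation_count);
}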
DLLEXPORT void fdb_future_cancel(FDBFuture* f);
DLLEXPORT void fdb_future_release_memory(FDBFuture* f);
@ -275,6 +308,15 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_granule_summary_array(FD
FDBGranuleSummary const** out_summaries,
int* out_count);
/* all for using future result from read_blob_granules_description */
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_readbg_get_descriptions(FDBFuture* f,
FDBBGFileDescription** out,
int* desc_count);
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_readbg_parse_snapshot_file(const uint8_t* file_data, int file_len);
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_readbg_parse_delta_file(const uint8_t* file_data, int file_len);
/* FDBResult is a synchronous computation result, as opposed to a future that is asynchronous. */
DLLEXPORT void fdb_result_destroy(FDBResult* r);
@ -283,6 +325,10 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_result_get_keyvalue_array(FDBResult
int* out_count,
fdb_bool_t* out_more);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_result_get_bg_mutations_array(FDBResult* r,
FDBBGMutation const** out_mutations,
int* out_count);
/* TODO: add other return types as we need them */
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_create_database(const char* cluster_file_path, FDBDatabase** out_database);
@ -582,6 +628,15 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_summarize_blob_granules(
int64_t summaryVersion,
int rangeLimit);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_read_blob_granules_description(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t begin_version,
int64_t read_version,
int64_t* read_version_out);
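A hedged sketch of the intended call sequence for this API, combining the declaration above with the existing future helpers; fdb_check() is a local convenience rather than part of the C API, the version number is an assumption, and -2 is used to mean "latest read version" as in the tester workload later in this diff:

#define FDB_API_VERSION 720            // assumption
#include "foundationdb/fdb_c.h"
#include <cstdlib>

static void fdb_check(fdb_error_t e) { if (e) std::abort(); } // minimal local error handling

void readGranuleDescriptions(FDBTransaction* tr,
                             const uint8_t* begin, int beginLen,
                             const uint8_t* end, int endLen) {
    int64_t readVersionOut = -1;
    FDBFuture* f = fdb_transaction_read_blob_granules_description(
        tr, begin, beginLen, end, endLen, /*begin_version*/ 0, /*read_version*/ -2, &readVersionOut);
    fdb_check(fdb_future_block_until_ready(f));
    fdb_check(fdb_future_get_error(f));
    FDBBGFileDescription* descs = nullptr;
    int descCount = 0;
    fdb_check(fdb_future_readbg_get_descriptions(f, &descs, &descCount));
    // descs[0..descCount) stay valid until the future is destroyed: the implementation
    // attaches their allocation to the future's own memory.
    fdb_future_destroy(f);
}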
#define FDB_KEYSEL_LAST_LESS_THAN(k, l) k, l, 0, 0
#define FDB_KEYSEL_LAST_LESS_OR_EQUAL(k, l) k, l, 1, 0
#define FDB_KEYSEL_FIRST_GREATER_THAN(k, l) k, l, 1, 1

View File

@ -70,12 +70,15 @@ void ApiWorkload::start() {
schedule([this]() {
// 1. Clear data
clearData([this]() {
// 2. Workload setup
setup([this]() {
// 3. Populate initial data
populateData([this]() {
// 4. Generate random workload
runTests();
// 2. Create tenants if necessary.
createTenantsIfNecessary([this] {
// 3. Workload setup.
setup([this]() {
// 4. Populate initial data
populateData([this]() {
// 5. Generate random workload
runTests();
});
});
});
});
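The nesting above is the workload's continuation-passing style: each stage takes the next stage as a callback, so the asynchronous steps run strictly in order even though nothing blocks. A stripped-down sketch of the pattern (illustrative only):

#include <functional>

using TTaskFct = std::function<void()>;

// Each stage does its (possibly asynchronous) work and then invokes the continuation.
void stageA(TTaskFct cont) { /* ... */ cont(); }
void stageB(TTaskFct cont) { /* ... */ cont(); }

void runPipeline() {
    stageA([]() {
        stageB([]() {
            // all stages complete
        });
    });
}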
@ -152,6 +155,21 @@ fdb::Key ApiWorkload::randomKey(double existingKeyRatio, std::optional<int> tena
}
}
fdb::KeyRange ApiWorkload::randomNonEmptyKeyRange() {
fdb::KeyRange keyRange;
keyRange.beginKey = randomKeyName();
// avoid empty key range
do {
keyRange.endKey = randomKeyName();
} while (keyRange.beginKey == keyRange.endKey);
if (keyRange.beginKey > keyRange.endKey) {
std::swap(keyRange.beginKey, keyRange.endKey);
}
ASSERT(keyRange.beginKey < keyRange.endKey);
return keyRange;
}
std::optional<int> ApiWorkload::randomTenant() {
if (tenants.size() > 0) {
return Random::get().randomInt(0, tenants.size() - 1);
@ -244,9 +262,17 @@ void ApiWorkload::createTenants(TTaskFct cont) {
[this, cont]() { schedule(cont); });
}
void ApiWorkload::createTenantsIfNecessary(TTaskFct cont) {
if (tenants.size() > 0) {
createTenants(cont);
} else {
schedule(cont);
}
}
void ApiWorkload::populateData(TTaskFct cont) {
if (tenants.size() > 0) {
createTenants([this, cont]() { populateTenantData(cont, std::make_optional(0)); });
populateTenantData(cont, std::make_optional(0));
} else {
populateTenantData(cont, {});
}

View File

@ -113,6 +113,7 @@ protected:
fdb::Key randomNotExistingKey(std::optional<int> tenantId);
fdb::Key randomExistingKey(std::optional<int> tenantId);
fdb::Key randomKey(double existingKeyRatio, std::optional<int> tenantId);
fdb::KeyRange randomNonEmptyKeyRange();
// Chooses a random tenant from the available tenants (or an empty optional if tenants aren't used in the test)
std::optional<int> randomTenant();
@ -140,6 +141,7 @@ private:
void populateDataTx(TTaskFct cont, std::optional<int> tenantId);
void populateTenantData(TTaskFct cont, std::optional<int> tenantId);
void createTenants(TTaskFct cont);
void createTenantsIfNecessary(TTaskFct cont);
void clearTenantData(TTaskFct cont, std::optional<int> tenantId);

View File

@ -21,6 +21,8 @@
#include "TesterBlobGranuleUtil.h"
#include "TesterUtil.h"
#include <unordered_set>
#include <set>
#include "fdb_api.hpp"
#include <memory>
#include <fmt/format.h>
@ -38,7 +40,7 @@ public:
}
private:
// FIXME: add tenant support for DB operations
// FIXME: use other new blob granule apis!
enum OpType {
OP_INSERT,
OP_CLEAR,
@ -48,84 +50,63 @@ private:
OP_SUMMARIZE,
OP_GET_BLOB_RANGES,
OP_VERIFY,
OP_LAST = OP_VERIFY
OP_READ_DESC,
OP_LAST = OP_READ_DESC
};
std::vector<OpType> excludedOpTypes;
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
std::set<fdb::ByteString> validatedFiles;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
void debugOp(std::string opName, fdb::KeyRange keyRange, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}) {3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
debugTenantStr(tenantId),
message));
}
}
void randomReadOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::KeyValue>>();
auto tooOld = std::make_shared<bool>(false);
debugOp("Read", begin, end, tenantId, "starting");
debugOp("Read", keyRange, tenantId, "starting");
execTransaction(
[this, begin, end, tenantId, results, tooOld](auto ctx) {
[this, keyRange, tenantId, results, tooOld](auto ctx) {
ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
TesterGranuleContext testerContext(ctx->getBGBasePath());
fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext);
fdb::Result res = ctx->tx().readBlobGranules(
begin, end, 0 /* beginVersion */, -2 /* latest read version */, granuleContext);
fdb::Result res = ctx->tx().readBlobGranules(keyRange.beginKey,
keyRange.endKey,
0 /* beginVersion */,
-2 /* latest read version */,
granuleContext);
auto out = fdb::Result::KeyValueRefArray{};
fdb::Error err = res.getKeyValueArrayNothrow(out);
if (err.code() == error_code_blob_granule_transaction_too_old) {
bool previousSuccess = seenReadSuccess(tenantId);
if (previousSuccess) {
error("Read bg too old after read success!\n");
} else {
info("Read bg too old\n");
}
ASSERT(!previousSuccess);
*tooOld = true;
ctx->done();
} else if (err.code() != error_code_success) {
ASSERT(err.code() != error_code_blob_granule_transaction_too_old);
if (err.code() != error_code_success) {
ctx->onError(err);
} else {
auto resCopy = copyKeyValueArray(out);
auto& [resVector, out_more] = resCopy;
ASSERT(!out_more);
results.get()->assign(resVector.begin(), resVector.end());
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
}
debugOp("Read", keyRange, tenantId, "complete");
ctx->done();
}
},
[this, begin, end, results, tooOld, cont, tenantId]() {
[this, keyRange, results, tooOld, cont, tenantId]() {
if (!*tooOld) {
std::vector<fdb::KeyValue> expected =
stores[tenantId].getRange(begin, end, stores[tenantId].size(), false);
stores[tenantId].getRange(keyRange.beginKey, keyRange.endKey, stores[tenantId].size(), false);
if (results->size() != expected.size()) {
error(fmt::format("randomReadOp result size mismatch. expected: {0} actual: {1}",
expected.size(),
@ -161,18 +142,14 @@ private:
}
void randomGetGranulesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
debugOp("GetGranules", begin, end, tenantId, "starting");
debugOp("GetGranules", keyRange, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType();
[keyRange, results](auto ctx) {
fdb::Future f = ctx->tx().getBlobGranuleRanges(keyRange.beginKey, keyRange.endKey, 1000).eraseType();
ctx->continueAfter(
f,
[ctx, f, results]() {
@ -181,34 +158,26 @@ private:
},
true);
},
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetGranules", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
[this, keyRange, tenantId, results, cont]() {
debugOp("GetGranules", keyRange, tenantId, fmt::format("complete with {0} granules", results->size()));
this->validateRanges(results, keyRange);
schedule(cont);
},
getTenant(tenantId));
}
void randomSummarizeOp(TTaskFct cont, std::optional<int> tenantId) {
if (!seenReadSuccess(tenantId)) {
// tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a
// read success
schedule(cont);
return;
}
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::GranuleSummary>>();
debugOp("Summarize", begin, end, tenantId, "starting");
debugOp("Summarize", keyRange, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType();
[keyRange, results](auto ctx) {
fdb::Future f =
ctx->tx()
.summarizeBlobGranules(keyRange.beginKey, keyRange.endKey, -2 /*latest version*/, 1000)
.eraseType();
ctx->continueAfter(
f,
[ctx, f, results]() {
@ -217,8 +186,8 @@ private:
},
true);
},
[this, begin, end, tenantId, results, cont]() {
debugOp("Summarize", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
[this, keyRange, tenantId, results, cont]() {
debugOp("Summarize", keyRange, tenantId, fmt::format("complete with {0} granules", results->size()));
// use validateRanges to share validation
auto ranges = std::make_shared<std::vector<fdb::KeyRange>>();
@ -233,39 +202,35 @@ private:
ranges->push_back((*results)[i].keyRange);
}
this->validateRanges(ranges, begin, end, true);
this->validateRanges(ranges, keyRange);
schedule(cont);
},
getTenant(tenantId));
}
void validateRanges(std::shared_ptr<std::vector<fdb::KeyRange>> results,
fdb::Key begin,
fdb::Key end,
bool shouldBeRanges) {
if (shouldBeRanges) {
if (results->size() == 0) {
error(fmt::format(
"ValidateRanges: [{0} - {1}): No ranges returned!", fdb::toCharsRef(begin), fdb::toCharsRef(end)));
}
ASSERT(results->size() > 0);
if (results->front().beginKey > begin || results->back().endKey < end) {
error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(results->front().beginKey),
fdb::toCharsRef(results->back().endKey)));
}
ASSERT(results->front().beginKey <= begin);
ASSERT(results->back().endKey >= end);
void validateRanges(std::shared_ptr<std::vector<fdb::KeyRange>> results, fdb::KeyRange keyRange) {
if (results->size() == 0) {
error(fmt::format("ValidateRanges: [{0} - {1}): No ranges returned!",
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey)));
}
ASSERT(results->size() > 0);
if (results->front().beginKey > keyRange.beginKey || results->back().endKey < keyRange.endKey) {
error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
fdb::toCharsRef(results->front().beginKey),
fdb::toCharsRef(results->back().endKey)));
}
ASSERT(results->front().beginKey <= keyRange.beginKey);
ASSERT(results->back().endKey >= keyRange.endKey);
for (int i = 0; i < results->size(); i++) {
// no empty or inverted ranges
if ((*results)[i].beginKey >= (*results)[i].endKey) {
error(fmt::format("ValidateRanges: [{0} - {1}): Empty/inverted range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey)));
}
@ -276,8 +241,8 @@ private:
// ranges contain entire requested key range
if ((*results)[i].beginKey != (*results)[i].endKey) {
error(fmt::format("ValidateRanges: [{0} - {1}): Non-covereed range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
fdb::toCharsRef((*results)[i - 1].endKey),
fdb::toCharsRef((*results)[i].endKey)));
}
@ -287,27 +252,24 @@ private:
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
debugOp("GetBlobRanges", keyRange, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->dbOps()->listBlobbifiedRanges(begin, end, 1000).eraseType();
[keyRange, results](auto ctx) {
fdb::Future f =
ctx->dbOps()->listBlobbifiedRanges(keyRange.beginKey, keyRange.endKey, 1000).eraseType();
ctx->continueAfter(f, [ctx, f, results]() {
*results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
ctx->done();
});
},
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetBlobRanges", begin, end, tenantId, fmt::format("complete with {0} ranges", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
[this, keyRange, tenantId, results, cont]() {
debugOp("GetBlobRanges", keyRange, tenantId, fmt::format("complete with {0} ranges", results->size()));
this->validateRanges(results, keyRange);
schedule(cont);
},
getTenant(tenantId),
@ -316,38 +278,214 @@ private:
// TODO: tenant support
void randomVerifyOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
debugOp("Verify", begin, end, tenantId, "starting");
debugOp("Verify", keyRange, tenantId, "starting");
auto verifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[begin, end, verifyVersion](auto ctx) {
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
[keyRange, verifyVersion](auto ctx) {
fdb::Future f = ctx->dbOps()
->verifyBlobRange(keyRange.beginKey, keyRange.endKey, -2 /* latest version*/)
.eraseType();
ctx->continueAfter(f, [ctx, verifyVersion, f]() {
*verifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
});
},
[this, begin, end, tenantId, verifyVersion, cont]() {
debugOp("Verify", begin, end, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
bool previousSuccess = seenReadSuccess(tenantId);
if (*verifyVersion == -1) {
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
}
[this, keyRange, tenantId, verifyVersion, cont]() {
debugOp("Verify", keyRange, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
void validateSnapshotData(std::shared_ptr<ITransactionContext> ctx,
fdb::native::FDBReadBlobGranuleContext& bgCtx,
fdb::GranuleFilePointer snapshotFile,
fdb::KeyRange keyRange) {
if (validatedFiles.contains(snapshotFile.filename)) {
return;
}
validatedFiles.insert(snapshotFile.filename);
int64_t snapshotLoadId = bgCtx.start_load_f((const char*)(snapshotFile.filename.data()),
snapshotFile.filename.size(),
snapshotFile.offset,
snapshotFile.length,
snapshotFile.fullFileLength,
bgCtx.userContext);
fdb::BytesRef snapshotData(bgCtx.get_load_f(snapshotLoadId, bgCtx.userContext), snapshotFile.length);
fdb::Result snapshotRes = ctx->tx().parseSnapshotFile(snapshotData);
auto out = fdb::Result::KeyValueRefArray{};
fdb::Error err = snapshotRes.getKeyValueArrayNothrow(out);
ASSERT(err.code() == error_code_success);
auto res = copyKeyValueArray(out);
bgCtx.free_load_f(snapshotLoadId, bgCtx.userContext);
ASSERT(res.second == false);
for (int i = 0; i < res.first.size(); i++) {
ASSERT(res.first[i].key >= keyRange.beginKey);
ASSERT(res.first[i].key < keyRange.endKey);
if (i > 0) {
ASSERT(res.first[i - 1].key < res.first[i].key);
}
// TODO add snapshot rows to map
}
}
void validateDeltaData(std::shared_ptr<ITransactionContext> ctx,
fdb::native::FDBReadBlobGranuleContext& bgCtx,
fdb::GranuleFilePointer deltaFile,
fdb::KeyRange keyRange,
int64_t& lastDFMaxVersion) {
if (validatedFiles.contains(deltaFile.filename)) {
return;
}
validatedFiles.insert(deltaFile.filename);
int64_t deltaLoadId = bgCtx.start_load_f((const char*)(deltaFile.filename.data()),
deltaFile.filename.size(),
deltaFile.offset,
deltaFile.length,
deltaFile.fullFileLength,
bgCtx.userContext);
fdb::BytesRef deltaData(bgCtx.get_load_f(deltaLoadId, bgCtx.userContext), deltaFile.length);
fdb::Result deltaRes = ctx->tx().parseDeltaFile(deltaData);
auto out = fdb::Result::GranuleMutationRefArray{};
fdb::Error err = deltaRes.getGranuleMutationArrayNothrow(out);
ASSERT(err.code() == error_code_success);
auto res = copyGranuleMutationArray(out);
bgCtx.free_load_f(deltaLoadId, bgCtx.userContext);
int64_t thisDFMaxVersion = 0;
for (int j = 0; j < res.size(); j++) {
fdb::GranuleMutation& m = res[j];
ASSERT(m.version > 0);
ASSERT(m.version > lastDFMaxVersion);
// mutations in delta files aren't necessarily in version order, so just validate ordering w.r.t
// previous file(s)
thisDFMaxVersion = std::max(thisDFMaxVersion, m.version);
ASSERT(m.type == 0 || m.type == 1);
ASSERT(keyRange.beginKey <= m.param1);
ASSERT(m.param1 < keyRange.endKey);
if (m.type == 1) {
ASSERT(keyRange.beginKey <= m.param2);
ASSERT(m.param2 <= keyRange.endKey);
}
}
lastDFMaxVersion = std::max(lastDFMaxVersion, thisDFMaxVersion);
// TODO have delta mutations update map
}
void validateBGDescriptionData(std::shared_ptr<ITransactionContext> ctx,
fdb::native::FDBReadBlobGranuleContext& bgCtx,
fdb::GranuleDescription desc,
fdb::KeyRange keyRange,
int64_t readVersion) {
ASSERT(desc.keyRange.beginKey < desc.keyRange.endKey);
// beginVersion of zero means snapshot present
// validate snapshot file
ASSERT(desc.snapshotFile.has_value());
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("Loading snapshot file {0}\n", fdb::toCharsRef(desc.snapshotFile->filename)));
}
validateSnapshotData(ctx, bgCtx, *desc.snapshotFile, desc.keyRange);
// validate delta files
int64_t lastDFMaxVersion = 0;
for (int i = 0; i < desc.deltaFiles.size(); i++) {
validateDeltaData(ctx, bgCtx, desc.deltaFiles[i], desc.keyRange, lastDFMaxVersion);
}
// validate memory mutations
int64_t lastVersion = 0;
for (int i = 0; i < desc.memoryMutations.size(); i++) {
fdb::GranuleMutation& m = desc.memoryMutations[i];
ASSERT(m.type == 0 || m.type == 1);
ASSERT(m.version > 0);
ASSERT(m.version >= lastVersion);
ASSERT(m.version <= readVersion);
lastVersion = m.version;
ASSERT(m.type == 0 || m.type == 1);
ASSERT(desc.keyRange.beginKey <= m.param1);
ASSERT(m.param1 < desc.keyRange.endKey);
if (m.type == 1) {
ASSERT(desc.keyRange.beginKey <= m.param2);
ASSERT(m.param2 <= desc.keyRange.endKey);
}
// TODO have delta mutations update map
}
// TODO: validate map against data store
}
void validateBlobGranuleDescriptions(std::shared_ptr<ITransactionContext> ctx,
std::vector<fdb::GranuleDescription> results,
fdb::KeyRange keyRange,
std::optional<int> tenantId,
int64_t readVersion) {
ASSERT(!results.empty());
if (tenantId) {
// FIXME: support tenants!!
info("Skipping validation because of tenant.");
return;
}
ASSERT(results.front().keyRange.beginKey <= keyRange.beginKey);
ASSERT(keyRange.endKey <= results.back().keyRange.endKey);
for (int i = 0; i < results.size() - 1; i++) {
ASSERT(results[i].keyRange.endKey == results[i + 1].keyRange.beginKey);
}
TesterGranuleContext testerContext(ctx->getBGBasePath());
fdb::native::FDBReadBlobGranuleContext bgCtx = createGranuleContext(&testerContext);
for (int i = 0; i < results.size(); i++) {
validateBGDescriptionData(ctx, bgCtx, results[i], keyRange, readVersion);
}
}
void randomReadDescription(TTaskFct cont, std::optional<int> tenantId) {
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::GranuleDescription>>();
auto readVersionOut = std::make_shared<int64_t>();
debugOp("ReadDesc", keyRange, tenantId, "starting");
execTransaction(
[this, keyRange, tenantId, results, readVersionOut](auto ctx) {
ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
int64_t* rvo = (int64_t*)readVersionOut.get();
fdb::Future f =
ctx->tx().readBlobGranulesDescription(keyRange.beginKey, keyRange.endKey, 0, -2, rvo).eraseType();
ctx->continueAfter(
f,
[this, ctx, keyRange, tenantId, results, readVersionOut, f]() {
*results = copyGranuleDescriptionArray(f.get<fdb::future_var::GranuleDescriptionRefArray>());
this->validateBlobGranuleDescriptions(ctx, *results, keyRange, tenantId, *readVersionOut);
ctx->done();
},
true);
},
[this, keyRange, tenantId, results, readVersionOut, cont]() {
debugOp("ReadDesc",
keyRange,
tenantId,
fmt::format("complete @ {0} with {1} granules", *readVersionOut, results->size()));
schedule(cont);
},
getTenant(tenantId));
}
void randomOperation(TTaskFct cont) override {
std::optional<int> tenantId = randomTenant();
@ -381,6 +519,9 @@ private:
case OP_VERIFY:
randomVerifyOp(cont, tenantId);
break;
case OP_READ_DESC:
randomReadDescription(cont, tenantId);
break;
}
}
};

View File

@ -112,6 +112,30 @@ GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefAr
return out;
};
GranuleDescriptionArray copyGranuleDescriptionArray(fdb::future_var::GranuleDescriptionRefArray::Type array) {
auto& [in_desc, in_count] = array;
GranuleDescriptionArray out;
for (int i = 0; i < in_count; ++i) {
fdb::native::FDBBGFileDescription nativeDesc = *in_desc++;
out.emplace_back(nativeDesc);
}
return out;
};
GranuleMutationArray copyGranuleMutationArray(fdb::future_var::GranuleMutationRefArray::Type array) {
auto& [in_mutations, in_count] = array;
GranuleMutationArray out;
for (int i = 0; i < in_count; ++i) {
fdb::native::FDBBGMutation nativeMutation = *in_mutations++;
out.emplace_back(nativeMutation);
}
return out;
};
TmpFile::~TmpFile() {
if (!filename.empty()) {
remove();

View File

@ -136,6 +136,12 @@ KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array);
using GranuleSummaryArray = std::vector<fdb::GranuleSummary>;
GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefArray::Type array);
using GranuleDescriptionArray = std::vector<fdb::GranuleDescription>;
GranuleDescriptionArray copyGranuleDescriptionArray(fdb::future_var::GranuleDescriptionRefArray::Type array);
using GranuleMutationArray = std::vector<fdb::GranuleMutation>;
GranuleMutationArray copyGranuleMutationArray(fdb::future_var::GranuleMutationRefArray::Type array);
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems");
// Converts a little-endian encoded number into an integral type.
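The comment above refers to a little-endian decoding helper; a typical standalone sketch of such a function (an assumption about its shape, not the repository's exact code):

#include <cstdint>
#include <cstring>
#include <type_traits>

// Decode sizeof(T) little-endian bytes into an integral value. On a little-endian host,
// which the static_assert above guarantees, this is just a memcpy.
template <typename T>
T fromLittleEndianBytes(const uint8_t* bytes) {
    static_assert(std::is_integral_v<T>, "integral types only");
    T value;
    std::memcpy(&value, bytes, sizeof(T));
    return value;
}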

View File

@ -22,5 +22,6 @@ maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -21,5 +21,6 @@ maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -14,5 +14,6 @@ maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,24 @@
[[test]]
title = 'Blob Granule API Tenant Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 1
maxClients = 8
minTenants = 1
maxTenants = 5
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -34,6 +34,8 @@
#include "SimpleOpt/SimpleOpt.h"
#include <thread>
#include <string_view>
#include <unordered_map>
#include "fdbclient/FDBOptions.g.h"
#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__))
#include <unistd.h>
@ -43,11 +45,6 @@
#error Unsupported platform
#endif
#undef ERROR
#define ERROR(name, number, description) enum { error_code_##name = number };
#include "flow/error_definitions.h"
#define API_VERSION_CLIENT_TMP_DIR 720
using namespace std::string_view_literals;
@ -59,17 +56,14 @@ enum TesterOptionId {
OPT_CONNFILE,
OPT_EXTERNAL_CLIENT_LIBRARY,
OPT_EXTERNAL_CLIENT_DIRECTORY,
OPT_DISABLE_LOCAL_CLIENT,
OPT_DISABLE_CLIENT_BYPASS,
OPT_API_VERSION,
OPT_TRANSACTION_TIMEOUT,
OPT_TRACE,
OPT_TRACE_DIR,
OPT_TMP_DIR,
OPT_IGNORE_EXTERNAL_CLIENT_FAILURES,
OPT_FAIL_INCOMPATIBLE_CLIENT,
OPT_EXPECTED_ERROR,
OPT_PRINT_STATUS
OPT_PRINT_STATUS,
OPT_NETWORK_OPTION
};
const int MIN_TESTABLE_API_VERSION = 400;
@ -81,17 +75,14 @@ CSimpleOpt::SOption TesterOptionDefs[] = //
{ OPT_CONNFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP },
{ OPT_EXTERNAL_CLIENT_DIRECTORY, "--external-client-dir", SO_REQ_SEP },
{ OPT_DISABLE_LOCAL_CLIENT, "--disable-local-client", SO_NONE },
{ OPT_DISABLE_CLIENT_BYPASS, "--disable-client-bypass", SO_NONE },
{ OPT_API_VERSION, "--api-version", SO_REQ_SEP },
{ OPT_TRANSACTION_TIMEOUT, "--transaction-timeout", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
{ OPT_TMP_DIR, "--tmp-dir", SO_REQ_SEP },
{ OPT_IGNORE_EXTERNAL_CLIENT_FAILURES, "--ignore-external-client-failures", SO_NONE },
{ OPT_FAIL_INCOMPATIBLE_CLIENT, "--fail-incompatible-client", SO_NONE },
{ OPT_EXPECTED_ERROR, "--expected-error", SO_REQ_SEP },
{ OPT_PRINT_STATUS, "--print-status", SO_NONE },
{ OPT_NETWORK_OPTION, "--network-option-", SO_REQ_SEP },
SO_END_OF_OPTIONS };
class TesterOptions {
@ -111,6 +102,7 @@ public:
bool failIncompatibleClient = false;
fdb::Error::CodeType expectedError = 0;
bool printStatus = false;
std::vector<std::pair<std::string, std::string>> networkOptions;
};
namespace {
@ -130,10 +122,6 @@ void printProgramUsage(const char* execName) {
" Path to the external client library.\n"
" --external-client-dir DIR\n"
" Directory containing external client libraries.\n"
" --disable-local-client\n"
" Disable the local client, i.e. use only external client libraries.\n"
" --disable-client-bypass\n"
" Disable bypassing Multi-Version Client when using the local client.\n"
" --api-version VERSION\n"
" Required FDB API version (default %d).\n"
" --transaction-timeout MILLISECONDS\n"
@ -144,14 +132,12 @@ void printProgramUsage(const char* execName) {
" no effect unless --log is specified.\n"
" --tmp-dir DIR\n"
" Directory for temporary files of the client.\n"
" --ignore-external-client-failures\n"
" Ignore failures to initialize external clients.\n"
" --fail-incompatible-client\n"
" Fail if there is no client matching the server version.\n"
" --expected-error ERR\n"
" FDB error code the test expected to fail with (default: 0).\n"
" --print-status\n"
" Print database client status.\n"
" --network-option-OPTIONNAME OPTIONVALUE\n"
" Changes a network option. OPTIONAME should be lowercase.\n"
" -h, --help Display this help and exit.\n",
FDB_API_VERSION);
}
@ -170,6 +156,19 @@ bool processIntOption(const std::string& optionName, const std::string& value, i
return true;
}
// Extracts the key for command line arguments that are specified with a prefix (e.g. --knob-).
// This function converts any hyphens in the extracted key to underscores.
bool extractPrefixedArgument(std::string prefix, const std::string& arg, std::string& res) {
if (arg.size() <= prefix.size() || arg.find(prefix) != 0 ||
(arg[prefix.size()] != '-' && arg[prefix.size()] != '_')) {
return false;
}
res = arg.substr(prefix.size() + 1);
std::transform(res.begin(), res.end(), res.begin(), [](int c) { return c == '-' ? '_' : c; });
return true;
}
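For instance (a usage sketch of the helper above, not a test from this change):

std::string key;
// With prefix "--network-option" and argument "--network-option-trace_format",
// key becomes "trace_format"; hyphens after the prefix would be rewritten as underscores.
bool ok = extractPrefixedArgument("--network-option", "--network-option-trace_format", key);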
bool processArg(const CSimpleOpt& args) {
switch (args.OptionId()) {
case OPT_CONNFILE:
@ -181,12 +180,6 @@ bool processArg(const CSimpleOpt& args) {
case OPT_EXTERNAL_CLIENT_DIRECTORY:
options.externalClientDir = args.OptionArg();
break;
case OPT_DISABLE_LOCAL_CLIENT:
options.disableLocalClient = true;
break;
case OPT_DISABLE_CLIENT_BYPASS:
options.disableClientBypass = true;
break;
case OPT_API_VERSION:
if (!processIntOption(
args.OptionText(), args.OptionArg(), MIN_TESTABLE_API_VERSION, FDB_API_VERSION, options.apiVersion)) {
@ -207,12 +200,6 @@ bool processArg(const CSimpleOpt& args) {
case OPT_TMP_DIR:
options.tmpDir = args.OptionArg();
break;
case OPT_IGNORE_EXTERNAL_CLIENT_FAILURES:
options.ignoreExternalClientFailures = true;
break;
case OPT_FAIL_INCOMPATIBLE_CLIENT:
options.failIncompatibleClient = true;
break;
case OPT_EXPECTED_ERROR:
if (!processIntOption(args.OptionText(), args.OptionArg(), 0, 10000, options.expectedError)) {
return false;
@ -221,6 +208,16 @@ bool processArg(const CSimpleOpt& args) {
case OPT_PRINT_STATUS:
options.printStatus = true;
break;
case OPT_NETWORK_OPTION: {
std::string optionName;
if (!extractPrefixedArgument("--network-option", args.OptionSyntax(), optionName)) {
fmt::print(stderr, "ERROR: unable to parse network option '{}'\n", args.OptionSyntax());
return false;
}
options.networkOptions.emplace_back(optionName, args.OptionArg());
break;
}
}
return true;
}
@ -272,6 +269,12 @@ void fdb_check(fdb::Error e, std::string_view msg) {
}
}
std::string stringToUpper(const std::string& str) {
std::string outStr(str);
std::transform(outStr.begin(), outStr.end(), outStr.begin(), [](char c) { return std::toupper(c); });
return outStr;
}
void applyNetworkOptions() {
if (!options.tmpDir.empty() && options.apiVersion >= API_VERSION_CLIENT_TMP_DIR) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_TMP_DIR, options.tmpDir);
@ -283,20 +286,21 @@ void applyNetworkOptions() {
if (!options.externalClientDir.empty()) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_DIRECTORY, options.externalClientDir);
}
if (options.disableLocalClient) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT);
}
if (options.trace) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir);
}
if (options.ignoreExternalClientFailures) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_IGNORE_EXTERNAL_CLIENT_FAILURES);
std::unordered_map<std::string, FDBNetworkOption> networkOptionsByName;
for (auto const& [optionCode, optionInfo] : FDBNetworkOptions::optionInfo) {
networkOptionsByName[optionInfo.name] = static_cast<FDBNetworkOption>(optionCode);
}
if (options.failIncompatibleClient) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_FAIL_INCOMPATIBLE_CLIENT);
}
if (options.disableClientBypass) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_CLIENT_BYPASS);
for (auto const& [optionName, optionVal] : options.networkOptions) {
auto iter = networkOptionsByName.find(stringToUpper(optionName));
if (iter == networkOptionsByName.end()) {
fmt::print(stderr, "Unknown network option {}\n", optionName);
}
fdb::network::setOption(iter->second, optionVal);
}
}

View File

@ -81,6 +81,61 @@ struct GranuleSummary {
}
};
struct GranuleFilePointer {
ByteString filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
GranuleFilePointer(const native::FDBBGFilePointer& nativePointer) {
filename = fdb::Key(nativePointer.filename_ptr, nativePointer.filename_length);
offset = nativePointer.file_offset;
length = nativePointer.file_length;
fullFileLength = nativePointer.full_file_length;
}
};
struct GranuleMutation {
native::FDBBGMutationType type;
int64_t version;
ByteString param1;
ByteString param2;
GranuleMutation(const native::FDBBGMutation& nativeMutation) {
type = static_cast<native::FDBBGMutationType>(nativeMutation.type);
version = nativeMutation.version;
param1 = ByteString(nativeMutation.param1_ptr, nativeMutation.param1_length);
param2 = ByteString(nativeMutation.param2_ptr, nativeMutation.param2_length);
}
};
struct GranuleDescription {
KeyRange keyRange;
std::optional<GranuleFilePointer> snapshotFile;
std::vector<GranuleFilePointer> deltaFiles;
std::vector<GranuleMutation> memoryMutations;
GranuleDescription(const native::FDBBGFileDescription& nativeDesc) {
keyRange.beginKey = fdb::Key(nativeDesc.key_range.begin_key, nativeDesc.key_range.begin_key_length);
keyRange.endKey = fdb::Key(nativeDesc.key_range.end_key, nativeDesc.key_range.end_key_length);
if (nativeDesc.snapshot_present) {
snapshotFile = GranuleFilePointer(nativeDesc.snapshot_file_pointer);
}
if (nativeDesc.delta_file_count > 0) {
deltaFiles.reserve(nativeDesc.delta_file_count);
for (int i = 0; i < nativeDesc.delta_file_count; i++) {
deltaFiles.emplace_back(nativeDesc.delta_files[i]);
}
}
if (nativeDesc.memory_mutation_count > 0) {
memoryMutations.reserve(nativeDesc.memory_mutation_count);
for (int i = 0; i < nativeDesc.memory_mutation_count; i++) {
memoryMutations.emplace_back(nativeDesc.memory_mutations[i]);
}
}
}
};
inline uint8_t const* toBytePtr(char const* ptr) noexcept {
return reinterpret_cast<uint8_t const*>(ptr);
}
@ -246,6 +301,42 @@ struct GranuleSummaryRefArray {
}
};
// fdb_future_readbg_get_descriptions
struct GranuleDescriptionRef : native::FDBBGFileDescription {
fdb::KeyRef beginKey() const noexcept {
return fdb::KeyRef(native::FDBBGFileDescription::key_range.begin_key,
native::FDBBGFileDescription::key_range.begin_key_length);
}
fdb::KeyRef endKey() const noexcept {
return fdb::KeyRef(native::FDBBGFileDescription::key_range.end_key,
native::FDBBGFileDescription::key_range.end_key_length);
}
};
struct GranuleDescriptionRefArray {
using Type = std::tuple<GranuleDescriptionRef*, int>;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
auto& [out_desc, out_count] = out;
auto err = native::fdb_future_readbg_get_descriptions(
f, reinterpret_cast<native::FDBBGFileDescription**>(&out_desc), &out_count);
return Error(err);
}
};
struct GranuleMutationRef : native::FDBBGMutation {
fdb::KeyRef param1() const noexcept {
return fdb::BytesRef(native::FDBBGMutation::param1_ptr, native::FDBBGMutation::param1_length);
}
fdb::KeyRef param2() const noexcept {
return fdb::BytesRef(native::FDBBGMutation::param2_ptr, native::FDBBGMutation::param2_length);
}
};
struct GranuleMutationRefArray {
using Type = std::tuple<GranuleMutationRef const*, int>;
};
} // namespace future_var
[[noreturn]] inline void throwError(std::string_view preamble, Error err) {
@ -335,6 +426,7 @@ class Result {
public:
using KeyValueRefArray = future_var::KeyValueRefArray::Type;
using GranuleMutationRefArray = future_var::GranuleMutationRefArray::Type;
Error getKeyValueArrayNothrow(KeyValueRefArray& out) const noexcept {
auto out_more_native = native::fdb_bool_t{};
@ -351,6 +443,20 @@ public:
throwError("ERROR: result_get_keyvalue_array(): ", err);
return ret;
}
Error getGranuleMutationArrayNothrow(GranuleMutationRefArray& out) const noexcept {
auto& [out_mutations, out_count] = out;
auto err_raw = native::fdb_result_get_bg_mutations_array(
r.get(), reinterpret_cast<const native::FDBBGMutation**>(&out_mutations), &out_count);
return Error(err_raw);
}
GranuleMutationRefArray getGranuleMutationArray() const {
auto ret = GranuleMutationRefArray{};
if (auto err = getGranuleMutationArrayNothrow(ret))
throwError("ERROR: result_get_keyvalue_array(): ", err);
return ret;
}
};
class Future {
@ -640,6 +746,29 @@ public:
return native::fdb_transaction_watch(tr.get(), key.data(), intSize(key));
}
TypedFuture<future_var::GranuleDescriptionRefArray> readBlobGranulesDescription(KeyRef begin,
KeyRef end,
int64_t beginVersion,
int64_t readVersion,
int64_t* readVersionOut) {
return native::fdb_transaction_read_blob_granules_description(tr.get(),
begin.data(),
intSize(begin),
end.data(),
intSize(end),
beginVersion,
readVersion,
readVersionOut);
}
Result parseSnapshotFile(BytesRef fileData) {
return Result(native::fdb_readbg_parse_snapshot_file(fileData.data(), intSize(fileData)));
}
Result parseDeltaFile(BytesRef fileData) {
return Result(native::fdb_readbg_parse_delta_file(fileData.data(), intSize(fileData)));
}
TypedFuture<future_var::None> commit() { return native::fdb_transaction_commit(tr.get()); }
TypedFuture<future_var::None> onError(Error err) { return native::fdb_transaction_on_error(tr.get(), err.code()); }

View File

@ -8,6 +8,7 @@ import os
import glob
import unittest
import json
import re
from threading import Thread
import time
@ -99,6 +100,9 @@ class ClientConfigTest:
self.expected_error = None
self.transaction_timeout = None
self.print_status = False
self.trace_file_identifier = None
self.trace_initialize_on_setup = False
self.trace_format = None
# ----------------------------
# Configuration methods
@ -208,6 +212,9 @@ class ClientConfigTest:
self.tc.assertTrue("Healthy" in self.status_json)
self.tc.assertEqual(expected_is_healthy, self.status_json["Healthy"])
def list_trace_files(self):
return glob.glob(os.path.join(self.log_dir, "*"))
# ----------------------------
# Executing the test
# ----------------------------
@ -222,10 +229,10 @@ class ClientConfigTest:
cmd_args += ["--log", "--log-dir", self.log_dir]
if self.disable_local_client:
cmd_args += ["--disable-local-client"]
cmd_args += ["--network-option-disable_local_client", ""]
if self.disable_client_bypass:
cmd_args += ["--disable-client-bypass"]
cmd_args += ["--network-option-disable_client_bypass", ""]
if self.external_lib_path is not None:
cmd_args += ["--external-client-library", self.external_lib_path]
@ -234,10 +241,19 @@ class ClientConfigTest:
cmd_args += ["--external-client-dir", self.external_lib_dir]
if self.ignore_external_client_failures:
cmd_args += ["--ignore-external-client-failures"]
cmd_args += ["--network-option-ignore_external_client_failures", ""]
if self.fail_incompatible_client:
cmd_args += ["--fail-incompatible-client"]
cmd_args += ["--network-option-fail_incompatible_client", ""]
if self.trace_file_identifier is not None:
cmd_args += ["--network-option-trace_file_identifier", self.trace_file_identifier]
if self.trace_initialize_on_setup:
cmd_args += ["--network-option-trace_initialize_on_setup", ""]
if self.trace_format is not None:
cmd_args += ["--network-option-trace_format", self.trace_format]
if self.api_version is not None:
cmd_args += ["--api-version", str(self.api_version)]
@ -252,26 +268,20 @@ class ClientConfigTest:
cmd_args += ["--print-status"]
print("\nExecuting test command: {}".format(" ".join([str(c) for c in cmd_args])), file=sys.stderr)
try:
tester_proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=sys.stderr)
out, _ = tester_proc.communicate()
self.tc.assertEqual(0, tester_proc.returncode)
if self.print_status:
# Parse the output as status json
try:
self.status_json = json.loads(out)
except json.JSONDecodeError as e:
print("Error '{}' parsing output {}".format(e, out.decode()), file=sys.stderr)
self.tc.assertIsNotNone(self.status_json)
print("Status: ", self.status_json, file=sys.stderr)
else:
# Otherwise redirect the output to the console
print(out.decode(), file=sys.stderr)
finally:
self.cleanup()
def cleanup(self):
shutil.rmtree(self.test_dir)
tester_proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=sys.stderr)
out, _ = tester_proc.communicate()
self.tc.assertEqual(0, tester_proc.returncode)
if self.print_status:
# Parse the output as status json
try:
self.status_json = json.loads(out)
except json.JSONDecodeError as e:
print("Error '{}' parsing output {}".format(e, out.decode()), file=sys.stderr)
self.tc.assertIsNotNone(self.status_json)
print("Status: ", self.status_json, file=sys.stderr)
else:
# Otherwise redirect the output to the console
print(out.decode(), file=sys.stderr)
class ClientConfigTests(unittest.TestCase):
@ -516,6 +526,171 @@ class ClientConfigSeparateCluster(unittest.TestCase):
self.cluster.tear_down()
# Test client-side tracing
class ClientTracingTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.cluster = TestCluster(CURRENT_VERSION)
cls.cluster.setup()
@classmethod
def tearDownClass(cls):
cls.cluster.tear_down()
def test_default_config_normal_case(self):
# Test trace files created with a default trace configuration
# in a normal case
test = self.test
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION])
test.api_version = api_version_from_str(PREV_RELEASE_VERSION)
test.disable_local_client = True
self.exec_test()
self.assertEqual(3, len(self.trace_files))
primary_trace = self.find_trace_file(with_ip=True)
self.find_and_check_event(primary_trace, "ClientStart", ["Machine"], [])
cur_ver_trace = self.find_trace_file(with_ip=True, version=CURRENT_VERSION, thread_idx=0)
self.find_and_check_event(cur_ver_trace, "ClientStart", ["Machine"], [])
prev_ver_trace = self.find_trace_file(with_ip=True, version=PREV_RELEASE_VERSION, thread_idx=0)
self.find_and_check_event(prev_ver_trace, "ClientStart", ["Machine"], [])
def test_default_config_error_case(self):
# Test that no trace files are created with a default configuration
# when a client fails to initialize
test = self.test
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.expected_error = 2204 # API function missing
self.exec_test()
self.assertEqual(0, len(self.trace_files))
def test_init_on_setup_normal_case(self):
# Test trace files created with trace_initialize_on_setup option
# in a normal case
test = self.test
test.create_external_lib_dir([CURRENT_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_initialize_on_setup = True
self.exec_test()
self.assertEqual(2, len(self.trace_files))
primary_trace = self.find_trace_file()
# The machine address will be available only in the second ClientStart event
self.find_and_check_event(primary_trace, "ClientStart", [], ["Machine"])
self.find_and_check_event(primary_trace, "ClientStart", ["Machine"], [], seqno=1)
cur_ver_trace = self.find_trace_file(version=CURRENT_VERSION, thread_idx=0)
self.find_and_check_event(cur_ver_trace, "ClientStart", [], ["Machine"])
self.find_and_check_event(cur_ver_trace, "ClientStart", ["Machine"], [], seqno=1)
def test_init_on_setup_trace_error_case(self):
# Test trace files created with trace_initialize_on_setup option
# when a client fails to initialize
test = self.test
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_initialize_on_setup = True
test.expected_error = 2204 # API function missing
self.exec_test()
self.assertEqual(1, len(self.trace_files))
primary_trace = self.find_trace_file()
self.find_and_check_event(primary_trace, "ClientStart", [], ["Machine"])
def test_trace_identifier(self):
# Test trace files created with file identifier
test = self.test
test.create_external_lib_dir([CURRENT_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_file_identifier = "fdbclient"
self.exec_test()
self.assertEqual(2, len(self.trace_files))
self.find_trace_file(with_ip=True, identifier="fdbclient")
self.find_trace_file(with_ip=True, identifier="fdbclient", version=CURRENT_VERSION, thread_idx=0)
def test_init_on_setup_and_trace_identifier(self):
# Test trace files created with trace_initialize_on_setup option
# and file identifier
test = self.test
test.create_external_lib_dir([CURRENT_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_initialize_on_setup = True
test.trace_file_identifier = "fdbclient"
self.exec_test()
self.assertEqual(2, len(self.trace_files))
self.find_trace_file(identifier="fdbclient")
self.find_trace_file(identifier="fdbclient", version=CURRENT_VERSION, thread_idx=0)
# ---------------
# Helper methods
# ---------------
def setUp(self):
self.test = ClientConfigTest(self)
self.trace_files = None
self.test.trace_format = "json"
def exec_test(self):
self.test.exec()
self.trace_files = self.test.list_trace_files()
if self.test.trace_format == "json":
self.load_trace_file_events()
def load_trace_file_events(self):
self.trace_file_events = {}
for trace in self.trace_files:
events = []
with open(trace, "r") as f:
for line in f:
events.append(json.loads(line))
self.trace_file_events[trace] = events
def find_trace_file(self, with_ip=False, identifier=None, version=None, thread_idx=None):
self.assertIsNotNone(self.trace_files)
for trace_file in self.trace_files:
name = os.path.basename(trace_file)
# trace prefix must be in all files
self.assertTrue(name.startswith("trace."))
pattern = "^trace\."
if with_ip:
pattern += "127\.0\.0\.1\."
else:
pattern += "0\.0\.0\.0\."
if identifier is not None:
pattern += identifier
else:
pattern += "\d+"
if version is not None:
pattern += "_v{}".format(version.replace(".", "_"))
if thread_idx is not None:
pattern += "t{}".format(thread_idx)
pattern += "\.\d+\.\w+\.\d+\.\d+\.{}$".format(self.test.trace_format)
if re.match(pattern, name):
return trace_file
self.fail("No maching trace file found")
def find_and_check_event(self, trace_file, event_type, attr_present, attr_missing, seqno=0):
self.assertTrue(trace_file in self.trace_file_events)
for event in self.trace_file_events[trace_file]:
if event["Type"] == event_type:
if seqno > 0:
seqno -= 1
continue
for attr in attr_present:
self.assertTrue(attr in event)
for attr in attr_missing:
self.assertFalse(attr in event)
return
self.fail("No matching event found")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python3
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from pathlib import Path
import platform
import shutil
import subprocess
import sys
@ -53,7 +52,7 @@ class TestEnv(LocalCluster):
self.downloader.binary_path(version, "fdbcli"),
1,
)
self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version))
self.set_env_var("LD_LIBRARY_PATH", "%s:%s" % (self.downloader.lib_dir(version), os.getenv("LD_LIBRARY_PATH")))
client_lib = self.downloader.lib_path(version)
assert client_lib.exists(), "{} does not exist".format(client_lib)
self.client_lib_external = self.tmp_dir.joinpath("libfdb_c_external.so")
@ -91,9 +90,8 @@ class FdbCShimTests:
self.api_test_dir = Path(args.api_test_dir).resolve()
assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir)
self.downloader = FdbBinaryDownloader(args.build_dir)
# binary downloads are currently available only for x86_64
self.platform = platform.machine()
if self.platform == "x86_64":
self.test_prev_versions = not args.disable_prev_version_tests
if self.test_prev_versions:
self.downloader.download_old_binaries(PREV_RELEASE_VERSION)
self.downloader.download_old_binaries("7.0.0")
@ -182,7 +180,8 @@ class FdbCShimTests:
if use_external_lib:
cmd_args = cmd_args + ["--disable-local-client", "--external-client-library", test_env.client_lib_external]
env_vars = os.environ.copy()
env_vars["LD_LIBRARY_PATH"] = self.downloader.lib_dir(version) if set_ld_lib_path else ""
if set_ld_lib_path:
env_vars["LD_LIBRARY_PATH"] = "%s:%s" % (self.downloader.lib_dir(version), os.getenv("LD_LIBRARY_PATH"))
if set_env_path:
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = (
"dummy" if invalid_lib_path else self.downloader.lib_path(version)
@ -230,8 +229,7 @@ class FdbCShimTests:
# Test calling a function that exists in the loaded library, but not for the selected API version
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True, api_version=700)
# binary downloads are currently available only for x86_64
if self.platform == "x86_64":
if self.test_prev_versions:
# Test the API workload with the release version
self.run_c_api_test(PREV_RELEASE_VERSION, DEFAULT_TEST_FILE)
@ -283,6 +281,12 @@ if __name__ == "__main__":
parser.add_argument(
"--api-test-dir", type=str, help="Path to a directory with api test definitions.", required=True
)
parser.add_argument(
"--disable-prev-version-tests",
action="store_true",
default=False,
help="Disable tests that need binaries of previous versions",
)
args = parser.parse_args()
test = FdbCShimTests(args)
test.run_tests()

View File

@ -87,7 +87,7 @@ void fdb_flow_test() {
g_network = newNet2(TLSConfig());
openTraceFile(NetworkAddress(), 1000000, 1000000, ".");
openTraceFile({}, 1000000, 1000000, ".");
systemMonitor();
uncancellable(recurring(&systemMonitor, 5.0, TaskPriority::FlushTrace));

View File

@ -107,6 +107,11 @@ func (o NetworkOptions) SetTraceShareAmongClientThreads() error {
return o.setOpt(37, nil)
}
// Initialize trace files on network setup, determine the local IP later. Otherwise tracing is initialized when opening the first database.
func (o NetworkOptions) SetTraceInitializeOnSetup() error {
return o.setOpt(38, nil)
}
// Set file suffix for partially written log files.
//
// Parameter: Append this suffix to partially written log files. When a log file is complete, it is renamed to remove the suffix. No separator is added between the file and the suffix. If you want to add a file extension, you should include the separator - e.g. '.tmp' instead of 'tmp' to add the 'tmp' extension.
@ -422,9 +427,11 @@ func (o DatabaseOptions) SetUseConfigDatabase() error {
return o.setOpt(800, nil)
}
// An integer between 0 and 100 (default is 0) expressing the probability that a client will verify it can't read stale data whenever it detects a recovery.
func (o DatabaseOptions) SetTestCausalReadRisky() error {
return o.setOpt(900, nil)
// Enables verification of causal read risky by checking whether clients are able to read stale data when they detect a recovery, and logging an error if so.
//
// Parameter: integer between 0 and 100 expressing the probability a client will verify it can't read stale data
func (o DatabaseOptions) SetTestCausalReadRisky(param int64) error {
return o.setOpt(900, int64ToBytes(param))
}
// The transaction, if not self-conflicting, may be committed a second time after commit succeeds, in the event of a fault

View File

@ -505,15 +505,27 @@ public class AsyncStackTester {
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_SET_ACTIVE) {
return inst.popParam().thenAcceptAsync(param -> {
return inst.popParam().thenComposeAsync(param -> {
byte[] tenantName = (byte[])param;
inst.context.setTenant(Optional.of(tenantName));
return inst.context.setTenant(Optional.of(tenantName)).thenAcceptAsync(id -> {
inst.push("SET_ACTIVE_TENANT".getBytes());
}, FDB.DEFAULT_EXECUTOR);
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_CLEAR_ACTIVE) {
inst.context.setTenant(Optional.empty());
return AsyncUtil.DONE;
}
else if (op == StackOperation.TENANT_GET_ID) {
if (inst.context.tenant.isPresent()) {
return inst.context.tenant.get().getId().thenAcceptAsync(id -> {
inst.push("GOT_TENANT_ID".getBytes());
}, FDB.DEFAULT_EXECUTOR);
} else {
inst.push("NO_ACTIVE_TENANT".getBytes());
return AsyncUtil.DONE;
}
}
else if (op == StackOperation.UNIT_TESTS) {
inst.context.db.options().setLocationCacheSize(100001);
return inst.context.db.runAsync(tr -> {

View File

@ -101,12 +101,14 @@ abstract class Context implements Runnable, AutoCloseable {
}
}
public synchronized void setTenant(Optional<byte[]> tenantName) {
public synchronized CompletableFuture<Long> setTenant(Optional<byte[]> tenantName) {
if (tenantName.isPresent()) {
tenant = Optional.of(tenantMap.computeIfAbsent(tenantName.get(), tn -> db.openTenant(tenantName.get())));
return tenant.get().getId();
}
else {
tenant = Optional.empty();
return CompletableFuture.completedFuture(-1L);
}
}

View File

@ -79,6 +79,7 @@ enum StackOperation {
TENANT_LIST,
TENANT_SET_ACTIVE,
TENANT_CLEAR_ACTIVE,
TENANT_GET_ID,
LOG_STACK
}

View File

@ -450,11 +450,20 @@ public class StackTester {
}
else if (op == StackOperation.TENANT_SET_ACTIVE) {
byte[] tenantName = (byte[])inst.popParam().join();
inst.context.setTenant(Optional.of(tenantName));
inst.context.setTenant(Optional.of(tenantName)).join();
inst.push("SET_ACTIVE_TENANT".getBytes());
}
else if (op == StackOperation.TENANT_CLEAR_ACTIVE) {
inst.context.setTenant(Optional.empty());
}
else if (op == StackOperation.TENANT_GET_ID) {
if (inst.context.tenant.isPresent()) {
inst.context.tenant.get().getId().join();
inst.push("GOT_TENANT_ID".getBytes());
} else {
inst.push("NO_ACTIVE_TENANT".getBytes());
}
}
else if (op == StackOperation.UNIT_TESTS) {
try {
inst.context.db.options().setLocationCacheSize(100001);

View File

@ -1713,6 +1713,9 @@ def init_c_api():
_capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_tenant_destroy.restype = None
_capi.fdb_tenant_get_id.argtypes = [ctypes.c_void_p]
_capi.fdb_tenant_get_id.restype = ctypes.c_void_p
_capi.fdb_tenant_create_transaction.argtypes = [
ctypes.c_void_p,
ctypes.POINTER(ctypes.c_void_p),

View File

@ -603,6 +603,8 @@ class Tester:
elif inst.op == six.u("TENANT_SET_ACTIVE"):
name = inst.pop()
self.tenant = self.db.open_tenant(name)
self.tenant.get_id().wait()
inst.push(b"SET_ACTIVE_TENANT")
elif inst.op == six.u("TENANT_CLEAR_ACTIVE"):
self.tenant = None
elif inst.op == six.u("TENANT_LIST"):
@ -618,6 +620,12 @@ class Tester:
except (json.decoder.JSONDecodeError, KeyError):
assert False, "Invalid Tenant Metadata"
inst.push(fdb.tuple.pack(tuple(result)))
elif inst.op == six.u("TENANT_GET_ID"):
if self.tenant is not None:
self.tenant.get_id().wait()
inst.push(b"GOT_TENANT_ID")
else:
inst.push(b"NO_ACTIVE_TENANT")
elif inst.op == six.u("UNIT_TESTS"):
try:
test_db_options(db)

View File

@ -380,8 +380,127 @@ function(create_valgrind_correctness_package)
endif()
endfunction()
function(prepare_binding_test_files build_directory target_name target_dependency)
add_custom_target(${target_name} DEPENDS ${target_dependency})
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:fdb_flow_tester> ${build_directory}/tests/flow/bin/fdb_flow_tester
COMMENT "Copy Flow tester for bindingtester")
set(generated_binding_files python/fdb/fdboptions.py)
if(WITH_JAVA_BINDING)
if(NOT FDB_RELEASE)
set(not_fdb_release_string "-SNAPSHOT")
else()
set(not_fdb_release_string "")
endif()
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/fdb-java-${FDB_VERSION}${not_fdb_release_string}.jar
${build_directory}/tests/java/foundationdb-client.jar
COMMENT "Copy Java bindings for bindingtester")
add_dependencies(${target_name} fat-jar)
add_dependencies(${target_name} foundationdb-tests)
set(generated_binding_files ${generated_binding_files} java/foundationdb-tests.jar)
endif()
if(WITH_GO_BINDING)
add_dependencies(${target_name} fdb_go_tester fdb_go)
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/go/bin/_stacktester ${build_directory}/tests/go/build/bin/_stacktester
COMMAND ${CMAKE_COMMAND} -E make_directory ${build_directory}/tests/go/src/fdb/
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bindings/go/src/github.com/apple/foundationdb/bindings/go/src/fdb/generated.go # SRC
${build_directory}/tests/go/src/fdb/ # DEST
COMMENT "Copy generated.go for bindingtester")
endif()
foreach(generated IN LISTS generated_binding_files)
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/${generated} ${build_directory}/tests/${generated}
COMMENT "Copy ${generated} to bindingtester")
endforeach()
endfunction(prepare_binding_test_files)
function(package_bindingtester2)
if (WIN32 OR OPEN_FOR_IDE)
message(WARNING "Binding tester is not built (WIN32/OPEN_FOR_IDE)")
return()
endif()
set(fdbcName "libfdb_c.so")
if (APPLE)
set(fdbcName "libfdb_c.dylib")
endif ()
set(touch_file ${CMAKE_BINARY_DIR}/bindingtester2.touch)
set(build_directory ${CMAKE_BINARY_DIR}/bindingtester2)
set(tests_directory ${build_directory}/tests)
add_custom_command(
OUTPUT ${touch_file}
COMMAND ${CMAKE_COMMAND} -E remove_directory ${build_directory}
COMMAND ${CMAKE_COMMAND} -E make_directory ${build_directory}
COMMAND ${CMAKE_COMMAND} -E remove_directory ${tests_directory}
COMMAND ${CMAKE_COMMAND} -E make_directory ${tests_directory}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bindings ${tests_directory}
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_BINARY_DIR}/bindingtester2.touch"
COMMENT "Setup scratch directory for bindingtester2")
set(joshua_directory ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts)
set(output_files
${build_directory}/joshua_test
${build_directory}/joshua_timeout
${build_directory}/fdbcli
${build_directory}/fdbserver
${build_directory}/${fdbcName}
)
add_custom_command(
OUTPUT ${output_files}
DEPENDS strip_only_fdbcli
strip_only_fdbserver
strip_only_fdb_c
${joshua_directory}/binding_test_start.sh
${joshua_directory}/binding_test_timeout.sh
${touch_file}
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/bin/fdbcli
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/lib/${fdbcName}
${build_directory}
COMMAND ${CMAKE_COMMAND} -E copy ${joshua_directory}/binding_test_start.sh ${build_directory}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${joshua_directory}/binding_test_timeout.sh ${build_directory}/joshua_timeout
COMMENT "Copy executables and scripts to bindingtester2 dir")
set(local_cluster_files ${build_directory}/local_cluster)
set(local_cluster_directory ${CMAKE_SOURCE_DIR}/contrib/local_cluster)
add_custom_command(
OUTPUT ${local_cluster_files}
COMMAND ${CMAKE_COMMAND} -E copy_directory
${local_cluster_directory}
${build_directory}
)
prepare_binding_test_files(${build_directory} copy_bindingtester2_test_files ${touch_file})
set(tar_file ${CMAKE_BINARY_DIR}/packages/bindingtester2-${FDB_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${touch_file} ${output_files} ${local_cluster_files} copy_bindingtester2_test_files
COMMAND ${CMAKE_COMMAND} -E tar czf ${tar_file} *
WORKING_DIRECTORY ${build_directory}
COMMENT "Pack bindingtester2"
)
add_custom_target(bindingtester2 ALL DEPENDS ${tar_file})
endfunction(package_bindingtester2)
function(package_bindingtester)
if(WIN32 OR OPEN_FOR_IDE)
message(WARNING "Binding tester is not built (WIN32/OPEN_FOR_IDE)")
return()
elseif(APPLE)
set(fdbcName "libfdb_c.dylib")
@ -403,7 +522,6 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/localClusterStart.sh ${bdir}/localClusterStart.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/bindingTestScript.sh ${bdir}/bindingTestScript.sh
COMMENT "Copy executables and scripts to bindingtester dir")
file(GLOB_RECURSE test_files ${CMAKE_SOURCE_DIR}/bindings/*)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/bindingtester.touch"
COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/bindingtester/tests
@ -412,60 +530,19 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_BINARY_DIR}/bindingtester.touch"
COMMENT "Copy test files for bindingtester")
add_custom_target(copy_binding_output_files DEPENDS ${CMAKE_BINARY_DIR}/bindingtester.touch python_binding fdb_flow_tester)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:fdb_flow_tester> ${bdir}/tests/flow/bin/fdb_flow_tester
COMMENT "Copy Flow tester for bindingtester")
set(generated_binding_files python/fdb/fdboptions.py)
if(WITH_JAVA_BINDING)
if(NOT FDB_RELEASE)
set(not_fdb_release_string "-SNAPSHOT")
else()
set(not_fdb_release_string "")
endif()
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/fdb-java-${FDB_VERSION}${not_fdb_release_string}.jar
${bdir}/tests/java/foundationdb-client.jar
COMMENT "Copy Java bindings for bindingtester")
add_dependencies(copy_binding_output_files fat-jar)
add_dependencies(copy_binding_output_files foundationdb-tests)
set(generated_binding_files ${generated_binding_files} java/foundationdb-tests.jar)
endif()
if(WITH_GO_BINDING AND NOT OPEN_FOR_IDE)
add_dependencies(copy_binding_output_files fdb_go_tester fdb_go)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/go/bin/_stacktester ${bdir}/tests/go/build/bin/_stacktester
COMMAND ${CMAKE_COMMAND} -E make_directory ${bdir}/tests/go/src/fdb/
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bindings/go/src/github.com/apple/foundationdb/bindings/go/src/fdb/generated.go # SRC
${bdir}/tests/go/src/fdb/ # DEST
COMMENT "Copy generated.go for bindingtester")
endif()
foreach(generated IN LISTS generated_binding_files)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/${generated} ${bdir}/tests/${generated}
COMMENT "Copy ${generated} to bindingtester")
endforeach()
prepare_binding_test_files(${bdir} copy_binding_output_files ${CMAKE_BINARY_DIR}/bindingtester.touch)
add_custom_target(copy_bindingtester_binaries
DEPENDS ${outfiles} "${CMAKE_BINARY_DIR}/bindingtester.touch" copy_binding_output_files)
add_dependencies(copy_bindingtester_binaries strip_only_fdbserver strip_only_fdbcli strip_only_fdb_c)
set(tar_file ${CMAKE_BINARY_DIR}/packages/bindingtester-${FDB_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
COMMAND ${CMAKE_COMMAND} -E tar czf ${tar_file} *
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bindingtester
COMMENT "Pack bindingtester")
add_custom_target(bindingtester ALL DEPENDS ${tar_file})
add_dependencies(bindingtester copy_bindingtester_binaries)
add_custom_target(bindingtester ALL DEPENDS ${tar_file} copy_bindingtester_binaries)
endfunction()
# Test for setting up Python venv for client tests.

View File

@ -1,4 +1,3 @@
# RPM specifics
if(CPACK_GENERATOR MATCHES "RPM")
set(CPACK_PACKAGING_INSTALL_PREFIX "/")
set(CPACK_COMPONENTS_ALL clients-el7 server-el7 clients-versioned server-versioned)

View File

@ -46,27 +46,17 @@ where `TARGET` can be any of
* aarch64-linux-gnu, aarch64-none-linux-android
* e2k-linux-gnu
Script generates two files: `libxyz.so.tramp.S` and `libxyz.so.init.c` which need to be linked to your application (instead of `-lxyz`):
Script generates two files: `libxyz.so.tramp.S` and `libxyz.so.init.cpp` which need to be linked to your application (instead of `-lxyz`):
```
$ gcc myfile1.c myfile2.c ... libxyz.so.tramp.S libxyz.so.init.c ... -ldl
$ gcc myfile1.c myfile2.c ... libxyz.so.tramp.S libxyz.so.init.cpp ... -ldl
```
Note that you need to link against libdl.so. On ARM, in case your app is compiled to Thumb code (which e.g. Ubuntu's `arm-linux-gnueabihf-gcc` does by default), you'll also need to add `-mthumb-interwork`.
The application can then freely call functions from `libxyz.so` _without linking to it_. The library will be loaded (via `dlopen`) on the first call to any of its functions. If you want to forcibly resolve all symbols (e.g. to avoid delays later on) you can call `void libxyz_init_all()`.
The above command would perform a _lazy load_, i.e. load the library on the first call to one of its symbols. If you want to load it at startup, run
```
$ implib-gen.py --no-lazy-load libxyz.so
```
If you don't want `dlopen` to be called automatically and prefer to load the library yourself at program startup, run the script as
```
$ implib-gen.py --no-dlopen libxyz.so
```
The above command would perform a _lazy load_, i.e. load the library on the first call to one of its symbols.
If you do want to load the library via `dlopen` but would prefer to call it yourself (e.g. with custom parameters or with a modified library name), run the script as
@ -100,10 +90,6 @@ $ implib-gen.py --dlopen-callback=mycallback libxyz.so
(callback must have signature `void *(*)(const char *lib_name)` and return handle of loaded library).
Finally to force library load and resolution of all symbols, call
void _LIBNAME_tramp_resolve_all(void);
# Wrapping vtables
By default the tool does not try to wrap vtables exported from the library. This can be enabled via `--vtables` flag:
@ -141,7 +127,7 @@ void *mycallback(const char *lib_name) {
}
$ implib-gen.py --dlopen-callback=mycallback --symbol-list=mysymbols.txt libxyz.so
$ ... # Link your app with libxyz.tramp.S, libxyz.init.c and mycallback.c
$ ... # Link your app with libxyz.tramp.S, libxyz.init.cpp and mycallback.c
```
A similar approach can be used if you want to provide a common interface for several libraries with partially intersecting interfaces (see [this example](tests/multilib/run.sh) for more details).
@ -156,7 +142,7 @@ To achieve this you can generate a wrapper with _renamed_ symbols which call to
$ cat mycallback.c
... Same as before ...
$ implib-gen.py --dlopen-callback=mycallback --symbol_prefix=MYPREFIX_ libxyz.so
$ ... # Link your app with libxyz.tramp.S, libxyz.init.c and mycallback.c
$ ... # Link your app with libxyz.tramp.S, libxyz.init.cpp and mycallback.c
```
# Linker wrapper

View File

@ -11,6 +11,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <mutex>
// Sanity check for ARM to avoid puzzling runtime crashes
#ifdef __arm__
@ -31,22 +32,15 @@ extern "C" {
} while(0)
#define CALL_USER_CALLBACK $has_dlopen_callback
#define NO_DLOPEN $no_dlopen
#define LAZY_LOAD $lazy_load
static void *lib_handle;
static int is_lib_loading;
static void *load_library() {
if(lib_handle)
return lib_handle;
is_lib_loading = 1;
// TODO: dlopen and users callback must be protected w/ critical section (to avoid dlopening lib twice)
#if NO_DLOPEN
CHECK(0, "internal error"); // We shouldn't get here
#elif CALL_USER_CALLBACK
#if CALL_USER_CALLBACK
extern void *$dlopen_callback(const char *lib_name);
lib_handle = $dlopen_callback("$load_name");
CHECK(lib_handle, "callback '$dlopen_callback' failed to load library");
@ -55,17 +49,9 @@ static void *load_library() {
CHECK(lib_handle, "failed to load library: %s", dlerror());
#endif
is_lib_loading = 0;
return lib_handle;
}
#if ! NO_DLOPEN && ! LAZY_LOAD
static void __attribute__((constructor)) load_lib() {
load_library();
}
#endif
static void __attribute__((destructor)) unload_lib() {
if(lib_handle)
dlclose(lib_handle);
@ -79,34 +65,35 @@ static const char *const sym_names[] = {
extern void *_${lib_suffix}_tramp_table[];
// Can be sped up by manually parsing library symtab...
void _${lib_suffix}_tramp_resolve(int i) {
assert((unsigned)i + 1 < sizeof(sym_names) / sizeof(sym_names[0]));
// Load library and resolve all symbols
static void load_and_resolve(void) {
static std::mutex load_mutex;
static int is_loaded = false;
CHECK(!is_lib_loading, "library function '%s' called during library load", sym_names[i]);
std::unique_lock<std::mutex> lock(load_mutex);
if (is_loaded)
return;
void *h = 0;
#if NO_DLOPEN
// FIXME: instead of RTLD_NEXT we should search for loaded lib_handle
// as in https://github.com/jethrogb/ssltrace/blob/bf17c150a7/ssltrace.cpp#L74-L112
h = RTLD_NEXT;
#elif LAZY_LOAD
h = load_library();
#else
h = lib_handle;
CHECK(h, "failed to resolve symbol '%s', library failed to load", sym_names[i]);
#endif
// Dlsym is thread-safe so don't need to protect it.
_${lib_suffix}_tramp_table[i] = dlsym(h, sym_names[i]);
CHECK(_${lib_suffix}_tramp_table[i], "failed to resolve symbol '%s'", sym_names[i]);
}
// Helper for user to resolve all symbols
void _${lib_suffix}_tramp_resolve_all(void) {
size_t i;
for(i = 0; i + 1 < sizeof(sym_names) / sizeof(sym_names[0]); ++i)
_${lib_suffix}_tramp_resolve(i);
// Resolving some of the symbols may fail. We ignore such failures because, if we are
// loading an older version of the library, it may lack certain functions.
_${lib_suffix}_tramp_table[i] = dlsym(h, sym_names[i]);
is_loaded = true;
}
// This function is called when the table entry for a symbol is not set.
// In that case we load the library and, if not already done, try to resolve all symbols.
// If the table entry is still missing, the symbol is not available in the loaded library,
// which is a fatal error: we immediately exit the process.
void _${lib_suffix}_tramp_resolve(int i) {
assert((unsigned)i + 1 < sizeof(sym_names) / sizeof(sym_names[0]));
load_and_resolve();
CHECK(_${lib_suffix}_tramp_table[i], "failed to resolve symbol '%s'", sym_names[i]);
}
#ifdef __cplusplus

View File

@ -22,532 +22,530 @@ import configparser
me = os.path.basename(__file__)
root = os.path.dirname(__file__)
def warn(msg):
"""Emits a nicely-decorated warning."""
sys.stderr.write(f'{me}: warning: {msg}\n')
"""Emits a nicely-decorated warning."""
sys.stderr.write(f"{me}: warning: {msg}\n")
def error(msg):
"""Emits a nicely-decorated error and exits."""
sys.stderr.write(f'{me}: error: {msg}\n')
sys.exit(1)
"""Emits a nicely-decorated error and exits."""
sys.stderr.write(f"{me}: error: {msg}\n")
sys.exit(1)
def run(args, stdin=""):
"""Runs external program and aborts on error."""
env = os.environ.copy()
# Force English language
env["LC_ALL"] = "c"
try:
del env["LANG"]
except KeyError:
pass
with subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) as p:
out, err = p.communicate(input=stdin.encode("utf-8"))
out = out.decode("utf-8")
err = err.decode("utf-8")
if p.returncode != 0 or err:
error(f"{args[0]} failed with retcode {p.returncode}:\n{err}")
return out, err
def run(args, stdin=''):
"""Runs external program and aborts on error."""
env = os.environ.copy()
# Force English language
env['LC_ALL'] = 'c'
try:
del env["LANG"]
except KeyError:
pass
with subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, env=env) as p:
out, err = p.communicate(input=stdin.encode('utf-8'))
out = out.decode('utf-8')
err = err.decode('utf-8')
if p.returncode != 0 or err:
error(f"{args[0]} failed with retcode {p.returncode}:\n{err}")
return out, err
def make_toc(words, renames=None):
"Make an mapping of words to their indices in list"
renames = renames or {}
toc = {}
for i, n in enumerate(words):
name = renames.get(n, n)
toc[i] = name
return toc
"Make an mapping of words to their indices in list"
renames = renames or {}
toc = {}
for i, n in enumerate(words):
name = renames.get(n, n)
toc[i] = name
return toc
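# Worked example (column names as emitted by readelf, purely illustrative):
#   make_toc(["Num", "Value", "Size"]) -> {0: "Num", 1: "Value", 2: "Size"}
#   make_toc(["Addr"], {"Addr": "Address"}) -> {0: "Address"}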
def parse_row(words, toc, hex_keys):
"Make a mapping from column names to values"
vals = {k: (words[i] if i < len(words) else '') for i, k in toc.items()}
for k in hex_keys:
if vals[k]:
vals[k] = int(vals[k], 16)
return vals
"Make a mapping from column names to values"
vals = {k: (words[i] if i < len(words) else "") for i, k in toc.items()}
for k in hex_keys:
if vals[k]:
vals[k] = int(vals[k], 16)
return vals
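# Worked example (made-up readelf row), assuming toc == {0: "Num", 1: "Value", 2: "Name"}:
#   parse_row(["1:", "0000a0", "foo"], toc, ["Value"]) -> {"Num": "1:", "Value": 160, "Name": "foo"}
# Columns missing from a short row come back as "", and hex keys are only converted when non-empty.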
def collect_syms(f):
"""Collect ELF dynamic symtab."""
"""Collect ELF dynamic symtab."""
# --dyn-syms does not always work for some reason so dump all symtabs
out, _ = run(['readelf', '-sW', f])
# --dyn-syms does not always work for some reason so dump all symtabs
out, _ = run(["readelf", "-sW", f])
toc = None
syms = []
syms_set = set()
for line in out.splitlines():
line = line.strip()
if not line:
# Next symtab
toc = None
continue
words = re.split(r' +', line)
if line.startswith('Num'): # Header?
if toc is not None:
error("multiple headers in output of readelf")
# Colons are different across readelf versions so get rid of them.
toc = make_toc(map(lambda n: n.replace(':', ''), words))
elif toc is not None:
sym = parse_row(words, toc, ['Value'])
name = sym['Name']
if name in syms_set:
continue
syms_set.add(name)
sym['Size'] = int(sym['Size'], 0) # Readelf is inconsistent on Size format
if '@' in name:
sym['Default'] = '@@' in name
name, ver = re.split(r'@+', name)
sym['Name'] = name
sym['Version'] = ver
else:
sym['Default'] = True
sym['Version'] = None
syms.append(sym)
toc = None
syms = []
syms_set = set()
for line in out.splitlines():
line = line.strip()
if not line:
# Next symtab
toc = None
continue
words = re.split(r" +", line)
if line.startswith("Num"): # Header?
if toc is not None:
error("multiple headers in output of readelf")
# Colons are different across readelf versions so get rid of them.
toc = make_toc(map(lambda n: n.replace(":", ""), words))
elif toc is not None:
sym = parse_row(words, toc, ["Value"])
name = sym["Name"]
if name in syms_set:
continue
syms_set.add(name)
sym["Size"] = int(sym["Size"], 0) # Readelf is inconistent on Size format
if "@" in name:
sym["Default"] = "@@" in name
name, ver = re.split(r"@+", name)
sym["Name"] = name
sym["Version"] = ver
else:
sym["Default"] = True
sym["Version"] = None
syms.append(sym)
if toc is None:
error(f"failed to analyze symbols in {f}")
if toc is None:
error(f"failed to analyze symbols in {f}")
# Also collected demangled names
if syms:
out, _ = run(['c++filt'], '\n'.join((sym['Name'] for sym in syms)))
for i, name in enumerate(out.split("\n")):
syms[i]['Demangled Name'] = name
# Also collected demangled names
if syms:
out, _ = run(["c++filt"], "\n".join((sym["Name"] for sym in syms)))
for i, name in enumerate(out.split("\n")):
syms[i]["Demangled Name"] = name
return syms
return syms
def collect_relocs(f):
"""Collect ELF dynamic relocs."""
"""Collect ELF dynamic relocs."""
out, _ = run(['readelf', '-rW', f])
out, _ = run(["readelf", "-rW", f])
toc = None
rels = []
for line in out.splitlines():
line = line.strip()
if not line:
toc = None
continue
if line == 'There are no relocations in this file.':
return []
if re.match(r'^\s*Offset', line): # Header?
if toc is not None:
error("multiple headers in output of readelf")
words = re.split(r'\s\s+', line) # "Symbol's Name + Addend"
toc = make_toc(words)
elif toc is not None:
line = re.sub(r' \+ ', '+', line)
words = re.split(r'\s+', line)
rel = parse_row(words, toc, ['Offset', 'Info'])
rels.append(rel)
# Split symbolic representation
sym_name = 'Symbol\'s Name + Addend'
if sym_name not in rel and 'Symbol\'s Name' in rel:
# Adapt to different versions of readelf
rel[sym_name] = rel['Symbol\'s Name'] + '+0'
if rel[sym_name]:
p = rel[sym_name].split('+')
if len(p) == 1:
p = ['', p[0]]
rel[sym_name] = (p[0], int(p[1], 16))
toc = None
rels = []
for line in out.splitlines():
line = line.strip()
if not line:
toc = None
continue
if line == "There are no relocations in this file.":
return []
if re.match(r"^\s*Offset", line): # Header?
if toc is not None:
error("multiple headers in output of readelf")
words = re.split(r"\s\s+", line) # "Symbol's Name + Addend"
toc = make_toc(words)
elif toc is not None:
line = re.sub(r" \+ ", "+", line)
words = re.split(r"\s+", line)
rel = parse_row(words, toc, ["Offset", "Info"])
rels.append(rel)
# Split symbolic representation
sym_name = "Symbol's Name + Addend"
if sym_name not in rel and "Symbol's Name" in rel:
# Adapt to different versions of readelf
rel[sym_name] = rel["Symbol's Name"] + "+0"
if rel[sym_name]:
p = rel[sym_name].split("+")
if len(p) == 1:
p = ["", p[0]]
rel[sym_name] = (p[0], int(p[1], 16))
if toc is None:
error(f"failed to analyze relocations in {f}")
if toc is None:
error(f"failed to analyze relocations in {f}")
return rels
return rels
def collect_sections(f):
"""Collect section info from ELF."""
"""Collect section info from ELF."""
out, _ = run(['readelf', '-SW', f])
out, _ = run(["readelf", "-SW", f])
toc = None
sections = []
for line in out.splitlines():
line = line.strip()
if not line:
continue
line = re.sub(r'\[\s+', '[', line)
words = re.split(r' +', line)
if line.startswith('[Nr]'): # Header?
if toc is not None:
error("multiple headers in output of readelf")
toc = make_toc(words, {'Addr' : 'Address'})
elif line.startswith('[') and toc is not None:
sec = parse_row(words, toc, ['Address', 'Off', 'Size'])
if 'A' in sec['Flg']: # Allocatable section?
sections.append(sec)
toc = None
sections = []
for line in out.splitlines():
line = line.strip()
if not line:
continue
line = re.sub(r"\[\s+", "[", line)
words = re.split(r" +", line)
if line.startswith("[Nr]"): # Header?
if toc is not None:
error("multiple headers in output of readelf")
toc = make_toc(words, {"Addr": "Address"})
elif line.startswith("[") and toc is not None:
sec = parse_row(words, toc, ["Address", "Off", "Size"])
if "A" in sec["Flg"]: # Allocatable section?
sections.append(sec)
if toc is None:
error(f"failed to analyze sections in {f}")
if toc is None:
error(f"failed to analyze sections in {f}")
return sections
return sections
def read_unrelocated_data(input_name, syms, secs):
"""Collect unrelocated data from ELF."""
data = {}
with open(input_name, 'rb') as f:
def is_symbol_in_section(sym, sec):
sec_end = sec['Address'] + sec['Size']
is_start_in_section = sec['Address'] <= sym['Value'] < sec_end
is_end_in_section = sym['Value'] + sym['Size'] <= sec_end
return is_start_in_section and is_end_in_section
for name, s in sorted(syms.items(), key=lambda s: s[1]['Value']):
# TODO: binary search (bisect)
sec = [sec for sec in secs if is_symbol_in_section(s, sec)]
if len(sec) != 1:
error(f"failed to locate section for interval [{s['Value']:x}, {s['Value'] + s['Size']:x})")
sec = sec[0]
f.seek(sec['Off'])
data[name] = f.read(s['Size'])
return data
"""Collect unrelocated data from ELF."""
data = {}
with open(input_name, "rb") as f:
def is_symbol_in_section(sym, sec):
sec_end = sec["Address"] + sec["Size"]
is_start_in_section = sec["Address"] <= sym["Value"] < sec_end
is_end_in_section = sym["Value"] + sym["Size"] <= sec_end
return is_start_in_section and is_end_in_section
for name, s in sorted(syms.items(), key=lambda s: s[1]["Value"]):
# TODO: binary search (bisect)
sec = [sec for sec in secs if is_symbol_in_section(s, sec)]
if len(sec) != 1:
error(f"failed to locate section for interval [{s['Value']:x}, {s['Value'] + s['Size']:x})")
sec = sec[0]
f.seek(sec["Off"])
data[name] = f.read(s["Size"])
return data
def collect_relocated_data(syms, bites, rels, ptr_size, reloc_types):
"""Identify relocations for each symbol"""
data = {}
for name, s in sorted(syms.items()):
b = bites.get(name)
assert b is not None
if s['Demangled Name'].startswith('typeinfo name'):
data[name] = [('byte', int(x)) for x in b]
continue
data[name] = []
for i in range(0, len(b), ptr_size):
val = int.from_bytes(b[i*ptr_size:(i + 1)*ptr_size], byteorder='little')
data[name].append(('offset', val))
start = s['Value']
finish = start + s['Size']
# TODO: binary search (bisect)
for rel in rels:
if rel['Type'] in reloc_types and start <= rel['Offset'] < finish:
i = (rel['Offset'] - start) // ptr_size
assert i < len(data[name])
data[name][i] = 'reloc', rel
return data
"""Identify relocations for each symbol"""
data = {}
for name, s in sorted(syms.items()):
b = bites.get(name)
assert b is not None
if s["Demangled Name"].startswith("typeinfo name"):
data[name] = [("byte", int(x)) for x in b]
continue
data[name] = []
for i in range(0, len(b), ptr_size):
val = int.from_bytes(b[i * ptr_size : (i + 1) * ptr_size], byteorder="little")
data[name].append(("offset", val))
start = s["Value"]
finish = start + s["Size"]
# TODO: binary search (bisect)
for rel in rels:
if rel["Type"] in reloc_types and start <= rel["Offset"] < finish:
i = (rel["Offset"] - start) // ptr_size
assert i < len(data[name])
data[name][i] = "reloc", rel
return data
def generate_vtables(cls_tables, cls_syms, cls_data):
"""Generate code for vtables"""
c_types = {
'reloc' : 'const void *',
'byte' : 'unsigned char',
'offset' : 'size_t'
}
"""Generate code for vtables"""
c_types = {"reloc": "const void *", "byte": "unsigned char", "offset": "size_t"}
ss = []
ss.append('''\
ss = []
ss.append(
"""\
#ifdef __cplusplus
extern "C" {
#endif
''')
"""
)
# Print externs
# Print externs
printed = set()
for name, data in sorted(cls_data.items()):
for typ, val in data:
if typ != 'reloc':
continue
sym_name, addend = val['Symbol\'s Name + Addend']
sym_name = re.sub(r'@.*', '', sym_name) # Can we pin version in C?
if sym_name not in cls_syms and sym_name not in printed:
ss.append(f'''\
printed = set()
for name, data in sorted(cls_data.items()):
for typ, val in data:
if typ != "reloc":
continue
sym_name, addend = val["Symbol's Name + Addend"]
sym_name = re.sub(r"@.*", "", sym_name) # Can we pin version in C?
if sym_name not in cls_syms and sym_name not in printed:
ss.append(
f"""\
extern const char {sym_name}[];
''')
"""
)
# Collect variable infos
# Collect variable infos
code_info = {}
code_info = {}
for name, s in sorted(cls_syms.items()):
data = cls_data[name]
if s['Demangled Name'].startswith('typeinfo name'):
declarator = 'const unsigned char %s[]'
else:
field_types = (f'{c_types[typ]} field_{i};' for i, (typ, _) in enumerate(data))
declarator = 'const struct { %s } %%s' % ' '.join(field_types) # pylint: disable=C0209 # consider-using-f-string
vals = []
for typ, val in data:
if typ != 'reloc':
vals.append(str(val) + 'UL')
else:
sym_name, addend = val['Symbol\'s Name + Addend']
sym_name = re.sub(r'@.*', '', sym_name) # Can we pin version in C?
vals.append(f'(const char *)&{sym_name} + {addend}')
code_info[name] = (declarator, '{ %s }' % ', '.join(vals)) # pylint: disable= C0209 # consider-using-f-string
for name, s in sorted(cls_syms.items()):
data = cls_data[name]
if s["Demangled Name"].startswith("typeinfo name"):
declarator = "const unsigned char %s[]"
else:
field_types = (f"{c_types[typ]} field_{i};" for i, (typ, _) in enumerate(data))
declarator = "const struct { %s } %%s" % " ".join(
field_types
) # pylint: disable=C0209 # consider-using-f-string
vals = []
for typ, val in data:
if typ != "reloc":
vals.append(str(val) + "UL")
else:
sym_name, addend = val["Symbol's Name + Addend"]
sym_name = re.sub(r"@.*", "", sym_name) # Can we pin version in C?
vals.append(f"(const char *)&{sym_name} + {addend}")
code_info[name] = (declarator, "{ %s }" % ", ".join(vals)) # pylint: disable= C0209 # consider-using-f-string
# Print declarations
# Print declarations
for name, (decl, _) in sorted(code_info.items()):
type_name = name + '_type'
type_decl = decl % type_name
ss.append(f'''\
for name, (decl, _) in sorted(code_info.items()):
type_name = name + "_type"
type_decl = decl % type_name
ss.append(
f"""\
typedef {type_decl};
extern __attribute__((weak)) {type_name} {name};
''')
"""
)
# Print definitions
# Print definitions
for name, (_, init) in sorted(code_info.items()):
type_name = name + '_type'
ss.append(f'''\
for name, (_, init) in sorted(code_info.items()):
type_name = name + "_type"
ss.append(
f"""\
const {type_name} {name} = {init};
''')
"""
)
ss.append('''\
ss.append(
"""\
#ifdef __cplusplus
} // extern "C"
#endif
''')
"""
)
return "".join(ss)
return ''.join(ss)
def main():
"""Driver function"""
parser = argparse.ArgumentParser(description="Generate wrappers for shared library functions.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""\
"""Driver function"""
parser = argparse.ArgumentParser(
description="Generate wrappers for shared library functions.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""\
Examples:
$ python3 {me} /usr/lib/x86_64-linux-gnu/libaccountsservice.so.0
Generating libaccountsservice.so.0.tramp.S...
Generating libaccountsservice.so.0.init.c...
""")
Generating libaccountsservice.so.0.init.cpp...
""",
)
parser.add_argument('library',
metavar='LIB',
help="Library to be wrapped.")
parser.add_argument('--verbose', '-v',
help="Print diagnostic info",
action='count',
default=0)
parser.add_argument('--dlopen-callback',
help="Call user-provided custom callback to load library instead of dlopen",
default='')
parser.add_argument('--dlopen',
help="Emit dlopen call (default)",
dest='dlopen', action='store_true', default=True)
parser.add_argument('--no-dlopen',
help="Do not emit dlopen call (user must load library himself)",
dest='dlopen', action='store_false')
parser.add_argument('--library-load-name',
help="Use custom name for dlopened library (default is LIB)")
parser.add_argument('--lazy-load',
help="Load library lazily on first call to one of it's functions (default)",
dest='lazy_load', action='store_true', default=True)
parser.add_argument('--no-lazy-load',
help="Load library eagerly at program start",
dest='lazy_load', action='store_false')
parser.add_argument('--vtables',
help="Intercept virtual tables (EXPERIMENTAL)",
dest='vtables', action='store_true', default=False)
parser.add_argument('--no-vtables',
help="Do not intercept virtual tables (default)",
dest='vtables', action='store_false')
parser.add_argument('--target',
help="Target platform triple e.g. x86_64-unknown-linux-gnu or arm-none-eabi "
"(atm x86_64, i[0-9]86, arm/armhf/armeabi, aarch64/armv8 "
"and e2k are supported)",
default=os.uname()[-1])
parser.add_argument('--symbol-list',
help="Path to file with symbols that should be present in wrapper "
"(all by default)")
parser.add_argument('--symbol-prefix',
metavar='PFX',
help="Prefix wrapper symbols with PFX",
default='')
parser.add_argument('-q', '--quiet',
help="Do not print progress info",
action='store_true')
parser.add_argument('--outdir', '-o',
help="Path to create wrapper at",
default='./')
parser.add_argument("library", metavar="LIB", help="Library to be wrapped.")
parser.add_argument("--verbose", "-v", help="Print diagnostic info", action="count", default=0)
parser.add_argument(
"--dlopen-callback", help="Call user-provided custom callback to load library instead of dlopen", default=""
)
parser.add_argument("--library-load-name", help="Use custom name for dlopened library (default is LIB)")
parser.add_argument(
"--vtables", help="Intercept virtual tables (EXPERIMENTAL)", dest="vtables", action="store_true", default=False
)
parser.add_argument(
"--no-vtables", help="Do not intercept virtual tables (default)", dest="vtables", action="store_false"
)
parser.add_argument(
"--target",
help="Target platform triple e.g. x86_64-unknown-linux-gnu or arm-none-eabi "
"(atm x86_64, i[0-9]86, arm/armhf/armeabi, aarch64/armv8 "
"and e2k are supported)",
default=os.uname()[-1],
)
parser.add_argument(
"--symbol-list", help="Path to file with symbols that should be present in wrapper " "(all by default)"
)
parser.add_argument("--symbol-prefix", metavar="PFX", help="Prefix wrapper symbols with PFX", default="")
parser.add_argument("-q", "--quiet", help="Do not print progress info", action="store_true")
parser.add_argument("--outdir", "-o", help="Path to create wrapper at", default="./")
args = parser.parse_args()
args = parser.parse_args()
input_name = args.library
verbose = args.verbose
dlopen_callback = args.dlopen_callback
dlopen = args.dlopen
lazy_load = args.lazy_load
load_name = args.library_load_name or os.path.basename(input_name)
if args.target.startswith('arm'):
target = 'arm' # Handle armhf-..., armel-...
elif re.match(r'^i[0-9]86', args.target):
target = 'i386'
else:
target = args.target.split('-')[0]
quiet = args.quiet
outdir = args.outdir
input_name = args.library
verbose = args.verbose
dlopen_callback = args.dlopen_callback
load_name = args.library_load_name or os.path.basename(input_name)
if args.target.startswith("arm"):
target = "arm" # Handle armhf-..., armel-...
elif re.match(r"^i[0-9]86", args.target):
target = "i386"
else:
target = args.target.split("-")[0]
quiet = args.quiet
outdir = args.outdir
if args.symbol_list is None:
funs = None
else:
with open(args.symbol_list, 'r') as f:
funs = []
for line in re.split(r'\r?\n', f.read()):
line = re.sub(r'#.*', '', line)
line = line.strip()
if line:
funs.append(line)
if args.symbol_list is None:
funs = None
else:
with open(args.symbol_list, "r") as f:
funs = []
for line in re.split(r"\r?\n", f.read()):
line = re.sub(r"#.*", "", line)
line = line.strip()
if line:
funs.append(line)
# Collect target info
# Collect target info
target_dir = os.path.join(root, 'arch', target)
target_dir = os.path.join(root, "arch", target)
if not os.path.exists(target_dir):
error(f"unknown architecture '{target}'")
if not os.path.exists(target_dir):
error(f"unknown architecture '{target}'")
cfg = configparser.ConfigParser(inline_comment_prefixes=';')
cfg.read(target_dir + '/config.ini')
cfg = configparser.ConfigParser(inline_comment_prefixes=";")
cfg.read(target_dir + "/config.ini")
ptr_size = int(cfg['Arch']['PointerSize'])
symbol_reloc_types = set(re.split(r'\s*,\s*', cfg['Arch']['SymbolReloc']))
ptr_size = int(cfg["Arch"]["PointerSize"])
symbol_reloc_types = set(re.split(r"\s*,\s*", cfg["Arch"]["SymbolReloc"]))
def is_exported(s):
return (s['Bind'] != 'LOCAL'
and s['Type'] != 'NOTYPE'
and s['Ndx'] != 'UND'
and s['Name'] not in ['', '_init', '_fini'])
def is_exported(s):
return (
s["Bind"] != "LOCAL"
and s["Type"] != "NOTYPE"
and s["Ndx"] != "UND"
and s["Name"] not in ["", "_init", "_fini"]
)
syms = list(filter(is_exported, collect_syms(input_name)))
syms = list(filter(is_exported, collect_syms(input_name)))
def is_data_symbol(s):
return (s['Type'] == 'OBJECT'
def is_data_symbol(s):
return (
s["Type"] == "OBJECT"
# Allow vtables if --vtables is on
and not (' for ' in s['Demangled Name'] and args.vtables))
and not (" for " in s["Demangled Name"] and args.vtables)
)
exported_data = [s['Name'] for s in syms if is_data_symbol(s)]
if exported_data:
# TODO: we can generate wrappers for const data without relocations (or only code relocations)
warn(f"library '{input_name}' contains data symbols which won't be intercepted: "
+ ', '.join(exported_data))
exported_data = [s["Name"] for s in syms if is_data_symbol(s)]
if exported_data:
# TODO: we can generate wrappers for const data without relocations (or only code relocations)
warn(f"library '{input_name}' contains data symbols which won't be intercepted: " + ", ".join(exported_data))
# Collect functions
# TODO: warn if user-specified functions are missing
# Collect functions
# TODO: warn if user-specified functions are missing
orig_funs = filter(lambda s: s['Type'] == 'FUNC', syms)
orig_funs = filter(lambda s: s["Type"] == "FUNC", syms)
all_funs = set()
warn_versioned = False
for s in orig_funs:
if s['Version'] is not None:
# TODO: support versions
if not warn_versioned:
warn(f"library {input_name} contains versioned symbols which are NYI")
warn_versioned = True
if verbose:
print(f"Skipping versioned symbol {s['Name']}")
continue
all_funs.add(s['Name'])
all_funs = set()
warn_versioned = False
for s in orig_funs:
if s["Version"] is not None:
# TODO: support versions
if not warn_versioned:
warn(f"library {input_name} contains versioned symbols which are NYI")
warn_versioned = True
if verbose:
print(f"Skipping versioned symbol {s['Name']}")
continue
all_funs.add(s["Name"])
if funs is None:
funs = sorted(list(all_funs))
if not funs and not quiet:
warn(f"no public functions were found in {input_name}")
else:
missing_funs = [name for name in funs if name not in all_funs]
if missing_funs:
warn("some user-specified functions are not present in library: " + ', '.join(missing_funs))
funs = [name for name in funs if name in all_funs]
if verbose:
print("Exported functions:")
for i, fun in enumerate(funs):
print(f" {i}: {fun}")
# Collect vtables
if args.vtables:
cls_tables = {}
cls_syms = {}
for s in syms:
m = re.match(r'^(vtable|typeinfo|typeinfo name) for (.*)', s['Demangled Name'])
if m is not None and is_exported(s):
typ, cls = m.groups()
name = s['Name']
cls_tables.setdefault(cls, {})[typ] = name
cls_syms[name] = s
if funs is None:
funs = sorted(list(all_funs))
if not funs and not quiet:
warn(f"no public functions were found in {input_name}")
else:
missing_funs = [name for name in funs if name not in all_funs]
if missing_funs:
warn("some user-specified functions are not present in library: " + ", ".join(missing_funs))
funs = [name for name in funs if name in all_funs]
if verbose:
print("Exported classes:")
for cls, _ in sorted(cls_tables.items()):
print(f" {cls}")
print("Exported functions:")
for i, fun in enumerate(funs):
print(f" {i}: {fun}")
secs = collect_sections(input_name)
if verbose:
print("Sections:")
for sec in secs:
print(f" {sec['Name']}: [{sec['Address']:x}, {sec['Address'] + sec['Size']:x}), "
f"at {sec['Off']:x}")
# Collect vtables
bites = read_unrelocated_data(input_name, cls_syms, secs)
rels = collect_relocs(input_name)
if verbose:
print("Relocs:")
for rel in rels:
sym_add = rel['Symbol\'s Name + Addend']
print(f" {rel['Offset']}: {sym_add}")
cls_data = collect_relocated_data(cls_syms, bites, rels, ptr_size, symbol_reloc_types)
if verbose:
print("Class data:")
for name, data in sorted(cls_data.items()):
demangled_name = cls_syms[name]['Demangled Name']
print(f" {name} ({demangled_name}):")
for typ, val in data:
print(" " + str(val if typ != 'reloc' else val['Symbol\'s Name + Addend']))
# Generate assembly code
suffix = os.path.basename(load_name)
lib_suffix = re.sub(r'[^a-zA-Z_0-9]+', '_', suffix)
tramp_file = f'{suffix}.tramp.S'
with open(os.path.join(outdir, tramp_file), 'w') as f:
if not quiet:
print(f"Generating {tramp_file}...")
with open(target_dir + '/table.S.tpl', 'r') as t:
table_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix,
table_size=ptr_size*(len(funs) + 1))
f.write(table_text)
with open(target_dir + '/trampoline.S.tpl', 'r') as t:
tramp_tpl = string.Template(t.read())
for i, name in enumerate(funs):
tramp_text = tramp_tpl.substitute(
lib_suffix=lib_suffix,
sym=args.symbol_prefix + name,
offset=i*ptr_size,
number=i)
f.write(tramp_text)
# Generate C code
init_file = f'{suffix}.init.c'
with open(os.path.join(outdir, init_file), 'w') as f:
if not quiet:
print(f"Generating {init_file}...")
with open(os.path.join(root, 'arch/common/init.c.tpl'), 'r') as t:
if funs:
sym_names = ',\n '.join(f'"{name}"' for name in funs) + ','
else:
sym_names = ''
init_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix,
load_name=load_name,
dlopen_callback=dlopen_callback,
has_dlopen_callback=int(bool(dlopen_callback)),
no_dlopen=not int(dlopen),
lazy_load=int(lazy_load),
sym_names=sym_names)
f.write(init_text)
if args.vtables:
vtable_text = generate_vtables(cls_tables, cls_syms, cls_data)
f.write(vtable_text)
cls_tables = {}
cls_syms = {}
if __name__ == '__main__':
main()
for s in syms:
m = re.match(r"^(vtable|typeinfo|typeinfo name) for (.*)", s["Demangled Name"])
if m is not None and is_exported(s):
typ, cls = m.groups()
name = s["Name"]
cls_tables.setdefault(cls, {})[typ] = name
cls_syms[name] = s
if verbose:
print("Exported classes:")
for cls, _ in sorted(cls_tables.items()):
print(f" {cls}")
secs = collect_sections(input_name)
if verbose:
print("Sections:")
for sec in secs:
print(f" {sec['Name']}: [{sec['Address']:x}, {sec['Address'] + sec['Size']:x}), " f"at {sec['Off']:x}")
bites = read_unrelocated_data(input_name, cls_syms, secs)
rels = collect_relocs(input_name)
if verbose:
print("Relocs:")
for rel in rels:
sym_add = rel["Symbol's Name + Addend"]
print(f" {rel['Offset']}: {sym_add}")
cls_data = collect_relocated_data(cls_syms, bites, rels, ptr_size, symbol_reloc_types)
if verbose:
print("Class data:")
for name, data in sorted(cls_data.items()):
demangled_name = cls_syms[name]["Demangled Name"]
print(f" {name} ({demangled_name}):")
for typ, val in data:
print(" " + str(val if typ != "reloc" else val["Symbol's Name + Addend"]))
# Generate assembly code
suffix = os.path.basename(load_name)
lib_suffix = re.sub(r"[^a-zA-Z_0-9]+", "_", suffix)
tramp_file = f"{suffix}.tramp.S"
with open(os.path.join(outdir, tramp_file), "w") as f:
if not quiet:
print(f"Generating {tramp_file}...")
with open(target_dir + "/table.S.tpl", "r") as t:
table_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix, table_size=ptr_size * (len(funs) + 1)
)
f.write(table_text)
with open(target_dir + "/trampoline.S.tpl", "r") as t:
tramp_tpl = string.Template(t.read())
for i, name in enumerate(funs):
tramp_text = tramp_tpl.substitute(
lib_suffix=lib_suffix, sym=args.symbol_prefix + name, offset=i * ptr_size, number=i
)
f.write(tramp_text)
# Generate C code
init_file = f"{suffix}.init.cpp"
with open(os.path.join(outdir, init_file), "w") as f:
if not quiet:
print(f"Generating {init_file}...")
with open(os.path.join(root, "arch/common/init.cpp.tpl"), "r") as t:
if funs:
sym_names = ",\n ".join(f'"{name}"' for name in funs) + ","
else:
sym_names = ""
init_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix,
load_name=load_name,
dlopen_callback=dlopen_callback,
has_dlopen_callback=int(bool(dlopen_callback)),
sym_names=sym_names,
)
f.write(init_text)
if args.vtables:
vtable_text = generate_vtables(cls_tables, cls_syms, cls_data)
f.write(vtable_text)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,7 @@
#! /usr/bin/env bash
set -e
set -o pipefail
# It is necessary to tee to output.log in case a timeout happens
python3 ./binding_test.py --stop-at-failure 10 --fdbserver-path $(pwd)/fdbserver --fdbcli-path $(pwd)/fdbcli --libfdb-path $(pwd) --num-ops 1000 --num-hca-ops 100 --concurrency 5 --test-timeout 60 --random | tee output.log

View File

@ -0,0 +1,4 @@
#! /usr/bin/env bash
echo "Binding test timed out"
cat output.log

0
contrib/Joshua/scripts/localClusterStart.sh Normal file → Executable file
View File

View File

@ -60,7 +60,7 @@ class StatFetcher:
class TestPicker:
def __init__(self, test_dir: Path):
if not test_dir.exists():
raise RuntimeError('{} is neither a directory nor a file'.format(test_dir))
raise RuntimeError("{} is neither a directory nor a file".format(test_dir))
self.include_files_regex = re.compile(config.include_test_files)
self.exclude_files_regex = re.compile(config.exclude_test_files)
self.include_tests_regex = re.compile(config.include_test_classes)
@ -78,6 +78,7 @@ class TestPicker:
self.stat_fetcher = StatFetcher(self.tests)
else:
from test_harness.fdb import FDBStatFetcher
self.stat_fetcher = FDBStatFetcher(self.tests)
if config.stats is not None:
self.load_stats(config.stats)
@ -106,50 +107,60 @@ class TestPicker:
break
assert test_name is not None and test_desc is not None
self.stat_fetcher.add_run_time(test_name, run_time, out)
out.attributes['TotalTestTime'] = str(test_desc.total_runtime)
out.attributes['TestRunCount'] = str(test_desc.num_runs)
out.attributes["TotalTestTime"] = str(test_desc.total_runtime)
out.attributes["TestRunCount"] = str(test_desc.num_runs)
def dump_stats(self) -> str:
res = array.array('I')
res = array.array("I")
for _, spec in self.tests.items():
res.append(spec.total_runtime)
return base64.standard_b64encode(res.tobytes()).decode('utf-8')
return base64.standard_b64encode(res.tobytes()).decode("utf-8")
def fetch_stats(self):
self.stat_fetcher.read_stats()
def load_stats(self, serialized: str):
times = array.array('I')
times = array.array("I")
times.frombytes(base64.standard_b64decode(serialized))
assert len(times) == len(self.tests.items())
for idx, (_, spec) in enumerate(self.tests.items()):
spec.total_runtime = times[idx]
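# Minimal sketch of the round trip used by dump_stats/load_stats (values are made up):
#   encoded = base64.standard_b64encode(array.array("I", [12, 340]).tobytes()).decode("utf-8")
#   decoded = array.array("I")
#   decoded.frombytes(base64.standard_b64decode(encoded))  # -> array('I', [12, 340])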
def parse_txt(self, path: Path):
if self.include_files_regex.search(str(path)) is None or self.exclude_files_regex.search(str(path)) is not None:
if (
self.include_files_regex.search(str(path)) is None
or self.exclude_files_regex.search(str(path)) is not None
):
return
with path.open('r') as f:
with path.open("r") as f:
test_name: str | None = None
test_class: str | None = None
priority: float | None = None
for line in f:
line = line.strip()
kv = line.split('=')
kv = line.split("=")
if len(kv) != 2:
continue
kv[0] = kv[0].strip()
kv[1] = kv[1].strip(' \r\n\t\'"')
if kv[0] == 'testTitle' and test_name is None:
kv[1] = kv[1].strip(" \r\n\t'\"")
if kv[0] == "testTitle" and test_name is None:
test_name = kv[1]
if kv[0] == 'testClass' and test_class is None:
if kv[0] == "testClass" and test_class is None:
test_class = kv[1]
if kv[0] == 'testPriority' and priority is None:
if kv[0] == "testPriority" and priority is None:
try:
priority = float(kv[1])
except ValueError:
raise RuntimeError("Can't parse {} -- testPriority in {} should be set to a float".format(kv[1],
path))
if test_name is not None and test_class is not None and priority is not None:
raise RuntimeError(
"Can't parse {} -- testPriority in {} should be set to a float".format(
kv[1], path
)
)
if (
test_name is not None
and test_class is not None
and priority is not None
):
break
if test_name is None:
return
@ -157,8 +168,10 @@ class TestPicker:
test_class = test_name
if priority is None:
priority = 1.0
if self.include_tests_regex.search(test_class) is None \
or self.exclude_tests_regex.search(test_class) is not None:
if (
self.include_tests_regex.search(test_class) is None
or self.exclude_tests_regex.search(test_class) is not None
):
return
if test_class not in self.tests:
self.tests[test_class] = TestDescription(path, test_class, priority)
@ -173,12 +186,12 @@ class TestPicker:
# check whether we're looking at a restart test
if self.follow_test.match(test.name) is not None:
return
if test.suffix == '.txt' or test.suffix == '.toml':
if test.suffix == ".txt" or test.suffix == ".toml":
self.parse_txt(test)
@staticmethod
def list_restart_files(start_file: Path) -> List[Path]:
name = re.sub(r'-\d+.(txt|toml)', '', start_file.name)
name = re.sub(r"-\d+.(txt|toml)", "", start_file.name)
res: List[Path] = []
for test_file in start_file.parent.iterdir():
if test_file.name.startswith(name):
@ -209,12 +222,12 @@ class TestPicker:
class OldBinaries:
def __init__(self):
self.first_file_expr = re.compile(r'.*-1\.(txt|toml)')
self.first_file_expr = re.compile(r".*-1\.(txt|toml)")
self.old_binaries_path: Path = config.old_binaries_path
self.binaries: OrderedDict[Version, Path] = collections.OrderedDict()
if not self.old_binaries_path.exists() or not self.old_binaries_path.is_dir():
return
exec_pattern = re.compile(r'fdbserver-\d+\.\d+\.\d+(\.exe)?')
exec_pattern = re.compile(r"fdbserver-\d+\.\d+\.\d+(\.exe)?")
for file in self.old_binaries_path.iterdir():
if not file.is_file() or not os.access(file, os.X_OK):
continue
@ -222,9 +235,9 @@ class OldBinaries:
self._add_file(file)
def _add_file(self, file: Path):
version_str = file.name.split('-')[1]
if version_str.endswith('.exe'):
version_str = version_str[0:-len('.exe')]
version_str = file.name.split("-")[1]
if version_str.endswith(".exe"):
version_str = version_str[0 : -len(".exe")]
ver = Version.parse(version_str)
self.binaries[ver] = file
@ -232,21 +245,21 @@ class OldBinaries:
if len(self.binaries) == 0:
return config.binary
max_version = Version.max_version()
min_version = Version.parse('5.0.0')
min_version = Version.parse("5.0.0")
dirs = test_file.parent.parts
if 'restarting' not in dirs:
if "restarting" not in dirs:
return config.binary
version_expr = dirs[-1].split('_')
version_expr = dirs[-1].split("_")
first_file = self.first_file_expr.match(test_file.name) is not None
if first_file and version_expr[0] == 'to':
if first_file and version_expr[0] == "to":
# downgrade test -- first binary should be current one
return config.binary
if not first_file and version_expr[0] == 'from':
if not first_file and version_expr[0] == "from":
# upgrade test -- we only return an old version for the first test file
return config.binary
if version_expr[0] == 'from' or version_expr[0] == 'to':
if version_expr[0] == "from" or version_expr[0] == "to":
min_version = Version.parse(version_expr[1])
if len(version_expr) == 4 and version_expr[2] == 'until':
if len(version_expr) == 4 and version_expr[2] == "until":
max_version = Version.parse(version_expr[3])
candidates: List[Path] = []
for ver, binary in self.binaries.items():
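For orientation, a small sketch (separate from the harness) of how a restarting-test directory name such as "from_7.1.0_until_7.3.0" maps to the version window computed in choose_binary above; the directory name and versions are illustrative:
# illustrative only: mirrors the version_expr parsing in OldBinaries.choose_binary
dir_name = "from_7.1.0_until_7.3.0"
version_expr = dir_name.split("_")   # ["from", "7.1.0", "until", "7.3.0"]
min_version = "5.0.0"
max_version = None                   # no upper bound unless "until" is present
if version_expr[0] in ("from", "to"):
    min_version = version_expr[1]
if len(version_expr) == 4 and version_expr[2] == "until":
    max_version = version_expr[3]
print(min_version, max_version)      # 7.1.0 7.3.0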
@ -259,13 +272,13 @@ class OldBinaries:
def is_restarting_test(test_file: Path):
for p in test_file.parts:
if p == 'restarting':
if p == "restarting":
return True
return False
def is_no_sim(test_file: Path):
return test_file.parts[-2] == 'noSim'
return test_file.parts[-2] == "noSim"
class ResourceMonitor(threading.Thread):
@ -291,9 +304,19 @@ class ResourceMonitor(threading.Thread):
class TestRun:
def __init__(self, binary: Path, test_file: Path, random_seed: int, uid: uuid.UUID,
restarting: bool = False, test_determinism: bool = False, buggify_enabled: bool = False,
stats: str | None = None, expected_unseed: int | None = None, will_restart: bool = False):
def __init__(
self,
binary: Path,
test_file: Path,
random_seed: int,
uid: uuid.UUID,
restarting: bool = False,
test_determinism: bool = False,
buggify_enabled: bool = False,
stats: str | None = None,
expected_unseed: int | None = None,
will_restart: bool = False,
):
self.binary = binary
self.test_file = test_file
self.random_seed = random_seed
@ -313,23 +336,31 @@ class TestRun:
self.temp_path = config.run_dir / str(self.uid)
# state for the run
self.retryable_error: bool = False
self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed,
will_restart=will_restart, long_running=config.long_running)
self.summary: Summary = Summary(
binary,
uid=self.uid,
stats=self.stats,
expected_unseed=self.expected_unseed,
will_restart=will_restart,
long_running=config.long_running,
)
self.run_time: int = 0
self.success = self.run()
def log_test_plan(self, out: SummaryTree):
test_plan: SummaryTree = SummaryTree('TestPlan')
test_plan.attributes['TestUID'] = str(self.uid)
test_plan.attributes['RandomSeed'] = str(self.random_seed)
test_plan.attributes['TestFile'] = str(self.test_file)
test_plan.attributes['Buggify'] = '1' if self.buggify_enabled else '0'
test_plan.attributes['FaultInjectionEnabled'] = '1' if self.fault_injection_enabled else '0'
test_plan.attributes['DeterminismCheck'] = '1' if self.test_determinism else '0'
test_plan: SummaryTree = SummaryTree("TestPlan")
test_plan.attributes["TestUID"] = str(self.uid)
test_plan.attributes["RandomSeed"] = str(self.random_seed)
test_plan.attributes["TestFile"] = str(self.test_file)
test_plan.attributes["Buggify"] = "1" if self.buggify_enabled else "0"
test_plan.attributes["FaultInjectionEnabled"] = (
"1" if self.fault_injection_enabled else "0"
)
test_plan.attributes["DeterminismCheck"] = "1" if self.test_determinism else "0"
out.append(test_plan)
def delete_simdir(self):
shutil.rmtree(self.temp_path / Path('simfdb'))
shutil.rmtree(self.temp_path / Path("simfdb"))
def run(self):
command: List[str] = []
@ -341,47 +372,68 @@ class TestRun:
# the test take longer. Also old binaries weren't built with
# USE_VALGRIND=ON, and we have seen false positives with valgrind in
# such binaries.
command.append('valgrind')
valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed))
dbg_path = os.getenv('FDB_VALGRIND_DBGPATH')
command.append("valgrind")
valgrind_file = self.temp_path / Path(
"valgrind-{}.xml".format(self.random_seed)
)
dbg_path = os.getenv("FDB_VALGRIND_DBGPATH")
if dbg_path is not None:
command.append('--extra-debuginfo-path={}'.format(dbg_path))
command += ['--xml=yes', '--xml-file={}'.format(valgrind_file.absolute()), '-q']
command += [str(self.binary.absolute()),
'-r', 'test' if is_no_sim(self.test_file) else 'simulation',
'-f', str(self.test_file),
'-s', str(self.random_seed)]
command.append("--extra-debuginfo-path={}".format(dbg_path))
command += [
"--xml=yes",
"--xml-file={}".format(valgrind_file.absolute()),
"-q",
]
command += [
str(self.binary.absolute()),
"-r",
"test" if is_no_sim(self.test_file) else "simulation",
"-f",
str(self.test_file),
"-s",
str(self.random_seed),
]
if self.trace_format is not None:
command += ['--trace_format', self.trace_format]
command += ["--trace_format", self.trace_format]
if self.use_tls_plugin:
command += ['--tls_plugin', str(config.tls_plugin_path)]
command += ["--tls_plugin", str(config.tls_plugin_path)]
env["FDB_TLS_PLUGIN"] = str(config.tls_plugin_path)
if config.disable_kaio:
command += ['--knob-disable-posix-kernel-aio=1']
if Version.of_binary(self.binary) >= '7.1.0':
command += ['-fi', 'on' if self.fault_injection_enabled else 'off']
command += ["--knob-disable-posix-kernel-aio=1"]
if Version.of_binary(self.binary) >= "7.1.0":
command += ["-fi", "on" if self.fault_injection_enabled else "off"]
if self.restarting:
command.append('--restarting')
command.append("--restarting")
if self.buggify_enabled:
command += ['-b', 'on']
command += ["-b", "on"]
if config.crash_on_error:
command.append('--crash')
command.append("--crash")
if config.long_running:
# disable simulation speedup
command += ['--knob-sim-speedup-after-seconds=36000']
command += ["--knob-sim-speedup-after-seconds=36000"]
# disable traceTooManyLines Error MAX_TRACE_LINES
command += ['--knob-max-trace-lines=1000000000']
command += ["--knob-max-trace-lines=1000000000"]
self.temp_path.mkdir(parents=True, exist_ok=True)
# self.log_test_plan(out)
resources = ResourceMonitor()
resources.start()
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
process = subprocess.Popen(
command,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
cwd=self.temp_path,
text=True,
env=env,
)
did_kill = False
# No timeout for long running tests
timeout = 20 * config.kill_seconds if self.use_valgrind else (None if config.long_running else config.kill_seconds)
timeout = (
20 * config.kill_seconds
if self.use_valgrind
else (None if config.long_running else config.kill_seconds)
)
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)
@ -398,7 +450,7 @@ class TestRun:
self.summary.was_killed = did_kill
self.summary.valgrind_out_file = valgrind_file
self.summary.error_out = err_out
self.summary.summarize(self.temp_path, ' '.join(command))
self.summary.summarize(self.temp_path, " ".join(command))
return self.summary.ok()
@ -407,18 +459,18 @@ def decorate_summary(out: SummaryTree, test_file: Path, seed: int, buggify: bool
tests are then hard to reproduce (they can be reproduced through TestHarness but
require the user to run in the joshua docker container). To account for this we
will write the necessary information into the attributes if it is missing."""
if 'TestFile' not in out.attributes:
out.attributes['TestFile'] = str(test_file)
if 'RandomSeed' not in out.attributes:
out.attributes['RandomSeed'] = str(seed)
if 'BuggifyEnabled' not in out.attributes:
out.attributes['BuggifyEnabled'] = '1' if buggify else '0'
if "TestFile" not in out.attributes:
out.attributes["TestFile"] = str(test_file)
if "RandomSeed" not in out.attributes:
out.attributes["RandomSeed"] = str(seed)
if "BuggifyEnabled" not in out.attributes:
out.attributes["BuggifyEnabled"] = "1" if buggify else "0"
class TestRunner:
def __init__(self):
self.uid = uuid.uuid4()
self.test_path: Path = Path('tests')
self.test_path: Path = Path("tests")
self.cluster_file: str | None = None
self.fdb_app_dir: str | None = None
self.binary_chooser = OldBinaries()
@ -426,32 +478,43 @@ class TestRunner:
def backup_sim_dir(self, seed: int):
temp_dir = config.run_dir / str(self.uid)
src_dir = temp_dir / 'simfdb'
src_dir = temp_dir / "simfdb"
assert src_dir.is_dir()
dest_dir = temp_dir / 'simfdb.{}'.format(seed)
dest_dir = temp_dir / "simfdb.{}".format(seed)
assert not dest_dir.exists()
shutil.copytree(src_dir, dest_dir)
def restore_sim_dir(self, seed: int):
temp_dir = config.run_dir / str(self.uid)
src_dir = temp_dir / 'simfdb.{}'.format(seed)
src_dir = temp_dir / "simfdb.{}".format(seed)
assert src_dir.exists()
dest_dir = temp_dir / 'simfdb'
dest_dir = temp_dir / "simfdb"
shutil.rmtree(dest_dir)
shutil.move(src_dir, dest_dir)
def run_tests(self, test_files: List[Path], seed: int, test_picker: TestPicker) -> bool:
def run_tests(
self, test_files: List[Path], seed: int, test_picker: TestPicker
) -> bool:
result: bool = True
for count, file in enumerate(test_files):
will_restart = count + 1 < len(test_files)
binary = self.binary_chooser.choose_binary(file)
unseed_check = not is_no_sim(file) and config.random.random() < config.unseed_check_ratio
unseed_check = (
not is_no_sim(file)
and config.random.random() < config.unseed_check_ratio
)
buggify_enabled: bool = config.random.random() < config.buggify_on_ratio
if unseed_check and count != 0:
# for restarting tests we will need to restore the sim2 after the first run
self.backup_sim_dir(seed + count - 1)
run = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0,
stats=test_picker.dump_stats(), will_restart=will_restart, buggify_enabled=buggify_enabled)
# FIXME: support unseed checks for restarting tests
run = TestRun(
binary,
file.absolute(),
seed + count,
self.uid,
restarting=count != 0,
stats=test_picker.dump_stats(),
will_restart=will_restart,
buggify_enabled=buggify_enabled,
)
result = result and run.success
test_picker.add_time(test_files[0], run.run_time, run.summary.out)
decorate_summary(run.summary.out, file, seed + count, run.buggify_enabled)
@ -460,14 +523,22 @@ class TestRunner:
run.summary.out.dump(sys.stdout)
if not result:
return False
if unseed_check and run.summary.unseed is not None:
if count != 0:
self.restore_sim_dir(seed + count - 1)
run2 = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0,
stats=test_picker.dump_stats(), expected_unseed=run.summary.unseed,
will_restart=will_restart, buggify_enabled=buggify_enabled)
if count == 0 and unseed_check and run.summary.unseed is not None:
run2 = TestRun(
binary,
file.absolute(),
seed + count,
self.uid,
restarting=count != 0,
stats=test_picker.dump_stats(),
expected_unseed=run.summary.unseed,
will_restart=will_restart,
buggify_enabled=buggify_enabled,
)
test_picker.add_time(file, run2.run_time, run.summary.out)
decorate_summary(run2.summary.out, file, seed + count, run.buggify_enabled)
decorate_summary(
run2.summary.out, file, seed + count, run.buggify_enabled
)
run2.summary.out.dump(sys.stdout)
result = result and run2.success
if not result:
@ -475,7 +546,11 @@ class TestRunner:
return result
def run(self) -> bool:
seed = config.random_seed if config.random_seed is not None else config.random.randint(0, 2 ** 32 - 1)
seed = (
config.random_seed
if config.random_seed is not None
else config.random.randint(0, 2**32 - 1)
)
test_files = self.test_picker.choose_test()
success = self.run_tests(test_files, seed, self.test_picker)
if config.clean_up:

View File

@ -5,6 +5,7 @@ import asyncio
import logging
import os
import os.path
import random
import sys
import lib.fdb_process
@ -45,17 +46,23 @@ def _setup_logs(log_level: int = logging.INFO):
logger.handlers.clear()
stdout_handler = logging.StreamHandler(stream=sys.stderr)
stdout_handler = logging.StreamHandler(stream=sys.stdout)
stdout_handler.setLevel(log_level)
stdout_handler.setFormatter(log_format)
logger.addHandler(stdout_handler)
logger.setLevel(log_level)
# Here we might lose some of the logging from lib
# Here we might lose some of the logging from lib as the logger is set after
# importing the modules
lib_logger = logging.getLogger("lib")
lib_logger.addHandler(stdout_handler)
lib_logger.setLevel(log_level)
local_cluster_logger = logging.getLogger("local_cluster")
local_cluster_logger.addHandler(stdout_handler)
local_cluster_logger.setLevel(log_level)
def _setup_args() -> argparse.Namespace:
"""Parse the command line arguments"""
@ -109,6 +116,12 @@ def _setup_args() -> argparse.Namespace:
default=DEFAULT_TIMEOUT_PER_TEST,
help="Timeout for each single test",
)
parser.add_argument(
"--random",
action="store_true",
default=False,
help="Randomly pick up a test",
)
return parser.parse_args()
@ -137,11 +150,16 @@ class TestSet:
self._concurrency = concurrency
self._timeout = timeout
self._logging_level = logging_level
self._cluster_file = None
self._env = dict(os.environ)
self._update_path_from_env("LD_LIBRARY_PATH", ld_library_path)
self._update_path_from_env("PYTHONPATH", DEFAULT_PYTHON_BINDER)
def set_cluster_file(self, cluster_file: str):
"""Sets the cluster file for the test"""
self._cluster_file = cluster_file
def _update_path_from_env(self, environment_variable_name: str, new_path: str):
original_path = os.getenv(environment_variable_name)
self._env[environment_variable_name] = (
@ -159,6 +177,8 @@ class TestSet:
):
arguments = [
api_language,
"--cluster-file",
self._cluster_file,
"--test-name",
test_name,
"--logging-level",
@ -190,6 +210,7 @@ class TestSet:
test_name: str,
additional_args: List[str],
):
assert self._cluster_file is not None, "Must set cluster file before the test"
logger.debug(f"Run test API [{api_language}] Test name [{test_name}]")
try:
await self._test_coroutine(
@ -272,9 +293,7 @@ def _log_cluster_lines_with_severity(
else:
reporter = logger.debug
if len(lines) == 0:
reporter(f"{log_file}: No Severity={severity} lines")
else:
if len(lines) > 0:
reporter(
"{}: {} lines with Severity={}\n{}".format(
log_file, len(lines), severity, "".join(lines)
@ -282,9 +301,7 @@ def _log_cluster_lines_with_severity(
)
async def run_binding_tests(
test_set: TestSet, num_cycles: int, stop_at_failure: int = None
):
def _generate_test_list(test_set: TestSet, api_languages: List[str]):
tests = [
test_set.run_scripted_test,
test_set.run_api_test,
@ -292,39 +309,78 @@ async def run_binding_tests(
test_set.run_directory_test,
test_set.run_directory_hca_test,
]
return [
# bind each (test, api_language) pair at definition time; a bare closure here
# would late-bind and leave every entry pointing at the last pair
lambda test=test, api_language=api_language: test(api_language)
for test in tests
for api_language in API_LANGUAGES
]
async def run_binding_tests(
test_set: TestSet,
num_cycles: int,
stop_at_failure: int = None,
random_pick_single: bool = False,
) -> int:
"""Run the binding tests
:param TestSet test_set: The set of binding tests to run
:param int num_cycles: Number of test cycles to run
:param int stop_at_failure: Stop at i-th failure, defaults to None
:param bool random_pick_single: Randomly pick a single test, defaults to False
:return int: Number of failures
"""
tests = _generate_test_list(test_set=test_set, api_languages=API_LANGUAGES)
num_failures: int = 0
async def run_tests():
nonlocal num_failures
for api_language in API_LANGUAGES:
for test in tests:
test_success = await test(api_language)
if not test_success:
num_failures += 1
if stop_at_failure and num_failures > stop_at_failure:
raise RuntimeError(
f"Maximum number of test failures have reached"
)
for test in tests:
test_success = await test()
if not test_success:
num_failures += 1
if stop_at_failure and num_failures > stop_at_failure:
return
async def run_test_random():
nonlocal num_failures
test = random.choice(tests)
test_success = await test()
if not test_success:
num_failures += 1
async def run_test_cycles() -> int:
for cycle in range(num_cycles):
logger.info(f"Starting cycle {cycle}")
if random_pick_single:
await run_test_random()
else:
await run_tests()
if stop_at_failure and num_failures > stop_at_failure:
logger.error(
f"Reached maximum failures of {num_failures}, prematurely terminating"
)
return num_failures
return num_failures
async with lib.local_cluster.FDBServerLocalCluster(1) as local_cluster:
logger.info("Start binding test")
test_set.set_cluster_file(local_cluster)
try:
for cycle in range(num_cycles):
logger.info(f"Starting cycle {cycle}")
await run_tests()
await run_test_cycles()
except:
logger.exception("Error found during the binding test")
raise
finally:
logger.info(f"Binding test completed with {num_failures} failures")
_log_cluster_lines_with_severity(local_cluster, 40)
_log_cluster_lines_with_severity(local_cluster, 30)
return num_failures
def main():
def main() -> int:
args = _setup_args()
_setup_logs(args.debug)
_setup_logs(logging.DEBUG if args.debug else logging.INFO)
_check_file(args.fdbserver_path, True)
_check_file(args.fdbcli_path, True)
@ -333,12 +389,12 @@ def main():
lib.fdb_process.set_fdbserver_path(args.fdbserver_path)
lib.fdb_process.set_fdbcli_path(args.fdbcli_path)
logger.info(f"Executable: {__file__}")
logger.info(f"PID: {os.getpid()}")
logger.info(f"fdbserver: {args.fdbserver_path}")
logger.info(f"fdbcli: {args.fdbcli_path}")
logger.info(f"libfdb: {args.libfdb_path}")
logger.info(f"NumCycles: {args.num_cycles}")
logger.debug(f"Executable: {__file__}")
logger.debug(f"PID: {os.getpid()}")
logger.debug(f"fdbserver: {args.fdbserver_path}")
logger.debug(f"fdbcli: {args.fdbcli_path}")
logger.debug(f"libfdb: {args.libfdb_path}")
logger.debug(f"NumCycles: {args.num_cycles}")
test_set = TestSet(
binding_tester=args.binding_tester_path,
@ -349,9 +405,13 @@ def main():
timeout=args.test_timeout,
)
asyncio.run(run_binding_tests(test_set, args.num_cycles, args.stop_at_failure))
logger.info(f"Binding test start")
num_failures = asyncio.run(
run_binding_tests(test_set, args.num_cycles, args.stop_at_failure, args.random)
)
logger.info(f"Binding test finished with {num_failures} failures")
return 0
return 0 if num_failures == 0 else 1
if __name__ == "__main__":

View File

@ -67,6 +67,8 @@ class _ExecutablePath:
path = overridden_path
if path is None:
path = shutil.which(self._executable)
else:
path = os.path.abspath(path)
if path is None or not os.path.exists(path):
raise FileNotFoundError(

View File

@ -63,7 +63,8 @@ async def run_fdbservers(num_processes, work_dir, cluster_file, port):
async with lib.local_cluster.FDBServerLocalCluster(
num_processes, work_dir, cluster_file, port
):
await asyncio.sleep(20)
while True:
await asyncio.sleep(1)
def main():

View File

@ -56,7 +56,6 @@ function(add_documentation_target)
${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${target}_done
DEPENDS ${SRCS}
WORKING_DIRECTORY ${venv_dir})
message(STATUS "add_custom_target(${target} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${target}_done)")
add_custom_target(${target} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${target}_done)
add_dependencies(${target} buildsphinx)
endfunction()
@ -85,7 +84,6 @@ else()
string(MD5 username_hash ${username})
# cmake math function can only use 64 bit signed integers - so we just truncate the string
string(SUBSTRING "${username_hash}" 0 15 username_hash_small)
message(STATUS math(EXPR port "(0x${username_hash_small} % 8000) + 8000" OUTPUT_FORMAT DECIMAL))
math(EXPR port "(0x${username_hash_small} % 8000) + 8000" OUTPUT_FORMAT DECIMAL)
message(STATUS "Port is ${port}")
endif()

View File

@ -131,12 +131,35 @@ The default is ``disabled``, which means changing the storage engine will not be
``aggressive`` tries to replace as many storages as it can at once, and will recruit a new storage server on the same process as the old one. This will be faster, but can potentially hit degraded performance or OOM with two storages on the same process. The main benefit over ``gradual`` is that this doesn't need to take one storage out of rotation, so it works for small or development clusters that have the same number of storage processes as the replication factor. Note that ``aggressive`` is not exclusive to running the perpetual wiggle.
``disabled`` means that if the storage engine is changed, fdb will not move the cluster over to the new storage engine. This will disable the perpetual wiggle from rewriting storage files.
consistencyscan
----------------
This command controls a native data consistency scan role that is automatically recruited in the FDB cluster. The consistency scan reads all replicas of each shard to verify data consistency. It is useful for finding corrupt cold data by ensuring that all data is read periodically. Any errors found will be logged as TraceEvents with Severity = 40.
The syntax is
``consistencyscan [ off | on [maxRate <RATE>] [targetInterval <INTERVAL>] [restart <RESTART>] ]``
* ``off`` will disable the consistency scan
* ``on`` will enable the scan and can be accompanied by additional options shown above
* ``RATE`` - sets the maximum read speed of the scan in bytes/s.
* ``INTERVAL`` - sets the target completion time, in seconds, for each full pass over all data in the cluster. Scan speed will target this interval with a hard limit of RATE.
* ``RESTART`` - 1 or 0, controlling whether the scan restarts from the beginning of the user keyspace on startup. This should normally be set to 0, which resumes progress from the last time the scan was running.
The consistency scan role publishes its configuration and metrics in Status JSON under the path ``.cluster.consistency_scan_info``.
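For example (values illustrative), ``consistencyscan on maxRate 50000000 targetInterval 604800 restart 0`` enables the scan, caps reads at 50 MB/s, and targets one complete pass per week while resuming prior progress; ``consistencyscan off`` disables it again.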
consistencycheck
----------------
The ``consistencycheck`` command enables or disables consistency checking. Its syntax is ``consistencycheck [on|off]``. Calling it with ``on`` enables consistency checking, and ``off`` disables it. Calling it with no arguments displays whether consistency checking is currently enabled.
Note: This command exists for backward compatibility; it is suggested to use the ``consistencyscan`` command to control FDB's internal consistency scan role instead.
You must be running an ``fdbserver`` process with the ``consistencycheck`` role to perform consistency checking.
This command controls a database key that governs the behavior of any externally configured consistency check roles. You must be running an ``fdbserver`` process with the ``consistencycheck`` role to perform consistency checking.
The ``consistencycheck`` command enables or disables consistency checking. Its syntax is ``consistencycheck [on|off]``. Calling it with ``on`` enables consistency checking, and ``off`` disables it. Calling it with no arguments displays whether consistency checking is currently enabled.
coordinators
------------

View File

@ -909,6 +909,10 @@
"expired_age" : 0, // The age in seconds of expired_version.
"oldest_id_version" : 0, // The version of the oldest idempotency id still stored in the database.
"oldest_id_age" : 0 // The age in seconds of the oldest_id_version.
},
"version_epoch":{
"enabled": true,
"epoch": 0 // The version epoch, as an offset from the Unix epoch. This field will be excluded if enabled is false.
}
},
"client":{

View File

@ -608,7 +608,7 @@ int main(int argc, char** argv) {
setupNetwork(0, UseMetrics::True);
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 10 << 20, param.log_dir, "convert", param.trace_log_group);
openTraceFile({}, 10 << 20, 10 << 20, param.log_dir, "convert", param.trace_log_group);
auto f = stopAfter(convert(param));

View File

@ -641,7 +641,7 @@ int main(int argc, char** argv) {
param.updateKnobs();
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
openTraceFile({}, 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
param.tlsConfig.setupBlobCredentials();
auto f = stopAfter(decode_logs(param));

View File

@ -2383,7 +2383,7 @@ ACTOR Future<Void> runRestore(Database db,
fmt::print("Restored to version {}\n", restoredVersion);
}
} else {
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(targetVersion, db, ranges));
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(targetVersion, ranges));
if (!rset.present()) {
fmt::print(stderr,
@ -2493,7 +2493,7 @@ ACTOR Future<Void> runFastRestoreTool(Database db,
restoreVersion = dbVersion;
}
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(restoreVersion, db));
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(restoreVersion));
if (!rset.present()) {
fmt::print(stderr, "Insufficient data to restore to version {}\n", restoreVersion);
throw restore_invalid_version();
@ -2768,7 +2768,7 @@ ACTOR Future<Void> queryBackup(const char* name,
format("the specified restorable version %lld is not valid", restoreVersion));
return Void();
}
Optional<RestorableFileSet> fileSet = wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter));
Optional<RestorableFileSet> fileSet = wait(bc->getRestoreSet(restoreVersion, keyRangesFilter));
if (fileSet.present()) {
int64_t totalRangeFilesSize = 0, totalLogFilesSize = 0;
result["restore_version"] = fileSet.get().targetVersion;
@ -3973,7 +3973,7 @@ int main(int argc, char* argv[]) {
// a cluster so they should use this instead.
auto initTraceFile = [&]() {
if (trace)
openTraceFile(NetworkAddress(), traceRollSize, traceMaxLogsSize, traceDir, "trace", traceLogGroup);
openTraceFile({}, traceRollSize, traceMaxLogsSize, traceDir, "trace", traceLogGroup);
};
auto initCluster = [&](bool quiet = false) {

View File

@ -47,11 +47,8 @@ ACTOR Future<UID> auditStorageCommandActor(Reference<IClusterConnectionRecord> c
return UID();
}
Key begin, end;
if (tokens.size() == 2) {
begin = allKeys.begin;
end = allKeys.end;
} else if (tokens.size() == 3) {
Key begin = allKeys.begin, end = allKeys.end;
if (tokens.size() == 3) {
begin = tokens[2];
} else if (tokens.size() == 4) {
begin = tokens[2];
@ -66,7 +63,11 @@ ACTOR Future<UID> auditStorageCommandActor(Reference<IClusterConnectionRecord> c
}
CommandFactory auditStorageFactory("audit_storage",
CommandHelp("audit_storage <ha> [BeginKey] [EndKey]",
CommandHelp("audit_storage <Type> [BeginKey EndKey]",
"Start an audit storage",
"Trigger an audit storage, the auditID is returned.\n"));
"Specify audit `Type' (only `ha' `Type' is supported currently), and\n"
"optionally a sub-range with `BeginKey' and `EndKey'.\n"
"For example, to audit the full key range: `audit_storage ha'\n"
"To audit a sub-range only: `audit_storage ha 0xa 0xb'\n"
"Returns an audit `ID'. See also `get_audit_status' command.\n"));
} // namespace fdb_cli

View File

@ -24,6 +24,7 @@
#include "fdbclient/IClientApi.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/BlobGranuleRequest.actor.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"
@ -88,6 +89,22 @@ ACTOR Future<Void> doBlobCheck(Database db, Key startKey, Key endKey, Optional<V
return Void();
}
ACTOR Future<Void> doBlobFlush(Database db, Key startKey, Key endKey, Optional<Version> version, bool compact) {
// TODO make DB function?
state Version flushVersion;
if (version.present()) {
flushVersion = version.get();
} else {
wait(store(flushVersion, getLatestReadVersion(db)));
}
KeyRange range(KeyRangeRef(startKey, endKey));
FlushGranuleRequest req(-1, range, flushVersion, compact);
wait(success(doBlobGranuleRequests(db, range, req, &BlobWorkerInterface::flushGranuleRequest)));
return Void();
}
} // namespace
namespace fdb_cli {
@ -147,7 +164,8 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
tokens[3].printable());
}
return success;
} else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) {
} else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check") ||
tokencmp(tokens[1], "flush") || tokencmp(tokens[1], "compact")) {
bool purge = tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge");
bool forcePurge = tokencmp(tokens[1], "forcepurge");
@ -175,7 +193,15 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
if (purge) {
wait(doBlobPurge(localDb, begin, end, version, forcePurge));
} else {
wait(doBlobCheck(localDb, begin, end, version));
if (tokencmp(tokens[1], "check")) {
wait(doBlobCheck(localDb, begin, end, version));
} else if (tokencmp(tokens[1], "flush")) {
wait(doBlobFlush(localDb, begin, end, version, false));
} else if (tokencmp(tokens[1], "compact")) {
wait(doBlobFlush(localDb, begin, end, version, true));
} else {
ASSERT(false);
}
}
} else {
printUsage(tokens[0]);
@ -187,5 +213,5 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
CommandFactory blobRangeFactory(
"blobrange",
CommandHelp("blobrange <start|stop|check|purge|forcepurge> <startkey> <endkey> [version]", "", ""));
CommandHelp("blobrange <start|stop|check|purge|forcepurge|flush|compact> <startkey> <endkey> [version]", "", ""));
} // namespace fdb_cli

View File

@ -58,4 +58,10 @@ if (NOT WIN32 AND NOT OPEN_FOR_IDE)
5
--external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so
)
add_multi_fdbclient_test(
NAME metacluster_fdbcli_tests
COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/metacluster_fdbcli_tests.py
${CMAKE_BINARY_DIR}
)
endif()

View File

@ -21,7 +21,9 @@
#include "fdbcli/FlowLineNoise.h"
#include "flow/IThreadPool.h"
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include "boost/asio.hpp"

View File

@ -31,7 +31,7 @@
namespace fdb_cli {
ACTOR Future<bool> getAuditStatusCommandActor(Database cx, std::vector<StringRef> tokens) {
if (tokens.size() != 4) {
if (tokens.size() < 3 || tokens.size() > 4) {
printUsage(tokens[0]);
return false;
}
@ -45,11 +45,18 @@ ACTOR Future<bool> getAuditStatusCommandActor(Database cx, std::vector<StringRef
}
if (tokencmp(tokens[2], "id")) {
if (tokens.size() != 4) {
printUsage(tokens[0]);
return false;
}
const UID id = UID::fromString(tokens[3].toString());
AuditStorageState res = wait(getAuditState(cx, type, id));
printf("Audit result is:\n%s", res.toString().c_str());
} else if (tokencmp(tokens[2], "recent")) {
const int count = std::stoi(tokens[3].toString());
int count = CLIENT_KNOBS->TOO_MANY;
if (tokens.size() == 4) {
count = std::stoi(tokens[3].toString());
}
std::vector<AuditStorageState> res = wait(getLatestAuditStates(cx, type, count));
for (const auto& it : res) {
printf("Audit result is:\n%s\n", it.toString().c_str());
@ -60,8 +67,15 @@ ACTOR Future<bool> getAuditStatusCommandActor(Database cx, std::vector<StringRef
CommandFactory getAuditStatusFactory(
"get_audit_status",
CommandHelp("get_audit_status <ha> <id|recent> [ARGs]",
"Retrieve audit storage results of the specific type",
"Fetch audit result with an ID: get_audit_status [Type] id [ID];\n"
"Fetch most recent audit results: get_audit_status [Type] recent [Count].\n"));
CommandHelp("get_audit_status <Type> <id|recent> [ARGs]",
"Retrieve audit storage status",
"To fetch audit status via ID: `get_audit_status [Type] id [ID]'\n"
"To fetch status of most recent audit: `get_audit_status [Type] recent [Count]'\n"
"Only 'ha' `Type' is supported currently. If specified, `Count' is how many\n"
"rows to audit. If not specified, check all rows in audit.\n"
"Results have the following format:\n"
" `[ID]: 000000000001000000000000, [Range]: - 0xff, [Type]: 1, [Phase]: 2'\n"
"where `Type' is `1' for `ha' and `Phase' is `2' for `Complete'.\n"
"Phase can be `Invalid=0', `Running=1', `Complete=2', `Error=3', or `Failed=4'.\n"
"See also `audit_storage' command."));
} // namespace fdb_cli

View File

@ -252,8 +252,8 @@ ACTOR Future<bool> metaclusterGetCommand(Reference<IDatabase> db, std::vector<St
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "success";
obj["cluster"] = metadata.toJson();
obj[msgTypeKey] = "success";
obj[msgClusterKey] = metadata.toJson();
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print(" connection string: {}\n", metadata.connectionString.toString().c_str());
@ -264,8 +264,8 @@ ACTOR Future<bool> metaclusterGetCommand(Reference<IDatabase> db, std::vector<St
} catch (Error& e) {
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "error";
obj["error"] = e.what();
obj[msgTypeKey] = "error";
obj[msgErrorKey] = e.what();
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
return false;
} else {
@ -287,39 +287,85 @@ ACTOR Future<bool> metaclusterStatusCommand(Reference<IDatabase> db, std::vector
state bool useJson = tokens.size() == 3;
try {
std::map<ClusterName, DataClusterMetadata> clusters =
wait(MetaclusterAPI::listClusters(db, ""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS));
state Optional<std::string> metaclusterName;
auto capacityNumbers = MetaclusterAPI::metaclusterCapacity(clusters);
state Reference<ITransaction> tr = db->createTransaction();
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "success";
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<MetaclusterRegistrationEntry> registrationEntry =
wait(MetaclusterMetadata::metaclusterRegistration().get(tr));
const ClusterType clusterType =
!registrationEntry.present() ? ClusterType::STANDALONE : registrationEntry.get().clusterType;
if (ClusterType::STANDALONE == clusterType) {
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "success";
obj[msgClusterTypeKey] = clusterTypeToString(clusterType);
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print("This cluster is not part of a metacluster\n");
}
return true;
} else if (ClusterType::METACLUSTER_DATA == clusterType) {
ASSERT(registrationEntry.present());
metaclusterName = registrationEntry.get().metaclusterName.toString();
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "success";
obj[msgClusterTypeKey] = clusterTypeToString(clusterType);
json_spirit::mObject metaclusterObj;
metaclusterObj[msgMetaclusterName] = metaclusterName.get();
obj[msgMetaclusterKey] = metaclusterObj;
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print("This cluster \"{}\" is a data cluster within the metacluster named \"{}\"\n",
registrationEntry.get().name.toString().c_str(),
metaclusterName.get().c_str());
}
return true;
}
json_spirit::mObject metaclusterObj;
metaclusterObj["data_clusters"] = (int)clusters.size();
metaclusterObj["capacity"] = capacityNumbers.first.toJson();
metaclusterObj["allocated"] = capacityNumbers.second.toJson();
metaclusterName = registrationEntry.get().metaclusterName.toString();
obj["metacluster"] = metaclusterObj;
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print(" number of data clusters: {}\n", clusters.size());
fmt::print(" tenant group capacity: {}\n", capacityNumbers.first.numTenantGroups);
fmt::print(" allocated tenant groups: {}\n", capacityNumbers.second.numTenantGroups);
}
ASSERT(ClusterType::METACLUSTER_MANAGEMENT == clusterType);
std::map<ClusterName, DataClusterMetadata> clusters =
wait(MetaclusterAPI::listClustersTransaction(tr, ""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS));
auto capacityNumbers = MetaclusterAPI::metaclusterCapacity(clusters);
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "success";
obj[msgClusterTypeKey] = clusterTypeToString(ClusterType::METACLUSTER_MANAGEMENT);
return true;
} catch (Error& e) {
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "error";
obj["error"] = e.what();
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
return false;
} else {
throw;
json_spirit::mObject metaclusterObj;
metaclusterObj[msgMetaclusterName] = metaclusterName.get();
metaclusterObj[msgDataClustersKey] = static_cast<int>(clusters.size());
metaclusterObj[msgCapacityKey] = capacityNumbers.first.toJson();
metaclusterObj[msgAllocatedKey] = capacityNumbers.second.toJson();
obj[msgMetaclusterKey] = metaclusterObj;
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print(" number of data clusters: {}\n", clusters.size());
fmt::print(" tenant group capacity: {}\n", capacityNumbers.first.numTenantGroups);
fmt::print(" allocated tenant groups: {}\n", capacityNumbers.second.numTenantGroups);
}
return true;
} catch (Error& e) {
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "error";
obj[msgErrorKey] = e.what();
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
return false;
} else {
throw;
}
}
}
}

View File

@ -365,10 +365,18 @@ ACTOR Future<bool> tenantListCommand(Reference<IDatabase> db, std::vector<String
state ClusterType clusterType = wait(TenantAPI::getClusterType(tr));
state std::vector<TenantName> tenantNames;
if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) {
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(MetaclusterAPI::listTenants(db, beginTenant, endTenant, limit, offset, filters));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.first);
if (filters.empty()) {
std::vector<std::pair<TenantName, int64_t>> tenants =
wait(MetaclusterAPI::listTenants(db, beginTenant, endTenant, limit, offset));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.first);
}
} else {
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(MetaclusterAPI::listTenantMetadata(db, beginTenant, endTenant, limit, offset, filters));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.first);
}
}
} else {
// Hold the reference to the standalone's memory

View File

@ -39,6 +39,16 @@
namespace fdb_cli {
constexpr char msgTypeKey[] = "type";
constexpr char msgClusterKey[] = "cluster";
constexpr char msgClusterTypeKey[] = "cluster_type";
constexpr char msgMetaclusterName[] = "metacluster_name";
constexpr char msgMetaclusterKey[] = "metacluster";
constexpr char msgDataClustersKey[] = "data_clusters";
constexpr char msgCapacityKey[] = "capacity";
constexpr char msgAllocatedKey[] = "allocated";
constexpr char msgErrorKey[] = "error";
struct CommandHelp {
std::string usage;
std::string short_desc;

View File

@ -1077,7 +1077,7 @@ if __name__ == '__main__':
description="""
The test calls fdbcli commands through fdbcli --exec "<command>" interactively using subprocess.
The outputs from fdbcli are returned and compared to predefined results.
Consequently, changing fdbcli outputs or breaking any commands will casue the test to fail.
Consequently, changing fdbcli outputs or breaking any commands will cause the test to fail.
Commands that are easy to test will run against a single process cluster.
For complex commands like exclude, they will run against a cluster with multiple (currently set to 5) processes.
If external_client_library is given, we will disable the local client and use the external client to run fdbcli.

View File

@ -0,0 +1,95 @@
#!/usr/bin/env python3
import argparse
import os
import subprocess
from argparse import RawDescriptionHelpFormatter
def run_command(*args):
commands = ["{}".format(args)]
print(commands)
try:
process = subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env, timeout=20)
return process.stdout.decode('utf-8').strip()
except subprocess.TimeoutExpired:
raise Exception('the command is stuck')
def run_fdbcli_command(cluster_file, *args):
command_template = [fdbcli_bin, '-C', "{}".format(cluster_file), '--exec']
commands = command_template + ["{}".format(' '.join(args))]
print(commands)
try:
# if the fdbcli command is stuck for more than 20 seconds, the database is definitely unavailable
process = subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env, timeout=20)
return process.stdout.decode('utf-8').strip()
except subprocess.TimeoutExpired:
raise Exception('The fdbcli command is stuck, database is unavailable')
def get_cluster_connection_str(cluster_file_path):
with open(cluster_file_path, 'r') as f:
conn_str = f.readline().strip()
return conn_str
def metacluster_create(cluster_file, name):
return run_fdbcli_command(cluster_file, "metacluster create_experimental", name)
def metacluster_register(management_cluster_file, data_cluster_file, name):
conn_str = get_cluster_connection_str(data_cluster_file)
return run_fdbcli_command(management_cluster_file, "metacluster register", name, "connection_string={}".format(
conn_str))
def metacluster_status(cluster_file):
return run_fdbcli_command(cluster_file, "metacluster status")
if __name__ == "__main__":
print("metacluster_fdbcli_tests")
script_desc = """
This script executes a series of commands on multiple clusters within an FDB metacluster.
"""
parser = argparse.ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
description=script_desc)
parser.add_argument('build_dir', metavar='BUILD_DIRECTORY', help='FDB build directory')
args = parser.parse_args()
# keep current environment variables
fdbcli_env = os.environ.copy()
cluster_files = fdbcli_env.get("FDB_CLUSTERS").split(';')
num_clusters = len(cluster_files)
assert len(cluster_files) > 1
fdbcli_bin = args.build_dir + '/bin/fdbcli'
for cf in cluster_files:
output = metacluster_status(cf)
assert output == "This cluster is not part of a metacluster"
names = ['meta_mgmt']
names.extend(['data{}'.format(i) for i in range(1, num_clusters)])
metacluster_create(cluster_files[0], names[0])
for (cf, name) in zip(cluster_files[1:], names[1:]):
output = metacluster_register(cluster_files[0], cf, name)
expected = """
number of data clusters: {}
tenant group capacity: 0
allocated tenant groups: 0
"""
expected = expected.format(num_clusters - 1).strip()
output = metacluster_status(cluster_files[0])
assert expected == output
for (cf, name) in zip(cluster_files[1:], names[1:]):
output = metacluster_status(cf)
expected = "This cluster \"{}\" is a data cluster within the metacluster named \"{" \
"}\"".format(name, names[0])
assert expected == output
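Outside of ctest, the script expects the semicolon-separated cluster files in the FDB_CLUSTERS environment variable and the build directory as its only positional argument; a sketch of an invocation with illustrative paths:
FDB_CLUSTERS="/tmp/mgmt.cluster;/tmp/data1.cluster" ./metacluster_fdbcli_tests.py /path/to/build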

View File

@ -21,7 +21,9 @@
#include "flow/Platform.h"
#include <algorithm>
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include "boost/asio.hpp"

View File

@ -24,6 +24,7 @@
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BlobCipher.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GetEncryptCipherKeys.actor.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -251,6 +252,34 @@ Version getLogKeyVersion(Key key) {
return bigEndian64(*(int64_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t)));
}
bool validTenantAccess(std::map<int64_t, TenantName>* tenantMap,
MutationRef m,
bool provisionalProxy,
Version version) {
if (isSystemKey(m.param1)) {
return true;
}
int64_t tenantId = TenantInfo::INVALID_TENANT;
if (m.isEncrypted()) {
tenantId = m.encryptionHeader()->cipherTextDetails.encryptDomainId;
} else {
tenantId = TenantAPI::extractTenantIdFromMutation(m);
}
ASSERT(tenantMap != nullptr);
if (m.isEncrypted() && isReservedEncryptDomain(tenantId)) {
// These are valid encrypt domains so don't check the tenant map
} else if (tenantMap->find(tenantId) == tenantMap->end()) {
// If a tenant is not found for a given mutation then exclude it from the batch
ASSERT(!provisionalProxy);
TraceEvent(SevWarnAlways, "MutationLogRestoreTenantNotFound")
.detail("Version", version)
.detail("TenantId", tenantId);
CODE_PROBE(true, "mutation log restore tenant not found");
return false;
}
return true;
}
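A rough sketch (Python, illustrative only, not the C++ implementation above) of the filtering decision that validTenantAccess performs on each restored mutation:
# illustrative sketch of the tenant-access check during mutation log restore
def valid_tenant_access(tenant_map, tenant_id, is_system_key, is_reserved_encrypt_domain):
    if is_system_key:
        return True
    if is_reserved_encrypt_domain:
        # reserved encrypt domains are always valid, so skip the tenant map
        return True
    # mutations whose tenant is missing from the map are excluded from the batch
    return tenant_id in tenant_map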
// Given a key from one of the ranges returned by get_log_ranges,
// returns(version, part) where version is the database version number of
// the transaction log data in the value, and part is 0 for the first such
@ -319,27 +348,49 @@ ACTOR static Future<Void> decodeBackupLogValue(Arena* arena,
offset += len2;
state Optional<MutationRef> encryptedLogValue = Optional<MutationRef>();
// Check for valid tenant in required tenant mode. If the tenant does not exist in our tenant map then
// we EXCLUDE the mutation (of that respective tenant) during the restore. NOTE: This simply allows a
// restore to make progress in the event of tenant deletion, but tenant deletion should be considered
// carefully so that we do not run into this case. We do this check here so if encrypted mutations are not
// found in the tenant map then we exit early without needing to reach out to the EKP.
if (config.tenantMode == TenantMode::REQUIRED &&
config.encryptionAtRestMode.mode != EncryptionAtRestMode::CLUSTER_AWARE &&
!validTenantAccess(tenantMap, logValue, provisionalProxy, version)) {
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
}
// Decrypt mutation ref if encrypted
if (logValue.isEncrypted()) {
encryptedLogValue = logValue;
state EncryptCipherDomainId domainId = logValue.encryptionHeader()->cipherTextDetails.encryptDomainId;
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::BACKUP));
logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP);
try {
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::RESTORE));
logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP);
} catch (Error& e) {
// It's possible a tenant was deleted and the encrypt key fetch failed
TraceEvent(SevWarnAlways, "MutationLogRestoreEncryptKeyFetchFailed")
.detail("Version", version)
.detail("TenantId", domainId);
if (e.code() == error_code_encrypt_keys_fetch_failed) {
CODE_PROBE(true, "mutation log restore encrypt keys not found");
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
} else {
throw;
}
}
}
ASSERT(!logValue.isEncrypted());
if (config.tenantMode == TenantMode::REQUIRED && !isSystemKey(logValue.param1)) {
// If a tenant is not found for a given mutation then exclude it from the batch
int64_t tenantId = TenantAPI::extractTenantIdFromMutation(logValue);
ASSERT(tenantMap != nullptr);
if (tenantMap->find(tenantId) == tenantMap->end()) {
ASSERT(!provisionalProxy);
TraceEvent("TenantNotFound").detail("Version", version).detail("TenantId", tenantId);
CODE_PROBE(true, "mutation log restore tenant not found");
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
}
// If the mutation was encrypted using cluster aware encryption then check after decryption
if (config.tenantMode == TenantMode::REQUIRED &&
config.encryptionAtRestMode.mode == EncryptionAtRestMode::CLUSTER_AWARE &&
!validTenantAccess(tenantMap, logValue, provisionalProxy, version)) {
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
}
MutationRef originalLogValue = logValue;

View File

@ -906,7 +906,6 @@ public:
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet(Reference<BackupContainerFileSystem> bc,
Version targetVersion,
VectorRef<KeyRangeRef> keyRangesFilter,
Optional<Database> cx,
bool logsOnly = false,
Version beginVersion = invalidVersion) {
for (const auto& range : keyRangesFilter) {
@ -974,19 +973,6 @@ public:
continue;
restorable.snapshot = snapshots[i];
// TODO: Reenable the sanity check after TooManyFiles error is resolved
if (false && g_network->isSimulated()) {
// Sanity check key ranges
state std::map<std::string, KeyRange>::iterator rit;
for (rit = restorable.keyRanges.begin(); rit != restorable.keyRanges.end(); rit++) {
auto it = std::find_if(restorable.ranges.begin(),
restorable.ranges.end(),
[file = rit->first](const RangeFile f) { return f.fileName == file; });
ASSERT(it != restorable.ranges.end());
KeyRange result = wait(bc->getSnapshotFileKeyRange(*it, cx));
ASSERT(rit->second.begin <= result.begin && rit->second.end >= result.end);
}
}
// No logs needed if there is a complete filtered key space snapshot at the target version.
if (minKeyRangeVersion == maxKeyRangeVersion && maxKeyRangeVersion == restorable.targetVersion) {
@ -1362,7 +1348,7 @@ Future<Void> BackupContainerFileSystem::expireData(Version expireEndVersion,
ACTOR static Future<KeyRange> getSnapshotFileKeyRange_impl(Reference<BackupContainerFileSystem> bc,
RangeFile file,
Optional<Database> cx) {
Database cx) {
state int readFileRetries = 0;
state bool beginKeySet = false;
state Key beginKey;
@ -1448,18 +1434,17 @@ ACTOR static Future<Optional<Version>> readVersionProperty(Reference<BackupConta
}
}
Future<KeyRange> BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file, Optional<Database> cx) {
Future<KeyRange> BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file, Database cx) {
ASSERT(g_network->isSimulated());
return getSnapshotFileKeyRange_impl(Reference<BackupContainerFileSystem>::addRef(this), file, cx);
}
Future<Optional<RestorableFileSet>> BackupContainerFileSystem::getRestoreSet(Version targetVersion,
Optional<Database> cx,
VectorRef<KeyRangeRef> keyRangesFilter,
bool logsOnly,
Version beginVersion) {
return BackupContainerFileSystemImpl::getRestoreSet(
Reference<BackupContainerFileSystem>::addRef(this), targetVersion, keyRangesFilter, cx, logsOnly, beginVersion);
Reference<BackupContainerFileSystem>::addRef(this), targetVersion, keyRangesFilter, logsOnly, beginVersion);
}
Future<Optional<Version>> BackupContainerFileSystem::VersionProperty::get() {
@ -1687,8 +1672,7 @@ ACTOR static Future<Void> testWriteSnapshotFile(Reference<IBackupFile> file, Key
ACTOR Future<Void> testBackupContainer(std::string url,
Optional<std::string> proxy,
Optional<std::string> encryptionKeyFileName,
Optional<Database> cx) {
Optional<std::string> encryptionKeyFileName) {
state FlowLock lock(100e6);
if (encryptionKeyFileName.present()) {
@ -1795,13 +1779,13 @@ ACTOR Future<Void> testBackupContainer(std::string url,
for (; i < listing.snapshots.size(); ++i) {
{
// Ensure we can still restore to the latest version
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get(), cx));
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get()));
ASSERT(rest.present());
}
{
// Ensure we can restore to the end version of snapshot i
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion, cx));
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion));
ASSERT(rest.present());
}
@ -1842,16 +1826,14 @@ ACTOR Future<Void> testBackupContainer(std::string url,
}
TEST_CASE("/backup/containers/localdir/unencrypted") {
wait(testBackupContainer(
format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}, {}));
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}));
return Void();
}
TEST_CASE("/backup/containers/localdir/encrypted") {
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()),
{},
format("%s/test_encryption_key", params.getDataDir().c_str()),
{}));
format("%s/test_encryption_key", params.getDataDir().c_str())));
return Void();
}
@ -1859,7 +1841,7 @@ TEST_CASE("/backup/containers/url") {
if (!g_network->isSimulated()) {
const char* url = getenv("FDB_TEST_BACKUP_URL");
ASSERT(url != nullptr);
wait(testBackupContainer(url, {}, {}, {}));
wait(testBackupContainer(url, {}, {}));
}
return Void();
}

View File

@ -85,6 +85,7 @@ BlobCipherMetrics::BlobCipherMetrics()
CounterSet(cc, "KVRedwood"),
CounterSet(cc, "BlobGranule"),
CounterSet(cc, "Backup"),
CounterSet(cc, "Restore"),
CounterSet(cc, "Test") }) {
specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
@ -102,6 +103,8 @@ std::string toString(BlobCipherMetrics::UsageType type) {
return "BlobGranule";
case BlobCipherMetrics::UsageType::BACKUP:
return "Backup";
case BlobCipherMetrics::UsageType::RESTORE:
return "Restore";
case BlobCipherMetrics::UsageType::TEST:
return "Test";
default:

View File

@ -1479,7 +1479,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[],
const std::vector<StringRef>& deltaFileData,
GranuleMaterializeStats& stats) {
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
@ -1528,6 +1528,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
if (BG_READ_DEBUG) {
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
ASSERT(chunk.deltaFiles.size() == deltaFileData.size());
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
stats.inputBytes += deltaFileData[deltaIdx].size();
bool startClear = false;
@ -1656,8 +1657,8 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
}
// +1 to avoid UBSAN variable length array of size zero
StringRef deltaData[files[chunkIdx].deltaFiles.size() + 1];
std::vector<StringRef> deltaData;
deltaData.resize(files[chunkIdx].deltaFiles.size());
for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) {
deltaData[i] =
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext),
@ -1684,6 +1685,85 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
}
// just for client passthrough. reads all key-value pairs from a snapshot file, and all mutations from a delta file
RangeResult bgReadSnapshotFile(const StringRef& data) {
Standalone<StringRef> fname = "f"_sr;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> results = loadSnapshotFile(fname, data, normalKeys, {});
RangeResult snapshot;
snapshot.reserve(snapshot.arena(), results.size());
snapshot.arena().dependsOn(results.arena());
for (auto& it : results) {
snapshot.emplace_back(snapshot.arena(), it.key, it.value);
}
return snapshot;
}
// FIXME: refactor if possible, just copy-pasted from loadChunkedDeltaFile for prototyping
Standalone<VectorRef<GranuleMutationRef>> bgReadDeltaFile(const StringRef& deltaData) {
Standalone<VectorRef<GranuleMutationRef>> deltas;
Standalone<IndexedBlobGranuleFile> file = IndexedBlobGranuleFile::fromFileBytes(deltaData, {});
ASSERT(file.fileType == DELTA_FILE_TYPE);
ASSERT(file.chunkStartOffset > 0);
// empty delta file
if (file.indexBlockRef.block.children.empty()) {
return deltas;
}
ASSERT(file.indexBlockRef.block.children.size() >= 2);
// find range of blocks needed to read
ChildBlockPointerRef* currentBlock = file.indexBlockRef.block.children.begin();
bool lastBlock = false;
bool prevClearAfter = false;
KeyRef prevClearAfterKey;
Version prevClearAfterVersion;
while (!lastBlock) {
auto nextBlock = currentBlock;
nextBlock++;
lastBlock = (nextBlock == file.indexBlockRef.block.children.end() - 1);
Standalone<GranuleSortedDeltas> deltaBlock =
file.getChild<GranuleSortedDeltas>(currentBlock, {}, file.chunkStartOffset);
ASSERT(!deltaBlock.boundaries.empty());
ASSERT(currentBlock->key == deltaBlock.boundaries.front().key);
for (auto& entry : deltaBlock.boundaries) {
if (prevClearAfter) {
deltas.emplace_back(
deltas.arena(), MutationRef::Type::ClearRange, prevClearAfterVersion, prevClearAfterKey, entry.key);
}
prevClearAfter = entry.clearVersion.present();
if (prevClearAfter) {
prevClearAfterVersion = entry.clearVersion.get();
prevClearAfterKey = entry.key;
}
for (auto& v : entry.values) {
if (v.op == MutationRef::Type::ClearRange) {
if (entry.clearVersion.present() && v.version == entry.clearVersion.get()) {
// we'll handle that in the next loop with prevClearAfter
continue;
}
deltas.emplace_back(deltas.arena(),
MutationRef::Type::ClearRange,
v.version,
entry.key,
keyAfter(entry.key, deltas.arena()));
} else {
ASSERT(v.op == MutationRef::Type::SetValue);
deltas.emplace_back(deltas.arena(), MutationRef::Type::SetValue, v.version, entry.key, v.value);
}
}
}
deltas.arena().dependsOn(deltaBlock.arena());
currentBlock++;
}
return deltas;
}
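// Illustrative only, not part of this change: a minimal sketch of how a client-side caller might
// use the two passthrough parsers above, assuming it already holds the raw, unencrypted bytes of
// one snapshot file and one delta file. The function name and the fmt-based printing are
// hypothetical.
void debugPrintGranuleFiles(StringRef snapshotBytes, StringRef deltaBytes) {
	RangeResult rows = bgReadSnapshotFile(snapshotBytes);
	for (auto& kv : rows) {
		fmt::print("snapshot: {} => {}\n", kv.key.printable(), kv.value.printable());
	}
	Standalone<VectorRef<GranuleMutationRef>> mutations = bgReadDeltaFile(deltaBytes);
	for (auto& m : mutations) {
		// param1/param2 are key and value for sets, and range begin/end for clears
		fmt::print("delta @{}: type={} {} {}\n", m.version, (int)m.type, m.param1.printable(), m.param2.printable());
	}
}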
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix) {
// Start with random bytes to avoid metadata hotspotting
// Worker ID for uniqueness and attribution
@ -2359,7 +2439,9 @@ void checkDeltaRead(const KeyValueGen& kvGen,
Version beginVersion,
Version readVersion,
const Standalone<GranuleDeltas>& data,
StringRef* serialized) {
const std::vector<StringRef>& serialized) {
ASSERT_EQ(serialized.size(), 1);
// expected answer
std::map<KeyRef, ValueRef> expectedData;
Version lastFileEndVersion = 0;
@ -2378,7 +2460,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta");
Standalone<BlobGranuleChunkRef> chunk;
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys);
chunk.arena(), filename, 0, serialized[0].size(), serialized[0].size(), kvGen.cipherKeys);
chunk.keyRange = kvGen.allRange;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = invalidVersion;
@ -2459,13 +2541,14 @@ TEST_CASE("/blobgranule/files/deltaFormatUnitTest") {
}*/
Value serialized = serializeChunkedDeltaFile(
fileNameRef, data, kvGen.allRange, targetChunkSize, kvGen.compressFilter, kvGen.cipherKeys);
std::vector<StringRef> deltaPtr{ serialized };
// check whole file
checkDeltaRead(kvGen, kvGen.allRange, 0, data.back().version, data, &serialized);
checkDeltaRead(kvGen, kvGen.allRange, 0, data.back().version, data, deltaPtr);
for (int i = 0; i < std::min((size_t)100, kvGen.usedKeysList.size() * data.size()); i++) {
auto params = randomizeKeyAndVersions(kvGen, data);
checkDeltaRead(kvGen, std::get<0>(params), std::get<1>(params), std::get<2>(params), data, &serialized);
checkDeltaRead(kvGen, std::get<0>(params), std::get<1>(params), std::get<2>(params), data, deltaPtr);
}
return Void();
@ -2518,10 +2601,6 @@ void checkGranuleRead(const KeyValueGen& kvGen,
}
deltaIdx++;
}
StringRef deltaPtrs[deltaPtrsVector.size() + 1];
for (int i = 0; i < deltaPtrsVector.size(); i++) {
deltaPtrs[i] = deltaPtrsVector[i];
}
// add in memory deltas
chunk.arena().dependsOn(inMemoryDeltas.arena());
@ -2540,7 +2619,7 @@ void checkGranuleRead(const KeyValueGen& kvGen,
snapshotPtr = serializedSnapshot;
}
RangeResult actualData =
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs, stats);
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrsVector, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Size {0} != Actual Size {1}\n", expectedData.size(), actualData.size());
@ -2684,6 +2763,87 @@ TEST_CASE("/blobgranule/files/granuleReadUnitTest") {
return Void();
}
namespace {
MutationsAndVersionRef singleMutation(Version v,
MutationRef::Type type,
Arena& ar,
const StringRef& param1,
const StringRef& param2) {
MutationsAndVersionRef ref(v, v);
ref.mutations.emplace_back(ar, type, param1, param2);
return ref;
}
void checkMutations(const Standalone<VectorRef<GranuleMutationRef>>& expected,
const Standalone<VectorRef<GranuleMutationRef>>& actual) {
ASSERT(expected.size() == actual.size());
for (int i = 0; i < expected.size(); i++) {
ASSERT(expected[i].version == actual[i].version);
ASSERT(expected[i].type == actual[i].type);
ASSERT(expected[i].param1 == actual[i].param1);
ASSERT(expected[i].param2 == actual[i].param2);
}
}
} // namespace
/*
Input mutations:
Set A=5 @ 100
Clear [A - C) @ 200
Set E=6 @ 300
Set A=7 @ 400
Clear [A - E) @ 500
Clear [E - E\x00) @ 600 (single key clear)
Output mutations:
Set A=5 @ 100
Set A=7 @ 400
Clear [A - A\x00) @ 500
Clear [A - C) @ 200
Clear [C - E) @ 500
Set E=6 @ 300
Clear [E - E\x00) @ 600
*/
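// Note on the expected ordering above: the delta file groups mutations by boundary key, and a
// clear that extends past a boundary is carried as that boundary's clearVersion. bgReadDeltaFile
// therefore first re-emits the pending cross-boundary clear (prevClearAfter) up to the current
// key, then that key's own sets and single-key clears, so the reconstructed output is ordered by
// key rather than by version.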
TEST_CASE("/blobgranule/files/bgReadDeltaFile") {
Arena ar;
Standalone<StringRef> strA = "A"_sr;
Standalone<StringRef> strC = "C"_sr;
Standalone<StringRef> strE = "E"_sr;
Standalone<StringRef> str5 = "5"_sr;
Standalone<StringRef> str6 = "6"_sr;
Standalone<StringRef> str7 = "7"_sr;
Standalone<StringRef> strAfterA = keyAfter(strA);
Standalone<StringRef> strAfterE = keyAfter(strE);
Standalone<GranuleDeltas> originalMutations;
originalMutations.push_back(ar, singleMutation(100, MutationRef::Type::SetValue, ar, strA, str5));
originalMutations.push_back(ar, singleMutation(200, MutationRef::Type::ClearRange, ar, strA, strC));
originalMutations.push_back(ar, singleMutation(300, MutationRef::Type::SetValue, ar, strE, str6));
originalMutations.push_back(ar, singleMutation(400, MutationRef::Type::SetValue, ar, strA, str7));
originalMutations.push_back(ar, singleMutation(500, MutationRef::Type::ClearRange, ar, strA, strE));
originalMutations.push_back(ar, singleMutation(600, MutationRef::Type::ClearRange, ar, strE, strAfterE));
Standalone<VectorRef<GranuleMutationRef>> expectedMutations;
expectedMutations.emplace_back(ar, MutationRef::Type::SetValue, 100, strA, str5);
expectedMutations.emplace_back(ar, MutationRef::Type::SetValue, 400, strA, str7);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 500, strA, strAfterA);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 200, strA, strC);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 500, strC, strE);
expectedMutations.emplace_back(ar, MutationRef::Type::SetValue, 300, strE, str6);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 600, strE, strAfterE);
for (int chunkSize = 1; chunkSize <= 32 * 1024; chunkSize *= 2) {
Value serialized =
serializeChunkedDeltaFile(strA, originalMutations, KeyRangeRef(strA, strAfterE), chunkSize, {}, {});
Standalone<VectorRef<GranuleMutationRef>> actualMutations = bgReadDeltaFile(serialized);
checkMutations(expectedMutations, actualMutations);
}
return Void();
}
// performance micro-benchmarks
struct FileSet {
@ -2932,7 +3092,7 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
std::vector<StringRef>& deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
@ -2985,7 +3145,7 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef> chunk;
GranuleMaterializeStats stats;
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
std::vector<StringRef> deltaPtrs(numDeltaFiles);
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {

View File

@ -36,21 +36,12 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BlobConnectionProvider> b
state Arena arena;
std::string fname = f.filename.toString();
state Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(fname);
// printf("Starting read of snapshot file %s\n", fname.c_str());
state Reference<IAsyncFile> reader = wait(bstore->readFile(fname));
// printf("Got snapshot file size %lld\n", size);
state uint8_t* data = new (arena) uint8_t[f.length];
// printf("Reading %lld bytes from snapshot file %s\n", size, filename.c_str());
state int lengthRemaining = f.length;
state int64_t blockOffset = f.offset;
while (lengthRemaining > 0) {
int blockSize = std::min(lengthRemaining, CLIENT_KNOBS->BGR_READ_BLOCK_SIZE);
int readSize = wait(reader->read(data + (blockOffset - f.offset), blockSize, blockOffset));
ASSERT(readSize <= lengthRemaining);
lengthRemaining -= readSize;
blockOffset += readSize;
}
state uint8_t* data = new (arena) uint8_t[f.length];
int readSize = wait(reader->read(data, f.length, f.offset));
ASSERT(f.length == readSize);
StringRef dataRef(data, f.length);
return Standalone<StringRef>(dataRef, arena);
@ -102,13 +93,13 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
}
state int numDeltaFiles = chunk.deltaFiles.size();
state StringRef* deltaData = new (arena) StringRef[numDeltaFiles];
state std::vector<StringRef> deltaData;
state int deltaIdx;
// for (Future<Standalone<StringRef>> deltaFuture : readDeltaFutures) {
deltaData.reserve(numDeltaFiles);
for (deltaIdx = 0; deltaIdx < numDeltaFiles; deltaIdx++) {
Standalone<StringRef> data = wait(readDeltaFutures[deltaIdx]);
deltaData[deltaIdx] = data;
deltaData.push_back(data);
arena.dependsOn(data.arena());
}

View File

@ -224,6 +224,11 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 );
init( BLOBSTORE_MULTIPART_MAX_PART_SIZE, 20000000 );
init( BLOBSTORE_MULTIPART_MIN_PART_SIZE, 5242880 );
init( BLOBSTORE_GLOBAL_CONNECTION_POOL, true );
init( BLOBSTORE_ENABLE_LOGGING, true );
init( BLOBSTORE_STATS_LOGGING_INTERVAL, 10.0 );
init( BLOBSTORE_LATENCY_LOGGING_INTERVAL, 120.0 );
init( BLOBSTORE_LATENCY_LOGGING_ACCURACY, 0.01 );
// These are basically unlimited by default but can be used to reduce blob IO if needed
init( BLOBSTORE_REQUESTS_PER_SECOND, 200 );
@ -235,8 +240,6 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_READ_REQUESTS_PER_SECOND, 100 );
init( BLOBSTORE_DELETE_REQUESTS_PER_SECOND, 200 );
init( BGR_READ_BLOCK_SIZE, 20*1024*1024 ); if( randomize && BUGGIFY ) BGR_READ_BLOCK_SIZE = 64 * 1024 * deterministicRandom()->randomInt(1, 100);
// Dynamic Knobs
init( COMMIT_QUORUM_TIMEOUT, 3.0 );
init( GET_GENERATION_QUORUM_TIMEOUT, 3.0 );
@ -280,7 +283,7 @@ void ClientKnobs::initialize(Randomize randomize) {
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
init( BG_TOO_MANY_GRANULES, 10000 );
init( BG_TOO_MANY_GRANULES, 20000 );
init( BLOB_METADATA_REFRESH_INTERVAL, 3600 ); if ( randomize && BUGGIFY ) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
@ -298,6 +301,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( CLIENT_ENABLE_USING_CLUSTER_ID_KEY, false );
init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false );
init( SIMULATION_EKP_TENANT_IDS_TO_DROP, "-1" );
// clang-format on
}

View File

@ -19,6 +19,11 @@
*/
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/TenantEntryCache.actor.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbrpc/simulator.h"
#include "flow/FastRef.h"
#include "fmt/format.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
@ -606,7 +611,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
int64_t dataLen,
Arena* arena) {
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::RESTORE));
ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
DecryptBlobCipherAes256Ctr decryptor(
@ -1025,8 +1030,9 @@ private:
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<EncryptionAtRestMode> encryptMode,
Optional<BlobCipherEncryptHeader> encryptHeader) {
EncryptionAtRestMode encryptMode,
Optional<BlobCipherEncryptHeader> encryptHeader,
Optional<Reference<TenantEntryCache<Void>>> tenantCache) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
@ -1044,16 +1050,15 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
// make sure that all keys in a block belong to exactly one tenant,
// unless it's the last key, in which case it can be a truncated (different) tenant prefix
if (encryptedBlock && g_network && g_network->isSimulated()) {
ASSERT(encryptMode.present());
ASSERT(encryptHeader.present());
state KeyRef curKey = KeyRef(k, kLen);
if (!prevDomainId.present()) {
EncryptCipherDomainId domainId =
EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, encryptMode.get());
EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, encryptMode);
prevDomainId = domainId;
}
EncryptCipherDomainId curDomainId =
EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, encryptMode.get());
EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, encryptMode);
if (!curKey.empty() && !prevKey.empty() && prevDomainId.get() != curDomainId) {
ASSERT(!done);
if (curDomainId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && curDomainId != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
@ -1076,9 +1081,22 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
}
// Read a value, which must exist or the block is invalid
uint32_t vLen = reader->consumeNetworkUInt32();
const uint8_t* v = reader->consume(vLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
state uint32_t vLen = reader->consumeNetworkUInt32();
state const uint8_t* v = reader->consume(vLen);
if (tenantCache.present() && !isSystemKey(KeyRef(k, kLen))) {
state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(StringRef(k, kLen));
Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(tenantId));
// The first and last KV pairs are not restored, so if the tenant is not found for the last key then it's ok
// to include it in the restore set
if (!payload.present() && !(reader->eof() || *reader->rptr == 0xFF)) {
TraceEvent(SevWarnAlways, "SnapshotRestoreTenantNotFound").detail("TenantId", tenantId);
CODE_PROBE(true, "Snapshot restore tenant not found");
} else {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
}
} else {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
}
// If eof reached or first byte of next key len is 0xFF then a valid block end was reached.
if (reader->eof() || *reader->rptr == 0xFF)
@ -1096,7 +1114,7 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
int64_t offset,
int len,
Optional<Database> cx) {
Database cx) {
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
if (rLen != len)
@ -1107,19 +1125,26 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
state Standalone<VectorRef<KeyValueRef>> results({}, buf.arena());
state StringRefReader reader(buf, restore_corrupted_data());
state Arena arena;
state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
if (config.tenantMode == TenantMode::REQUIRED) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
state EncryptionAtRestMode encryptMode = config.encryptionAtRestMode;
state int64_t blockTenantId = TenantInfo::INVALID_TENANT;
try {
// Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION or
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(
&reader, &results, false, Optional<EncryptionAtRestMode>(), Optional<BlobCipherEncryptHeader>()));
wait(
decodeKVPairs(&reader, &results, false, encryptMode, Optional<BlobCipherEncryptHeader>(), tenantCache));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
// decode options struct
uint32_t optionsLen = reader.consumeNetworkUInt32();
state uint32_t optionsLen = reader.consumeNetworkUInt32();
const uint8_t* o = reader.consume(optionsLen);
StringRef optionsStringRef = StringRef(o, optionsLen);
EncryptedRangeFileWriter::Options options =
@ -1127,29 +1152,38 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
ASSERT(!options.compressionEnabled);
// read encryption header
const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
state const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
blockTenantId = header.cipherTextDetails.encryptDomainId;
if (config.tenantMode == TenantMode::REQUIRED && !isReservedEncryptDomain(blockTenantId)) {
ASSERT(tenantCache.present());
Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(blockTenantId));
if (!payload.present()) {
throw tenant_not_found();
}
}
const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
// calculate the total bytes read up to (and including) the header
int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
// get the size of the encrypted payload and decrypt it
int64_t dataLen = len - bytesRead;
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
wait(EncryptedRangeFileWriter::decrypt(cx, header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
state Optional<EncryptionAtRestMode> encryptMode;
if (g_network && g_network->isSimulated()) {
// The encryption mode is only used during simulation for a sanity check
DatabaseConfiguration config = wait(getDatabaseConfiguration(cx.get()));
encryptMode = config.encryptionAtRestMode;
}
wait(decodeKVPairs(&reader, &results, true, encryptMode, header));
wait(decodeKVPairs(&reader, &results, true, encryptMode, header, tenantCache));
} else {
throw restore_unsupported_file_version();
}
return results;
} catch (Error& e) {
if (e.code() == error_code_encrypt_keys_fetch_failed) {
TraceEvent(SevWarnAlways, "SnapshotRestoreEncryptKeyFetchFailed").detail("TenantId", blockTenantId);
CODE_PROBE(true, "Snapshot restore encrypt keys not found");
} else if (e.code() == error_code_tenant_not_found) {
TraceEvent(SevWarnAlways, "EncryptedSnapshotRestoreTenantNotFound").detail("TenantId", blockTenantId);
CODE_PROBE(true, "Encrypted Snapshot restore tenant not found");
}
TraceEvent(SevWarn, "FileRestoreDecodeRangeFileBlockFailed")
.error(e)
.detail("Filename", file->getFilename())
@ -3529,6 +3563,16 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
return returnStr;
}
ACTOR static Future<Void> _validTenantAccess(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache) {
if (isSystemKey(key)) {
return Void();
}
state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(key);
Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache->getById(tenantId));
ASSERT(payload.present());
return Void();
}
ACTOR static Future<Void> _execute(Database cx,
Reference<TaskBucket> taskBucket,
Reference<FutureBucket> futureBucket,
@ -3577,8 +3621,25 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
}
state Reference<IAsyncFile> inFile = wait(bc.get()->readFile(rangeFile.fileName));
state Standalone<VectorRef<KeyValueRef>> blockData =
wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx));
state Standalone<VectorRef<KeyValueRef>> blockData;
try {
Standalone<VectorRef<KeyValueRef>> data = wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx));
blockData = data;
} catch (Error& e) {
// It's possible a tenant was deleted and the encrypt key fetch failed
if (e.code() == error_code_encrypt_keys_fetch_failed || e.code() == error_code_tenant_not_found) {
return Void();
}
throw;
}
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
state std::vector<Future<Void>> validTenantCheckFutures;
state Arena arena;
state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
if (config.tenantMode == TenantMode::REQUIRED && g_network && g_network->isSimulated()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
// First and last key are the range for this file
state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key);
@ -3656,6 +3717,12 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
for (; i < iend; ++i) {
tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
if (tenantCache.present()) {
validTenantCheckFutures.push_back(_validTenantAccess(
StringRef(arena,
data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())),
tenantCache.get()));
}
tr->set(data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()),
data[i].value);
}
@ -3671,6 +3738,11 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
wait(tr->commit());
if (!validTenantCheckFutures.empty()) {
waitForAll(validTenantCheckFutures);
validTenantCheckFutures.clear();
}
TraceEvent("FileRestoreCommittedRange")
.suppressFor(60)
.detail("RestoreUID", restore.getUid())
@ -4663,7 +4735,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
keyRangesFilter.push_back_deep(keyRangesFilter.arena(), KeyRangeRef(r));
}
state Optional<RestorableFileSet> restorable =
wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter, logsOnly, beginVersion));
wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, logsOnly, beginVersion));
if (!restorable.present())
throw restore_missing_data();
@ -4917,7 +4989,7 @@ public:
.detail("OverrideTargetVersion", targetVersion);
}
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion, cx));
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
@ -5895,7 +5967,7 @@ public:
beginVersion = *std::min_element(beginVersions.begin(), beginVersions.end());
}
Optional<RestorableFileSet> restoreSet =
wait(bc->getRestoreSet(targetVersion, cx, ranges, onlyApplyMutationLogs, beginVersion));
wait(bc->getRestoreSet(targetVersion, ranges, onlyApplyMutationLogs, beginVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")

View File

@ -46,6 +46,7 @@
#include "flow/Platform.h"
#include "flow/ProtocolVersion.h"
#include "flow/UnitTest.h"
#include "flow/Trace.h"
#ifdef __unixish__
#include <fcntl.h>
@ -1466,8 +1467,11 @@ ThreadFuture<Standalone<VectorRef<BlobGranuleChunkRef>>> MultiVersionTransaction
Version beginVersion,
Optional<Version> readVersion,
Version* readVersionOut) {
// can't call this directly
return ThreadFuture<Standalone<VectorRef<BlobGranuleChunkRef>>>(unsupported_operation());
return executeOperation(&ITransaction::readBlobGranulesStart,
keyRange,
std::forward<Version>(beginVersion),
std::forward<Optional<Version>>(readVersion),
std::forward<Version*>(readVersionOut));
}
ThreadResult<RangeResult> MultiVersionTransaction::readBlobGranulesFinish(
@ -2903,123 +2907,129 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option,
}
void MultiVersionApi::setupNetwork() {
if (!externalClient) {
loadEnvironmentVariableNetworkOptions();
}
uint64_t transportId = 0;
{ // lock scope
MutexHolder holder(lock);
if (networkStartSetup) {
throw network_already_setup();
try {
if (!externalClient) {
loadEnvironmentVariableNetworkOptions();
}
if (threadCount > 1) {
disableLocalClient();
}
uint64_t transportId = 0;
{ // lock scope
MutexHolder holder(lock);
if (networkStartSetup) {
throw network_already_setup();
}
if (!apiVersion.hasFailOnExternalClientErrors()) {
ignoreExternalClientFailures = true;
}
if (threadCount > 1) {
disableLocalClient();
}
for (auto i : externalClientDescriptions) {
std::string path = i.second.libPath;
std::string filename = basename(path);
bool useFutureVersion = i.second.useFutureVersion;
networkStartSetup = true;
// Copy external lib for each thread
if (externalClients.count(filename) == 0) {
externalClients[filename] = {};
auto libCopies = copyExternalLibraryPerThread(path);
for (int idx = 0; idx < libCopies.size(); ++idx) {
bool unlinkOnLoad = libCopies[idx].second && !retainClientLibCopies;
externalClients[filename].push_back(Reference<ClientInfo>(
new ClientInfo(new DLApi(libCopies[idx].first, unlinkOnLoad /*unlink on load*/),
path,
useFutureVersion,
idx)));
if (externalClientDescriptions.empty() && localClientDisabled) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and no external clients configured");
throw no_external_client_provided();
}
if (externalClientDescriptions.empty() && !disableBypass) {
bypassMultiClientApi = true; // SOMEDAY: we won't be able to set this option once it becomes possible to
// add clients after setupNetwork is called
}
if (!bypassMultiClientApi) {
transportId =
(uint64_t(uint32_t(platform::getRandomSeed())) << 32) ^ uint32_t(platform::getRandomSeed());
if (transportId <= 1)
transportId += 2;
localClient->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
}
localClient->api->setupNetwork();
if (!apiVersion.hasFailOnExternalClientErrors()) {
ignoreExternalClientFailures = true;
}
for (auto i : externalClientDescriptions) {
std::string path = i.second.libPath;
std::string filename = basename(path);
bool useFutureVersion = i.second.useFutureVersion;
// Copy external lib for each thread
if (externalClients.count(filename) == 0) {
externalClients[filename] = {};
auto libCopies = copyExternalLibraryPerThread(path);
for (int idx = 0; idx < libCopies.size(); ++idx) {
bool unlinkOnLoad = libCopies[idx].second && !retainClientLibCopies;
externalClients[filename].push_back(Reference<ClientInfo>(
new ClientInfo(new DLApi(libCopies[idx].first, unlinkOnLoad /*unlink on load*/),
path,
useFutureVersion,
idx)));
}
}
}
}
if (externalClients.empty() && localClientDisabled) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and no external clients configured");
localClient->loadVersion();
throw no_external_client_provided();
if (bypassMultiClientApi) {
networkSetup = true;
} else {
runOnExternalClientsAllThreads(
[this](Reference<ClientInfo> client) {
TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath);
client->api->selectApiVersion(apiVersion.version());
if (client->useFutureVersion) {
client->api->useFutureProtocolVersion();
}
client->loadVersion();
},
false,
!ignoreExternalClientFailures);
std::string baseTraceFileId;
if (apiVersion.hasTraceFileIdentifier()) {
// TRACE_FILE_IDENTIFIER option is supported since 6.3
baseTraceFileId = traceFileIdentifier.empty() ? format("%d", getpid()) : traceFileIdentifier;
}
MutexHolder holder(lock);
runOnExternalClientsAllThreads(
[this, transportId, baseTraceFileId](Reference<ClientInfo> client) {
for (auto option : options) {
client->api->setNetworkOption(option.first, option.second.castTo<StringRef>());
}
client->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
if (!baseTraceFileId.empty()) {
client->api->setNetworkOption(FDBNetworkOptions::TRACE_FILE_IDENTIFIER,
traceShareBaseNameAmongThreads
? baseTraceFileId
: client->getTraceFileIdentifier(baseTraceFileId));
}
client->api->setupNetwork();
},
false,
!ignoreExternalClientFailures);
if (localClientDisabled && !hasNonFailedExternalClients()) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and all external clients failed");
throw all_external_clients_failed();
}
networkSetup = true; // Needs to be guarded by mutex
}
networkStartSetup = true;
if (externalClients.empty() && !disableBypass) {
bypassMultiClientApi = true; // SOMEDAY: we won't be able to set this option once it becomes possible to
// add clients after setupNetwork is called
}
if (!bypassMultiClientApi) {
transportId = (uint64_t(uint32_t(platform::getRandomSeed())) << 32) ^ uint32_t(platform::getRandomSeed());
if (transportId <= 1)
transportId += 2;
localClient->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
}
localClient->api->setupNetwork();
options.clear();
updateSupportedVersions();
} catch (Error& e) {
// Make sure all error and warning events are traced
flushTraceFileVoid();
throw e;
}
localClient->loadVersion();
if (bypassMultiClientApi) {
networkSetup = true;
} else {
runOnExternalClientsAllThreads(
[this](Reference<ClientInfo> client) {
TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath);
client->api->selectApiVersion(apiVersion.version());
if (client->useFutureVersion) {
client->api->useFutureProtocolVersion();
}
client->loadVersion();
},
false,
!ignoreExternalClientFailures);
std::string baseTraceFileId;
if (apiVersion.hasTraceFileIdentifier()) {
// TRACE_FILE_IDENTIFIER option is supported since 6.3
baseTraceFileId = traceFileIdentifier.empty() ? format("%d", getpid()) : traceFileIdentifier;
}
MutexHolder holder(lock);
runOnExternalClientsAllThreads(
[this, transportId, baseTraceFileId](Reference<ClientInfo> client) {
for (auto option : options) {
client->api->setNetworkOption(option.first, option.second.castTo<StringRef>());
}
client->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
if (!baseTraceFileId.empty()) {
client->api->setNetworkOption(FDBNetworkOptions::TRACE_FILE_IDENTIFIER,
traceShareBaseNameAmongThreads
? baseTraceFileId
: client->getTraceFileIdentifier(baseTraceFileId));
}
client->api->setupNetwork();
},
false,
!ignoreExternalClientFailures);
if (localClientDisabled && !hasNonFailedExternalClients()) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and all external clients failed");
throw all_external_clients_failed();
}
networkSetup = true; // Needs to be guarded by mutex
}
options.clear();
updateSupportedVersions();
}
THREAD_FUNC_RETURN runNetworkThread(void* param) {

View File

@ -161,7 +161,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT);
// The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h.
NetworkOptions::NetworkOptions()
: traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
traceFormat("xml"), traceClockSource("now"),
traceFormat("xml"), traceClockSource("now"), traceInitializeOnSetup(false),
supportedVersions(new ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>()), runLoopProfilingEnabled(false),
primaryClient(true) {}
@ -2217,6 +2217,99 @@ void DatabaseContext::expireThrottles() {
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs);
// Initialize tracing for FDB client
//
// connRecord is necessary for determining the local IP, which is then included in the trace
// file name, and also used to annotate all trace events.
//
// If trace_initialize_on_setup is not set, tracing is initialized when opening a database.
// In that case we can immediately determine the IP. Thus, we can use the IP in the
// trace file name and annotate all events with it.
//
// If the trace_initialize_on_setup network option is set, tracing is at first initialized without
// connRecord and thus without the local IP. In that case we cannot use the local IP in the
// trace file names. The IP is then provided by a repeated call to initializeClientTracing
// when opening a database. All tracing events from this point are annotated with the local IP.
//
// If tracing initialization is completed, further calls to initializeClientTracing are ignored
void initializeClientTracing(Reference<IClusterConnectionRecord> connRecord, Optional<int> apiVersion) {
if (!networkOptions.traceDirectory.present()) {
return;
}
bool initialized = traceFileIsOpen();
if (initialized && (isTraceLocalAddressSet() || !connRecord)) {
// Tracing initialization is completed
return;
}
// Network must be created before initializing tracing
ASSERT(g_network);
Optional<NetworkAddress> localAddress;
if (connRecord) {
auto publicIP = determinePublicIPAutomatically(connRecord->getConnectionString());
localAddress = NetworkAddress(publicIP, ::getpid());
}
platform::ImageInfo imageInfo = platform::getImageInfo();
if (initialized) {
// Tracing already initialized, just need to update the IP address
setTraceLocalAddress(localAddress.get());
TraceEvent("ClientStart")
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr))
.detail("ApiVersion", apiVersion)
.detail("ClientLibrary", imageInfo.fileName)
.detailf("ImageOffset", "%p", imageInfo.offset)
.detail("Primary", networkOptions.primaryClient)
.trackLatest("ClientStart");
} else {
// Initialize tracing
selectTraceFormatter(networkOptions.traceFormat);
selectTraceClockSource(networkOptions.traceClockSource);
addUniversalTraceField("ClientDescription",
format("%s-%s-%" PRIu64,
networkOptions.primaryClient ? "primary" : "external",
FDB_VT_VERSION,
deterministicRandom()->randomUInt64()));
std::string identifier = networkOptions.traceFileIdentifier;
openTraceFile(localAddress,
networkOptions.traceRollSize,
networkOptions.traceMaxLogsSize,
networkOptions.traceDirectory.get(),
"trace",
networkOptions.traceLogGroup,
identifier,
networkOptions.tracePartialFileSuffix);
TraceEvent("ClientStart")
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr))
.detail("ApiVersion", apiVersion)
.detail("ClientLibrary", imageInfo.fileName)
.detailf("ImageOffset", "%p", imageInfo.offset)
.detail("Primary", networkOptions.primaryClient)
.trackLatest("ClientStart");
g_network->initMetrics();
FlowTransport::transport().initMetrics();
initTraceEventMetrics();
}
// Initialize system monitoring once the local IP is available
if (localAddress.present()) {
initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(localAddress.get().ip)));
systemMonitor();
uncancellable(recurring(&systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));
}
}
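// Illustrative only, not part of this change: a hedged sketch of the client flow the new option
// enables, written against the fdbclient entry points referenced above; the option values and the
// cluster-file path are placeholders.
//
//   setNetworkOption(FDBNetworkOptions::TRACE_ENABLE, "/var/log/fdb"_sr);
//   setNetworkOption(FDBNetworkOptions::TRACE_INITIALIZE_ON_SETUP);
//   setupNetwork();                         // trace files open here, without the local IP
//   Database db = Database::createDatabase("fdb.cluster", apiVersion);
//                                           // initializeClientTracing runs again with a connection
//                                           // record, sets the local address, and starts system
//                                           // monitoring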
// Creates a database object that represents a connection to a cluster
// This constructor uses a preallocated DatabaseContext that may have been created
// on another thread
@ -2230,49 +2323,7 @@ Database Database::createDatabase(Reference<IClusterConnectionRecord> connRecord
ASSERT(TraceEvent::isNetworkThread());
platform::ImageInfo imageInfo = platform::getImageInfo();
if (connRecord) {
if (networkOptions.traceDirectory.present() && !traceFileIsOpen()) {
g_network->initMetrics();
FlowTransport::transport().initMetrics();
initTraceEventMetrics();
auto publicIP = determinePublicIPAutomatically(connRecord->getConnectionString());
selectTraceFormatter(networkOptions.traceFormat);
selectTraceClockSource(networkOptions.traceClockSource);
addUniversalTraceField("ClientDescription",
format("%s-%s-%" PRIu64,
networkOptions.primaryClient ? "primary" : "external",
FDB_VT_VERSION,
getTraceThreadId()));
openTraceFile(NetworkAddress(publicIP, ::getpid()),
networkOptions.traceRollSize,
networkOptions.traceMaxLogsSize,
networkOptions.traceDirectory.get(),
"trace",
networkOptions.traceLogGroup,
networkOptions.traceFileIdentifier,
networkOptions.tracePartialFileSuffix);
TraceEvent("ClientStart")
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr))
.detail("ApiVersion", apiVersion)
.detail("ClientLibrary", imageInfo.fileName)
.detailf("ImageOffset", "%p", imageInfo.offset)
.detail("Primary", networkOptions.primaryClient)
.trackLatest("ClientStart");
initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP)));
systemMonitor();
uncancellable(recurring(&systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));
}
}
initializeClientTracing(connRecord, apiVersion);
g_network->initTLS();
@ -2324,7 +2375,7 @@ Database Database::createDatabase(Reference<IClusterConnectionRecord> connRecord
.detail("Version", FDB_VT_VERSION)
.detail("ClusterFile", connRecord ? connRecord->toString() : "None")
.detail("ConnectionString", connRecord ? connRecord->getConnectionString().toString() : "None")
.detail("ClientLibrary", imageInfo.fileName)
.detail("ClientLibrary", platform::getImageInfo().fileName)
.detail("Primary", networkOptions.primaryClient)
.detail("Internal", internal)
.trackLatest(database->connectToDatabaseEventCacheHolder.trackingKey);
@ -2408,6 +2459,9 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> valu
validateOptionValuePresent(value);
networkOptions.tracePartialFileSuffix = value.get().toString();
break;
case FDBNetworkOptions::TRACE_INITIALIZE_ON_SETUP:
networkOptions.traceInitializeOnSetup = true;
break;
case FDBNetworkOptions::KNOB: {
validateOptionValuePresent(value);
@ -2608,6 +2662,10 @@ void setupNetwork(uint64_t transportId, UseMetrics useMetrics) {
FlowTransport::createInstance(true, transportId, WLTOKEN_RESERVED_COUNT);
Net2FileSystem::newFileSystem();
if (networkOptions.traceInitializeOnSetup) {
::initializeClientTracing({}, {});
}
uncancellable(monitorNetworkBusyness());
}
@ -2803,6 +2861,10 @@ int64_t Tenant::id() const {
return idFuture.get();
}
Future<int64_t> Tenant::getIdFuture() const {
return idFuture;
}
KeyRef Tenant::prefix() const {
ASSERT(idFuture.isReady());
if (bigEndianId == -1) {
@ -8599,24 +8661,36 @@ ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLoc
try {
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly
// because of moving data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
state Key beginKey = locations[i].range.begin;
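// A single reply may be truncated server-side (see the new SPLIT_METRICS_MAX_ROWS knob); when
// res.more is set, re-issue the request starting from the last returned split point until this
// shard range is exhausted.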
loop {
KeyRangeRef range(beginKey, locations[i].range.end);
SplitMetricsRequest req(range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly
// because of moving data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
if (res.more && res.splits.size()) {
// Next request will return split points after this one
beginKey = KeyRef(beginKey.arena(), res.splits.back());
} else {
break;
}
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
}
}
if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
@ -10911,7 +10985,7 @@ ACTOR Future<bool> blobRestoreActor(Reference<DatabaseContext> cx, KeyRange rang
Optional<Value> value = wait(tr->get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (status.phase != BlobRestorePhase::DONE) {
if (status.phase < BlobRestorePhase::DONE) {
return false; // stop if there is in-progress restore.
}
}

View File

@ -69,6 +69,11 @@ S3BlobStoreEndpoint::Stats S3BlobStoreEndpoint::Stats::operator-(const Stats& rh
}
S3BlobStoreEndpoint::Stats S3BlobStoreEndpoint::s_stats;
std::unique_ptr<S3BlobStoreEndpoint::BlobStats> S3BlobStoreEndpoint::blobStats;
Future<Void> S3BlobStoreEndpoint::statsLogger = Never();
std::unordered_map<BlobStoreConnectionPoolKey, Reference<S3BlobStoreEndpoint::ConnectionPoolData>>
S3BlobStoreEndpoint::globalConnectionPool;
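// Process-wide connection pool shared across S3BlobStoreEndpoint instances, keyed by connection
// target; presumably selected in place of the per-endpoint pool when the new
// global_connection_pool ("gcp") knob is enabled.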
S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
secure_connection = 1;
@ -96,6 +101,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
max_send_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
max_recv_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
sdk_auth = false;
global_connection_pool = CLIENT_KNOBS->BLOBSTORE_GLOBAL_CONNECTION_POOL;
}
bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
@ -134,6 +140,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(max_send_bytes_per_second, sbps);
TRY_PARAM(max_recv_bytes_per_second, rbps);
TRY_PARAM(sdk_auth, sa);
TRY_PARAM(global_connection_pool, gcp);
#undef TRY_PARAM
return false;
}
@ -171,6 +178,8 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const {
_CHECK_PARAM(read_cache_blocks_per_file, rcb);
_CHECK_PARAM(max_send_bytes_per_second, sbps);
_CHECK_PARAM(max_recv_bytes_per_second, rbps);
_CHECK_PARAM(sdk_auth, sa);
_CHECK_PARAM(global_connection_pool, gcp);
#undef _CHECK_PARAM
return r;
}
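// Illustrative only, not part of this change: assuming the usual blobstore query-parameter
// syntax, the new knob can also be set per URL, e.g.
//   blobstore://<key>:<secret>@<host>/<name>?bucket=<bucket>&gcp=0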
@ -721,20 +730,23 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
bool* reusingConn) {
// First try to get a connection from the pool
*reusingConn = false;
while (!b->connectionPool.empty()) {
S3BlobStoreEndpoint::ReusableConnection rconn = b->connectionPool.front();
b->connectionPool.pop();
while (!b->connectionPool->pool.empty()) {
S3BlobStoreEndpoint::ReusableConnection rconn = b->connectionPool->pool.front();
b->connectionPool->pool.pop();
// If the connection expires in the future then return it
if (rconn.expirationTime > now()) {
*reusingConn = true;
++b->blobStats->reusedConnections;
TraceEvent("S3BlobStoreEndpointReusingConnected")
.suppressFor(60)
.detail("RemoteEndpoint", rconn.conn->getPeerAddress())
.detail("ExpiresIn", rconn.expirationTime - now());
return rconn;
}
++b->blobStats->expiredConnections;
}
++b->blobStats->newConnections;
std::string host = b->host, service = b->service;
if (service.empty()) {
if (b->useProxy) {
@ -743,7 +755,7 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
}
service = b->knobs.secure_connection ? "https" : "http";
}
bool isTLS = b->knobs.secure_connection == 1;
bool isTLS = b->knobs.isTLS();
state Reference<IConnection> conn;
if (b->useProxy) {
if (isTLS) {
@ -779,7 +791,9 @@ Future<S3BlobStoreEndpoint::ReusableConnection> S3BlobStoreEndpoint::connect(boo
void S3BlobStoreEndpoint::returnConnection(ReusableConnection& rconn) {
// If it expires in the future then add it to the pool in the front
if (rconn.expirationTime > now()) {
connectionPool.push(rconn);
connectionPool->pool.push(rconn);
} else {
++blobStats->expiredConnections;
}
rconn.conn = Reference<IConnection>();
}
@ -945,6 +959,8 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
// received the "Connection: close" header.
if (r->headers["Connection"] != "close") {
bstore->returnConnection(rconn);
} else {
++bstore->blobStats->expiredConnections;
}
rconn.conn.clear();
@ -958,16 +974,19 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
double end = g_network->timer();
double connectDuration = reqStartTimer - connectStartTimer;
double reqDuration = end - reqStartTimer;
bstore->blobStats->requestLatency.addMeasurement(reqDuration);
// If err is not present then r is valid.
// If r->code is in successCodes then record the successful request and return r.
if (!err.present() && successCodes.count(r->code) != 0) {
bstore->s_stats.requests_successful++;
++bstore->blobStats->requestsSuccessful;
return r;
}
// Otherwise, this request is considered failed. Update failure count.
bstore->s_stats.requests_failed++;
++bstore->blobStats->requestsFailed;
// All errors in err are potentially retryable as well as certain HTTP response codes...
bool retryable = err.present() || r->code == 500 || r->code == 502 || r->code == 503 || r->code == 429;
@ -1014,6 +1033,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
++thisTry;
if (fastRetry) {
++bstore->blobStats->fastRetries;
wait(delay(0));
} else if (retryable) {
// We will wait delay seconds before the next retry, start with nextRetryDelay.

View File

@ -981,6 +981,10 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"expired_age": 0,
"oldest_id_version": 0,
"oldest_id_age": 0
},
"version_epoch":{
"enabled": false,
"epoch": 0
}
},
"client":{

View File

@ -70,7 +70,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_MESSAGE_SIZE, std::max<int>(LOG_SYSTEM_PUSHED_DATA_BLOCK_SIZE, 1e5 + 2e4 + 1) + 8 ); // VALUE_SIZE_LIMIT + SYSTEM_KEY_SIZE_LIMIT + 9 bytes (4 bytes for length, 4 bytes for sequence number, and 1 byte for mutation type)
init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 );
init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 0.1 : 120;
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 120; // Cannot be buggified lower without changing the following assert in LogSystemPeekCursor.actor.cpp: ASSERT_WE_THINK(e.code() == error_code_operation_obsolete || SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
init( PEEK_USING_STREAMING, false ); if( randomize && isSimulated && BUGGIFY ) PEEK_USING_STREAMING = true;
init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );
@ -736,8 +736,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BW_THROTTLING_ENABLED, true );
bool buggifySmallBWLag = randomize && BUGGIFY;
init( TARGET_BW_LAG, 240.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0;
init( TARGET_BW_LAG_BATCH, 200.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0;
init( TARGET_BW_LAG, 90.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0;
init( TARGET_BW_LAG_BATCH, 60.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0;
init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0;
init( MIN_BW_HISTORY, 10 );
init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0;
@ -746,7 +746,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BW_FETCH_WORKERS_INTERVAL, 5.0 );
init( BW_RW_LOGGING_INTERVAL, 5.0 );
init( BW_MAX_BLOCKED_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_MAX_BLOCKED_INTERVAL = 2.0;
init( BW_RK_SIM_QUIESCE_DELAY, 150.0 );
init( BW_RK_SIM_QUIESCE_DELAY, 300.0 );
init( MAX_AUTO_THROTTLED_TRANSACTION_TAGS, 5 ); if(randomize && BUGGIFY) MAX_AUTO_THROTTLED_TRANSACTION_TAGS = 1;
init( MAX_MANUAL_THROTTLED_TRANSACTION_TAGS, 40 ); if(randomize && BUGGIFY) MAX_MANUAL_THROTTLED_TRANSACTION_TAGS = 1;
@ -850,6 +850,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// This exists for flexibility but assigning each ReadType to its own unique priority number makes the most sense
// The enumeration is currently: eager, fetch, low, normal, high
init( STORAGESERVER_READTYPE_PRIORITY_MAP, "0,1,2,3,4" );
init( SPLIT_METRICS_MAX_ROWS, 10000 );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -974,7 +975,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_IO_PRIORITIES, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
init( LATENCY_SKETCH_ACCURACY, 0.01 );
@ -1005,9 +1005,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY);
// BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs"
init( BG_METADATA_SOURCE, "knobs" );
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 20000000 ); if ( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 50000 * deterministicRandom()->randomInt(1, 4); else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 50000 * deterministicRandom()->randomInt(1, 20);
init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8));
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 );
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 ); if ( randomize && BUGGIFY ) BG_DELTA_BYTES_BEFORE_COMPACT *= (1.0 + deterministicRandom()->random01() * 3.0)/2.0;
init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 );
init( BG_DELTA_FILE_TARGET_CHUNK_BYTES, 32*1024 ); if ( randomize && BUGGIFY ) BG_DELTA_FILE_TARGET_CHUNK_BYTES = BG_DELTA_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 7));
init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15);
@ -1021,16 +1021,21 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_WRITE_MULTIPART, false ); if (randomize && BUGGIFY) BG_WRITE_MULTIPART = true;
init( BG_ENABLE_DYNAMIC_WRITE_AMP, true ); if (randomize && BUGGIFY) BG_ENABLE_DYNAMIC_WRITE_AMP = false;
init( BG_DYNAMIC_WRITE_AMP_MIN_FACTOR, 0.5 );
init( BG_DYNAMIC_WRITE_AMP_DECREASE_FACTOR, 0.8 );
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 );
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
// The resnapshot/delta parallelism knobs are deprecated and replaced by the budget_bytes knobs! FIXME: remove after next release
init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
init( BLOB_WORKER_RESNAPSHOT_BUDGET_BYTES, 1024*1024*1024 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_BUDGET_BYTES = deterministicRandom()->random01() * 10 * BG_SNAPSHOT_FILE_TARGET_BYTES;
init( BLOB_WORKER_DELTA_WRITE_BUDGET_BYTES, 1024*1024*1024 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_WRITE_BUDGET_BYTES = (5 + 45*deterministicRandom()->random01()) * BG_DELTA_FILE_TARGET_BYTES;
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );
@ -1050,6 +1055,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_RW_ROWS, isSimulated ? 10 : 1000);
init( BLOB_RESTORE_MLOGS_URL, isSimulated ? "file://simfdb/backups/" : "");
init( BLOB_MIGRATOR_ERROR_RETRIES, 20);
init( BLOB_RESTORE_MANIFEST_URL, isSimulated ? "file://simfdb/fdbblob/manifest" : "");
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );

View File

@ -43,9 +43,11 @@ int64_t extractTenantIdFromMutation(MutationRef m) {
if (isSingleKeyMutation((MutationRef::Type)m.type)) {
// The first 8 bytes of the key of this OP is also an 8-byte number
if (m.type == MutationRef::SetVersionstampedKey && m.param1.size() >= 4 &&
parseVersionstampOffset(m.param1) < 8) {
return TenantInfo::INVALID_TENANT;
if (m.type == MutationRef::SetVersionstampedKey && m.param1.size() >= 4) {
// when the timestamp overlaps with the first 8 bytes
if (parseVersionstampOffset(m.param1) < 8) {
return TenantInfo::INVALID_TENANT;
}
}
} else {
// Assumes clear range mutations are split on tenant boundaries

View File

@ -245,7 +245,7 @@ Reference<ITransaction> ThreadSafeTenant::createTransaction() {
ThreadFuture<int64_t> ThreadSafeTenant::getId() {
Tenant* tenant = this->tenant;
return onMainThread([tenant]() -> Future<int64_t> { return tenant->id(); });
return onMainThread([tenant]() -> Future<int64_t> { return tenant->getIdFuture(); });
}
ThreadFuture<Key> ThreadSafeTenant::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {

View File

@ -59,8 +59,8 @@ struct AuditStorageState {
std::string toString() const {
std::string res = "AuditStorageState: [ID]: " + id.toString() +
"[Range]: " + Traceable<KeyRangeRef>::toString(range) + "[Type]: " + std::to_string(type) +
"[Phase]: " + std::to_string(phase);
", [Range]: " + Traceable<KeyRangeRef>::toString(range) +
", [Type]: " + std::to_string(type) + ", [Phase]: " + std::to_string(phase);
if (!error.empty()) {
res += "[Error]: " + error;
}

View File

@ -1004,7 +1004,7 @@ namespace fileBackup {
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
int64_t offset,
int len,
Optional<Database> cx);
Database cx);
// Reads a mutation log block from file and parses into batch mutation blocks for further parsing.
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeMutationLogFileBlock(Reference<IAsyncFile> file,

View File

@ -253,7 +253,7 @@ public:
// Returns the key ranges in the snapshot file. This is an expensive function
// and should only be used in simulation for sanity check.
virtual Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Optional<Database> cx) = 0;
virtual Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Database cx) = 0;
struct ExpireProgress {
std::string step;
@ -292,7 +292,6 @@ public:
// If logsOnly is set, only use log files in [beginVersion, targetVervions) in restore set.
// Returns non-present if restoring to the given version is not possible.
virtual Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion,
Optional<Database> cx,
VectorRef<KeyRangeRef> keyRangesFilter = {},
bool logsOnly = false,
Version beginVersion = -1) = 0;

View File

@ -155,10 +155,9 @@ public:
ExpireProgress* progress,
Version restorableBeginVersion) final;
Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Optional<Database> cx) final;
Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Database cx) final;
Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion,
Optional<Database> cx,
VectorRef<KeyRangeRef> keyRangesFilter,
bool logsOnly,
Version beginVersion) final;

View File

@ -70,6 +70,7 @@ public:
KV_REDWOOD,
BLOB_GRANULE,
BACKUP,
RESTORE,
TEST,
MAX,
};

View File

@ -55,6 +55,23 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
}
};
#pragma pack(push, 4)
struct GranuleMutationRef {
MutationRef::Type type;
Version version;
StringRef param1;
StringRef param2;
GranuleMutationRef() {}
GranuleMutationRef(MutationRef::Type t, Version v, StringRef param1, StringRef param2)
: type(t), version(v), param1(param1), param2(param2) {}
GranuleMutationRef(Arena& to, MutationRef::Type t, Version v, StringRef param1, StringRef param2)
: type(t), version(v), param1(to, param1), param2(to, param2) {}
GranuleMutationRef(Arena& to, const GranuleMutationRef& from)
: type(from.type), version(from.version), param1(to, from.param1), param2(to, from.param2) {}
};
#pragma pack(pop)
struct GranuleMaterializeStats {
// file-level stats
int64_t inputBytes;

View File

@ -51,7 +51,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[],
const std::vector<StringRef>& deltaFileData,
GranuleMaterializeStats& stats);
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);
@ -59,4 +59,8 @@ std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, s
// For benchmark testing only. It should never be called in prod.
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange);
// just for client passthrough. reads all key-value pairs from a snapshot file, and all mutations from a delta file
RangeResult bgReadSnapshotFile(const StringRef& data);
Standalone<VectorRef<GranuleMutationRef>> bgReadDeltaFile(const StringRef& data);
#endif

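A hedged usage sketch for the new passthrough readers above: decode a delta file's raw bytes into GranuleMutationRef entries and walk them on the client. loadFileBytes is a hypothetical stand-in for however the caller obtained the file contents.

// Hypothetical helper; any source of the raw delta file bytes will do.
Standalone<StringRef> deltaBytes = loadFileBytes("delta.1234");

Standalone<VectorRef<GranuleMutationRef>> deltas = bgReadDeltaFile(deltaBytes);
for (const GranuleMutationRef& m : deltas) {
	if (m.type == MutationRef::SetValue) {
		// apply m.param1 -> m.param2 at m.version
	} else if (m.type == MutationRef::ClearRange) {
		// clear the range [m.param1, m.param2) at m.version
	}
}
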
View File

@ -0,0 +1,187 @@
/*
* BlobGranuleRequest.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_G_H)
#define FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_G_H
#include "fdbclient/BlobGranuleRequest.actor.g.h"
#elif !defined(FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_H)
#define FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_H
#include "flow/flow.h"
#include "flow/Knobs.h"
// #include "fdbclient/NativeAPI.actor.h"
#include "flow/Arena.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#define BGR_DEBUG false
ACTOR template <class Request, bool P>
Future<Standalone<VectorRef<REPLY_TYPE(Request)>>> txnDoBlobGranuleRequests(
Transaction* tr,
Key* beginKey,
Key endKey,
Request request,
RequestStream<Request, P> BlobWorkerInterface::*channel) {
// TODO KNOB
state RangeResult blobGranuleMapping = wait(krmGetRanges(
tr, blobGranuleMappingKeys.begin, KeyRangeRef(*beginKey, endKey), 64, GetRangeLimits::BYTE_LIMIT_UNLIMITED));
state int i = 0;
state std::vector<Future<ErrorOr<REPLY_TYPE(Request)>>> requests;
state Standalone<VectorRef<REPLY_TYPE(Request)>> results;
for (; i < blobGranuleMapping.size() - 1; i++) {
if (!blobGranuleMapping[i].value.size()) {
if (BGR_DEBUG) {
fmt::print("ERROR: No valid granule data for range [{0} - {1}) \n",
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable());
}
// no granule for range
throw blob_granule_transaction_too_old();
}
state UID workerId = decodeBlobGranuleMappingValue(blobGranuleMapping[i].value);
if (workerId == UID()) {
if (BGR_DEBUG) {
fmt::print("ERROR: Invalid Blob Worker ID for range [{0} - {1}) \n",
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable());
}
// no worker for granule
throw blob_granule_transaction_too_old();
}
if (!tr->trState->cx->blobWorker_interf.count(workerId)) {
Optional<Value> workerInterface = wait(tr->get(blobWorkerListKeyFor(workerId)));
// from the time the mapping was read from the db, the associated blob worker
// could have died and so its interface wouldn't be present as part of the blobWorkerList
// we persist in the db.
if (workerInterface.present()) {
tr->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get());
} else {
if (BGR_DEBUG) {
fmt::print("ERROR: Worker for range [{1} - {2}) does not exist!\n",
workerId.toString().substr(0, 5),
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable());
}
// throw to force read version to increase and to retry reading mapping
throw transaction_too_old();
}
}
if (BGR_DEBUG) {
fmt::print("Requesting range [{0} - {1}) from worker {2}!\n",
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable(),
workerId.toString().substr(0, 5));
}
KeyRangeRef range(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key);
request.reply.reset();
request.setRange(range);
// TODO consolidate?
BlobWorkerInterface bwi = tr->trState->cx->blobWorker_interf[workerId];
RequestStream<Request, P> const* stream = &(bwi.*channel);
Future<ErrorOr<REPLY_TYPE(Request)>> response = stream->tryGetReply(request);
requests.push_back(response);
}
// wait for each request. If it has an error, retry from there if it is a retriable error
state int j = 0;
for (; j < requests.size(); j++) {
try {
ErrorOr<REPLY_TYPE(Request)> result = wait(requests[j]);
if (result.isError()) {
throw result.getError();
}
results.push_back(results.arena(), result.get());
} catch (Error& e) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_request_maybe_delivered ||
e.code() == error_code_broken_promise || e.code() == error_code_connection_failed) {
// re-read mapping and retry from failed req
i = j;
break;
} else {
if (BGR_DEBUG) {
fmt::print("ERROR: Error doing request for range [{0} - {1}): {2}!\n",
blobGranuleMapping[j].key.printable(),
blobGranuleMapping[j + 1].key.printable(),
e.name());
}
throw;
}
}
}
if (i < blobGranuleMapping.size() - 1) {
// a request failed, retry from there after a sleep
*beginKey = blobGranuleMapping[i].key;
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
} else if (blobGranuleMapping.more) {
*beginKey = blobGranuleMapping.back().key;
// no requests failed but there is more to read, continue reading
} else {
*beginKey = endKey;
}
return results;
}
// FIXME: port other request types to this function
ACTOR template <class Request, bool P>
Future<Standalone<VectorRef<REPLY_TYPE(Request)>>> doBlobGranuleRequests(
Database cx,
KeyRange range,
Request request,
RequestStream<Request, P> BlobWorkerInterface::*channel) {
state Key beginKey = range.begin;
state Key endKey = range.end;
state Transaction tr(cx);
state Standalone<VectorRef<REPLY_TYPE(Request)>> results;
loop {
if (beginKey >= endKey) {
return results;
}
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Standalone<VectorRef<REPLY_TYPE(Request)>> partialResults =
wait(txnDoBlobGranuleRequests(&tr, &beginKey, endKey, request, channel));
if (!partialResults.empty()) {
results.arena().dependsOn(partialResults.arena());
results.append(results.arena(), partialResults.begin(), partialResults.size());
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
#include "flow/unactorcompiler.h"
#endif

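A hedged sketch of how a caller might use the new fan-out helper, pairing it with the FlushGranuleRequest shown later in this diff; the flushGranuleRequest stream member name on BlobWorkerInterface is assumed here for illustration.

ACTOR Future<Void> exampleFlushRange(Database cx, KeyRange range, int64_t managerEpoch, Version flushVersion) {
	// One request object is reused; doBlobGranuleRequests resets the reply and
	// narrows the range per granule before sending it to each blob worker.
	FlushGranuleRequest req(managerEpoch, range, flushVersion, /*compactAfter=*/true);
	wait(success(doBlobGranuleRequests(cx, range, req, &BlobWorkerInterface::flushGranuleRequest)));
	return Void();
}
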
View File

@ -46,10 +46,12 @@ struct BlobWorkerStats {
Counter fullRejections;
Counter forceFlushCleanups;
Counter readDrivenCompactions;
Counter oldFeedSnapshots;
int numRangesAssigned;
int mutationBytesBuffered;
int activeReadRequests;
// TODO: add gauge for granules blocking on old snapshots, once this gauge is fixed
int granulesPendingSplitCheck;
Version minimumCFVersion;
Version cfVersionLag;
@ -63,8 +65,8 @@ struct BlobWorkerStats {
LatencySample readLatencySample;
Reference<FlowLock> initialSnapshotLock;
Reference<FlowLock> resnapshotLock;
Reference<FlowLock> deltaWritesLock;
Reference<FlowLock> resnapshotBudget;
Reference<FlowLock> deltaWritesBudget;
Future<Void> logger;
@ -72,8 +74,8 @@ struct BlobWorkerStats {
explicit BlobWorkerStats(UID id,
double interval,
Reference<FlowLock> initialSnapshotLock,
Reference<FlowLock> resnapshotLock,
Reference<FlowLock> deltaWritesLock,
Reference<FlowLock> resnapshotBudget,
Reference<FlowLock> deltaWritesBudget,
double sampleLoggingInterval,
double fileOpLatencySketchAccuracy,
double requestLatencySketchAccuracy)
@ -93,17 +95,17 @@ struct BlobWorkerStats {
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics",
id,
sampleLoggingInterval,
fileOpLatencySketchAccuracy),
oldFeedSnapshots("OldFeedSnapshots", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0),
granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0),
lastResidentMemory(0), snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics",
id,
sampleLoggingInterval,
fileOpLatencySketchAccuracy),
deltaBlobWriteLatencySample("DeltaBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy),
reSnapshotLatencySample("GranuleResnapshotMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy),
readLatencySample("GranuleReadLatencyMetrics", id, sampleLoggingInterval, requestLatencySketchAccuracy),
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
deltaWritesLock(deltaWritesLock) {
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotBudget(resnapshotBudget),
deltaWritesBudget(deltaWritesBudget) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
@ -115,10 +117,10 @@ struct BlobWorkerStats {
specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; });
specialCounter(cc, "InitialSnapshotsActive", [this]() { return this->initialSnapshotLock->activePermits(); });
specialCounter(cc, "InitialSnapshotsWaiting", [this]() { return this->initialSnapshotLock->waiters(); });
specialCounter(cc, "ReSnapshotsActive", [this]() { return this->resnapshotLock->activePermits(); });
specialCounter(cc, "ReSnapshotsWaiting", [this]() { return this->resnapshotLock->waiters(); });
specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); });
specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); });
specialCounter(cc, "ReSnapshotBytesActive", [this]() { return this->resnapshotBudget->activePermits(); });
specialCounter(cc, "ReSnapshotBytesWaiting", [this]() { return this->resnapshotBudget->waiters(); });
specialCounter(cc, "DeltaFileWriteBytesActive", [this]() { return this->deltaWritesBudget->activePermits(); });
specialCounter(cc, "DeltaFileWriteBytesWaiting", [this]() { return this->deltaWritesBudget->waiters(); });
logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics");
}

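The renamed members reflect a switch from counting concurrent operations to budgeting bytes: each FlowLock now holds a byte budget, and a file write takes permits equal to its size. A sketch of that pattern, not the blob worker's exact call sites:

ACTOR Future<Void> exampleWriteWithBudget(Reference<FlowLock> deltaWritesBudget, int64_t bytesToWrite) {
	// Block until `bytesToWrite` permits are available, then hold them for the
	// duration of the write; the Releaser returns them even on error paths.
	wait(deltaWritesBudget->take(TaskPriority::DefaultYield, bytesToWrite));
	state FlowLock::Releaser releaser(*deltaWritesBudget, bytesToWrite);
	// ... perform the delta file write ...
	return Void();
}
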
View File

@ -332,15 +332,19 @@ struct FlushGranuleRequest {
int64_t managerEpoch;
KeyRange granuleRange;
Version flushVersion;
bool compactAfter;
ReplyPromise<Void> reply;
FlushGranuleRequest() : managerEpoch(-1), flushVersion(invalidVersion) {}
explicit FlushGranuleRequest(int64_t managerEpoch, KeyRange granuleRange, Version flushVersion)
: managerEpoch(managerEpoch), granuleRange(granuleRange), flushVersion(flushVersion) {}
FlushGranuleRequest() : managerEpoch(-1), flushVersion(invalidVersion), compactAfter(false) {}
explicit FlushGranuleRequest(int64_t managerEpoch, KeyRange granuleRange, Version flushVersion, bool compactAfter)
: managerEpoch(managerEpoch), granuleRange(granuleRange), flushVersion(flushVersion), compactAfter(compactAfter) {
}
void setRange(const KeyRangeRef& range) { granuleRange = range; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, managerEpoch, granuleRange, flushVersion, reply);
serializer(ar, managerEpoch, granuleRange, flushVersion, compactAfter, reply);
}
};

View File

@ -239,7 +239,11 @@ public:
int BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
int BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
int BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
int BGR_READ_BLOCK_SIZE;
bool BLOBSTORE_GLOBAL_CONNECTION_POOL;
bool BLOBSTORE_ENABLE_LOGGING;
double BLOBSTORE_STATS_LOGGING_INTERVAL;
double BLOBSTORE_LATENCY_LOGGING_INTERVAL;
double BLOBSTORE_LATENCY_LOGGING_ACCURACY;
int CONSISTENCY_CHECK_RATE_LIMIT_MAX;
int CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME;
@ -293,6 +297,10 @@ public:
// Encryption-at-rest
bool ENABLE_ENCRYPTION_CPU_TIME_LOGGING;
// This Knob is a comma-delimited string (e.g. 0,1,2,3) that specifies which tenants the EKP should throw
// key_not_found errors for. If TenantInfo::INVALID_TENANT is contained within the list then no tenants will be
// dropped. This Knob should ONLY be used in simulation for testing purposes
std::string SIMULATION_EKP_TENANT_IDS_TO_DROP;
ClientKnobs(Randomize randomize);
void initialize(Randomize randomize);

View File

@ -512,10 +512,11 @@ struct GetStorageServerRejoinInfoReply {
Optional<Tag> newTag;
bool newLocality;
std::vector<std::pair<Version, Tag>> history;
EncryptionAtRestMode encryptMode;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, tag, newTag, newLocality, history);
serializer(ar, version, tag, newTag, newLocality, history, encryptMode);
}
};

View File

@ -313,6 +313,11 @@ struct CommitTransactionRef {
bool lock_aware = false; // set when metadata mutations are present
Optional<SpanContext> spanContext;
// set by Commit Proxy
// The tenants associated with this transaction. This field only exists
// when tenant mode is required and this transaction has metadata mutations.
Optional<VectorRef<int64_t>> tenantIds;
template <class Ar>
force_inline void serialize(Ar& ar) {
if constexpr (is_fb_function<Ar>) {
@ -323,7 +328,8 @@ struct CommitTransactionRef {
read_snapshot,
report_conflicting_keys,
lock_aware,
spanContext);
spanContext,
tenantIds);
} else {
serializer(ar, read_conflict_ranges, write_conflict_ranges, mutations, read_snapshot);
if (ar.protocolVersion().hasReportConflictingKeys()) {

View File

@ -1520,6 +1520,8 @@ struct EncryptionAtRestMode {
bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); }
bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); }
bool operator==(Mode m) const { return mode == m; }
bool operator!=(Mode m) const { return mode != m; }
bool isEncryptionEnabled() const { return mode != EncryptionAtRestMode::DISABLED; }
@ -1548,6 +1550,11 @@ struct EncryptionAtRestMode {
uint32_t mode;
};
template <>
struct Traceable<EncryptionAtRestMode> : std::true_type {
static std::string toString(const EncryptionAtRestMode& mode) { return mode.toString(); }
};
typedef StringRef ClusterNameRef;
typedef Standalone<ClusterNameRef> ClusterName;

View File

@ -19,6 +19,7 @@
*/
#pragma once
#include "flow/EncryptUtils.h"
#include "flow/genericactors.actor.h"
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H)
#define FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H
#include "fdbclient/GetEncryptCipherKeys.actor.g.h"
@ -27,7 +28,9 @@
#include "fdbclient/BlobCipher.h"
#include "fdbclient/EncryptKeyProxyInterface.h"
#include "fdbclient/Knobs.h"
#include "fdbrpc/Stats.h"
#include "fdbrpc/TenantInfo.h"
#include "flow/Knobs.h"
#include "flow/IRandom.h"
@ -182,6 +185,18 @@ Future<EKPGetBaseCipherKeysByIdsReply> getUncachedEncryptCipherKeys(Reference<As
TraceEvent(SevWarn, "GetEncryptCipherKeys_RequestFailed").error(reply.error.get());
throw encrypt_keys_fetch_failed();
}
if (g_network && g_network->isSimulated() && usageType == BlobCipherMetrics::RESTORE) {
std::unordered_set<int64_t> tenantIdsToDrop =
parseStringToUnorderedSet<int64_t>(CLIENT_KNOBS->SIMULATION_EKP_TENANT_IDS_TO_DROP, ',');
if (!tenantIdsToDrop.count(TenantInfo::INVALID_TENANT)) {
for (auto& baseCipherInfo : request.baseCipherInfos) {
if (tenantIdsToDrop.count(baseCipherInfo.domainId)) {
TraceEvent("GetEncryptCipherKeys_SimulatedError").detail("DomainId", baseCipherInfo.domainId);
throw encrypt_keys_fetch_failed();
}
}
}
}
return reply;
} catch (Error& e) {
TraceEvent("GetEncryptCipherKeys_CaughtError").error(e);

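For context, SIMULATION_EKP_TENANT_IDS_TO_DROP is just a comma-delimited list such as "0,1,2,3". An illustrative re-implementation of the parse the check above relies on (the project itself uses parseStringToUnorderedSet from flow):

#include <cstdint>
#include <sstream>
#include <string>
#include <unordered_set>

std::unordered_set<int64_t> exampleParseIdList(const std::string& s) {
	std::unordered_set<int64_t> ids;
	std::stringstream ss(s);
	std::string token;
	while (std::getline(ss, token, ',')) {
		if (!token.empty())
			ids.insert(std::stoll(token)); // each entry is a tenant/domain ID
	}
	return ids;
}
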
View File

@ -604,7 +604,7 @@ struct RegisterClusterImpl {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Future<std::vector<std::pair<TenantName, TenantMapEntry>>> existingTenantsFuture =
state Future<std::vector<std::pair<TenantName, int64_t>>> existingTenantsFuture =
TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1);
state ThreadFuture<RangeResult> existingDataFuture = tr->getRange(normalKeys, 1);
@ -624,7 +624,7 @@ struct RegisterClusterImpl {
}
// Check for any existing data
std::vector<std::pair<TenantName, TenantMapEntry>> existingTenants =
std::vector<std::pair<TenantName, int64_t>> existingTenants =
wait(safeThreadFutureToFuture(existingTenantsFuture));
if (!existingTenants.empty()) {
TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName);
@ -1544,34 +1544,72 @@ Future<Void> deleteTenant(Reference<DB> db, int64_t id) {
return Void();
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransaction(Transaction tr,
TenantNameRef begin,
TenantNameRef end,
int limit) {
template <class Transaction>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenantsTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit,
int offset = 0) {
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
auto future = ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange(tr, begin, end, limit + offset);
return fmap(
[offset](auto f) {
std::vector<std::pair<TenantName, int64_t>>& results = f.results;
results.erase(results.begin(), results.begin() + offset);
return results;
},
future);
}
state KeyBackedRangeResult<std::pair<TenantName, int64_t>> matchingTenants =
wait(ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange(tr, begin, end, limit));
template <class DB>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit,
int offset = 0) {
return runTransaction(db, [=](Reference<typename DB::TransactionT> tr) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return listTenantsTransaction(tr, begin, end, limit, offset);
});
}
state std::vector<Future<TenantMapEntry>> tenantEntryFutures;
for (auto const& [name, id] : matchingTenants.results) {
tenantEntryFutures.push_back(getTenantTransaction(tr, id));
// Scan the tenant index to get a list of tenant IDs, and then look up the metadata for each ID individually
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadataTransaction(
Transaction tr,
std::vector<std::pair<TenantName, int64_t>> tenantIds) {
state int idIdx = 0;
state std::vector<Future<Optional<TenantMapEntry>>> futures;
for (; idIdx < tenantIds.size(); ++idIdx) {
futures.push_back(MetaclusterAPI::tryGetTenantTransaction(tr, tenantIds[idIdx].second));
}
wait(waitForAll(tenantEntryFutures));
wait(waitForAll(futures));
std::vector<std::pair<TenantName, TenantMapEntry>> results;
for (int i = 0; i < matchingTenants.results.size(); ++i) {
// Tenants being renamed will show up twice; once under each name
results.emplace_back(matchingTenants.results[i].first, tenantEntryFutures[i].get());
results.reserve(futures.size());
for (int i = 0; i < futures.size(); ++i) {
const TenantMapEntry& entry = futures[i].get().get();
results.emplace_back(entry.tenantName, entry);
}
return results;
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadataTransaction(Transaction tr,
TenantNameRef begin,
TenantNameRef end,
int limit) {
std::vector<std::pair<TenantName, int64_t>> matchingTenants = wait(listTenantsTransaction(tr, begin, end, limit));
std::vector<std::pair<TenantName, TenantMapEntry>> results =
wait(listTenantMetadataTransaction(tr, matchingTenants));
return results;
}
ACTOR template <class DB>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenants(
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadata(
Reference<DB> db,
TenantName begin,
TenantName end,
@ -1586,30 +1624,24 @@ Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenants(
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
if (filters.empty()) {
wait(store(results, listTenantsTransaction(tr, begin, end, limit + offset)));
if (offset >= results.size()) {
results.clear();
} else if (offset > 0) {
results.erase(results.begin(), results.begin() + offset);
}
std::vector<std::pair<TenantName, int64_t>> ids =
wait(MetaclusterAPI::listTenantsTransaction(tr, begin, end, limit, offset));
wait(store(results, MetaclusterAPI::listTenantMetadataTransaction(tr, ids)));
return results;
}
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
// read in batch
state int count = 0;
loop {
std::vector<std::pair<TenantName, TenantMapEntry>> tenantBatch =
wait(listTenantsTransaction(tr, begin, end, std::max(limit + offset, 1000)));
wait(MetaclusterAPI::listTenantMetadataTransaction(tr, begin, end, std::max(limit + offset, 1000)));
if (tenantBatch.empty()) {
return results;
}
for (auto const& [name, entry] : tenantBatch) {
if (filters.empty() || std::count(filters.begin(), filters.end(), entry.tenantState)) {
if (std::count(filters.begin(), filters.end(), entry.tenantState)) {
++count;
if (count > offset) {
results.push_back(std::make_pair(name, entry));

View File

@ -71,6 +71,7 @@ struct NetworkOptions {
std::string traceClockSource;
std::string traceFileIdentifier;
std::string tracePartialFileSuffix;
bool traceInitializeOnSetup;
Optional<bool> logClientInfo;
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions;
bool runLoopProfilingEnabled;
@ -247,6 +248,7 @@ public:
Future<Void> ready() const { return success(idFuture); }
int64_t id() const;
Future<int64_t> getIdFuture() const;
KeyRef prefix() const;
std::string description() const;

View File

@ -21,15 +21,53 @@
#pragma once
#include <map>
#include <unordered_map>
#include <functional>
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "flow/Net2Packet.h"
#include "fdbclient/Knobs.h"
#include "flow/IRateControl.h"
#include "fdbrpc/HTTP.h"
#include "fdbrpc/Stats.h"
#include "fdbclient/JSONDoc.h"
#include "flow/IConnection.h"
#include <boost/functional/hash.hpp>
// unique key that identifies interchangeable connections for the same settings and destination
// FIXME: can we define std::hash as a struct member of a S3BlobStoreEndpoint?
struct BlobStoreConnectionPoolKey {
std::string host;
std::string service;
std::string region;
bool isTLS;
BlobStoreConnectionPoolKey(const std::string& host,
const std::string& service,
const std::string& region,
bool isTLS)
: host(host), service(service), region(region), isTLS(isTLS) {}
bool operator==(const BlobStoreConnectionPoolKey& other) const {
return isTLS == other.isTLS && host == other.host && service == other.service && region == other.region;
}
};
namespace std {
template <>
struct hash<BlobStoreConnectionPoolKey> {
std::size_t operator()(const BlobStoreConnectionPoolKey& key) const {
std::size_t seed = 0;
boost::hash_combine(seed, std::hash<std::string>{}(key.host));
boost::hash_combine(seed, std::hash<std::string>{}(key.service));
boost::hash_combine(seed, std::hash<std::string>{}(key.region));
boost::hash_combine(seed, std::hash<bool>{}(key.isTLS));
return seed;
}
};
} // namespace std
// Representation of all the things you need to connect to a blob store instance with some credentials.
// Reference counted because a very large number of them could be needed.
class S3BlobStoreEndpoint : public ReferenceCounted<S3BlobStoreEndpoint> {
@ -47,6 +85,54 @@ public:
static Stats s_stats;
struct BlobStats {
UID id;
CounterCollection cc;
Counter requestsSuccessful;
Counter requestsFailed;
Counter newConnections;
Counter expiredConnections;
Counter reusedConnections;
Counter fastRetries;
LatencySample requestLatency;
// init not in static codepath, to avoid initialization race issues and so no blob connections means no
// unnecessary blob stats traces
BlobStats()
: id(deterministicRandom()->randomUniqueID()), cc("BlobStoreStats", id.toString()),
requestsSuccessful("RequestsSuccessful", cc), requestsFailed("RequestsFailed", cc),
newConnections("NewConnections", cc), expiredConnections("ExpiredConnections", cc),
reusedConnections("ReusedConnections", cc), fastRetries("FastRetries", cc),
requestLatency("BlobStoreRequestLatency",
id,
CLIENT_KNOBS->BLOBSTORE_LATENCY_LOGGING_INTERVAL,
CLIENT_KNOBS->BLOBSTORE_LATENCY_LOGGING_ACCURACY) {}
};
// null when initialized, so no blob stats until a blob connection is used
static std::unique_ptr<BlobStats> blobStats;
static Future<Void> statsLogger;
void maybeStartStatsLogger() {
if (!blobStats && CLIENT_KNOBS->BLOBSTORE_ENABLE_LOGGING) {
blobStats = std::make_unique<BlobStats>();
specialCounter(
blobStats->cc, "GlobalConnectionPoolCount", [this]() { return this->globalConnectionPool.size(); });
specialCounter(blobStats->cc, "GlobalConnectionPoolSize", [this]() {
// FIXME: could track this explicitly via an int variable with extra logic, but this should be small and
// infrequent
int totalConnections = 0;
for (auto& it : this->globalConnectionPool) {
totalConnections += it.second->pool.size();
}
return totalConnections;
});
statsLogger = blobStats->cc.traceCounters(
"BlobStoreMetrics", blobStats->id, CLIENT_KNOBS->BLOBSTORE_STATS_LOGGING_INTERVAL, "BlobStoreMetrics");
}
}
struct Credentials {
std::string key;
std::string secret;
@ -60,7 +146,7 @@ public:
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
enable_read_cache, read_block_size, read_ahead_blocks, read_cache_blocks_per_file,
max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth;
max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth, global_connection_pool;
bool set(StringRef name, int value);
std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() {
@ -95,11 +181,27 @@ public:
"max_recv_bytes_per_second (or rbps) Max receive bytes per second for all requests combined (NOT YET "
"USED).",
"sdk_auth (or sa) Use AWS SDK to resolve credentials. Only valid if "
"BUILD_AWS_BACKUP is enabled."
"BUILD_AWS_BACKUP is enabled.",
"global_connection_pool (or gcp) Enable shared connection pool between all blobstore instances."
};
}
bool isTLS() const { return secure_connection == 1; }
};
struct ReusableConnection {
Reference<IConnection> conn;
double expirationTime;
};
// basically a reference-counted queue, with the option to add other fields
struct ConnectionPoolData : NonCopyable, ReferenceCounted<ConnectionPoolData> {
std::queue<ReusableConnection> pool;
};
// global connection pool for multiple blobstore endpoints with same connection settings and request destination
static std::unordered_map<BlobStoreConnectionPoolKey, Reference<ConnectionPoolData>> globalConnectionPool;
S3BlobStoreEndpoint(std::string const& host,
std::string const& service,
std::string region,
@ -123,15 +225,34 @@ public:
if (host.empty() || (proxyHost.present() != proxyPort.present()))
throw connection_string_invalid();
// set connection pool instance
if (useProxy || !knobs.global_connection_pool) {
// don't use global connection pool if there's a proxy, as it complicates the logic
// FIXME: handle proxies?
connectionPool = makeReference<ConnectionPoolData>();
} else {
BlobStoreConnectionPoolKey key(host, service, region, knobs.isTLS());
auto it = globalConnectionPool.find(key);
if (it != globalConnectionPool.end()) {
connectionPool = it->second;
} else {
connectionPool = makeReference<ConnectionPoolData>();
globalConnectionPool.insert({ key, connectionPool });
}
}
ASSERT(connectionPool.isValid());
maybeStartStatsLogger();
}
static std::string getURLFormat(bool withResource = false) {
const char* resource = "";
if (withResource)
resource = "<name>";
return format(
"blobstore://<api_key>:<secret>:<security_token>@<host>[:<port>]/%s[?<param>=<value>[&<param>=<value>]...]",
resource);
return format("blobstore://<api_key>:<secret>:<security_token>@<host>[:<port>]/"
"%s[?<param>=<value>[&<param>=<value>]...]",
resource);
}
typedef std::map<std::string, std::string> ParametersT;
@ -149,11 +270,9 @@ public:
// parameters in addition to the passed params string
std::string getResourceURL(std::string resource, std::string params) const;
struct ReusableConnection {
Reference<IConnection> conn;
double expirationTime;
};
std::queue<ReusableConnection> connectionPool;
// FIXME: add periodic connection reaper to pool
// local connection pool for this blobstore
Reference<ConnectionPoolData> connectionPool;
Future<ReusableConnection> connect(bool* reusingConn);
void returnConnection(ReusableConnection& conn);

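A hedged sketch of what the shared pool buys: two endpoints constructed with identical host, service, region, and TLS settings hash to the same BlobStoreConnectionPoolKey, so they resolve to the same ConnectionPoolData and can reuse each other's idle connections.

BlobStoreConnectionPoolKey a("s3.us-west-2.amazonaws.com", "443", "us-west-2", true);
BlobStoreConnectionPoolKey b("s3.us-west-2.amazonaws.com", "443", "us-west-2", true);
// Same key and same hash, so both endpoints share one pooled connection queue.
ASSERT(a == b);
ASSERT(std::hash<BlobStoreConnectionPoolKey>{}(a) == std::hash<BlobStoreConnectionPoolKey>{}(b));
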
View File

@ -791,6 +791,7 @@ public:
std::string STORAGESERVER_READ_PRIORITIES;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READTYPE_PRIORITY_MAP;
int SPLIT_METRICS_MAX_ROWS;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -937,7 +938,6 @@ public:
double REDWOOD_HISTOGRAM_INTERVAL;
bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache.
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_IO_PRIORITIES;
@ -990,11 +990,18 @@ public:
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;
bool BG_WRITE_MULTIPART;
bool BG_ENABLE_DYNAMIC_WRITE_AMP;
double BG_DYNAMIC_WRITE_AMP_MIN_FACTOR;
double BG_DYNAMIC_WRITE_AMP_DECREASE_FACTOR;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
int BLOB_WORKER_RDC_PARALLELISM;
// The resnapshot/delta parallelism knobs are deprecated and replaced by the budget_bytes knobs! FIXME: remove after
// next release
int64_t BLOB_WORKER_RESNAPSHOT_BUDGET_BYTES;
int64_t BLOB_WORKER_DELTA_WRITE_BUDGET_BYTES;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
@ -1017,6 +1024,7 @@ public:
int BLOB_MANIFEST_RW_ROWS;
std::string BLOB_RESTORE_MLOGS_URL;
int BLOB_MIGRATOR_ERROR_RETRIES;
std::string BLOB_RESTORE_MANIFEST_URL;
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;

View File

@ -160,9 +160,9 @@ struct DataMoveMetaData {
void setPhase(Phase phase) { this->phase = static_cast<int16_t>(phase); }
std::string toString() const {
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + describe(ranges) +
" [Phase]: " + std::to_string(static_cast<int>(phase)) +
" [Source Servers]: " + describe(src) + " [Destination Servers]: " + describe(dest);
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + ", [Range]: " + describe(ranges) +
", [Phase]: " + std::to_string(static_cast<int>(phase)) +
", [Source Servers]: " + describe(src) + ", [Destination Servers]: " + describe(dest);
return res;
}
@ -172,4 +172,4 @@ struct DataMoveMetaData {
}
};
#endif
#endif

View File

@ -740,10 +740,11 @@ struct SplitMetricsReply {
constexpr static FileIdentifier file_identifier = 11530792;
Standalone<VectorRef<KeyRef>> splits;
StorageMetrics used;
bool more = false;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, splits, used);
serializer(ar, splits, used, more);
}
};

View File

@ -220,7 +220,7 @@ private:
if (!cache->lastTenantId.present()) {
return false;
}
return cache->lastTenantId.get() > 0;
return cache->lastTenantId.get() >= 0;
}
return true;
}

View File

@ -474,18 +474,37 @@ Future<Void> configureTenantTransaction(Transaction tr,
return Void();
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit) {
template <class Transaction>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenantsTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit) {
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
auto future = TenantMetadata::tenantNameIndex().getRange(tr, begin, end, limit);
return fmap([](auto f) -> std::vector<std::pair<TenantName, int64_t>> { return f.results; }, future);
}
KeyBackedRangeResult<std::pair<TenantName, int64_t>> matchingTenants =
wait(TenantMetadata::tenantNameIndex().getRange(tr, begin, end, limit));
template <class DB>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
return runTransaction(db, [=](Reference<typename DB::TransactionT> tr) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
return listTenantsTransaction(tr, begin, end, limit);
});
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadataTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit) {
std::vector<std::pair<TenantName, int64_t>> matchingTenants = wait(listTenantsTransaction(tr, begin, end, limit));
state std::vector<Future<TenantMapEntry>> tenantEntryFutures;
for (auto const& [name, id] : matchingTenants.results) {
for (auto const& [name, id] : matchingTenants) {
tenantEntryFutures.push_back(getTenantTransaction(tr, id));
}
@ -499,24 +518,16 @@ Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransactio
return results;
}
ACTOR template <class DB>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(listTenantsTransaction(tr, begin, end, limit));
return tenants;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
template <class DB>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadata(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
return runTransaction(db, [=](Reference<typename DB::TransactionT> tr) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return listTenantMetadataTransaction(tr, begin, end, limit);
});
}
ACTOR template <class Transaction>

View File
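The non-ACTOR variants above lean on flow's fmap to transform a future's value instead of writing a separate actor. A minimal sketch of that pattern with simplified types:

// Turn a Future<std::string> into a Future<int> without an ACTOR; the callback
// runs when the input future becomes ready.
Future<int> exampleLength(Future<std::string> f) {
	return fmap([](const std::string& s) { return static_cast<int>(s.size()); }, f);
}
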

@ -70,7 +70,7 @@ private:
RangeResult* results,
GetRangeLimits limitsHint) {
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(TenantAPI::listTenantsTransaction(&ryw->getTransaction(), kr.begin, kr.end, limitsHint.rows));
wait(TenantAPI::listTenantMetadataTransaction(&ryw->getTransaction(), kr.begin, kr.end, limitsHint.rows));
for (auto tenant : tenants) {
std::string jsonString = tenant.second.toJson();
@ -202,7 +202,7 @@ private:
TenantName beginTenant,
TenantName endTenant,
std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
state std::vector<std::pair<TenantName, TenantMapEntry>> tenants = wait(
state std::vector<std::pair<TenantName, int64_t>> tenants = wait(
TenantAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY));
if (tenants.size() == CLIENT_KNOBS->TOO_MANY) {

View File

@ -351,6 +351,41 @@ void removeRoot(Reference<PTree<T>>& p, Version at) {
}
}
// changes p to point to a PTree with finger removed. p must be the root of the
// tree associated with finger.
//
// Invalidates finger.
template <class T>
void removeFinger(Reference<PTree<T>>& p, Version at, PTreeFinger<T> finger) {
ASSERT_GT(finger.size(), 0);
// Start at the end of the finger, remove, and propagate copies up along the
// search path (finger) as needed.
auto node = Reference<PTree<T>>::addRef(const_cast<PTree<T>*>(finger.back()));
auto* before = node.getPtr();
removeRoot(node, at);
for (;;) {
if (before == node.getPtr()) {
// Done propagating copies
return;
}
if (finger.size() == 1) {
// Check we passed the correct root for this finger
ASSERT(p.getPtr() == before);
// Propagate copy to root
p = node;
return;
}
finger.pop_back();
auto parent = Reference<PTree<T>>::addRef(const_cast<PTree<T>*>(finger.back()));
bool isLeftChild = parent->left(at).getPtr() == before;
bool isRightChild = parent->right(at).getPtr() == before;
ASSERT(isLeftChild || isRightChild); // Corrupt finger?
// Prepare for next iteration
before = parent.getPtr();
node = update(parent, isRightChild, node, at);
}
}
// changes p to point to a PTree with x removed
template <class T, class X>
void remove(Reference<PTree<T>>& p, Version at, const X& x) {
@ -745,9 +780,8 @@ public:
PTreeImpl::remove(roots.back().second, latestVersion, key);
}
void erase(iterator const& item) { // iterator must be in latest version!
// SOMEDAY: Optimize to use item.finger and avoid repeated search
K key = item.key();
erase(key);
ASSERT_EQ(item.at, latestVersion);
PTreeImpl::removeFinger(roots.back().second, latestVersion, item.finger);
}
void printDetail() { PTreeImpl::printTreeDetails(roots.back().second, 0); }

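A hedged sketch of the call path this change optimizes: erasing through an iterator taken at the latest version now removes via the iterator's finger instead of re-searching by key. Accessor names other than erase() are assumptions about the surrounding VersionedMap API, shown only for shape.

auto view = versionedMap.atLatest();   // assumed accessor for the latest-version view
auto it = view.find(key);              // assumed lookup returning an iterator with a finger
if (it != view.end()) {
	versionedMap.erase(it);            // reuses it.finger; no second O(log n) search
}
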
View File

@ -59,6 +59,8 @@ description is not currently required but encouraged.
description="Once provided, this string will be used to replace the port/PID in the log file names." />
<Option name="trace_share_among_client_threads" code="37"
description="Use the same base trace file name for all client threads as it did before version 7.2. The current default behavior is to use distinct trace file names for client threads by including their version and thread index." />
<Option name="trace_initialize_on_setup" code="38"
description="Initialize trace files on network setup, determine the local IP later. Otherwise tracing is initialized when opening the first database." />
<Option name="trace_partial_file_suffix" code="39"
paramType="String" paramDescription="Append this suffix to partially written log files. When a log file is complete, it is renamed to remove the suffix. No separator is added between the file and the suffix. If you want to add a file extension, you should include the separator - e.g. '.tmp' instead of 'tmp' to add the 'tmp' extension."
description="Set file suffix for partially written log files." />
@ -216,7 +218,8 @@ description is not currently required but encouraged.
<Option name="use_config_database" code="800"
description="Use configuration database." />
<Option name="test_causal_read_risky" code="900"
description="An integer between 0 and 100 (default is 0) expressing the probability that a client will verify it can't read stale data whenever it detects a recovery." />
paramType="Int" paramDescription="integer between 0 and 100 expressing the probability a client will verify it can't read stale data"
description="Enables verification of causal read risky by checking whether clients are able to read stale data when they detect a recovery, and logging an error if so." />
</Scope>
<Scope name="TransactionOption">

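A hedged sketch of enabling the new network option through the C API; the generated constant name follows the usual option-to-enum convention and is assumed here, as is API version 720.

#define FDB_API_VERSION 720
#include <foundationdb/fdb_c.h>

void exampleEnableEarlyTraceInit() {
	// Point tracing at a directory, then ask for trace files to be initialized at
	// network setup rather than when the first database is opened.
	fdb_network_set_option(FDB_NET_OPTION_TRACE_ENABLE, (const uint8_t*)"/var/log/foundationdb", 21);
	fdb_network_set_option(FDB_NET_OPTION_TRACE_INITIALIZE_ON_SETUP, nullptr, 0);
}
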
View File

@ -22,7 +22,9 @@
// Define boost::asio::io_service
#include <algorithm>
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include <boost/asio.hpp>

View File

@ -18,7 +18,9 @@
* limitations under the License.
*/
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include <boost/asio.hpp>

View File

@ -127,10 +127,23 @@ public:
int rlen = readEnd - readStart;
memcpy((uint8_t*)data + wpos, block->data + readStart, rlen);
wpos += rlen;
// unpin this block
localCache.erase(blockNum);
if (f->m_blocks.size() > f->m_cache_block_limit) {
// make an attempt to free no-longer needed blocks as we go
// FIXME: could also expire previous blocks if above limit and they're also free
auto i = f->m_blocks.find(blockNum);
ASSERT(i != f->m_blocks.end() && i->first == blockNum);
if (i->second.getFutureReferenceCount() == 1) {
// printf("evicting block %d\n", i->first);
i = f->m_blocks.erase(i);
}
}
}
ASSERT(wpos == length);
localCache.clear();
ASSERT(localCache.empty());
// If the cache is too large then go through the cache in block number order and remove any entries whose future
// has a reference count of 1, stopping once the cache is no longer too big. There is no point in removing

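The truncated comment above describes a second, size-bounded pass over the whole cache. A sketch of what that pass looks like under the same assumptions as the code above (m_blocks is an ordered map from block number to cached-read futures):

// Walk blocks in ascending block-number order, dropping entries nobody else holds,
// until the cache fits under the limit again.
for (auto i = f->m_blocks.begin(); i != f->m_blocks.end() && f->m_blocks.size() > f->m_cache_block_limit;) {
	if (i->second.getFutureReferenceCount() == 1) {
		i = f->m_blocks.erase(i);  // only this cache references the block; safe to evict
	} else {
		++i;                       // an in-flight read still holds it; keep
	}
}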