Merge branch 'main' of https://github.com/apple/foundationdb into feature/main/txnProcessor_team

This commit is contained in: commit 141bd99133

@@ -64,6 +64,7 @@ packaging/msi/obj
 simfdb
 tests/oldBinaries
 trace.*.xml
 trace.*.json
+.venv

 # Editor files

@@ -548,10 +548,14 @@ extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_rang
                                                                                  uint8_t const* end_key_name,
                                                                                  int end_key_name_length,
                                                                                  int64_t version) {
+	Optional<Version> rv;
+	if (version != latestVersion) {
+		rv = version;
+	}
 	return (FDBFuture*)(DB(db)
 	                        ->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length),
 	                                                      StringRef(end_key_name, end_key_name_length)),
-	                                          version)
+	                                          rv)
 	                        .extractPtr());
 }
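The exported C signature keeps a plain int64_t; only the internal plumbing changes. A hypothetical caller-side sketch (the -2 literal is the client's latestVersion sentinel, which the new code maps to an empty Optional<Version>, meaning "check at the latest version"):

    // Hypothetical usage sketch; error handling elided.
    // -2 == latestVersion, so rv stays empty and the latest version is used.
    FDBFuture* f = fdb_database_verify_blob_range(
        db, (const uint8_t*)"a", 1, (const uint8_t*)"z", 1, /*version=*/-2);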
@@ -36,7 +36,16 @@ public:

 private:
 	// FIXME: use other new blob granule apis!
-	enum OpType { OP_INSERT, OP_CLEAR, OP_CLEAR_RANGE, OP_READ, OP_GET_RANGES, OP_SUMMARIZE, OP_LAST = OP_SUMMARIZE };
+	enum OpType {
+		OP_INSERT,
+		OP_CLEAR,
+		OP_CLEAR_RANGE,
+		OP_READ,
+		OP_GET_GRANULES,
+		OP_SUMMARIZE,
+		OP_GET_BLOB_RANGES,
+		OP_LAST = OP_GET_BLOB_RANGES
+	};
 	std::vector<OpType> excludedOpTypes;

 	// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet

@@ -120,7 +129,7 @@ private:
 		    getTenant(tenantId));
 	}

-	void randomGetRangesOp(TTaskFct cont, std::optional<int> tenantId) {
+	void randomGetGranulesOp(TTaskFct cont, std::optional<int> tenantId) {
 		fdb::Key begin = randomKeyName();
 		fdb::Key end = randomKeyName();
 		if (begin > end) {

@@ -140,36 +149,7 @@ private:
 		    true);
 		    },
 		    [this, begin, end, results, cont]() {
-			    if (seenReadSuccess) {
-				    ASSERT(results->size() > 0);
-				    ASSERT(results->front().beginKey <= begin);
-				    ASSERT(results->back().endKey >= end);
-			    }
-
-			    for (int i = 0; i < results->size(); i++) {
-				    // no empty or inverted ranges
-				    if ((*results)[i].beginKey >= (*results)[i].endKey) {
-					    error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
-					                      fdb::toCharsRef((*results)[i].beginKey),
-					                      fdb::toCharsRef((*results)[i].endKey),
-					                      fdb::toCharsRef(begin),
-					                      fdb::toCharsRef(end)));
-				    }
-				    ASSERT((*results)[i].beginKey < (*results)[i].endKey);
-			    }
-
-			    for (int i = 1; i < results->size(); i++) {
-				    // ranges contain entire requested key range
-				    if ((*results)[i].beginKey != (*results)[i - 1].endKey) {
-					    error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
-					                      fdb::toCharsRef((*results)[i].beginKey),
-					                      fdb::toCharsRef((*results)[i].endKey),
-					                      fdb::toCharsRef(begin),
-					                      fdb::toCharsRef(end)));
-				    }
-				    ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey);
-			    }
-
+			    this->validateRanges(results, begin, end, seenReadSuccess);
 			    schedule(cont);
 		    },
 		    getTenant(tenantId));

@@ -218,6 +198,62 @@ private:
 		    getTenant(tenantId));
 	}

+	void validateRanges(std::shared_ptr<std::vector<fdb::KeyRange>> results,
+	                    fdb::Key begin,
+	                    fdb::Key end,
+	                    bool shouldBeRanges) {
+		if (shouldBeRanges) {
+			ASSERT(results->size() > 0);
+			ASSERT(results->front().beginKey <= begin);
+			ASSERT(results->back().endKey >= end);
+		}
+		for (int i = 0; i < results->size(); i++) {
+			// no empty or inverted ranges
+			if ((*results)[i].beginKey >= (*results)[i].endKey) {
+				error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
+				                  fdb::toCharsRef((*results)[i].beginKey),
+				                  fdb::toCharsRef((*results)[i].endKey),
+				                  fdb::toCharsRef(begin),
+				                  fdb::toCharsRef(end)));
+			}
+			ASSERT((*results)[i].beginKey < (*results)[i].endKey);
+		}
+
+		for (int i = 1; i < results->size(); i++) {
+			// ranges contain entire requested key range
+			if ((*results)[i].beginKey != (*results)[i - 1].endKey) {
+				error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
+				                  fdb::toCharsRef((*results)[i].beginKey),
+				                  fdb::toCharsRef((*results)[i].endKey),
+				                  fdb::toCharsRef(begin),
+				                  fdb::toCharsRef(end)));
+			}
+			ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey);
+		}
+	}
+
+	void randomGetBlobRangesOp(TTaskFct cont) {
+		fdb::Key begin = randomKeyName();
+		fdb::Key end = randomKeyName();
+		auto results = std::make_shared<std::vector<fdb::KeyRange>>();
+		if (begin > end) {
+			std::swap(begin, end);
+		}
+		execOperation(
+		    [begin, end, results](auto ctx) {
+			    fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
+			    ctx->continueAfter(f, [ctx, f, results]() {
+				    *results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
+				    ctx->done();
+			    });
+		    },
+		    [this, begin, end, results, cont]() {
+			    this->validateRanges(results, begin, end, seenReadSuccess);
+			    schedule(cont);
+		    },
+		    /* failOnError = */ false);
+	}
+
 	void randomOperation(TTaskFct cont) {
 		std::optional<int> tenantId = randomTenant();

@@ -239,12 +275,15 @@ private:
 		case OP_READ:
 			randomReadOp(cont, tenantId);
 			break;
-		case OP_GET_RANGES:
-			randomGetRangesOp(cont, tenantId);
+		case OP_GET_GRANULES:
+			randomGetGranulesOp(cont, tenantId);
 			break;
 		case OP_SUMMARIZE:
 			randomSummarizeOp(cont, tenantId);
 			break;
+		case OP_GET_BLOB_RANGES:
+			randomGetBlobRangesOp(cont);
+			break;
 		}
 	}
 };
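The invariant validateRanges enforces is that the returned granule ranges are non-empty, sorted, and stitch together with no gaps or overlaps, covering at least [begin, end). A minimal standalone sketch of the contiguity check, using plain std::string keys (hypothetical helper, not part of the diff):

    #include <cassert>
    #include <string>
    #include <vector>

    struct KeyRange {
        std::string beginKey;
        std::string endKey;
    };

    // Returns true iff all ranges are non-empty and each range starts
    // exactly where the previous one ends (no gaps, no overlaps).
    bool isContiguousCover(const std::vector<KeyRange>& rs) {
        for (size_t i = 0; i < rs.size(); i++) {
            if (rs[i].beginKey >= rs[i].endKey) return false; // empty/inverted
        }
        for (size_t i = 1; i < rs.size(); i++) {
            if (rs[i].beginKey != rs[i - 1].endKey) return false; // gap or overlap
        }
        return true;
    }

    int main() {
        assert(isContiguousCover({ { "a", "m" }, { "m", "z" } }));
        assert(!isContiguousCover({ { "a", "m" }, { "n", "z" } })); // gap between "m" and "n"
    }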
@@ -204,23 +204,23 @@ private:

 	void getRangeLoop(std::shared_ptr<ITransactionContext> ctx,
 	                  fdb::KeySelector begin,
-	                  fdb::KeySelector end,
+	                  fdb::Key endKey,
 	                  std::shared_ptr<std::vector<fdb::KeyValue>> results) {
 		auto f = ctx->tx().getRange(begin,
-		                            end,
+		                            fdb::key_select::firstGreaterOrEqual(endKey),
 		                            0 /*limit*/,
 		                            0 /*target_bytes*/,
 		                            FDB_STREAMING_MODE_WANT_ALL,
 		                            0 /*iteration*/,
 		                            false /*snapshot*/,
 		                            false /*reverse*/);
-		ctx->continueAfter(f, [this, ctx, f, end, results]() {
+		ctx->continueAfter(f, [this, ctx, f, endKey, results]() {
 			auto out = copyKeyValueArray(f.get());
 			results->insert(results->end(), out.first.begin(), out.first.end());
 			const bool more = out.second;
 			if (more) {
 				// Fetch the remaining results.
-				getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), end, results);
+				getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), endKey, results);
 			} else {
 				ctx->done();
 			}

@@ -237,10 +237,7 @@ private:
 			// Clear the results vector, in case the transaction is retried.
 			results->clear();

-			getRangeLoop(ctx,
-			             fdb::key_select::firstGreaterOrEqual(begin),
-			             fdb::key_select::firstGreaterOrEqual(end),
-			             results);
+			getRangeLoop(ctx, fdb::key_select::firstGreaterOrEqual(begin), end, results);
 		},
 		[this, begin, end, results, cont, tenantId]() {
 			auto expected = stores[tenantId].getRange(begin, end, results->size() + 10, false);
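A plausible rationale for threading an owning fdb::Key through the loop instead of a KeySelector: the continuation lambdas here outlive the current stack frame, and a key selector (sketched below, hypothetically) only points at key bytes it does not own, so capturing one across an asynchronous hop risks a dangling pointer, while fdb::Key owns its bytes and copies safely into the lambda:

    // Hypothetical sketch of what a non-owning selector looks like.
    struct KeySelectorSketch {
        const uint8_t* key; // borrowed pointer: dangles if the owning Key dies
        int keyLength;
        bool orEqual;
        int offset;
    };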
@@ -40,11 +40,6 @@ namespace FdbApiTester {
 constexpr int LONG_WAIT_TIME_US = 2000000;
 constexpr int LARGE_NUMBER_OF_RETRIES = 10;

-void TransactionActorBase::complete(fdb::Error err) {
-	error = err;
-	context = {};
-}
-
 void ITransactionContext::continueAfterAll(std::vector<fdb::Future> futures, TTaskFct cont) {
 	auto counter = std::make_shared<std::atomic<int>>(futures.size());
 	auto errorCode = std::make_shared<std::atomic<fdb::Error>>(fdb::Error::success());

@@ -76,28 +71,31 @@ void ITransactionContext::continueAfterAll(std::vector<fdb::Future> futures, TTa
 class TransactionContextBase : public ITransactionContext {
 public:
 	TransactionContextBase(ITransactionExecutor* executor,
-	                       std::shared_ptr<ITransactionActor> txActor,
-	                       TTaskFct cont,
+	                       TOpStartFct startFct,
+	                       TOpContFct cont,
 	                       IScheduler* scheduler,
 	                       int retryLimit,
 	                       std::string bgBasePath,
-	                       std::optional<fdb::BytesRef> tenantName)
-	  : executor(executor), txActor(txActor), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit),
-	    txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath), tenantName(tenantName) {
+	                       std::optional<fdb::BytesRef> tenantName,
+	                       bool transactional)
+	  : executor(executor), startFct(startFct), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit),
+	    txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath), tenantName(tenantName),
+	    transactional(transactional) {
 		databaseCreateErrorInjected = executor->getOptions().injectDatabaseCreateErrors &&
 		                              Random::get().randomBool(executor->getOptions().databaseCreateErrorRatio);
-		fdb::Database db;
 		if (databaseCreateErrorInjected) {
-			db = fdb::Database(executor->getClusterFileForErrorInjection());
+			fdbDb = fdb::Database(executor->getClusterFileForErrorInjection());
 		} else {
-			db = executor->selectDatabase();
+			fdbDb = executor->selectDatabase();
 		}

-		if (tenantName) {
-			fdb::Tenant tenant = db.openTenant(*tenantName);
-			fdbTx = tenant.createTransaction();
-		} else {
-			fdbTx = db.createTransaction();
+		if (transactional) {
+			if (tenantName) {
+				fdb::Tenant tenant = fdbDb.openTenant(*tenantName);
+				fdbTx = tenant.createTransaction();
+			} else {
+				fdbTx = fdbDb.createTransaction();
+			}
 		}
 	}

@@ -107,6 +105,8 @@ public:
 	// IN_PROGRESS -> (ON_ERROR -> IN_PROGRESS)* [-> ON_ERROR] -> DONE
 	enum class TxState { IN_PROGRESS, ON_ERROR, DONE };

+	fdb::Database db() override { return fdbDb.atomic_load(); }
+
 	fdb::Transaction tx() override { return fdbTx.atomic_load(); }

 	// Set a continuation to be executed when a future gets ready

@@ -116,6 +116,7 @@ public:

 	// Complete the transaction with a commit
 	void commit() override {
+		ASSERT(transactional);
 		std::unique_lock<std::mutex> lock(mutex);
 		if (txState != TxState::IN_PROGRESS) {
 			return;

@@ -146,14 +147,14 @@ public:
 			     fmt::join(retriedErrorCodes(), ", "));
 		}

-		// cancel transaction so that any pending operations on it
-		// fail gracefully
-		fdbTx.cancel();
-
-		txActor->complete(fdb::Error::success());
-		cleanUp();
+		if (transactional) {
+			// cancel transaction so that any pending operations on it
+			// fail gracefully
+			fdbTx.cancel();
+			cleanUp();
+		}
 		ASSERT(txState == TxState::DONE);
-		contAfterDone();
+		contAfterDone(fdb::Error::success());
 	}

 	std::string getBGBasePath() override { return bgBasePath; }

@@ -179,20 +180,26 @@ public:
 		if (databaseCreateErrorInjected && canBeInjectedDatabaseCreateError(err.code())) {
 			// Failed to create a database because of failure injection
 			// Restart by recreating the transaction in a valid database
-			scheduler->schedule([this]() {
-				fdb::Database db = executor->selectDatabase();
-				if (tenantName) {
-					fdb::Tenant tenant = db.openTenant(*tenantName);
-					fdbTx.atomic_store(tenant.createTransaction());
-				} else {
-					fdbTx.atomic_store(db.createTransaction());
+			auto thisRef = std::static_pointer_cast<TransactionContextBase>(shared_from_this());
+			scheduler->schedule([thisRef]() {
+				fdb::Database db = thisRef->executor->selectDatabase();
+				thisRef->fdbDb.atomic_store(db);
+				if (thisRef->transactional) {
+					if (thisRef->tenantName) {
+						fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
+						thisRef->fdbTx.atomic_store(tenant.createTransaction());
+					} else {
+						thisRef->fdbTx.atomic_store(db.createTransaction());
+					}
 				}
-				restartTransaction();
+				thisRef->restartTransaction();
 			});
-		} else {
+		} else if (transactional) {
 			onErrorArg = err;
 			onErrorFuture = tx().onError(err);
 			handleOnErrorFuture();
+		} else {
+			transactionFailed(err);
 		}
 	}

@@ -207,7 +214,6 @@ protected:
 	void cleanUp() {
 		ASSERT(txState == TxState::DONE);
 		ASSERT(!onErrorFuture);
-		txActor = {};
 		cancelPendingFutures();
 	}

@@ -230,9 +236,8 @@ protected:
 		// No need for lock from here on, because only one thread
 		// can enter DONE state and handle it

-		txActor->complete(err);
 		cleanUp();
-		contAfterDone();
+		contAfterDone(err);
 	}

 	// Handle result of a transaction onError call

@@ -254,7 +259,7 @@ protected:
 		txState = TxState::IN_PROGRESS;
 		commitCalled = false;
 		lock.unlock();
-		txActor->start();
+		startFct(shared_from_this());
 	}

 	// Checks if a transaction can be retried. Fails the transaction if the check fails

@@ -286,13 +291,17 @@ protected:
 	// Set in constructor, stays immutable
 	ITransactionExecutor* const executor;

+	// FDB database
+	// Provides a thread safe interface by itself (no need for mutex)
+	fdb::Database fdbDb;
+
 	// FDB transaction
 	// Provides a thread safe interface by itself (no need for mutex)
 	fdb::Transaction fdbTx;

-	// Actor implementing the transaction worklflow
+	// The function implementing the starting point of the transaction
 	// Set in constructor and reset on cleanup (no need for mutex)
-	std::shared_ptr<ITransactionActor> txActor;
+	TOpStartFct startFct;

 	// Mutex protecting access to shared mutable state
 	// Only the state that is accessible under the IN_PROGRESS state

@@ -301,7 +310,7 @@ protected:

 	// Continuation to be called after completion of the transaction
 	// Set in constructor, stays immutable
-	const TTaskFct contAfterDone;
+	const TOpContFct contAfterDone;

 	// Reference to the scheduler
 	// Set in constructor, stays immutable

@@ -346,6 +355,9 @@ protected:

 	// The tenant that we will run this transaction in
 	const std::optional<fdb::BytesRef> tenantName;

+	// Specifies whether the operation is transactional
+	const bool transactional;
+
 };

 /**

@@ -354,13 +366,15 @@ protected:
 class BlockingTransactionContext : public TransactionContextBase {
 public:
 	BlockingTransactionContext(ITransactionExecutor* executor,
-	                           std::shared_ptr<ITransactionActor> txActor,
-	                           TTaskFct cont,
+	                           TOpStartFct startFct,
+	                           TOpContFct cont,
 	                           IScheduler* scheduler,
 	                           int retryLimit,
 	                           std::string bgBasePath,
-	                           std::optional<fdb::BytesRef> tenantName)
-	  : TransactionContextBase(executor, txActor, cont, scheduler, retryLimit, bgBasePath, tenantName) {}
+	                           std::optional<fdb::BytesRef> tenantName,
+	                           bool transactional)
+	  : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) {
+	}

 protected:
 	void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override {

@@ -430,13 +444,15 @@ protected:
 class AsyncTransactionContext : public TransactionContextBase {
 public:
 	AsyncTransactionContext(ITransactionExecutor* executor,
-	                        std::shared_ptr<ITransactionActor> txActor,
-	                        TTaskFct cont,
+	                        TOpStartFct startFct,
+	                        TOpContFct cont,
 	                        IScheduler* scheduler,
 	                        int retryLimit,
 	                        std::string bgBasePath,
-	                        std::optional<fdb::BytesRef> tenantName)
-	  : TransactionContextBase(executor, txActor, cont, scheduler, retryLimit, bgBasePath, tenantName) {}
+	                        std::optional<fdb::BytesRef> tenantName,
+	                        bool transactional)
+	  : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) {
+	}

 protected:
 	void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override {

@@ -648,23 +664,22 @@ public:

 	const TransactionExecutorOptions& getOptions() override { return options; }

-	void execute(std::shared_ptr<ITransactionActor> txActor,
-	             TTaskFct cont,
-	             std::optional<fdb::BytesRef> tenantName = {}) override {
+	void execute(TOpStartFct startFct,
+	             TOpContFct cont,
+	             std::optional<fdb::BytesRef> tenantName,
+	             bool transactional) override {
 		try {
 			std::shared_ptr<ITransactionContext> ctx;
 			if (options.blockOnFutures) {
 				ctx = std::make_shared<BlockingTransactionContext>(
-				    this, txActor, cont, scheduler, options.transactionRetryLimit, bgBasePath, tenantName);
+				    this, startFct, cont, scheduler, options.transactionRetryLimit, bgBasePath, tenantName, true);
 			} else {
 				ctx = std::make_shared<AsyncTransactionContext>(
-				    this, txActor, cont, scheduler, options.transactionRetryLimit, bgBasePath, tenantName);
+				    this, startFct, cont, scheduler, options.transactionRetryLimit, bgBasePath, tenantName, true);
 			}
-			txActor->init(ctx);
-			txActor->start();
+			startFct(ctx);
 		} catch (...) {
-			txActor->complete(fdb::Error(error_code_operation_failed));
-			cont();
+			cont(fdb::Error(error_code_operation_failed));
 		}
 	}
@@ -38,6 +38,9 @@ class ITransactionContext : public std::enable_shared_from_this<ITransactionCont
 public:
 	virtual ~ITransactionContext() {}

+	// Current FDB database
+	virtual fdb::Database db() = 0;
+
 	// Current FDB transaction
 	virtual fdb::Transaction tx() = 0;

@@ -62,57 +65,11 @@ public:
 	virtual void continueAfterAll(std::vector<fdb::Future> futures, TTaskFct cont);
 };

-/**
- * Interface of an actor object implementing a concrete transaction
- */
-class ITransactionActor {
-public:
-	virtual ~ITransactionActor() {}
+// Type of the lambda functions implementing a database operation
+using TOpStartFct = std::function<void(std::shared_ptr<ITransactionContext>)>;

-	// Initialize with the given transaction context
-	virtual void init(std::shared_ptr<ITransactionContext> ctx) = 0;
-
-	// Start execution of the transaction, also called on retries
-	virtual void start() = 0;
-
-	// Transaction completion result (error_code_success in case of success)
-	virtual fdb::Error getError() = 0;
-
-	// Notification about the completion of the transaction
-	virtual void complete(fdb::Error err) = 0;
-};
-
-/**
- * A helper base class for transaction actors
- */
-class TransactionActorBase : public ITransactionActor {
-public:
-	void init(std::shared_ptr<ITransactionContext> ctx) override { context = ctx; }
-	fdb::Error getError() override { return error; }
-	void complete(fdb::Error err) override;
-
-protected:
-	std::shared_ptr<ITransactionContext> ctx() { return context; }
-
-private:
-	std::shared_ptr<ITransactionContext> context;
-	fdb::Error error = fdb::Error::success();
-};
-
-// Type of the lambda functions implementing a transaction
-using TTxStartFct = std::function<void(std::shared_ptr<ITransactionContext>)>;
-
-/**
- * A wrapper class for transactions implemented by lambda functions
- */
-class TransactionFct : public TransactionActorBase {
-public:
-	TransactionFct(TTxStartFct startFct) : startFct(startFct) {}
-	void start() override { startFct(this->ctx()); }
-
-private:
-	TTxStartFct startFct;
-};
+// Type of the lambda functions implementing a database operation
+using TOpContFct = std::function<void(fdb::Error)>;

 /**
  * Configuration of transaction execution mode

@@ -156,9 +113,10 @@ class ITransactionExecutor {
 public:
 	virtual ~ITransactionExecutor() {}
 	virtual void init(IScheduler* sched, const char* clusterFile, const std::string& bgBasePath) = 0;
-	virtual void execute(std::shared_ptr<ITransactionActor> tx,
-	                     TTaskFct cont,
-	                     std::optional<fdb::BytesRef> tenantName = {}) = 0;
+	virtual void execute(TOpStartFct start,
+	                     TOpContFct cont,
+	                     std::optional<fdb::BytesRef> tenantName,
+	                     bool transactional) = 0;
 	virtual fdb::Database selectDatabase() = 0;
 	virtual std::string getClusterFileForErrorInjection() = 0;
 	virtual const TransactionExecutorOptions& getOptions() = 0;
@@ -106,10 +106,23 @@ void WorkloadBase::schedule(TTaskFct task) {
 	});
 }

-void WorkloadBase::execTransaction(std::shared_ptr<ITransactionActor> tx,
+void WorkloadBase::execTransaction(TOpStartFct startFct,
                                    TTaskFct cont,
                                    std::optional<fdb::BytesRef> tenant,
                                    bool failOnError) {
+	doExecute(startFct, cont, tenant, failOnError, true);
+}
+
+// Execute a non-transactional database operation within the workload
+void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) {
+	doExecute(startFct, cont, {}, failOnError, false);
+}
+
+void WorkloadBase::doExecute(TOpStartFct startFct,
+                             TTaskFct cont,
+                             std::optional<fdb::BytesRef> tenant,
+                             bool failOnError,
+                             bool transactional) {
 	ASSERT(inProgress);
 	if (failed) {
 		return;

@@ -117,10 +130,9 @@ void WorkloadBase::execTransaction(std::shared_ptr<ITransactionActor> tx,
 	tasksScheduled++;
 	numTxStarted++;
 	manager->txExecutor->execute(
-	    tx,
-	    [this, tx, cont, failOnError]() {
+	    startFct,
+	    [this, startFct, cont, failOnError](fdb::Error err) {
 		    numTxCompleted++;
-		    fdb::Error err = tx->getError();
 		    if (err.code() == error_code_success) {
 			    cont();
 		    } else {

@@ -135,7 +147,8 @@ void WorkloadBase::execTransaction(std::shared_ptr<ITransactionActor> tx,
 		    }
 		    scheduledTaskDone();
 	    },
-	    tenant);
+	    tenant,
+	    transactional);
 }

 void WorkloadBase::info(const std::string& msg) {
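After this refactoring, a workload passes the operation body directly as a lambda instead of wrapping it in an ITransactionActor. A hypothetical minimal caller (inside a workload method with a TTaskFct cont in scope; the toBytesRef-style helpers from the tester utilities are assumed):

    // Hypothetical workload snippet; helper names are assumptions.
    execTransaction(
        [](auto ctx) {
            ctx->tx().set(fdb::toBytesRef("sample-key"), fdb::toBytesRef("sample-value"));
            ctx->commit(); // completes the transaction; retries are driven by the context
        },
        [this, cont]() { schedule(cont); }); // continuation after successful completion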
@@ -119,18 +119,13 @@ protected:
 	void schedule(TTaskFct task);

 	// Execute a transaction within the workload
-	void execTransaction(std::shared_ptr<ITransactionActor> tx,
+	void execTransaction(TOpStartFct startFct,
 	                     TTaskFct cont,
 	                     std::optional<fdb::BytesRef> tenant = std::optional<fdb::BytesRef>(),
 	                     bool failOnError = true);

-	// Execute a transaction within the workload, a convenience method for a tranasaction defined by a lambda function
-	void execTransaction(TTxStartFct start,
-	                     TTaskFct cont,
-	                     std::optional<fdb::BytesRef> tenant = std::optional<fdb::BytesRef>(),
-	                     bool failOnError = true) {
-		execTransaction(std::make_shared<TransactionFct>(start), cont, tenant, failOnError);
-	}
+	// Execute a non-transactional database operation within the workload
+	void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true);

 	// Log an error message, increase error counter
 	void error(const std::string& msg);

@@ -144,6 +139,12 @@ protected:
 private:
 	WorkloadManager* manager;

+	void doExecute(TOpStartFct startFct,
+	               TTaskFct cont,
+	               std::optional<fdb::BytesRef> tenant,
+	               bool failOnError,
+	               bool transactional);
+
 	// Decrease scheduled task counter, notify the workload manager
 	// that the task is done if no more tasks are scheduled
 	void scheduledTaskDone();
@@ -349,6 +349,7 @@ public:
 class Future {
 protected:
 	friend class Transaction;
+	friend class Database;
 	friend std::hash<Future>;
 	std::shared_ptr<native::FDBFuture> f;

@@ -718,6 +719,14 @@ public:
 	}
 	Database() noexcept : db(nullptr) {}

+	void atomic_store(Database other) { std::atomic_store(&db, other.db); }
+
+	Database atomic_load() {
+		Database retVal;
+		retVal.db = std::atomic_load(&db);
+		return retVal;
+	}
+
 	Error setOptionNothrow(FDBDatabaseOption option, int64_t value) noexcept {
 		return Error(native::fdb_database_set_option(
 		    db.get(), option, reinterpret_cast<const uint8_t*>(&value), static_cast<int>(sizeof(value))));
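These wrappers use the pre-C++20 free-function atomics for std::shared_ptr (declared in <memory>, deprecated in C++20 in favor of std::atomic<std::shared_ptr<T>>). In isolation the pattern is:

    #include <memory>

    std::shared_ptr<int> handle = std::make_shared<int>(1);

    // Reader thread: takes a consistent snapshot of the pointer.
    std::shared_ptr<int> snapshot = std::atomic_load(&handle);

    // Writer thread: publishes a replacement object atomically.
    std::atomic_store(&handle, std::make_shared<int>(2));

This is what lets the error-injection path in the transaction context swap in a fresh database while other threads keep using their own snapshot.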
@@ -763,6 +772,13 @@ public:
 			throwError("Failed to create transaction: ", err);
 		return Transaction(tx_native);
 	}

+	TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) {
+		if (!db)
+			throw std::runtime_error("list_blobbified_ranges from null database");
+		return native::fdb_database_list_blobbified_ranges(
+		    db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
+	}
 };

 inline Error selectApiVersionNothrow(int version) {
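A hypothetical call through the new wrapper, assuming KeyRef is the byte-view alias used elsewhere in this header (randomGetBlobRangesOp in the tester hunk above shows the full asynchronous pattern):

    std::string b = "a", e = "z";
    fdb::KeyRef begin(reinterpret_cast<const uint8_t*>(b.data()), b.size());
    fdb::KeyRef end(reinterpret_cast<const uint8_t*>(e.data()), e.size());
    // Returns a typed future; results are extracted as a KeyRangeRefArray once ready.
    auto f = db.listBlobbifiedRanges(begin, end, /*rangeLimit=*/100);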
@@ -1037,7 +1037,7 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verify
 		return 0;
 	}

-	FDBFuture* f = fdb_database_list_blobbified_ranges(
+	FDBFuture* f = fdb_database_verify_blob_range(
 	    tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version);
 	jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT);
 	jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT);
@@ -161,6 +161,19 @@ public interface Database extends AutoCloseable, TransactionContext {
 	 */
 	double getMainThreadBusyness();

+	/**
+	 * Runs {@link #purgeBlobGranules(Function)} on the default executor.
+	 *
+	 * @param beginKey start of the key range
+	 * @param endKey end of the key range
+	 * @param force if true delete all data, if not keep data >= purgeVersion
+	 *
+	 * @return the key to watch for purge complete
+	 */
+	default CompletableFuture<byte[]> purgeBlobGranules(byte[] beginKey, byte[] endKey, boolean force) {
+		return purgeBlobGranules(beginKey, endKey, -2, force, getExecutor());
+	}
+
 	/**
 	 * Runs {@link #purgeBlobGranules(Function)} on the default executor.
 	 *

@@ -278,6 +291,18 @@ public interface Database extends AutoCloseable, TransactionContext {
 	 */
 	CompletableFuture<KeyRangeArrayResult> listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e);

+	/**
+	 * Runs {@link #verifyBlobRange(Function)} on the default executor.
+	 *
+	 * @param beginKey start of the key range
+	 * @param endKey end of the key range
+	 *
+	 * @return a future with the version of the last blob granule.
+	 */
+	default CompletableFuture<Long> verifyBlobRange(byte[] beginKey, byte[] endKey) {
+		return verifyBlobRange(beginKey, endKey, -2, getExecutor());
+	}
+
 	/**
 	 * Runs {@link #verifyBlobRange(Function)} on the default executor.
 	 *
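The -2 these Java overloads pass for the version appears to be the same latestVersion sentinel that fdb_database_verify_blob_range() maps to an empty Optional<Version> in the C hunk at the top of this commit, i.e. "use the latest version".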
@@ -1359,7 +1359,7 @@ else:
     except:
         # The system python on OS X can't find the library installed to /usr/local/lib if SIP is enabled
         # find_library does find the location in /usr/local/lib, so if the above fails fallback to using it
-        lib_path = ctypes.util.find_library(capi_name)
+        lib_path = ctypes.util.find_library("fdb_c")
         if lib_path is not None:
             try:
                 _capi = ctypes.CDLL(lib_path)
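Likely rationale: ctypes.util.find_library() expects a bare library name such as "fdb_c" (it adds the lib prefix and platform suffix itself), whereas capi_name at this point can hold a platform-specific file name or full path, which makes the lookup fail.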
@@ -25,6 +25,7 @@ env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstd
 env_set(TRACE_PC_GUARD_INSTRUMENTATION_LIB "" STRING "Path to a library containing an implementation for __sanitizer_cov_trace_pc_guard. See https://clang.llvm.org/docs/SanitizerCoverage.html for more info.")
 env_set(PROFILE_INSTR_GENERATE OFF BOOL "If set, build FDB as an instrumentation build to generate profiles")
 env_set(PROFILE_INSTR_USE "" STRING "If set, build FDB with profile")
+env_set(FULL_DEBUG_SYMBOLS OFF BOOL "Generate full debug symbols")

 set(USE_SANITIZER OFF)
 if(USE_ASAN OR USE_VALGRIND OR USE_MSAN OR USE_TSAN OR USE_UBSAN)

@@ -164,9 +165,20 @@ else()
   set(SANITIZER_COMPILE_OPTIONS)
   set(SANITIZER_LINK_OPTIONS)

-  # we always compile with debug symbols. CPack will strip them out
+  # we always compile with debug symbols. For release builds CPack will strip them out
   # and create a debuginfo rpm
-  add_compile_options(-ggdb -fno-omit-frame-pointer)
+  add_compile_options(-fno-omit-frame-pointer -gz)
+  add_link_options(-gz)
+  if(FDB_RELEASE OR FULL_DEBUG_SYMBOLS OR CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # Configure with FULL_DEBUG_SYMBOLS=ON to generate all symbols for debugging with gdb
+    # Also generating full debug symbols in release builds, because they are packaged
+    # separately and installed optionally
+    add_compile_options(-ggdb)
+  else()
+    # Generating minimal debug symbols by default. They are sufficient for testing purposes
+    add_compile_options(-ggdb1)
+  endif()

   if(TRACE_PC_GUARD_INSTRUMENTATION_LIB)
     add_compile_options(-fsanitize-coverage=trace-pc-guard)
     link_libraries(${TRACE_PC_GUARD_INSTRUMENTATION_LIB})
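For readers unfamiliar with the flags: -gz compresses the DWARF debug sections (shrinking object files and binaries), and -ggdb1 emits only level-1 debug information (roughly function descriptions and line tables, no locals), which is why it is described as sufficient for testing but not for full gdb sessions.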
@@ -2,10 +2,8 @@ project(awssdk-download NONE)

 # Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang
-set(AWSSDK_COMPILER_FLAGS "")
-set(AWSSDK_LINK_FLAGS "")
-if(APPLE OR CLANG OR USE_LIBCXX)
+if(APPLE OR USE_LIBCXX)
   set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++)
   set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi)
 endif()

 include(ExternalProject)

@@ -21,11 +19,11 @@ ExternalProject_Add(awssdk_project
                    -DSIMPLE_INSTALL=ON
                    -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
                    -DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
+                   -DBUILD_CURL=ON
+                   -DBUILD_ZLIB=ON

                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_COMPILER_FLAGS}
-                   -DCMAKE_CXX_FLAGS=${AWSSDK_LINK_FLAGS}
+                   -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
                    TEST_COMMAND ""
                    # the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
                    BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"

@@ -41,6 +39,8 @@ ExternalProject_Add(awssdk_project
                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
                    "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
+                   "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
+                   "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
 )

 add_library(awssdk_core STATIC IMPORTED)

@@ -96,7 +96,15 @@ add_library(awssdk_c_common STATIC IMPORTED)
 add_dependencies(awssdk_c_common awssdk_project)
 set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a")

+add_library(curl STATIC IMPORTED)
+add_dependencies(curl awssdk_project)
+set_property(TARGET curl PROPERTY IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a")
+
+add_library(zlib STATIC IMPORTED)
+add_dependencies(zlib awssdk_project)
+set_property(TARGET zlib PROPERTY IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a")
+
 # link them all together in one interface target
 add_library(awssdk_target INTERFACE)
 target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include)
-target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_sdkutils awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl)
+target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_sdkutils awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl zlib)
@@ -69,7 +69,7 @@ release = root.find(".//{http://schemas.microsoft.com/developer/msbuild/2003}Ver

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
-language = None
+language = 'en'

 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:

@@ -185,7 +185,7 @@ html_show_copyright = True
 htmlhelp_basename = 'FoundationDB'

 # Disable permalinks
-html_add_permalinks = ""
+html_permalinks = False


 # -- Options for LaTeX output --------------------------------------------------
@@ -42,7 +42,7 @@ from docutils.parsers.rst import directives, Directive

 from sphinx import addnodes
 from sphinx.roles import XRefRole
-from sphinx.locale import l_, _
+from sphinx.locale import _
 from sphinx.domains import Domain, ObjType, Index
 from sphinx.directives import ObjectDescription
 from sphinx.util.nodes import make_refnode

@@ -83,18 +83,18 @@ class RubyObject(ObjectDescription):
     }

     doc_field_types = [
-        TypedField('parameter', label=l_('Parameters'),
+        TypedField('parameter', label=_('Parameters'),
                    names=('param', 'parameter', 'arg', 'argument'),
                    typerolename='obj', typenames=('paramtype', 'type')),
-        TypedField('variable', label=l_('Variables'), rolename='obj',
+        TypedField('variable', label=_('Variables'), rolename='obj',
                    names=('var', 'ivar', 'cvar'),
                    typerolename='obj', typenames=('vartype',)),
-        GroupedField('exceptions', label=l_('Raises'), rolename='exc',
+        GroupedField('exceptions', label=_('Raises'), rolename='exc',
                      names=('raises', 'raise', 'exception', 'except'),
                      can_collapse=True),
-        Field('returnvalue', label=l_('Returns'), has_arg=False,
+        Field('returnvalue', label=_('Returns'), has_arg=False,
               names=('returns', 'return')),
-        Field('returntype', label=l_('Return type'), has_arg=False,
+        Field('returntype', label=_('Return type'), has_arg=False,
               names=('rtype',)),
     ]

@@ -493,8 +493,8 @@ class RubyModuleIndex(Index):
     """

     name = 'modindex'
-    localname = l_('Ruby Module Index')
-    shortname = l_('modules')
+    localname = _('Ruby Module Index')
+    shortname = _('modules')

     def generate(self, docnames=None):
         content = {}

@@ -561,17 +561,17 @@ class RubyDomain(Domain):
     name = 'rb'
     label = 'Ruby'
     object_types = {
-        'function': ObjType(l_('function'), 'func', 'obj'),
-        'global': ObjType(l_('global variable'), 'global', 'obj'),
-        'method': ObjType(l_('method'), 'meth', 'obj'),
-        'class': ObjType(l_('class'), 'class', 'obj'),
-        'exception': ObjType(l_('exception'), 'exc', 'obj'),
-        'classmethod': ObjType(l_('class method'), 'meth', 'obj'),
-        'attr_reader': ObjType(l_('attribute'), 'attr', 'obj'),
-        'attr_writer': ObjType(l_('attribute'), 'attr', 'obj'),
-        'attr_accessor': ObjType(l_('attribute'), 'attr', 'obj'),
-        'const': ObjType(l_('const'), 'const', 'obj'),
-        'module': ObjType(l_('module'), 'mod', 'obj'),
+        'function': ObjType(_('function'), 'func', 'obj'),
+        'global': ObjType(_('global variable'), 'global', 'obj'),
+        'method': ObjType(_('method'), 'meth', 'obj'),
+        'class': ObjType(_('class'), 'class', 'obj'),
+        'exception': ObjType(_('exception'), 'exc', 'obj'),
+        'classmethod': ObjType(_('class method'), 'meth', 'obj'),
+        'attr_reader': ObjType(_('attribute'), 'attr', 'obj'),
+        'attr_writer': ObjType(_('attribute'), 'attr', 'obj'),
+        'attr_accessor': ObjType(_('attribute'), 'attr', 'obj'),
+        'const': ObjType(_('const'), 'const', 'obj'),
+        'module': ObjType(_('module'), 'mod', 'obj'),
     }

     directives = {
@@ -1,6 +1,6 @@
 --index-url https://pypi.python.org/simple
-setuptools>=20.10.0,<=57.4.0
-sphinx==1.5.6
-sphinx-bootstrap-theme==0.4.8
-docutils==0.16
-Jinja2==3.0.3
+setuptools==65.3.0
+sphinx==5.1.1
+sphinx-bootstrap-theme==0.8.1
+docutils==0.19
+Jinja2==3.1.2
@@ -222,7 +222,7 @@ The FoundationDB client library performs most tasks on a singleton thread (which
 Future
 ======

-Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return :type:`FDBFuture*`. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which :func:`fdb_future_get_*()` function you should call).
+Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return ``FDBFuture*``. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which ``fdb_future_get_*()`` function you should call).

 To use the API in a synchronous way, you would typically do something like this for each asynchronous call::

@@ -282,7 +282,7 @@ See :ref:`developer-guide-programming-with-futures` for further (language-indepe

 .. type:: FDBCallback

-   A pointer to a function which takes :type:`FDBFuture*` and ``void*`` and returns ``void``.
+   A pointer to a function which takes ``FDBFuture*`` and ``void*`` and returns ``void``.

 .. function:: void fdb_future_release_memory(FDBFuture* future)

@@ -298,13 +298,13 @@ See :ref:`developer-guide-programming-with-futures` for further (language-indepe

 .. function:: fdb_error_t fdb_future_get_int64(FDBFuture* future, int64_t* out)

-   Extracts a 64-bit integer from an :type:`FDBFuture*` into a caller-provided variable of type ``int64_t``. |future-warning|
+   Extracts a 64-bit integer from a pointer to :type:`FDBFuture` into a caller-provided variable of type ``int64_t``. |future-warning|

   |future-get-return1| |future-get-return2|.

 .. function:: fdb_error_t fdb_future_get_key_array( FDBFuture* f, FDBKey const** out_key_array, int* out_count)

-   Extracts an array of :type:`FDBKey` from an :type:`FDBFuture*` into a caller-provided variable of type ``FDBKey*``. The size of the array will also be extracted and passed back by a caller-provided variable of type ``int`` |future-warning|
+   Extracts an array of :type:`FDBKey` from an ``FDBFuture*`` into a caller-provided variable of type ``FDBKey*``. The size of the array will also be extracted and passed back by a caller-provided variable of type ``int`` |future-warning|

   |future-get-return1| |future-get-return2|.

@@ -547,13 +547,13 @@ Applications must provide error handling and an appropriate retry loop around th

 .. function:: void fdb_transaction_set_read_version(FDBTransaction* transaction, int64_t version)

-   Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_transaction_too_old; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of :func:`fdb_transaction_get_*()` have been called on this transaction already, the result is undefined.
+   Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_transaction_too_old; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of ``fdb_transaction_get_*()`` have been called on this transaction already, the result is undefined.

 .. function:: FDBFuture* fdb_transaction_get_read_version(FDBTransaction* transaction)

   |future-return0| the transaction snapshot read version. |future-return1| call :func:`fdb_future_get_int64()` to extract the version into an int64_t that you provide, |future-return2|

-   The transaction obtains a snapshot read version automatically at the time of the first call to :func:`fdb_transaction_get_*()` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call.
+   The transaction obtains a snapshot read version automatically at the time of the first call to ``fdb_transaction_get_*()`` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call.

 .. function:: FDBFuture* fdb_transaction_get(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, fdb_bool_t snapshot)

@@ -829,7 +829,7 @@ Applications must provide error handling and an appropriate retry loop around th

  |future-returnvoid|

-   Callers will usually want to retry a transaction if the commit or a prior :func:`fdb_transaction_get_*()` returns a retryable error (see :func:`fdb_transaction_on_error()`).
+   Callers will usually want to retry a transaction if the commit or a prior ``fdb_transaction_get_*()`` returns a retryable error (see :func:`fdb_transaction_on_error()`).

  |commit-unknown-result-blurb|

@@ -878,9 +878,9 @@ Applications must provide error handling and an appropriate retry loop around th

 .. function:: FDBFuture* fdb_transaction_on_error(FDBTransaction* transaction, fdb_error_t error)

-   Implements the recommended retry and backoff behavior for a transaction. This function knows which of the error codes generated by other :func:`fdb_transaction_*()` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions.
+   Implements the recommended retry and backoff behavior for a transaction. This function knows which of the error codes generated by other ``fdb_transaction_*()`` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions.

-   On receiving any type of error from an :func:`fdb_transaction_*()` function, the application should:
+   On receiving any type of error from an ``fdb_transaction_*()`` function, the application should:

 1. Call :func:`fdb_transaction_on_error()` with the returned :type:`fdb_error_t` code.

@@ -963,15 +963,15 @@ Key selectors

 In the FoundationDB C API, key selectors are not represented by a structure of any kind, but are instead expressed as sequential parameters to |get-key-func| and |get-range-func|. For convenience, the most common key selectors are available as C macros that expand to the appropriate parameters.

-.. function:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length)
+.. type:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length)

-.. function:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length)
+.. type:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length)

-.. function:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length)
+.. type:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length)

-.. function:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length)
+.. type:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length)

-To use one of these macros, simply replace the four parameters in the function with one of :func:`FDB_KEYSEL_*`::
+To use one of these macros, simply replace the four parameters in the function with one of ``FDB_KEYSEL_*``::

    future = fdb_transaction_get_key(transaction, "key", 3, 0, 2, 0);
@@ -194,10 +194,6 @@ After importing the ``fdb`` module and selecting an API version, you probably wa

   |option-tls-key-bytes|

-.. method :: fdb.options.set_tls_verify_peers(verification_pattern)
-
-  |option-tls-verify-peers|
-
 .. method :: fdb.options.set_tls_ca_bytes(ca_bundle)

   |option-tls-ca-bytes|

@@ -210,10 +206,6 @@ After importing the ``fdb`` module and selecting an API version, you probably wa

   |option-tls-password|

-.. method :: fdb.options.set_disable_multi_version_client_api()
-
-  |option-disable-multi-version-client-api|
-
 .. method :: fdb.options.set_disable_local_client()

   |option-set-disable-local-client|

@@ -761,10 +753,6 @@ In each of the methods below, ``param`` should be a string appropriately packed
 Committing
 ----------

-.. decorator:: transactional()
-
-  The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class:`Tenant`, or :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.
-
 .. method :: Transaction.commit()

   Attempt to commit the changes made in the transaction to the database. Returns a :class:`FutureVoid` representing the asynchronous result of the commit. You **must** call the :meth:`Future.wait()` method on the returned :class:`FutureVoid`, which will raise an exception if the commit failed.
@@ -1,7 +1,6 @@
 .. default-domain:: py
-.. default-domain:: py
 .. highlight:: python
 .. module:: fdb

 .. Required substitutions for api-common.rst.inc

@@ -1,7 +1,6 @@
 .. default-domain:: py
-.. default-domain:: py
 .. highlight:: python
 .. module:: fdb

 .. Required substitutions for api-common.rst.inc
@@ -50,6 +50,7 @@ The latest changes are detailed in :ref:`release-notes`. The documentation has t
    :hidden:

    local-dev
+   internal-dev-tools
    why-foundationdb
    technical-overview
    client-design
@@ -0,0 +1,58 @@
+##################
+Internal Dev Tools
+##################
+
+Code Probes
+===========
+
+Code probes are a mechanism in FDB to prove that certain code-paths are being tested under the right conditions. They differ from code coverage in multiple ways (explained below).
+
+The general format of a code probe is:
+
+.. code-block:: C++
+
+   CODE_PROBE(<condition>, "Comment", [annotations...]);
+
+A simple example of a code probe could look as follows:
+
+.. code-block:: C++
+
+   CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery", probe::context::sim2);
+
+On a very high level, the above code will indicate that whenever this line is executed and ``self->forceRecovery`` is ``true``, we ran into some interesting case. In addition, this probe is annotated with ``probe::context::sim2``, which indicates that we expect this code to eventually be hit in simulation.
+
+By default, FDB will simply write a trace line when this code is hit and the condition is ``true``. If the code is never hit, the simulator will, at the end of the run, print the code probe but set the ``covered`` field to ``false``. This all happens in the context of a single simulation run (``fdbserver`` doesn't have a concept of ensembles). This information is written into the log file. ``TestHarness`` (see below) will then use this information to write code probe statistics to the ensemble in the Joshua cluster (if the test is run in Joshua).
+
+We expect that ALL code probes will be hit in a nightly run. In the future we can potentially use this feature for other things (like instructing the simulator to do an extensive search starting when one of these probes is being hit).
+
+In addition to ``context`` annotations, users can also define and pass assertions. For example:
+
+.. code-block:: C++
+
+   CODE_PROBE(condition, "Some comment", assert::simOnly);
+
+These will add an assertion to the code. In addition to that, the simulator will not print missed code probes that asserted that the probe won't be hit in simulation.
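+The two annotation kinds can presumably be combined in a single probe; a hypothetical example (not from the FDB source):
+
+.. code-block:: C++
+
+   CODE_PROBE(self->forceRecovery,
+              "Resolver detects forced recovery",
+              probe::context::sim2,
+              assert::simOnly);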
+Test Harness
+============
+
+TestHarness is our primary testing tool. It has multiple jobs:
+
+* *Running*: It can run a test in Joshua.
+* *Statistics*: It chooses a test to run based on the CPU time that previous runs (within the same ensemble) spent on each test. It does that by writing statistics about the test at the end of each run.
+* *Reporting*: After an ensemble has finished (or while it is running), ``TestHarness`` can be used to generate a report in ``xml`` or ``json``.
+
+Test Harness can be found in the FDB source repository under ``contrib/TestHarness2``. It has a weak dependency on `joshua <https://github.com/foundationDB/fdb-joshua>`_ (if Test Harness can find joshua it will report back about failed tests, otherwise it will just print out general statistics about the ensemble). Joshua will call Test Harness as follows:
+
+.. code-block:: shell
+
+   python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR}
+
+Here the seed is a random number generated by joshua and ``OLDBINDIR`` is a directory path where the old fdb binaries can be found (this is needed for restart tests). If one wants to retry a test, they can pass the previous joshua seed, a directory path that has *exactly* the same content as ``OLDBINARYDIR``, plus the reported statistics to the test harness app. This should then re-run the same code as before.
+
+In order to figure out what command line arguments ``test_harness.app`` (and ``test_harness.results``) accept, one can check the contents of ``contrib/TestHarness2/test_harness/config.py``.
+
+Reporting
+---------
+
+After a joshua ensemble has completed, ``test_harness.results`` can be used to get a report on the ensemble. This will include, by default, a list of all failed tests (similar to ``joshua tail --errors``, though in a more human-readable format). For a completed ensemble it will also print code probes that weren't hit often enough. An ensemble is considered successful if no simulation runs completed with an error AND all code probes have been hit sufficiently often.
@@ -2,6 +2,21 @@
 Release Notes
 #############

+7.1.23
+======
+* Same as 7.1.22 release with AVX enabled.
+
+7.1.22
+======
+* Released with AVX disabled.
+* Added new latency samples for GetValue, GetRange, QueueWait, and VersionWait in storage servers. `(PR #8215) <https://github.com/apple/foundationdb/pull/8215>`_
+* Fixed a rare partial data write for TLogs. `(PR #8210) <https://github.com/apple/foundationdb/pull/8210>`_
+* Added HTTP proxy support for backup agents. `(PR #8193) <https://github.com/apple/foundationdb/pull/8193>`_
+* Fixed a memory bug of secondary queries in index prefetch. `(PR #8195) <https://github.com/apple/foundationdb/pull/8195>`_, `(PR #8190) <https://github.com/apple/foundationdb/pull/8190>`_
+* Introduced STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT knob to recreate SS at io_timeout errors. `(PR #8123) <https://github.com/apple/foundationdb/pull/8123>`_
+* Fixed two TLog stopped bugs and a CC leader replacement bug. `(PR #8081) <https://github.com/apple/foundationdb/pull/8081>`_
+* Added back RecoveryAvailable trace event for status's seconds_since_last_recovered field. `(PR #8068) <https://github.com/apple/foundationdb/pull/8068>`_
+
 7.1.21
 ======
 * Same as 7.1.20 release with AVX enabled.
@@ -22,6 +22,9 @@
 #include <time.h>

 #include "fdbclient/BackupAgent.actor.h"
+#include "fdbclient/BlobCipher.h"
+#include "fdbclient/GetEncryptCipherKeys.actor.h"
 #include "fdbclient/DatabaseContext.h"
 #include "fdbrpc/simulator.h"
 #include "flow/ActorCollection.h"
 #include "flow/actorcompiler.h" // has to be last include

@@ -253,16 +256,18 @@ std::pair<Version, uint32_t> decodeBKMutationLogKey(Key key) {
 	    bigEndian32(*(int32_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t) + sizeof(int64_t))));
 }

-void decodeBackupLogValue(Arena& arena,
-                          VectorRef<MutationRef>& result,
-                          int& mutationSize,
-                          StringRef value,
-                          StringRef addPrefix,
-                          StringRef removePrefix,
-                          Version version,
-                          Reference<KeyRangeMap<Version>> key_version) {
+ACTOR static Future<Void> decodeBackupLogValue(Arena* arena,
+                                               VectorRef<MutationRef>* result,
+                                               VectorRef<Optional<MutationRef>>* encryptedResult,
+                                               int* mutationSize,
+                                               Standalone<StringRef> value,
+                                               Key addPrefix,
+                                               Key removePrefix,
+                                               Version version,
+                                               Reference<KeyRangeMap<Version>> key_version,
+                                               Database cx) {
 	try {
-		uint64_t offset(0);
+		state uint64_t offset(0);
 		uint64_t protocolVersion = 0;
 		memcpy(&protocolVersion, value.begin(), sizeof(uint64_t));
 		offset += sizeof(uint64_t);
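The promotion of plain locals to `state` throughout this function follows from Flow's ACTOR compiler: an ACTOR body is transformed into a state machine, and only `state` variables survive across a `wait()` suspension point. A minimal hypothetical actor illustrating the rule:

    ACTOR Future<int> valueSize(Database cx, Key key) {
        state int attempts = 0;   // state: still valid after the wait() below
        state Transaction tr(cx); // state: the transaction must outlive the wait
        Optional<Value> v = wait(tr.get(key)); // plain locals may not span a wait()
        attempts++;
        return v.present() ? v.get().size() : 0;
    }

`v` itself is produced by the wait, which is the one allowed pattern for a non-state variable at a suspension point.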
@ -274,36 +279,48 @@ void decodeBackupLogValue(Arena& arena,
throw incompatible_protocol_version();
}

uint32_t totalBytes = 0;
state uint32_t totalBytes = 0;
memcpy(&totalBytes, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t consumed = 0;
state uint32_t consumed = 0;

if (totalBytes + offset > value.size())
throw restore_missing_data();

int originalOffset = offset;
state int originalOffset = offset;

while (consumed < totalBytes) {
uint32_t type = 0;
memcpy(&type, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t len1 = 0;
state uint32_t len1 = 0;
memcpy(&len1, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t len2 = 0;
state uint32_t len2 = 0;
memcpy(&len2, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);

ASSERT(offset + len1 + len2 <= value.size() && isValidMutationType(type));

MutationRef logValue;
Arena tempArena;
state MutationRef logValue;
state Arena tempArena;
logValue.type = type;
logValue.param1 = value.substr(offset, len1);
offset += len1;
logValue.param2 = value.substr(offset, len2);
offset += len2;
state Optional<MutationRef> encryptedLogValue = Optional<MutationRef>();

// Decrypt mutation ref if encrypted
if (logValue.isEncrypted()) {
encryptedLogValue = logValue;
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::BACKUP));
logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP);
}
ASSERT(!logValue.isEncrypted());
MutationRef originalLogValue = logValue;

if (logValue.type == MutationRef::ClearRange) {
KeyRangeRef range(logValue.param1, logValue.param2);
@ -320,8 +337,8 @@ void decodeBackupLogValue(Arena& arena,
logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena);
}
logValue.param2 = addPrefix == StringRef() ? normalKeys.end : strinc(addPrefix, tempArena);
result.push_back_deep(arena, logValue);
mutationSize += logValue.expectedSize();
result->push_back_deep(*arena, logValue);
*mutationSize += logValue.expectedSize();
} else {
logValue.param1 = std::max(r.range().begin, range.begin);
logValue.param2 = minKey;
@ -333,8 +350,13 @@ void decodeBackupLogValue(Arena& arena,
logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena);
logValue.param2 = logValue.param2.withPrefix(addPrefix, tempArena);
}
result.push_back_deep(arena, logValue);
mutationSize += logValue.expectedSize();
result->push_back_deep(*arena, logValue);
*mutationSize += logValue.expectedSize();
}
if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) {
encryptedResult->push_back_deep(*arena, encryptedLogValue);
} else {
encryptedResult->push_back_deep(*arena, Optional<MutationRef>());
}
}
}
@ -348,8 +370,15 @@ void decodeBackupLogValue(Arena& arena,
if (addPrefix.size()) {
logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena);
}
result.push_back_deep(arena, logValue);
mutationSize += logValue.expectedSize();
result->push_back_deep(*arena, logValue);
*mutationSize += logValue.expectedSize();
// If we did not remove/add prefixes to the mutation then keep the original encrypted mutation so we
// do not have to re-encrypt unnecessarily
if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) {
encryptedResult->push_back_deep(*arena, encryptedLogValue);
} else {
encryptedResult->push_back_deep(*arena, Optional<MutationRef>());
}
}
}
@ -374,6 +403,7 @@ void decodeBackupLogValue(Arena& arena,
.detail("Value", value);
throw;
}
return Void();
}

static double lastErrorTime = 0;
@ -614,21 +644,24 @@ ACTOR Future<int> dumpData(Database cx,
state int mutationSize = 0;
loop {
try {
RCGroup group = waitNext(results.getFuture());
state RCGroup group = waitNext(results.getFuture());
lock->release(group.items.expectedSize());

BinaryWriter bw(Unversioned());
for (int i = 0; i < group.items.size(); ++i) {
bw.serializeBytes(group.items[i].value);
}
decodeBackupLogValue(req.arena,
req.transaction.mutations,
mutationSize,
bw.toValue(),
addPrefix,
removePrefix,
group.groupKey,
keyVersion);
Standalone<StringRef> value = bw.toValue();
wait(decodeBackupLogValue(&req.arena,
&req.transaction.mutations,
&req.transaction.encryptedMutations,
&mutationSize,
value,
addPrefix,
removePrefix,
group.groupKey,
keyVersion,
cx));
newBeginVersion = group.groupKey + 1;
if (mutationSize >= CLIENT_KNOBS->BACKUP_LOG_WRITE_BATCH_MAX_SIZE) {
break;
@ -652,8 +685,10 @@ ACTOR Future<int> dumpData(Database cx,
Key rangeEnd = getApplyKey(newBeginVersion, uid);

req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::SetValue, applyBegin, versionKey));
req.transaction.encryptedMutations.push_back_deep(req.arena, Optional<MutationRef>());
req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(applyBegin));
req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::ClearRange, rangeBegin, rangeEnd));
req.transaction.encryptedMutations.push_back_deep(req.arena, Optional<MutationRef>());
req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(rangeBegin));

// The commit request contains no read conflict ranges, so regardless of what read version we
@ -152,7 +152,7 @@ void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId,
expireAtTS = expireAt;

#if BLOB_CIPHER_DEBUG
TraceEvent(SevDebug, "BlobCipher.KeyInit")
TraceEvent(SevDebug, "BlobCipherKeyInit")
.detail("DomainId", domainId)
.detail("BaseCipherId", baseCipherId)
.detail("BaseCipherLen", baseCipherLen)
@ -168,10 +168,10 @@ void BlobCipherKey::applyHmacSha256Derivation() {
memcpy(&buf[0], baseCipher.get(), baseCipherLen);
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt));
HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen);
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), arena);
std::copy(digest.begin(), digest.end(), cipher.get());
if (digest.size() < AES_256_KEY_LENGTH) {
memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size());
unsigned int digestLen =
hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), cipher.get(), AUTH_TOKEN_SIZE);
if (digestLen < AES_256_KEY_LENGTH) {
memcpy(cipher.get() + digestLen, buf, AES_256_KEY_LENGTH - digestLen);
}
}
@ -185,7 +185,7 @@ void BlobCipherKey::reset() {
BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId, size_t* sizeStat)
: domainId(dId), latestBaseCipherKeyId(), latestRandomSalt(), sizeStat(sizeStat) {
ASSERT(sizeStat != nullptr);
TraceEvent(SevInfo, "BlobCipher.KeyIdCacheInit").detail("DomainId", domainId);
TraceEvent(SevInfo, "BlobCipherKeyIdCacheInit").detail("DomainId", domainId);
}

BlobCipherKeyIdCacheKey BlobCipherKeyIdCache::getCacheKey(const EncryptCipherBaseKeyId& baseCipherKeyId,
@ -229,7 +229,7 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt
if (latestCipherKey.isValid() && latestCipherKey->getBaseCipherId() == baseCipherId) {
if (memcmp(latestCipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0) {
#if BLOB_CIPHER_DEBUG
TraceEvent(SevDebug, "InsertBaseCipherKey_AlreadyPresent")
TraceEvent(SevDebug, "InsertBaseCipherKeyAlreadyPresent")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
#endif
@ -237,14 +237,14 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt
// Key is already present; nothing more to do.
return latestCipherKey;
} else {
TraceEvent(SevInfo, "BlobCipher.UpdatetBaseCipherKey")
TraceEvent(SevInfo, "BlobCipherUpdatetBaseCipherKey")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
throw encrypt_update_cipher();
}
}

TraceEvent(SevInfo, "BlobCipherKey.InsertBaseCipherKeyLatest")
TraceEvent(SevInfo, "BlobCipherKeyInsertBaseCipherKeyLatest")
.detail("DomainId", domainId)
.detail("BaseCipherId", baseCipherId)
.detail("RefreshAt", refreshAt)
@ -279,7 +279,7 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt
if (itr != keyIdCache.end()) {
if (memcmp(itr->second->rawBaseCipher(), baseCipher, baseCipherLen) == 0) {
#if BLOB_CIPHER_DEBUG
TraceEvent(SevDebug, "InsertBaseCipherKey_AlreadyPresent")
TraceEvent(SevDebug, "InsertBaseCipherKeyAlreadyPresent")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
#endif
@ -287,14 +287,14 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt
// Key is already present; nothing more to do.
return itr->second;
} else {
TraceEvent(SevInfo, "BlobCipher.UpdateBaseCipherKey")
TraceEvent(SevInfo, "BlobCipherUpdateBaseCipherKey")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
throw encrypt_update_cipher();
}
}

TraceEvent(SevInfo, "BlobCipherKey.InsertBaseCipherKey")
TraceEvent(SevInfo, "BlobCipherKeyInsertBaseCipherKey")
.detail("DomainId", domainId)
.detail("BaseCipherId", baseCipherId)
.detail("Salt", salt)
@ -351,7 +351,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::insertCipherKey(const EncryptCipher
cipherKey = keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt);
}
} catch (Error& e) {
TraceEvent(SevWarn, "BlobCipher.InsertCipherKeyFailed")
TraceEvent(SevWarn, "BlobCipherInsertCipherKeyFailed")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
throw;
@ -387,7 +387,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::insertCipherKey(const EncryptCipher
keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt);
}
} catch (Error& e) {
TraceEvent(SevWarn, "BlobCipher.InsertCipherKey_Failed")
TraceEvent(SevWarn, "BlobCipherInsertCipherKey_Failed")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId)
.detail("Salt", salt);
@ -398,12 +398,12 @@ Reference<BlobCipherKey> BlobCipherKeyCache::insertCipherKey(const EncryptCipher

Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) {
if (domainId == INVALID_ENCRYPT_DOMAIN_ID) {
TraceEvent(SevWarn, "BlobCipher.GetLatestCipherKeyInvalidID").detail("DomainId", domainId);
TraceEvent(SevWarn, "BlobCipherGetLatestCipherKeyInvalidID").detail("DomainId", domainId);
throw encrypt_invalid_id();
}
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
TraceEvent(SevInfo, "BlobCipher.GetLatestCipherKeyDomainNotFound").detail("DomainId", domainId);
TraceEvent(SevInfo, "BlobCipherGetLatestCipherKeyDomainNotFound").detail("DomainId", domainId);
return Reference<BlobCipherKey>();
}
@ -414,7 +414,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const EncryptCip
if (cipherKey.isValid()) {
if (cipherKey->needsRefresh()) {
#if BLOB_CIPHER_DEBUG
TraceEvent("SevDebug, BlobCipher.GetLatestNeedsRefresh")
TraceEvent("SevDebug, BlobCipherGetLatestNeedsRefresh")
.detail("DomainId", domainId)
.detail("Now", now())
.detail("RefreshAt", cipherKey->getRefreshAtTS());
@ -445,7 +445,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const EncryptCipherDom
if (cipherKey.isValid()) {
if (cipherKey->isExpired()) {
#if BLOB_CIPHER_DEBUG
TraceEvent(SevDebug, "BlobCipher.GetCipherExpired")
TraceEvent(SevDebug, "BlobCipherGetCipherExpired")
.detail("DomainId", domainId)
.detail("BaseCipherId", baseCipherId)
.detail("Now", now())
@ -472,18 +472,18 @@ void BlobCipherKeyCache::resetEncryptDomainId(const EncryptCipherDomainId domain
ASSERT(keyIdCache->getSize() <= size);
size -= keyIdCache->getSize();
keyIdCache->cleanup();
TraceEvent(SevInfo, "BlobCipher.ResetEncryptDomainId").detail("DomainId", domainId);
TraceEvent(SevInfo, "BlobCipherResetEncryptDomainId").detail("DomainId", domainId);
}

void BlobCipherKeyCache::cleanup() noexcept {
Reference<BlobCipherKeyCache> instance = BlobCipherKeyCache::getInstance();

TraceEvent(SevInfo, "BlobCipherKeyCache.Cleanup").log();
TraceEvent(SevInfo, "BlobCipherKeyCacheCleanup").log();

for (auto& domainItr : instance->domainCacheMap) {
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr.second;
keyIdCache->cleanup();
TraceEvent(SevInfo, "BlobCipher.KeyCacheCleanup").detail("DomainId", domainItr.first);
TraceEvent(SevInfo, "BlobCipherKeyCacheCleanup").detail("DomainId", domainItr.first);
}

instance->domainCacheMap.clear();
@ -547,7 +547,6 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) {
startTime = timer_monotonic();
}
CODE_PROBE(true, "Encrypting data with BlobCipher");

memset(reinterpret_cast<uint8_t*>(header), 0, sizeof(BlobCipherEncryptHeader));
@ -561,7 +560,7 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
uint8_t* ciphertext = encryptBuf->begin();
int bytes{ 0 };
if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
TraceEvent(SevWarn, "BlobCipher.EncryptUpdateFailed")
TraceEvent(SevWarn, "BlobCipherEncryptUpdateFailed")
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
@ -569,14 +568,14 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte

int finalBytes{ 0 };
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
TraceEvent(SevWarn, "BlobCipher.EncryptFinalFailed")
TraceEvent(SevWarn, "BlobCipherEncryptFinalFailed")
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}

if ((bytes + finalBytes) != plaintextLen) {
TraceEvent(SevWarn, "BlobCipher.EncryptUnexpectedCipherLen")
TraceEvent(SevWarn, "BlobCipherEncryptUnexpectedCipherLen")
.detail("PlaintextLen", plaintextLen)
.detail("EncryptedBufLen", bytes + finalBytes);
throw encrypt_ops_error();
@ -610,73 +609,41 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
memcpy(&ciphertext[bytes + finalBytes],
reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader));
StringRef authToken = computeAuthToken(ciphertext,
bytes + finalBytes + sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
memcpy(&header->singleAuthToken.authToken[0], authToken.begin(), AUTH_TOKEN_SIZE);
computeAuthToken(ciphertext,
bytes + finalBytes + sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
&header->singleAuthToken.authToken[0],
AUTH_TOKEN_SIZE);
} else {
ASSERT_EQ(header->flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);

StringRef cipherTextAuthToken =
computeAuthToken(ciphertext,
bytes + finalBytes,
reinterpret_cast<const uint8_t*>(&header->cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
arena);
memcpy(&header->multiAuthTokens.cipherTextAuthToken[0], cipherTextAuthToken.begin(), AUTH_TOKEN_SIZE);
StringRef headerAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
memcpy(&header->multiAuthTokens.headerAuthToken[0], headerAuthToken.begin(), AUTH_TOKEN_SIZE);
computeAuthToken(ciphertext,
bytes + finalBytes,
reinterpret_cast<const uint8_t*>(&header->cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
&header->multiAuthTokens.cipherTextAuthToken[0],
AUTH_TOKEN_SIZE);
computeAuthToken(reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
&header->multiAuthTokens.headerAuthToken[0],
AUTH_TOKEN_SIZE);
}
}

encryptBuf->setLogicalSize(plaintextLen);

if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) {
BlobCipherMetrics::counters(usageType).encryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9);
}

CODE_PROBE(true, "Encrypting data with BlobCipher");

return encryptBuf;
}

Standalone<StringRef> EncryptBlobCipherAes265Ctr::encryptBlobGranuleChunk(const uint8_t* plaintext,
const int plaintextLen) {
double startTime = 0.0;
if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) {
startTime = timer_monotonic();
}
Standalone<StringRef> encrypted = makeString(plaintextLen);
uint8_t* ciphertext = mutateString(encrypted);
int bytes{ 0 };

if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
TraceEvent(SevWarn, "BlobCipher.EncryptUpdateFailed")
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}
int finalBytes{ 0 };
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
TraceEvent(SevWarn, "BlobCipher.EncryptFinalFailed")
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}
if ((bytes + finalBytes) != plaintextLen) {
TraceEvent(SevWarn, "BlobCipher.EncryptUnexpectedCipherLen")
.detail("PlaintextLen", plaintextLen)
.detail("EncryptedBufLen", bytes + finalBytes);
throw encrypt_ops_error();
}
if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) {
BlobCipherMetrics::counters(usageType).encryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9);
}
return encrypted;
}

EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() {
if (ctx != nullptr) {
EVP_CIPHER_CTX_free(ctx);
@ -716,18 +683,20 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHe
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
memset(reinterpret_cast<uint8_t*>(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_SIZE);
StringRef computedHeaderAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(&headerCopy),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
if (memcmp(&header.multiAuthTokens.headerAuthToken[0], computedHeaderAuthToken.begin(), AUTH_TOKEN_SIZE) != 0) {
TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeaderAuthTokenMismatch")
uint8_t computedHeaderAuthToken[AUTH_TOKEN_SIZE];
computeAuthToken(reinterpret_cast<const uint8_t*>(&headerCopy),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
&computedHeaderAuthToken[0],
AUTH_TOKEN_SIZE);
if (memcmp(&header.multiAuthTokens.headerAuthToken[0], &computedHeaderAuthToken[0], AUTH_TOKEN_SIZE) != 0) {
TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("MultiAuthHeaderAuthToken",
StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedHeaderAuthToken", computedHeaderAuthToken.toString());
.detail("ComputedHeaderAuthToken", StringRef(computedHeaderAuthToken, AUTH_TOKEN_SIZE));
throw encrypt_header_authtoken_mismatch();
}
@ -749,15 +718,20 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph
BlobCipherEncryptHeader* eHeader = (BlobCipherEncryptHeader*)(&buff[ciphertextLen]);
memset(reinterpret_cast<uint8_t*>(&eHeader->singleAuthToken), 0, 2 * AUTH_TOKEN_SIZE);

StringRef computed = computeAuthToken(
buff, ciphertextLen + sizeof(BlobCipherEncryptHeader), headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena);
if (memcmp(&header.singleAuthToken.authToken[0], computed.begin(), AUTH_TOKEN_SIZE) != 0) {
TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeaderAuthTokenMismatch")
uint8_t computed[AUTH_TOKEN_SIZE];
computeAuthToken(buff,
ciphertextLen + sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
&computed[0],
AUTH_TOKEN_SIZE);
if (memcmp(&header.singleAuthToken.authToken[0], &computed[0], AUTH_TOKEN_SIZE) != 0) {
TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("SingleAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedSingleAuthToken", computed.toString());
.detail("ComputedSingleAuthToken", StringRef(computed, AUTH_TOKEN_SIZE));
throw encrypt_header_authtoken_mismatch();
}
}
@ -770,20 +744,20 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphe
if (!headerAuthTokenValidationDone) {
verifyHeaderAuthToken(header, arena);
}
StringRef computedCipherTextAuthToken =
computeAuthToken(ciphertext,
ciphertextLen,
reinterpret_cast<const uint8_t*>(&header.cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
arena);
if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], computedCipherTextAuthToken.begin(), AUTH_TOKEN_SIZE) !=
0) {
TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeaderAuthTokenMismatch")
uint8_t computedCipherTextAuthToken[AUTH_TOKEN_SIZE];
computeAuthToken(ciphertext,
ciphertextLen,
reinterpret_cast<const uint8_t*>(&header.cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
&computedCipherTextAuthToken[0],
AUTH_TOKEN_SIZE);
if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], &computedCipherTextAuthToken[0], AUTH_TOKEN_SIZE) != 0) {
TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("MultiAuthCipherTextAuthToken",
StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedCipherTextAuthToken", computedCipherTextAuthToken.toString());
.detail("ComputedCipherTextAuthToken", StringRef(computedCipherTextAuthToken, AUTH_TOKEN_SIZE));
throw encrypt_header_authtoken_mismatch();
}
}
@ -808,7 +782,7 @@ void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEnc
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
header.flags.encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR ||
!isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) {
TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeader")
TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeader")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
.detail("EncryptCipherMode", header.flags.encryptMode)
@ -822,8 +796,6 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
CODE_PROBE(true, "Decrypting data with BlobCipher");

double startTime = 0.0;
if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) {
startTime = timer_monotonic();
@ -832,7 +804,7 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert
verifyEncryptHeaderMetadata(header);

if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && !headerCipherKey.isValid()) {
TraceEvent(SevWarn, "BlobCipher.DecryptInvalidHeaderCipherKey")
TraceEvent(SevWarn, "BlobCipherDecryptInvalidHeaderCipherKey")
.detail("AuthTokenMode", header.flags.authTokenMode);
throw encrypt_ops_error();
}
@ -850,7 +822,7 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert
uint8_t* plaintext = decrypted->begin();
int bytesDecrypted{ 0 };
if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) {
TraceEvent(SevWarn, "BlobCipher.DecryptUpdateFailed")
TraceEvent(SevWarn, "BlobCipherDecryptUpdateFailed")
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
throw encrypt_ops_error();
@ -858,23 +830,27 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert

int finalBlobBytes{ 0 };
if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) {
TraceEvent(SevWarn, "BlobCipher.DecryptFinalFailed")
TraceEvent(SevWarn, "BlobCipherDecryptFinalFailed")
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
throw encrypt_ops_error();
}

if ((bytesDecrypted + finalBlobBytes) != ciphertextLen) {
TraceEvent(SevWarn, "BlobCipher.EncryptUnexpectedPlaintextLen")
TraceEvent(SevWarn, "BlobCipherEncryptUnexpectedPlaintextLen")
.detail("CiphertextLen", ciphertextLen)
.detail("DecryptedBufLen", bytesDecrypted + finalBlobBytes);
throw encrypt_ops_error();
}

decrypted->setLogicalSize(ciphertextLen);

if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) {
BlobCipherMetrics::counters(usageType).decryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9);
}

CODE_PROBE(true, "Decrypting data with BlobCipher");

return decrypted;
}
@ -898,32 +874,38 @@ HmacSha256DigestGen::~HmacSha256DigestGen() {
}
}

StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Arena& arena) {
CODE_PROBE(true, "Digest generation");
unsigned int digestLen = HMAC_size(ctx);
auto digest = new (arena) unsigned char[digestLen];
unsigned int HmacSha256DigestGen::digest(const unsigned char* data,
size_t len,
unsigned char* buf,
unsigned int bufLen) {
ASSERT_EQ(bufLen, HMAC_size(ctx));

if (HMAC_Update(ctx, data, len) != 1) {
throw encrypt_ops_error();
}

if (HMAC_Final(ctx, digest, &digestLen) != 1) {
unsigned int digestLen = 0;
if (HMAC_Final(ctx, buf, &digestLen) != 1) {
throw encrypt_ops_error();
}

return StringRef(arena, digest, digestLen);
CODE_PROBE(true, "Digest generation");

return digestLen;
}

StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena) {
CODE_PROBE(true, "Auth token generation");
void computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
unsigned char* digestBuf,
unsigned int digestBufSz) {
HmacSha256DigestGen hmacGenerator(key, keyLen);
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
unsigned int digestLen = hmacGenerator.digest(payload, payloadLen, digestBuf, digestBufSz);

ASSERT_GE(digest.size(), AUTH_TOKEN_SIZE);
return digest;
ASSERT_EQ(digestLen, digestBufSz);

CODE_PROBE(true, "Auth token generation");
}

// Only used to link unit tests
@ -941,7 +923,7 @@ void forceLinkBlobCipherTests() {}
// 6.1 cleanup cipherKeys by given encryptDomainId
// 6.2. Cleanup all cached cipherKeys
TEST_CASE("flow/BlobCipher") {
TraceEvent("BlobCipherTest.Start").log();
TraceEvent("BlobCipherTestStart").log();

// Construct a dummy External Key Manager representation and populate with some keys
class BaseCipher : public ReferenceCounted<BaseCipher>, NonCopyable {
@ -985,7 +967,7 @@ TEST_CASE("flow/BlobCipher") {
Reference<BlobCipherKeyCache> cipherKeyCache = BlobCipherKeyCache::getInstance();

// validate getLatestCipherKey return empty when there's no cipher key
TraceEvent("BlobCipherTest.LatestKeyNotExists").log();
TraceEvent("BlobCipherTestLatestKeyNotExists").log();
Reference<BlobCipherKey> latestKeyNonexists =
cipherKeyCache->getLatestCipherKey(deterministicRandom()->randomInt(minDomainId, maxDomainId));
ASSERT(!latestKeyNonexists.isValid());
@ -997,7 +979,7 @@ TEST_CASE("flow/BlobCipher") {
}

// insert BlobCipher keys into BlobCipherKeyCache map and validate
TraceEvent("BlobCipherTest_InsertKeys").log();
TraceEvent("BlobCipherTestInsertKeys").log();
for (auto& domainItr : domainKeyMap) {
for (auto& baseKeyItr : domainItr.second) {
Reference<BaseCipher> baseCipher = baseKeyItr.second;
@ -1022,7 +1004,7 @@ TEST_CASE("flow/BlobCipher") {
headerBaseCipher->refreshAt,
headerBaseCipher->expireAt);

TraceEvent("BlobCipherTest.InsertKeysDone").log();
TraceEvent("BlobCipherTestInsertKeysDone").log();

// validate the cipherKey lookups work as desired
for (auto& domainItr : domainKeyMap) {
@ -1041,7 +1023,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
}
}
TraceEvent("BlobCipherTest.LooksupDone").log();
TraceEvent("BlobCipherTestLooksupDone").log();

// Ensure attempting to insert existing cipherKey (identical) more than once is treated as a NOP
try {
@ -1055,7 +1037,7 @@ TEST_CASE("flow/BlobCipher") {
} catch (Error& e) {
throw;
}
TraceEvent("BlobCipherTest.ReinsertIdempotentKeyDone").log();
TraceEvent("BlobCipherTestReinsertIdempotentKeyDone").log();

// Ensure attempting to insert an existing cipherKey (modified) fails with appropriate error
try {
@ -1077,7 +1059,7 @@ TEST_CASE("flow/BlobCipher") {
throw;
}
}
TraceEvent("BlobCipherTest.ReinsertNonIdempotentKeyDone").log();
TraceEvent("BlobCipherTestReinsertNonIdempotentKeyDone").log();

// Validate Encryption ops
Reference<BlobCipherKey> cipherKey = cipherKeyCache->getLatestCipherKey(minDomainId);
@ -1093,7 +1075,7 @@ TEST_CASE("flow/BlobCipher") {
BlobCipherEncryptHeader headerCopy;
// validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE
{
TraceEvent("NoneAuthMode.Start").log();
TraceEvent("NoneAuthModeStart").log();

EncryptBlobCipherAes265Ctr encryptor(cipherKey,
Reference<BlobCipherKey>(),
@ -1110,7 +1092,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);

TraceEvent("BlobCipherTest.EncryptDone")
TraceEvent("BlobCipherTestEncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
@ -1127,7 +1109,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);

TraceEvent("BlobCipherTest.DecryptDone").log();
TraceEvent("BlobCipherTestDecryptDone").log();

// induce encryption header corruption - headerVersion corrupted
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
@ -1200,7 +1182,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);

TraceEvent("BlobCipherTest.EncryptDone")
TraceEvent("BlobCipherTestEncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
@ -1221,7 +1203,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);

TraceEvent("BlobCipherTest.DecryptDone").log();
TraceEvent("BlobCipherTestDecryptDone").log();

// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
@ -1287,12 +1269,12 @@ TEST_CASE("flow/BlobCipher") {
}
}

TraceEvent("SingleAuthMode.Done").log();
TraceEvent("SingleAuthModeDone").log();
}

// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI
{
TraceEvent("MultiAuthMode.Start").log();
TraceEvent("MultiAuthModeStart").log();

EncryptBlobCipherAes265Ctr encryptor(cipherKey,
headerCipherKey,
@ -1309,7 +1291,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);

TraceEvent("BlobCipherTest.EncryptDone")
TraceEvent("BlobCipherTestEncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
@ -1331,7 +1313,7 @@ TEST_CASE("flow/BlobCipher") {
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);

TraceEvent("BlobCipherTest.DecryptDone").log();
TraceEvent("BlobCipherTestDecryptDone").log();

// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
@ -1413,7 +1395,7 @@ TEST_CASE("flow/BlobCipher") {
}
}

TraceEvent("MultiAuthMode.Done").log();
TraceEvent("MultiAuthModeDone").log();
}

// Validate dropping encryptDomainId cached keys
@ -1429,6 +1411,6 @@ TEST_CASE("flow/BlobCipher") {
ASSERT(cachedKeys.empty());
}

TraceEvent("BlobCipherTest.Done").log();
TraceEvent("BlobCipherTestDone").log();
return Void();
}
@ -3328,6 +3328,14 @@ bool AccumulatedMutations::matchesAnyRange(const std::vector<KeyRange>& ranges)
std::vector<MutationRef> mutations = decodeMutationLogValue(serializedMutations);
for (auto& m : mutations) {
for (auto& r : ranges) {
if (m.type == MutationRef::Encrypted) {
// TODO: In order to filter out encrypted mutations that are not relevant to the
// target range, they would have to be decrypted here in order to check relevance
// below; however, the staged mutations would still need to remain encrypted for
// staging into the destination database. Without decrypting, we must assume that
// some data could match the range and return true here.
return true;
}
if (m.type == MutationRef::ClearRange) {
if (r.intersects(KeyRangeRef(m.param1, m.param2))) {
return true;
@ -698,8 +698,10 @@ ThreadFuture<Version> DLDatabase::verifyBlobRange(const KeyRangeRef& keyRange, O
return unsupported_operation();
}

Version readVersion = version.present() ? version.get() : latestVersion;

FdbCApi::FDBFuture* f = api->databaseVerifyBlobRange(
db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), version);
db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), readVersion);

return toThreadFuture<Version>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
Version version = invalidVersion;
@ -8122,6 +8122,25 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx, KeyRan
state Version readVersionOut = invalidVersion;
state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize;

if (version.present()) {
if (version.get() == latestVersion) {
loop {
try {
Version _version = wait(tr.getReadVersion());
version = _version;
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
if (version.get() <= 0) {
TraceEvent("VerifyBlobInvalidVersion").detail("Range", range).detail("Version", version);
throw unsupported_operation();
}
}

loop {
if (curRegion.begin >= range.end) {
return readVersionOut;
@ -9911,6 +9930,24 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
state KeyRange purgeRange = range;
state bool loadedTenantPrefix = false;

tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
if (purgeVersion == latestVersion) {
loop {
try {
Version _purgeVersion = wait(tr.getReadVersion());
purgeVersion = _purgeVersion;
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
tr.reset();
}
if (purgeVersion <= 0) {
TraceEvent("PurgeInvalidVersion").detail("Range", range).detail("Version", purgeVersion).detail("Force", force);
throw unsupported_operation();
}

loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -383,7 +383,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_UNSAFE_AUTO_FSYNC, false );
init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 );
init( ROCKSDB_PREFIX_LEN, 0 );
init( ROCKSDB_BLOCK_CACHE_SIZE, 0 );
// If rocksdb block cache size is 0, the default 8MB is used.
int64_t blockCacheSize = isSimulated ? 0 : 1024 * 1024 * 1024 /* 1GB */;
init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize );
init( ROCKSDB_METRICS_DELAY, 60.0 );
init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 );
init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 );
@ -959,6 +961,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_WORKER_BATCH_GRV_INTERVAL, 0.1 );
init( BLOB_WORKER_DO_REJECT_WHEN_FULL, true ); if ( randomize && BUGGIFY ) BLOB_WORKER_DO_REJECT_WHEN_FULL = false;
init( BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD, 0.9 );
init( BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY, 30.0 ); if ( randomize && BUGGIFY ) BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY = deterministicRandom()->randomInt(0, 10) - 1;

init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 );
@ -728,8 +728,8 @@ ACTOR Future<RangeResult> ddMetricsGetRangeActor(ReadYourWritesTransaction* ryw,
loop {
try {
auto keys = kr.removePrefix(ddStatsRange.begin);
Standalone<VectorRef<DDMetricsRef>> resultWithoutPrefix = wait(
waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT));
Standalone<VectorRef<DDMetricsRef>> resultWithoutPrefix =
wait(waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->TOO_MANY));
RangeResult result;
for (const auto& ddMetricsRef : resultWithoutPrefix) {
// each begin key is the previous end key, thus we only encode the begin key in the result
@ -116,7 +116,7 @@ class EncryptBuf : public ReferenceCounted<EncryptBuf>, NonCopyable {
public:
EncryptBuf(int size, Arena& arena) : allocSize(size), logicalSize(size) {
if (size > 0) {
buffer = new (arena) uint8_t[size];
buffer = new (arena) uint8_t[size]();
} else {
buffer = nullptr;
}
@ -563,7 +563,6 @@ public:
const int plaintextLen,
BlobCipherEncryptHeader* header,
Arena&);
Standalone<StringRef> encryptBlobGranuleChunk(const uint8_t* plaintext, const int plaintextLen);

private:
EVP_CIPHER_CTX* ctx;
@ -628,16 +627,17 @@ public:
HmacSha256DigestGen(const unsigned char* key, size_t len);
~HmacSha256DigestGen();
HMAC_CTX* getCtx() const { return ctx; }
StringRef digest(unsigned char const* data, size_t len, Arena&);
unsigned int digest(unsigned char const* data, size_t len, unsigned char* buf, unsigned int bufLen);

private:
HMAC_CTX* ctx;
};

StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena);
void computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
unsigned char* digestBuf,
unsigned int digestBufSz);

#endif // FDBCLIENT_BLOB_CIPHER_H
@ -44,6 +44,7 @@ struct BlobWorkerStats {
Counter compressionBytesRaw;
Counter compressionBytesFinal;
Counter fullRejections;
Counter forceFlushCleanups;

int numRangesAssigned;
int mutationBytesBuffered;
@ -81,9 +82,10 @@ struct BlobWorkerStats {
granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc),
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), numRangesAssigned(0),
mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0),
cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0),
activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0),
notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
@ -24,6 +24,7 @@

#include "fdbclient/BlobCipher.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GetEncryptCipherKeys.actor.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/Tracing.h"
@ -171,6 +172,22 @@ struct MutationRef {
return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena, usageType);
}

MutationRef decrypt(TextAndHeaderCipherKeys cipherKeys,
Arena& arena,
BlobCipherMetrics::UsageType usageType,
StringRef* buf = nullptr) const {
const BlobCipherEncryptHeader* header = encryptionHeader();
DecryptBlobCipherAes256Ctr cipher(cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, header->iv, usageType);
StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef();
if (buf != nullptr) {
*buf = plaintext;
}
ArenaReader reader(arena, plaintext, AssumeVersion(ProtocolVersion::withEncryptionAtRest()));
MutationRef mutation;
reader >> mutation;
return mutation;
}

MutationRef decrypt(const std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>& cipherKeys,
Arena& arena,
BlobCipherMetrics::UsageType usageType,
@ -180,15 +197,10 @@ struct MutationRef {
auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails);
ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, header->iv, usageType);
StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef();
if (buf != nullptr) {
*buf = plaintext;
}
ArenaReader reader(arena, plaintext, AssumeVersion(ProtocolVersion::withEncryptionAtRest()));
MutationRef mutation;
reader >> mutation;
return mutation;
TextAndHeaderCipherKeys textAndHeaderKeys;
textAndHeaderKeys.cipherHeaderKey = headerCipherItr->second;
textAndHeaderKeys.cipherTextKey = textCipherItr->second;
return decrypt(textAndHeaderKeys, arena, usageType, buf);
}

// These masks define which mutation types have particular properties (they are used to implement
@ -253,6 +265,11 @@ struct CommitTransactionRef {
VectorRef<KeyRangeRef> read_conflict_ranges;
VectorRef<KeyRangeRef> write_conflict_ranges;
VectorRef<MutationRef> mutations; // metadata mutations
// encryptedMutations should be in 1-1 correspondence with the mutations field above. That is, either
// encryptedMutations.size() == 0, or encryptedMutations.size() == mutations.size() and encryptedMutations[i] ==
// mutations[i].encrypt(). Currently this field is not serialized, so clients should NOT set it during the
// usual commit path. It is currently only used during backup mutation log restores.
VectorRef<Optional<MutationRef>> encryptedMutations;
Version read_snapshot = 0;
bool report_conflicting_keys = false;
bool lock_aware = false; // set when metadata mutations are present
@ -150,6 +150,21 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
return cipherKeys;
}

// Get the latest cipher key for a given encryption domain. It tries to get the cipher key from the local cache.
// In case of a cache miss, it fetches the cipher key from EncryptKeyProxy and puts the result in the local cache
// before returning.
ACTOR template <class T>
Future<Reference<BlobCipherKey>> getLatestEncryptCipherKey(Reference<AsyncVar<T> const> db,
EncryptCipherDomainId domainId,
EncryptCipherDomainName domainName,
BlobCipherMetrics::UsageType usageType) {
std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> domains({ { domainId, domainName } });
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKey =
wait(getLatestEncryptCipherKeys(db, domains, usageType));

return cipherKey.at(domainId);
}

ACTOR template <class T>
Future<EKPGetBaseCipherKeysByIdsReply> getUncachedEncryptCipherKeys(Reference<AsyncVar<T> const> db,
EKPGetBaseCipherKeysByIdsRequest request) {
@ -204,7 +204,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
Optional<Version> version);
int64_t version);

// Tenant
fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);
@ -938,6 +938,7 @@ public:
double BLOB_WORKER_BATCH_GRV_INTERVAL;
bool BLOB_WORKER_DO_REJECT_WHEN_FULL;
double BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD;
double BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY;

double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX;
@ -21,12 +21,17 @@
#ifndef _WIN32
#include <algorithm>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <fmt/format.h>
#include <limits>
#include <unistd.h>
#include <string_view>
#include <signal.h>
#include <sys/wait.h>
#include <type_traits>
#include "flow/Arena.h"
#include "flow/Error.h"
#include "flow/MkCert.h"
#include "flow/ScopeExit.h"
#include "flow/TLSConfig.actor.h"
@ -36,46 +41,64 @@

std::FILE* outp = stdout;

enum ChainLength : int {
NO_TLS = std::numeric_limits<int>::min(),
};

template <class... Args>
void log(Args&&... args) {
void logRaw(Args&&... args) {
auto buf = fmt::memory_buffer{};
fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
fmt::print(outp, "{}", std::string_view(buf.data(), buf.size()));
}

template <class... Args>
void logWithPrefix(const char* prefix, Args&&... args) {
auto buf = fmt::memory_buffer{};
fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
fmt::print(outp, "{}{}\n", prefix, std::string_view(buf.data(), buf.size()));
}

template <class... Args>
void logc(Args&&... args) {
auto buf = fmt::memory_buffer{};
fmt::format_to(std::back_inserter(buf), "[CLIENT] ");
fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
logWithPrefix("[CLIENT] ", std::forward<Args>(args)...);
}

template <class... Args>
void logs(Args&&... args) {
auto buf = fmt::memory_buffer{};
fmt::format_to(std::back_inserter(buf), "[SERVER] ");
fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
logWithPrefix("[SERVER] ", std::forward<Args>(args)...);
}

template <class... Args>
void logm(Args&&... args) {
auto buf = fmt::memory_buffer{};
fmt::format_to(std::back_inserter(buf), "[ MAIN ] ");
fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
logWithPrefix("[ MAIN ] ", std::forward<Args>(args)...);
}

std::string drainPipe(int pipeFd) {
int readRc = 0;
std::string ret;
char buf[PIPE_BUF];
while ((readRc = ::read(pipeFd, buf, PIPE_BUF)) > 0) {
ret.append(buf, readRc);
}
if (readRc != 0) {
logm("Unexpected error draining pipe: {}", strerror(errno));
throw std::runtime_error("pipe read error");
}
return ret;
}

struct TLSCreds {
bool noTls = false;
std::string certBytes;
std::string keyBytes;
std::string caBytes;
};

TLSCreds makeCreds(int chainLen, mkcert::ESide side) {
if (chainLen == 0)
return {};
TLSCreds makeCreds(ChainLength chainLen, mkcert::ESide side) {
if (chainLen == 0 || chainLen == NO_TLS) {
return TLSCreds{ chainLen == NO_TLS, "", "", "" };
}
auto arena = Arena();
auto ret = TLSCreds{};
auto specs = mkcert::makeCertChainSpec(arena, std::labs(chainLen), side);
@ -97,9 +120,10 @@ TLSCreds makeCreds(int chainLen, mkcert::ESide side) {
}

enum class Result : int {
TRUSTED = 0,
ERROR = 0,
TRUSTED,
UNTRUSTED,
ERROR,
TIMEOUT,
};

template <>
@ -112,11 +136,43 @@ struct fmt::formatter<Result> {
|
|||
return fmt::format_to(ctx.out(), "TRUSTED");
|
||||
else if (r == Result::UNTRUSTED)
|
||||
return fmt::format_to(ctx.out(), "UNTRUSTED");
|
||||
else if (r == Result::TIMEOUT)
|
||||
return fmt::format_to(ctx.out(), "TIMEOUT");
|
||||
else
|
||||
return fmt::format_to(ctx.out(), "ERROR");
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct fmt::formatter<ChainLength> {
|
||||
constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { return ctx.begin(); }
|
||||
|
||||
template <class FormatContext>
|
||||
auto format(ChainLength value, FormatContext& ctx) -> decltype(ctx.out()) {
|
||||
if (value == NO_TLS)
|
||||
return fmt::format_to(ctx.out(), "NO_TLS");
|
||||
else
|
||||
return fmt::format_to(ctx.out(), "{}", static_cast<std::underlying_type_t<ChainLength>>(value));
|
||||
}
|
||||
};
|
||||

template <>
struct fmt::formatter<std::vector<std::pair<ChainLength, ChainLength>>> {
	constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { return ctx.begin(); }

	template <class FormatContext>
	auto format(const std::vector<std::pair<ChainLength, ChainLength>>& entries, FormatContext& ctx)
	    -> decltype(ctx.out()) {
		fmt::format_to(ctx.out(), "[");
		bool first = true;
		for (const auto& entry : entries) {
			fmt::format_to(ctx.out(), "{}{{ {}, {} }}", (first ? "" : ", "), entry.first, entry.second);
			first = false;
		}
		return fmt::format_to(ctx.out(), "]");
	}
};

ACTOR template <class T>
Future<T> stopNetworkAfter(Future<T> what) {
	T t = wait(what);

@@ -165,25 +221,27 @@ struct SessionProbeReceiver final : NetworkMessageReceiver {
	bool isPublic() const override { return true; }
};

-Future<Void> runServer(Future<Void> listenFuture, const Endpoint& endpoint, int addrPipe, int completionPipe) {
+void runServer(const Endpoint& endpoint, int addrPipe, int completionPipe) {
	auto realAddr = FlowTransport::transport().getLocalAddresses().address;
	logs("Listening at {}", realAddr.toString());
	logs("Endpoint token is {}", endpoint.token.toString());
	static_assert(std::is_trivially_destructible_v<NetworkAddress>,
	              "NetworkAddress cannot be directly put on wire; need proper (de-)serialization");
	// below writes/reads would block, but this is good enough for a test.
	if (sizeof(realAddr) != ::write(addrPipe, &realAddr, sizeof(realAddr))) {
		logs("Failed to write server addr to pipe: {}", strerror(errno));
-		return Void();
+		return;
	}
	if (sizeof(endpoint.token) != ::write(addrPipe, &endpoint.token, sizeof(endpoint.token))) {
		logs("Failed to write server endpoint to pipe: {}", strerror(errno));
-		return Void();
+		return;
	}
	auto done = false;
	if (sizeof(done) != ::read(completionPipe, &done, sizeof(done))) {
		logs("Failed to read completion flag from pipe: {}", strerror(errno));
-		return Void();
+		return;
	}
-	return Void();
+	return;
}

ACTOR Future<Void> waitAndPrintResponse(Future<SessionInfo> response, Result* rc) {
@@ -192,8 +250,13 @@ ACTOR Future<Void> waitAndPrintResponse(Future<SessionInfo> response, Result* rc
		logc("Probe response: trusted={} peerAddress={}", info.isPeerTrusted, info.peerAddress.toString());
		*rc = info.isPeerTrusted ? Result::TRUSTED : Result::UNTRUSTED;
	} catch (Error& err) {
-		logc("Error: {}", err.what());
-		*rc = Result::ERROR;
+		if (err.code() != error_code_operation_cancelled) {
+			logc("Unexpected error: {}", err.what());
+			*rc = Result::ERROR;
+		} else {
+			logc("Timed out");
+			*rc = Result::TIMEOUT;
+		}
	}
	return Void();
}

@@ -201,9 +264,12 @@ ACTOR Future<Void> waitAndPrintResponse(Future<SessionInfo> response, Result* rc
template <bool IsServer>
int runHost(TLSCreds creds, int addrPipe, int completionPipe, Result expect) {
	auto tlsConfig = TLSConfig(IsServer ? TLSEndpointType::SERVER : TLSEndpointType::CLIENT);
-	tlsConfig.setCertificateBytes(creds.certBytes);
-	tlsConfig.setCABytes(creds.caBytes);
-	tlsConfig.setKeyBytes(creds.keyBytes);
+	bool const noTls = creds.noTls;
+	if (!noTls) {
+		tlsConfig.setCertificateBytes(creds.certBytes);
+		tlsConfig.setCABytes(creds.caBytes);
+		tlsConfig.setKeyBytes(creds.keyBytes);
+	}
	g_network = newNet2(tlsConfig);
	openTraceFile(NetworkAddress(),
	              10 << 20,
@@ -213,19 +279,21 @@ int runHost(TLSCreds creds, int addrPipe, int completionPipe, Result expect) {
	FlowTransport::createInstance(!IsServer, 1, WLTOKEN_RESERVED_COUNT);
	auto& transport = FlowTransport::transport();
	if constexpr (IsServer) {
-		auto addr = NetworkAddress::parse("127.0.0.1:0:tls");
+		auto addr = NetworkAddress::parse(noTls ? "127.0.0.1:0" : "127.0.0.1:0:tls");
-		auto endpoint = Endpoint();
-		auto receiver = SessionProbeReceiver();
-		auto listenFuture = transport.bind(addr, addr);
-		transport.addEndpoint(endpoint, &receiver, TaskPriority::ReadSocket);
		auto thread = std::thread([]() {
			g_network->run();
+			flushTraceFileVoid();
		});
+		auto endpoint = Endpoint();
+		auto receiver = SessionProbeReceiver();
+		transport.addEndpoint(endpoint, &receiver, TaskPriority::ReadSocket);
-		runServer(transport.bind(addr, addr), endpoint, addrPipe, completionPipe);
+		runServer(endpoint, addrPipe, completionPipe);
		auto cleanupGuard = ScopeExit([&thread]() {
			g_network->stop();
			thread.join();
		});
		return 0;
	} else {
		auto dest = Endpoint();
		auto& serverAddr = dest.addresses.address;
@@ -233,26 +301,27 @@ int runHost(TLSCreds creds, int addrPipe, int completionPipe, Result expect) {
			logc("Failed to read server addr from pipe: {}", strerror(errno));
			return 1;
		}
+		if (noTls)
+			serverAddr.flags &= ~NetworkAddress::FLAG_TLS;
+		else
+			serverAddr.flags |= NetworkAddress::FLAG_TLS;
		auto& token = dest.token;
		if (sizeof(token) != ::read(addrPipe, &token, sizeof(token))) {
			logc("Failed to read server endpoint token from pipe: {}", strerror(errno));
			return 2;
		}
-		logc("Server address is {}", serverAddr.toString());
+		logc("Server address is {}{}", serverAddr.toString(), noTls ? " (TLS suffix removed)" : "");
		logc("Server endpoint token is {}", token.toString());
		auto sessionProbeReq = SessionProbeRequest{};
		transport.sendUnreliable(SerializeSource(sessionProbeReq), dest, true /*openConnection*/);
		logc("Request is sent");
-		auto probeResponse = sessionProbeReq.reply.getFuture();
-		auto result = Result::TRUSTED;
-		auto timeout = delay(5);
-		auto complete = waitAndPrintResponse(probeResponse, &result);
-		auto f = stopNetworkAfter(complete || timeout);
-		auto rc = 0;
-		g_network->run();
-		if (!complete.isReady()) {
-			logc("Error: Probe request timed out");
-			rc = 3;
+		auto result = Result::ERROR;
+		{
+			auto timeout = delay(expect == Result::TIMEOUT ? 0.5 : 5);
+			auto complete = waitAndPrintResponse(sessionProbeReq.reply.getFuture(), &result);
+			auto f = stopNetworkAfter(complete || timeout);
+			g_network->run();
+		}
		auto done = true;
		if (sizeof(done) != ::write(completionPipe, &done, sizeof(done))) {
@@ -269,19 +338,48 @@ int runHost(TLSCreds creds, int addrPipe, int completionPipe, Result expect) {
		}
-		return rc;
	}
+	return 0;
}

-int runTlsTest(int serverChainLen, int clientChainLen) {
-	log("==== BEGIN TESTCASE ====");
+Result getExpectedResult(ChainLength serverChainLen, ChainLength clientChainLen) {
	auto expect = Result::ERROR;
	if (serverChainLen > 0) {
-		if (clientChainLen > 0)
+		if (clientChainLen == NO_TLS || clientChainLen < 0) {
+			expect = Result::TIMEOUT;
+		} else if (clientChainLen > 0) {
			expect = Result::TRUSTED;
-		else if (clientChainLen == 0)
+		} else if (clientChainLen == 0) {
			expect = Result::UNTRUSTED;
+		}
+	} else if (serverChainLen == NO_TLS && clientChainLen == NO_TLS) {
+		expect = Result::TRUSTED;
+	} else {
+		expect = Result::TIMEOUT;
	}
-	log("Cert chain length: server={} client={}", serverChainLen, clientChainLen);
	return expect;
}
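// Not part of the patch: hypothetical spot-checks of the trust matrix encoded by
// getExpectedResult() above, following its branches as written:
//
//     getExpectedResult(ChainLength(3), ChainLength(2)); // TRUSTED   - both sides present certs
//     getExpectedResult(ChainLength(1), ChainLength(0)); // UNTRUSTED - client sends no cert
//     getExpectedResult(ChainLength(1), NO_TLS);         // TIMEOUT   - TLS/plaintext mismatch
//     getExpectedResult(NO_TLS, NO_TLS);                 // TRUSTED   - plaintext on both sides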

bool waitPid(pid_t subProcPid, const char* procName) {
	auto status = int{};
	auto pid = ::waitpid(subProcPid, &status, 0);
	if (pid < 0) {
		logm("{} subprocess waitpid() failed with {}", procName, strerror(errno));
		return false;
	} else {
		if (status != 0) {
			logm("{} subprocess had error: rc={}", procName, status);
			return false;
		} else {
			logm("{} subprocess waitpid() OK", procName);
			return true;
		}
	}
}

+int runTlsTest(ChainLength serverChainLen, ChainLength clientChainLen) {
+	logm("==== BEGIN TESTCASE ====");
+	auto const expect = getExpectedResult(serverChainLen, clientChainLen);
	using namespace std::literals::string_literals;
+	logm("Cert chain length: server={} client={}", serverChainLen, clientChainLen);
	auto arena = Arena();
	auto serverCreds = makeCreds(serverChainLen, mkcert::ESide::Server);
	auto clientCreds = makeCreds(clientChainLen, mkcert::ESide::Client);
@@ -289,65 +387,123 @@ int runTlsTest(int serverChainLen, int clientChainLen) {
	std::swap(serverCreds.caBytes, clientCreds.caBytes);
	auto clientPid = pid_t{};
	auto serverPid = pid_t{};
-	int addrPipe[2];
-	int completionPipe[2];
-	if (::pipe(addrPipe) || ::pipe(completionPipe)) {
+	int addrPipe[2], completionPipe[2], serverStdoutPipe[2], clientStdoutPipe[2];
+	if (::pipe(addrPipe) || ::pipe(completionPipe) || ::pipe(serverStdoutPipe) || ::pipe(clientStdoutPipe)) {
		logm("Pipe open failed: {}", strerror(errno));
		return 1;
	}
-	auto pipeCleanup = ScopeExit([&addrPipe, &completionPipe]() {
-		::close(addrPipe[0]);
-		::close(addrPipe[1]);
-		::close(completionPipe[0]);
-		::close(completionPipe[1]);
-	});
-	serverPid = fork();
-	if (serverPid == 0) {
-		_exit(runHost<true>(std::move(serverCreds), addrPipe[1], completionPipe[0], expect));
-	}
-	clientPid = fork();
-	if (clientPid == 0) {
-		_exit(runHost<false>(std::move(clientCreds), addrPipe[0], completionPipe[1], expect));
-	}
-	auto pid = pid_t{};
-	auto status = int{};
-	pid = waitpid(clientPid, &status, 0);
	auto ok = true;
-	if (pid < 0) {
-		logm("waitpid() for client failed with {}", strerror(errno));
-		ok = false;
-	} else {
-		if (status != 0) {
-			logm("Client error: rc={}", status);
-			ok = false;
-		} else {
-			logm("Client OK");
+	{
+		serverPid = fork();
+		if (serverPid == -1) {
+			logm("fork() for server subprocess failed: {}", strerror(errno));
+			return 1;
+		} else if (serverPid == 0) {
+			// server subprocess
+			::close(addrPipe[0]); // close address-in pipe (server writes its own address for client)
+			::close(
+			    completionPipe[1]); // close completion-flag-out pipe (server awaits/reads completion flag from client)
+			::close(clientStdoutPipe[0]);
+			::close(clientStdoutPipe[1]);
+			::close(serverStdoutPipe[0]);
+			auto pipeCleanup = ScopeExit([&addrPipe, &completionPipe]() {
+				::close(addrPipe[1]);
+				::close(completionPipe[0]);
+			});
+			if (-1 == ::dup2(serverStdoutPipe[1], STDOUT_FILENO)) {
+				logs("Failed to redirect server stdout to pipe: {}", strerror(errno));
+				::close(serverStdoutPipe[1]);
+				return 1;
+			}
+			_exit(runHost<true>(std::move(serverCreds), addrPipe[1], completionPipe[0], expect));
		}
	}
-	pid = waitpid(serverPid, &status, 0);
-	if (pid < 0) {
-		logm("waitpid() for server failed with {}", strerror(errno));
-		ok = false;
-	} else {
-		if (status != 0) {
-			logm("Server error: rc={}", status);
-			ok = false;
-		} else {
-			logm("Server OK");
+	auto serverProcCleanup = ScopeExit([&ok, serverPid]() {
+		if (!waitPid(serverPid, "Server"))
+			ok = false;
+	});
+	clientPid = fork();
+	if (clientPid == -1) {
+		logm("fork() for client subprocess failed: {}", strerror(errno));
+		return 1;
+	} else if (clientPid == 0) {
+		::close(addrPipe[1]);
+		::close(completionPipe[0]);
+		::close(serverStdoutPipe[0]);
+		::close(serverStdoutPipe[1]);
+		::close(clientStdoutPipe[0]);
+		auto pipeCleanup = ScopeExit([&addrPipe, &completionPipe]() {
+			::close(addrPipe[0]);
+			::close(completionPipe[1]);
+		});
+		if (-1 == ::dup2(clientStdoutPipe[1], STDOUT_FILENO)) {
+			logs("Failed to redirect client stdout to pipe: {}", strerror(errno));
+			::close(clientStdoutPipe[1]);
+			return 1;
		}
+		_exit(runHost<false>(std::move(clientCreds), addrPipe[0], completionPipe[1], expect));
	}
+	auto clientProcCleanup = ScopeExit([&ok, clientPid]() {
+		if (!waitPid(clientPid, "Client"))
+			ok = false;
+	});
-	log(ok ? "OK" : "FAILED");
-	return 0;
+	// main process
+	::close(addrPipe[0]);
+	::close(addrPipe[1]);
+	::close(completionPipe[0]);
+	::close(completionPipe[1]);
+	::close(serverStdoutPipe[1]);
+	::close(clientStdoutPipe[1]);
+	auto pipeCleanup = ScopeExit([&]() {
+		::close(serverStdoutPipe[0]);
+		::close(clientStdoutPipe[0]);
+	});
+	std::string const clientStdout = drainPipe(clientStdoutPipe[0]);
+	logm("/// Begin Client STDOUT ///");
+	logRaw(clientStdout);
+	logm("/// End Client STDOUT ///");
+	std::string const serverStdout = drainPipe(serverStdoutPipe[0]);
+	logm("/// Begin Server STDOUT ///");
+	logRaw(serverStdout);
+	logm("/// End Server STDOUT ///");
+	logm(ok ? "OK" : "FAILED");
+	return !ok;
}

-int main() {
-	std::pair<int, int> inputs[] = { { 3, 2 }, { 4, 0 }, { 1, 3 }, { 1, 0 }, { 2, 0 }, { 3, 3 }, { 3, 0 } };
+int main(int argc, char** argv) {
+	unsigned seed = std::time(nullptr);
+	if (argc > 1)
+		seed = std::stoul(argv[1]);
+	std::srand(seed);
+	logm("Seed: {}", seed);
+	auto categoryToValue = [](int category) -> ChainLength {
+		if (category == 2 || category == -2) {
+			return static_cast<ChainLength>(category + std::rand() % 3);
+		} else {
+			return static_cast<ChainLength>(category);
+		}
+	};
+	std::vector<std::pair<ChainLength, ChainLength>> inputs;
+	std::vector<int> categories{ 0, NO_TLS, 1, -1, 2, -2 };
+	for (auto lhs : categories) {
+		for (auto rhs : categories) {
+			auto input = std::pair(categoryToValue(lhs), categoryToValue(rhs));
+			inputs.push_back(input);
+		}
+	}
+	std::vector<std::pair<ChainLength, ChainLength>> failed;
	for (auto input : inputs) {
		auto [serverChainLen, clientChainLen] = input;
-		if (auto rc = runTlsTest(serverChainLen, clientChainLen))
-			return rc;
+		if (runTlsTest(serverChainLen, clientChainLen))
+			failed.push_back({ serverChainLen, clientChainLen });
	}
+	if (!failed.empty()) {
+		logm("Test Failed: {}/{} cases: {}", failed.size(), inputs.size(), failed);
+		return 1;
+	} else {
+		logm("Test OK: {}/{} cases passed", inputs.size(), inputs.size());
+		return 0;
+	}
-	return 0;
}
#else // _WIN32

@@ -1854,6 +1854,30 @@ ACTOR Future<bool> checkFileNotFoundForcePurgeRace(Reference<BlobWorkerData> bwD
	}
}

+// Does a force flush after consuming all data post-split from the parent granule's old change feed.
+// This guarantees that all of the data from the old change feed gets persisted to blob storage, and the old feed can be
+// cleaned up. This is particularly necessary for sequential workloads, where only one child granule after the split has
+// new writes. Adds a delay so that, if the granule is not write-cold and would have written a delta file soon anyway,
+// this does not add any extra overhead.
+ACTOR Future<Void> forceFlushCleanup(Reference<BlobWorkerData> bwData, Reference<GranuleMetadata> metadata, Version v) {
+	double cleanupDelay = SERVER_KNOBS->BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY;
+	if (cleanupDelay < 0) {
+		return Void();
+	}
+	wait(delay(cleanupDelay));
+	if (metadata->forceFlushVersion.get() < v && metadata->pendingDeltaVersion < v) {
+		metadata->forceFlushVersion.set(v);
+		++bwData->stats.forceFlushCleanups;
+		if (BW_DEBUG) {
+			fmt::print("Granule [{0} - {1}) forcing flush cleanup @ {2}\n",
+			           metadata->keyRange.begin.printable(),
+			           metadata->keyRange.end.printable(),
+			           v);
+		}
+	}
+	return Void();
+}
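// Not part of the patch: BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY gates the whole
// mechanism. A negative knob value disables cleanup entirely; otherwise the actor
// sleeps for the delay and only forces a flush if no delta file has already
// covered version v in the meantime. Intended call pattern, as it appears later
// in this diff right after the granule switches off the parent feed:
//
//     forceFlushCleanupFuture = forceFlushCleanup(bwData, metadata, metadata->bufferedDeltaVersion);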

// updater for a single granule
// TODO: this is getting kind of large. Should try to split out this actor if it continues to grow?
ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,

@@ -1866,9 +1890,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	state Future<Void> oldChangeFeedFuture;
	state Future<Void> changeFeedFuture;
-	state Future<Void> checkMergeCandidate;
+	state Future<Void> forceFlushCleanupFuture;
	state GranuleStartState startState;
	state bool readOldChangeFeed;
	state Optional<std::pair<KeyRange, UID>> oldChangeFeedDataComplete;
	state Key cfKey;
	state Optional<Key> oldCFKey;
	state int pendingSnapshots = 0;

@@ -1990,7 +2014,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,

	Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>(bwData->db.getPtr());

-	if (startState.splitParentGranule.present() && startVersion < startState.changeFeedStartVersion) {
+	if (startState.splitParentGranule.present() && startVersion + 1 < startState.changeFeedStartVersion) {
		// read from parent change feed up until our new change feed is started
		// Required to have canReadPopped = false, otherwise another granule can take over the change feed,
		// and pop it. That could cause this worker to think it has the full correct set of data if it then

@@ -2017,7 +2041,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	                                        false);
	// in case previous worker died before popping the latest version, start another pop
	if (startState.previousDurableVersion != invalidVersion) {
-		ASSERT(startState.previousDurableVersion >= startState.changeFeedStartVersion);
+		ASSERT(startState.previousDurableVersion + 1 >= startState.changeFeedStartVersion);
		Future<Void> popFuture =
		    bwData->db->popChangeFeedMutations(cfKey, startState.previousDurableVersion + 1);
		inFlightPops.push_back(popFuture);
@@ -2138,9 +2162,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	ASSERT(readOldChangeFeed);

	readOldChangeFeed = false;
-	// set this so next delta file write updates granule split metadata to done
-	ASSERT(startState.splitParentGranule.present());
-	oldChangeFeedDataComplete = startState.splitParentGranule.get();
	if (BW_DEBUG) {
		fmt::print("Granule [{0} - {1}) switching to new change feed {2} @ {3}, {4}\n",
		           metadata->keyRange.begin.printable(),

@@ -2152,6 +2173,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	ASSERT(metadata->bufferedDeltaVersion <= metadata->activeCFData.get()->getVersion());
	// update this for change feed popped detection
	metadata->bufferedDeltaVersion = metadata->activeCFData.get()->getVersion();
+	forceFlushCleanupFuture = forceFlushCleanup(bwData, metadata, metadata->bufferedDeltaVersion);

	Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>(bwData->db.getPtr());

@@ -2267,7 +2289,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	Reference<ChangeFeedData> cfData =
	    makeReference<ChangeFeedData>(bwData->db.getPtr());

-	if (!readOldChangeFeed && cfRollbackVersion < startState.changeFeedStartVersion) {
+	if (!readOldChangeFeed &&
+	    cfRollbackVersion + 1 < startState.changeFeedStartVersion) {
		// It isn't possible to roll back across the parent/child feed boundary,
		// but as part of rolling back we may need to cancel in-flight delta
		// files, and those delta files may include stuff from before the

@@ -2277,11 +2300,11 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
		ASSERT(cfRollbackVersion >= metadata->durableDeltaVersion.get());
		CODE_PROBE(true, "rollback crossed change feed boundaries");
		readOldChangeFeed = true;
		oldChangeFeedDataComplete.reset();
+		forceFlushCleanupFuture = Never();
	}

	if (readOldChangeFeed) {
-		ASSERT(cfRollbackVersion < startState.changeFeedStartVersion);
+		ASSERT(cfRollbackVersion + 1 < startState.changeFeedStartVersion);
		ASSERT(oldCFKey.present());
		oldChangeFeedFuture =
		    bwData->db->getChangeFeedStream(cfData,

@@ -2293,12 +2316,12 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
		                                    false);

	} else {
-		if (cfRollbackVersion < startState.changeFeedStartVersion) {
+		if (cfRollbackVersion + 1 < startState.changeFeedStartVersion) {
			fmt::print("Rollback past CF start??. rollback={0}, start={1}\n",
			           cfRollbackVersion,
			           startState.changeFeedStartVersion);
		}
-		ASSERT(cfRollbackVersion >= startState.changeFeedStartVersion);
+		ASSERT(cfRollbackVersion + 1 >= startState.changeFeedStartVersion);

		changeFeedFuture =
		    bwData->db->getChangeFeedStream(cfData,
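// Not part of the patch: a worked example of the "+ 1" boundary used above. With
// changeFeedStartVersion = 100, a rollback to version 99 no longer counts as
// crossing the feed boundary: 99 + 1 < 100 is false, so the granule can stay on
// its own feed, whose first possible mutation version is 100. Only a rollback to
// version 98 or earlier (98 + 1 < 100) forces a switch back to the parent feed.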
@@ -2406,6 +2429,21 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	CODE_PROBE(true, "Force flushing empty delta file!");
}

+// launch pipelined, but wait for previous operation to complete before persisting to FDB
+Future<BlobFileIndex> previousFuture;
+if (!inFlightFiles.empty()) {
+	previousFuture = inFlightFiles.back().future;
+} else {
+	previousFuture = Future<BlobFileIndex>(BlobFileIndex());
+}
+
+Optional<std::pair<KeyRange, UID>> oldChangeFeedDataComplete;
+if (startState.splitParentGranule.present() &&
+    metadata->pendingDeltaVersion < startState.changeFeedStartVersion &&
+    lastDeltaVersion >= startState.changeFeedStartVersion) {
+	oldChangeFeedDataComplete = startState.splitParentGranule.get();
+}
+
if (BW_DEBUG) {
	fmt::print("Granule [{0} - {1}) flushing delta file after {2} bytes @ {3} {4}\n",
	           metadata->keyRange.begin.printable(),

@@ -2415,13 +2453,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	           oldChangeFeedDataComplete.present() ? ". Finalizing " : "");
}

-// launch pipelined, but wait for previous operation to complete before persisting to FDB
-Future<BlobFileIndex> previousFuture;
-if (!inFlightFiles.empty()) {
-	previousFuture = inFlightFiles.back().future;
-} else {
-	previousFuture = Future<BlobFileIndex>(BlobFileIndex());
-}
startDeltaFileWrite = bwData->deltaWritesLock->take();
Future<BlobFileIndex> dfFuture =
    writeDeltaFile(bwData,

@@ -2438,7 +2469,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
	               startDeltaFileWrite);
inFlightFiles.push_back(InFlightFile(dfFuture, lastDeltaVersion, metadata->bufferedDeltaBytes, false));

-oldChangeFeedDataComplete.reset();
// add new pending delta file
ASSERT(metadata->pendingDeltaVersion < lastDeltaVersion);
metadata->pendingDeltaVersion = lastDeltaVersion;
@@ -50,6 +50,7 @@
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/ServerDBInfo.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"

@@ -61,6 +62,7 @@
#include "fdbclient/Tracing.h"

#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/network.h"

ACTOR Future<Void> broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply) {
	state ReplyPromise<Void> reply = req.reply;

@@ -1009,10 +1011,17 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
				ASSERT(tenantName.present());
				encryptDomains[tenantId] = Standalone(tenantName.get(), tenantInfo.arena);
			} else {
-				for (auto m : trs[t].transaction.mutations) {
-					std::pair<EncryptCipherDomainName, int64_t> details =
-					    getEncryptDetailsFromMutationRef(pProxyCommitData, m);
-					encryptDomains[details.second] = details.first;
+				// Optimization: avoid enumerating mutations if cluster only serves default encryption domains
+				if (pProxyCommitData->tenantMap.size() > 0) {
+					for (auto m : trs[t].transaction.mutations) {
+						std::pair<EncryptCipherDomainName, int64_t> details =
+						    getEncryptDetailsFromMutationRef(pProxyCommitData, m);
+						encryptDomains[details.second] = details.first;
+					}
+				} else {
+					// Ensure default encryption domain-ids are present.
+					ASSERT_EQ(encryptDomains.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID), 1);
+					ASSERT_EQ(encryptDomains.count(FDB_DEFAULT_ENCRYPT_DOMAIN_ID), 1);
				}
			}
@@ -1240,25 +1249,54 @@ ACTOR Future<Void> applyMetadataToCommittedTransactions(CommitBatchContext* self
	return Void();
}

-void writeMutation(CommitBatchContext* self, int64_t tenantId, const MutationRef& mutation) {
+ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
+                                        int64_t tenantId,
+                                        const MutationRef* mutation,
+                                        Optional<MutationRef>* encryptedMutationOpt,
+                                        Arena* arena) {
	static_assert(TenantInfo::INVALID_TENANT == INVALID_ENCRYPT_DOMAIN_ID);

	if (self->pProxyCommitData->isEncryptionEnabled) {
-		EncryptCipherDomainId domainId = tenantId;
-		if (domainId == INVALID_ENCRYPT_DOMAIN_ID) {
-			std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p =
-			    getEncryptDetailsFromMutationRef(self->pProxyCommitData, mutation);
-			domainId = p.second;
+		state EncryptCipherDomainId domainId = tenantId;
+		state MutationRef encryptedMutation;

-			CODE_PROBE(true, "Raw access mutation encryption");
+		if (encryptedMutationOpt->present()) {
+			CODE_PROBE(true, "using already encrypted mutation");
+			encryptedMutation = encryptedMutationOpt->get();
+			ASSERT(encryptedMutation.isEncrypted());
+			// During simulation check whether the encrypted mutation matches the decrypted mutation
+			if (g_network && g_network->isSimulated()) {
+				Reference<AsyncVar<ServerDBInfo> const> dbInfo = self->pProxyCommitData->db;
+				state const BlobCipherEncryptHeader* header = encryptedMutation.encryptionHeader();
+				TextAndHeaderCipherKeys cipherKeys =
+				    wait(getEncryptCipherKeys(dbInfo, *header, BlobCipherMetrics::TLOG));
+				MutationRef decryptedMutation = encryptedMutation.decrypt(cipherKeys, *arena, BlobCipherMetrics::TLOG);
+				ASSERT(decryptedMutation.param1 == mutation->param1 && decryptedMutation.param2 == mutation->param2 &&
+				       decryptedMutation.type == mutation->type);
+			}
+		} else {
+			if (domainId == INVALID_ENCRYPT_DOMAIN_ID) {
+				std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p =
+				    getEncryptDetailsFromMutationRef(self->pProxyCommitData, *mutation);
+				domainId = p.second;

+				if (self->cipherKeys.find(domainId) == self->cipherKeys.end()) {
+					Reference<BlobCipherKey> cipherKey = wait(getLatestEncryptCipherKey(
+					    self->pProxyCommitData->db, domainId, p.first, BlobCipherMetrics::TLOG));
+					self->cipherKeys[domainId] = cipherKey;
+				}

+				CODE_PROBE(true, "Raw access mutation encryption");
+			}

+			ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);
+			encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG);
+		}

-		ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);

-		Arena arena;
-		self->toCommit.writeTypedMessage(mutation.encrypt(self->cipherKeys, domainId, arena, BlobCipherMetrics::TLOG));
+		self->toCommit.writeTypedMessage(encryptedMutation);
+		return encryptedMutation;
	} else {
-		self->toCommit.writeTypedMessage(mutation);
+		self->toCommit.writeTypedMessage(*mutation);
+		return *mutation;
	}
}

@@ -1278,6 +1316,9 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
	state Optional<ClientTrCommitCostEstimation>* trCost = &trs[self->transactionNum].commitCostEstimation;
	state int mutationNum = 0;
	state VectorRef<MutationRef>* pMutations = &trs[self->transactionNum].transaction.mutations;
+	state VectorRef<Optional<MutationRef>>* encryptedMutations =
+	    &trs[self->transactionNum].transaction.encryptedMutations;
+	ASSERT(encryptedMutations->size() == 0 || encryptedMutations->size() == pMutations->size());
	state int64_t tenantId = trs[self->transactionNum].tenantInfo.tenantId;

	self->toCommit.addTransactionInfo(trs[self->transactionNum].spanContext);

@@ -1292,13 +1333,17 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
		}
	}

-	auto& m = (*pMutations)[mutationNum];
+	state MutationRef m = (*pMutations)[mutationNum];
+	state Optional<MutationRef> encryptedMutation =
+	    encryptedMutations->size() > 0 ? (*encryptedMutations)[mutationNum] : Optional<MutationRef>();
+	state Arena arena;
+	state MutationRef writtenMutation;
	self->mutationCount++;
	self->mutationBytes += m.expectedSize();
	self->yieldBytes += m.expectedSize();
+	ASSERT(!m.isEncrypted());
	// Determine the set of tags (responsible storage servers) for the mutation, splitting it
	// if necessary. Serialize (splits of) the mutation into the message buffer and add the tags.

	if (isSingleKeyMutation((MutationRef::Type)m.type)) {
		auto& tags = pProxyCommitData->tagsForKey(m.param1);
@@ -1336,7 +1381,11 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
		if (pProxyCommitData->cacheInfo[m.param1]) {
			self->toCommit.addTag(cacheTag);
		}
-		writeMutation(self, tenantId, m);
+		if (encryptedMutation.present()) {
+			ASSERT(encryptedMutation.get().isEncrypted());
+		}
+		MutationRef tempMutation = wait(writeMutation(self, tenantId, &m, &encryptedMutation, &arena));
+		writtenMutation = tempMutation;
	} else if (m.type == MutationRef::ClearRange) {
		KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2));
		auto ranges = pProxyCommitData->keyInfo.intersectingRanges(clearRange);

@@ -1389,7 +1438,8 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
		if (pProxyCommitData->needsCacheTag(clearRange)) {
			self->toCommit.addTag(cacheTag);
		}
-		writeMutation(self, tenantId, m);
+		MutationRef tempMutation = wait(writeMutation(self, tenantId, &m, &encryptedMutation, &arena));
+		writtenMutation = tempMutation;
	} else {
		UNREACHABLE();
	}

@@ -1403,7 +1453,9 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
	if (m.type != MutationRef::Type::ClearRange) {
		// Add the mutation to the relevant backup tag
		for (auto backupName : pProxyCommitData->vecBackupKeys[m.param1]) {
-			self->logRangeMutations[backupName].push_back_deep(self->logRangeMutationsArena, m);
+			// If encryption is enabled make sure the mutation we are writing is also encrypted
+			ASSERT(!self->pProxyCommitData->isEncryptionEnabled || writtenMutation.isEncrypted());
+			self->logRangeMutations[backupName].push_back_deep(self->logRangeMutationsArena, writtenMutation);
		}
	} else {
		KeyRangeRef mutationRange(m.param1, m.param2);

@@ -1421,6 +1473,21 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
			MutationRef backupMutation(
			    MutationRef::Type::ClearRange, intersectionRange.begin, intersectionRange.end);

+			// TODO (Nim): Currently clear ranges are encrypted using the default encryption key, this must be
+			// changed to account for clear ranges which span tenant boundaries
+			if (self->pProxyCommitData->isEncryptionEnabled) {
+				if (backupMutation.param1 == m.param1 && backupMutation.param2 == m.param2 &&
+				    encryptedMutation.present()) {
+					backupMutation = encryptedMutation.get();
+				} else {
+					std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p =
+					    getEncryptDetailsFromMutationRef(self->pProxyCommitData, backupMutation);
+					EncryptCipherDomainId domainId = p.second;
+					backupMutation =
+					    backupMutation.encrypt(self->cipherKeys, domainId, arena, BlobCipherMetrics::BACKUP);
+				}
+			}

			// Add the mutation to the relevant backup tag
			for (auto backupName : backupRange.value()) {
				self->logRangeMutations[backupName].push_back_deep(self->logRangeMutationsArena,
@@ -1514,9 +1581,9 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
	pProxyCommitData->stats.mutations += self->mutationCount;
	pProxyCommitData->stats.mutationBytes += self->mutationBytes;

-	// Storage servers mustn't make durable versions which are not fully committed (because then they are impossible to
-	// roll back) We prevent this by limiting the number of versions which are semi-committed but not fully committed to
-	// be less than the MVCC window
+	// Storage servers mustn't make durable versions which are not fully committed (because then they are impossible
+	// to roll back) We prevent this by limiting the number of versions which are semi-committed but not fully
+	// committed to be less than the MVCC window
	if (pProxyCommitData->committedVersion.get() <
	    self->commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
		self->computeDuration += g_network->timer() - self->computeStart;

@@ -1715,8 +1782,8 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
	// After logging finishes, we report the commit version to master so that every other proxy can get the most
	// up-to-date live committed version. We also maintain the invariant that master's committed version >=
	// self->committedVersion by reporting commit version first before updating self->committedVersion. Otherwise, a
-	// client may get a commit version that the master is not aware of, and next GRV request may get a version less than
-	// self->committedVersion.
+	// client may get a commit version that the master is not aware of, and next GRV request may get a version less
+	// than self->committedVersion.

	CODE_PROBE(pProxyCommitData->committedVersion.get() > self->commitVersion,
	           "later version was reported committed first");

@@ -1775,11 +1842,11 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
	for (int resolverInd : self->transactionResolverMap[t]) {
		auto const& cKRs =
		    self->resolution[resolverInd]
-		        .conflictingKeyRangeMap[self->nextTr[resolverInd]]; // nextTr[resolverInd] -> index of this
-		                                                            // trs[t] on the resolver
+		        .conflictingKeyRangeMap[self->nextTr[resolverInd]]; // nextTr[resolverInd] -> index of
+		                                                            // this trs[t] on the resolver
		for (auto const& rCRIndex : cKRs)
-			// read_conflict_range can change when sent to resolvers, mapping the index from resolver-side
-			// to original index in commitTransactionRef
+			// read_conflict_range can change when sent to resolvers, mapping the index from
+			// resolver-side to original index in commitTransactionRef
			conflictingKRIndices.push_back(conflictingKRIndices.arena(),
			                               self->txReadConflictRangeIndexMap[t][resolverInd][rCRIndex]);
	}
@@ -264,7 +264,7 @@ rocksdb::ColumnFamilyOptions getCFOptions() {
		bbOpts.whole_key_filtering = false;
	}

-	if (rocksdb_block_cache == nullptr) {
+	if (rocksdb_block_cache == nullptr && SERVER_KNOBS->ROCKSDB_BLOCK_CACHE_SIZE > 0) {
		rocksdb_block_cache = rocksdb::NewLRUCache(SERVER_KNOBS->ROCKSDB_BLOCK_CACHE_SIZE);
	}
	bbOpts.block_cache = rocksdb_block_cache;
@@ -21,9 +21,12 @@
// This file implements the functions and actors used by the RestoreLoader role.
// The RestoreLoader role starts with the restoreLoaderCore actor

+#include "fdbclient/BlobCipher.h"
#include "flow/UnitTest.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BackupAgent.actor.h"
+#include "fdbclient/GetEncryptCipherKeys.actor.h"
+#include "fdbclient/DatabaseContext.h"
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/MutationTracking.h"

@@ -44,17 +47,19 @@ void splitMutation(const KeyRangeMap<UID>& krMap,
                   VectorRef<MutationRef>& mvector,
                   Arena& nodeIDs_arena,
                   VectorRef<UID>& nodeIDs);
-void _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
-                              std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
-                              SerializedMutationListMap* mutationMap,
-                              std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
-                              LoaderCounters* cc,
-                              const RestoreAsset& asset);
+ACTOR Future<Void> _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
+                                            std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
+                                            SerializedMutationListMap* mutationMap,
+                                            std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
+                                            LoaderCounters* cc,
+                                            RestoreAsset asset,
+                                            Database cx);

void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference<RestoreLoaderData> self);
-ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self);
+ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self, Database cx);
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
-                                              Reference<RestoreLoaderData> self);
+                                              Reference<RestoreLoaderData> self,
+                                              Database cx);
ACTOR Future<Void> sendMutationsToApplier(
    std::priority_queue<RestoreLoaderSchedSendLoadParamRequest>* sendLoadParamQueue,
    std::map<int, int>* inflightSendLoadParamReqs,

@@ -64,7 +69,8 @@ ACTOR Future<Void> sendMutationsToApplier(
    RestoreAsset asset,
    bool isRangeFile,
    std::map<Key, UID>* pRangeToApplier,
-    std::map<UID, RestoreApplierInterface>* pApplierInterfaces);
+    std::map<UID, RestoreApplierInterface>* pApplierInterfaces,
+    Database cx);
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
                                                           SerializedMutationListMap* mutationMap,
                                                           Reference<IBackupContainer> bc,
@@ -85,7 +91,7 @@ ACTOR Future<Void> handleFinishVersionBatchRequest(RestoreVersionBatchRequest re
// Dispatch requests based on node's busyness (i.e., cpu usage for now) and requests' priorities
// Requests for earlier version batches are preferred, which is equivalent to
// sendMutationsRequests being preferred over loadingFileRequests
-ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
+ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self, Database cx) {
	try {
		state int curVBInflightReqs = 0;
		state int sendLoadParams = 0;

@@ -139,7 +145,7 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
			// Dispatch the request if it is the next version batch to process or if cpu usage is low
			if (req.batchIndex - 1 == self->finishedSendingVB ||
			    self->cpuUsage < SERVER_KNOBS->FASTRESTORE_SCHED_TARGET_CPU_PERCENT) {
-				self->addActor.send(handleSendMutationsRequest(req, self));
+				self->addActor.send(handleSendMutationsRequest(req, self, cx));
				self->sendingQueue.pop();
			}
		}

@@ -204,7 +210,7 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
				self->loadingQueue.pop();
				ASSERT(false); // Check if this ever happens easily
			} else {
-				self->addActor.send(handleLoadFileRequest(req, self));
+				self->addActor.send(handleLoadFileRequest(req, self, cx));
				self->loadingQueue.pop();
				lastLoadReqs++;
			}

@@ -244,7 +250,7 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf,
	actors.add(updateProcessMetrics(self));
	actors.add(traceProcessMetrics(self, "RestoreLoader"));

-	self->addActor.send(dispatchRequests(self));
+	self->addActor.send(dispatchRequests(self, cx));

	loop {
		state std::string requestTypeStr = "[Init]";
@@ -361,6 +367,18 @@ void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference<Res
	req.reply.send(RestoreCommonReply(self->id()));
}

+ACTOR static Future<MutationRef> _decryptMutation(MutationRef mutation, Database cx, Arena* arena) {
+	ASSERT(mutation.isEncrypted());
+	Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
+	state const BlobCipherEncryptHeader* header = mutation.encryptionHeader();
+	std::unordered_set<BlobCipherDetails> cipherDetails;
+	cipherDetails.insert(header->cipherHeaderDetails);
+	cipherDetails.insert(header->cipherTextDetails);
+	std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> getCipherKeysResult =
+	    wait(getEncryptCipherKeys(dbInfo, cipherDetails, BlobCipherMetrics::BACKUP));
+	return mutation.decrypt(getCipherKeysResult, *arena, BlobCipherMetrics::BACKUP);
+}

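// Not part of the patch: every caller below follows the same shape when it hits
// an encrypted mutation. A minimal sketch using only names from this diff:
//
//     if (mutation.isEncrypted()) {
//         MutationRef decryptedMutation = wait(_decryptMutation(mutation, cx, &tempArena));
//         mutation = decryptedMutation;
//     }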
// Parse a data block in a partitioned mutation log file and store mutations
// into "kvOpsIter" and samples into "samplesIter".
ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(

@@ -370,7 +388,8 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
    std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
    LoaderCounters* cc,
    Reference<IBackupContainer> bc,
-    RestoreAsset asset) {
+    RestoreAsset asset,
+    Database cx) {
	state Standalone<StringRef> buf = makeString(asset.len);
	state Reference<IAsyncFile> file = wait(bc->readFile(asset.filename));
	int rLen = wait(file->read(mutateString(buf), asset.len, asset.offset));
@@ -389,21 +408,21 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
	wait(processedFileOffset->whenAtLeast(asset.offset));
	ASSERT(processedFileOffset->get() == asset.offset);

-	Arena tempArena;
-	StringRefReader reader(buf, restore_corrupted_data());
+	state Arena tempArena;
+	state StringRefReader reader(buf, restore_corrupted_data());
	try {
		// Read block header
		if (reader.consume<int32_t>() != PARTITIONED_MLOG_VERSION)
			throw restore_unsupported_file_version();

-		VersionedMutationsMap& kvOps = kvOpsIter->second;
+		state VersionedMutationsMap* kvOps = &kvOpsIter->second;
		while (1) {
			// If eof reached or first key len bytes is 0xFF then end of block was reached.
			if (reader.eof() || *reader.rptr == 0xFF)
				break;

			// Deserialize messages written in saveMutationsToFile().
-			LogMessageVersion msgVersion;
+			state LogMessageVersion msgVersion;
			msgVersion.version = reader.consumeNetworkUInt64();
			msgVersion.sub = reader.consumeNetworkUInt32();
			int msgSize = reader.consumeNetworkInt32();

@@ -413,19 +432,20 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
			if (!asset.isInVersionRange(msgVersion.version))
				continue;

-			VersionedMutationsMap::iterator it;
+			state VersionedMutationsMap::iterator it;
			bool inserted;
-			std::tie(it, inserted) = kvOps.emplace(msgVersion, MutationsVec());
+			std::tie(it, inserted) = kvOps->emplace(msgVersion, MutationsVec());
			// A clear mutation can be split into multiple mutations with the same (version, sub).
			// See saveMutationsToFile(). Current tests only use one key range per backup, thus
			// only one clear mutation is generated (i.e., always inserted).
			ASSERT(inserted);

			ArenaReader rd(buf.arena(), StringRef(message, msgSize), AssumeVersion(g_network->protocolVersion()));
-			MutationRef mutation;
+			state MutationRef mutation;
			rd >> mutation;
			if (mutation.isEncrypted()) {
-				throw encrypt_unsupported();
+				MutationRef decryptedMutation = wait(_decryptMutation(mutation, cx, &tempArena));
+				mutation = decryptedMutation;
			}

			// Skip mutation whose commitVersion < range kv's version
@@ -500,12 +520,13 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
    std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
    LoaderCounters* cc,
    Reference<IBackupContainer> bc,
-    RestoreAsset asset) {
+    RestoreAsset asset,
+    Database cx) {
	state int readFileRetries = 0;
	loop {
		try {
			wait(_parsePartitionedLogFileOnLoader(
-			    pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset));
+			    pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx));
			break;
		} catch (Error& e) {
			if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||
@@ -532,7 +553,8 @@ ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions,
                                        LoadingParam param,
                                        Reference<LoaderBatchData> batchData,
                                        UID loaderID,
-                                        Reference<IBackupContainer> bc) {
+                                        Reference<IBackupContainer> bc,
+                                        Database cx) {
	// Temporary data structure for parsing log files into (version, <K, V, mutationType>)
	// Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted
	// mutationMap: Key is the unique identifier for a batch of mutation logs at the same version

@@ -572,7 +594,8 @@ ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions,
                                        samplesIter,
                                        &batchData->counters,
                                        bc,
-                                        subAsset));
+                                        subAsset,
+                                        cx));
		} else {
			fileParserFutures.push_back(
			    parseLogFileToMutationsOnLoader(&processedFileOffset, &mutationMap, bc, subAsset));

@@ -582,8 +605,8 @@ ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions,
	wait(waitForAll(fileParserFutures));

	if (!param.isRangeFile && !param.isPartitionedLog()) {
-		_parseSerializedMutation(
-		    pRangeVersions, kvOpsPerLPIter, &mutationMap, samplesIter, &batchData->counters, param.asset);
+		wait(_parseSerializedMutation(
+		    pRangeVersions, kvOpsPerLPIter, &mutationMap, samplesIter, &batchData->counters, param.asset, cx));
	}

	TraceEvent("FastRestoreLoaderProcessLoadingParamDone", loaderID)
@@ -594,7 +617,7 @@ ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions,
}

// A loader can process multiple RestoreLoadFileRequest in parallel.
-ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self) {
+ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self, Database cx) {
	state Reference<LoaderBatchData> batchData = self->batch[req.batchIndex];
	state bool isDuplicated = true;
	state bool printTrace = false;

@@ -623,7 +646,7 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
		    .detail("ProcessLoadParam", req.param.toString());
		ASSERT(batchData->sampleMutations.find(req.param) == batchData->sampleMutations.end());
		batchData->processedFileParams[req.param] =
-		    _processLoadingParam(&self->rangeVersions, req.param, batchData, self->id(), self->bc);
+		    _processLoadingParam(&self->rangeVersions, req.param, batchData, self->id(), self->bc, cx);
		self->inflightLoadingReqs++;
		isDuplicated = false;
	} else {

@@ -682,7 +705,8 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
// Send buffered mutations to appliers.
// Do not need to block on low memory usage because this actor should not increase memory usage.
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
-                                              Reference<RestoreLoaderData> self) {
+                                              Reference<RestoreLoaderData> self,
+                                              Database cx) {
	state Reference<LoaderBatchData> batchData;
	state Reference<LoaderBatchStatus> batchStatus;
	state bool isDuplicated = true;

@@ -759,7 +783,8 @@ ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequ
				    loadParam.asset,
				    loadParam.isRangeFile,
				    &batchData->rangeToApplier,
-				    &self->appliersInterf));
+				    &self->appliersInterf,
+				    cx));
			}
		}
		wait(waitForAll(fSendMutations));
@@ -812,7 +837,8 @@ ACTOR Future<Void> sendMutationsToApplier(
    RestoreAsset asset,
    bool isRangeFile,
    std::map<Key, UID>* pRangeToApplier,
-    std::map<UID, RestoreApplierInterface>* pApplierInterfaces) {
+    std::map<UID, RestoreApplierInterface>* pApplierInterfaces,
+    Database cx) {
	state VersionedMutationsMap& kvOps = *pkvOps;
	state VersionedMutationsMap::iterator kvOp = kvOps.begin();
	state int kvCount = 0;

@@ -820,6 +846,7 @@ ACTOR Future<Void> sendMutationsToApplier(
	state Version msgIndex = 1; // Monotonically increased index for send message, must start at 1
	state std::vector<UID> applierIDs = getApplierIDs(*pRangeToApplier);
	state double msgSize = 0; // size of mutations in the message
+	state Arena arena;

	// Wait for scheduler to kick it off
	Promise<Void> toSched;

@@ -863,14 +890,18 @@ ACTOR Future<Void> sendMutationsToApplier(
	for (auto& applierID : applierIDs) {
		applierVersionedMutationsBuffer[applierID] = VersionedMutationsVec();
	}
-	KeyRangeMap<UID> krMap;
+	state KeyRangeMap<UID> krMap;
	buildApplierRangeMap(&krMap, pRangeToApplier);
	for (kvOp = kvOps.begin(); kvOp != kvOps.end(); kvOp++) {
		commitVersion = kvOp->first;
		ASSERT(commitVersion.version >= asset.beginVersion);
		ASSERT(commitVersion.version <= asset.endVersion); // endVersion is an empty commit to ensure progress
		for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) {
-			MutationRef& kvm = kvOp->second[mIndex];
+			state MutationRef kvm = kvOp->second[mIndex];
+			if (kvm.isEncrypted()) {
+				MutationRef decryptedMutation = wait(_decryptMutation(kvm, cx, &arena));
+				kvm = decryptedMutation;
+			}
			// Send the mutation to applier
			if (isRangeMutation(kvm)) {
				MutationsVec mvector;
@@ -1082,31 +1113,35 @@ bool concatenateBackupMutationForLogFile(SerializedMutationListMap* pMutationMap
// we may not get the entire mutation list for the version encoded_list_of_mutations:
// [mutation1][mutation2]...[mutationk], where
// a mutation is encoded as [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][keyContent][valueContent]
-void _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
-                              std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
-                              SerializedMutationListMap* pmutationMap,
-                              std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
-                              LoaderCounters* cc,
-                              const RestoreAsset& asset) {
-	VersionedMutationsMap& kvOps = kvOpsIter->second;
-	SampledMutationsVec& samples = samplesIter->second;
-	SerializedMutationListMap& mutationMap = *pmutationMap;
+ACTOR Future<Void> _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
+                                            std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
+                                            SerializedMutationListMap* pmutationMap,
+                                            std::map<LoadingParam, SampledMutationsVec>::iterator samplesIter,
+                                            LoaderCounters* cc,
+                                            RestoreAsset asset,
+                                            Database cx) {
+	state VersionedMutationsMap* kvOps = &kvOpsIter->second;
+	state SampledMutationsVec* samples = &samplesIter->second;
+	state SerializedMutationListMap::iterator mutationMapIterator = pmutationMap->begin();

	TraceEvent(SevFRMutationInfo, "FastRestoreLoaderParseSerializedLogMutation")
	    .detail("BatchIndex", asset.batchIndex)
	    .detail("RestoreAsset", asset.toString());

-	Arena tempArena;
-	for (auto& m : mutationMap) {
-		StringRef k = m.first.contents();
-		StringRef val = m.second.first.contents();
+	state Arena tempArena;
+	loop {
+		if (mutationMapIterator == pmutationMap->end()) {
+			break;
+		}
+		StringRef k = mutationMapIterator->first.contents();
+		state StringRef val = mutationMapIterator->second.first.contents();

		StringRefReader kReader(k, restore_corrupted_data());
-		uint64_t commitVersion = kReader.consume<uint64_t>(); // Consume little Endian data
+		state uint64_t commitVersion = kReader.consume<uint64_t>(); // Consume little Endian data
		// We have already filtered out commits not in [beginVersion, endVersion) when we concatenated kv pairs in the log file
		ASSERT_WE_THINK(asset.isInVersionRange(commitVersion));

-		StringRefReader vReader(val, restore_corrupted_data());
+		state StringRefReader vReader(val, restore_corrupted_data());
		vReader.consume<uint64_t>(); // Consume the includeVersion
		// TODO(xumengpanda): verify the protocol version is compatible and raise error if needed

@@ -1114,72 +1149,79 @@ void _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
		uint32_t val_length_decoded = vReader.consume<uint32_t>();
		ASSERT(val_length_decoded == val.size() - sizeof(uint64_t) - sizeof(uint32_t));

-		int sub = 0;
-		while (1) {
+		state int sub = 0;
+		loop {
			// stop when reach the end of the string
			if (vReader.eof()) { //|| *reader.rptr == 0xFF
				break;
			}

-			uint32_t type = vReader.consume<uint32_t>();
-			uint32_t kLen = vReader.consume<uint32_t>();
-			uint32_t vLen = vReader.consume<uint32_t>();
-			const uint8_t* k = vReader.consume(kLen);
-			const uint8_t* v = vReader.consume(vLen);
+			state uint32_t type = vReader.consume<uint32_t>();
+			state uint32_t kLen = vReader.consume<uint32_t>();
+			state uint32_t vLen = vReader.consume<uint32_t>();
+			state const uint8_t* k = vReader.consume(kLen);
+			state const uint8_t* v = vReader.consume(vLen);

-			MutationRef mutation((MutationRef::Type)type, KeyRef(k, kLen), KeyRef(v, vLen));
+			state MutationRef mutation((MutationRef::Type)type, KeyRef(k, kLen), KeyRef(v, vLen));
+			if (mutation.isEncrypted()) {
+				MutationRef decryptedMutation = wait(_decryptMutation(mutation, cx, &tempArena));
+				mutation = decryptedMutation;
+			}
			// Should this mutation be skipped?
			// Skip mutation whose commitVersion < range kv's version
			if (logMutationTooOld(pRangeVersions, mutation, commitVersion)) {
				cc->oldLogMutations += 1;
				continue;
			}

			if (mutation.param1 >= asset.range.end ||
			    (isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
			    (!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
-				continue;
-			}
-			// Only apply mutation within the asset.range and apply removePrefix and addPrefix
-			ASSERT(asset.removePrefix.size() == 0);
-			if (isRangeMutation(mutation)) {
-				mutation.param1 = mutation.param1 >= asset.range.begin ? mutation.param1 : asset.range.begin;
-				mutation.param2 = mutation.param2 < asset.range.end ? mutation.param2 : asset.range.end;
-				// Remove prefix or add prefix if we restore data to a new key space
-				if (asset.hasPrefix()) { // Avoid creating new Key
-					mutation.param1 =
-					    mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
-					mutation.param2 =
-					    mutation.param2.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
-				}
			} else {
-				if (asset.hasPrefix()) { // Avoid creating new Key
-					mutation.param1 =
-					    mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
+				// Only apply mutation within the asset.range and apply removePrefix and addPrefix
+				ASSERT(asset.removePrefix.size() == 0);
+				if (isRangeMutation(mutation)) {
+					mutation.param1 = mutation.param1 >= asset.range.begin ? mutation.param1 : asset.range.begin;
+					mutation.param2 = mutation.param2 < asset.range.end ? mutation.param2 : asset.range.end;
+					// Remove prefix or add prefix if we restore data to a new key space
+					if (asset.hasPrefix()) { // Avoid creating new Key
+						mutation.param1 =
+						    mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
+						mutation.param2 =
+						    mutation.param2.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
+					}
+				} else {
+					if (asset.hasPrefix()) { // Avoid creating new Key
+						mutation.param1 =
+						    mutation.param1.removePrefix(asset.removePrefix).withPrefix(asset.addPrefix, tempArena);
+					}
				}

				cc->loadedLogBytes += mutation.totalSize();

				TraceEvent(SevFRMutationInfo, "FastRestoreDecodeLogFile")
				    .detail("CommitVersion", commitVersion)
				    .detail("ParsedMutation", mutation.toString());

-				auto it = kvOps.insert(std::make_pair(LogMessageVersion(commitVersion, sub++), MutationsVec()));
+				auto it = kvOps->insert(std::make_pair(LogMessageVersion(commitVersion, sub++), MutationsVec()));
				ASSERT(it.second); // inserted is true
				ASSERT(sub <
				       std::numeric_limits<int32_t>::max()); // range file mutation uses int32_max as subversion
				it.first->second.push_back_deep(it.first->second.arena(), mutation);

				// Sampling data similar to how SS sample bytes
				ByteSampleInfo sampleInfo = isKeyValueInSample(KeyValueRef(mutation.param1, mutation.param2));
				if (sampleInfo.inSample) {
					cc->sampledLogBytes += sampleInfo.sampledSize;
-					samples.push_back_deep(samples.arena(), SampledMutation(mutation.param1, sampleInfo.sampledSize));
+					samples->push_back_deep(samples->arena(),
+					                        SampledMutation(mutation.param1, sampleInfo.sampledSize));
				}
				ASSERT_WE_THINK(kLen >= 0 && kLen < val.size());
				ASSERT_WE_THINK(vLen >= 0 && vLen < val.size());
			}
		}
		mutationMapIterator++;
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
// Parsing the data blocks in a range file
|
||||
|
|
|
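For reference, the loop above walks a simple length-prefixed encoding: the key carries a little-endian commit version, and the value blob carries an includeVersion, a total length, and then repeated {type, kLen, vLen, key bytes, value bytes} records. Below is a minimal standalone sketch of that decode using a plain byte reader in place of flow's StringRefReader; all names here are illustrative, not the production API.

#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

// Illustrative reader over a byte buffer; the real code uses StringRefReader,
// which throws restore_corrupted_data() on overrun.
struct ByteReader {
    const uint8_t* p;
    const uint8_t* end;
    template <typename T>
    T consume() {
        if (p + sizeof(T) > end) throw std::runtime_error("corrupted data");
        T v;
        memcpy(&v, p, sizeof(T));
        p += sizeof(T);
        return v;
    }
    const uint8_t* consume(size_t n) {
        if (p + n > end) throw std::runtime_error("corrupted data");
        const uint8_t* r = p;
        p += n;
        return r;
    }
    bool eof() const { return p == end; }
};

struct DecodedMutation {
    uint32_t type;
    std::string key, value;
};

// Decode one serialized mutation batch:
// [includeVersion u64][totalLength u32][{type u32, kLen u32, vLen u32, k, v}...]
std::vector<DecodedMutation> decodeBatch(const uint8_t* data, size_t size) {
    ByteReader r{ data, data + size };
    r.consume<uint64_t>(); // includeVersion (protocol version), ignored in this sketch
    uint32_t totalLen = r.consume<uint32_t>();
    if (totalLen != size - sizeof(uint64_t) - sizeof(uint32_t))
        throw std::runtime_error("length mismatch"); // mirrors the ASSERT above
    std::vector<DecodedMutation> out;
    while (!r.eof()) {
        uint32_t type = r.consume<uint32_t>();
        uint32_t kLen = r.consume<uint32_t>();
        uint32_t vLen = r.consume<uint32_t>();
        const uint8_t* k = r.consume(kLen);
        const uint8_t* v = r.consume(vLen);
        out.push_back({ type, std::string((const char*)k, kLen), std::string((const char*)v, vLen) });
    }
    return out;
}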
@ -66,11 +66,14 @@ struct SimKmsConnectorContext : NonCopyable, ReferenceCounted<SimKmsConnectorCon
// Construct encryption keyStore.
// Note the keys generated must be the same after restart.
for (int i = 1; i <= maxEncryptionKeys; i++) {
Arena arena;
StringRef digest = computeAuthToken(
reinterpret_cast<const unsigned char*>(&i), sizeof(i), SHA_KEY, AES_256_KEY_LENGTH, arena);
simEncryptKeyStore[i] =
std::make_unique<SimEncryptKeyCtx>(i, reinterpret_cast<const char*>(digest.begin()));
uint8_t digest[AUTH_TOKEN_SIZE];
computeAuthToken(reinterpret_cast<const unsigned char*>(&i),
sizeof(i),
SHA_KEY,
AES_256_KEY_LENGTH,
&digest[0],
AUTH_TOKEN_SIZE);
simEncryptKeyStore[i] = std::make_unique<SimEncryptKeyCtx>(i, reinterpret_cast<const char*>(&digest[0]));
}
}
};
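This hunk (and the matching one in the Redwood key provider later in this diff) reflects a computeAuthToken signature change: instead of returning an arena-allocated StringRef, the function now writes into a caller-supplied fixed-size buffer. A compilable sketch of the new calling convention follows; the AUTH_TOKEN_SIZE value and the body of computeAuthToken here are stand-ins for illustration, not the real HMAC implementation.

#include <cstdint>
#include <cstring>

constexpr int AUTH_TOKEN_SIZE = 32; // assumed size, for illustration only

// Toy stand-in for the real HMAC-based computeAuthToken; it only demonstrates
// the new caller-supplied-buffer convention, not the actual cryptography.
void computeAuthToken(const unsigned char* payload, size_t payloadLen,
                      const unsigned char* key, size_t keyLen,
                      unsigned char* digestOut, size_t digestLen) {
    memset(digestOut, 0, digestLen);
    for (size_t i = 0; i < payloadLen; i++) digestOut[i % digestLen] ^= payload[i];
    for (size_t i = 0; i < keyLen; i++) digestOut[i % digestLen] ^= key[i];
}

int main() {
    static unsigned char shaKey[] = "3ab9570b44b8315fdb261da6b1b6c13b";
    for (int i = 1; i <= 4; i++) {
        // New style: the caller owns the digest buffer; no Arena is needed to
        // keep the result alive, and no StringRef aliases temporary memory.
        uint8_t digest[AUTH_TOKEN_SIZE];
        computeAuthToken(reinterpret_cast<const unsigned char*>(&i), sizeof(i),
                         shaKey, sizeof(shaKey) - 1, &digest[0], AUTH_TOKEN_SIZE);
    }
    return 0;
}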
@ -284,10 +284,20 @@ static JsonBuilderObject getError(const TraceEventFields& errorFields) {
return statusObj;
}

static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
std::vector<WorkerDetails> workers,
Optional<DatabaseConfiguration> configuration,
std::set<std::string>* incomplete_reasons) {
namespace {

void reportCgroupCpuStat(JsonBuilderObject& object, const TraceEventFields& eventFields) {
JsonBuilderObject cgroupCpuStatObj;
cgroupCpuStatObj.setKeyRawNumber("nr_periods", eventFields.getValue("NrPeriods"));
cgroupCpuStatObj.setKeyRawNumber("nr_throttled", eventFields.getValue("NrThrottled"));
cgroupCpuStatObj.setKeyRawNumber("throttled_time", eventFields.getValue("ThrottledTime"));
object["cgroup_cpu_stat"] = cgroupCpuStatObj;
}

JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
std::vector<WorkerDetails> workers,
Optional<DatabaseConfiguration> configuration,
std::set<std::string>* incomplete_reasons) {
JsonBuilderObject machineMap;
double metric;
int failed = 0;

@ -339,6 +349,10 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
memoryObj.setKeyRawNumber("free_bytes", event.getValue("AvailableMemory"));
statusObj["memory"] = memoryObj;

#ifdef __linux__
reportCgroupCpuStat(statusObj, event);
#endif // __linux__

JsonBuilderObject cpuObj;
double cpuSeconds = event.getDouble("CPUSeconds");
double elapsed = event.getDouble("Elapsed");

@ -402,6 +416,8 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
return machineMap;
}

} // anonymous namespace

JsonBuilderObject getLagObject(int64_t versions) {
JsonBuilderObject lag;
lag["versions"] = versions;
@ -3593,9 +3593,9 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
if (recovered.canBeSet())
recovered.send(Void());

if (!self.durableClusterId.isValid()) {
self.sharedActors.send(updateDurableClusterID(&self));
}
// if (!self.durableClusterId.isValid()) {
//     self.sharedActors.send(updateDurableClusterID(&self));
// }
self.sharedActors.send(commitQueue(&self));
self.sharedActors.send(updateStorageLoop(&self));
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
@ -183,10 +183,15 @@ public:

void setEncryptKeyProxy(const EncryptKeyProxyInterface& interf) {
auto newInfo = serverInfo->get();
auto newClientInfo = clientInfo->get();
newClientInfo.id = deterministicRandom()->randomUniqueID();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.encryptKeyProxy = interf;
newInfo.client.encryptKeyProxy = interf;
newClientInfo.encryptKeyProxy = interf;
serverInfo->set(newInfo);
clientInfo->set(newClientInfo);
}

void setConsistencyScan(const ConsistencyScanInterface& interf) {

@ -199,7 +204,9 @@ public:

void clearInterf(ProcessClass::ClassType t) {
auto newInfo = serverInfo->get();
auto newClientInfo = clientInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newClientInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
if (t == ProcessClass::DataDistributorClass) {
newInfo.distributor = Optional<DataDistributorInterface>();

@ -209,10 +216,13 @@ public:
newInfo.blobManager = Optional<BlobManagerInterface>();
} else if (t == ProcessClass::EncryptKeyProxyClass) {
newInfo.encryptKeyProxy = Optional<EncryptKeyProxyInterface>();
newInfo.client.encryptKeyProxy = Optional<EncryptKeyProxyInterface>();
newClientInfo.encryptKeyProxy = Optional<EncryptKeyProxyInterface>();
} else if (t == ProcessClass::ConsistencyScanClass) {
newInfo.consistencyScan = Optional<ConsistencyScanInterface>();
}
serverInfo->set(newInfo);
clientInfo->set(newClientInfo);
}

ACTOR static Future<Void> countClients(DBInfo* self) {
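Both hunks follow the same discipline: copy the current value out of the AsyncVar, assign fresh ids (and bump the generation) so watchers wake up, mirror the interface change into both the server-side and the client-side structs, then publish both. A simplified, self-contained sketch of that copy-modify-set pattern is below; the AsyncVar and the two info structs are toy stand-ins for the real flow and fdbserver types.

#include <cstdint>

// Toy stand-in for flow's AsyncVar; the real one notifies waiting actors on set().
template <class T>
struct AsyncVar {
    T value;
    const T& get() const { return value; }
    void set(const T& v) { value = v; }
};

struct ClientDBInfoLike { uint64_t id = 0; bool hasEncryptKeyProxy = false; };
struct ServerDBInfoLike { uint64_t id = 0; uint64_t infoGeneration = 0; ClientDBInfoLike client; };

// Mirror of setEncryptKeyProxy(): copy, update ids/generation, apply the change
// to BOTH the embedded client copy and the standalone client var, and republish
// both so server and client watchers each observe a change.
void setEncryptKeyProxy(AsyncVar<ServerDBInfoLike>& serverInfo,
                        AsyncVar<ClientDBInfoLike>& clientInfo,
                        uint64_t& dbInfoCount,
                        uint64_t freshId) {
    auto newInfo = serverInfo.get();
    auto newClientInfo = clientInfo.get();
    newInfo.id = freshId;
    newClientInfo.id = freshId;
    newInfo.infoGeneration = ++dbInfoCount;
    newInfo.client.hasEncryptKeyProxy = true;
    newClientInfo.hasEncryptKeyProxy = true;
    serverInfo.set(newInfo);
    clientInfo.set(newClientInfo);
}

int main() {
    AsyncVar<ServerDBInfoLike> serverInfo;
    AsyncVar<ClientDBInfoLike> clientInfo;
    uint64_t dbInfoCount = 0;
    setEncryptKeyProxy(serverInfo, clientInfo, dbInfoCount, 42);
    return serverInfo.get().client.hasEncryptKeyProxy && clientInfo.get().hasEncryptKeyProxy ? 0 : 1;
}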
@ -0,0 +1,299 @@
/*
 * IEncryptionKeyProvider.actor.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbclient/BlobCipher.h"
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_G_H)
#define FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_G_H
#include "fdbserver/IEncryptionKeyProvider.actor.g.h"
#elif !defined(FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_H)
#define FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_H

#include "fdbclient/GetEncryptCipherKeys.actor.h"
#include "fdbclient/Tenant.h"

#include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/ServerDBInfo.h"

#include "flow/Arena.h"
#include "flow/EncryptUtils.h"
#define XXH_INLINE_ALL
#include "flow/xxhash.h"

#include "flow/actorcompiler.h" // This must be the last #include.

typedef uint64_t XOREncryptionKeyID;

// EncryptionKeyRef is somewhat multi-variant; it will contain members representing the union
// of all fields relevant to any implemented encryption scheme. They are generally of
// the form
//   Page Fields - fields which come from or are stored in the Page
//   Secret Fields - fields which are only known by the Key Provider
// but it is up to each encoding and provider which fields are which and which ones are used
//
// TODO(yiwu): Rename and/or refactor this struct. It doesn't sound like an encryption key should
// contain page fields like the encryption header.
struct EncryptionKeyRef {

EncryptionKeyRef(){};
EncryptionKeyRef(Arena& arena, const EncryptionKeyRef& toCopy)
: cipherKeys(toCopy.cipherKeys), secret(arena, toCopy.secret), id(toCopy.id) {}
int expectedSize() const { return secret.size(); }

// Fields for AESEncryptionV1
TextAndHeaderCipherKeys cipherKeys;
Optional<BlobCipherEncryptHeader> cipherHeader;
// Fields for XOREncryption_TestOnly
StringRef secret;
Optional<XOREncryptionKeyID> id;
};
typedef Standalone<EncryptionKeyRef> EncryptionKey;

// Interface used by the pager to get encryption keys when reading pages from disk
// and by the BTree to get encryption keys to use for new pages
class IEncryptionKeyProvider : public ReferenceCounted<IEncryptionKeyProvider> {
public:
virtual ~IEncryptionKeyProvider() {}

// Get an EncryptionKey with Secret Fields populated based on the given Page Fields.
// It is up to the implementation which fields those are.
// The output Page Fields must match the input Page Fields.
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) = 0;

// Get the encryption key that should be used for a given user Key-Value range
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) = 0;

// Set the tenant prefix to tenant name map.
virtual void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) {}

virtual bool shouldEnableEncryption() const = 0;
};

// The null key provider is useful to simplify page decoding.
// It throws an error for any key info requested.
class NullKeyProvider : public IEncryptionKeyProvider {
public:
virtual ~NullKeyProvider() {}
bool shouldEnableEncryption() const override { return true; }
Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override { throw encryption_key_not_found(); }
Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
throw encryption_key_not_found();
}
};

// Key provider for the dummy XOR encryption scheme
class XOREncryptionKeyProvider_TestOnly : public IEncryptionKeyProvider {
public:
XOREncryptionKeyProvider_TestOnly(std::string filename) {
ASSERT(g_network->isSimulated());

// Choose a deterministic random filename (without path) byte for secret generation
// Remove any leading directory names
size_t lastSlash = filename.find_last_of("\\/");
if (lastSlash != filename.npos) {
filename.erase(0, lastSlash);
}
xorWith = filename.empty() ? 0x5e
: (uint8_t)filename[XXH3_64bits(filename.data(), filename.size()) % filename.size()];
}

virtual ~XOREncryptionKeyProvider_TestOnly() {}

bool shouldEnableEncryption() const override { return true; }

Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
if (!key.id.present()) {
throw encryption_key_not_found();
}
EncryptionKey s = key;
uint8_t secret = ~(uint8_t)key.id.get() ^ xorWith;
s.secret = StringRef(s.arena(), &secret, 1);
return s;
}

Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
EncryptionKeyRef k;
k.id = end.empty() ? 0 : *(end.end() - 1);
return getSecrets(k);
}

uint8_t xorWith;
};

// Key provider to provide cipher keys randomly from a pre-generated pool. Used for testing.
class RandomEncryptionKeyProvider : public IEncryptionKeyProvider {
public:
RandomEncryptionKeyProvider() {
for (unsigned i = 0; i < NUM_CIPHER; i++) {
BlobCipherDetails cipherDetails;
cipherDetails.encryptDomainId = i;
cipherDetails.baseCipherId = deterministicRandom()->randomUInt64();
cipherDetails.salt = deterministicRandom()->randomUInt64();
cipherKeys[i] = generateCipherKey(cipherDetails);
}
}
virtual ~RandomEncryptionKeyProvider() = default;

bool shouldEnableEncryption() const override { return true; }

Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
ASSERT(key.cipherHeader.present());
EncryptionKey s = key;
s.cipherKeys.cipherTextKey = cipherKeys[key.cipherHeader.get().cipherTextDetails.encryptDomainId];
s.cipherKeys.cipherHeaderKey = cipherKeys[key.cipherHeader.get().cipherHeaderDetails.encryptDomainId];
return s;
}

Future<EncryptionKey> getByRange(const KeyRef& /*begin*/, const KeyRef& /*end*/) override {
EncryptionKey s;
s.cipherKeys.cipherTextKey = getRandomCipherKey();
s.cipherKeys.cipherHeaderKey = getRandomCipherKey();
return s;
}

private:
Reference<BlobCipherKey> generateCipherKey(const BlobCipherDetails& cipherDetails) {
static unsigned char SHA_KEY[] = "3ab9570b44b8315fdb261da6b1b6c13b";
uint8_t digest[AUTH_TOKEN_SIZE];
computeAuthToken(reinterpret_cast<const unsigned char*>(&cipherDetails.baseCipherId),
sizeof(EncryptCipherBaseKeyId),
SHA_KEY,
AES_256_KEY_LENGTH,
&digest[0],
AUTH_TOKEN_SIZE);
return makeReference<BlobCipherKey>(cipherDetails.encryptDomainId,
cipherDetails.baseCipherId,
&digest[0],
AES_256_KEY_LENGTH,
cipherDetails.salt,
std::numeric_limits<int64_t>::max() /* refreshAt */,
std::numeric_limits<int64_t>::max() /* expireAt */);
}

Reference<BlobCipherKey> getRandomCipherKey() {
return cipherKeys[deterministicRandom()->randomInt(0, NUM_CIPHER)];
}

static constexpr int NUM_CIPHER = 1000;
Reference<BlobCipherKey> cipherKeys[NUM_CIPHER];
};

// Key provider which extracts tenant ids from range key prefixes, and fetches tenant-specific encryption keys from
// EncryptKeyProxy.
class TenantAwareEncryptionKeyProvider : public IEncryptionKeyProvider {
public:
TenantAwareEncryptionKeyProvider(Reference<AsyncVar<ServerDBInfo> const> db) : db(db) {}

virtual ~TenantAwareEncryptionKeyProvider() = default;

bool shouldEnableEncryption() const override {
return isEncryptionOpSupported(EncryptOperationType::STORAGE_SERVER_ENCRYPTION, db->get().client);
}

ACTOR static Future<EncryptionKey> getSecrets(TenantAwareEncryptionKeyProvider* self, EncryptionKeyRef key) {
if (!key.cipherHeader.present()) {
TraceEvent("TenantAwareEncryptionKeyProvider_CipherHeaderMissing");
throw encrypt_ops_error();
}
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(self->db, key.cipherHeader.get(), BlobCipherMetrics::KV_REDWOOD));
EncryptionKey s = key;
s.cipherKeys = cipherKeys;
return s;
}

Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override { return getSecrets(this, key); }

ACTOR static Future<EncryptionKey> getByRange(TenantAwareEncryptionKeyProvider* self, KeyRef begin, KeyRef end) {
EncryptCipherDomainNameRef domainName;
EncryptCipherDomainId domainId = self->getEncryptionDomainId(begin, end, &domainName);
TextAndHeaderCipherKeys cipherKeys =
wait(getLatestEncryptCipherKeysForDomain(self->db, domainId, domainName, BlobCipherMetrics::KV_REDWOOD));
EncryptionKey s;
s.cipherKeys = cipherKeys;
return s;
}

Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
return getByRange(this, begin, end);
}

void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) override {
ASSERT(tenantPrefixIndex.isValid());
this->tenantPrefixIndex = tenantPrefixIndex;
}

private:
EncryptCipherDomainId getEncryptionDomainId(const KeyRef& begin,
const KeyRef& end,
EncryptCipherDomainNameRef* domainName) {
int64_t domainId = SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
int64_t beginTenantId = getTenantId(begin, true /*inclusive*/);
int64_t endTenantId = getTenantId(end, false /*inclusive*/);
if (beginTenantId == endTenantId && beginTenantId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(tenantPrefixIndex.isValid());
Key tenantPrefix = TenantMapEntry::idToPrefix(beginTenantId);
auto view = tenantPrefixIndex->atLatest();
auto itr = view.find(tenantPrefix);
if (itr != view.end()) {
*domainName = *itr;
domainId = beginTenantId;
} else {
// No tenant with the same tenant id. We could be in optional or disabled tenant mode.
}
}
if (domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
*domainName = FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME;
}
return domainId;
}

int64_t getTenantId(const KeyRef& key, bool inclusive) {
// A valid tenant id is always a valid encrypt domain id.
static_assert(INVALID_ENCRYPT_DOMAIN_ID == -1);

if (key.size() && key >= systemKeys.begin) {
return SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
}

if (key.size() < TENANT_PREFIX_SIZE) {
// Encryption domain information not available; leverage the 'default encryption domain'
return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
}

StringRef prefix = key.substr(0, TENANT_PREFIX_SIZE);
int64_t tenantId = TenantMapEntry::prefixToId(prefix, EnforceValidTenantId::False);
if (tenantId == TenantInfo::INVALID_TENANT) {
// Encryption domain information not available; leverage the 'default encryption domain'
return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
}

if (!inclusive && key.size() == TENANT_PREFIX_SIZE) {
tenantId = tenantId - 1;
}
ASSERT(tenantId >= 0);
return tenantId;
}

Reference<AsyncVar<ServerDBInfo> const> db;
Reference<TenantPrefixIndex> tenantPrefixIndex;
};

#include "flow/unactorcompiler.h"
#endif
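XOREncryptionKeyProvider_TestOnly above derives a one-byte secret as ~id ^ xorWith, where the key id is taken from the last byte of the range's end key. Because XOR with a fixed byte is its own inverse, a single byte is enough for a simulation-only cipher. A standalone round-trip sketch (toy code, not the production codec):

#include <cassert>
#include <cstdint>
#include <string>

// Toy XOR "cipher": encrypt and decrypt are the same operation, so a
// round-trip with the same one-byte secret restores the plaintext.
std::string xorCodec(const std::string& data, uint8_t secret) {
    std::string out = data;
    for (auto& c : out)
        c ^= secret;
    return out;
}

int main() {
    uint8_t xorWith = 0x5e;            // default secret byte from the provider above
    uint8_t keyId = 0x42;              // last byte of the range's end key, per getByRange
    uint8_t secret = ~keyId ^ xorWith; // same derivation as getSecrets

    std::string page = "some page contents";
    std::string encrypted = xorCodec(page, secret);
    assert(encrypted != page);
    assert(xorCodec(encrypted, secret) == page); // XOR is its own inverse
    return 0;
}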
@ -18,6 +18,7 @@
 * limitations under the License.
 */

#include "fdbclient/BlobCipher.h"
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_IPAGEENCRYPTIONKEYPROVIDER_ACTOR_G_H)
#define FDBSERVER_IPAGEENCRYPTIONKEYPROVIDER_ACTOR_G_H
#include "fdbserver/IPageEncryptionKeyProvider.actor.g.h"

@ -207,14 +208,17 @@ private:
Reference<BlobCipherKey> generateCipherKey(const BlobCipherDetails& cipherDetails) {
static unsigned char SHA_KEY[] = "3ab9570b44b8315fdb261da6b1b6c13b";
Arena arena;
StringRef digest = computeAuthToken(reinterpret_cast<const unsigned char*>(&cipherDetails.baseCipherId),
sizeof(EncryptCipherBaseKeyId),
SHA_KEY,
AES_256_KEY_LENGTH,
arena);
uint8_t digest[AUTH_TOKEN_SIZE];
computeAuthToken(reinterpret_cast<const unsigned char*>(&cipherDetails.baseCipherId),
sizeof(EncryptCipherBaseKeyId),
SHA_KEY,
AES_256_KEY_LENGTH,
&digest[0],
AUTH_TOKEN_SIZE);
ASSERT_EQ(AUTH_TOKEN_SIZE, AES_256_KEY_LENGTH);
return makeReference<BlobCipherKey>(cipherDetails.encryptDomainId,
cipherDetails.baseCipherId,
digest.begin(),
&digest[0],
AES_256_KEY_LENGTH,
cipherDetails.salt,
std::numeric_limits<int64_t>::max() /* refreshAt */,
@ -75,6 +75,7 @@ struct TestWorkload : NonCopyable, WorkloadContext, ReferenceCounted<TestWorkloa
virtual ~TestWorkload(){};
virtual Future<Void> initialized() { return Void(); }
virtual std::string description() const = 0;
virtual void disableFailureInjectionWorkloads(std::set<std::string>& out) const;
virtual Future<Void> setup(Database const& cx) { return Void(); }
virtual Future<Void> start(Database const& cx) = 0;
virtual Future<bool> check(Database const& cx) = 0;
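The new disableFailureInjectionWorkloads() virtual lets a workload veto failure-injection workloads by name before CompoundWorkload::addFailureInjection() (further down in this diff) instantiates them; inserting "all" opts out of every injector. A self-contained sketch of the override pattern, with simplified stand-in types:

#include <set>
#include <string>

// Simplified stand-ins for the real flow/fdbserver types, for illustration.
struct TestWorkloadBase {
    virtual ~TestWorkloadBase() = default;
    virtual std::string description() const = 0;
    // Default: veto nothing (matches the new no-op base implementation).
    virtual void disableFailureInjectionWorkloads(std::set<std::string>& out) const {}
};

struct SaveAndKillLike : TestWorkloadBase {
    std::string description() const override { return "SaveAndKillWorkload"; }
    // "all" disables every failure-injection workload for the run, which is
    // how ChangeConfig and SaveAndKill opt out later in this diff.
    void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("all"); }
};

int main() {
    SaveAndKillLike w;
    std::set<std::string> disabled;
    w.disableFailureInjectionWorkloads(disabled);
    return disabled.count("all") ? 0 : 1;
}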
@ -2929,49 +2929,6 @@ ACTOR Future<std::pair<ChangeFeedStreamReply, bool>> getChangeFeedMutations(Stor
return std::make_pair(reply, gotAll);
}

ACTOR Future<Void> localChangeFeedStream(StorageServer* data,
PromiseStream<Standalone<MutationsAndVersionRef>> results,
Key rangeID,
Version begin,
Version end,
KeyRange range) {
try {
loop {
state ChangeFeedStreamRequest feedRequest;
feedRequest.rangeID = rangeID;
feedRequest.begin = begin;
feedRequest.end = end;
feedRequest.range = range;
state std::pair<ChangeFeedStreamReply, bool> feedReply =
wait(getChangeFeedMutations(data, feedRequest, true, false, UID()));
begin = feedReply.first.mutations.back().version + 1;
state int resultLoc = 0;
while (resultLoc < feedReply.first.mutations.size()) {
if (feedReply.first.mutations[resultLoc].mutations.size() ||
feedReply.first.mutations[resultLoc].version == end - 1) {
wait(results.onEmpty());
results.send(feedReply.first.mutations[resultLoc]);
}
resultLoc++;
}

if (begin == end) {
return Void();
}
}
} catch (Error& e) {
if (e.code() == error_code_unknown_change_feed) {
CODE_PROBE(true, "CF was moved away, no more local data to merge with");
// Send endVersion so the local stream is effectively done. We couldn't have sent that already, because that
// would mean the stream would have finished without error
results.send(MutationsAndVersionRef(end, invalidVersion));
} else {
TraceEvent(SevError, "LocalChangeFeedError", data->thisServerID).error(e).detail("CFID", rangeID);
}
throw;
}
}

// Change feed stream must be sent an error as soon as it is moved away, or the change feed can get incorrect results
ACTOR Future<Void> stopChangeFeedOnMove(StorageServer* data, ChangeFeedStreamRequest req, UID streamUID) {
auto feed = data->uidChangeFeed.find(req.rangeID);

@ -5834,17 +5791,9 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
state Version lastVersion = invalidVersion;
state int64_t versionsFetched = 0;

state PromiseStream<Standalone<MutationsAndVersionRef>> localResults;
// ensure SS is at least caught up to begin version, to maintain behavior with old fetch
wait(data->version.whenAtLeast(startVersion));

// Add 1 to fetch version to make sure the local stream will have more versions in the stream than the remote stream
// to avoid edge cases in the merge logic

state Future<Void> localStream =
localChangeFeedStream(data, localResults, rangeId, startVersion, endVersion + 1, range);
state Standalone<MutationsAndVersionRef> localResult;

Standalone<MutationsAndVersionRef> _localResult = waitNext(localResults.getFuture());
localResult = _localResult;
try {
loop {
while (data->fetchKeysBudgetUsed.get()) {

@ -5854,6 +5803,10 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
state Standalone<VectorRef<MutationsAndVersionRef>> remoteResult =
waitNext(feedResults->mutations.getFuture());
state int remoteLoc = 0;
// ensure SS is at least caught up to begin version, to maintain behavior with old fetch
if (!remoteResult.empty()) {
wait(data->version.whenAtLeast(remoteResult.back().version));
}

while (remoteLoc < remoteResult.size()) {
if (feedResults->popVersion - 1 > changeFeedInfo->emptyVersion) {

@ -5881,80 +5834,53 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
++data->counters.kvSystemClearRanges;
}

Version localVersion = localResult.version;
Version remoteVersion = remoteResult[remoteLoc].version;

if (remoteVersion <= localVersion) {
if (remoteVersion > changeFeedInfo->emptyVersion) {
// merge if same version
if (remoteVersion == localVersion && remoteResult[remoteLoc].mutations.size() &&
remoteResult[remoteLoc].mutations.back().param1 != lastEpochEndPrivateKey) {
int remoteSize = remoteResult[remoteLoc].mutations.size();
ASSERT(localResult.mutations.size());
remoteResult[remoteLoc].mutations.append(
remoteResult.arena(), localResult.mutations.begin(), localResult.mutations.size());
if (MUTATION_TRACKING_ENABLED) {
int midx = 0;
for (auto& m : remoteResult[remoteLoc].mutations) {
DEBUG_MUTATION("ChangeFeedWriteMoveMerge", remoteVersion, m, data->thisServerID)
.detail("Range", range)
.detail("FromLocal", midx >= remoteSize)
.detail("ChangeFeedID", rangeId);
midx++;
}
}
} else {
if (MUTATION_TRACKING_ENABLED) {
for (auto& m : remoteResult[remoteLoc].mutations) {
DEBUG_MUTATION("ChangeFeedWriteMove", remoteVersion, m, data->thisServerID)
.detail("Range", range)
.detail("ChangeFeedID", rangeId);
}
}
}

data->storage.writeKeyValue(
KeyValueRef(changeFeedDurableKey(rangeId, remoteVersion),
changeFeedDurableValue(remoteResult[remoteLoc].mutations,
remoteResult[remoteLoc].knownCommittedVersion)));
++data->counters.kvSystemClearRanges;
changeFeedInfo->fetchVersion = std::max(changeFeedInfo->fetchVersion, remoteVersion);

if (firstVersion == invalidVersion) {
firstVersion = remoteVersion;
}
lastVersion = remoteVersion;
versionsFetched++;
} else {
CODE_PROBE(true, "Change feed ignoring write on move because it was popped concurrently");
if (MUTATION_TRACKING_ENABLED) {
for (auto& m : remoteResult[remoteLoc].mutations) {
DEBUG_MUTATION("ChangeFeedWriteMoveIgnore", remoteVersion, m, data->thisServerID)
.detail("Range", range)
.detail("ChangeFeedID", rangeId)
.detail("EmptyVersion", changeFeedInfo->emptyVersion);
}
}
if (versionsFetched > 0) {
ASSERT(firstVersion != invalidVersion);
ASSERT(lastVersion != invalidVersion);
data->storage.clearRange(
KeyRangeRef(changeFeedDurableKey(changeFeedInfo->id, firstVersion),
changeFeedDurableKey(changeFeedInfo->id, lastVersion + 1)));
++data->counters.kvSystemClearRanges;
firstVersion = invalidVersion;
lastVersion = invalidVersion;
versionsFetched = 0;
// ensure SS is at least caught up to this version, to maintain behavior with old fetch
ASSERT(remoteVersion <= data->version.get());
if (remoteVersion > changeFeedInfo->emptyVersion) {
if (MUTATION_TRACKING_ENABLED) {
for (auto& m : remoteResult[remoteLoc].mutations) {
DEBUG_MUTATION("ChangeFeedWriteMove", remoteVersion, m, data->thisServerID)
.detail("Range", range)
.detail("ChangeFeedID", rangeId);
}
}
remoteLoc++;
}
if (localVersion <= remoteVersion) {
// Do this once per wait instead of once per version for efficiency
data->fetchingChangeFeeds.insert(changeFeedInfo->id);
Standalone<MutationsAndVersionRef> _localResult = waitNext(localResults.getFuture());
localResult = _localResult;

data->storage.writeKeyValue(
KeyValueRef(changeFeedDurableKey(rangeId, remoteVersion),
changeFeedDurableValue(remoteResult[remoteLoc].mutations,
remoteResult[remoteLoc].knownCommittedVersion)));
++data->counters.kvSystemClearRanges;
changeFeedInfo->fetchVersion = std::max(changeFeedInfo->fetchVersion, remoteVersion);

if (firstVersion == invalidVersion) {
firstVersion = remoteVersion;
}
lastVersion = remoteVersion;
versionsFetched++;
} else {
CODE_PROBE(true, "Change feed ignoring write on move because it was popped concurrently");
if (MUTATION_TRACKING_ENABLED) {
for (auto& m : remoteResult[remoteLoc].mutations) {
DEBUG_MUTATION("ChangeFeedWriteMoveIgnore", remoteVersion, m, data->thisServerID)
.detail("Range", range)
.detail("ChangeFeedID", rangeId)
.detail("EmptyVersion", changeFeedInfo->emptyVersion);
}
}
if (versionsFetched > 0) {
ASSERT(firstVersion != invalidVersion);
ASSERT(lastVersion != invalidVersion);
data->storage.clearRange(
KeyRangeRef(changeFeedDurableKey(changeFeedInfo->id, firstVersion),
changeFeedDurableKey(changeFeedInfo->id, lastVersion + 1)));
++data->counters.kvSystemClearRanges;
firstVersion = invalidVersion;
lastVersion = invalidVersion;
versionsFetched = 0;
}
}
remoteLoc++;
}
// Do this once per wait instead of once per version for efficiency
data->fetchingChangeFeeds.insert(changeFeedInfo->id);
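The old fetchChangeFeedApplier body removed above interleaved two version-sorted streams: remote entries were durably written, absorbing the local entry's mutations when versions collided; the local cursor advanced whenever localVersion <= remoteVersion, and the remote cursor advanced whenever remoteVersion <= localVersion. A minimal sketch of that merge discipline over plain vectors, leaving out the waits, popped-version checks, and persistence of the real actor:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct VersionedMutations {
    int64_t version;
    std::vector<std::string> mutations;
};

// Merge two version-sorted streams the way the old applier did: only remote
// versions are emitted (local-only versions are already durable locally), and
// a remote entry at the same version as a local entry absorbs its mutations.
std::vector<VersionedMutations> mergeStreams(const std::vector<VersionedMutations>& local,
                                             std::vector<VersionedMutations> remote) {
    std::vector<VersionedMutations> out;
    size_t l = 0, r = 0;
    while (r < remote.size()) {
        VersionedMutations entry = remote[r];
        if (l < local.size() && local[l].version == entry.version) {
            // same version: append the local mutations behind the remote ones
            entry.mutations.insert(entry.mutations.end(), local[l].mutations.begin(), local[l].mutations.end());
        }
        if (l >= local.size() || entry.version <= local[l].version) {
            out.push_back(entry); // write the (possibly merged) remote entry
            r++;
        }
        if (l < local.size() && local[l].version <= entry.version) {
            l++; // local is at or behind remote; pull the next local entry
        }
    }
    return out;
}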
@ -350,7 +350,7 @@ Future<bool> CompoundWorkload::check(Database const& cx) {
.detail("Name", workloadName)
.detail("Remaining", *wCount)
.detail("Phase", "End");
return true;
return ret;
},
workload.check(cx));
};

@ -384,19 +384,21 @@ void CompoundWorkload::addFailureInjection(WorkloadRequest& work) {
if (!work.runFailureWorkloads || !FLOW_KNOBS->ENABLE_SIMULATION_IMPROVEMENTS) {
return;
}
// Some common workloads won't work with failure injection workloads
// Some workloads won't work with some failure injection workloads
std::set<std::string> disabledWorkloads;
for (auto const& w : workloads) {
auto desc = w->description();
if (desc == "ChangeConfig") {
return;
} else if (desc == "SaveAndKill") {
return;
}
w->disableFailureInjectionWorkloads(disabledWorkloads);
}
if (disabledWorkloads.count("all") > 0) {
return;
}
auto& factories = IFailureInjectorFactory::factories();
DeterministicRandom random(sharedRandomNumber);
for (auto& factory : factories) {
auto workload = factory->create(*this);
if (disabledWorkloads.count(workload->description()) > 0) {
continue;
}
while (workload->add(random, work, *this)) {
failureInjection.push_back(workload);
workload = factory->create(*this);

@ -419,6 +421,8 @@ void CompoundWorkload::getMetrics(std::vector<PerfMetric>&) {
ASSERT(false);
}

void TestWorkload::disableFailureInjectionWorkloads(std::set<std::string>& out) const {}

FailureInjectionWorkload::FailureInjectionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {}

bool FailureInjectionWorkload::add(DeterministicRandom& random,

@ -508,6 +512,8 @@ ACTOR Future<Reference<TestWorkload>> getWorkloadIface(WorkloadRequest work,
wcx.clientId = work.clientId;
wcx.clientCount = work.clientCount;
wcx.sharedRandomNumber = work.sharedRandomNumber;
wcx.ccr = ccr;
wcx.dbInfo = dbInfo;
// FIXME: Other stuff not filled in; why isn't this constructed here and passed down to the other
// getWorkloadIface()?
for (int i = 0; i < work.options.size(); i++) {
@ -270,7 +270,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
StringRef(backupContainer),
{},
deterministicRandom()->randomInt(0, 60),
deterministicRandom()->randomInt(0, 100),
deterministicRandom()->randomInt(0, 2000),
tag.toString(),
backupRanges,
StopWhenDone{ !stopDifferentialDelay },
@ -130,6 +130,13 @@ struct BlobGranuleRangesWorkload : TestWorkload {
return Void();
}

ACTOR Future<Key> versionedForcePurge(Database cx, KeyRange range, Optional<TenantName> tenantName) {
Version rv = deterministicRandom()->coinflip() ? latestVersion : 1;
Key purgeKey = wait(cx->purgeBlobGranules(range, rv, tenantName, true));

return purgeKey;
}

ACTOR Future<Void> unregisterRandomRange(Database cx, BlobGranuleRangesWorkload* self) {
int randomRangeIdx = deterministicRandom()->randomInt(0, self->activeRanges.size());
state KeyRange range = self->activeRanges[randomRangeIdx];

@ -147,7 +154,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
range.begin.printable(),
range.end.printable());
}
Key purgeKey = wait(cx->purgeBlobGranules(range, 1, {}, true));
Key purgeKey = wait(self->versionedForcePurge(cx, range, {}));
wait(cx->waitPurgeGranulesComplete(purgeKey));
}
bool success = wait(self->setRange(cx, range, false));

@ -194,7 +201,11 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}

ACTOR Future<bool> isRangeActive(Database cx, KeyRange range) {
Version v = wait(cx->verifyBlobRange(range, {}));
Optional<Version> rv;
if (deterministicRandom()->coinflip()) {
rv = latestVersion;
}
state Version v = wait(cx->verifyBlobRange(range, rv));
return v != invalidVersion;
}

@ -307,7 +318,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}

// tear down range at end
Key purgeKey = wait(cx->purgeBlobGranules(range, 1, {}, true));
Key purgeKey = wait(self->versionedForcePurge(cx, range, {}));
wait(cx->waitPurgeGranulesComplete(purgeKey));
bool success = wait(self->setRange(cx, range, false));
ASSERT(success);

@ -533,7 +544,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}

// tear down + check that un-blobbifying at a non-aligned range also doesn't work
Key purgeKey = wait(cx->purgeBlobGranules(activeRange, 1, {}, true));
Key purgeKey = wait(self->versionedForcePurge(cx, activeRange, {}));
wait(cx->waitPurgeGranulesComplete(purgeKey));

if (deterministicRandom()->coinflip()) {

@ -586,7 +597,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
wait(self->checkRange(cx, self, range, true));

// force purge range
Key purgeKey = wait(cx->purgeBlobGranules(range, 1, {}, true));
Key purgeKey = wait(self->versionedForcePurge(cx, range, {}));
wait(cx->waitPurgeGranulesComplete(purgeKey));
wait(self->checkRange(cx, self, range, false));

@ -307,14 +307,14 @@ struct BlobGranuleVerifierWorkload : TestWorkload {

TraceEvent("BlobGranuleVerifierStart");
if (BGV_DEBUG) {
printf("BGV thread starting\n");
fmt::print("BGV {0}) thread starting\n", self->clientId);
}

// wait for first set of ranges to be loaded
wait(self->granuleRanges.onChange());

if (BGV_DEBUG) {
printf("BGV got ranges\n");
fmt::print("BGV {0}) got ranges\n", self->clientId);
}

loop {

@ -468,7 +468,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
try {
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, fdb.second, {}, false));
if (BGV_DEBUG) {
fmt::print("BGV Purged Latest @ {0}, waiting\n", fdb.second);
fmt::print("BGV {0}) Purged Latest @ {1}, waiting\n", self->clientId, fdb.second);
}
wait(cx->waitPurgeGranulesComplete(purgeKey));
} catch (Error& e) {

@ -506,7 +506,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
throw;
}
if (e.code() != error_code_blob_granule_transaction_too_old && BGV_DEBUG) {
printf("BGVerifier got unexpected error %s\n", e.name());
fmt::print("BGVerifier {0} got unexpected error {1}\n", self->clientId, e.name());
}
self->errors++;
}

@ -987,7 +987,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}

if (BGV_DEBUG) {
fmt::print("BGV Final data check complete, checked {0} rows\n", totalRows);
fmt::print("BGV {0}) Final data check complete, checked {1} rows\n", self->clientId, totalRows);
}

return true;

@ -1010,12 +1010,14 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
} else if (self->enablePurging && self->purgeAtLatest && deterministicRandom()->coinflip()) {
Version latestPurgeVersion = wait(self->doGrv(&tr));
if (BGV_DEBUG) {
fmt::print("BGV Purging Latest @ {0} before final availability check\n", latestPurgeVersion);
fmt::print("BGV {0}) Purging Latest @ {1} before final availability check\n",
self->clientId,
latestPurgeVersion);
}
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, latestPurgeVersion, {}, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
if (BGV_DEBUG) {
fmt::print("BGV Purged Latest before final availability check complete\n");
fmt::print("BGV {0}) Purged Latest before final availability check complete\n", self->clientId);
}
self->purges++;
}

@ -1023,7 +1025,6 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
// check error counts, and do an availability check at the end

if (self->doSetup && self->initAtEnd) {
// FIXME: this doesn't check the data contents post-conversion, just that it finishes successfully
wait(self->setUpBlobRange(cx));
}

@ -1066,7 +1067,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
for (auto& range : allRanges) {
state KeyRange r = range;
if (BGV_DEBUG) {
fmt::print("Final availability check [{0} - {1}) @ {2}\n",
fmt::print("BGV {0}) Final availability check [{1} - {2}) @ {3}\n",
self->clientId,
r.begin.printable(),
r.end.printable(),
readVersion);

@ -1159,12 +1161,14 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
} else if (self->enablePurging && self->purgeAtLatest && deterministicRandom()->coinflip()) {
Version latestPurgeVersion = wait(self->doGrv(&tr));
if (BGV_DEBUG) {
fmt::print("BGV Purging Latest @ {0} after final availability check, waiting\n", latestPurgeVersion);
fmt::print("BGV {0}) Purging Latest @ {1} after final availability check, waiting\n",
self->clientId,
latestPurgeVersion);
}
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, latestPurgeVersion, {}, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
if (BGV_DEBUG) {
fmt::print("BGV Purged Latest after final availability check complete\n");
fmt::print("BGV {0}) Purged Latest after final availability check complete\n", self->clientId);
}
}

@ -1175,29 +1179,39 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
if (self->enablePurging && self->purgeAtLatest && deterministicRandom()->coinflip()) {
Version latestPurgeVersion = wait(self->doGrv(&tr));
if (BGV_DEBUG) {
fmt::print("BGV Purging Latest @ {0} after clearAndAwaitMerge, waiting\n", latestPurgeVersion);
fmt::print("BGV {0}) Purging Latest @ {1} after clearAndAwaitMerge, waiting\n",
self->clientId,
latestPurgeVersion);
}
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, latestPurgeVersion, {}, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
if (BGV_DEBUG) {
fmt::print("BGV Purged Latest after clearAndAwaitMerge complete\n");
fmt::print("BGV {0}) Purged Latest after clearAndAwaitMerge complete\n", self->clientId);
}
}

if (BGV_DEBUG) {
fmt::print("BGV {0}) Checking data after merge\n", self->clientId);
}

// read after merge to make sure it completed, granules are available, and data is empty
bool dataCheckAfterMerge = wait(self->checkAllData(cx, self));
ASSERT(dataCheckAfterMerge);

if (BGV_DEBUG) {
fmt::print("BGV {0}) Checked data after merge\n", self->clientId);
}
}

if (BGV_DEBUG) {
fmt::print("BGV check waiting on summarizer to complete\n");
fmt::print("BGV {0}) check waiting on summarizer to complete\n", self->clientId);
}

// validate that summary completes without error
wait(self->summaryClient);

if (BGV_DEBUG) {
fmt::print("BGV check done\n");
fmt::print("BGV {0}) check done\n", self->clientId);
}

return result;
@ -49,6 +49,8 @@ struct ChangeConfigWorkload : TestWorkload {

std::string description() const override { return "ChangeConfig"; }

void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("all"); }

Future<Void> start(Database const& cx) override {
if (this->clientId != 0)
return Void();
@ -29,7 +29,7 @@
struct DataDistributionMetricsWorkload : KVWorkload {

int numShards, readPerTx, writePerTx;
int64_t avgBytes;
int64_t avgBytes, transactionTimeLimit;
double testDuration;
std::string keyPrefix;
PerfIntCounter commits, errors;

@ -38,6 +38,8 @@ struct DataDistributionMetricsWorkload : KVWorkload {
DataDistributionMetricsWorkload(WorkloadContext const& wcx)
: KVWorkload(wcx), numShards(0), avgBytes(0), commits("Commits"), errors("Errors") {
testDuration = getOption(options, "testDuration"_sr, 10.0);
// transaction timeout duration (ms)
transactionTimeLimit = getOption(options, "transactionTimeLimit"_sr, 10000);
keyPrefix = getOption(options, "keyPrefix"_sr, "DDMetrics"_sr).toString();
readPerTx = getOption(options, "readPerTransaction"_sr, 1);
writePerTx = getOption(options, "writePerTransaction"_sr, 5 * readPerTx);

@ -73,6 +75,9 @@ struct DataDistributionMetricsWorkload : KVWorkload {
ACTOR Future<Void> resultConsistencyCheckClient(Database cx, DataDistributionMetricsWorkload* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
loop {
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
tr->setOption(FDBTransactionOptions::TIMEOUT,
StringRef((uint8_t*)&self->transactionTimeLimit, sizeof(int64_t)));
try {
wait(delay(self->delayPerLoop));
int startIndex = deterministicRandom()->randomInt(0, self->nodeCount - 1);

@ -88,7 +93,7 @@ struct DataDistributionMetricsWorkload : KVWorkload {
// the range. If we didn't read through the end of the range, then the second last key
// in the result will be the last key less than endKey. (Condition #2)
state KeySelector end = KeySelectorRef(endKey.withPrefix(ddStatsRange.begin, endKey.arena()), false, 2);
RangeResult result = wait(tr->getRange(begin, end, GetRangeLimits(CLIENT_KNOBS->SHARD_COUNT_LIMIT)));
RangeResult result = wait(tr->getRange(begin, end, GetRangeLimits(CLIENT_KNOBS->TOO_MANY)));
// Condition #1 and #2 can be broken if multiple rpc calls happened in one getRange
if (result.size() > 1) {
if (result[0].key > begin.getKey() || result[1].key <= begin.getKey()) {

@ -100,13 +105,14 @@ struct DataDistributionMetricsWorkload : KVWorkload {
.detail("SecondKey", result[1].key)
.detail("BeginKeySelector", begin);
}
if (result[result.size() - 1].key < end.getKey() || result[result.size() - 2].key >= end.getKey()) {
if (!result.readThroughEnd && (result[result.size() - 1].key < end.getKey() ||
result[result.size() - 2].key >= end.getKey())) {
++self->errors;
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Result mismatches the given end selector")
.detail("Size", result.size())
.detail("FirstKey", result[result.size() - 1].key)
.detail("SecondKey", result[result.size() - 2].key)
.detail("LastKey", result[result.size() - 1].key)
.detail("SecondLastKey", result[result.size() - 2].key)
.detail("EndKeySelector", end);
}
// Debugging traces

@ -123,9 +129,10 @@ struct DataDistributionMetricsWorkload : KVWorkload {
}
} catch (Error& e) {
// Ignore timed_out error and cross_module_read; the end key selector may read through the end
if (e.code() == error_code_timed_out || e.code() == error_code_special_keys_cross_module_read)
if (e.code() == error_code_timed_out || e.code() == error_code_transaction_timed_out) {
tr->reset();
continue;
TraceEvent(SevDebug, "FailedToRetrieveDDMetrics").error(e);
}
wait(tr->onError(e));
}
}

@ -139,38 +146,45 @@ struct DataDistributionMetricsWorkload : KVWorkload {
// TODO: find out why this does not work
// wait(quietDatabase(cx, self->dbInfo, "PopulateTPCC"));
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
try {
state RangeResult result = wait(tr->getRange(ddStatsRange, CLIENT_KNOBS->SHARD_COUNT_LIMIT));
ASSERT(!result.more);
self->numShards = result.size();
if (self->numShards < 1)
return false;
state int64_t totalBytes = 0;
auto schema = readJSONStrictly(JSONSchemas::dataDistributionStatsSchema.toString()).get_obj();
for (int i = 0; i < result.size(); ++i) {
ASSERT(result[i].key.startsWith(ddStatsRange.begin));
std::string errorStr;
auto valueObj = readJSONStrictly(result[i].value.toString()).get_obj();
CODE_PROBE(true, "data_distribution_stats schema validation");
if (!schemaMatch(schema, valueObj, errorStr, SevError, true)) {
TraceEvent(SevError, "DataDistributionStatsSchemaValidationFailed")
.detail("ErrorStr", errorStr.c_str())
.detail("JSON", json_spirit::write_string(json_spirit::mValue(result[i].value.toString())));
return false;
state int i;
state int retries = 0;
loop {
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
tr->setOption(FDBTransactionOptions::TIMEOUT,
StringRef((uint8_t*)&self->transactionTimeLimit, sizeof(int64_t)));
try {
state RangeResult result = wait(tr->getRange(ddStatsRange, CLIENT_KNOBS->TOO_MANY));
ASSERT(!result.more);
self->numShards = result.size();
// There's no guarantee that #shards <= CLIENT_KNOBS->SHARD_COUNT_LIMIT all the time
ASSERT(self->numShards >= 1);
state int64_t totalBytes = 0;
auto schema = readJSONStrictly(JSONSchemas::dataDistributionStatsSchema.toString()).get_obj();
for (i = 0; i < result.size(); ++i) {
ASSERT(result[i].key.startsWith(ddStatsRange.begin));
std::string errorStr;
auto valueObj = readJSONStrictly(result[i].value.toString()).get_obj();
CODE_PROBE(true, "data_distribution_stats schema validation");
if (!schemaMatch(schema, valueObj, errorStr, SevError, true)) {
TraceEvent(SevError, "DataDistributionStatsSchemaValidationFailed")
.detail("ErrorStr", errorStr.c_str())
.detail("JSON", json_spirit::write_string(json_spirit::mValue(result[i].value.toString())));
return false;
}
totalBytes += valueObj["shard_bytes"].get_int64();
}
totalBytes += valueObj["shard_bytes"].get_int64();
self->avgBytes = totalBytes / self->numShards;
break;
} catch (Error& e) {
if (e.code() == error_code_timed_out || e.code() == error_code_transaction_timed_out) {
tr->reset();
// The RPC call may in some corner cases get no response
if (++retries > 10)
break;
continue;
}
wait(tr->onError(e));
}
self->avgBytes = totalBytes / self->numShards;
// fetch data-distribution stats for a smaller range
ASSERT(result.size());
state int idx = deterministicRandom()->randomInt(0, result.size());
RangeResult res = wait(tr->getRange(
KeyRangeRef(result[idx].key, idx + 1 < result.size() ? result[idx + 1].key : ddStatsRange.end), 100));
ASSERT_WE_THINK(res.size() == 1 && res[0] == result[idx]); // It works well now; however, we are not sure
// whether the number changes in any case of data distribution
} catch (Error& e) {
TraceEvent(SevError, "FailedToRetrieveDDMetrics").detail("Error", e.what());
throw;
}
return true;
}

@ -188,7 +202,12 @@ struct DataDistributionMetricsWorkload : KVWorkload {
std::string description() const override { return "DataDistributionMetrics"; }
Future<Void> setup(Database const& cx) override { return Void(); }
Future<Void> start(Database const& cx) override { return _start(cx, this); }
Future<bool> check(Database const& cx) override { return _check(cx, this); }

Future<bool> check(Database const& cx) override {
if (clientId == 0)
return _check(cx, this);
return true;
}

void getMetrics(std::vector<PerfMetric>& m) override {
m.emplace_back("NumShards", numShards, Averaged::True);
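The workload sets a per-transaction timeout by passing the raw bytes of an int64 millisecond value to the TIMEOUT option, and treats transaction_timed_out as retryable by resetting the transaction. A small sketch of the byte packing that StringRef((uint8_t*)&limit, sizeof(int64_t)) performs; the little-endian int64 encoding shown matches how FDB client options take integer parameters:

#include <cstdint>
#include <cstring>
#include <vector>

// The TIMEOUT transaction option takes its value as the raw bytes of an
// int64 millisecond count; this helper shows the packing performed inline
// by the workload above.
std::vector<uint8_t> packTimeoutMs(int64_t ms) {
    std::vector<uint8_t> buf(sizeof(int64_t));
    memcpy(buf.data(), &ms, sizeof(int64_t)); // assumes a little-endian host, as FDB does
    return buf;
}

int main() {
    auto v = packTimeoutMs(10000); // 10s, the workload's default transactionTimeLimit
    return v.size() == 8 ? 0 : 1;
}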
@ -46,7 +46,7 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload {
// Verification Mode: We run the workload indefinitely in this mode.
// The idea is to keep going until we get a non-zero chaosMetric to ensure
// that we haven't lost the chaos event. testDuration is ignored in this mode
bool verificationMode;
bool verificationMode = false;

DiskFailureInjectionWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {}

@ -56,6 +56,7 @@ struct MoveKeysWorkload : FailureInjectionWorkload {
state Transaction tr(cx);
loop {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
try {
RangeResult res = wait(tr.getRange(configKeys, 1000));
ASSERT(res.size() < 1000);
@ -45,6 +45,7 @@ struct SaveAndKillWorkload : TestWorkload {
}

std::string description() const override { return "SaveAndKillWorkload"; }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("all"); }
Future<Void> setup(Database const& cx) override {
g_simulator->disableSwapsToAll();
return Void();
@ -3,4 +3,4 @@ add_executable(fdbmonitor ${FDBSERVICE_SRCS})

get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
@ -21,22 +21,9 @@
|
|||
#ifdef _WIN32
|
||||
// This has to come as the first include on Win32 for rand_s() to be found
|
||||
#define _CRT_RAND_S
|
||||
#include <stdlib.h>
|
||||
#include <math.h> // For _set_FMA3_enable workaround in platformInit
|
||||
#endif
|
||||
#endif // _WIN32
|
||||
|
||||
#include <errno.h>
|
||||
#include "fmt/format.h"
|
||||
#include "flow/Platform.h"
|
||||
#include "flow/Platform.actor.h"
|
||||
#include "flow/Arena.h"
|
||||
|
||||
#include "flow/StreamCipher.h"
|
||||
#include "flow/ScopeExit.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/Error.h"
|
||||
|
||||
#include "flow/Knobs.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
@ -46,29 +33,46 @@
#include <string>
#include <string_view>
#include <vector>

#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>

#include <boost/format.hpp>
#include <boost/filesystem.hpp>
#include <boost/filesystem/operations.hpp>
#include <boost/algorithm/string.hpp>

#include <sys/types.h>
#include <time.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "flow/UnitTest.h"
#include "fmt/format.h"

#include "flow/Arena.h"
#include "flow/Error.h"
#include "flow/FaultInjection.h"
#include "flow/Knobs.h"
#include "flow/Platform.actor.h"
#include "flow/ScopeExit.h"
#include "flow/StreamCipher.h"
#include "flow/Trace.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "flow/Util.h"

#ifdef _WIN32
#include <windows.h>
#include <winioctl.h>
#include <io.h>
#include <psapi.h>
#include <stdio.h>

#include <conio.h>
#include <direct.h>
#include <io.h>
#include <math.h> // For _set_FMA3_enable workaround in platformInit
#include <pdh.h>
#include <pdhmsg.h>
#include <processenv.h>
#include <psapi.h>
#include <stdlib.h>
#include <windows.h>
#include <winioctl.h>

#pragma comment(lib, "pdh.lib")

// for SHGetFolderPath

@ -83,19 +87,18 @@
#define CANONICAL_PATH_SEPARATOR '/'

#include <dirent.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <unistd.h>
#include <ftw.h>
#include <pwd.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <unistd.h>
#include <sys/statvfs.h> /* Needed for disk capacity */

#if !defined(__aarch64__) && !defined(__powerpc64__)
#include <cpuid.h>
#endif

/* Needed for disk capacity */
#include <sys/statvfs.h>

/* getifaddrs */
#include <sys/socket.h>
#include <ifaddrs.h>

@ -116,7 +119,7 @@
#include <signal.h>
/* Needed for gnu_dev_{major,minor} */
#include <sys/sysmacros.h>
#endif
#endif // __linux__

#ifdef __FreeBSD__
/* Needed for processor affinity */

@ -149,32 +152,31 @@
#include <devstat.h>
#include <kvm.h>
#include <libutil.h>
#endif
#endif // __FreeBSD__

#ifdef __APPLE__
/* Needed for cross-platform 'environ' */
#include <crt_externs.h>

#include <sys/uio.h>
#include <sys/syslimits.h>
#include <mach/mach.h>
#include <mach-o/dyld.h>
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <net/if.h>
#include <mach/mach.h>
#include <net/if_dl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/syslimits.h>
#include <sys/uio.h>

#include <CoreFoundation/CoreFoundation.h>
#include <IOKit/IOKitLib.h>
#include <IOKit/storage/IOBlockStorageDriver.h>
#include <IOKit/storage/IOMedia.h>
#include <IOKit/IOBSD.h>
#endif
#endif // __APPLE__

#endif
#endif // __unixish__

#include "flow/actorcompiler.h" // This must be the last #include.

@ -408,25 +410,24 @@ uint64_t getMemoryUsage() {
#endif
}

#if defined(__linux__)
#ifdef __linux__
namespace linux_os {

namespace {

void getMemoryInfo(std::map<StringRef, int64_t>& request, std::stringstream& memInfoStream) {
	size_t count = request.size();
	if (count == 0)
		return;

	while (count > 0 && !memInfoStream.eof()) {
		std::string key;

		memInfoStream >> key;
	keyValueReader<std::string, int64_t>(memInfoStream, [&](const std::string& key, const int64_t& value) -> bool {
		auto item = request.find(StringRef(key));
		if (item != request.end()) {
			int64_t value;
			memInfoStream >> value;
		if (item != std::end(request)) {
			item->second = value;
			count--;
			--count;
		}
		memInfoStream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
	}
		return count != 0;
	});
}

int64_t getLowWatermark(std::stringstream& zoneInfoStream) {

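The getMemoryInfo rewrite above swaps a hand-rolled >>-and-ignore loop for the callback-based keyValueReader while keeping the early exit once every requested key has been found (the callback returns count != 0 to keep reading). A self-contained analogue of the new control flow, using plain std::string keys instead of StringRef (illustrative only, not FDB code):

#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

// Fill only the requested keys and stop reading once all have been found.
void getRequestedValues(std::map<std::string, int64_t>& request, std::istream& in) {
	size_t count = request.size();
	std::string line;
	while (count > 0 && std::getline(in, line)) {
		std::istringstream lineParser(line);
		std::string key;
		int64_t value;
		if (!(lineParser >> key >> value))
			continue; // skip malformed lines, e.g. a missing value
		auto item = request.find(key);
		if (item != request.end()) {
			item->second = value;
			--count;
		}
	}
}

int main() {
	std::map<std::string, int64_t> request{ { "MemTotal:", 0 }, { "MemFree:", 0 } };
	std::istringstream meminfo("MemTotal: 24733228 kB\nMemFree: 2077580 kB\nBuffers: 266940 kB\n");
	getRequestedValues(request, meminfo);
	std::cout << request["MemTotal:"] << " " << request["MemFree:"] << "\n"; // 24733228 2077580
}
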
@ -446,10 +447,8 @@ int64_t getLowWatermark(std::stringstream& zoneInfoStream) {

	return lowWatermark;
}
#endif

void getMachineRAMInfo(MachineRAMInfo& memInfo) {
#if defined(__linux__)
void getMachineRAMInfoImpl(MachineRAMInfo& memInfo) {
	std::ifstream zoneInfoFileStream("/proc/zoneinfo", std::ifstream::in);
	int64_t lowWatermark = 0;
	if (!zoneInfoFileStream.good()) {

@ -492,6 +491,35 @@ void getMachineRAMInfo(MachineRAMInfo& memInfo) {
	}

	memInfo.committed = memInfo.total - memInfo.available;
}

} // anonymous namespace

std::map<std::string, int64_t> reportCGroupCpuStat() {
	// Default path to the cpu,cpuacct
	// See manpages for cgroup
	static const std::string PATH_TO_CPU_CPUACCT = "/sys/fs/cgroup/cpu,cpuacct/cpu.stat";

	std::map<std::string, int64_t> result;
	std::ifstream ifs(PATH_TO_CPU_CPUACCT);
	if (!ifs.is_open()) {
		return result;
	}

	keyValueReader<std::string, int64_t>(ifs, [&](const std::string& key, const int64_t& value) -> bool {
		result[key] = value;
		return true;
	});

	return result;
}

} // namespace linux_os
#endif // #ifdef __linux__

void getMachineRAMInfo(MachineRAMInfo& memInfo) {
#if defined(__linux__)
	linux_os::getMachineRAMInfoImpl(memInfo);
#elif defined(__FreeBSD__)
	int status;

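reportCGroupCpuStat reads a cgroup-v1 cpu.stat file, which holds one key/value pair per line (for example nr_periods, nr_throttled, throttled_time), exactly the shape keyValueReader consumes. A hedged usage sketch, Linux-only, relying on the declaration added to flow/Platform.h later in this diff (dumpCGroupCpuStat is a hypothetical helper, not part of the commit):

#ifdef __linux__
#include <cstdio>
#include "flow/Platform.h"

void dumpCGroupCpuStat() {
	// The map is empty if /sys/fs/cgroup/cpu,cpuacct/cpu.stat cannot be
	// opened, e.g. on a cgroup-v2 host or outside a Linux container.
	for (const auto& [key, value] : linux_os::reportCGroupCpuStat()) {
		std::printf("%s = %lld\n", key.c_str(), static_cast<long long>(value));
	}
}
#endif // __linux__
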
@ -4001,14 +4029,14 @@ TEST_CASE("/flow/Platform/getMemoryInfo") {
	};

	std::stringstream memInfoStream(memString);
	getMemoryInfo(request, memInfoStream);
	ASSERT(request["MemTotal:"_sr] == 24733228);
	ASSERT(request["MemFree:"_sr] == 2077580);
	ASSERT(request["MemAvailable:"_sr] == 0);
	ASSERT(request["Buffers:"_sr] == 266940);
	ASSERT(request["Cached:"_sr] == 16798292);
	ASSERT(request["SwapTotal:"_sr] == 25165820);
	ASSERT(request["SwapFree:"_sr] == 23680228);
	linux_os::getMemoryInfo(request, memInfoStream);
	ASSERT(request[LiteralStringRef("MemTotal:")] == 24733228);
	ASSERT(request[LiteralStringRef("MemFree:")] == 2077580);
	ASSERT(request[LiteralStringRef("MemAvailable:")] == 0);
	ASSERT(request[LiteralStringRef("Buffers:")] == 266940);
	ASSERT(request[LiteralStringRef("Cached:")] == 16798292);
	ASSERT(request[LiteralStringRef("SwapTotal:")] == 25165820);
	ASSERT(request[LiteralStringRef("SwapFree:")] == 23680228);
	for (auto& item : request) {
		fmt::print("{}:{}\n", item.first.toString().c_str(), item.second);
	}

@ -4052,14 +4080,14 @@ TEST_CASE("/flow/Platform/getMemoryInfo") {
	    "AnonHugePages: 1275904 kB\n";

	std::stringstream memInfoStream1(memString1);
	getMemoryInfo(request, memInfoStream1);
	ASSERT(request["MemTotal:"_sr] == 31856496);
	ASSERT(request["MemFree:"_sr] == 25492716);
	ASSERT(request["MemAvailable:"_sr] == 28470756);
	ASSERT(request["Buffers:"_sr] == 313644);
	ASSERT(request["Cached:"_sr] == 2956444);
	ASSERT(request["SwapTotal:"_sr] == 0);
	ASSERT(request["SwapFree:"_sr] == 0);
	linux_os::getMemoryInfo(request, memInfoStream1);
	ASSERT(request[LiteralStringRef("MemTotal:")] == 31856496);
	ASSERT(request[LiteralStringRef("MemFree:")] == 25492716);
	ASSERT(request[LiteralStringRef("MemAvailable:")] == 28470756);
	ASSERT(request[LiteralStringRef("Buffers:")] == 313644);
	ASSERT(request[LiteralStringRef("Cached:")] == 2956444);
	ASSERT(request[LiteralStringRef("SwapTotal:")] == 0);
	ASSERT(request[LiteralStringRef("SwapFree:")] == 0);
	for (auto& item : request) {
		fmt::print("{}:{}\n", item.first.toString().c_str(), item.second);
	}

@ -113,16 +113,6 @@ void StreamCipher::cleanup() noexcept {
	}
}

void applyHmacKeyDerivationFunc(StreamCipherKey* cipherKey, HmacSha256StreamCipher* hmacGenerator, Arena& arena) {
	uint8_t buf[cipherKey->size() + sizeof(uint64_t)];
	memcpy(&buf[0], cipherKey->data(), cipherKey->size());
	uint64_t seed = deterministicRandom()->randomUInt64();
	memcpy(&buf[0] + cipherKey->size(), &seed, sizeof(uint64_t));
	StringRef digest = hmacGenerator->digest(&buf[0], cipherKey->size() + sizeof(uint64_t), arena);
	std::copy(digest.begin(), digest.end(), &buf[0]);
	cipherKey->initializeKey(&buf[0], cipherKey->size());
}

EncryptionStreamCipher::EncryptionStreamCipher(const StreamCipherKey* key, const StreamCipher::IV& iv)
  : cipher(StreamCipher(key->size())) {
	EVP_EncryptInit_ex(cipher.getCtx(), EVP_aes_256_gcm(), nullptr, nullptr, nullptr);

@ -173,15 +163,6 @@ HmacSha256StreamCipher::HmacSha256StreamCipher() : cipher(EVP_MAX_KEY_LENGTH) {
	HMAC_Init_ex(cipher.getHmacCtx(), NULL, 0, EVP_sha256(), nullptr);
}

StringRef HmacSha256StreamCipher::digest(unsigned char const* data, int len, Arena& arena) {
	CODE_PROBE(true, "Digest using StreamCipher");
	unsigned int digestLen = HMAC_size(cipher.getHmacCtx());
	auto digest = new (arena) unsigned char[digestLen];
	HMAC_Update(cipher.getHmacCtx(), data, len);
	HMAC_Final(cipher.getHmacCtx(), digest, &digestLen);
	return StringRef(digest, digestLen);
}

StringRef HmacSha256StreamCipher::finish(Arena& arena) {
	unsigned int digestLen = HMAC_size(cipher.getHmacCtx());
	auto digest = new (arena) unsigned char[digestLen];

@ -18,6 +18,8 @@
 * limitations under the License.
 */

#include <fstream>

#include "flow/flow.h"
#include "flow/Histogram.h"
#include "flow/Platform.h"

@ -67,6 +69,35 @@ SystemStatistics getSystemStatistics() {
	    .detail("ApproximateUnusedMemory" #size, FastAllocator<size>::getApproximateMemoryUnused())                   \
	    .detail("ActiveThreads" #size, FastAllocator<size>::getActiveThreads())

namespace {

#ifdef __linux__
// Converts cgroup key, e.g. nr_periods, to NrPeriods
std::string capitalizeCgroupKey(const std::string& key) {
	bool wordStart = true;
	std::string result;
	result.reserve(key.size());

	for (const char ch : key) {
		if (std::isalnum(ch)) {
			if (wordStart) {
				result.push_back(std::toupper(ch));
				wordStart = false;
			} else {
				result.push_back(ch);
			}
		} else {
			// Skip non-alnum characters
			wordStart = true;
		}
	}

	return result;
}
#endif // __linux__

} // anonymous namespace

SystemStatistics customSystemMonitor(std::string const& eventName, StatisticsState* statState, bool machineMetrics) {
	const IPAddress ipAddr = machineState.ip.present() ? machineState.ip.get() : IPAddress();
	SystemStatistics currentStats = getSystemStatistics(

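Given the word-boundary handling above (any non-alphanumeric character starts a new word and is itself dropped), the cgroup cpu.stat keys map as follows. These asserts are illustrative expectations only, not part of the diff:

ASSERT(capitalizeCgroupKey("nr_periods") == "NrPeriods");
ASSERT(capitalizeCgroupKey("nr_throttled") == "NrThrottled");
ASSERT(capitalizeCgroupKey("throttled_time") == "ThrottledTime");
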
@ -284,8 +315,8 @@ SystemStatistics customSystemMonitor(std::string const& eventName, StatisticsSta
	}

	if (machineMetrics) {
		TraceEvent("MachineMetrics")
		    .detail("Elapsed", currentStats.elapsed)
		auto traceEvent = TraceEvent("MachineMetrics");
		traceEvent.detail("Elapsed", currentStats.elapsed)
		    .detail("MbpsSent", currentStats.machineMegabitsSent / currentStats.elapsed)
		    .detail("MbpsReceived", currentStats.machineMegabitsReceived / currentStats.elapsed)
		    .detail("OutSegs", currentStats.machineOutSegs)

@ -298,6 +329,11 @@ SystemStatistics customSystemMonitor(std::string const& eventName, StatisticsSta
		    .detail("ZoneID", machineState.zoneId)
		    .detail("MachineID", machineState.machineId)
		    .trackLatest("MachineMetrics");
#ifdef __linux__
		for (const auto& [k, v] : linux_os::reportCGroupCpuStat()) {
			traceEvent.detail(capitalizeCgroupKey(k).c_str(), v);
		}
#endif // __linux__
	}
}

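These two hunks are one refactor: binding the TraceEvent to a local lets the Linux-only loop append a variable number of cgroup details before the event is emitted when the local is destroyed. A minimal sketch of the pattern (the event name, detail names, and extraMetrics map are hypothetical):

{
	std::map<std::string, int64_t> extraMetrics = { { "NrPeriods", 42 } }; // hypothetical dynamic details
	auto ev = TraceEvent("ExampleMetrics");
	ev.detail("Fixed", 1);
	for (const auto& [k, v] : extraMetrics)
		ev.detail(k.c_str(), v);
} // the event is logged here, carrying both the fixed and the dynamic details
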
@ -143,6 +143,7 @@ inline static T& makeDependent(T& value) {
	return value;
}

#include <map>
#include <string>
#include <vector>

@ -244,6 +245,16 @@ double getProcessorTimeThread();

double getProcessorTimeProcess();

#ifdef __linux__
namespace linux_os {

// Collects the /sys/fs/cgroup/cpu,cpuacct/cpu.stat information and returns the content
// For more information about cpu,cpuacct, check manpages for cgroup
std::map<std::string, int64_t> reportCGroupCpuStat();

} // namespace linux_os
#endif // __linux__

uint64_t getMemoryUsage();

uint64_t getResidentMemoryUsage();

@ -316,6 +327,7 @@ void renameFile(std::string const& fromPath, std::string const& toPath);
void atomicReplace(std::string const& path, std::string const& content, bool textmode = true);

// Read a file into memory
// This requires the file to be seekable
std::string readFileBytes(std::string const& filename, int maxSize);

// Read a file into memory supplied by the caller

@ -772,7 +784,7 @@ int64_t getNumProfilesDeferred();
int64_t getNumProfilesOverflowed();
int64_t getNumProfilesCaptured();

#else
#else // __cplusplus
#define EXTERNC
#endif // __cplusplus

@ -104,8 +104,5 @@ class HmacSha256StreamCipher final : NonCopyable, public ReferenceCounted<HmacSh

public:
	HmacSha256StreamCipher();
	StringRef digest(unsigned char const* data, int len, Arena&);
	StringRef finish(Arena&);
};

void applyHmacKeyDerivationFunc(StreamCipherKey* cipherKey, HmacSha256StreamCipher* hmacGenerator, Arena& arena);

@ -20,9 +20,48 @@

#ifndef _FLOW_UTIL_H_
#define _FLOW_UTIL_H_

#pragma once

#include <algorithm>
#include <functional>
#include <iosfwd>

// Read key/value pairs from a stream. The stream consists of lines of text.
// Each line contains a key/value pair, separated by space/tab. e.g.
//
//   Key1 Value1 trailing characters
//   Key2 Value2 trailing characters
//
// The trailing characters are ignored.
//
// K and V should have
//
//   std::istream& operator>>(std::istream&, K&);
//
// implemented.
template <typename K, typename V>
void keyValueReader(std::istream& stream, std::function<bool(const K&, const V&)> consumer) {
	std::stringstream lineParser;
	std::string line;
	K key;
	V value;
	while (std::getline(stream, line)) {
		lineParser.clear();
		lineParser.str(std::move(line));
		try {
			lineParser >> key >> value;
		} catch (std::ios_base::failure&) {
			continue;
		}
		if (lineParser.fail() || lineParser.bad()) {
			continue;
		}
		if (!consumer(key, value)) {
			break;
		}
	}
}

template <typename C>
void swapAndPop(C* container, int index) {

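A short usage sketch of keyValueReader as declared above. The template arguments must be spelled out because they cannot be deduced through the std::function parameter, and the caller supplies a concrete stream type (hence <sstream> here):

#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "flow/Util.h"

int main() {
	std::istringstream in("MemTotal: 24733228 kB\nMemFree: 2077580 kB\n");
	std::map<std::string, int64_t> parsed;
	keyValueReader<std::string, int64_t>(in, [&](const std::string& k, const int64_t& v) -> bool {
		parsed[k] = v;
		return true; // returning false would stop the scan early
	});
	std::cout << parsed["MemTotal:"] << "\n"; // prints 24733228
}
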
@ -219,6 +219,7 @@ if(WITH_PYTHON)
  add_fdb_test(TEST_FILES rare/CycleRollbackClogged.toml)
  add_fdb_test(TEST_FILES rare/CycleWithKills.toml)
  add_fdb_test(TEST_FILES rare/CycleWithDeadHall.toml)
  add_fdb_test(TEST_FILES rare/DataDistributionMetrics.toml)
  add_fdb_test(TEST_FILES rare/FuzzTest.toml)
  add_fdb_test(TEST_FILES rare/GlobalTagThrottling.toml IGNORE)
  add_fdb_test(TEST_FILES rare/HighContentionPrefixAllocator.toml)

@ -0,0 +1,8 @@
[configuration]
buggify = false

[[test]]
testTitle = 'DataDistributionMetricsTest'

[[test.workload]]
testName = 'DataDistributionMetrics'