Merge remote-tracking branch 'origin/main' into ddteamcollection-boolean-param
This commit is contained in:
commit
eaa699c203
|
@ -91,11 +91,35 @@ if(NOT WIN32)
|
|||
|
||||
set(UNIT_TEST_VERSION_510_SRCS test/unit/unit_tests_version_510.cpp)
|
||||
set(TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS test/unit/trace_partial_file_suffix_test.cpp)
|
||||
set(DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS
|
||||
set(DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS
|
||||
test/unit/disconnected_timeout_tests.cpp
|
||||
test/unit/fdb_api.cpp
|
||||
test/unit/fdb_api.hpp)
|
||||
|
||||
set(API_TESTER_SRCS
|
||||
test/apitester/fdb_c_api_tester.cpp
|
||||
test/apitester/TesterApiWorkload.cpp
|
||||
test/apitester/TesterApiWorkload.h
|
||||
test/apitester/TesterApiWrapper.cpp
|
||||
test/apitester/TesterApiWrapper.h
|
||||
test/apitester/TesterTestSpec.cpp
|
||||
test/apitester/TesterTestSpec.h
|
||||
test/apitester/TesterCancelTransactionWorkload.cpp
|
||||
test/apitester/TesterCorrectnessWorkload.cpp
|
||||
test/apitester/TesterKeyValueStore.cpp
|
||||
test/apitester/TesterKeyValueStore.h
|
||||
test/apitester/TesterOptions.h
|
||||
test/apitester/TesterScheduler.cpp
|
||||
test/apitester/TesterScheduler.h
|
||||
test/apitester/TesterTransactionExecutor.cpp
|
||||
test/apitester/TesterTransactionExecutor.h
|
||||
test/apitester/TesterUtil.cpp
|
||||
test/apitester/TesterUtil.h
|
||||
test/apitester/TesterWorkload.cpp
|
||||
test/apitester/TesterWorkload.h
|
||||
../../flow/SimpleOpt.h
|
||||
)
|
||||
|
||||
if(OPEN_FOR_IDE)
|
||||
add_library(fdb_c_performance_test OBJECT test/performance_test.c test/test.h)
|
||||
add_library(fdb_c_ryw_benchmark OBJECT test/ryw_benchmark.c test/test.h)
|
||||
|
@ -106,6 +130,7 @@ if(NOT WIN32)
|
|||
add_library(fdb_c_unit_tests_version_510 OBJECT ${UNIT_TEST_VERSION_510_SRCS})
|
||||
add_library(trace_partial_file_suffix_test OBJECT ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS})
|
||||
add_library(disconnected_timeout_unit_tests OBJECT ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS})
|
||||
add_library(fdb_c_api_tester OBJECT ${API_TESTER_SRCS})
|
||||
else()
|
||||
add_executable(fdb_c_performance_test test/performance_test.c test/test.h)
|
||||
add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h)
|
||||
|
@ -116,6 +141,7 @@ if(NOT WIN32)
|
|||
add_executable(fdb_c_unit_tests_version_510 ${UNIT_TEST_VERSION_510_SRCS})
|
||||
add_executable(trace_partial_file_suffix_test ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS})
|
||||
add_executable(disconnected_timeout_unit_tests ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS})
|
||||
add_executable(fdb_c_api_tester ${API_TESTER_SRCS})
|
||||
strip_debug_symbols(fdb_c_performance_test)
|
||||
strip_debug_symbols(fdb_c_ryw_benchmark)
|
||||
strip_debug_symbols(fdb_c_txn_size_test)
|
||||
|
@ -138,6 +164,12 @@ if(NOT WIN32)
|
|||
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow)
|
||||
target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads)
|
||||
|
||||
if(USE_SANITIZER)
|
||||
target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_asan)
|
||||
else()
|
||||
target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_target)
|
||||
endif()
|
||||
|
||||
# do not set RPATH for mako
|
||||
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
|
||||
target_link_libraries(mako PRIVATE fdb_c fdbclient)
|
||||
|
@ -163,6 +195,7 @@ if(NOT WIN32)
|
|||
add_custom_target(external_client DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so)
|
||||
add_dependencies(fdb_c_unit_tests external_client)
|
||||
add_dependencies(disconnected_timeout_unit_tests external_client)
|
||||
add_dependencies(fdb_c_api_tester external_client)
|
||||
|
||||
add_fdbclient_test(
|
||||
NAME fdb_c_setup_tests
|
||||
|
@ -200,6 +233,19 @@ if(NOT WIN32)
|
|||
@CLUSTER_FILE@
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
|
||||
)
|
||||
add_fdbclient_test(
|
||||
NAME fdb_c_api_tests
|
||||
DISABLE_LOG_DUMP
|
||||
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
|
||||
--cluster-file
|
||||
@CLUSTER_FILE@
|
||||
--tester-binary
|
||||
$<TARGET_FILE:fdb_c_api_tester>
|
||||
--external-client-library
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
|
||||
--test-dir
|
||||
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
|
||||
)
|
||||
endif()
|
||||
|
||||
set(c_workloads_srcs
|
||||
|
|
|
@ -37,12 +37,14 @@ int g_api_version = 0;
|
|||
* FDBFuture -> ThreadSingleAssignmentVarBase
|
||||
* FDBResult -> ThreadSingleAssignmentVarBase
|
||||
* FDBDatabase -> IDatabase
|
||||
* FDBTenant -> ITenant
|
||||
* FDBTransaction -> ITransaction
|
||||
*/
|
||||
#define TSAVB(f) ((ThreadSingleAssignmentVarBase*)(f))
|
||||
#define TSAV(T, f) ((ThreadSingleAssignmentVar<T>*)(f))
|
||||
|
||||
#define DB(d) ((IDatabase*)d)
|
||||
#define TENANT(t) ((ITenant*)t)
|
||||
#define TXN(t) ((ITransaction*)t)
|
||||
|
||||
// Legacy (pre API version 610)
|
||||
|
@ -386,6 +388,14 @@ extern "C" DLLEXPORT void fdb_database_destroy(FDBDatabase* d) {
|
|||
CATCH_AND_DIE(DB(d)->delref(););
|
||||
}
|
||||
|
||||
extern "C" DLLEXPORT fdb_error_t fdb_database_open_tenant(FDBDatabase* d,
|
||||
uint8_t const* tenant_name,
|
||||
int tenant_name_length,
|
||||
FDBTenant** out_tenant) {
|
||||
CATCH_AND_RETURN(*out_tenant =
|
||||
(FDBTenant*)DB(d)->openTenant(TenantNameRef(tenant_name, tenant_name_length)).extractPtr(););
|
||||
}
|
||||
|
||||
extern "C" DLLEXPORT fdb_error_t fdb_database_create_transaction(FDBDatabase* d, FDBTransaction** out_transaction) {
|
||||
CATCH_AND_RETURN(Reference<ITransaction> tr = DB(d)->createTransaction();
|
||||
if (g_api_version <= 15) tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
|
@ -439,6 +449,17 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db
|
|||
}).extractPtr());
|
||||
}
|
||||
|
||||
extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) {
|
||||
CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr(););
|
||||
}
|
||||
|
||||
extern "C" DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant) {
|
||||
try {
|
||||
TENANT(tenant)->delref();
|
||||
} catch (...) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr) {
|
||||
try {
|
||||
TXN(tr)->delref();
|
||||
|
|
|
@ -67,6 +67,7 @@ extern "C" {
|
|||
typedef struct FDB_future FDBFuture;
|
||||
typedef struct FDB_result FDBResult;
|
||||
typedef struct FDB_database FDBDatabase;
|
||||
typedef struct FDB_tenant FDBTenant;
|
||||
typedef struct FDB_transaction FDBTransaction;
|
||||
|
||||
typedef int fdb_error_t;
|
||||
|
@ -271,6 +272,11 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_set_option(FDBDatabase* d,
|
|||
uint8_t const* value,
|
||||
int value_length);
|
||||
|
||||
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_open_tenant(FDBDatabase* d,
|
||||
uint8_t const* tenant_name,
|
||||
int tenant_name_length,
|
||||
FDBTenant** out_tenant);
|
||||
|
||||
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_create_transaction(FDBDatabase* d,
|
||||
FDBTransaction** out_transaction);
|
||||
|
||||
|
@ -294,6 +300,11 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat
|
|||
|
||||
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version);
|
||||
|
||||
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant,
|
||||
FDBTransaction** out_transaction);
|
||||
|
||||
DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant);
|
||||
|
||||
DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr);
|
||||
|
||||
DLLEXPORT void fdb_transaction_cancel(FDBTransaction* tr);
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
/*
|
||||
* TesterApiWorkload.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterApiWorkload.h"
|
||||
#include "TesterUtil.h"
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
ApiWorkload::ApiWorkload(const WorkloadConfig& config) : WorkloadBase(config) {
|
||||
minKeyLength = config.getIntOption("minKeyLength", 1);
|
||||
maxKeyLength = config.getIntOption("maxKeyLength", 64);
|
||||
minValueLength = config.getIntOption("minValueLength", 1);
|
||||
maxValueLength = config.getIntOption("maxValueLength", 1000);
|
||||
maxKeysPerTransaction = config.getIntOption("maxKeysPerTransaction", 50);
|
||||
initialSize = config.getIntOption("initialSize", 1000);
|
||||
readExistingKeysRatio = config.getFloatOption("readExistingKeysRatio", 0.9);
|
||||
keyPrefix = fmt::format("{}/", workloadId);
|
||||
}
|
||||
|
||||
void ApiWorkload::start() {
|
||||
schedule([this]() {
|
||||
// 1. Clear data
|
||||
clearData([this]() {
|
||||
// 2. Populate initial data
|
||||
populateData([this]() {
|
||||
// 3. Generate random workload
|
||||
runTests();
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
std::string ApiWorkload::randomKeyName() {
|
||||
return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength);
|
||||
}
|
||||
|
||||
std::string ApiWorkload::randomValue() {
|
||||
return Random::get().randomStringLowerCase(minValueLength, maxValueLength);
|
||||
}
|
||||
|
||||
std::string ApiWorkload::randomNotExistingKey() {
|
||||
while (true) {
|
||||
std::string key = randomKeyName();
|
||||
if (!store.exists(key)) {
|
||||
return key;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string ApiWorkload::randomExistingKey() {
|
||||
std::string genKey = randomKeyName();
|
||||
std::string key = store.getKey(genKey, true, 1);
|
||||
if (key != store.endKey()) {
|
||||
return key;
|
||||
}
|
||||
key = store.getKey(genKey, true, 0);
|
||||
if (key != store.startKey()) {
|
||||
return key;
|
||||
}
|
||||
info("No existing key found, using a new random key.");
|
||||
return genKey;
|
||||
}
|
||||
|
||||
std::string ApiWorkload::randomKey(double existingKeyRatio) {
|
||||
if (Random::get().randomBool(existingKeyRatio)) {
|
||||
return randomExistingKey();
|
||||
} else {
|
||||
return randomNotExistingKey();
|
||||
}
|
||||
}
|
||||
|
||||
void ApiWorkload::populateDataTx(TTaskFct cont) {
|
||||
int numKeys = maxKeysPerTransaction;
|
||||
auto kvPairs = std::make_shared<std::vector<KeyValue>>();
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() });
|
||||
}
|
||||
execTransaction(
|
||||
[kvPairs](auto ctx) {
|
||||
for (const KeyValue& kv : *kvPairs) {
|
||||
ctx->tx()->set(kv.key, kv.value);
|
||||
}
|
||||
ctx->commit();
|
||||
},
|
||||
[this, kvPairs, cont]() {
|
||||
for (const KeyValue& kv : *kvPairs) {
|
||||
store.set(kv.key, kv.value);
|
||||
}
|
||||
schedule(cont);
|
||||
});
|
||||
}
|
||||
|
||||
void ApiWorkload::clearData(TTaskFct cont) {
|
||||
execTransaction(
|
||||
[this](auto ctx) {
|
||||
ctx->tx()->clearRange(keyPrefix, fmt::format("{}\xff", keyPrefix));
|
||||
ctx->commit();
|
||||
},
|
||||
[this, cont]() { schedule(cont); });
|
||||
}
|
||||
|
||||
void ApiWorkload::populateData(TTaskFct cont) {
|
||||
if (store.size() < initialSize) {
|
||||
populateDataTx([this, cont]() { populateData(cont); });
|
||||
} else {
|
||||
info("Data population completed");
|
||||
schedule(cont);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,89 @@
|
|||
/*
|
||||
* TesterApiWorkload.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef APITESTER_API_WORKLOAD_H
|
||||
#define APITESTER_API_WORKLOAD_H
|
||||
|
||||
#include "TesterWorkload.h"
|
||||
#include "TesterKeyValueStore.h"
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
/**
|
||||
* Base class for implementing API testing workloads.
|
||||
* Provides various helper methods and reusable configuration parameters
|
||||
*/
|
||||
class ApiWorkload : public WorkloadBase {
|
||||
public:
|
||||
void start() override;
|
||||
|
||||
// Method to be overridden to run specific tests
|
||||
virtual void runTests() = 0;
|
||||
|
||||
protected:
|
||||
// The minimum length of a key
|
||||
int minKeyLength;
|
||||
|
||||
// The maximum length of a key
|
||||
int maxKeyLength;
|
||||
|
||||
// The minimum length of a value
|
||||
int minValueLength;
|
||||
|
||||
// The maximum length of a value
|
||||
int maxValueLength;
|
||||
|
||||
// Maximum number of keys to be accessed by a transaction
|
||||
int maxKeysPerTransaction;
|
||||
|
||||
// Initial data size (number of key-value pairs)
|
||||
int initialSize;
|
||||
|
||||
// The ratio of reading existing keys
|
||||
double readExistingKeysRatio;
|
||||
|
||||
// Key prefix
|
||||
std::string keyPrefix;
|
||||
|
||||
// In-memory store maintaining expected database state
|
||||
KeyValueStore store;
|
||||
|
||||
ApiWorkload(const WorkloadConfig& config);
|
||||
|
||||
// Methods for generating random keys and values
|
||||
std::string randomKeyName();
|
||||
std::string randomValue();
|
||||
std::string randomNotExistingKey();
|
||||
std::string randomExistingKey();
|
||||
std::string randomKey(double existingKeyRatio);
|
||||
|
||||
// Generate initial random data for the workload
|
||||
void populateData(TTaskFct cont);
|
||||
|
||||
// Clear the data of the workload
|
||||
void clearData(TTaskFct cont);
|
||||
|
||||
private:
|
||||
void populateDataTx(TTaskFct cont);
|
||||
};
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* TesterApiWrapper.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "TesterApiWrapper.h"
|
||||
#include "TesterUtil.h"
|
||||
#include <cstdint>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
namespace {
|
||||
|
||||
void fdb_check(fdb_error_t e) {
|
||||
if (e) {
|
||||
fmt::print(stderr, "Unexpected error: %s\n", fdb_get_error(e));
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Future::Future(FDBFuture* f) : future_(f, fdb_future_destroy) {}
|
||||
|
||||
void Future::reset() {
|
||||
future_.reset();
|
||||
}
|
||||
|
||||
void Future::cancel() {
|
||||
ASSERT(future_);
|
||||
fdb_future_cancel(future_.get());
|
||||
}
|
||||
|
||||
fdb_error_t Future::getError() const {
|
||||
ASSERT(future_);
|
||||
return fdb_future_get_error(future_.get());
|
||||
}
|
||||
|
||||
std::optional<std::string> ValueFuture::getValue() const {
|
||||
ASSERT(future_);
|
||||
int out_present;
|
||||
const std::uint8_t* val;
|
||||
int vallen;
|
||||
fdb_check(fdb_future_get_value(future_.get(), &out_present, &val, &vallen));
|
||||
return out_present ? std::make_optional(std::string((const char*)val, vallen)) : std::nullopt;
|
||||
}
|
||||
|
||||
// Given an FDBDatabase, initializes a new transaction.
|
||||
Transaction::Transaction(FDBTransaction* tx) : tx_(tx, fdb_transaction_destroy) {}
|
||||
|
||||
ValueFuture Transaction::get(std::string_view key, fdb_bool_t snapshot) {
|
||||
ASSERT(tx_);
|
||||
return ValueFuture(fdb_transaction_get(tx_.get(), (const uint8_t*)key.data(), key.size(), snapshot));
|
||||
}
|
||||
|
||||
void Transaction::set(std::string_view key, std::string_view value) {
|
||||
ASSERT(tx_);
|
||||
fdb_transaction_set(tx_.get(), (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size());
|
||||
}
|
||||
|
||||
void Transaction::clear(std::string_view key) {
|
||||
ASSERT(tx_);
|
||||
fdb_transaction_clear(tx_.get(), (const uint8_t*)key.data(), key.size());
|
||||
}
|
||||
|
||||
void Transaction::clearRange(std::string_view begin, std::string_view end) {
|
||||
ASSERT(tx_);
|
||||
fdb_transaction_clear_range(
|
||||
tx_.get(), (const uint8_t*)begin.data(), begin.size(), (const uint8_t*)end.data(), end.size());
|
||||
}
|
||||
|
||||
Future Transaction::commit() {
|
||||
ASSERT(tx_);
|
||||
return Future(fdb_transaction_commit(tx_.get()));
|
||||
}
|
||||
|
||||
void Transaction::cancel() {
|
||||
ASSERT(tx_);
|
||||
fdb_transaction_cancel(tx_.get());
|
||||
}
|
||||
|
||||
Future Transaction::onError(fdb_error_t err) {
|
||||
ASSERT(tx_);
|
||||
return Future(fdb_transaction_on_error(tx_.get(), err));
|
||||
}
|
||||
|
||||
void Transaction::reset() {
|
||||
ASSERT(tx_);
|
||||
fdb_transaction_reset(tx_.get());
|
||||
}
|
||||
|
||||
fdb_error_t Transaction::setOption(FDBTransactionOption option) {
|
||||
ASSERT(tx_);
|
||||
return fdb_transaction_set_option(tx_.get(), option, reinterpret_cast<const uint8_t*>(""), 0);
|
||||
}
|
||||
|
||||
fdb_error_t FdbApi::setOption(FDBNetworkOption option, std::string_view value) {
|
||||
return fdb_network_set_option(option, reinterpret_cast<const uint8_t*>(value.data()), value.size());
|
||||
}
|
||||
|
||||
fdb_error_t FdbApi::setOption(FDBNetworkOption option, int64_t value) {
|
||||
return fdb_network_set_option(option, reinterpret_cast<const uint8_t*>(&value), sizeof(value));
|
||||
}
|
||||
|
||||
fdb_error_t FdbApi::setOption(FDBNetworkOption option) {
|
||||
return fdb_network_set_option(option, reinterpret_cast<const uint8_t*>(""), 0);
|
||||
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* TesterApiWrapper.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_API_WRAPPER_H
|
||||
#define APITESTER_API_WRAPPER_H
|
||||
|
||||
#include <string_view>
|
||||
#include <optional>
|
||||
#include <memory>
|
||||
|
||||
#define FDB_API_VERSION 710
|
||||
#include "bindings/c/foundationdb/fdb_c.h"
|
||||
|
||||
#undef ERROR
|
||||
#define ERROR(name, number, description) enum { error_code_##name = number };
|
||||
|
||||
#include "flow/error_definitions.h"
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
// Wrapper parent class to manage memory of an FDBFuture pointer. Cleans up
|
||||
// FDBFuture when this instance goes out of scope.
|
||||
class Future {
|
||||
public:
|
||||
Future() = default;
|
||||
Future(FDBFuture* f);
|
||||
|
||||
FDBFuture* fdbFuture() { return future_.get(); };
|
||||
|
||||
fdb_error_t getError() const;
|
||||
explicit operator bool() const { return future_ != nullptr; };
|
||||
void reset();
|
||||
void cancel();
|
||||
|
||||
protected:
|
||||
std::shared_ptr<FDBFuture> future_;
|
||||
};
|
||||
|
||||
class ValueFuture : public Future {
|
||||
public:
|
||||
ValueFuture() = default;
|
||||
ValueFuture(FDBFuture* f) : Future(f) {}
|
||||
std::optional<std::string> getValue() const;
|
||||
};
|
||||
|
||||
class Transaction {
|
||||
public:
|
||||
Transaction() = default;
|
||||
Transaction(FDBTransaction* tx);
|
||||
ValueFuture get(std::string_view key, fdb_bool_t snapshot);
|
||||
void set(std::string_view key, std::string_view value);
|
||||
void clear(std::string_view key);
|
||||
void clearRange(std::string_view begin, std::string_view end);
|
||||
Future commit();
|
||||
void cancel();
|
||||
Future onError(fdb_error_t err);
|
||||
void reset();
|
||||
fdb_error_t setOption(FDBTransactionOption option);
|
||||
|
||||
private:
|
||||
std::shared_ptr<FDBTransaction> tx_;
|
||||
};
|
||||
|
||||
class FdbApi {
|
||||
public:
|
||||
static fdb_error_t setOption(FDBNetworkOption option, std::string_view value);
|
||||
static fdb_error_t setOption(FDBNetworkOption option, int64_t value);
|
||||
static fdb_error_t setOption(FDBNetworkOption option);
|
||||
};
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
* TesterCancelTransactionWorkload.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "TesterApiWorkload.h"
|
||||
#include "TesterUtil.h"
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
class CancelTransactionWorkload : public ApiWorkload {
|
||||
public:
|
||||
CancelTransactionWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
|
||||
numRandomOperations = config.getIntOption("numRandomOperations", 1000);
|
||||
numOpLeft = numRandomOperations;
|
||||
}
|
||||
|
||||
void runTests() override { randomOperations(); }
|
||||
|
||||
private:
|
||||
enum OpType { OP_CANCEL_GET, OP_CANCEL_AFTER_FIRST_GET, OP_LAST = OP_CANCEL_AFTER_FIRST_GET };
|
||||
|
||||
// The number of operations to be executed
|
||||
int numRandomOperations;
|
||||
|
||||
// Operations counter
|
||||
int numOpLeft;
|
||||
|
||||
// Start multiple concurrent gets and cancel the transaction
|
||||
void randomCancelGetTx(TTaskFct cont) {
|
||||
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
|
||||
auto keys = std::make_shared<std::vector<std::string>>();
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
keys->push_back(randomKey(readExistingKeysRatio));
|
||||
}
|
||||
execTransaction(
|
||||
[keys](auto ctx) {
|
||||
std::vector<Future> futures;
|
||||
for (const auto& key : *keys) {
|
||||
futures.push_back(ctx->tx()->get(key, false));
|
||||
}
|
||||
ctx->done();
|
||||
},
|
||||
[this, cont]() { schedule(cont); });
|
||||
}
|
||||
|
||||
// Start multiple concurrent gets and cancel the transaction after the first get returns
|
||||
void randomCancelAfterFirstResTx(TTaskFct cont) {
|
||||
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
|
||||
auto keys = std::make_shared<std::vector<std::string>>();
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
keys->push_back(randomKey(readExistingKeysRatio));
|
||||
}
|
||||
execTransaction(
|
||||
[this, keys](auto ctx) {
|
||||
std::vector<ValueFuture> futures;
|
||||
for (const auto& key : *keys) {
|
||||
futures.push_back(ctx->tx()->get(key, false));
|
||||
}
|
||||
for (int i = 0; i < keys->size(); i++) {
|
||||
ValueFuture f = futures[i];
|
||||
auto expectedVal = store.get((*keys)[i]);
|
||||
ctx->continueAfter(f, [expectedVal, f, this, ctx]() {
|
||||
auto val = f.getValue();
|
||||
if (expectedVal != val) {
|
||||
error(fmt::format(
|
||||
"cancelAfterFirstResTx mismatch. expected: {:.80} actual: {:.80}", expectedVal, val));
|
||||
}
|
||||
ctx->done();
|
||||
});
|
||||
}
|
||||
},
|
||||
[this, cont]() { schedule(cont); });
|
||||
}
|
||||
|
||||
void randomOperation(TTaskFct cont) {
|
||||
OpType txType = (OpType)Random::get().randomInt(0, OP_LAST);
|
||||
switch (txType) {
|
||||
case OP_CANCEL_GET:
|
||||
randomCancelGetTx(cont);
|
||||
break;
|
||||
case OP_CANCEL_AFTER_FIRST_GET:
|
||||
randomCancelAfterFirstResTx(cont);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void randomOperations() {
|
||||
if (numOpLeft == 0)
|
||||
return;
|
||||
|
||||
numOpLeft--;
|
||||
randomOperation([this]() { randomOperations(); });
|
||||
}
|
||||
};
|
||||
|
||||
WorkloadFactory<CancelTransactionWorkload> MiscTestWorkloadFactory("CancelTransaction");
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,227 @@
|
|||
/*
|
||||
* TesterCorrectnessWorkload.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "TesterApiWorkload.h"
|
||||
#include "TesterUtil.h"
|
||||
#include <memory>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
class ApiCorrectnessWorkload : public ApiWorkload {
|
||||
public:
|
||||
ApiCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
|
||||
numRandomOperations = config.getIntOption("numRandomOperations", 1000);
|
||||
numOpLeft = numRandomOperations;
|
||||
}
|
||||
|
||||
void runTests() override { randomOperations(); }
|
||||
|
||||
private:
|
||||
enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ };
|
||||
|
||||
// The number of operations to be executed
|
||||
int numRandomOperations;
|
||||
|
||||
// Operations counter
|
||||
int numOpLeft;
|
||||
|
||||
void randomInsertOp(TTaskFct cont) {
|
||||
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
|
||||
auto kvPairs = std::make_shared<std::vector<KeyValue>>();
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() });
|
||||
}
|
||||
execTransaction(
|
||||
[kvPairs](auto ctx) {
|
||||
for (const KeyValue& kv : *kvPairs) {
|
||||
ctx->tx()->set(kv.key, kv.value);
|
||||
}
|
||||
ctx->commit();
|
||||
},
|
||||
[this, kvPairs, cont]() {
|
||||
for (const KeyValue& kv : *kvPairs) {
|
||||
store.set(kv.key, kv.value);
|
||||
}
|
||||
schedule(cont);
|
||||
});
|
||||
}
|
||||
|
||||
void randomCommitReadOp(TTaskFct cont) {
|
||||
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
|
||||
auto kvPairs = std::make_shared<std::vector<KeyValue>>();
|
||||
for (int i = 0; i < numKeys; i++) {
|
||||
kvPairs->push_back(KeyValue{ randomKey(readExistingKeysRatio), randomValue() });
|
||||
}
|
||||
execTransaction(
|
||||
[kvPairs](auto ctx) {
|
||||
for (const KeyValue& kv : *kvPairs) {
|
||||
ctx->tx()->set(kv.key, kv.value);
|
||||
}
|
||||
ctx->commit();
|
||||
},
|
||||
[this, kvPairs, cont]() {
|
||||
for (const KeyValue& kv : *kvPairs) {
|
||||
store.set(kv.key, kv.value);
|
||||
}
|
||||
auto results = std::make_shared<std::vector<std::optional<std::string>>>();
|
||||
execTransaction(
|
||||
[kvPairs, results](auto ctx) {
|
||||
// TODO: Enable after merging with GRV caching
|
||||
// ctx->tx()->setOption(FDB_TR_OPTION_USE_GRV_CACHE);
|
||||
auto futures = std::make_shared<std::vector<Future>>();
|
||||
for (const auto& kv : *kvPairs) {
|
||||
futures->push_back(ctx->tx()->get(kv.key, false));
|
||||
}
|
||||
ctx->continueAfterAll(*futures, [ctx, futures, results]() {
|
||||
results->clear();
|
||||
for (auto& f : *futures) {
|
||||
results->push_back(((ValueFuture&)f).getValue());
|
||||
}
|
||||
ASSERT(results->size() == futures->size());
|
||||
ctx->done();
|
||||
});
|
||||
},
|
||||
[this, kvPairs, results, cont]() {
|
||||
ASSERT(results->size() == kvPairs->size());
|
||||
for (int i = 0; i < kvPairs->size(); i++) {
|
||||
auto expected = store.get((*kvPairs)[i].key);
|
||||
auto actual = (*results)[i];
|
||||
if (actual != expected) {
|
||||
error(
|
||||
fmt::format("randomCommitReadOp mismatch. key: {} expected: {:.80} actual: {:.80}",
|
||||
(*kvPairs)[i].key,
|
||||
expected,
|
||||
actual));
|
||||
ASSERT(false);
|
||||
}
|
||||
}
|
||||
schedule(cont);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Workload operation: issue a batch of random point reads in one transaction
// and verify each returned value against the in-memory reference store.
void randomGetOp(TTaskFct cont) {
    int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
    auto keysToRead = std::make_shared<std::vector<std::string>>();
    auto readResults = std::make_shared<std::vector<std::optional<std::string>>>();
    keysToRead->reserve(numKeys);
    for (int idx = 0; idx < numKeys; idx++) {
        keysToRead->push_back(randomKey(readExistingKeysRatio));
    }
    execTransaction(
        // Transaction body: start one get per key, collect values when all are ready.
        [keysToRead, readResults](auto ctx) {
            auto getFutures = std::make_shared<std::vector<Future>>();
            getFutures->reserve(keysToRead->size());
            for (const auto& k : *keysToRead) {
                getFutures->push_back(ctx->tx()->get(k, false));
            }
            ctx->continueAfterAll(*getFutures, [ctx, getFutures, readResults]() {
                readResults->clear();
                for (auto& fut : *getFutures) {
                    readResults->push_back(((ValueFuture&)fut).getValue());
                }
                ASSERT(readResults->size() == getFutures->size());
                ctx->done();
            });
        },
        // Continuation: compare every fetched value with the reference store.
        [this, keysToRead, readResults, cont]() {
            ASSERT(readResults->size() == keysToRead->size());
            for (int idx = 0; idx < keysToRead->size(); idx++) {
                auto expectedVal = store.get((*keysToRead)[idx]);
                if ((*readResults)[idx] != expectedVal) {
                    error(fmt::format("randomGetOp mismatch. key: {} expected: {:.80} actual: {:.80}",
                                      (*keysToRead)[idx],
                                      expectedVal,
                                      (*readResults)[idx]));
                }
            }
            schedule(cont);
        });
}
|
||||
|
||||
// Workload operation: delete a batch of random existing keys, then mirror the
// deletions in the in-memory reference store once the transaction commits.
void randomClearOp(TTaskFct cont) {
    int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
    auto keysToClear = std::make_shared<std::vector<std::string>>();
    keysToClear->reserve(numKeys);
    for (int idx = 0; idx < numKeys; idx++) {
        keysToClear->push_back(randomExistingKey());
    }
    execTransaction(
        // Transaction body: clear every selected key and commit.
        [keysToClear](auto ctx) {
            for (const auto& k : *keysToClear) {
                ctx->tx()->clear(k);
            }
            ctx->commit();
        },
        // Continuation: apply the same deletions to the reference store.
        [this, keysToClear, cont]() {
            for (const auto& k : *keysToClear) {
                store.clear(k);
            }
            schedule(cont);
        });
}
|
||||
|
||||
// Workload operation: clear a random key range [begin, end), then mirror the
// deletion in the in-memory reference store after the commit succeeds.
void randomClearRangeOp(TTaskFct cont) {
    std::string rangeBegin = randomKeyName();
    std::string rangeEnd = randomKeyName();
    // Normalize the endpoints so that rangeBegin <= rangeEnd.
    if (rangeEnd < rangeBegin) {
        std::swap(rangeBegin, rangeEnd);
    }
    execTransaction(
        [rangeBegin, rangeEnd](auto ctx) {
            ctx->tx()->clearRange(rangeBegin, rangeEnd);
            ctx->commit();
        },
        [this, rangeBegin, rangeEnd, cont]() {
            store.clear(rangeBegin, rangeEnd);
            schedule(cont);
        });
}
|
||||
|
||||
// Choose and execute one random operation. An insert is forced while the
// reference store is empty so later operations have data to work with.
void randomOperation(TTaskFct cont) {
    OpType op = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
    switch (op) {
    case OP_INSERT:
        randomInsertOp(cont);
        break;
    case OP_GET:
        randomGetOp(cont);
        break;
    case OP_CLEAR:
        randomClearOp(cont);
        break;
    case OP_CLEAR_RANGE:
        randomClearRangeOp(cont);
        break;
    case OP_COMMIT_READ:
        randomCommitReadOp(cont);
        break;
    }
}
|
||||
|
||||
// Drive the workload: run operations until the budget (numOpLeft) is spent.
// Each operation schedules this method again as its continuation, forming an
// asynchronous loop rather than unbounded recursion.
void randomOperations() {
    if (numOpLeft == 0) {
        return;
    }
    numOpLeft--;
    randomOperation([this]() { randomOperations(); });
}
|
||||
};
|
||||
|
||||
WorkloadFactory<ApiCorrectnessWorkload> ApiCorrectnessWorkloadFactory("ApiCorrectness");
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
* TesterKeyValueStore.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterKeyValueStore.h"
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
// Get the value associated with a key
|
||||
std::optional<std::string> KeyValueStore::get(std::string_view key) const {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
auto value = store.find(std::string(key));
|
||||
if (value != store.end())
|
||||
return value->second;
|
||||
else
|
||||
return std::optional<std::string>();
|
||||
}
|
||||
|
||||
// Checks if the key exists
|
||||
bool KeyValueStore::exists(std::string_view key) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
return (store.find(std::string(key)) != store.end());
|
||||
}
|
||||
|
||||
// Returns the key designated by a key selector.
// (keyName, orEqual, offset) follow FDB key-selector semantics: resolve a
// reference key relative to keyName, then move |offset| keys forward
// (offset > 0) or backward (offset <= 0). Selectors that run off the front
// resolve to startKey(); off the back, to endKey().
std::string KeyValueStore::getKey(std::string_view keyName, bool orEqual, int offset) const {
    std::unique_lock<std::mutex> lock(mutex);
    // Begin by getting the start key referenced by the key selector
    std::map<std::string, std::string>::const_iterator mapItr = store.lower_bound(keyName);

    // Update the iterator position if necessary based on the value of orEqual
    int count = 0;
    if (offset <= 0) {
        // Backward-oriented selector: step back once unless the selector is
        // "orEqual" and lower_bound landed exactly on keyName.
        if (mapItr == store.end() || keyName != mapItr->first || !orEqual) {
            if (mapItr == store.begin())
                return startKey();

            mapItr--;
        }
    } else {
        if (mapItr == store.end())
            return endKey();

        // Forward-oriented selector: skip past an exact match when orEqual.
        if (keyName == mapItr->first && orEqual) {
            mapItr++;
        }

        // The step above (or lower_bound itself) already consumed one unit
        // of the positive offset.
        count++;
    }

    // Increment the map iterator until the desired offset is reached
    for (; count < abs(offset); count++) {
        if (offset < 0) {
            if (mapItr == store.begin())
                break;

            mapItr--;
        } else {
            if (mapItr == store.end())
                break;

            mapItr++;
        }
    }

    // count < abs(offset) here means we ran off the front before reaching
    // the requested offset.
    if (mapItr == store.end())
        return endKey();
    else if (count == abs(offset))
        return mapItr->first;
    else
        return startKey();
}
|
||||
|
||||
// Gets a range of key-value pairs, returning a maximum of <limit> results.
// Keys are returned in ascending order for forward queries and descending
// order for reverse queries; the range is [begin, end) in both cases.
std::vector<KeyValue> KeyValueStore::getRange(std::string_view begin,
                                              std::string_view end,
                                              int limit,
                                              bool reverse) const {
    std::unique_lock<std::mutex> lock(mutex);
    std::vector<KeyValue> results;
    if (!reverse) {
        std::map<std::string, std::string>::const_iterator mapItr = store.lower_bound(begin);

        // NOTE(review): results.size() (unsigned) is compared against the
        // signed `limit`; a negative limit would yield no results here.
        for (; mapItr != store.end() && mapItr->first < end && results.size() < limit; mapItr++)
            results.push_back(KeyValue{ mapItr->first, mapItr->second });
    }

    // Support for reverse getRange queries is supported, but not tested at this time. This is because reverse range
    // queries have been disallowed by the database at the API level
    else {
        // Start just below `end` (the range is end-exclusive) and walk
        // backwards until `begin` is passed or the limit is reached.
        std::map<std::string, std::string>::const_iterator mapItr = store.lower_bound(end);
        if (mapItr == store.begin())
            return results;

        for (--mapItr; mapItr->first >= begin && results.size() < abs(limit); mapItr--) {
            results.push_back(KeyValue{ mapItr->first, mapItr->second });
            // Explicit break: decrementing begin() is undefined behavior.
            if (mapItr == store.begin())
                break;
        }
    }

    return results;
}
|
||||
|
||||
// Stores a key-value pair in the database
|
||||
void KeyValueStore::set(std::string_view key, std::string_view value) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
store[std::string(key)] = value;
|
||||
}
|
||||
|
||||
// Removes a key from the database
|
||||
void KeyValueStore::clear(std::string_view key) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
auto iter = store.find(key);
|
||||
if (iter != store.end()) {
|
||||
store.erase(iter);
|
||||
}
|
||||
}
|
||||
|
||||
// Removes a range of keys from the database
|
||||
void KeyValueStore::clear(std::string_view begin, std::string_view end) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
store.erase(store.lower_bound(begin), store.lower_bound(end));
|
||||
}
|
||||
|
||||
// The number of keys in the database
|
||||
uint64_t KeyValueStore::size() const {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
return store.size();
|
||||
}
|
||||
|
||||
// The first key in the database; returned by key selectors that choose a key off the front
|
||||
std::string KeyValueStore::startKey() const {
|
||||
return "";
|
||||
}
|
||||
|
||||
// The last key in the database; returned by key selectors that choose a key off the back
|
||||
std::string KeyValueStore::endKey() const {
|
||||
return "\xff";
|
||||
}
|
||||
|
||||
// Debugging function that prints all key-value pairs
|
||||
void KeyValueStore::printContents() const {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
printf("Contents:\n");
|
||||
std::map<std::string, std::string>::const_iterator mapItr;
|
||||
for (mapItr = store.begin(); mapItr != store.end(); mapItr++)
|
||||
printf("%s\n", mapItr->first.c_str());
|
||||
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,83 @@
|
|||
/*
|
||||
* TesterKeyValueStore.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_KEY_VALUE_STORE_H
|
||||
#define APITESTER_KEY_VALUE_STORE_H
|
||||
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
struct KeyValue {
|
||||
std::string key;
|
||||
std::string value;
|
||||
};
|
||||
|
||||
// Thread-safe in-memory ordered key-value store. Used by the tester as a
// reference model whose contents are compared against results returned by
// the database (all methods lock the internal mutex).
class KeyValueStore {
public:
	// Get the value associated with a key
	std::optional<std::string> get(std::string_view key) const;

	// Checks if the key exists
	bool exists(std::string_view key);

	// Returns the key designated by a key selector (keyName, orEqual, offset)
	std::string getKey(std::string_view keyName, bool orEqual, int offset) const;

	// Gets a range of key-value pairs in [begin, end), returning a maximum of <limit> results
	std::vector<KeyValue> getRange(std::string_view begin, std::string_view end, int limit, bool reverse) const;

	// Stores a key-value pair in the database
	void set(std::string_view key, std::string_view value);

	// Removes a key from the database
	void clear(std::string_view key);

	// Removes a range of keys [begin, end) from the database
	void clear(std::string_view begin, std::string_view end);

	// The number of keys in the database
	uint64_t size() const;

	// The first key in the database; returned by key selectors that choose a key off the front
	std::string startKey() const;

	// The last key in the database; returned by key selectors that choose a key off the back
	std::string endKey() const;

	// Debugging function that prints all key-value pairs
	void printContents() const;

private:
	// A map holding the key-value pairs; the transparent comparator
	// std::less<> enables string_view lookups without temporary strings
	std::map<std::string, std::string, std::less<>> store;
	// Guards all access to the map; mutable so const accessors can lock it
	mutable std::mutex mutex;
};
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* TesterOptions.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_TESTER_OPTIONS_H
|
||||
#define APITESTER_TESTER_OPTIONS_H
|
||||
|
||||
#include "TesterTestSpec.h"
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
// Options controlling a tester run. NOTE(review): fields appear to be
// populated from command-line arguments and the test file; defaults here
// apply when an option is not provided — confirm against the option parser.
class TesterOptions {
public:
	// FDB cluster file path
	std::string clusterFile;
	// Whether client tracing is enabled; traceDir/traceFormat/logGroup
	// configure the trace output when it is
	bool trace = false;
	std::string traceDir;
	std::string traceFormat;
	std::string logGroup;
	// Path to an external FDB client library — presumably forwarded to the
	// client's external-client network option; verify against usage
	std::string externalClientLibrary;
	// Path to the *.toml test specification file
	std::string testFile;
	int numFdbThreads;
	int numClientThreads;
	int numDatabases;
	int numClients;
	// Knob name/value pairs to pass to the FDB client
	std::vector<std::pair<std::string, std::string>> knobs;
	// Test specification parsed from testFile
	TestSpec testSpec;
};
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* TesterScheduler.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterScheduler.h"
|
||||
#include "TesterUtil.h"
|
||||
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
#include <boost/asio.hpp>
|
||||
|
||||
using namespace boost::asio;
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
const TTaskFct NO_OP_TASK = []() {};
|
||||
|
||||
// IScheduler implementation running tasks on a pool of threads that all
// service a single boost::asio io_context.
class AsioScheduler : public IScheduler {
public:
	AsioScheduler(int numThreads) : numThreads(numThreads) {}

	void start() override {
		// Acquire a work guard so io_ctx.run() keeps blocking while the
		// task queue is empty instead of returning immediately.
		work = require(io_ctx.get_executor(), execution::outstanding_work.tracked);
		for (int i = 0; i < numThreads; i++) {
			threads.emplace_back([this]() { io_ctx.run(); });
		}
	}

	// Post a task for asynchronous execution on one of the pool threads.
	void schedule(TTaskFct task) override { post(io_ctx, task); }

	// Dropping the work guard lets io_ctx.run() return (and the pool threads
	// exit) once the already queued tasks have drained.
	void stop() override { work = any_io_executor(); }

	void join() override {
		for (auto& th : threads) {
			th.join();
		}
	}

private:
	// Number of worker threads to create in start()
	int numThreads;
	std::vector<std::thread> threads;
	io_context io_ctx;
	// Work guard keeping io_ctx alive; reset by stop()
	any_io_executor work;
};
|
||||
|
||||
std::unique_ptr<IScheduler> createScheduler(int numThreads) {
|
||||
ASSERT(numThreads > 0 && numThreads <= 1000);
|
||||
return std::make_unique<AsioScheduler>(numThreads);
|
||||
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,60 @@
|
|||
/*
|
||||
* TesterScheduler.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_SCHEDULER_H
|
||||
#define APITESTER_SCHEDULER_H
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
using TTaskFct = std::function<void(void)>;
|
||||
|
||||
extern const TTaskFct NO_OP_TASK;
|
||||
|
||||
/**
 * Scheduler for asynchronous execution of tasks on a pool of threads
 */
class IScheduler {
public:
	virtual ~IScheduler() {}

	// Create scheduler threads and begin accepting tasks
	virtual void start() = 0;

	// Schedule a task for asynchronous execution
	virtual void schedule(TTaskFct task) = 0;

	// Gracefully stop the scheduler. Waits for already running tasks to finish
	virtual void stop() = 0;

	// Join with all threads of the scheduler
	virtual void join() = 0;
};
|
||||
|
||||
// create a scheduler using given number of threads
|
||||
std::unique_ptr<IScheduler> createScheduler(int numThreads);
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* TesterTestSpec.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterTestSpec.h"
#include "TesterUtil.h"

#include <cerrno>
#include <cstdlib>
#include <functional>

#include <fmt/format.h>
#include <toml.hpp>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
namespace {
|
||||
|
||||
void processIntOption(const std::string& value, const std::string& optionName, int& res, int minVal, int maxVal) {
|
||||
char* endptr;
|
||||
res = strtol(value.c_str(), &endptr, 10);
|
||||
if (*endptr != '\0') {
|
||||
throw TesterError(fmt::format("Invalid test file. Invalid value {} for {}", value, optionName));
|
||||
}
|
||||
if (res < minVal || res > maxVal) {
|
||||
throw TesterError(
|
||||
fmt::format("Invalid test file. Value for {} must be between {} and {}", optionName, minVal, maxVal));
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatch table mapping each recognized test-level key of the TOML test
// file to a handler that parses the (stringified) value and applies it to
// the TestSpec. Unrecognized keys are rejected by the caller
// (readTomlTestSpec). Integer options are range-checked by processIntOption;
// boolean options treat any value other than "true" as false.
std::unordered_map<std::string, std::function<void(const std::string& value, TestSpec* spec)>> testSpecTestKeys = {
	{ "title",
	  [](const std::string& value, TestSpec* spec) { //
	      spec->title = value;
	  } },
	{ "apiVersion",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "apiVersion", spec->apiVersion, 700, 710);
	  } },
	{ "blockOnFutures",
	  [](const std::string& value, TestSpec* spec) { //
	      spec->blockOnFutures = (value == "true");
	  } },
	{ "buggify",
	  [](const std::string& value, TestSpec* spec) { //
	      spec->buggify = (value == "true");
	  } },
	{ "multiThreaded",
	  [](const std::string& value, TestSpec* spec) { //
	      spec->multiThreaded = (value == "true");
	  } },
	{ "fdbCallbacksOnExternalThreads",
	  [](const std::string& value, TestSpec* spec) { //
	      spec->fdbCallbacksOnExternalThreads = (value == "true");
	  } },
	{ "databasePerTransaction",
	  [](const std::string& value, TestSpec* spec) { //
	      spec->databasePerTransaction = (value == "true");
	  } },
	{ "minFdbThreads",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "minFdbThreads", spec->minFdbThreads, 1, 1000);
	  } },
	{ "maxFdbThreads",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "maxFdbThreads", spec->maxFdbThreads, 1, 1000);
	  } },
	{ "minClientThreads",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "minClientThreads", spec->minClientThreads, 1, 1000);
	  } },
	{ "maxClientThreads",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "maxClientThreads", spec->maxClientThreads, 1, 1000);
	  } },
	{ "minDatabases",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "minDatabases", spec->minDatabases, 1, 1000);
	  } },
	{ "maxDatabases",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "maxDatabases", spec->maxDatabases, 1, 1000);
	  } },
	{ "minClients",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "minClients", spec->minClients, 1, 1000);
	  } },
	{ "maxClients",
	  [](const std::string& value, TestSpec* spec) { //
	      processIntOption(value, "maxClients", spec->maxClients, 1, 1000);
	  } }
};
|
||||
|
||||
template <typename T>
|
||||
std::string toml_to_string(const T& value) {
|
||||
// TOML formatting converts numbers to strings exactly how they're in the file
|
||||
// and thus, is equivalent to testspec. However, strings are quoted, so we
|
||||
// must remove the quotes.
|
||||
if (value.type() == toml::value_t::string) {
|
||||
const std::string& formatted = toml::format(value);
|
||||
return formatted.substr(1, formatted.size() - 2);
|
||||
} else {
|
||||
return toml::format(value);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Parse a *.toml test file into a TestSpec.
// The file must contain exactly one [test] section; its scalar keys are
// applied via testSpecTestKeys, and each [[test.workload]] table becomes a
// WorkloadSpec (with a mandatory "name" attribute). Throws TesterError on
// any structural or value error.
TestSpec readTomlTestSpec(std::string fileName) {
	TestSpec spec;
	WorkloadSpec workloadSpec;

	const toml::value& conf = toml::parse(fileName);

	// Then parse each test
	const toml::array& tests = toml::find(conf, "test").as_array();
	if (tests.size() == 0) {
		throw TesterError("Invalid test file. No [test] section found");
	} else if (tests.size() > 1) {
		throw TesterError("Invalid test file. More than one [test] section found");
	}

	const toml::value& test = tests[0];

	// First handle all test-level settings
	for (const auto& [k, v] : test.as_table()) {
		// "workload" sub-tables are handled separately below
		if (k == "workload") {
			continue;
		}
		if (testSpecTestKeys.find(k) != testSpecTestKeys.end()) {
			testSpecTestKeys[k](toml_to_string(v), &spec);
		} else {
			throw TesterError(fmt::format(
			    "Invalid test file. Unrecognized test parameter. Name: {}, value {}", k, toml_to_string(v)));
		}
	}

	// And then copy the workload attributes to spec.options
	const toml::array& workloads = toml::find(test, "workload").as_array();
	for (const toml::value& workload : workloads) {
		workloadSpec = WorkloadSpec();
		auto& options = workloadSpec.options;
		// All workload attributes (including "name") are kept as options
		for (const auto& [attrib, v] : workload.as_table()) {
			options[attrib] = toml_to_string(v);
		}
		auto itr = options.find("name");
		if (itr == options.end()) {
			throw TesterError("Invalid test file. Unspecified workload name.");
		}
		workloadSpec.name = itr->second;
		spec.workloads.push_back(workloadSpec);
	}

	return spec;
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* TesterTestSpec.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_CONFIG_READER_H
|
||||
#define APITESTER_CONFIG_READER_H
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#define FDB_API_VERSION 710
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
/// Workload specification
|
||||
struct WorkloadSpec {
|
||||
std::string name;
|
||||
std::unordered_map<std::string, std::string> options;
|
||||
};
|
||||
|
||||
// Test speficification loaded from a *.toml file
|
||||
struct TestSpec {
|
||||
// Title of the test
|
||||
std::string title;
|
||||
|
||||
// FDB API version, using the latest version by default
|
||||
int apiVersion = FDB_API_VERSION;
|
||||
|
||||
// Use blocking waits on futures instead of scheduling callbacks
|
||||
bool blockOnFutures = false;
|
||||
|
||||
// Use multi-threaded FDB client
|
||||
bool multiThreaded = false;
|
||||
|
||||
// Enable injection of errors in FDB client
|
||||
bool buggify = false;
|
||||
|
||||
// Execute future callbacks on the threads of the external FDB library
|
||||
// rather than on the main thread of the local FDB client library
|
||||
bool fdbCallbacksOnExternalThreads = false;
|
||||
|
||||
// Execute each transaction in a separate database instance
|
||||
bool databasePerTransaction = false;
|
||||
|
||||
// Size of the FDB client thread pool (a random number in the [min,max] range)
|
||||
int minFdbThreads = 1;
|
||||
int maxFdbThreads = 1;
|
||||
|
||||
// Size of the thread pool for test workloads (a random number in the [min,max] range)
|
||||
int minClientThreads = 1;
|
||||
int maxClientThreads = 1;
|
||||
|
||||
// Size of the database instance pool (a random number in the [min,max] range)
|
||||
// Each transaction is assigned randomly to one of the databases in the pool
|
||||
int minDatabases = 1;
|
||||
int maxDatabases = 1;
|
||||
|
||||
// Number of workload clients (a random number in the [min,max] range)
|
||||
int minClients = 1;
|
||||
int maxClients = 10;
|
||||
|
||||
// List of workloads with their options
|
||||
std::vector<WorkloadSpec> workloads;
|
||||
};
|
||||
|
||||
// Read the test specification from a *.toml file
|
||||
TestSpec readTomlTestSpec(std::string fileName);
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,471 @@
|
|||
/*
|
||||
* TesterTransactionExecutor.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterTransactionExecutor.h"
|
||||
#include "TesterUtil.h"
|
||||
#include "test/apitester/TesterScheduler.h"
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
// Record the final error code of the transaction and release the reference
// to the transaction context — presumably to break the actor<->context
// reference cycle so both can be destroyed; confirm against the header.
void TransactionActorBase::complete(fdb_error_t err) {
	error = err;
	context = {};
}
|
||||
|
||||
// Execute cont once all given futures are ready and successful. If any
// future fails, the transaction's onError is invoked with the (last
// observed) error instead. Callbacks may fire concurrently, hence the
// atomic counter and error slot shared across them.
void ITransactionContext::continueAfterAll(std::vector<Future> futures, TTaskFct cont) {
	// Number of futures still outstanding; whichever callback decrements it
	// to zero runs the continuation (or triggers retry).
	auto counter = std::make_shared<std::atomic<int>>(futures.size());
	auto errorCode = std::make_shared<std::atomic<fdb_error_t>>(error_code_success);
	// Keep this context alive until the last callback has run.
	auto thisPtr = shared_from_this();
	for (auto& f : futures) {
		continueAfter(
		    f,
		    [thisPtr, f, counter, errorCode, cont]() {
			    if (f.getError() != error_code_success) {
				    (*errorCode) = f.getError();
			    }
			    if (--(*counter) == 0) {
				    if (*errorCode == error_code_success) {
					    // all futures successful -> continue
					    cont();
				    } else {
					    // at least one future failed -> retry the transaction
					    thisPtr->onError(*errorCode);
				    }
			    }
		    },
		    // retryOnError = false: error handling is aggregated here rather
		    // than per-future.
		    false);
	}
}
|
||||
|
||||
/**
 * Transaction context base class, containing reusable functionality
 */
class TransactionContextBase : public ITransactionContext {
public:
	TransactionContextBase(FDBTransaction* tx,
	                       std::shared_ptr<ITransactionActor> txActor,
	                       TTaskFct cont,
	                       IScheduler* scheduler)
	  : fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), txState(TxState::IN_PROGRESS) {}

	// A state machine:
	// IN_PROGRESS -> (ON_ERROR -> IN_PROGRESS)* [-> ON_ERROR] -> DONE
	enum class TxState { IN_PROGRESS, ON_ERROR, DONE };

	Transaction* tx() override { return &fdbTx; }

	// Set a continuation to be executed when a future gets ready
	void continueAfter(Future f, TTaskFct cont, bool retryOnError) override { doContinueAfter(f, cont, retryOnError); }

	// Complete the transaction with a commit
	void commit() override {
		// Check state under the lock, but release it before touching the
		// FDB API; only the state transitions are mutex-protected.
		std::unique_lock<std::mutex> lock(mutex);
		if (txState != TxState::IN_PROGRESS) {
			return;
		}
		lock.unlock();
		Future f = fdbTx.commit();
		auto thisRef = shared_from_this();
		// retryOnError = true: commit failures go through the retry loop.
		doContinueAfter(
		    f, [thisRef]() { thisRef->done(); }, true);
	}

	// Complete the transaction without a commit (for read transactions)
	void done() override {
		std::unique_lock<std::mutex> lock(mutex);
		if (txState != TxState::IN_PROGRESS) {
			return;
		}
		txState = TxState::DONE;
		lock.unlock();
		// cancel transaction so that any pending operations on it
		// fail gracefully
		fdbTx.cancel();
		txActor->complete(error_code_success);
		cleanUp();
		contAfterDone();
	}

protected:
	virtual void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) = 0;

	// Clean up transaction state after completing the transaction
	// Note that the object may live longer, because it is referenced
	// by not yet triggered callbacks
	virtual void cleanUp() {
		ASSERT(txState == TxState::DONE);
		ASSERT(!onErrorFuture);
		txActor = {};
	}

	// Complete the transaction with an (unretriable) error
	void transactionFailed(fdb_error_t err) {
		ASSERT(err != error_code_success);
		std::unique_lock<std::mutex> lock(mutex);
		if (txState == TxState::DONE) {
			return;
		}
		txState = TxState::DONE;
		lock.unlock();
		txActor->complete(err);
		cleanUp();
		contAfterDone();
	}

	// Handle result of a transaction onError call: a successful onError
	// means the error was retryable, so restart the actor; otherwise the
	// transaction fails for good.
	void handleOnErrorResult() {
		ASSERT(txState == TxState::ON_ERROR);
		fdb_error_t err = onErrorFuture.getError();
		onErrorFuture = {};
		if (err) {
			transactionFailed(err);
		} else {
			std::unique_lock<std::mutex> lock(mutex);
			txState = TxState::IN_PROGRESS;
			lock.unlock();
			txActor->start();
		}
	}

	// FDB transaction
	Transaction fdbTx;

	// Actor implementing the transaction workflow
	std::shared_ptr<ITransactionActor> txActor;

	// Mutex protecting access to shared mutable state
	std::mutex mutex;

	// Continuation to be called after completion of the transaction
	TTaskFct contAfterDone;

	// Reference to the scheduler
	IScheduler* scheduler;

	// Transaction execution state
	TxState txState;

	// onError future used in ON_ERROR state
	Future onErrorFuture;
};
|
||||
|
||||
/**
|
||||
* Transaction context using blocking waits to implement continuations on futures
|
||||
*/
|
||||
class BlockingTransactionContext : public TransactionContextBase {
|
||||
public:
|
||||
BlockingTransactionContext(FDBTransaction* tx,
|
||||
std::shared_ptr<ITransactionActor> txActor,
|
||||
TTaskFct cont,
|
||||
IScheduler* scheduler)
|
||||
: TransactionContextBase(tx, txActor, cont, scheduler) {}
|
||||
|
||||
protected:
|
||||
void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) override {
|
||||
auto thisRef = std::static_pointer_cast<BlockingTransactionContext>(shared_from_this());
|
||||
scheduler->schedule(
|
||||
[thisRef, f, cont, retryOnError]() mutable { thisRef->blockingContinueAfter(f, cont, retryOnError); });
|
||||
}
|
||||
|
||||
void blockingContinueAfter(Future f, TTaskFct cont, bool retryOnError) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
if (txState != TxState::IN_PROGRESS) {
|
||||
return;
|
||||
}
|
||||
lock.unlock();
|
||||
fdb_error_t err = fdb_future_block_until_ready(f.fdbFuture());
|
||||
if (err) {
|
||||
transactionFailed(err);
|
||||
return;
|
||||
}
|
||||
err = f.getError();
|
||||
if (err == error_code_transaction_cancelled) {
|
||||
return;
|
||||
}
|
||||
if (err == error_code_success || !retryOnError) {
|
||||
scheduler->schedule([cont]() { cont(); });
|
||||
return;
|
||||
}
|
||||
|
||||
onError(err);
|
||||
}
|
||||
|
||||
virtual void onError(fdb_error_t err) override {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
if (txState != TxState::IN_PROGRESS) {
|
||||
// Ignore further errors, if the transaction is in the error handing mode or completed
|
||||
return;
|
||||
}
|
||||
txState = TxState::ON_ERROR;
|
||||
lock.unlock();
|
||||
|
||||
ASSERT(!onErrorFuture);
|
||||
onErrorFuture = fdbTx.onError(err);
|
||||
fdb_error_t err2 = fdb_future_block_until_ready(onErrorFuture.fdbFuture());
|
||||
if (err2) {
|
||||
transactionFailed(err2);
|
||||
return;
|
||||
}
|
||||
auto thisRef = std::static_pointer_cast<BlockingTransactionContext>(shared_from_this());
|
||||
scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); });
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Transaction context using callbacks to implement continuations on futures
 */
class AsyncTransactionContext : public TransactionContextBase {
public:
    AsyncTransactionContext(FDBTransaction* tx,
                            std::shared_ptr<ITransactionActor> txActor,
                            TTaskFct cont,
                            IScheduler* scheduler)
      : TransactionContextBase(tx, txActor, cont, scheduler) {}

protected:
    // Register cont to be executed by a future-ready callback
    void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) override {
        std::unique_lock<std::mutex> lock(mutex);
        if (txState != TxState::IN_PROGRESS) {
            return;
        }
        // Store the continuation (plus a self-reference keeping this context
        // alive) BEFORE arming the callback: the callback may fire on another
        // thread before fdb_future_set_callback returns
        callbackMap[f.fdbFuture()] = CallbackInfo{ f, cont, shared_from_this(), retryOnError };
        lock.unlock();
        fdb_error_t err = fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this);
        if (err) {
            // Callback could not be armed: undo the registration and fail
            lock.lock();
            callbackMap.erase(f.fdbFuture());
            lock.unlock();
            transactionFailed(err);
        }
    }

    // C-style trampoline for fdb_future_set_callback
    static void futureReadyCallback(FDBFuture* f, void* param) {
        AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param;
        txCtx->onFutureReady(f);
    }

    // Executed when a registered future becomes ready
    void onFutureReady(FDBFuture* f) {
        injectRandomSleep();
        // Hold a reference to this to avoid it to be
        // destroyed before releasing the mutex
        auto thisRef = shared_from_this();
        std::unique_lock<std::mutex> lock(mutex);
        auto iter = callbackMap.find(f);
        ASSERT(iter != callbackMap.end());
        // Copy the callback info out before erasing the map entry
        CallbackInfo cbInfo = iter->second;
        callbackMap.erase(iter);
        if (txState != TxState::IN_PROGRESS) {
            return;
        }
        lock.unlock();
        fdb_error_t err = fdb_future_get_error(f);
        if (err == error_code_transaction_cancelled) {
            // The transaction was completed elsewhere; drop the continuation
            return;
        }
        if (err == error_code_success || !cbInfo.retryOnError) {
            scheduler->schedule(cbInfo.cont);
            return;
        }
        onError(err);
    }

    // Switch to the error-handling state and arm a callback on the
    // transaction's onError future
    virtual void onError(fdb_error_t err) override {
        std::unique_lock<std::mutex> lock(mutex);
        if (txState != TxState::IN_PROGRESS) {
            // Ignore further errors, if the transaction is in the error handing mode or completed
            return;
        }
        txState = TxState::ON_ERROR;
        lock.unlock();

        ASSERT(!onErrorFuture);
        onErrorFuture = tx()->onError(err);
        // Keep this context alive until the onError callback has run
        onErrorThisRef = std::static_pointer_cast<AsyncTransactionContext>(shared_from_this());
        fdb_error_t err2 = fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this);
        if (err2) {
            // Release the future so cleanUp()'s ASSERT(!onErrorFuture) holds
            onErrorFuture = {};
            transactionFailed(err2);
        }
    }

    // C-style trampoline for the onError future callback
    static void onErrorReadyCallback(FDBFuture* f, void* param) {
        AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param;
        txCtx->onErrorReady(f);
    }

    // Executed when the onError future becomes ready; hands the result
    // over to a scheduler thread
    void onErrorReady(FDBFuture* f) {
        injectRandomSleep();
        auto thisRef = onErrorThisRef;
        onErrorThisRef = {};
        scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); });
    }

    void cleanUp() override {
        TransactionContextBase::cleanUp();

        // Cancel all pending operations
        // Note that the callbacks of the cancelled futures will still be called
        std::unique_lock<std::mutex> lock(mutex);
        std::vector<Future> futures;
        for (auto& iter : callbackMap) {
            futures.push_back(iter.second.future);
        }
        lock.unlock();
        // Cancel outside the lock: the cancel callbacks re-enter onFutureReady,
        // which takes the same mutex
        for (auto& f : futures) {
            f.cancel();
        }
    }

    // Inject a random sleep with a low probability (1%, 1-5 ms)
    // NOTE(review): presumably intended to vary callback timing and expose
    // race conditions — confirm
    void injectRandomSleep() {
        if (Random::get().randomBool(0.01)) {
            std::this_thread::sleep_for(std::chrono::milliseconds(Random::get().randomInt(1, 5)));
        }
    }

    // Object references for a future callback
    struct CallbackInfo {
        Future future;
        TTaskFct cont;
        // Keeps the transaction context alive while the callback is pending
        std::shared_ptr<ITransactionContext> thisRef;
        bool retryOnError;
    };

    // Map for keeping track of future waits and holding necessary object references
    std::unordered_map<FDBFuture*, CallbackInfo> callbackMap;

    // Holding reference to this for onError future C callback
    std::shared_ptr<AsyncTransactionContext> onErrorThisRef;
};
|
||||
|
||||
/**
|
||||
* Transaction executor base class, containing reusable functionality
|
||||
*/
|
||||
class TransactionExecutorBase : public ITransactionExecutor {
|
||||
public:
|
||||
TransactionExecutorBase(const TransactionExecutorOptions& options) : options(options), scheduler(nullptr) {}
|
||||
|
||||
void init(IScheduler* scheduler, const char* clusterFile) override {
|
||||
this->scheduler = scheduler;
|
||||
this->clusterFile = clusterFile;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Execute the transaction on the given database instance
|
||||
void executeOnDatabase(FDBDatabase* db, std::shared_ptr<ITransactionActor> txActor, TTaskFct cont) {
|
||||
FDBTransaction* tx;
|
||||
fdb_error_t err = fdb_database_create_transaction(db, &tx);
|
||||
if (err != error_code_success) {
|
||||
txActor->complete(err);
|
||||
cont();
|
||||
} else {
|
||||
std::shared_ptr<ITransactionContext> ctx;
|
||||
if (options.blockOnFutures) {
|
||||
ctx = std::make_shared<BlockingTransactionContext>(tx, txActor, cont, scheduler);
|
||||
} else {
|
||||
ctx = std::make_shared<AsyncTransactionContext>(tx, txActor, cont, scheduler);
|
||||
}
|
||||
txActor->init(ctx);
|
||||
txActor->start();
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
TransactionExecutorOptions options;
|
||||
std::string clusterFile;
|
||||
IScheduler* scheduler;
|
||||
};
|
||||
|
||||
/**
|
||||
* Transaction executor load balancing transactions over a fixed pool of databases
|
||||
*/
|
||||
class DBPoolTransactionExecutor : public TransactionExecutorBase {
|
||||
public:
|
||||
DBPoolTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {}
|
||||
|
||||
~DBPoolTransactionExecutor() override { release(); }
|
||||
|
||||
void init(IScheduler* scheduler, const char* clusterFile) override {
|
||||
TransactionExecutorBase::init(scheduler, clusterFile);
|
||||
for (int i = 0; i < options.numDatabases; i++) {
|
||||
FDBDatabase* db;
|
||||
fdb_error_t err = fdb_create_database(clusterFile, &db);
|
||||
if (err != error_code_success) {
|
||||
throw TesterError(fmt::format("Failed create database with the cluster file '{}'. Error: {}({})",
|
||||
clusterFile,
|
||||
err,
|
||||
fdb_get_error(err)));
|
||||
}
|
||||
databases.push_back(db);
|
||||
}
|
||||
}
|
||||
|
||||
void execute(std::shared_ptr<ITransactionActor> txActor, TTaskFct cont) override {
|
||||
int idx = Random::get().randomInt(0, options.numDatabases - 1);
|
||||
executeOnDatabase(databases[idx], txActor, cont);
|
||||
}
|
||||
|
||||
void release() {
|
||||
for (FDBDatabase* db : databases) {
|
||||
fdb_database_destroy(db);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<FDBDatabase*> databases;
|
||||
};
|
||||
|
||||
/**
|
||||
* Transaction executor executing each transaction on a separate database
|
||||
*/
|
||||
class DBPerTransactionExecutor : public TransactionExecutorBase {
|
||||
public:
|
||||
DBPerTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {}
|
||||
|
||||
void execute(std::shared_ptr<ITransactionActor> txActor, TTaskFct cont) override {
|
||||
FDBDatabase* db = nullptr;
|
||||
fdb_error_t err = fdb_create_database(clusterFile.c_str(), &db);
|
||||
if (err != error_code_success) {
|
||||
txActor->complete(err);
|
||||
cont();
|
||||
}
|
||||
executeOnDatabase(db, txActor, [cont, db]() {
|
||||
fdb_database_destroy(db);
|
||||
cont();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<ITransactionExecutor> createTransactionExecutor(const TransactionExecutorOptions& options) {
|
||||
if (options.databasePerTransaction) {
|
||||
return std::make_unique<DBPerTransactionExecutor>(options);
|
||||
} else {
|
||||
return std::make_unique<DBPoolTransactionExecutor>(options);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,145 @@
|
|||
/*
|
||||
* TesterTransactionExecutor.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_TRANSACTION_EXECUTOR_H
|
||||
#define APITESTER_TRANSACTION_EXECUTOR_H
|
||||
|
||||
#include "TesterOptions.h"
|
||||
#include "TesterApiWrapper.h"
|
||||
#include "TesterScheduler.h"
|
||||
#include <string_view>
|
||||
#include <memory>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
/**
 * Interface to be used for implementation of a concrete transaction
 */
class ITransactionContext : public std::enable_shared_from_this<ITransactionContext> {
public:
    virtual ~ITransactionContext() {}

    // Current FDB transaction
    virtual Transaction* tx() = 0;

    // Schedule a continuation to be executed when the future gets ready
    // retryOnError controls whether transaction is retried in case of an error instead
    // of calling the continuation
    virtual void continueAfter(Future f, TTaskFct cont, bool retryOnError = true) = 0;

    // Complete the transaction with a commit
    virtual void commit() = 0;

    // Retry the transaction on error (invokes the FDB onError machinery)
    virtual void onError(fdb_error_t err) = 0;

    // Mark the transaction as completed without committing it (for read transactions)
    virtual void done() = 0;

    // A continuation to be executed when all of the given futures get ready
    // (non-pure: a default implementation is provided elsewhere)
    virtual void continueAfterAll(std::vector<Future> futures, TTaskFct cont);
};
|
||||
|
||||
/**
 * Interface of an actor object implementing a concrete transaction
 */
class ITransactionActor {
public:
    virtual ~ITransactionActor() {}

    // Initialize with the given transaction context
    virtual void init(std::shared_ptr<ITransactionContext> ctx) = 0;

    // Start execution of the transaction, also called on retries
    virtual void start() = 0;

    // Transaction completion result (error_code_success in case of success)
    virtual fdb_error_t getErrorCode() = 0;

    // Notification about the completion of the transaction
    virtual void complete(fdb_error_t err) = 0;
};
|
||||
|
||||
/**
 * A helper base class for transaction actors
 */
class TransactionActorBase : public ITransactionActor {
public:
    void init(std::shared_ptr<ITransactionContext> ctx) override { context = ctx; }
    fdb_error_t getErrorCode() override { return error; }
    void complete(fdb_error_t err) override;

protected:
    // The transaction context assigned in init()
    std::shared_ptr<ITransactionContext> ctx() { return context; }

private:
    std::shared_ptr<ITransactionContext> context;

    // Completion result reported via getErrorCode(); defaults to success
    fdb_error_t error = error_code_success;
};
|
||||
|
||||
// Type of the lambda functions implementing a transaction
using TTxStartFct = std::function<void(std::shared_ptr<ITransactionContext>)>;

/**
 * A wrapper class for transactions implemented by lambda functions
 */
class TransactionFct : public TransactionActorBase {
public:
    TransactionFct(TTxStartFct startFct) : startFct(startFct) {}

    // Invoke the lambda with the current transaction context
    void start() override { startFct(this->ctx()); }

private:
    // The lambda implementing the transaction body
    TTxStartFct startFct;
};
|
||||
|
||||
/**
 * Configuration of transaction execution mode
 */
struct TransactionExecutorOptions {
    // Use blocking waits on futures (instead of asynchronous callbacks)
    bool blockOnFutures = false;

    // Create each transaction in a separate database instance
    bool databasePerTransaction = false;

    // The size of the database instance pool (used when
    // databasePerTransaction is false)
    int numDatabases = 1;
};
|
||||
|
||||
/**
 * Transaction executor provides an interface for executing transactions
 * It is responsible for instantiating FDB databases and transactions and managing their lifecycle
 * according to the provided options
 */
class ITransactionExecutor {
public:
    virtual ~ITransactionExecutor() {}

    // Prepare the executor for running transactions against the given cluster
    virtual void init(IScheduler* sched, const char* clusterFile) = 0;

    // Execute a transaction actor; cont is invoked after the transaction completes
    virtual void execute(std::shared_ptr<ITransactionActor> tx, TTaskFct cont) = 0;
};

// Create a transaction executor for the given options
std::unique_ptr<ITransactionExecutor> createTransactionExecutor(const TransactionExecutorOptions& options);
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* TesterUtil.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterUtil.h"
|
||||
#include <cstdio>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
// Seed the pseudo-random engine with nondeterministic entropy
Random::Random() {
    random.seed(std::random_device{}());
}
|
||||
|
||||
int Random::randomInt(int min, int max) {
|
||||
return std::uniform_int_distribution<int>(min, max)(random);
|
||||
}
|
||||
|
||||
// Per-thread singleton instance (thread_local, so no synchronization needed)
Random& Random::get() {
    static thread_local Random random;
    return random;
}
|
||||
|
||||
std::string Random::randomStringLowerCase(int minLength, int maxLength) {
|
||||
int length = randomInt(minLength, maxLength);
|
||||
std::string str;
|
||||
str.reserve(length);
|
||||
for (int i = 0; i < length; i++) {
|
||||
str += (char)randomInt('a', 'z');
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
bool Random::randomBool(double trueRatio) {
|
||||
return std::uniform_real_distribution<double>(0.0, 1.0)(random) <= trueRatio;
|
||||
}
|
||||
|
||||
// Print a failed-assertion diagnostic to stderr (used by the ASSERT macro)
void print_internal_error(const char* msg, const char* file, int line) {
    std::fprintf(stderr, "Assertion %s failed @ %s %d:\n", msg, file, line);
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* TesterUtil.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef APITESTER_UTIL_H
|
||||
#define APITESTER_UTIL_H
|
||||
|
||||
#include <random>
|
||||
#include <ostream>
|
||||
#include <optional>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace fmt {

// fmt formatter for std::optional<T>: formats the contained value with T's
// own formatter, or prints the placeholder "<empty>" when there is no value.
template <typename T>
struct formatter<std::optional<T>> : fmt::formatter<T> {

    template <typename FormatContext>
    auto format(const std::optional<T>& opt, FormatContext& ctx) {
        if (opt) {
            // Delegate to the base formatter for the wrapped type
            fmt::formatter<T>::format(*opt, ctx);
            return ctx.out();
        }
        return fmt::format_to(ctx.out(), "<empty>");
    }
};

} // namespace fmt
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
// Pseudo-random generator used by the tester (std::mt19937-based)
class Random {
public:
    // Seeds the engine from std::random_device
    Random();

    // Per-thread singleton instance
    static Random& get();

    // Uniform integer in the inclusive range [min, max]
    int randomInt(int min, int max);

    // Random [a-z] string with length chosen from [minLength, maxLength]
    std::string randomStringLowerCase(int minLength, int maxLength);

    // True with probability trueRatio
    bool randomBool(double trueRatio);

    // The underlying engine (public, so distributions can use it directly)
    std::mt19937 random;
};
|
||||
|
||||
// Exception type for all tester-level errors (configuration, setup, FDB failures)
class TesterError : public std::runtime_error {
public:
    explicit TesterError(const char* message) : std::runtime_error(message) {}
    explicit TesterError(const std::string& message) : std::runtime_error(message) {}
    TesterError(const TesterError&) = default;
    TesterError& operator=(const TesterError&) = default;
    TesterError(TesterError&&) = default;
    TesterError& operator=(TesterError&&) = default;
};
|
||||
|
||||
// Print a failed-assertion diagnostic to stderr (used by the ASSERT macro below)
void print_internal_error(const char* msg, const char* file, int line);

// Always-on assertion that aborts the process instead of throwing
#define ASSERT(condition) \
    do { \
        if (!(condition)) { \
            print_internal_error(#condition, __FILE__, __LINE__); \
            abort(); \
        } \
    } while (false) // For use in destructors, where throwing exceptions is extremely dangerous
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,184 @@
|
|||
/*
|
||||
* TesterWorkload.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterWorkload.h"
|
||||
#include "TesterUtil.h"
|
||||
#include "test/apitester/TesterScheduler.h"
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <fmt/format.h>
|
||||
#include <vector>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
int WorkloadConfig::getIntOption(const std::string& name, int defaultVal) const {
|
||||
auto iter = options.find(name);
|
||||
if (iter == options.end()) {
|
||||
return defaultVal;
|
||||
} else {
|
||||
char* endptr;
|
||||
int intVal = strtol(iter->second.c_str(), &endptr, 10);
|
||||
if (*endptr != '\0') {
|
||||
throw TesterError(
|
||||
fmt::format("Invalid workload configuration. Invalid value {} for {}", iter->second, name));
|
||||
}
|
||||
return intVal;
|
||||
}
|
||||
}
|
||||
|
||||
double WorkloadConfig::getFloatOption(const std::string& name, double defaultVal) const {
|
||||
auto iter = options.find(name);
|
||||
if (iter == options.end()) {
|
||||
return defaultVal;
|
||||
} else {
|
||||
char* endptr;
|
||||
double floatVal = strtod(iter->second.c_str(), &endptr);
|
||||
if (*endptr != '\0') {
|
||||
throw TesterError(
|
||||
fmt::format("Invalid workload configuration. Invalid value {} for {}", iter->second, name));
|
||||
}
|
||||
return floatVal;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize counters from the configuration; the workload identifier is the
// workload name suffixed with the client ID
WorkloadBase::WorkloadBase(const WorkloadConfig& config)
  : manager(nullptr), tasksScheduled(0), numErrors(0), clientId(config.clientId), numClients(config.numClients),
    failed(false) {
    maxErrors = config.getIntOption("maxErrors", 10);
    workloadId = fmt::format("{}{}", config.name, clientId);
}
|
||||
|
||||
// Attach the workload to its managing WorkloadManager (not owned)
void WorkloadBase::init(WorkloadManager* manager) {
    this->manager = manager;
}
|
||||
|
||||
// Schedule a task as part of the workload; no-op if the workload has failed
void WorkloadBase::schedule(TTaskFct task) {
    if (failed) {
        return;
    }
    // Count the task so completion can be detected in scheduledTaskDone()
    tasksScheduled++;
    manager->scheduler->schedule([this, task]() {
        task();
        scheduledTaskDone();
    });
}
|
||||
|
||||
void WorkloadBase::execTransaction(std::shared_ptr<ITransactionActor> tx, TTaskFct cont, bool failOnError) {
|
||||
if (failed) {
|
||||
return;
|
||||
}
|
||||
tasksScheduled++;
|
||||
manager->txExecutor->execute(tx, [this, tx, cont, failOnError]() {
|
||||
fdb_error_t err = tx->getErrorCode();
|
||||
if (tx->getErrorCode() == error_code_success) {
|
||||
cont();
|
||||
} else {
|
||||
std::string msg = fmt::format("Transaction failed with error: {} ({}})", err, fdb_get_error(err));
|
||||
if (failOnError) {
|
||||
error(msg);
|
||||
failed = true;
|
||||
} else {
|
||||
info(msg);
|
||||
cont();
|
||||
}
|
||||
}
|
||||
scheduledTaskDone();
|
||||
});
|
||||
}
|
||||
|
||||
// Log an info message to stderr, prefixed with the workload identifier
void WorkloadBase::info(const std::string& msg) {
    fmt::print(stderr, "[{}] {}\n", workloadId, msg);
}
|
||||
|
||||
// Log an error message and increase the error counter; the workload is
// stopped once the number of errors exceeds maxErrors
void WorkloadBase::error(const std::string& msg) {
    fmt::print(stderr, "[{}] ERROR: {}\n", workloadId, msg);
    numErrors++;
    if (numErrors > maxErrors && !failed) {
        fmt::print(stderr, "[{}] ERROR: Stopping workload after {} errors\n", workloadId, numErrors);
        failed = true;
    }
}
|
||||
|
||||
// Decrease the scheduled task counter; when it drops to 0 the workload is
// finished and the manager is notified (passing the failure status)
void WorkloadBase::scheduledTaskDone() {
    if (--tasksScheduled == 0) {
        if (numErrors > 0) {
            error(fmt::format("Workload failed with {} errors", numErrors.load()));
        } else {
            info("Workload successfully completed");
        }
        manager->workloadDone(this, numErrors > 0);
    }
}
|
||||
|
||||
// Register a workload with the manager; cont is executed after the workload
// completes (used for subworkloads)
void WorkloadManager::add(std::shared_ptr<IWorkload> workload, TTaskFct cont) {
    std::unique_lock<std::mutex> lock(mutex);
    workloads[workload.get()] = WorkloadInfo{ workload, cont };
}
|
||||
|
||||
void WorkloadManager::run() {
|
||||
std::vector<std::shared_ptr<IWorkload>> initialWorkloads;
|
||||
for (auto iter : workloads) {
|
||||
initialWorkloads.push_back(iter.second.ref);
|
||||
}
|
||||
for (auto iter : initialWorkloads) {
|
||||
iter->init(this);
|
||||
}
|
||||
for (auto iter : initialWorkloads) {
|
||||
iter->start();
|
||||
}
|
||||
scheduler->join();
|
||||
if (failed()) {
|
||||
fmt::print(stderr, "{} workloads failed\n", numWorkloadsFailed);
|
||||
} else {
|
||||
fprintf(stderr, "All workloads succesfully completed\n");
|
||||
}
|
||||
}
|
||||
|
||||
// To be called by a workload to notify that it is done; runs the workload's
// continuation, removes it from the registry and stops the scheduler after
// the last workload completes
void WorkloadManager::workloadDone(IWorkload* workload, bool failed) {
    std::unique_lock<std::mutex> lock(mutex);
    auto iter = workloads.find(workload);
    ASSERT(iter != workloads.end());
    // Run the continuation without holding the lock
    lock.unlock();
    iter->second.cont();
    // NOTE(review): iter is reused after the lock was dropped; if add() can
    // run concurrently, an unordered_map rehash could invalidate the
    // iterator — confirm add() cannot race with workloadDone()
    lock.lock();
    workloads.erase(iter);
    if (failed) {
        numWorkloadsFailed++;
    }
    bool done = workloads.empty();
    lock.unlock();
    if (done) {
        // Last workload finished: unblock WorkloadManager::run()
        scheduler->stop();
    }
}
|
||||
|
||||
// Create a workload by name; returns an empty pointer if no factory is
// registered under that name
std::shared_ptr<IWorkload> IWorkloadFactory::create(std::string const& name, const WorkloadConfig& config) {
    auto it = factories().find(name);
    if (it == factories().end())
        return {}; // or throw?
    return it->second->create(config);
}
|
||||
|
||||
// Singleton registry of workload factories (function-local static, so it is
// initialized before any factory registers itself)
std::unordered_map<std::string, IWorkloadFactory*>& IWorkloadFactory::factories() {
    static std::unordered_map<std::string, IWorkloadFactory*> theFactories;
    return theFactories;
}
|
||||
|
||||
} // namespace FdbApiTester
|
|
@ -0,0 +1,205 @@
|
|||
/*
|
||||
* TesterWorkload.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#ifndef APITESTER_WORKLOAD_H
|
||||
#define APITESTER_WORKLOAD_H
|
||||
|
||||
#include "TesterTransactionExecutor.h"
|
||||
#include "TesterUtil.h"
|
||||
#include <atomic>
|
||||
#include <unordered_map>
|
||||
#include <mutex>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
class WorkloadManager;
|
||||
|
||||
// Workload interface
class IWorkload {
public:
    virtual ~IWorkload() {}

    // Initialize the workload
    virtual void init(WorkloadManager* manager) = 0;

    // Start executing the workload
    virtual void start() = 0;
};
|
||||
|
||||
// Workload configuration
struct WorkloadConfig {
    // Workload name
    std::string name;

    // Client ID assigned to the workload (a number from 0 to numClients-1)
    int clientId;

    // Total number of clients
    int numClients;

    // Workload options: as key-value pairs
    std::unordered_map<std::string, std::string> options;

    // Get option of a certain type by name. Throws an exception if the value is of a wrong type
    int getIntOption(const std::string& name, int defaultVal) const;
    double getFloatOption(const std::string& name, double defaultVal) const;
};
|
||||
|
||||
// A base class for test workloads
// Tracks if workload is active, notifies the workload manager when the workload completes
class WorkloadBase : public IWorkload {
public:
    WorkloadBase(const WorkloadConfig& config);

    // Initialize the workload
    void init(WorkloadManager* manager) override;

protected:
    // Schedule a task as part of the workload
    void schedule(TTaskFct task);

    // Execute a transaction within the workload
    void execTransaction(std::shared_ptr<ITransactionActor> tx, TTaskFct cont, bool failOnError = true);

    // Execute a transaction within the workload, a convenience method for a transaction defined by a lambda function
    void execTransaction(TTxStartFct start, TTaskFct cont, bool failOnError = true) {
        execTransaction(std::make_shared<TransactionFct>(start), cont, failOnError);
    }

    // Log an error message, increase error counter
    void error(const std::string& msg);

    // Log an info message
    void info(const std::string& msg);

private:
    // The managing workload manager (not owned)
    WorkloadManager* manager;

    // Decrease scheduled task counter, notify the workload manager
    // that the workload is done if no more tasks are scheduled
    void scheduledTaskDone();

    // Keep track of tasks scheduled by the workload
    // End workload when this number falls to 0
    std::atomic<int> tasksScheduled;

    // Number of errors logged
    std::atomic<int> numErrors;

protected:
    // Client ID assigned to the workload (a number from 0 to numClients-1)
    int clientId;

    // Total number of clients
    int numClients;

    // The maximum number of errors before stopping the workload
    int maxErrors;

    // Workload identifier, consisting of workload name and client ID
    std::string workloadId;

    // Workload is failed, no further transactions or continuations will be scheduled by the workload
    std::atomic<bool> failed;
};
|
||||
|
||||
// Workload manager
// Keeps track of active workloads, stops the scheduler after all workloads complete
class WorkloadManager {
public:
    WorkloadManager(ITransactionExecutor* txExecutor, IScheduler* scheduler)
      : txExecutor(txExecutor), scheduler(scheduler), numWorkloadsFailed(0) {}

    // Add a workload
    // A continuation is to be specified for subworkloads
    void add(std::shared_ptr<IWorkload> workload, TTaskFct cont = NO_OP_TASK);

    // Run all workloads. Blocks until all workloads complete
    void run();

    // True if at least one workload has failed
    bool failed() {
        std::unique_lock<std::mutex> lock(mutex);
        return numWorkloadsFailed > 0;
    }

private:
    friend WorkloadBase;

    // Info about a running workload
    struct WorkloadInfo {
        // Reference to the workload for ownership
        std::shared_ptr<IWorkload> ref;
        // Continuation to be executed after completing the workload
        TTaskFct cont;
    };

    // To be called by a workload to notify that it is done
    void workloadDone(IWorkload* workload, bool failed);

    // Transaction executor to be used by the workloads (not owned)
    ITransactionExecutor* txExecutor;

    // A scheduler to be used by the workloads (not owned)
    IScheduler* scheduler;

    // Mutex protects access to workloads & numWorkloadsFailed
    std::mutex mutex;

    // A map of currently running workloads
    std::unordered_map<IWorkload*, WorkloadInfo> workloads;

    // Number of workloads failed
    int numWorkloadsFailed;
};
|
||||
|
||||
// A workload factory
struct IWorkloadFactory {
	// Create a workload by name
	// Returns an empty pointer when no factory is registered for the name
	// (callers check the result; see runWorkloads)
	static std::shared_ptr<IWorkload> create(std::string const& name, const WorkloadConfig& config);

	// A singleton registry of workload factories, keyed by workload name
	static std::unordered_map<std::string, IWorkloadFactory*>& factories();

	// Interface to be implemented by a workload factory
	virtual ~IWorkloadFactory() = default;
	virtual std::shared_ptr<IWorkload> create(const WorkloadConfig& config) = 0;
};
|
||||
|
||||
/**
 * A template for a workload factory for creating workloads of a certain type
 *
 * Declare a global instance of the factory for a workload type as follows:
 * WorkloadFactory<MyWorkload> MyWorkloadFactory("myWorkload");
 */
template <class WorkloadType>
struct WorkloadFactory : IWorkloadFactory {
	// Registers this factory instance in the global registry under the given name
	WorkloadFactory(const char* name) { factories()[name] = this; }
	// Instantiates the workload type with the given configuration
	std::shared_ptr<IWorkload> create(const WorkloadConfig& config) override {
		return std::make_shared<WorkloadType>(config);
	}
};
|
||||
|
||||
} // namespace FdbApiTester
|
||||
|
||||
#endif
|
|
@ -0,0 +1,284 @@
|
|||
/*
|
||||
* fdb_c_api_tester.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "TesterOptions.h"
|
||||
#include "TesterWorkload.h"
|
||||
#include "TesterScheduler.h"
|
||||
#include "TesterTransactionExecutor.h"
|
||||
#include "TesterTestSpec.h"
|
||||
#include "TesterUtil.h"
|
||||
#include "flow/SimpleOpt.h"
|
||||
#include "bindings/c/foundationdb/fdb_c.h"
|
||||
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <thread>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace FdbApiTester {
|
||||
|
||||
namespace {
|
||||
|
||||
// Identifiers of the supported command line options
enum TesterOptionId {
	OPT_CONNFILE,
	OPT_HELP,
	OPT_TRACE,
	OPT_TRACE_DIR,
	OPT_LOGGROUP,
	OPT_TRACE_FORMAT,
	OPT_KNOB,
	OPT_EXTERNAL_CLIENT_LIBRARY,
	OPT_TEST_FILE
};

// Command line option definitions for CSimpleOpt:
// { option ID, option text, argument kind }.
// SO_REQ_SEP: option requires a separate argument; SO_NONE: option takes no argument.
CSimpleOpt::SOption TesterOptionDefs[] = //
    { { OPT_CONNFILE, "-C", SO_REQ_SEP },
	  { OPT_CONNFILE, "--cluster-file", SO_REQ_SEP },
	  { OPT_TRACE, "--log", SO_NONE },
	  { OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
	  { OPT_LOGGROUP, "--log-group", SO_REQ_SEP },
	  { OPT_HELP, "-h", SO_NONE },
	  { OPT_HELP, "--help", SO_NONE },
	  { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP },
	  { OPT_KNOB, "--knob-", SO_REQ_SEP },
	  { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP },
	  { OPT_TEST_FILE, "-f", SO_REQ_SEP },
	  { OPT_TEST_FILE, "--test-file", SO_REQ_SEP },
	  SO_END_OF_OPTIONS };
|
||||
|
||||
// Prints usage information for the tester's command line options to stdout.
void printProgramUsage(const char* execName) {
	printf("usage: %s [OPTIONS]\n"
	       "\n",
	       execName);
	// Fixed typo in user-facing help text: "Specifes" -> "Specifies"
	printf("  -C, --cluster-file FILE\n"
	       "                 The path of a file containing the connection string for the\n"
	       "                 FoundationDB cluster. The default is `fdb.cluster'\n"
	       "  --log          Enables trace file logging for the CLI session.\n"
	       "  --log-dir PATH Specifies the output directory for trace files. If\n"
	       "                 unspecified, defaults to the current directory. Has\n"
	       "                 no effect unless --log is specified.\n"
	       "  --log-group LOG_GROUP\n"
	       "                 Sets the LogGroup field with the specified value for all\n"
	       "                 events in the trace output (defaults to `default').\n"
	       "  --trace-format FORMAT\n"
	       "                 Select the format of the log files. xml (the default) and json\n"
	       "                 are supported. Has no effect unless --log is specified.\n"
	       "  --knob-KNOBNAME KNOBVALUE\n"
	       "                 Changes a knob option. KNOBNAME should be lowercase.\n"
	       "  --external-client-library FILE\n"
	       "                 Path to the external client library.\n"
	       "  -f, --test-file FILE\n"
	       "                 Test file to run.\n"
	       "  -h, --help     Display this help and exit.\n");
}
|
||||
|
||||
// Extracts the key for command line arguments that are specified with a prefix (e.g. --knob-).
// This function converts any hyphens in the extracted key to underscores.
// Returns false when 'arg' does not start with 'prefix' followed by a '-' or '_' separator;
// on success stores the normalized key in 'res'.
bool extractPrefixedArgument(std::string prefix, const std::string& arg, std::string& res) {
	// Use compare() to test the prefix at position 0 instead of find(),
	// which would scan the whole string on a mismatch.
	if (arg.size() <= prefix.size() || arg.compare(0, prefix.size(), prefix) != 0 ||
	    (arg[prefix.size()] != '-' && arg[prefix.size()] != '_')) {
		return false;
	}

	res = arg.substr(prefix.size() + 1);
	// Normalize: command line options use hyphens, knob names use underscores.
	// A plain loop avoids the std::transform dependency on <algorithm>, which
	// this file never includes directly.
	for (char& c : res) {
		if (c == '-') {
			c = '_';
		}
	}
	return true;
}
|
||||
|
||||
// Checks whether the given trace format name is one of the supported formats.
bool validateTraceFormat(std::string_view format) {
	constexpr std::string_view kSupportedFormats[] = { "xml", "json" };
	for (std::string_view supported : kSupportedFormats) {
		if (format == supported) {
			return true;
		}
	}
	return false;
}
|
||||
|
||||
// Applies a single parsed command line option to the tester options.
// Returns false when the option's value is invalid (an error is printed to stderr).
bool processArg(TesterOptions& options, const CSimpleOpt& args) {
	switch (args.OptionId()) {
	case OPT_CONNFILE:
		options.clusterFile = args.OptionArg();
		break;
	case OPT_TRACE:
		options.trace = true;
		break;
	case OPT_TRACE_DIR:
		options.traceDir = args.OptionArg();
		break;
	case OPT_LOGGROUP:
		options.logGroup = args.OptionArg();
		break;
	case OPT_TRACE_FORMAT:
		// Only "xml" and "json" are accepted (see validateTraceFormat)
		if (!validateTraceFormat(args.OptionArg())) {
			fmt::print(stderr, "ERROR: Unrecognized trace format `{}'\n", args.OptionArg());
			return false;
		}
		options.traceFormat = args.OptionArg();
		break;
	case OPT_KNOB: {
		// Knobs are passed as --knob-<name> <value>; extract <name> from the option text
		std::string knobName;
		if (!extractPrefixedArgument("--knob", args.OptionSyntax(), knobName)) {
			fmt::print(stderr, "ERROR: unable to parse knob option '{}'\n", args.OptionSyntax());
			return false;
		}
		options.knobs.emplace_back(knobName, args.OptionArg());
		break;
	}
	case OPT_EXTERNAL_CLIENT_LIBRARY:
		options.externalClientLibrary = args.OptionArg();
		break;

	case OPT_TEST_FILE:
		options.testFile = args.OptionArg();
		// Parse the TOML test specification right away
		options.testSpec = readTomlTestSpec(options.testFile);
		break;
	}
	return true;
}
|
||||
|
||||
// Parses all command line arguments into 'options'.
// Returns false on a parse error or when help was requested
// (usage information is printed in both cases).
bool parseArgs(TesterOptions& options, int argc, char** argv) {
	// declare our options parser, pass in the arguments from main
	// as well as our array of valid options.
	CSimpleOpt args(argc, argv, TesterOptionDefs);

	// while there are arguments left to process
	while (args.Next()) {
		if (args.LastError() == SO_SUCCESS) {
			if (args.OptionId() == OPT_HELP) {
				printProgramUsage(argv[0]);
				return false;
			}
			if (!processArg(options, args)) {
				return false;
			}
		} else {
			fmt::print(stderr, "ERROR: Invalid argument: {}\n", args.OptionText());
			printProgramUsage(argv[0]);
			return false;
		}
	}
	return true;
}
|
||||
|
||||
// Aborts the process with a diagnostic message if the given
// FDB API call returned a non-zero error code.
void fdb_check(fdb_error_t e) {
	if (e) {
		fmt::print(stderr, "Unexpected FDB error: {}({})\n", e, fdb_get_error(e));
		std::abort();
	}
}
|
||||
|
||||
// Translates the tester options into FDB network options and applies them.
// In main() this runs after fdb_select_api_version() and before fdb_setup_network().
void applyNetworkOptions(TesterOptions& options) {
	if (!options.externalClientLibrary.empty()) {
		// Use only the external client library, not the locally linked client
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT));
		fdb_check(
		    FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, options.externalClientLibrary));
	}

	if (options.testSpec.multiThreaded) {
		fdb_check(
		    FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads));
	}

	if (options.testSpec.fdbCallbacksOnExternalThreads) {
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS));
	}

	if (options.testSpec.buggify) {
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE));
	}

	if (options.trace) {
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir));
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, options.traceFormat));
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_LOG_GROUP, options.logGroup));
	}

	// Iterate by const reference: the original copied each (name, value) string
	// pair per iteration. fmt formats std::string directly, so .c_str() is not needed.
	for (const auto& knob : options.knobs) {
		fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_KNOB,
		                            fmt::format("{}={}", knob.first, knob.second)));
	}
}
|
||||
|
||||
// Picks concrete values for the randomizable options, uniformly within
// the [min, max] ranges given by the test specification.
void randomizeOptions(TesterOptions& options) {
	Random& random = Random::get();
	options.numFdbThreads = random.randomInt(options.testSpec.minFdbThreads, options.testSpec.maxFdbThreads);
	options.numClientThreads = random.randomInt(options.testSpec.minClientThreads, options.testSpec.maxClientThreads);
	options.numDatabases = random.randomInt(options.testSpec.minDatabases, options.testSpec.maxDatabases);
	options.numClients = random.randomInt(options.testSpec.minClients, options.testSpec.maxClients);
}
|
||||
|
||||
// Instantiates and runs all workloads defined in the test specification.
// Creates one instance of each workload per client. Blocks until all
// workloads complete; returns true if none of them failed.
// Throws TesterError if the spec references an unknown workload name.
bool runWorkloads(TesterOptions& options) {
	TransactionExecutorOptions txExecOptions;
	txExecOptions.blockOnFutures = options.testSpec.blockOnFutures;
	txExecOptions.numDatabases = options.numDatabases;
	txExecOptions.databasePerTransaction = options.testSpec.databasePerTransaction;

	std::unique_ptr<IScheduler> scheduler = createScheduler(options.numClientThreads);
	std::unique_ptr<ITransactionExecutor> txExecutor = createTransactionExecutor(txExecOptions);
	scheduler->start();
	txExecutor->init(scheduler.get(), options.clusterFile.c_str());

	WorkloadManager workloadMgr(txExecutor.get(), scheduler.get());
	for (const auto& workloadSpec : options.testSpec.workloads) {
		// One workload instance per client, each with its own client ID
		for (int i = 0; i < options.numClients; i++) {
			WorkloadConfig config;
			config.name = workloadSpec.name;
			config.options = workloadSpec.options;
			config.clientId = i;
			config.numClients = options.numClients;
			std::shared_ptr<IWorkload> workload = IWorkloadFactory::create(workloadSpec.name, config);
			if (!workload) {
				throw TesterError(fmt::format("Unknown workload '{}'", workloadSpec.name));
			}
			workloadMgr.add(workload);
		}
	}

	// Blocks until all workloads are done
	workloadMgr.run();
	return !workloadMgr.failed();
}
|
||||
|
||||
} // namespace
|
||||
} // namespace FdbApiTester
|
||||
|
||||
using namespace FdbApiTester;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
int retCode = 0;
|
||||
try {
|
||||
TesterOptions options;
|
||||
if (!parseArgs(options, argc, argv)) {
|
||||
return 1;
|
||||
}
|
||||
randomizeOptions(options);
|
||||
|
||||
fdb_check(fdb_select_api_version(options.testSpec.apiVersion));
|
||||
applyNetworkOptions(options);
|
||||
fdb_check(fdb_setup_network());
|
||||
|
||||
std::thread network_thread{ &fdb_run_network };
|
||||
|
||||
if (!runWorkloads(options)) {
|
||||
retCode = 1;
|
||||
}
|
||||
|
||||
fdb_check(fdb_stop_network());
|
||||
network_thread.join();
|
||||
} catch (const std::runtime_error& err) {
|
||||
fmt::print(stderr, "ERROR: {}\n", err.what());
|
||||
retCode = 1;
|
||||
}
|
||||
return retCode;
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# run_c_api_tests.py
|
||||
#
|
||||
# This source file is part of the FoundationDB open source project
|
||||
#
|
||||
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import os
|
||||
from subprocess import Popen, TimeoutExpired
|
||||
import logging
|
||||
import signal
|
||||
|
||||
|
||||
def get_logger():
    """Return the logger used by the C API test runner."""
    logger_name = 'foundationdb.run_c_api_tests'
    return logging.getLogger(logger_name)
|
||||
|
||||
|
||||
def initialize_logger_level(logging_level):
    """Configure the test-runner logger to emit messages at the given level.

    logging_level must be one of 'DEBUG', 'INFO', 'WARNING', 'ERROR'.
    """
    logger = get_logger()

    assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']

    logging.basicConfig(format='%(message)s')
    # The level names above match the logging module's constants exactly,
    # so map the name directly instead of enumerating each level in an
    # if/elif chain.
    logger.setLevel(getattr(logging, logging_level))
||||
|
||||
|
||||
def run_tester(args, test_file):
    """Run the tester binary on a single test file.

    Returns the tester's exit code; returns 1 when the tester times out
    (the process is killed in that case).
    """
    cmd = [args.tester_binary, "--cluster-file",
           args.cluster_file, "--test-file", test_file]
    if args.external_client_library is not None:
        cmd += ["--external-client-library", args.external_client_library]

    get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
    proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
    timed_out = False
    # Default to failure so ret_code is always defined; previously a timeout
    # left ret_code unbound and the check below raised NameError.
    ret_code = 1
    try:
        ret_code = proc.wait(args.timeout)
    except TimeoutExpired:
        proc.kill()
        timed_out = True
    except Exception as e:
        raise Exception('Unable to run tester (%s)' % e)

    if ret_code != 0:
        if ret_code < 0:
            # A negative code means the process was terminated by a signal
            reason = signal.Signals(-ret_code).name
        else:
            reason = 'exit code: %d' % ret_code
        if timed_out:
            reason = 'timed out after %d seconds' % args.timeout
            ret_code = 1
        get_logger().error('\n\'%s\' did not complete successfully (%s)' %
                           (cmd[0], reason))

    get_logger().info('')
    return ret_code
|
||||
|
||||
|
||||
def run_tests(args):
    """Run every .toml test file found in args.test_dir.

    Returns the number of tests that did not complete successfully.
    """
    toml_tests = []
    for entry in os.listdir(args.test_dir):
        if entry.endswith(".toml") and os.path.isfile(os.path.join(args.test_dir, entry)):
            toml_tests.append(entry)

    num_failed = 0
    banner = '========================================================='
    for test_file in toml_tests:
        get_logger().info(banner)
        get_logger().info('Running test %s' % test_file)
        get_logger().info(banner)
        if run_tester(args, os.path.join(args.test_dir, test_file)) != 0:
            num_failed += 1

    return num_failed
|
||||
|
||||
|
||||
def parse_args(argv):
    """Parse command line options for the C API test runner."""
    arg_parser = argparse.ArgumentParser(description='FoundationDB C API Tester')

    arg_parser.add_argument(
        '--cluster-file', type=str, default="fdb.cluster",
        help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
    arg_parser.add_argument(
        '--tester-binary', type=str, default="fdb_c_api_tester",
        help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
    arg_parser.add_argument(
        '--external-client-library', type=str, default=None,
        help='Path to the external client library. (default: None)')
    arg_parser.add_argument(
        '--test-dir', type=str, default="./",
        help='Path to a directory with test definitions. (default: ./)')
    arg_parser.add_argument(
        '--timeout', type=int, default=300,
        help='The timeout in seconds for running each individual test. (default 300)')
    arg_parser.add_argument(
        '--logging-level', type=str, default='INFO',
        choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'],
        help='Specifies the level of detail in the tester output (default=\'INFO\').')

    return arg_parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv):
    """Entry point: parse arguments, set up logging, and run all tests.

    Returns the number of failed tests (0 means success), which the
    caller uses as the process exit code.
    """
    args = parse_args(argv)
    initialize_logger_level(args.logging_level)
    return run_tests(args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv[1:]))
|
|
@ -0,0 +1,24 @@
|
|||
[[test]]
|
||||
title = 'Cancel Transaction with Blocking Waits'
|
||||
multiThreaded = true
|
||||
buggify = true
|
||||
blockOnFutures = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
[[test.workload]]
|
||||
name = 'CancelTransaction'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,23 @@
|
|||
[[test]]
|
||||
title = 'Cancel Transactions with Future Callbacks'
|
||||
multiThreaded = true
|
||||
buggify = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
[[test.workload]]
|
||||
name = 'CancelTransaction'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,24 @@
|
|||
[[test]]
|
||||
title = 'Cancel Transaction with Database per Transaction'
|
||||
multiThreaded = true
|
||||
buggify = true
|
||||
databasePerTransaction = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
[[test.workload]]
|
||||
name = 'CancelTransaction'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,25 @@
|
|||
[[test]]
|
||||
title = 'API Correctness Blocking'
|
||||
multiThreaded = true
|
||||
buggify = true
|
||||
blockOnFutures = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
|
||||
[[test.workload]]
|
||||
name = 'ApiCorrectness'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,24 @@
|
|||
[[test]]
|
||||
title = 'API Correctness Callbacks On External Threads'
|
||||
multiThreaded = true
|
||||
fdbCallbacksOnExternalThreads = true
|
||||
buggify = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
[[test.workload]]
|
||||
name = 'ApiCorrectness'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,24 @@
|
|||
[[test]]
|
||||
title = 'API Correctness Database Per Transaction'
|
||||
multiThreaded = true
|
||||
buggify = true
|
||||
databasePerTransaction = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
[[test.workload]]
|
||||
name = 'ApiCorrectness'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,23 @@
|
|||
[[test]]
|
||||
title = 'API Correctness Multi Threaded'
|
||||
multiThreaded = true
|
||||
buggify = true
|
||||
minFdbThreads = 2
|
||||
maxFdbThreads = 8
|
||||
minDatabases = 2
|
||||
maxDatabases = 8
|
||||
minClientThreads = 2
|
||||
maxClientThreads = 8
|
||||
minClients = 2
|
||||
maxClients = 8
|
||||
|
||||
[[test.workload]]
|
||||
name = 'ApiCorrectness'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -0,0 +1,16 @@
|
|||
[[test]]
|
||||
title = 'API Correctness Single Threaded'
|
||||
minClients = 1
|
||||
maxClients = 3
|
||||
multiThreaded = false
|
||||
|
||||
[[test.workload]]
|
||||
name = 'ApiCorrectness'
|
||||
minKeyLength = 1
|
||||
maxKeyLength = 64
|
||||
minValueLength = 1
|
||||
maxValueLength = 1000
|
||||
maxKeysPerTransaction = 50
|
||||
initialSize = 100
|
||||
numRandomOperations = 100
|
||||
readExistingKeysRatio = 0.9
|
|
@ -149,7 +149,7 @@ Format
|
|||
------
|
||||
| One operation type is defined as ``<Type><Count>`` or ``<Type><Count>:<Range>``.
|
||||
| When Count is omitted, it's equivalent to setting it to 1. (e.g. ``g`` is equivalent to ``g1``)
|
||||
| Multiple operation types within the same trancaction can be concatenated. (e.g. ``g9u1`` = 9 GETs and 1 update)
|
||||
| Multiple operation types within the same transaction can be concatenated. (e.g. ``g9u1`` = 9 GETs and 1 update)
|
||||
|
||||
Transaction Specification Examples
|
||||
----------------------------------
|
||||
|
|
|
@ -161,7 +161,7 @@ struct RangeResultRef : VectorRef<KeyValueRef> {
|
|||
// False implies that no such values remain
|
||||
Optional<KeyRef> readThrough; // Only present when 'more' is true. When present, this value represent the end (or
|
||||
// beginning if reverse) of the range
|
||||
// which was read to produce these results. This is guarenteed to be less than the requested range.
|
||||
// which was read to produce these results. This is guaranteed to be less than the requested range.
|
||||
bool readToBegin;
|
||||
bool readThroughEnd;
|
||||
|
||||
|
|
|
@ -448,16 +448,21 @@ func (o TransactionOptions) SetInitializeNewDatabase() error {
|
|||
return o.setOpt(300, nil)
|
||||
}
|
||||
|
||||
// Allows this transaction to read and modify system keys (those that start with the byte 0xFF)
|
||||
// Allows this transaction to read and modify system keys (those that start with the byte 0xFF). Implies raw_access.
|
||||
func (o TransactionOptions) SetAccessSystemKeys() error {
|
||||
return o.setOpt(301, nil)
|
||||
}
|
||||
|
||||
// Allows this transaction to read system keys (those that start with the byte 0xFF)
|
||||
// Allows this transaction to read system keys (those that start with the byte 0xFF). Implies raw_access.
|
||||
func (o TransactionOptions) SetReadSystemKeys() error {
|
||||
return o.setOpt(302, nil)
|
||||
}
|
||||
|
||||
// Allows this transaction to access the raw key-space when tenant mode is on.
|
||||
func (o TransactionOptions) SetRawAccess() error {
|
||||
return o.setOpt(303, nil)
|
||||
}
|
||||
|
||||
// Not yet implemented.
|
||||
func (o TransactionOptions) SetDebugRetryLogging(param string) error {
|
||||
return o.setOpt(401, []byte(param))
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.nio.ByteBuffer;
|
|||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
/**
|
||||
* Used to represent values written by versionstamp operations with a {@link Tuple}.
|
||||
* This wraps a single array which should contain twelve bytes. The first ten bytes
|
||||
|
@ -37,7 +36,7 @@ import java.util.Arrays;
|
|||
* over time. The final two bytes are the "user" version and should be set by the client.
|
||||
* This allows the user to use this class to impose a total order of items across multiple
|
||||
* transactions in the database in a consistent and conflict-free way. The user can elect to
|
||||
* ignore this parameter by instantiating the class with the paramaterless {@link #incomplete() incomplete()}
|
||||
* ignore this parameter by instantiating the class with the parameterless {@link #incomplete() incomplete()}
|
||||
* and one-parameter {@link #complete(byte[]) complete} static initializers. If they do so,
|
||||
* then versions are written with a default (constant) user version.
|
||||
*
|
||||
|
|
|
@ -129,7 +129,7 @@ function(add_fdb_test)
|
|||
-n ${test_name}
|
||||
-b ${PROJECT_BINARY_DIR}
|
||||
-t ${test_type}
|
||||
-O ${OLD_FDBSERVER_BINARY}
|
||||
-O ${OLD_FDBSERVER_BINARY}
|
||||
--config "@CTEST_CONFIGURATION_TYPE@"
|
||||
--crash
|
||||
--aggregate-traces ${TEST_AGGREGATE_TRACES}
|
||||
|
@ -404,7 +404,7 @@ endfunction()
|
|||
|
||||
# Creates a single cluster before running the specified command (usually a ctest test)
|
||||
function(add_fdbclient_test)
|
||||
set(options DISABLED ENABLED)
|
||||
set(options DISABLED ENABLED DISABLE_LOG_DUMP)
|
||||
set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY)
|
||||
set(multiValueArgs COMMAND)
|
||||
cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
|
||||
|
@ -423,23 +423,20 @@ function(add_fdbclient_test)
|
|||
if(NOT T_COMMAND)
|
||||
message(FATAL_ERROR "COMMAND is a required argument for add_fdbclient_test")
|
||||
endif()
|
||||
message(STATUS "Adding Client test ${T_NAME}")
|
||||
if (T_PROCESS_NUMBER)
|
||||
add_test(NAME "${T_NAME}"
|
||||
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
|
||||
--build-dir ${CMAKE_BINARY_DIR}
|
||||
--process-number ${T_PROCESS_NUMBER}
|
||||
--
|
||||
${T_COMMAND})
|
||||
else()
|
||||
add_test(NAME "${T_NAME}"
|
||||
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
|
||||
--build-dir ${CMAKE_BINARY_DIR}
|
||||
--
|
||||
${T_COMMAND})
|
||||
set(TMP_CLUSTER_CMD ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
|
||||
--build-dir ${CMAKE_BINARY_DIR})
|
||||
if(T_PROCESS_NUMBER)
|
||||
list(APPEND TMP_CLUSTER_CMD --process-number ${T_PROCESS_NUMBER})
|
||||
endif()
|
||||
if(T_DISABLE_LOG_DUMP)
|
||||
list(APPEND TMP_CLUSTER_CMD --disable-log-dump)
|
||||
endif()
|
||||
message(STATUS "Adding Client test ${T_NAME}")
|
||||
add_test(NAME "${T_NAME}"
|
||||
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
|
||||
COMMAND ${Python_EXECUTABLE} ${TMP_CLUSTER_CMD}
|
||||
--
|
||||
${T_COMMAND})
|
||||
if (T_TEST_TIMEOUT)
|
||||
set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT})
|
||||
else()
|
||||
|
@ -449,7 +446,7 @@ function(add_fdbclient_test)
|
|||
set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1)
|
||||
endfunction()
|
||||
|
||||
# Creates a cluster file for a nonexistent cluster before running the specified command
|
||||
# Creates a cluster file for a nonexistent cluster before running the specified command
|
||||
# (usually a ctest test)
|
||||
function(add_unavailable_fdbclient_test)
|
||||
set(options DISABLED ENABLED)
|
||||
|
|
|
@ -41,10 +41,10 @@ def print_stacks(stack_count, sort_by_count):
|
|||
|
||||
sort_dict = counts if sort_by_count else sizes
|
||||
ordered_list = [(val, backtrace) for (backtrace, val) in sort_dict.items()]
|
||||
ordered_list.sort(reverse=True)
|
||||
ordered_list.sort()
|
||||
|
||||
if stack_count:
|
||||
ordered_list = ordered_list[:stack_count]
|
||||
ordered_list = ordered_list[-stack_count:]
|
||||
|
||||
for size, backtrace in ordered_list:
|
||||
print(str.format('bytes={0:<10} count={1:<8} {2}', sizes[backtrace], counts[backtrace], backtrace))
|
||||
|
|
|
@ -192,6 +192,8 @@ class BaseInfo(object):
|
|||
self.start_timestamp = bb.get_double()
|
||||
if protocol_version >= PROTOCOL_VERSION_6_3:
|
||||
self.dc_id = bb.get_bytes_with_length()
|
||||
if protocol_version >= PROTOCOL_VERSION_7_1:
|
||||
self.tenant = bb.get_bytes_with_length()
|
||||
|
||||
class GetVersionInfo(BaseInfo):
|
||||
def __init__(self, bb, protocol_version):
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
.. |database-type| replace:: ``FDBDatabase``
|
||||
.. |database-class| replace:: :type:`FDBDatabase`
|
||||
.. |database-auto| replace:: FIXME
|
||||
.. |tenant-type| replace:: ``FDBTenant``
|
||||
.. |transaction-class| replace:: FIXME
|
||||
.. |get-key-func| replace:: :func:`fdb_transaction_get_key()`
|
||||
.. |get-range-func| replace:: :func:`fdb_transaction_get_range()`
|
||||
|
@ -419,9 +420,20 @@ An |database-blurb1| Modifications to a database are performed via transactions.
|
|||
|
||||
|option-doc|
|
||||
|
||||
.. function:: fdb_error_t fdb_database_open_tenant(FDBDatabase* database, uint8_t const* tenant_name, int tenant_name_length, FDBTenant** out_tenant)
|
||||
|
||||
Opens a tenant on the given database. All transactions created by this tenant will operate on the tenant's key-space. The caller assumes ownership of the :type:`FDBTenant` object and must destroy it with :func:`fdb_tenant_destroy()`.
|
||||
|
||||
``tenant_name``
|
||||
The name of the tenant being accessed, as a byte string.
|
||||
``tenant_name_length``
|
||||
The length of the tenant name byte string.
|
||||
``*out_tenant``
|
||||
Set to point to the newly created :type:`FDBTenant`.
|
||||
|
||||
.. function:: fdb_error_t fdb_database_create_transaction(FDBDatabase* database, FDBTransaction** out_transaction)
|
||||
|
||||
Creates a new transaction on the given database. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
|
||||
Creates a new transaction on the given database without using a tenant, meaning that it will operate on the entire database key-space. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
|
||||
|
||||
``*out_transaction``
|
||||
Set to point to the newly created :type:`FDBTransaction`.
|
||||
|
@ -454,7 +466,7 @@ An |database-blurb1| Modifications to a database are performed via transactions.
|
|||
|
||||
The function will change the region configuration to have a positive priority for the chosen dcId, and a negative priority for all other dcIds.
|
||||
|
||||
In particular, no error will be thrown if the given dcId does not exist. It will just not attemp to force a recovery.
|
||||
In particular, no error will be thrown if the given dcId does not exist. It will just not attempt to force a recovery.
|
||||
|
||||
If the database has already recovered, the function does nothing. Thus it's safe to call it multiple times.
|
||||
|
||||
|
@ -486,6 +498,26 @@ An |database-blurb1| Modifications to a database are performed via transactions.
|
|||
|
||||
Returns a value where 0 indicates that the client is idle and 1 (or larger) indicates that the client is saturated. By default, this value is updated every second.
|
||||
|
||||
Tenant
|
||||
======
|
||||
|
||||
|tenant-blurb1|
|
||||
|
||||
.. type:: FDBTenant
|
||||
|
||||
An opaque type that represents a tenant in the FoundationDB C API.
|
||||
|
||||
.. function:: void fdb_tenant_destroy(FDBTenant* tenant)
|
||||
|
||||
Destroys an :type:`FDBTenant` object. It must be called exactly once for each successful call to :func:`fdb_database_create_tenant()`. This function only destroys a handle to the tenant -- the tenant and its data will be fine!
|
||||
|
||||
.. function:: fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTronsaction **out_transaction)
|
||||
|
||||
Creates a new transaction on the given tenant. This transaction will operate within the tenant's key-space and cannot access data outside the tenant. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
|
||||
|
||||
``*out_transaction``
|
||||
Set to point to the newly created :type:`FDBTransaction`.
|
||||
|
||||
Transaction
|
||||
===========
|
||||
|
||||
|
|
|
@ -74,6 +74,9 @@
|
|||
.. |database-sync| replace::
|
||||
The convenience methods provided by |database-type| have the same signature as the corresponding methods of ``Transaction``. However, most of the |database-type| methods are fully synchronous. (An exception is the methods for watches.) As a result, the |database-type| methods do not support the use of :ref:`implicit parallelism with futures <developer-guide-programming-with-futures>`.
|
||||
|
||||
.. |tenant-blurb1| replace::
|
||||
|tenant-type| represents a FoundationDB tenant. Tenants are optional named transaction domains that can be used to provide multiple disjoint key-spaces to client applications. A transaction created in a tenant will be limited to the keys contained within that tenant, and transactions operating on different tenants can use the same key names without interfering with each other.
|
||||
|
||||
.. |keysel-blurb1| replace::
|
||||
FoundationDB's lexicographically ordered data model permits finding keys based on their order (for example, finding the first key in the database greater than a given key). Key selectors represent a description of a key in the database that could be resolved to an actual key by |get-key-func| or used directly as the beginning or end of a range in |get-range-func|.
|
||||
|
||||
|
@ -627,4 +630,4 @@
|
|||
|
||||
.. |option-set-distributed-client-tracer| replace::
|
||||
|
||||
Sets a tracer to run on the client. Should be set to the same value as the tracer set on the server.
|
||||
Sets a tracer to run on the client. Should be set to the same value as the tracer set on the server.
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
.. |database-type| replace:: ``Database``
|
||||
.. |database-class| replace:: :class:`Database`
|
||||
.. |database-auto| replace:: the :func:`@fdb.transactional <transactional>` decorator
|
||||
.. |tenant-type| replace:: FIXME
|
||||
.. |transaction-class| replace:: :class:`Transaction`
|
||||
.. |get-key-func| replace:: :func:`Transaction.get_key`
|
||||
.. |get-range-func| replace:: :func:`Transaction.get_range`
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
.. |database-type| replace:: ``Database``
|
||||
.. |database-class| replace:: :class:`Database`
|
||||
.. |database-auto| replace:: :meth:`Database.transact`
|
||||
.. |tenant-type| replace:: FIXME
|
||||
.. |transaction-class| replace:: :class:`Transaction`
|
||||
.. |get-key-func| replace:: :meth:`Transaction.get_key`
|
||||
.. |get-range-func| replace:: :meth:`Transaction.get_range`
|
||||
|
|
|
@ -115,7 +115,7 @@ Here is a complete list of valid parameters:
|
|||
|
||||
*request_timeout_min* (or *rtom*) - Minimum number of seconds to wait for a request to succeed after a connection is established.
|
||||
|
||||
*request_tries* (or *rt*) - Number of times to try each request until a parseable HTTP response other than 429 is received.
|
||||
*request_tries* (or *rt*) - Number of times to try each request until a parsable HTTP response other than 429 is received.
|
||||
|
||||
*requests_per_second* (or *rps*) - Max number of requests to start per second.
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ Testing Error Handling with Buggify
|
|||
|
||||
FoundationDB clients need to handle errors correctly. Wrong error handling can lead to many bugs - in the worst case it can
|
||||
lead to a corrupted database. Because of this it is important that an application or layer author tests properly their
|
||||
application during failure scenarios. But this is non-trivial. In a developement environment cluster failures are very
|
||||
application during failure scenarios. But this is non-trivial. In a development environment cluster failures are very
|
||||
unlikely and it is therefore possible that certain types of exceptions are never tested in a controlled environment.
|
||||
|
||||
The simplest way of testing for these kind of errors is a simple mechanism called ``Buggify``. If this option is enabled
|
||||
|
@ -327,7 +327,7 @@ processes with the class test. So above 2-step process becomes a bit more comple
|
|||
|
||||
1. Write the test (same as above).
|
||||
2. Set up a cluster with as many test clients as you want.
|
||||
3. Run the orchestor to actually execute the test.
|
||||
3. Run the orchestrator to actually execute the test.
|
||||
|
||||
Step 1. is explained further up. For step 2., please refer to the general FoundationDB
|
||||
configuration. The main difference to a normal FoundationDB cluster is that some processes
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
.. |database-type| replace:: ``Database``
|
||||
.. |database-class| replace:: ``Database``
|
||||
.. |database-auto| replace:: FIXME
|
||||
.. |tenant-type| replace:: FIXME
|
||||
.. |transaction-class| replace:: ``Transaction``
|
||||
.. |get-key-func| replace:: get_key()
|
||||
.. |get-range-func| replace:: get_range()
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
.. |database-type| replace:: ``Database``
|
||||
.. |database-class| replace:: ``Database``
|
||||
.. |database-auto| replace:: FIXME
|
||||
.. |tenant-type| replace:: FIXME
|
||||
.. |transaction-class| replace:: ``Transaction``
|
||||
.. |get-key-func| replace:: get_key()
|
||||
.. |get-range-func| replace:: get_range()
|
||||
|
@ -915,7 +916,7 @@ When using FoundationDB we strongly recommend users to use the retry-loop. In Py
|
|||
except FDBError as e:
|
||||
tr.on_error(e.code).wait()
|
||||
|
||||
This is also what the transaction decoration in python does, if you pass a ``Database`` object to a decorated function. There are some interesting properies of this retry loop:
|
||||
This is also what the transaction decoration in python does, if you pass a ``Database`` object to a decorated function. There are some interesting properties of this retry loop:
|
||||
|
||||
* We never create a new transaction within that loop. Instead ``tr.on_error`` will create a soft reset on the transaction.
|
||||
* ``tr.on_error`` returns a future. This is because ``on_error`` will do back off to make sure we don't overwhelm the cluster.
|
||||
|
|
|
@ -121,8 +121,8 @@ Aggregate stats about cluster health. Reading this key alone is slightly cheaper
|
|||
**Field** **Type** **Description**
|
||||
----------------------------------- -------- ---------------
|
||||
batch_limited boolean Whether or not the cluster is limiting batch priority transactions
|
||||
limiting_storage_durability_lag number storage_durability_lag that ratekeeper is using to determing throttling (see the description for storage_durability_lag)
|
||||
limiting_storage_queue number storage_queue that ratekeeper is using to determing throttling (see the description for storage_queue)
|
||||
limiting_storage_durability_lag number storage_durability_lag that ratekeeper is using to determine throttling (see the description for storage_durability_lag)
|
||||
limiting_storage_queue number storage_queue that ratekeeper is using to determine throttling (see the description for storage_queue)
|
||||
tps_limit number The rate at which normal priority transactions are allowed to start
|
||||
worst_storage_durability_lag number See the description for storage_durability_lag
|
||||
worst_storage_queue number See the description for storage_queue
|
||||
|
@ -205,6 +205,7 @@ that process, and wait for necessary data to be moved away.
|
|||
#. ``\xff\xff/management/failed_locality/<locality>`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster <removing-machines-from-a-cluster>` for documentation for the corresponding fdbcli command.
|
||||
#. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/<locality>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
|
||||
#. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/<locality>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
|
||||
#. ``\xff\xff/management/tenant_map/<tenant>`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``<tenant>``. Clearing a key in this range will delete the tenant with name ``<tenant>``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants.
|
||||
|
||||
An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or
|
||||
an ip address and port (e.g. ``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or
|
||||
|
|
|
@ -13,7 +13,7 @@ This document covers the operation and architecture of the Testing Storage Serve
|
|||
Summary
|
||||
============
|
||||
|
||||
The TSS feature allows FoundationDB to run an "untrusted" storage engine (the *testing storage engine*) directly in a QA or production envronment with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance.
|
||||
The TSS feature allows FoundationDB to run an "untrusted" storage engine (the *testing storage engine*) directly in a QA or production environment with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance.
|
||||
|
||||
This allows a FoundationDB cluster operator to validate the correctness and performance of a different storage engine on the exact cluster workload before migrating data to the different storage engine.
|
||||
|
||||
|
@ -44,10 +44,10 @@ The ``status`` command in the FDB :ref:`command line interface <command-line-int
|
|||
|
||||
Trace Events
|
||||
----------------------
|
||||
Whenever a client detects a *TSS Mismatch*, or when the SS and TSS response differ, and the difference can only be explained by different storage engine contents, it will emit an error-level trace event with a type starting with ``TSSMismatch``, with a different type for each read request. This trace event will include all of the information necessary to investgate the mismatch, such as the TSS storage ID, the full request data, and the summarized replies (full keys and checksummed values) from both the SS and TSS.
|
||||
Whenever a client detects a *TSS Mismatch*, or when the SS and TSS response differ, and the difference can only be explained by different storage engine contents, it will emit an error-level trace event with a type starting with ``TSSMismatch``, with a different type for each read request. This trace event will include all of the information necessary to investigate the mismatch, such as the TSS storage ID, the full request data, and the summarized replies (full keys and checksummed values) from both the SS and TSS.
|
||||
|
||||
Each client emits a ``TSSClientMetrics`` trace event for each TSS pair in the cluster that it has sent requests to recently, similar to the ``TransactionMetrics`` trace event.
|
||||
It contains the TSS storage ID, and latency statistics for each type of read request. It also includes a count of any mismatches, and a histogram of error codes recieved by the SS and TSS to ensure the storage engines have similar error rates and types.
|
||||
It contains the TSS storage ID, and latency statistics for each type of read request. It also includes a count of any mismatches, and a histogram of error codes received by the SS and TSS to ensure the storage engines have similar error rates and types.
|
||||
|
||||
The ``StorageMetrics`` trace event emitted by storage servers includes the storage ID of its pair if part of a TSS pairing, and includes a ``TSSJointID`` detail with a unique id for the SS/TSS pair that enables correlating the separate StorageMetrics events from the SS and TSS.
|
||||
|
||||
|
@ -101,7 +101,7 @@ The pair recruitment logic is as follows:
|
|||
|
||||
* Once DD gets a candidate worker from the Cluster Controller, hold that worker as a desired TSS process.
|
||||
* Once DD gets a second candidate worker from the Cluster Controller, initialize that worker as a normal SS.
|
||||
* Once the second candidate worker is successfully initialized, initialize the first candidate worker as a TSS, passing it the storage ID, starting tag + version, and other information from its SS pair. Because the TSS reads from the same tag starting at the same version, it is guaranteed to recieve the same mutations and data movements as its pair.
|
||||
* Once the second candidate worker is successfully initialized, initialize the first candidate worker as a TSS, passing it the storage ID, starting tag + version, and other information from its SS pair. Because the TSS reads from the same tag starting at the same version, it is guaranteed to receive the same mutations and data movements as its pair.
|
||||
|
||||
One implication of this is, during TSS recruitment, the cluster is effectively down one storage process until a second storage process becomes available.
|
||||
While clusters should be able to handle being down a single storage process anyway to tolerate machine failure, an active TSS recruitment will be cancelled if the lack of that single storage process is causing the cluster to be unhealthy. Similarly, if the cluster is unhealthy and unable to find new teams to replicate data to, any existing TSS processes may be killed to make room for new storage servers.
|
||||
|
@ -121,4 +121,4 @@ Because it is only enabled on a small percentage of the cluster and only compare
|
|||
|
||||
TSS testing using the recommended small number of TSS pairs may also miss performance pathologies from workloads not experienced by the specific storage teams with TSS pairs in their membership.
|
||||
|
||||
TSS testing is not a substitute for full-cluster performance and correctness testing or simulation testing.
|
||||
TSS testing is not a substitute for full-cluster performance and correctness testing or simulation testing.
|
||||
|
|
|
@ -1661,7 +1661,7 @@ ACTOR Future<std::string> getLayerStatus(Reference<ReadYourWritesTransaction> tr
|
|||
return json;
|
||||
}
|
||||
|
||||
// Check for unparseable or expired statuses and delete them.
|
||||
// Check for unparsable or expired statuses and delete them.
|
||||
// First checks the first doc in the key range, and if it is valid, alive and not "me" then
|
||||
// returns. Otherwise, checks the rest of the range as well.
|
||||
ACTOR Future<Void> cleanupStatus(Reference<ReadYourWritesTransaction> tr,
|
||||
|
|
|
@ -1201,7 +1201,7 @@ void printStatus(StatusObjectReader statusObj,
|
|||
|
||||
// "db" is the handler to the multiversion database
|
||||
// localDb is the native Database object
|
||||
// localDb is rarely needed except the "db" has not establised a connection to the cluster where the operation will
|
||||
// localDb is rarely needed except the "db" has not established a connection to the cluster where the operation will
|
||||
// return Never as we expect status command to always return, we use "localDb" to return the default result
|
||||
ACTOR Future<bool> statusCommandActor(Reference<IDatabase> db,
|
||||
Database localDb,
|
||||
|
@ -1255,4 +1255,4 @@ CommandFactory statusFactory(
|
|||
"statistics.\n\nSpecifying `minimal' will provide a minimal description of the status of your "
|
||||
"database.\n\nSpecifying `details' will provide load information for individual "
|
||||
"workers.\n\nSpecifying `json' will provide status information in a machine readable JSON format."));
|
||||
} // namespace fdb_cli
|
||||
} // namespace fdb_cli
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
#include "fdbclient/BlobWorkerInterface.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
// TODO more efficient data structure besides std::map? PTree is unecessary since this isn't versioned, but some other
|
||||
// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other
|
||||
// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything
|
||||
// else is in 1 arena and discarded at the end.
|
||||
|
||||
|
|
|
@ -128,6 +128,7 @@ set(FDBCLIENT_SRCS
|
|||
StatusClient.h
|
||||
StorageServerInterface.cpp
|
||||
StorageServerInterface.h
|
||||
StorageCheckpoint.h
|
||||
Subspace.cpp
|
||||
Subspace.h
|
||||
StackLineage.h
|
||||
|
|
|
@ -61,6 +61,7 @@ void ClientKnobs::initialize(Randomize randomize) {
|
|||
|
||||
init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
|
||||
init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
|
||||
init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
|
||||
init( REPLY_BYTE_LIMIT, 80000 );
|
||||
init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();
|
||||
init( DEFAULT_MAX_BACKOFF, 1.0 );
|
||||
|
@ -89,6 +90,8 @@ void ClientKnobs::initialize(Randomize randomize) {
|
|||
init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3;
|
||||
init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 );
|
||||
init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 );
|
||||
init( TENANT_CACHE_EVICTION_SIZE, 100000 );
|
||||
init( TENANT_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_EVICTION_SIZE_SIM = 3;
|
||||
|
||||
init( GET_RANGE_SHARD_LIMIT, 2 );
|
||||
init( WARM_RANGE_SHARD_LIMIT, 100 );
|
||||
|
|
|
@ -60,6 +60,7 @@ public:
|
|||
double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is
|
||||
// mostly wrong (e.g. dumping the database after a test)
|
||||
double FUTURE_VERSION_RETRY_DELAY;
|
||||
double UNKNOWN_TENANT_RETRY_DELAY;
|
||||
int REPLY_BYTE_LIMIT;
|
||||
double DEFAULT_BACKOFF;
|
||||
double DEFAULT_MAX_BACKOFF;
|
||||
|
@ -89,6 +90,8 @@ public:
|
|||
int LOCATION_CACHE_EVICTION_SIZE_SIM;
|
||||
double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD;
|
||||
double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL;
|
||||
int TENANT_CACHE_EVICTION_SIZE;
|
||||
int TENANT_CACHE_EVICTION_SIZE_SIM;
|
||||
|
||||
int GET_RANGE_SHARD_LIMIT;
|
||||
int WARM_RANGE_SHARD_LIMIT;
|
||||
|
|
|
@ -41,7 +41,8 @@ enum class TransactionPriorityType : int { PRIORITY_DEFAULT = 0, PRIORITY_BATCH
|
|||
static_assert(sizeof(TransactionPriorityType) == 4, "transaction_profiling_analyzer.py assumes this field has size 4");
|
||||
|
||||
struct Event {
|
||||
Event(EventType t, double ts, const Optional<Standalone<StringRef>>& dc) : type(t), startTs(ts) {
|
||||
Event(EventType t, double ts, const Optional<Standalone<StringRef>>& dc, const Optional<TenantName>& tenant)
|
||||
: type(t), startTs(ts), tenant(tenant) {
|
||||
if (dc.present())
|
||||
dcId = dc.get();
|
||||
}
|
||||
|
@ -49,7 +50,9 @@ struct Event {
|
|||
|
||||
template <typename Ar>
|
||||
Ar& serialize(Ar& ar) {
|
||||
if (ar.protocolVersion().version() >= (uint64_t)0x0FDB00B063010001LL) {
|
||||
if (ar.protocolVersion().hasTenants()) {
|
||||
return serializer(ar, type, startTs, dcId, tenant);
|
||||
} else if (ar.protocolVersion().version() >= (uint64_t)0x0FDB00B063010001LL) {
|
||||
return serializer(ar, type, startTs, dcId);
|
||||
} else {
|
||||
return serializer(ar, type, startTs);
|
||||
|
@ -59,8 +62,10 @@ struct Event {
|
|||
EventType type{ EventType::UNSET };
|
||||
double startTs{ 0 };
|
||||
Key dcId{};
|
||||
Optional<TenantName> tenant{};
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {}
|
||||
void augmentTraceEvent(TraceEvent& event) const { event.detail("Tenant", tenant); }
|
||||
};
|
||||
|
||||
struct EventGetVersion : public Event {
|
||||
|
@ -77,7 +82,9 @@ struct EventGetVersion : public Event {
|
|||
double latency;
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_GetVersion").detail("TransactionID", id).detail("Latency", latency);
|
||||
TraceEvent event("TransactionTrace_GetVersion");
|
||||
event.detail("TransactionID", id).detail("Latency", latency);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -97,10 +104,9 @@ struct EventGetVersion_V2 : public Event {
|
|||
TransactionPriorityType priorityType{ TransactionPriorityType::UNSET };
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_GetVersion")
|
||||
.detail("TransactionID", id)
|
||||
.detail("Latency", latency)
|
||||
.detail("PriorityType", priorityType);
|
||||
TraceEvent event("TransactionTrace_GetVersion");
|
||||
event.detail("TransactionID", id).detail("Latency", latency).detail("PriorityType", priorityType);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -110,8 +116,9 @@ struct EventGetVersion_V3 : public Event {
|
|||
const Optional<Standalone<StringRef>>& dcId,
|
||||
double lat,
|
||||
TransactionPriority priority,
|
||||
Version version)
|
||||
: Event(EventType::GET_VERSION_LATENCY, ts, dcId), latency(lat), readVersion(version) {
|
||||
Version version,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::GET_VERSION_LATENCY, ts, dcId, tenant), latency(lat), readVersion(version) {
|
||||
switch (priority) {
|
||||
// Unfortunately, the enum serialized here disagrees with the enum used elsewhere for the values used by each
|
||||
// priority
|
||||
|
@ -143,17 +150,23 @@ struct EventGetVersion_V3 : public Event {
|
|||
Version readVersion;
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_GetVersion")
|
||||
.detail("TransactionID", id)
|
||||
TraceEvent event("TransactionTrace_GetVersion");
|
||||
event.detail("TransactionID", id)
|
||||
.detail("Latency", latency)
|
||||
.detail("PriorityType", priorityType)
|
||||
.detail("ReadVersion", readVersion);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
struct EventGet : public Event {
|
||||
EventGet(double ts, const Optional<Standalone<StringRef>>& dcId, double lat, int size, const KeyRef& in_key)
|
||||
: Event(EventType::GET_LATENCY, ts, dcId), latency(lat), valueSize(size), key(in_key) {}
|
||||
EventGet(double ts,
|
||||
const Optional<Standalone<StringRef>>& dcId,
|
||||
double lat,
|
||||
int size,
|
||||
const KeyRef& in_key,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::GET_LATENCY, ts, dcId, tenant), latency(lat), valueSize(size), key(in_key) {}
|
||||
EventGet() {}
|
||||
|
||||
template <typename Ar>
|
||||
|
@ -169,13 +182,14 @@ struct EventGet : public Event {
|
|||
Key key;
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_Get")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent event("TransactionTrace_Get");
|
||||
event.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.detail("Latency", latency)
|
||||
.detail("ValueSizeBytes", valueSize)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Key", key);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -185,8 +199,9 @@ struct EventGetRange : public Event {
|
|||
double lat,
|
||||
int size,
|
||||
const KeyRef& start_key,
|
||||
const KeyRef& end_key)
|
||||
: Event(EventType::GET_RANGE_LATENCY, ts, dcId), latency(lat), rangeSize(size), startKey(start_key),
|
||||
const KeyRef& end_key,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::GET_RANGE_LATENCY, ts, dcId, tenant), latency(lat), rangeSize(size), startKey(start_key),
|
||||
endKey(end_key) {}
|
||||
EventGetRange() {}
|
||||
|
||||
|
@ -204,14 +219,15 @@ struct EventGetRange : public Event {
|
|||
Key endKey;
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_GetRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent event("TransactionTrace_GetRange");
|
||||
event.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.detail("Latency", latency)
|
||||
.detail("RangeSizeBytes", rangeSize)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("StartKey", startKey)
|
||||
.detail("EndKey", endKey);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -234,36 +250,40 @@ struct EventCommit : public Event {
|
|||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
for (auto& read_range : req.transaction.read_conflict_ranges) {
|
||||
TraceEvent("TransactionTrace_Commit_ReadConflictRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev1("TransactionTrace_Commit_ReadConflictRange");
|
||||
ev1.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Begin", read_range.begin)
|
||||
.detail("End", read_range.end);
|
||||
augmentTraceEvent(ev1);
|
||||
}
|
||||
|
||||
for (auto& write_range : req.transaction.write_conflict_ranges) {
|
||||
TraceEvent("TransactionTrace_Commit_WriteConflictRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev2("TransactionTrace_Commit_WriteConflictRange");
|
||||
ev2.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Begin", write_range.begin)
|
||||
.detail("End", write_range.end);
|
||||
augmentTraceEvent(ev2);
|
||||
}
|
||||
|
||||
for (auto& mutation : req.transaction.mutations) {
|
||||
TraceEvent("TransactionTrace_Commit_Mutation")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev3("TransactionTrace_Commit_Mutation");
|
||||
ev3.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Mutation", mutation);
|
||||
augmentTraceEvent(ev3);
|
||||
}
|
||||
|
||||
TraceEvent("TransactionTrace_Commit")
|
||||
.detail("TransactionID", id)
|
||||
TraceEvent ev4("TransactionTrace_Commit");
|
||||
ev4.detail("TransactionID", id)
|
||||
.detail("Latency", latency)
|
||||
.detail("NumMutations", numMutations)
|
||||
.detail("CommitSizeBytes", commitBytes);
|
||||
augmentTraceEvent(ev4);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -275,8 +295,9 @@ struct EventCommit_V2 : public Event {
|
|||
int mut,
|
||||
int bytes,
|
||||
Version version,
|
||||
const CommitTransactionRequest& commit_req)
|
||||
: Event(EventType::COMMIT_LATENCY, ts, dcId), latency(lat), numMutations(mut), commitBytes(bytes),
|
||||
const CommitTransactionRequest& commit_req,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::COMMIT_LATENCY, ts, dcId, tenant), latency(lat), numMutations(mut), commitBytes(bytes),
|
||||
commitVersion(version), req(commit_req) {}
|
||||
EventCommit_V2() {}
|
||||
|
||||
|
@ -298,43 +319,51 @@ struct EventCommit_V2 : public Event {
|
|||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
for (auto& read_range : req.transaction.read_conflict_ranges) {
|
||||
TraceEvent("TransactionTrace_Commit_ReadConflictRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev1("TransactionTrace_Commit_ReadConflictRange");
|
||||
ev1.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Begin", read_range.begin)
|
||||
.detail("End", read_range.end);
|
||||
augmentTraceEvent(ev1);
|
||||
}
|
||||
|
||||
for (auto& write_range : req.transaction.write_conflict_ranges) {
|
||||
TraceEvent("TransactionTrace_Commit_WriteConflictRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev2("TransactionTrace_Commit_WriteConflictRange");
|
||||
ev2.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Begin", write_range.begin)
|
||||
.detail("End", write_range.end);
|
||||
augmentTraceEvent(ev2);
|
||||
}
|
||||
|
||||
for (auto& mutation : req.transaction.mutations) {
|
||||
TraceEvent("TransactionTrace_Commit_Mutation")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev3("TransactionTrace_Commit_Mutation");
|
||||
ev3.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Mutation", mutation);
|
||||
augmentTraceEvent(ev3);
|
||||
}
|
||||
|
||||
TraceEvent("TransactionTrace_Commit")
|
||||
.detail("TransactionID", id)
|
||||
TraceEvent ev4("TransactionTrace_Commit");
|
||||
ev4.detail("TransactionID", id)
|
||||
.detail("CommitVersion", commitVersion)
|
||||
.detail("Latency", latency)
|
||||
.detail("NumMutations", numMutations)
|
||||
.detail("CommitSizeBytes", commitBytes);
|
||||
augmentTraceEvent(ev4);
|
||||
}
|
||||
};
|
||||
|
||||
struct EventGetError : public Event {
|
||||
EventGetError(double ts, const Optional<Standalone<StringRef>>& dcId, int err_code, const KeyRef& in_key)
|
||||
: Event(EventType::ERROR_GET, ts, dcId), errCode(err_code), key(in_key) {}
|
||||
EventGetError(double ts,
|
||||
const Optional<Standalone<StringRef>>& dcId,
|
||||
int err_code,
|
||||
const KeyRef& in_key,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::ERROR_GET, ts, dcId, tenant), errCode(err_code), key(in_key) {}
|
||||
EventGetError() {}
|
||||
|
||||
template <typename Ar>
|
||||
|
@ -349,12 +378,13 @@ struct EventGetError : public Event {
|
|||
Key key;
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_GetError")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent event("TransactionTrace_GetError");
|
||||
event.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.detail("ErrCode", errCode)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Key", key);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -363,8 +393,9 @@ struct EventGetRangeError : public Event {
|
|||
const Optional<Standalone<StringRef>>& dcId,
|
||||
int err_code,
|
||||
const KeyRef& start_key,
|
||||
const KeyRef& end_key)
|
||||
: Event(EventType::ERROR_GET_RANGE, ts, dcId), errCode(err_code), startKey(start_key), endKey(end_key) {}
|
||||
const KeyRef& end_key,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::ERROR_GET_RANGE, ts, dcId, tenant), errCode(err_code), startKey(start_key), endKey(end_key) {}
|
||||
EventGetRangeError() {}
|
||||
|
||||
template <typename Ar>
|
||||
|
@ -380,13 +411,14 @@ struct EventGetRangeError : public Event {
|
|||
Key endKey;
|
||||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
TraceEvent("TransactionTrace_GetRangeError")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent event("TransactionTrace_GetRangeError");
|
||||
event.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.detail("ErrCode", errCode)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("StartKey", startKey)
|
||||
.detail("EndKey", endKey);
|
||||
augmentTraceEvent(event);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -394,8 +426,9 @@ struct EventCommitError : public Event {
|
|||
EventCommitError(double ts,
|
||||
const Optional<Standalone<StringRef>>& dcId,
|
||||
int err_code,
|
||||
const CommitTransactionRequest& commit_req)
|
||||
: Event(EventType::ERROR_COMMIT, ts, dcId), errCode(err_code), req(commit_req) {}
|
||||
const CommitTransactionRequest& commit_req,
|
||||
const Optional<TenantName>& tenant)
|
||||
: Event(EventType::ERROR_COMMIT, ts, dcId, tenant), errCode(err_code), req(commit_req) {}
|
||||
EventCommitError() {}
|
||||
|
||||
template <typename Ar>
|
||||
|
@ -412,32 +445,37 @@ struct EventCommitError : public Event {
|
|||
|
||||
void logEvent(std::string id, int maxFieldLength) const {
|
||||
for (auto& read_range : req.transaction.read_conflict_ranges) {
|
||||
TraceEvent("TransactionTrace_CommitError_ReadConflictRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev1("TransactionTrace_CommitError_ReadConflictRange");
|
||||
ev1.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Begin", read_range.begin)
|
||||
.detail("End", read_range.end);
|
||||
augmentTraceEvent(ev1);
|
||||
}
|
||||
|
||||
for (auto& write_range : req.transaction.write_conflict_ranges) {
|
||||
TraceEvent("TransactionTrace_CommitError_WriteConflictRange")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev2("TransactionTrace_CommitError_WriteConflictRange");
|
||||
ev2.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Begin", write_range.begin)
|
||||
.detail("End", write_range.end);
|
||||
augmentTraceEvent(ev2);
|
||||
}
|
||||
|
||||
for (auto& mutation : req.transaction.mutations) {
|
||||
TraceEvent("TransactionTrace_CommitError_Mutation")
|
||||
.setMaxEventLength(-1)
|
||||
TraceEvent ev3("TransactionTrace_CommitError_Mutation");
|
||||
ev3.setMaxEventLength(-1)
|
||||
.detail("TransactionID", id)
|
||||
.setMaxFieldLength(maxFieldLength)
|
||||
.detail("Mutation", mutation);
|
||||
augmentTraceEvent(ev3);
|
||||
}
|
||||
|
||||
TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode);
|
||||
TraceEvent ev4("TransactionTrace_CommitError");
|
||||
ev4.detail("TransactionID", id).detail("ErrCode", errCode);
|
||||
augmentTraceEvent(ev4);
|
||||
}
|
||||
};
|
||||
} // namespace FdbClientLogEvents
|
||||
|
|
|
@ -308,7 +308,7 @@ struct SplitShardReply {
|
|||
};
|
||||
|
||||
// Split keyrange [shard.begin, shard.end) into num shards.
|
||||
// Split points are chosen as the arithmeticlly equal division points of the given range.
|
||||
// Split points are chosen as the arithmetically equal division points of the given range.
|
||||
struct SplitShardRequest {
|
||||
constexpr static FileIdentifier file_identifier = 1384443;
|
||||
KeyRange shard;
|
||||
|
|
|
@ -171,9 +171,8 @@ struct CommitTransactionRequest : TimedRequest {
|
|||
|
||||
TenantInfo tenantInfo;
|
||||
|
||||
CommitTransactionRequest() : CommitTransactionRequest(TenantInfo(), SpanID()) {}
|
||||
CommitTransactionRequest(TenantInfo const& tenantInfo, SpanID const& context)
|
||||
: spanContext(context), flags(0), tenantInfo(tenantInfo) {}
|
||||
CommitTransactionRequest() : CommitTransactionRequest(SpanID()) {}
|
||||
CommitTransactionRequest(SpanID const& context) : spanContext(context), flags(0) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
|
|
|
@ -133,6 +133,7 @@ public:
|
|||
};
|
||||
|
||||
struct WatchParameters : public ReferenceCounted<WatchParameters> {
|
||||
const TenantInfo tenant;
|
||||
const Key key;
|
||||
const Optional<Value> value;
|
||||
|
||||
|
@ -143,7 +144,8 @@ struct WatchParameters : public ReferenceCounted<WatchParameters> {
|
|||
const Optional<UID> debugID;
|
||||
const UseProvisionalProxies useProvisionalProxies;
|
||||
|
||||
WatchParameters(Key key,
|
||||
WatchParameters(TenantInfo tenant,
|
||||
Key key,
|
||||
Optional<Value> value,
|
||||
Version version,
|
||||
TagSet tags,
|
||||
|
@ -151,8 +153,8 @@ struct WatchParameters : public ReferenceCounted<WatchParameters> {
|
|||
TaskPriority taskID,
|
||||
Optional<UID> debugID,
|
||||
UseProvisionalProxies useProvisionalProxies)
|
||||
: key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID), debugID(debugID),
|
||||
useProvisionalProxies(useProvisionalProxies) {}
|
||||
: tenant(tenant), key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID),
|
||||
debugID(debugID), useProvisionalProxies(useProvisionalProxies) {}
|
||||
};
|
||||
|
||||
class WatchMetadata : public ReferenceCounted<WatchMetadata> {
|
||||
|
@ -203,6 +205,16 @@ struct EndpointFailureInfo {
|
|||
double lastRefreshTime = 0;
|
||||
};
|
||||
|
||||
struct KeyRangeLocationInfo {
|
||||
TenantMapEntry tenantEntry;
|
||||
KeyRange range;
|
||||
Reference<LocationInfo> locations;
|
||||
|
||||
KeyRangeLocationInfo() {}
|
||||
KeyRangeLocationInfo(TenantMapEntry tenantEntry, KeyRange range, Reference<LocationInfo> locations)
|
||||
: tenantEntry(tenantEntry), range(range), locations(locations) {}
|
||||
};
|
||||
|
||||
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
|
||||
public:
|
||||
static DatabaseContext* allocateOnForeignThread() {
|
||||
|
@ -237,14 +249,22 @@ public:
|
|||
switchable));
|
||||
}
|
||||
|
||||
std::pair<KeyRange, Reference<LocationInfo>> getCachedLocation(const KeyRef&, Reverse isBackward = Reverse::False);
|
||||
bool getCachedLocations(const KeyRangeRef&,
|
||||
std::vector<std::pair<KeyRange, Reference<LocationInfo>>>&,
|
||||
Optional<KeyRangeLocationInfo> getCachedLocation(const Optional<TenantName>& tenant,
|
||||
const KeyRef&,
|
||||
Reverse isBackward = Reverse::False);
|
||||
bool getCachedLocations(const Optional<TenantName>& tenant,
|
||||
const KeyRangeRef&,
|
||||
std::vector<KeyRangeLocationInfo>&,
|
||||
int limit,
|
||||
Reverse reverse);
|
||||
Reference<LocationInfo> setCachedLocation(const KeyRangeRef&, const std::vector<struct StorageServerInterface>&);
|
||||
void invalidateCache(const KeyRef&, Reverse isBackward = Reverse::False);
|
||||
void invalidateCache(const KeyRangeRef&);
|
||||
void cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry);
|
||||
Reference<LocationInfo> setCachedLocation(const Optional<TenantName>& tenant,
|
||||
const TenantMapEntry& tenantEntry,
|
||||
const KeyRangeRef&,
|
||||
const std::vector<struct StorageServerInterface>&);
|
||||
void invalidateCachedTenant(const TenantNameRef& tenant);
|
||||
void invalidateCache(const KeyRef& tenantPrefix, const KeyRef& key, Reverse isBackward = Reverse::False);
|
||||
void invalidateCache(const KeyRef& tenantPrefix, const KeyRangeRef& keys);
|
||||
|
||||
// Records that `endpoint` is failed on a healthy server.
|
||||
void setFailedEndpointOnHealthyServer(const Endpoint& endpoint);
|
||||
|
@ -287,9 +307,9 @@ public:
|
|||
void removeWatch();
|
||||
|
||||
// watch map operations
|
||||
Reference<WatchMetadata> getWatchMetadata(KeyRef key) const;
|
||||
Key setWatchMetadata(Reference<WatchMetadata> metadata);
|
||||
void deleteWatchMetadata(KeyRef key);
|
||||
Reference<WatchMetadata> getWatchMetadata(int64_t tenantId, KeyRef key) const;
|
||||
void setWatchMetadata(Reference<WatchMetadata> metadata);
|
||||
void deleteWatchMetadata(int64_t tenant, KeyRef key);
|
||||
void clearWatchMetadata();
|
||||
|
||||
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value);
|
||||
|
@ -407,8 +427,10 @@ public:
|
|||
|
||||
// Cache of location information
|
||||
int locationCacheSize;
|
||||
int tenantCacheSize;
|
||||
CoalescedKeyRangeMap<Reference<LocationInfo>> locationCache;
|
||||
std::unordered_map<Endpoint, EndpointFailureInfo> failedEndpointsOnHealthyServersInfo;
|
||||
std::unordered_map<TenantName, TenantMapEntry> tenantCache;
|
||||
|
||||
std::map<UID, StorageServerInfo*> server_interf;
|
||||
std::map<UID, BlobWorkerInterface> blobWorker_interf; // blob workers don't change endpoints for the same ID
|
||||
|
@ -558,7 +580,8 @@ public:
|
|||
EventCacheHolder connectToDatabaseEventCacheHolder;
|
||||
|
||||
private:
|
||||
std::unordered_map<Key, Reference<WatchMetadata>> watchMap;
|
||||
std::unordered_map<std::pair<int64_t, Key>, Reference<WatchMetadata>, boost::hash<std::pair<int64_t, Key>>>
|
||||
watchMap;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -656,7 +656,7 @@ struct RangeResultRef : VectorRef<KeyValueRef> {
|
|||
// limits requested) False implies that no such values remain
|
||||
Optional<KeyRef> readThrough; // Only present when 'more' is true. When present, this value represent the end (or
|
||||
// beginning if reverse) of the range which was read to produce these results. This is
|
||||
// guarenteed to be less than the requested range.
|
||||
// guaranteed to be less than the requested range.
|
||||
bool readToBegin;
|
||||
bool readThroughEnd;
|
||||
|
||||
|
|
|
@ -37,6 +37,9 @@ the contents of the system key space.
|
|||
#include "fdbclient/ClientBooleanParams.h"
|
||||
#include "fdbclient/DatabaseConfiguration.h"
|
||||
#include "fdbclient/Status.h"
|
||||
#include "fdbclient/Subspace.h"
|
||||
#include "fdbclient/DatabaseConfiguration.h"
|
||||
#include "fdbclient/Status.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
|
@ -626,6 +629,231 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db,
|
|||
// used by special keys and fdbcli
|
||||
std::string generateErrorMessage(const CoordinatorsResult& res);
|
||||
|
||||
ACTOR template <class Transaction>
|
||||
Future<Optional<TenantMapEntry>> tryGetTenantTransaction(Transaction tr, TenantName name) {
|
||||
state Key tenantMapKey = name.withPrefix(tenantMapPrefix);
|
||||
|
||||
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
|
||||
|
||||
Optional<Value> val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey)));
|
||||
return val.map<TenantMapEntry>([](Optional<Value> v) { return decodeTenantEntry(v.get()); });
|
||||
}
|
||||
|
||||
ACTOR template <class DB>
|
||||
Future<Optional<TenantMapEntry>> tryGetTenant(Reference<DB> db, TenantName name) {
|
||||
state Reference<typename DB::TransactionT> tr = db->createTransaction();
|
||||
|
||||
loop {
|
||||
try {
|
||||
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
|
||||
return entry;
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR template <class Transaction>
|
||||
Future<TenantMapEntry> getTenantTransaction(Transaction tr, TenantName name) {
|
||||
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
|
||||
if (!entry.present()) {
|
||||
throw tenant_not_found();
|
||||
}
|
||||
|
||||
return entry.get();
|
||||
}
|
||||
|
||||
ACTOR template <class DB>
|
||||
Future<TenantMapEntry> getTenant(Reference<DB> db, TenantName name) {
|
||||
Optional<TenantMapEntry> entry = wait(tryGetTenant(db, name));
|
||||
if (!entry.present()) {
|
||||
throw tenant_not_found();
|
||||
}
|
||||
|
||||
return entry.get();
|
||||
}
|
||||
|
||||
// Creates a tenant with the given name. If the tenant already exists, an empty optional will be returned.
|
||||
ACTOR template <class Transaction>
|
||||
Future<Optional<TenantMapEntry>> createTenantTransaction(Transaction tr, TenantNameRef name) {
|
||||
state Key tenantMapKey = name.withPrefix(tenantMapPrefix);
|
||||
|
||||
if (name.startsWith("\xff"_sr)) {
|
||||
throw invalid_tenant_name();
|
||||
}
|
||||
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
|
||||
state Future<Optional<TenantMapEntry>> tenantEntryFuture = tryGetTenantTransaction(tr, name);
|
||||
state Future<Optional<Value>> tenantDataPrefixFuture = safeThreadFutureToFuture(tr->get(tenantDataPrefixKey));
|
||||
state Future<Optional<Value>> lastIdFuture = safeThreadFutureToFuture(tr->get(tenantLastIdKey));
|
||||
|
||||
Optional<Value> tenantMode = wait(safeThreadFutureToFuture(tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr))));
|
||||
|
||||
if (!tenantMode.present() || tenantMode.get() == StringRef(format("%d", TenantMode::DISABLED))) {
|
||||
throw tenants_disabled();
|
||||
}
|
||||
|
||||
Optional<TenantMapEntry> tenantEntry = wait(tenantEntryFuture);
|
||||
if (tenantEntry.present()) {
|
||||
return Optional<TenantMapEntry>();
|
||||
}
|
||||
|
||||
state Optional<Value> lastIdVal = wait(lastIdFuture);
|
||||
Optional<Value> tenantDataPrefix = wait(tenantDataPrefixFuture);
|
||||
|
||||
state TenantMapEntry newTenant(lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0,
|
||||
tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr);
|
||||
|
||||
RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(newTenant.prefix), 1)));
|
||||
if (!contents.empty()) {
|
||||
throw tenant_prefix_allocator_conflict();
|
||||
}
|
||||
|
||||
tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(newTenant.id));
|
||||
tr->set(tenantMapKey, encodeTenantEntry(newTenant));
|
||||
|
||||
return newTenant;
|
||||
}
|
||||
|
||||
ACTOR template <class DB>
|
||||
Future<Void> createTenant(Reference<DB> db, TenantName name) {
|
||||
state Reference<typename DB::TransactionT> tr = db->createTransaction();
|
||||
|
||||
state bool firstTry = true;
|
||||
loop {
|
||||
try {
|
||||
if (firstTry) {
|
||||
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
|
||||
if (entry.present()) {
|
||||
throw tenant_already_exists();
|
||||
}
|
||||
|
||||
firstTry = false;
|
||||
}
|
||||
|
||||
state Optional<TenantMapEntry> newTenant = wait(createTenantTransaction(tr, name));
|
||||
|
||||
if (BUGGIFY) {
|
||||
throw commit_unknown_result();
|
||||
}
|
||||
|
||||
wait(safeThreadFutureToFuture(tr->commit()));
|
||||
|
||||
if (BUGGIFY) {
|
||||
throw commit_unknown_result();
|
||||
}
|
||||
|
||||
TraceEvent("CreatedTenant")
|
||||
.detail("Tenant", name)
|
||||
.detail("TenantId", newTenant.present() ? newTenant.get().id : -1)
|
||||
.detail("Prefix", newTenant.present() ? (StringRef)newTenant.get().prefix : "Unknown"_sr)
|
||||
.detail("Version", tr->getCommittedVersion());
|
||||
|
||||
return Void();
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR template <class Transaction>
|
||||
Future<Void> deleteTenantTransaction(Transaction tr, TenantNameRef name) {
|
||||
state Key tenantMapKey = name.withPrefix(tenantMapPrefix);
|
||||
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
|
||||
state Optional<TenantMapEntry> tenantEntry = wait(tryGetTenantTransaction(tr, name));
|
||||
if (!tenantEntry.present()) {
|
||||
return Void();
|
||||
}
|
||||
|
||||
RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(tenantEntry.get().prefix), 1)));
|
||||
if (!contents.empty()) {
|
||||
throw tenant_not_empty();
|
||||
}
|
||||
|
||||
tr->clear(tenantMapKey);
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR template <class DB>
|
||||
Future<Void> deleteTenant(Reference<DB> db, TenantName name) {
|
||||
state Reference<typename DB::TransactionT> tr = db->createTransaction();
|
||||
|
||||
state bool firstTry = true;
|
||||
loop {
|
||||
try {
|
||||
if (firstTry) {
|
||||
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
|
||||
if (!entry.present()) {
|
||||
throw tenant_not_found();
|
||||
}
|
||||
|
||||
firstTry = false;
|
||||
}
|
||||
|
||||
wait(deleteTenantTransaction(tr, name));
|
||||
|
||||
if (BUGGIFY) {
|
||||
throw commit_unknown_result();
|
||||
}
|
||||
|
||||
wait(safeThreadFutureToFuture(tr->commit()));
|
||||
|
||||
if (BUGGIFY) {
|
||||
throw commit_unknown_result();
|
||||
}
|
||||
|
||||
TraceEvent("DeletedTenant").detail("Tenant", name).detail("Version", tr->getCommittedVersion());
|
||||
return Void();
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR template <class Transaction>
|
||||
Future<std::map<TenantName, TenantMapEntry>> listTenantsTransaction(Transaction tr,
|
||||
TenantNameRef begin,
|
||||
TenantNameRef end,
|
||||
int limit) {
|
||||
state KeyRange range = KeyRangeRef(begin, end).withPrefix(tenantMapPrefix);
|
||||
|
||||
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
|
||||
|
||||
RangeResult results = wait(safeThreadFutureToFuture(
|
||||
tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit)));
|
||||
|
||||
std::map<TenantName, TenantMapEntry> tenants;
|
||||
for (auto kv : results) {
|
||||
tenants[kv.key.removePrefix(tenantMapPrefix)] = decodeTenantEntry(kv.value);
|
||||
}
|
||||
|
||||
return tenants;
|
||||
}
|
||||
|
||||
ACTOR template <class DB>
|
||||
Future<std::map<TenantName, TenantMapEntry>> listTenants(Reference<DB> db,
|
||||
TenantName begin,
|
||||
TenantName end,
|
||||
int limit) {
|
||||
state Reference<typename DB::TransactionT> tr = db->createTransaction();
|
||||
|
||||
loop {
|
||||
try {
|
||||
std::map<TenantName, TenantMapEntry> tenants = wait(listTenantsTransaction(tr, begin, end, limit));
|
||||
return tenants;
|
||||
} catch (Error& e) {
|
||||
wait(safeThreadFutureToFuture(tr->onError(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace ManagementAPI
|
||||
|
||||
#include "flow/unactorcompiler.h"
|
||||
|
|
|
@ -207,7 +207,7 @@ ACTOR Future<Void> read_http_response_headers(Reference<IConnection> conn,
|
|||
|
||||
// Reads an HTTP response from a network connection
|
||||
// If the connection fails while being read the exception will emitted
|
||||
// If the response is not parseable or complete in some way, http_bad_response will be thrown
|
||||
// If the response is not parsable or complete in some way, http_bad_response will be thrown
|
||||
ACTOR Future<Void> read_http_response(Reference<HTTP::Response> r, Reference<IConnection> conn, bool header_only) {
|
||||
state std::string buf;
|
||||
state size_t pos = 0;
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
#include "fdbclient/FDBOptions.g.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/Tenant.h"
|
||||
|
||||
#include "flow/ThreadHelper.actor.h"
|
||||
|
||||
|
@ -109,6 +110,18 @@ public:
|
|||
// Only if it's a MultiVersionTransaction and the underlying transaction handler is null,
|
||||
// it will return false
|
||||
virtual bool isValid() { return true; }
|
||||
|
||||
virtual Optional<TenantName> getTenant() = 0;
|
||||
};
|
||||
|
||||
class ITenant {
|
||||
public:
|
||||
virtual ~ITenant() {}
|
||||
|
||||
virtual Reference<ITransaction> createTransaction() = 0;
|
||||
|
||||
virtual void addref() = 0;
|
||||
virtual void delref() = 0;
|
||||
};
|
||||
|
||||
// An interface that represents a connection to a cluster made by a client
|
||||
|
@ -116,6 +129,7 @@ class IDatabase {
|
|||
public:
|
||||
virtual ~IDatabase() {}
|
||||
|
||||
virtual Reference<ITenant> openTenant(TenantNameRef tenantName) = 0;
|
||||
virtual Reference<ITransaction> createTransaction() = 0;
|
||||
virtual void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
|
||||
virtual double getMainThreadBusyness() = 0;
|
||||
|
|
|
@ -48,6 +48,21 @@ Reference<ISingleThreadTransaction> ISingleThreadTransaction::create(Type type,
|
|||
} else {
|
||||
result = makeReference<PaxosConfigTransaction>();
|
||||
}
|
||||
result->setDatabase(cx);
|
||||
result->construct(cx);
|
||||
return result;
|
||||
}
|
||||
|
||||
Reference<ISingleThreadTransaction> ISingleThreadTransaction::create(Type type,
|
||||
Database const& cx,
|
||||
TenantName const& tenant) {
|
||||
Reference<ISingleThreadTransaction> result;
|
||||
if (type == Type::RYW) {
|
||||
result = makeReference<ReadYourWritesTransaction>();
|
||||
} else if (type == Type::SIMPLE_CONFIG) {
|
||||
result = makeReference<SimpleConfigTransaction>();
|
||||
} else {
|
||||
result = makeReference<PaxosConfigTransaction>();
|
||||
}
|
||||
result->construct(cx, tenant);
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -45,8 +45,15 @@ public:
|
|||
};
|
||||
|
||||
static ISingleThreadTransaction* allocateOnForeignThread(Type);
|
||||
|
||||
static Reference<ISingleThreadTransaction> create(Type, Database const&);
|
||||
virtual void setDatabase(Database const&) = 0;
|
||||
static Reference<ISingleThreadTransaction> create(Type, Database const&, TenantName const&);
|
||||
|
||||
virtual void construct(Database const&) = 0;
|
||||
virtual void construct(Database const&, TenantName const&) {
|
||||
// By default, a transaction implementation does not support tenants.
|
||||
ASSERT(false);
|
||||
}
|
||||
|
||||
virtual void setVersion(Version v) = 0;
|
||||
virtual Future<Version> getReadVersion() = 0;
|
||||
|
|
|
@ -18,7 +18,9 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fdbclient/FDBOptions.g.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/GenericManagementAPI.actor.h"
|
||||
#include "fdbclient/MultiVersionTransaction.h"
|
||||
#include "fdbclient/MultiVersionAssignmentVars.h"
|
||||
#include "fdbclient/ClientVersion.h"
|
||||
|
@ -382,6 +384,15 @@ void DLTransaction::reset() {
|
|||
api->transactionReset(tr);
|
||||
}
|
||||
|
||||
// DLTenant
|
||||
Reference<ITransaction> DLTenant::createTransaction() {
|
||||
ASSERT(api->tenantCreateTransaction != nullptr);
|
||||
|
||||
FdbCApi::FDBTransaction* tr;
|
||||
api->tenantCreateTransaction(tenant, &tr);
|
||||
return Reference<ITransaction>(new DLTransaction(api, tr));
|
||||
}
|
||||
|
||||
// DLDatabase
|
||||
DLDatabase::DLDatabase(Reference<FdbCApi> api, ThreadFuture<FdbCApi::FDBDatabase*> dbFuture) : api(api), db(nullptr) {
|
||||
addref();
|
||||
|
@ -401,9 +412,19 @@ ThreadFuture<Void> DLDatabase::onReady() {
|
|||
return ready;
|
||||
}
|
||||
|
||||
Reference<ITenant> DLDatabase::openTenant(TenantNameRef tenantName) {
|
||||
if (!api->databaseOpenTenant) {
|
||||
throw unsupported_operation();
|
||||
}
|
||||
|
||||
FdbCApi::FDBTenant* tenant;
|
||||
throwIfError(api->databaseOpenTenant(db, tenantName.begin(), tenantName.size(), &tenant));
|
||||
return makeReference<DLTenant>(api, tenant);
|
||||
}
|
||||
|
||||
Reference<ITransaction> DLDatabase::createTransaction() {
|
||||
FdbCApi::FDBTransaction* tr;
|
||||
api->databaseCreateTransaction(db, &tr);
|
||||
throwIfError(api->databaseCreateTransaction(db, &tr));
|
||||
return Reference<ITransaction>(new DLTransaction(api, tr));
|
||||
}
|
||||
|
||||
|
@ -522,6 +543,7 @@ void DLApi::init() {
|
|||
loadClientFunction(&api->stopNetwork, lib, fdbCPath, "fdb_stop_network", headerVersion >= 0);
|
||||
loadClientFunction(&api->createDatabase, lib, fdbCPath, "fdb_create_database", headerVersion >= 610);
|
||||
|
||||
loadClientFunction(&api->databaseOpenTenant, lib, fdbCPath, "fdb_database_open_tenant", headerVersion >= 710);
|
||||
loadClientFunction(
|
||||
&api->databaseCreateTransaction, lib, fdbCPath, "fdb_database_create_transaction", headerVersion >= 0);
|
||||
loadClientFunction(&api->databaseSetOption, lib, fdbCPath, "fdb_database_set_option", headerVersion >= 0);
|
||||
|
@ -542,6 +564,10 @@ void DLApi::init() {
|
|||
loadClientFunction(
|
||||
&api->databaseCreateSnapshot, lib, fdbCPath, "fdb_database_create_snapshot", headerVersion >= 700);
|
||||
|
||||
loadClientFunction(
|
||||
&api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710);
|
||||
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710);
|
||||
|
||||
loadClientFunction(&api->transactionSetOption, lib, fdbCPath, "fdb_transaction_set_option", headerVersion >= 0);
|
||||
loadClientFunction(&api->transactionDestroy, lib, fdbCPath, "fdb_transaction_destroy", headerVersion >= 0);
|
||||
loadClientFunction(
|
||||
|
@ -737,8 +763,9 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParame
|
|||
|
||||
// MultiVersionTransaction
|
||||
MultiVersionTransaction::MultiVersionTransaction(Reference<MultiVersionDatabase> db,
|
||||
Optional<Reference<MultiVersionTenant>> tenant,
|
||||
UniqueOrderedOptionList<FDBTransactionOptions> defaultOptions)
|
||||
: db(db), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar<Void>()) {
|
||||
: db(db), tenant(tenant), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar<Void>()) {
|
||||
setDefaultOptions(defaultOptions);
|
||||
updateTransaction();
|
||||
}
|
||||
|
@ -749,18 +776,29 @@ void MultiVersionTransaction::setDefaultOptions(UniqueOrderedOptionList<FDBTrans
|
|||
}
|
||||
|
||||
void MultiVersionTransaction::updateTransaction() {
|
||||
auto currentDb = db->dbState->dbVar->get();
|
||||
|
||||
TransactionInfo newTr;
|
||||
if (currentDb.value) {
|
||||
newTr.transaction = currentDb.value->createTransaction();
|
||||
if (tenant.present()) {
|
||||
ASSERT(tenant.get());
|
||||
auto currentTenant = tenant.get()->tenantVar->get();
|
||||
if (currentTenant.value) {
|
||||
newTr.transaction = currentTenant.value->createTransaction();
|
||||
}
|
||||
|
||||
newTr.onChange = currentTenant.onChange;
|
||||
} else {
|
||||
auto currentDb = db->dbState->dbVar->get();
|
||||
if (currentDb.value) {
|
||||
newTr.transaction = currentDb.value->createTransaction();
|
||||
}
|
||||
|
||||
newTr.onChange = currentDb.onChange;
|
||||
}
|
||||
|
||||
Optional<StringRef> timeout;
|
||||
for (auto option : persistentOptions) {
|
||||
if (option.first == FDBTransactionOptions::TIMEOUT) {
|
||||
timeout = option.second.castTo<StringRef>();
|
||||
} else if (currentDb.value) {
|
||||
} else if (newTr.transaction) {
|
||||
newTr.transaction->setOption(option.first, option.second.castTo<StringRef>());
|
||||
}
|
||||
}
|
||||
|
@ -770,13 +808,11 @@ void MultiVersionTransaction::updateTransaction() {
|
|||
// that might inadvertently fail the transaction.
|
||||
if (timeout.present()) {
|
||||
setTimeout(timeout);
|
||||
if (currentDb.value) {
|
||||
if (newTr.transaction) {
|
||||
newTr.transaction->setOption(FDBTransactionOptions::TIMEOUT, timeout);
|
||||
}
|
||||
}
|
||||
|
||||
newTr.onChange = currentDb.onChange;
|
||||
|
||||
lock.enter();
|
||||
transaction = newTr;
|
||||
lock.leave();
|
||||
|
@ -1041,6 +1077,14 @@ ThreadFuture<Void> MultiVersionTransaction::onError(Error const& e) {
|
|||
}
|
||||
}
|
||||
|
||||
Optional<TenantName> MultiVersionTransaction::getTenant() {
|
||||
if (tenant.present()) {
|
||||
return tenant.get()->tenantName;
|
||||
} else {
|
||||
return Optional<TenantName>();
|
||||
}
|
||||
}
|
||||
|
||||
// Waits for the specified duration and signals the assignment variable with a timed out error
|
||||
// This will be canceled if a new timeout is set, in which case the tsav will not be signaled.
|
||||
ACTOR Future<Void> timeoutImpl(Reference<ThreadSingleAssignmentVar<Void>> tsav, double duration) {
|
||||
|
@ -1167,6 +1211,39 @@ bool MultiVersionTransaction::isValid() {
|
|||
return tr.transaction.isValid();
|
||||
}
|
||||
|
||||
// MultiVersionTenant
|
||||
MultiVersionTenant::MultiVersionTenant(Reference<MultiVersionDatabase> db, StringRef tenantName)
|
||||
: tenantVar(new ThreadSafeAsyncVar<Reference<ITenant>>(Reference<ITenant>(nullptr))), tenantName(tenantName), db(db) {
|
||||
updateTenant();
|
||||
}
|
||||
|
||||
MultiVersionTenant::~MultiVersionTenant() {}
|
||||
|
||||
Reference<ITransaction> MultiVersionTenant::createTransaction() {
|
||||
return Reference<ITransaction>(new MultiVersionTransaction(
|
||||
db, Reference<MultiVersionTenant>::addRef(this), db->dbState->transactionDefaultOptions));
|
||||
}
|
||||
|
||||
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
|
||||
// to open transactions via an AsyncVar.
|
||||
void MultiVersionTenant::updateTenant() {
|
||||
Reference<ITenant> tenant;
|
||||
auto currentDb = db->dbState->dbVar->get();
|
||||
if (currentDb.value) {
|
||||
tenant = currentDb.value->openTenant(tenantName);
|
||||
} else {
|
||||
tenant = Reference<ITenant>(nullptr);
|
||||
}
|
||||
|
||||
tenantVar->set(tenant);
|
||||
|
||||
MutexHolder holder(tenantLock);
|
||||
tenantUpdater = mapThreadFuture<Void, Void>(currentDb.onChange, [this](ErrorOr<Void> result) {
|
||||
updateTenant();
|
||||
return Void();
|
||||
});
|
||||
}
|
||||
|
||||
// MultiVersionDatabase
|
||||
MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
|
||||
int threadIdx,
|
||||
|
@ -1241,9 +1318,14 @@ Reference<IDatabase> MultiVersionDatabase::debugCreateFromExistingDatabase(Refer
|
|||
return Reference<IDatabase>(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, db, false));
|
||||
}
|
||||
|
||||
Reference<ITenant> MultiVersionDatabase::openTenant(TenantNameRef tenantName) {
|
||||
return makeReference<MultiVersionTenant>(Reference<MultiVersionDatabase>::addRef(this), tenantName);
|
||||
}
|
||||
|
||||
Reference<ITransaction> MultiVersionDatabase::createTransaction() {
|
||||
return Reference<ITransaction>(
|
||||
new MultiVersionTransaction(Reference<MultiVersionDatabase>::addRef(this), dbState->transactionDefaultOptions));
|
||||
return Reference<ITransaction>(new MultiVersionTransaction(Reference<MultiVersionDatabase>::addRef(this),
|
||||
Optional<Reference<MultiVersionTenant>>(),
|
||||
dbState->transactionDefaultOptions));
|
||||
}
|
||||
|
||||
void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value) {
|
||||
|
|
|
@ -36,6 +36,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
|
|||
typedef struct FDB_result FDBResult;
|
||||
typedef struct FDB_cluster FDBCluster;
|
||||
typedef struct FDB_database FDBDatabase;
|
||||
typedef struct FDB_tenant FDBTenant;
|
||||
typedef struct FDB_transaction FDBTransaction;
|
||||
|
||||
typedef int fdb_error_t;
|
||||
|
@ -120,6 +121,10 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
|
|||
fdb_error_t (*createDatabase)(const char* clusterFilePath, FDBDatabase** db);
|
||||
|
||||
// Database
|
||||
fdb_error_t (*databaseOpenTenant)(FDBDatabase* database,
|
||||
uint8_t const* tenantName,
|
||||
int tenantNameLength,
|
||||
FDBTenant** outTenant);
|
||||
fdb_error_t (*databaseCreateTransaction)(FDBDatabase* database, FDBTransaction** tr);
|
||||
fdb_error_t (*databaseSetOption)(FDBDatabase* database,
|
||||
FDBDatabaseOption option,
|
||||
|
@ -140,6 +145,10 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
|
|||
double (*databaseGetMainThreadBusyness)(FDBDatabase* database);
|
||||
FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion);
|
||||
|
||||
// Tenant
|
||||
fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);
|
||||
void (*tenantDestroy)(FDBTenant* tenant);
|
||||
|
||||
// Transaction
|
||||
fdb_error_t (*transactionSetOption)(FDBTransaction* tr,
|
||||
FDBTransactionOption option,
|
||||
|
@ -353,6 +362,11 @@ public:
|
|||
ThreadFuture<Void> onError(Error const& e) override;
|
||||
void reset() override;
|
||||
|
||||
Optional<TenantName> getTenant() override {
|
||||
ASSERT(false);
|
||||
throw internal_error();
|
||||
}
|
||||
|
||||
void addref() override { ThreadSafeReferenceCounted<DLTransaction>::addref(); }
|
||||
void delref() override { ThreadSafeReferenceCounted<DLTransaction>::delref(); }
|
||||
|
||||
|
@ -361,6 +375,25 @@ private:
|
|||
FdbCApi::FDBTransaction* const tr;
|
||||
};
|
||||
|
||||
class DLTenant : public ITenant, ThreadSafeReferenceCounted<DLTenant> {
|
||||
public:
|
||||
DLTenant(Reference<FdbCApi> api, FdbCApi::FDBTenant* tenant) : api(api), tenant(tenant) {}
|
||||
~DLTenant() override {
|
||||
if (tenant) {
|
||||
api->tenantDestroy(tenant);
|
||||
}
|
||||
}
|
||||
|
||||
Reference<ITransaction> createTransaction() override;
|
||||
|
||||
void addref() override { ThreadSafeReferenceCounted<DLTenant>::addref(); }
|
||||
void delref() override { ThreadSafeReferenceCounted<DLTenant>::delref(); }
|
||||
|
||||
private:
|
||||
const Reference<FdbCApi> api;
|
||||
FdbCApi::FDBTenant* tenant;
|
||||
};
|
||||
|
||||
// An implementation of IDatabase that wraps a database object created on an externally loaded client library.
|
||||
// All API calls to that database are routed through the external library.
|
||||
class DLDatabase : public IDatabase, ThreadSafeReferenceCounted<DLDatabase> {
|
||||
|
@ -375,6 +408,7 @@ public:
|
|||
|
||||
ThreadFuture<Void> onReady();
|
||||
|
||||
Reference<ITenant> openTenant(TenantNameRef tenantName) override;
|
||||
Reference<ITransaction> createTransaction() override;
|
||||
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
|
||||
double getMainThreadBusyness() override;
|
||||
|
@ -432,6 +466,7 @@ private:
|
|||
};
|
||||
|
||||
class MultiVersionDatabase;
|
||||
class MultiVersionTenant;
|
||||
|
||||
// An implementation of ITransaction that wraps a transaction created either locally or through a dynamically loaded
|
||||
// external client. When needed (e.g on cluster version change), the MultiVersionTransaction can automatically replace
|
||||
|
@ -439,6 +474,7 @@ class MultiVersionDatabase;
|
|||
class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted<MultiVersionTransaction> {
|
||||
public:
|
||||
MultiVersionTransaction(Reference<MultiVersionDatabase> db,
|
||||
Optional<Reference<MultiVersionTenant>> tenant,
|
||||
UniqueOrderedOptionList<FDBTransactionOptions> defaultOptions);
|
||||
|
||||
~MultiVersionTransaction() override;
|
||||
|
@ -507,6 +543,8 @@ public:
|
|||
ThreadFuture<Void> onError(Error const& e) override;
|
||||
void reset() override;
|
||||
|
||||
Optional<TenantName> getTenant() override;
|
||||
|
||||
void addref() override { ThreadSafeReferenceCounted<MultiVersionTransaction>::addref(); }
|
||||
void delref() override { ThreadSafeReferenceCounted<MultiVersionTransaction>::delref(); }
|
||||
|
||||
|
@ -515,6 +553,7 @@ public:
|
|||
|
||||
private:
|
||||
const Reference<MultiVersionDatabase> db;
|
||||
const Optional<Reference<MultiVersionTenant>> tenant;
|
||||
ThreadSpinLock lock;
|
||||
|
||||
struct TransactionInfo {
|
||||
|
@ -555,6 +594,8 @@ private:
|
|||
void setDefaultOptions(UniqueOrderedOptionList<FDBTransactionOptions> options);
|
||||
|
||||
std::vector<std::pair<FDBTransactionOptions::Option, Optional<Standalone<StringRef>>>> persistentOptions;
|
||||
|
||||
const Optional<TenantName> tenantName;
|
||||
};
|
||||
|
||||
struct ClientDesc {
|
||||
|
@ -585,6 +626,33 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted<ClientInfo> {
|
|||
|
||||
class MultiVersionApi;
|
||||
|
||||
// An implementation of ITenant that wraps a tenant created either locally or through a dynamically loaded
|
||||
// external client. The wrapped ITenant is automatically changed when the MultiVersionDatabase used to create
|
||||
// it connects with a different version.
|
||||
class MultiVersionTenant final : public ITenant, ThreadSafeReferenceCounted<MultiVersionTenant> {
|
||||
public:
|
||||
MultiVersionTenant(Reference<MultiVersionDatabase> db, StringRef tenantName);
|
||||
~MultiVersionTenant() override;
|
||||
|
||||
Reference<ITransaction> createTransaction() override;
|
||||
|
||||
void addref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::addref(); }
|
||||
void delref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::delref(); }
|
||||
|
||||
Reference<ThreadSafeAsyncVar<Reference<ITenant>>> tenantVar;
|
||||
const Standalone<StringRef> tenantName;
|
||||
|
||||
private:
|
||||
Reference<MultiVersionDatabase> db;
|
||||
|
||||
Mutex tenantLock;
|
||||
ThreadFuture<Void> tenantUpdater;
|
||||
|
||||
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
|
||||
// to open transactions via an AsyncVar.
|
||||
void updateTenant();
|
||||
};
|
||||
|
||||
// An implementation of IDatabase that wraps a database created either locally or through a dynamically loaded
|
||||
// external client. The MultiVersionDatabase monitors the protocol version of the cluster and automatically
|
||||
// replaces the wrapped database when the protocol version changes.
|
||||
|
@ -599,6 +667,7 @@ public:
|
|||
|
||||
~MultiVersionDatabase() override;
|
||||
|
||||
Reference<ITenant> openTenant(TenantNameRef tenantName) override;
|
||||
Reference<ITransaction> createTransaction() override;
|
||||
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
|
||||
double getMainThreadBusyness() override;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -132,12 +132,12 @@ void setupNetwork(uint64_t transportId = 0, UseMetrics = UseMetrics::False);
|
|||
// call stopNetwork (from a non-networking thread) can cause the runNetwork() call to
|
||||
// return.
|
||||
//
|
||||
// Throws network_already_setup if g_network has already been initalized
|
||||
// Throws network_already_setup if g_network has already been initialized
|
||||
void runNetwork();
|
||||
|
||||
// See above. Can be called from a thread that is not the "networking thread"
|
||||
//
|
||||
// Throws network_not_setup if g_network has not been initalized
|
||||
// Throws network_not_setup if g_network has not been initialized
|
||||
void stopNetwork();
|
||||
|
||||
struct StorageMetrics;
|
||||
|
@ -159,6 +159,7 @@ struct TransactionOptions {
|
|||
bool expensiveClearCostEstimation : 1;
|
||||
bool useGrvCache : 1;
|
||||
bool skipGrvCache : 1;
|
||||
bool rawAccess : 1;
|
||||
|
||||
TransactionPriority priority;
|
||||
|
||||
|
@ -236,6 +237,8 @@ struct Watch : public ReferenceCounted<Watch>, NonCopyable {
|
|||
|
||||
struct TransactionState : ReferenceCounted<TransactionState> {
|
||||
Database cx;
|
||||
Optional<TenantName> tenant;
|
||||
int64_t tenantId = TenantInfo::INVALID_TENANT;
|
||||
Reference<TransactionLogInfo> trLogInfo;
|
||||
TransactionOptions options;
|
||||
|
||||
|
@ -258,15 +261,19 @@ struct TransactionState : ReferenceCounted<TransactionState> {
|
|||
// Only available so that Transaction can have a default constructor, for use in state variables
|
||||
TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID) {}
|
||||
|
||||
TransactionState(Database cx, TaskPriority taskID, SpanID spanID, Reference<TransactionLogInfo> trLogInfo)
|
||||
: cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID) {}
|
||||
TransactionState(Database cx,
|
||||
Optional<TenantName> tenant,
|
||||
TaskPriority taskID,
|
||||
SpanID spanID,
|
||||
Reference<TransactionLogInfo> trLogInfo);
|
||||
|
||||
Reference<TransactionState> cloneAndReset(Reference<TransactionLogInfo> newTrLogInfo, bool generateNewSpan) const;
|
||||
TenantInfo getTenantInfo() const;
|
||||
};
|
||||
|
||||
class Transaction : NonCopyable {
|
||||
public:
|
||||
explicit Transaction(Database const& cx);
|
||||
explicit Transaction(Database const& cx, Optional<TenantName> const& tenant = Optional<TenantName>());
|
||||
~Transaction();
|
||||
|
||||
void setVersion(Version v);
|
||||
|
@ -440,6 +447,8 @@ public:
|
|||
return Standalone<VectorRef<KeyRangeRef>>(tr.transaction.write_conflict_ranges, tr.arena);
|
||||
}
|
||||
|
||||
Optional<TenantName> getTenant() { return trState->tenant; }
|
||||
|
||||
Reference<TransactionState> trState;
|
||||
std::vector<Reference<Watch>> watches;
|
||||
Span span;
|
||||
|
@ -481,6 +490,25 @@ int64_t extractIntOption(Optional<StringRef> value,
|
|||
// states: coordinator, TLog and storage state
|
||||
ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
|
||||
|
||||
// Adds necessary mutation(s) to the transaction, so that *one* checkpoint will be created for
|
||||
// each and every shards overlapping with `range`. Each checkpoint will be created at a random
|
||||
// storage server for each shard.
|
||||
// All checkpoint(s) will be created at the transaction's commit version.
|
||||
Future<Void> createCheckpoint(Transaction* tr, KeyRangeRef range, CheckpointFormat format);
|
||||
|
||||
// Same as above.
|
||||
Future<Void> createCheckpoint(Reference<ReadYourWritesTransaction> tr, KeyRangeRef range, CheckpointFormat format);
|
||||
|
||||
// Gets checkpoint metadata for `keys` at the specific version, with the particular format.
|
||||
// One CheckpointMetaData will be returned for each distinctive shard.
|
||||
// The collective keyrange of the returned checkpoint(s) is a super-set of `keys`.
|
||||
// checkpoint_not_found() error will be returned if the specific checkpoint(s) cannot be found.
|
||||
ACTOR Future<std::vector<CheckpointMetaData>> getCheckpointMetaData(Database cx,
|
||||
KeyRange keys,
|
||||
Version version,
|
||||
CheckpointFormat format,
|
||||
double timeout = 5.0);
|
||||
|
||||
// Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
|
||||
ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion> exclusions);
|
||||
|
||||
|
|
|
@ -22,6 +22,8 @@
|
|||
#include "fdbclient/PaxosConfigTransaction.h"
|
||||
#include "flow/actorcompiler.h" // must be last include
|
||||
|
||||
using ConfigTransactionInfo = ModelInterface<ConfigTransactionInterface>;
|
||||
|
||||
class CommitQuorum {
|
||||
ActorCollection actors{ false };
|
||||
std::vector<ConfigTransactionInterface> ctis;
|
||||
|
@ -224,10 +226,12 @@ class PaxosConfigTransactionImpl {
|
|||
loop {
|
||||
try {
|
||||
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
|
||||
// TODO: Load balance
|
||||
state Reference<ConfigTransactionInfo> configNodes(
|
||||
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false));
|
||||
ConfigTransactionGetReply reply =
|
||||
wait(timeoutError(self->getGenerationQuorum.getReadReplicas()[0].get.getReply(
|
||||
ConfigTransactionGetRequest{ generation, configKey }),
|
||||
wait(timeoutError(basicLoadBalance(configNodes,
|
||||
&ConfigTransactionInterface::get,
|
||||
ConfigTransactionGetRequest{ generation, configKey }),
|
||||
CLIENT_KNOBS->GET_KNOB_TIMEOUT));
|
||||
if (reply.value.present()) {
|
||||
return reply.value.get().toValue();
|
||||
|
@ -245,10 +249,12 @@ class PaxosConfigTransactionImpl {
|
|||
|
||||
ACTOR static Future<RangeResult> getConfigClasses(PaxosConfigTransactionImpl* self) {
|
||||
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
|
||||
// TODO: Load balance
|
||||
state Reference<ConfigTransactionInfo> configNodes(
|
||||
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false));
|
||||
ConfigTransactionGetConfigClassesReply reply =
|
||||
wait(retryBrokenPromise(self->getGenerationQuorum.getReadReplicas()[0].getClasses,
|
||||
ConfigTransactionGetConfigClassesRequest{ generation }));
|
||||
wait(basicLoadBalance(configNodes,
|
||||
&ConfigTransactionInterface::getClasses,
|
||||
ConfigTransactionGetConfigClassesRequest{ generation }));
|
||||
RangeResult result;
|
||||
result.reserve(result.arena(), reply.configClasses.size());
|
||||
for (const auto& configClass : reply.configClasses) {
|
||||
|
@ -259,10 +265,12 @@ class PaxosConfigTransactionImpl {
|
|||
|
||||
ACTOR static Future<RangeResult> getKnobs(PaxosConfigTransactionImpl* self, Optional<Key> configClass) {
|
||||
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
|
||||
// TODO: Load balance
|
||||
state Reference<ConfigTransactionInfo> configNodes(
|
||||
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false));
|
||||
ConfigTransactionGetKnobsReply reply =
|
||||
wait(retryBrokenPromise(self->getGenerationQuorum.getReadReplicas()[0].getKnobs,
|
||||
ConfigTransactionGetKnobsRequest{ generation, configClass }));
|
||||
wait(basicLoadBalance(configNodes,
|
||||
&ConfigTransactionInterface::getKnobs,
|
||||
ConfigTransactionGetKnobsRequest{ generation, configClass }));
|
||||
RangeResult result;
|
||||
result.reserve(result.arena(), reply.knobNames.size());
|
||||
for (const auto& knobName : reply.knobNames) {
|
||||
|
@ -461,6 +469,6 @@ PaxosConfigTransaction::PaxosConfigTransaction() = default;
|
|||
|
||||
PaxosConfigTransaction::~PaxosConfigTransaction() = default;
|
||||
|
||||
void PaxosConfigTransaction::setDatabase(Database const& cx) {
|
||||
void PaxosConfigTransaction::construct(Database const& cx) {
|
||||
impl = PImpl<PaxosConfigTransactionImpl>::create(cx);
|
||||
}
|
||||
|
|
|
@ -35,7 +35,7 @@ public:
|
|||
PaxosConfigTransaction(std::vector<ConfigTransactionInterface> const&);
|
||||
PaxosConfigTransaction();
|
||||
~PaxosConfigTransaction();
|
||||
void setDatabase(Database const&) override;
|
||||
void construct(Database const&) override;
|
||||
Future<Version> getReadVersion() override;
|
||||
Optional<Version> getCachedReadVersion() const override;
|
||||
|
||||
|
|
|
@ -1443,17 +1443,21 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx)
|
||||
: ISingleThreadTransaction(cx->deferredError), tr(cx), cache(&arena), writes(&arena), retries(0), approximateSize(0),
|
||||
creationTime(now()), commitStarted(false), versionStampFuture(tr.getVersionstamp()),
|
||||
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx, Optional<TenantName> tenantName)
|
||||
: ISingleThreadTransaction(cx->deferredError), tr(cx, tenantName), cache(&arena), writes(&arena), retries(0),
|
||||
approximateSize(0), creationTime(now()), commitStarted(false), versionStampFuture(tr.getVersionstamp()),
|
||||
specialKeySpaceWriteMap(std::make_pair(false, Optional<Value>()), specialKeys.end), options(tr) {
|
||||
std::copy(
|
||||
cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions));
|
||||
applyPersistentOptions();
|
||||
}
|
||||
|
||||
void ReadYourWritesTransaction::setDatabase(Database const& cx) {
|
||||
*this = ReadYourWritesTransaction(cx);
|
||||
void ReadYourWritesTransaction::construct(Database const& cx) {
|
||||
*this = ReadYourWritesTransaction(cx, Optional<TenantName>());
|
||||
}
|
||||
|
||||
void ReadYourWritesTransaction::construct(Database const& cx, TenantName const& tenantName) {
|
||||
*this = ReadYourWritesTransaction(cx, tenantName);
|
||||
}
|
||||
|
||||
ACTOR Future<Void> timebomb(double endTime, Promise<Void> resetPromise) {
|
||||
|
|
|
@ -68,10 +68,11 @@ class ReadYourWritesTransaction final : NonCopyable,
|
|||
public ISingleThreadTransaction,
|
||||
public FastAllocated<ReadYourWritesTransaction> {
|
||||
public:
|
||||
explicit ReadYourWritesTransaction(Database const& cx);
|
||||
explicit ReadYourWritesTransaction(Database const& cx, Optional<TenantName> tenant = Optional<TenantName>());
|
||||
~ReadYourWritesTransaction();
|
||||
|
||||
void setDatabase(Database const&) override;
|
||||
void construct(Database const&) override;
|
||||
void construct(Database const&, TenantName const& tenant) override;
|
||||
void setVersion(Version v) override { tr.setVersion(v); }
|
||||
Future<Version> getReadVersion() override;
|
||||
Optional<Version> getCachedReadVersion() const override { return tr.getCachedReadVersion(); }
|
||||
|
@ -190,6 +191,8 @@ public:
|
|||
void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; }
|
||||
Transaction& getTransaction() { return tr; }
|
||||
|
||||
Optional<TenantName> getTenant() { return tr.getTenant(); }
|
||||
|
||||
// used in template functions as returned Future type
|
||||
template <typename Type>
|
||||
using FutureT = Future<Type>;
|
||||
|
|
|
@ -68,7 +68,7 @@ public:
|
|||
"connect_tries (or ct) Number of times to try to connect for each request.",
|
||||
"connect_timeout (or cto) Number of seconds to wait for a connect request to succeed.",
|
||||
"max_connection_life (or mcl) Maximum number of seconds to use a single TCP connection.",
|
||||
"request_tries (or rt) Number of times to try each request until a parseable HTTP "
|
||||
"request_tries (or rt) Number of times to try each request until a parsable HTTP "
|
||||
"response other than 429 is received.",
|
||||
"request_timeout_min (or rtom) Number of seconds to wait for a request to succeed after a "
|
||||
"connection is established.",
|
||||
|
|
|
@ -113,10 +113,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
// Data distribution queue
|
||||
init( HEALTH_POLL_TIME, 1.0 );
|
||||
init( BEST_TEAM_STUCK_DELAY, 1.0 );
|
||||
init( DEST_OVERLOADED_DELAY, 0.2 );
|
||||
init( BG_REBALANCE_POLLING_INTERVAL, 10.0 );
|
||||
init( BG_REBALANCE_SWITCH_CHECK_INTERVAL, 5.0 ); if (randomize && BUGGIFY) BG_REBALANCE_SWITCH_CHECK_INTERVAL = 1.0;
|
||||
init( DD_QUEUE_LOGGING_INTERVAL, 5.0 );
|
||||
init( RELOCATION_PARALLELISM_PER_SOURCE_SERVER, 2 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 1;
|
||||
init( RELOCATION_PARALLELISM_PER_DEST_SERVER, 10 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_DEST_SERVER = 1; // Note: if this is smaller than FETCH_KEYS_PARALLELISM, this will artificially reduce performance. The current default of 10 is probably too high but is set conservatively for now.
|
||||
init( DD_QUEUE_MAX_KEY_SERVERS, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_MAX_KEY_SERVERS = 1;
|
||||
init( DD_REBALANCE_PARALLELISM, 50 );
|
||||
init( DD_REBALANCE_RESET_AMOUNT, 30 );
|
||||
|
@ -365,6 +367,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
|
||||
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
|
||||
init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true );
|
||||
init( DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, "fdb");
|
||||
|
||||
init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true;
|
||||
init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 );
|
||||
init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 );
|
||||
|
@ -676,6 +680,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( MAX_STORAGE_COMMIT_TIME, 120.0 ); //The max fsync stall time on the storage server and tlog before marking a disk as failed
|
||||
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
|
||||
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
|
||||
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
|
||||
init( QUICK_GET_VALUE_FALLBACK, true );
|
||||
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
|
||||
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
|
||||
|
@ -714,6 +719,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( COORDINATOR_LEADER_CONNECTION_TIMEOUT, 20.0 );
|
||||
|
||||
// Dynamic Knobs (implementation)
|
||||
init( COMPACTION_INTERVAL, isSimulated ? 5.0 : 300.0 );
|
||||
init( UPDATE_NODE_TIMEOUT, 3.0 );
|
||||
init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 );
|
||||
init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 );
|
||||
|
|
|
@ -112,10 +112,12 @@ public:
|
|||
// Data distribution queue
|
||||
double HEALTH_POLL_TIME;
|
||||
double BEST_TEAM_STUCK_DELAY;
|
||||
double DEST_OVERLOADED_DELAY;
|
||||
double BG_REBALANCE_POLLING_INTERVAL;
|
||||
double BG_REBALANCE_SWITCH_CHECK_INTERVAL;
|
||||
double DD_QUEUE_LOGGING_INTERVAL;
|
||||
double RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
|
||||
double RELOCATION_PARALLELISM_PER_DEST_SERVER;
|
||||
int DD_QUEUE_MAX_KEY_SERVERS;
|
||||
int DD_REBALANCE_PARALLELISM;
|
||||
int DD_REBALANCE_RESET_AMOUNT;
|
||||
|
@ -296,6 +298,7 @@ public:
|
|||
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
|
||||
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
|
||||
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
|
||||
std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY;
|
||||
bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context metrics. May cause performance overhead
|
||||
double ROCKSDB_PERFCONTEXT_SAMPLE_RATE;
|
||||
int ROCKSDB_MAX_SUBCOMPACTIONS;
|
||||
|
@ -615,6 +618,7 @@ public:
|
|||
bool ENABLE_CLEAR_RANGE_EAGER_READS;
|
||||
bool QUICK_GET_VALUE_FALLBACK;
|
||||
bool QUICK_GET_KEY_VALUES_FALLBACK;
|
||||
int CHECKPOINT_TRANSFER_BLOCK_BYTES;
|
||||
int QUICK_GET_KEY_VALUES_LIMIT;
|
||||
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
|
||||
|
||||
|
@ -653,6 +657,7 @@ public:
|
|||
double COORDINATOR_LEADER_CONNECTION_TIMEOUT;
|
||||
|
||||
// Dynamic Knobs (implementation)
|
||||
double COMPACTION_INTERVAL;
|
||||
double UPDATE_NODE_TIMEOUT;
|
||||
double GET_COMMITTED_VERSION_TIMEOUT;
|
||||
double GET_SNAPSHOT_AND_CHANGES_TIMEOUT;
|
||||
|
|
|
@ -286,7 +286,7 @@ void SimpleConfigTransaction::checkDeferredError() const {
|
|||
impl->checkDeferredError(deferredError);
|
||||
}
|
||||
|
||||
void SimpleConfigTransaction::setDatabase(Database const& cx) {
|
||||
void SimpleConfigTransaction::construct(Database const& cx) {
|
||||
impl = PImpl<SimpleConfigTransactionImpl>::create(cx);
|
||||
}
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ public:
|
|||
SimpleConfigTransaction(ConfigTransactionInterface const&);
|
||||
SimpleConfigTransaction(Database const&);
|
||||
SimpleConfigTransaction();
|
||||
void setDatabase(Database const&) override;
|
||||
void construct(Database const&) override;
|
||||
~SimpleConfigTransaction();
|
||||
Future<Version> getReadVersion() override;
|
||||
Optional<Version> getCachedReadVersion() const override;
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
|
||||
#include "fdbclient/ActorLineageProfiler.h"
|
||||
#include "fdbclient/ClusterConnectionMemoryRecord.h"
|
||||
#include "fdbclient/FDBOptions.g.h"
|
||||
#include "fdbclient/Knobs.h"
|
||||
#include "fdbclient/ProcessInterface.h"
|
||||
#include "fdbclient/GlobalConfig.actor.h"
|
||||
|
@ -54,6 +55,8 @@ static bool isAlphaNumeric(const std::string& key) {
|
|||
}
|
||||
} // namespace
|
||||
|
||||
const KeyRangeRef TenantMapRangeImpl::submoduleRange = KeyRangeRef("tenant_map/"_sr, "tenant_map0"_sr);
|
||||
|
||||
std::unordered_map<SpecialKeySpace::MODULE, KeyRange> SpecialKeySpace::moduleToBoundary = {
|
||||
{ SpecialKeySpace::MODULE::TRANSACTION,
|
||||
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")) },
|
||||
|
@ -111,7 +114,8 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
|
|||
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
|
||||
{ "datadistribution",
|
||||
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
|
||||
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
|
||||
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
|
||||
{ "tenantmap", TenantMapRangeImpl::submoduleRange.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
|
||||
};
|
||||
|
||||
std::unordered_map<std::string, KeyRange> SpecialKeySpace::actorLineageApiCommandToRange = {
|
||||
|
@ -1291,6 +1295,7 @@ void ProcessClassRangeImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef&
|
|||
}
|
||||
|
||||
ACTOR Future<RangeResult> getProcessClassSourceActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
|
||||
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
|
||||
std::vector<ProcessData> _workers = wait(getWorkers(&ryw->getTransaction()));
|
||||
auto workers = _workers; // strip const
|
||||
// Note : the sort by string is anti intuition, ex. 1.1.1.1:11 < 1.1.1.1:5
|
||||
|
@ -2568,7 +2573,7 @@ void includeLocalities(ReadYourWritesTransaction* ryw) {
|
|||
}
|
||||
}
|
||||
|
||||
// Reads the excludedlocality and failed locality keys using managment api,
|
||||
// Reads the excludedlocality and failed locality keys using management api,
|
||||
// parses them and returns the list.
|
||||
bool parseLocalitiesFromKeys(ReadYourWritesTransaction* ryw,
|
||||
bool failed,
|
||||
|
@ -2697,3 +2702,95 @@ Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr
|
|||
// exclude locality with failed option as true.
|
||||
return excludeLocalityCommitActor(ryw, true);
|
||||
}
|
||||
|
||||
ACTOR Future<RangeResult> getTenantList(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) {
|
||||
KeyRangeRef tenantRange =
|
||||
kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
|
||||
.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
|
||||
state KeyRef managementPrefix =
|
||||
kr.begin.substr(0,
|
||||
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() +
|
||||
TenantMapRangeImpl::submoduleRange.begin.size());
|
||||
|
||||
std::map<TenantName, TenantMapEntry> tenants = wait(ManagementAPI::listTenantsTransaction(
|
||||
Reference<ReadYourWritesTransaction>::addRef(ryw), tenantRange.begin, tenantRange.end, limitsHint.rows));
|
||||
|
||||
RangeResult results;
|
||||
for (auto tenant : tenants) {
|
||||
json_spirit::mObject tenantEntry;
|
||||
tenantEntry["id"] = tenant.second.id;
|
||||
tenantEntry["prefix"] = tenant.second.prefix.toString();
|
||||
std::string tenantEntryString = json_spirit::write_string(json_spirit::mValue(tenantEntry));
|
||||
ValueRef tenantEntryBytes(results.arena(), tenantEntryString);
|
||||
results.push_back(results.arena(),
|
||||
KeyValueRef(tenant.first.withPrefix(managementPrefix, results.arena()), tenantEntryBytes));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
TenantMapRangeImpl::TenantMapRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
|
||||
|
||||
Future<RangeResult> TenantMapRangeImpl::getRange(ReadYourWritesTransaction* ryw,
|
||||
KeyRangeRef kr,
|
||||
GetRangeLimits limitsHint) const {
|
||||
return getTenantList(ryw, kr, limitsHint);
|
||||
}
|
||||
|
||||
ACTOR Future<Void> deleteTenantRange(ReadYourWritesTransaction* ryw, TenantName beginTenant, TenantName endTenant) {
|
||||
std::map<TenantName, TenantMapEntry> tenants = wait(
|
||||
ManagementAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY));
|
||||
|
||||
if (tenants.size() == CLIENT_KNOBS->TOO_MANY) {
|
||||
TraceEvent(SevWarn, "DeleteTenantRangeTooLange")
|
||||
.detail("BeginTenant", beginTenant)
|
||||
.detail("EndTenant", endTenant);
|
||||
ryw->setSpecialKeySpaceErrorMsg("too many tenants to range delete");
|
||||
throw special_keys_api_failure();
|
||||
}
|
||||
|
||||
std::vector<Future<Void>> deleteFutures;
|
||||
for (auto tenant : tenants) {
|
||||
deleteFutures.push_back(ManagementAPI::deleteTenantTransaction(&ryw->getTransaction(), tenant.first));
|
||||
}
|
||||
|
||||
wait(waitForAll(deleteFutures));
|
||||
return Void();
|
||||
}
|
||||
|
||||
Future<Optional<std::string>> TenantMapRangeImpl::commit(ReadYourWritesTransaction* ryw) {
|
||||
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range);
|
||||
std::vector<Future<Void>> tenantManagementFutures;
|
||||
for (auto range : ranges) {
|
||||
if (!range.value().first) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TenantNameRef tenantName =
|
||||
range.begin()
|
||||
.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
|
||||
.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
|
||||
|
||||
if (range.value().second.present()) {
|
||||
tenantManagementFutures.push_back(
|
||||
success(ManagementAPI::createTenantTransaction(&ryw->getTransaction(), tenantName)));
|
||||
} else {
|
||||
// For a single key clear, just issue the delete
|
||||
if (KeyRangeRef(range.begin(), range.end()).singleKeyRange()) {
|
||||
tenantManagementFutures.push_back(
|
||||
ManagementAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName));
|
||||
} else {
|
||||
TenantNameRef endTenant = range.end().removePrefix(
|
||||
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
|
||||
if (endTenant.startsWith(submoduleRange.begin)) {
|
||||
endTenant = endTenant.removePrefix(submoduleRange.end);
|
||||
} else {
|
||||
endTenant = "\xff"_sr;
|
||||
}
|
||||
tenantManagementFutures.push_back(deleteTenantRange(ryw, tenantName, endTenant));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tag(waitForAll(tenantManagementFutures), Optional<std::string>());
|
||||
}
|
||||
|
|
|
@ -281,7 +281,7 @@ public:
|
|||
|
||||
// Use special key prefix "\xff\xff/transaction/conflicting_keys/<some_key>",
|
||||
// to retrieve keys which caused latest not_committed(conflicting with another transaction) error.
|
||||
// The returned key value pairs are interpretted as :
|
||||
// The returned key value pairs are interpreted as :
|
||||
// prefix/<key1> : '1' - any keys equal or larger than this key are (probably) conflicting keys
|
||||
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
|
||||
// Currently, the conflicting keyranges returned are original read_conflict_ranges or union of them.
|
||||
|
@ -528,5 +528,16 @@ public:
|
|||
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
|
||||
};
|
||||
|
||||
class TenantMapRangeImpl : public SpecialKeyRangeRWImpl {
|
||||
public:
|
||||
const static KeyRangeRef submoduleRange;
|
||||
|
||||
explicit TenantMapRangeImpl(KeyRangeRef kr);
|
||||
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
|
||||
KeyRangeRef kr,
|
||||
GetRangeLimits limitsHint) const override;
|
||||
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
|
||||
};
|
||||
|
||||
#include "flow/unactorcompiler.h"
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* StorageCheckpoint.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FDBCLIENT_STORAGCHECKPOINT_H
|
||||
#define FDBCLIENT_STORAGCHECKPOINT_H
|
||||
#pragma once
|
||||
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
|
||||
// FDB storage checkpoint format.
|
||||
enum CheckpointFormat {
|
||||
InvalidFormat = 0,
|
||||
// For RocksDB, checkpoint generated via rocksdb::Checkpoint::ExportColumnFamily().
|
||||
RocksDBColumnFamily = 1,
|
||||
// For RocksDB, checkpoint generated via rocksdb::Checkpoint::CreateCheckpoint().
|
||||
RocksDB = 2,
|
||||
};
|
||||
|
||||
// Metadata of a FDB checkpoint.
|
||||
struct CheckpointMetaData {
|
||||
enum CheckpointState {
|
||||
InvalidState = 0,
|
||||
Pending = 1, // Checkpoint creation pending.
|
||||
Complete = 2, // Checkpoint is created and ready to be read.
|
||||
Deleting = 3, // Checkpoint deletion requested.
|
||||
Fail = 4,
|
||||
};
|
||||
|
||||
constexpr static FileIdentifier file_identifier = 13804342;
|
||||
Version version;
|
||||
KeyRange range;
|
||||
int16_t format; // CheckpointFormat.
|
||||
UID ssID; // Storage server ID on which this checkpoint is created.
|
||||
UID checkpointID; // A unique id for this checkpoint.
|
||||
int16_t state; // CheckpointState.
|
||||
int referenceCount; // A reference count on the checkpoint, it can only be deleted when this is 0.
|
||||
int64_t gcTime; // Time to delete this checkpoint, a Unix timestamp in seconds.
|
||||
|
||||
// A serialized metadata associated with format, this data can be understood by the corresponding KVS.
|
||||
Standalone<StringRef> serializedCheckpoint;
|
||||
|
||||
CheckpointMetaData() : format(InvalidFormat), state(InvalidState), referenceCount(0) {}
|
||||
CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID)
|
||||
: version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
|
||||
referenceCount(0) {}
|
||||
CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID)
|
||||
: version(version), range(range), format(format), checkpointID(checkpointID), referenceCount(0) {}
|
||||
|
||||
CheckpointState getState() const { return static_cast<CheckpointState>(state); }
|
||||
|
||||
void setState(CheckpointState state) { this->state = static_cast<int16_t>(state); }
|
||||
|
||||
CheckpointFormat getFormat() const { return static_cast<CheckpointFormat>(format); }
|
||||
|
||||
void setFormat(CheckpointFormat format) { this->format = static_cast<int16_t>(format); }
|
||||
|
||||
std::string toString() const {
|
||||
std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) +
|
||||
"\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() +
|
||||
"\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast<int>(state)) +
|
||||
"\n";
|
||||
return res;
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
#include <ostream>
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/StorageCheckpoint.h"
|
||||
#include "fdbrpc/Locality.h"
|
||||
#include "fdbrpc/QueueModel.h"
|
||||
#include "fdbrpc/fdbrpc.h"
|
||||
|
@ -85,6 +86,8 @@ struct StorageServerInterface {
|
|||
RequestStream<struct OverlappingChangeFeedsRequest> overlappingChangeFeeds;
|
||||
RequestStream<struct ChangeFeedPopRequest> changeFeedPop;
|
||||
RequestStream<struct ChangeFeedVersionUpdateRequest> changeFeedVersionUpdate;
|
||||
RequestStream<struct GetCheckpointRequest> checkpoint;
|
||||
RequestStream<struct FetchCheckpointRequest> fetchCheckpoint;
|
||||
|
||||
explicit StorageServerInterface(UID uid) : uniqueID(uid) {}
|
||||
StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {}
|
||||
|
@ -137,6 +140,9 @@ struct StorageServerInterface {
|
|||
RequestStream<struct ChangeFeedPopRequest>(getValue.getEndpoint().getAdjustedEndpoint(17));
|
||||
changeFeedVersionUpdate = RequestStream<struct ChangeFeedVersionUpdateRequest>(
|
||||
getValue.getEndpoint().getAdjustedEndpoint(18));
|
||||
checkpoint = RequestStream<struct GetCheckpointRequest>(getValue.getEndpoint().getAdjustedEndpoint(19));
|
||||
fetchCheckpoint =
|
||||
RequestStream<struct FetchCheckpointRequest>(getValue.getEndpoint().getAdjustedEndpoint(20));
|
||||
}
|
||||
} else {
|
||||
ASSERT(Ar::isDeserializing);
|
||||
|
@ -184,6 +190,8 @@ struct StorageServerInterface {
|
|||
streams.push_back(overlappingChangeFeeds.getReceiver());
|
||||
streams.push_back(changeFeedPop.getReceiver());
|
||||
streams.push_back(changeFeedVersionUpdate.getReceiver());
|
||||
streams.push_back(checkpoint.getReceiver());
|
||||
streams.push_back(fetchCheckpoint.getReceiver());
|
||||
FlowTransport::transport().addEndpoints(streams);
|
||||
}
|
||||
};
|
||||
|
@ -816,6 +824,60 @@ struct ChangeFeedPopRequest {
|
|||
}
|
||||
};
|
||||
|
||||
// Request to search for a checkpoint for a minimum keyrange: `range`, at the specific version,
|
||||
// in the specific format.
|
||||
// A CheckpointMetaData will be returned if the specific checkpoint is found.
|
||||
struct GetCheckpointRequest {
|
||||
constexpr static FileIdentifier file_identifier = 13804343;
|
||||
Version version; // The FDB version at which the checkpoint is created.
|
||||
KeyRange range;
|
||||
int16_t format; // CheckpointFormat.
|
||||
Optional<UID> checkpointID; // When present, look for the checkpoint with the exact UID.
|
||||
ReplyPromise<CheckpointMetaData> reply;
|
||||
|
||||
GetCheckpointRequest() {}
|
||||
GetCheckpointRequest(Version version, KeyRange const& range, CheckpointFormat format)
|
||||
: version(version), range(range), format(format) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, version, range, format, checkpointID, reply);
|
||||
}
|
||||
};
|
||||
|
||||
// Reply to FetchCheckpointRequest, transfers checkpoint back to client.
|
||||
struct FetchCheckpointReply : public ReplyPromiseStreamReply {
|
||||
constexpr static FileIdentifier file_identifier = 13804345;
|
||||
Standalone<StringRef> token; // Serialized data specific to a particular checkpoint format.
|
||||
Standalone<StringRef> data;
|
||||
|
||||
FetchCheckpointReply() {}
|
||||
FetchCheckpointReply(StringRef token) : token(token) {}
|
||||
|
||||
int expectedSize() const { return data.expectedSize(); }
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, ReplyPromiseStreamReply::sequence, token, data);
|
||||
}
|
||||
};
|
||||
|
||||
// Request to fetch checkpoint from a storage server.
|
||||
struct FetchCheckpointRequest {
|
||||
constexpr static FileIdentifier file_identifier = 13804344;
|
||||
UID checkpointID;
|
||||
Standalone<StringRef> token; // Serialized data specific to a particular checkpoint format.
|
||||
ReplyPromiseStream<FetchCheckpointReply> reply;
|
||||
|
||||
FetchCheckpointRequest() = default;
|
||||
FetchCheckpointRequest(UID checkpointID, StringRef token) : checkpointID(checkpointID), token(token) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, checkpointID, token, reply);
|
||||
}
|
||||
};
|
||||
|
||||
struct OverlappingChangeFeedEntry {
|
||||
Key rangeId;
|
||||
KeyRange range;
|
||||
|
|
|
@ -215,6 +215,33 @@ const KeyRangeRef writeConflictRangeKeysRange =
|
|||
|
||||
const KeyRef clusterIdKey = LiteralStringRef("\xff/clusterId");
|
||||
|
||||
const KeyRef checkpointPrefix = "\xff/checkpoint/"_sr;
|
||||
|
||||
const Key checkpointKeyFor(UID checkpointID) {
|
||||
BinaryWriter wr(Unversioned());
|
||||
wr.serializeBytes(checkpointPrefix);
|
||||
wr << checkpointID;
|
||||
return wr.toValue();
|
||||
}
|
||||
|
||||
const Value checkpointValue(const CheckpointMetaData& checkpoint) {
|
||||
return ObjectWriter::toValue(checkpoint, IncludeVersion());
|
||||
}
|
||||
|
||||
UID decodeCheckpointKey(const KeyRef& key) {
|
||||
UID checkpointID;
|
||||
BinaryReader rd(key.removePrefix(checkpointPrefix), Unversioned());
|
||||
rd >> checkpointID;
|
||||
return checkpointID;
|
||||
}
|
||||
|
||||
CheckpointMetaData decodeCheckpointValue(const ValueRef& value) {
|
||||
CheckpointMetaData checkpoint;
|
||||
ObjectReader reader(value.begin(), IncludeVersion());
|
||||
reader.deserialize(checkpoint);
|
||||
return checkpoint;
|
||||
}
|
||||
|
||||
// "\xff/cacheServer/[[UID]] := StorageServerInterface"
|
||||
const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"), LiteralStringRef("\xff/cacheServer0"));
|
||||
const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin;
|
||||
|
@ -1336,6 +1363,8 @@ TenantMapEntry decodeTenantEntry(ValueRef const& value) {
|
|||
const KeyRangeRef tenantMapKeys("\xff/tenantMap/"_sr, "\xff/tenantMap0"_sr);
|
||||
const KeyRef tenantMapPrefix = tenantMapKeys.begin;
|
||||
const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr;
|
||||
const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr;
|
||||
const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr;
|
||||
|
||||
// for tests
|
||||
void testSSISerdes(StorageServerInterface const& ssi, bool useFB) {
|
||||
|
|
|
@ -70,6 +70,13 @@ void decodeKeyServersValue(std::map<Tag, UID> const& tag_uid,
|
|||
|
||||
extern const KeyRef clusterIdKey;
|
||||
|
||||
// "\xff/checkpoint/[[UID]] := [[CheckpointMetaData]]"
|
||||
extern const KeyRef checkpointPrefix;
|
||||
const Key checkpointKeyFor(UID checkpointID);
|
||||
const Value checkpointValue(const CheckpointMetaData& checkpoint);
|
||||
UID decodeCheckpointKey(const KeyRef& key);
|
||||
CheckpointMetaData decodeCheckpointValue(const ValueRef& value);
|
||||
|
||||
// "\xff/storageCacheServer/[[UID]] := StorageServerInterface"
|
||||
// This will be added by the cache server on initialization and removed by DD
|
||||
// TODO[mpilman]: We will need a way to map uint16_t ids to UIDs in a future
|
||||
|
@ -490,7 +497,7 @@ const Value healthyZoneValue(StringRef const& zoneId, Version version);
|
|||
std::pair<Key, Version> decodeHealthyZoneValue(ValueRef const&);
|
||||
|
||||
// All mutations done to this range are blindly copied into txnStateStore.
|
||||
// Used to create artifically large txnStateStore instances in testing.
|
||||
// Used to create artificially large txnStateStore instances in testing.
|
||||
extern const KeyRangeRef testOnlyTxnStateStorePrefixRange;
|
||||
|
||||
// Snapshot + Incremental Restore
|
||||
|
@ -598,6 +605,8 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value);
|
|||
extern const KeyRangeRef tenantMapKeys;
|
||||
extern const KeyRef tenantMapPrefix;
|
||||
extern const KeyRef tenantMapPrivatePrefix;
|
||||
extern const KeyRef tenantLastIdKey;
|
||||
extern const KeyRef tenantDataPrefixKey;
|
||||
|
||||
Value encodeTenantEntry(TenantMapEntry const& tenantEntry);
|
||||
TenantMapEntry decodeTenantEntry(ValueRef const& value);
|
||||
|
|
|
@ -52,7 +52,7 @@ FDB_DECLARE_BOOLEAN_PARAM(UpdateParams);
|
|||
// 4. If the executor loses contact with FDB, another executor may begin at step 2. The first
|
||||
// Task execution can detect this by checking the result of keepRunning() periodically.
|
||||
// 5. Once a Task execution's _execute() call returns, the _finish() step is called.
|
||||
// _finish() is transactional and is guaraunteed to never be called more than once for the
|
||||
// _finish() is transactional and is guaranteed to never be called more than once for the
|
||||
// same Task
|
||||
class Task : public ReferenceCounted<Task> {
|
||||
public:
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "fdbclient/ThreadSafeTransaction.h"
|
||||
#include "fdbclient/DatabaseContext.h"
|
||||
#include "fdbclient/versions.h"
|
||||
#include "fdbclient/GenericManagementAPI.actor.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
|
||||
// Users of ThreadSafeTransaction might share Reference<ThreadSafe...> between different threads as long as they don't
|
||||
|
@ -46,9 +47,13 @@ ThreadFuture<Reference<IDatabase>> ThreadSafeDatabase::createFromExistingDatabas
|
|||
});
|
||||
}
|
||||
|
||||
Reference<ITenant> ThreadSafeDatabase::openTenant(TenantNameRef tenantName) {
|
||||
return makeReference<ThreadSafeTenant>(Reference<ThreadSafeDatabase>::addRef(this), tenantName);
|
||||
}
|
||||
|
||||
Reference<ITransaction> ThreadSafeDatabase::createTransaction() {
|
||||
auto type = isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW;
|
||||
return Reference<ITransaction>(new ThreadSafeTransaction(db, type));
|
||||
return Reference<ITransaction>(new ThreadSafeTransaction(db, type, Optional<TenantName>()));
|
||||
}
|
||||
|
||||
void ThreadSafeDatabase::setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value) {
|
||||
|
@ -139,7 +144,17 @@ ThreadSafeDatabase::~ThreadSafeDatabase() {
|
|||
onMainThreadVoid([db]() { db->delref(); }, nullptr);
|
||||
}
|
||||
|
||||
ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type) {
|
||||
Reference<ITransaction> ThreadSafeTenant::createTransaction() {
|
||||
auto type = db->isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW;
|
||||
return Reference<ITransaction>(new ThreadSafeTransaction(db->db, type, name));
|
||||
}
|
||||
|
||||
ThreadSafeTenant::~ThreadSafeTenant() {}
|
||||
|
||||
ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx,
|
||||
ISingleThreadTransaction::Type type,
|
||||
Optional<TenantName> tenant)
|
||||
: tenantName(tenant) {
|
||||
// Allocate memory for the transaction from this thread (so the pointer is known for subsequent method calls)
|
||||
// but run its constructor on the main thread
|
||||
|
||||
|
@ -150,9 +165,13 @@ ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadT
|
|||
auto tr = this->tr = ISingleThreadTransaction::allocateOnForeignThread(type);
|
||||
// No deferred error -- if the construction of the RYW transaction fails, we have no where to put it
|
||||
onMainThreadVoid(
|
||||
[tr, cx]() {
|
||||
[tr, cx, tenant]() {
|
||||
cx->addref();
|
||||
tr->setDatabase(Database(cx));
|
||||
if (tenant.present()) {
|
||||
tr->construct(Database(cx), tenant.get());
|
||||
} else {
|
||||
tr->construct(Database(cx));
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
}
|
||||
|
@ -461,6 +480,10 @@ ThreadFuture<Void> ThreadSafeTransaction::onError(Error const& e) {
|
|||
return onMainThread([tr, e]() { return tr->onError(e); });
|
||||
}
|
||||
|
||||
Optional<TenantName> ThreadSafeTransaction::getTenant() {
|
||||
return tenantName;
|
||||
}
|
||||
|
||||
void ThreadSafeTransaction::operator=(ThreadSafeTransaction&& r) noexcept {
|
||||
tr = r.tr;
|
||||
r.tr = nullptr;
|
||||
|
|
|
@ -35,6 +35,7 @@ public:
|
|||
~ThreadSafeDatabase() override;
|
||||
static ThreadFuture<Reference<IDatabase>> createFromExistingDatabase(Database cx);
|
||||
|
||||
Reference<ITenant> openTenant(TenantNameRef tenantName) override;
|
||||
Reference<ITransaction> createTransaction() override;
|
||||
|
||||
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
|
||||
|
@ -58,6 +59,7 @@ public:
|
|||
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
|
||||
|
||||
private:
|
||||
friend class ThreadSafeTenant;
|
||||
friend class ThreadSafeTransaction;
|
||||
bool isConfigDB{ false };
|
||||
DatabaseContext* db;
|
||||
|
@ -68,11 +70,28 @@ public: // Internal use only
|
|||
DatabaseContext* unsafeGetPtr() const { return db; }
|
||||
};
|
||||
|
||||
class ThreadSafeTenant : public ITenant, ThreadSafeReferenceCounted<ThreadSafeTenant>, NonCopyable {
|
||||
public:
|
||||
ThreadSafeTenant(Reference<ThreadSafeDatabase> db, StringRef name) : db(db), name(name) {}
|
||||
~ThreadSafeTenant() override;
|
||||
|
||||
Reference<ITransaction> createTransaction() override;
|
||||
|
||||
void addref() override { ThreadSafeReferenceCounted<ThreadSafeTenant>::addref(); }
|
||||
void delref() override { ThreadSafeReferenceCounted<ThreadSafeTenant>::delref(); }
|
||||
|
||||
private:
|
||||
Reference<ThreadSafeDatabase> db;
|
||||
Standalone<StringRef> name;
|
||||
};
|
||||
|
||||
// An implementation of ITransaction that serializes operations onto the network thread and interacts with the
|
||||
// lower-level client APIs exposed by ISingleThreadTransaction
|
||||
class ThreadSafeTransaction : public ITransaction, ThreadSafeReferenceCounted<ThreadSafeTransaction>, NonCopyable {
|
||||
public:
|
||||
explicit ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type);
|
||||
explicit ThreadSafeTransaction(DatabaseContext* cx,
|
||||
ISingleThreadTransaction::Type type,
|
||||
Optional<TenantName> tenant);
|
||||
~ThreadSafeTransaction() override;
|
||||
|
||||
// Note: used while refactoring fdbcli, need to be removed later
|
||||
|
@ -149,6 +168,8 @@ public:
|
|||
ThreadFuture<Void> checkDeferredError();
|
||||
ThreadFuture<Void> onError(Error const& e) override;
|
||||
|
||||
Optional<TenantName> getTenant() override;
|
||||
|
||||
// These are to permit use as state variables in actors:
|
||||
ThreadSafeTransaction() : tr(nullptr) {}
|
||||
void operator=(ThreadSafeTransaction&& r) noexcept;
|
||||
|
@ -161,6 +182,7 @@ public:
|
|||
|
||||
private:
|
||||
ISingleThreadTransaction* tr;
|
||||
const Optional<TenantName> tenantName;
|
||||
};
|
||||
|
||||
// An implementation of IClientApi that serializes operations onto the network thread and interacts with the lower-level
|
||||
|
|
|
@ -230,9 +230,11 @@ description is not currently required but encouraged.
|
|||
<Option name="initialize_new_database" code="300"
|
||||
description="This is a write-only transaction which sets the initial configuration. This option is designed for use by database system tools only." />
|
||||
<Option name="access_system_keys" code="301"
|
||||
description="Allows this transaction to read and modify system keys (those that start with the byte 0xFF)"/>
|
||||
description="Allows this transaction to read and modify system keys (those that start with the byte 0xFF). Implies raw_access."/>
|
||||
<Option name="read_system_keys" code="302"
|
||||
description="Allows this transaction to read system keys (those that start with the byte 0xFF)"/>
|
||||
description="Allows this transaction to read system keys (those that start with the byte 0xFF). Implies raw_access."/>
|
||||
<Option name="raw_access" code="303"
|
||||
description="Allows this transaction to access the raw key-space when tenant mode is on."/>
|
||||
<Option name="debug_dump" code="400"
|
||||
hidden="true" />
|
||||
<Option name="debug_retry_logging" code="401" paramType="String" paramDescription="Optional transaction name" />
|
||||
|
|
|
@ -921,7 +921,7 @@ ACTOR static void deliver(TransportData* self,
|
|||
bool inReadSocket) {
|
||||
// We want to run the task at the right priority. If the priority is higher than the current priority (which is
|
||||
// ReadSocket) we can just upgrade. Otherwise we'll context switch so that we don't block other tasks that might run
|
||||
// with a higher priority. ReplyPromiseStream needs to guarentee that messages are recieved in the order they were
|
||||
// with a higher priority. ReplyPromiseStream needs to guarantee that messages are received in the order they were
|
||||
// sent, so we are using orderedDelay.
|
||||
// NOTE: don't skip delay(0) when it's local deliver since it could cause out of order object deconstruction.
|
||||
if (priority < TaskPriority::ReadSocket || !inReadSocket) {
|
||||
|
|
|
@ -36,7 +36,7 @@ public:
|
|||
virtual void delref() = 0;
|
||||
};
|
||||
|
||||
// An IRateControl implemenation that allows at most hands out at most windowLimit units of 'credit' in windowSeconds
|
||||
// An IRateControl implementation that allows at most hands out at most windowLimit units of 'credit' in windowSeconds
|
||||
// seconds
|
||||
class SpeedLimit final : public IRateControl, ReferenceCounted<SpeedLimit> {
|
||||
public:
|
||||
|
@ -89,7 +89,7 @@ private:
|
|||
Promise<Void> m_stop;
|
||||
};
|
||||
|
||||
// An IRateControl implemenation that enforces no limit
|
||||
// An IRateControl implementation that enforces no limit
|
||||
class Unlimited final : public IRateControl, ReferenceCounted<Unlimited> {
|
||||
public:
|
||||
Unlimited() {}
|
||||
|
|
|
@ -274,7 +274,7 @@ struct AcknowledgementReply {
|
|||
}
|
||||
};
|
||||
|
||||
// Registered on the server to recieve acknowledgements that the client has received stream data. This prevents the
|
||||
// Registered on the server to receive acknowledgements that the client has received stream data. This prevents the
|
||||
// server from sending too much data to the client if the client is not consuming it.
|
||||
struct AcknowledgementReceiver final : FlowReceiver, FastAllocated<AcknowledgementReceiver> {
|
||||
using FastAllocated<AcknowledgementReceiver>::operator new;
|
||||
|
|
|
@ -865,7 +865,7 @@ public:
|
|||
|
||||
if (!ordered && !currentProcess->rebooting && machine == currentProcess &&
|
||||
!currentProcess->shutdownSignal.isSet() && FLOW_KNOBS->MAX_BUGGIFIED_DELAY > 0 &&
|
||||
deterministicRandom()->random01() < 0.25) { // FIXME: why doesnt this work when we are changing machines?
|
||||
deterministicRandom()->random01() < 0.25) { // FIXME: why doesn't this work when we are changing machines?
|
||||
seconds += FLOW_KNOBS->MAX_BUGGIFIED_DELAY * pow(deterministicRandom()->random01(), 1000.0);
|
||||
}
|
||||
|
||||
|
|
|
@ -427,6 +427,7 @@ public:
|
|||
bool speedUpSimulation;
|
||||
BackupAgentType backupAgents;
|
||||
BackupAgentType drAgents;
|
||||
bool restarted = false;
|
||||
|
||||
bool hasDiffProtocolProcess; // true if simulator is testing a process with a different version
|
||||
bool setDiffProtocol; // true if a process with a different protocol version has been started
|
||||
|
|
|
@ -541,6 +541,29 @@ private:
|
|||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
|
||||
// Generates private mutations for the target storage server, instructing it to create a checkpoint.
|
||||
void checkSetCheckpointKeys(MutationRef m) {
|
||||
if (!m.param1.startsWith(checkpointPrefix)) {
|
||||
return;
|
||||
}
|
||||
if (toCommit) {
|
||||
CheckpointMetaData checkpoint = decodeCheckpointValue(m.param2);
|
||||
Tag tag = decodeServerTagValue(txnStateStore->readValue(serverTagKeyFor(checkpoint.ssID)).get().get());
|
||||
MutationRef privatized = m;
|
||||
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
|
||||
TraceEvent("SendingPrivateMutationCheckpoint", dbgid)
|
||||
.detail("Original", m)
|
||||
.detail("Privatized", privatized)
|
||||
.detail("Server", checkpoint.ssID)
|
||||
.detail("TagKey", serverTagKeyFor(checkpoint.ssID))
|
||||
.detail("Tag", tag.toString())
|
||||
.detail("Checkpoint", checkpoint.toString());
|
||||
|
||||
toCommit->addTag(tag);
|
||||
toCommit->writeTypedMessage(privatized);
|
||||
}
|
||||
}
|
||||
|
||||
void checkSetOtherKeys(MutationRef m) {
|
||||
if (initialCommit)
|
||||
return;
|
||||
|
@ -703,7 +726,7 @@ private:
|
|||
}
|
||||
}
|
||||
// Might be a tss removal, which doesn't store a tag there.
|
||||
// Chained if is a little verbose, but avoids unecessary work
|
||||
// Chained if is a little verbose, but avoids unnecessary work
|
||||
if (toCommit && !initialCommit && !serverKeysCleared.size()) {
|
||||
KeyRangeRef maybeTssRange = range & serverTagKeys;
|
||||
if (maybeTssRange.singleKeyRange()) {
|
||||
|
@ -1081,6 +1104,7 @@ public:
|
|||
if (m.type == MutationRef::SetValue && isSystemKey(m.param1)) {
|
||||
checkSetKeyServersPrefix(m);
|
||||
checkSetServerKeysPrefix(m);
|
||||
checkSetCheckpointKeys(m);
|
||||
checkSetServerTagsPrefix(m);
|
||||
checkSetStorageCachePrefix(m);
|
||||
checkSetCacheKeysPrefix(m);
|
||||
|
|
|
@ -473,7 +473,7 @@ ACTOR Future<std::pair<BlobGranuleSplitState, Version>> getGranuleSplitState(Tra
|
|||
}
|
||||
|
||||
// writeDelta file writes speculatively in the common case to optimize throughput. It creates the s3 object even though
|
||||
// the data in it may not yet be committed, and even though previous delta fiels with lower versioned data may still be
|
||||
// the data in it may not yet be committed, and even though previous delta files with lower versioned data may still be
|
||||
// in flight. The synchronization happens after the s3 file is written, but before we update the FDB index of what files
|
||||
// exist. Before updating FDB, we ensure the version is committed and all previous delta files have updated FDB.
|
||||
ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
|
||||
|
|
|
@ -50,6 +50,10 @@ set(FDBSERVER_SRCS
|
|||
KeyValueStoreMemory.actor.cpp
|
||||
KeyValueStoreRocksDB.actor.cpp
|
||||
KeyValueStoreSQLite.actor.cpp
|
||||
ServerCheckpoint.actor.cpp
|
||||
ServerCheckpoint.actor.h
|
||||
RocksDBCheckpointUtils.actor.cpp
|
||||
RocksDBCheckpointUtils.actor.h
|
||||
Knobs.h
|
||||
LatencyBandConfig.cpp
|
||||
LatencyBandConfig.h
|
||||
|
@ -191,6 +195,7 @@ set(FDBSERVER_SRCS
|
|||
workloads/ChangeFeeds.actor.cpp
|
||||
workloads/DataDistributionMetrics.actor.cpp
|
||||
workloads/DataLossRecovery.actor.cpp
|
||||
workloads/PhysicalShardMove.actor.cpp
|
||||
workloads/DDBalance.actor.cpp
|
||||
workloads/DDMetrics.actor.cpp
|
||||
workloads/DDMetricsExclude.actor.cpp
|
||||
|
|
|
@ -94,6 +94,7 @@ class ConfigBroadcasterImpl {
|
|||
|
||||
int coordinators = 0;
|
||||
std::unordered_set<NetworkAddress> activeConfigNodes;
|
||||
std::unordered_set<NetworkAddress> registrationResponses;
|
||||
bool disallowUnregistered = false;
|
||||
Promise<Void> newConfigNodesAllowed;
|
||||
|
||||
|
@ -217,6 +218,7 @@ class ConfigBroadcasterImpl {
|
|||
self->clients.erase(clientUID);
|
||||
self->clientFailures.erase(clientUID);
|
||||
self->activeConfigNodes.erase(clientAddress);
|
||||
self->registrationResponses.erase(clientAddress);
|
||||
// See comment where this promise is reset below.
|
||||
if (self->newConfigNodesAllowed.isSet()) {
|
||||
self->newConfigNodesAllowed.reset();
|
||||
|
@ -258,6 +260,7 @@ class ConfigBroadcasterImpl {
|
|||
self->newConfigNodesAllowed.reset();
|
||||
}
|
||||
}
|
||||
self->registrationResponses.insert(address);
|
||||
|
||||
if (registered) {
|
||||
if (!self->disallowUnregistered) {
|
||||
|
@ -265,9 +268,18 @@ class ConfigBroadcasterImpl {
|
|||
}
|
||||
self->activeConfigNodes.insert(address);
|
||||
self->disallowUnregistered = true;
|
||||
} else if (self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) {
|
||||
// Need to allow registration of previously unregistered nodes when
|
||||
// the cluster first starts up.
|
||||
} else if ((self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) ||
|
||||
self->coordinators - self->registrationResponses.size() <=
|
||||
self->coordinators / 2 + 1 - self->activeConfigNodes.size()) {
|
||||
// Received a registration request from an unregistered node. There
|
||||
// are two cases where we want to allow unregistered nodes to
|
||||
// register:
|
||||
// * the cluster is just starting and no nodes are registered
|
||||
// * a minority of nodes are registered and a majority are
|
||||
// unregistered. This situation should only occur in rare
|
||||
// circumstances where the cluster controller dies with only a
|
||||
// minority of config nodes having received a
|
||||
// ConfigBroadcastReadyRequest
|
||||
self->activeConfigNodes.insert(address);
|
||||
if (self->activeConfigNodes.size() >= self->coordinators / 2 + 1 &&
|
||||
self->newConfigNodesAllowed.canBeSet()) {
|
||||
|
@ -390,9 +402,9 @@ public:
|
|||
this->coordinators = coordinators.configServers.size();
|
||||
if (configDBType != ConfigDBType::DISABLED) {
|
||||
if (configDBType == ConfigDBType::SIMPLE) {
|
||||
consumer = IConfigConsumer::createSimple(coordinators, 0.5, Optional<double>{});
|
||||
consumer = IConfigConsumer::createSimple(coordinators, 0.5, SERVER_KNOBS->COMPACTION_INTERVAL);
|
||||
} else {
|
||||
consumer = IConfigConsumer::createPaxos(coordinators, 0.5, Optional<double>{});
|
||||
consumer = IConfigConsumer::createPaxos(coordinators, 0.5, SERVER_KNOBS->COMPACTION_INTERVAL);
|
||||
}
|
||||
TraceEvent(SevDebug, "ConfigBroadcasterStartingConsumer", id)
|
||||
.detail("Consumer", consumer->getID())
|
||||
|
|
|
@ -176,14 +176,16 @@ struct ConfigFollowerRollforwardRequest {
|
|||
|
||||
struct ConfigFollowerGetCommittedVersionReply {
|
||||
static constexpr FileIdentifier file_identifier = 9214735;
|
||||
Version lastCompacted;
|
||||
Version lastCommitted;
|
||||
|
||||
ConfigFollowerGetCommittedVersionReply() = default;
|
||||
explicit ConfigFollowerGetCommittedVersionReply(Version lastCommitted) : lastCommitted(lastCommitted) {}
|
||||
explicit ConfigFollowerGetCommittedVersionReply(Version lastCompacted, Version lastCommitted)
|
||||
: lastCompacted(lastCompacted), lastCommitted(lastCommitted) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, lastCommitted);
|
||||
serializer(ar, lastCompacted, lastCommitted);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -495,7 +495,7 @@ class ConfigNodeImpl {
|
|||
}
|
||||
|
||||
ACTOR static Future<Void> rollforward(ConfigNodeImpl* self, ConfigFollowerRollforwardRequest req) {
|
||||
Version lastCompactedVersion = wait(getLastCompactedVersion(self));
|
||||
state Version lastCompactedVersion = wait(getLastCompactedVersion(self));
|
||||
if (req.lastKnownCommitted < lastCompactedVersion) {
|
||||
req.reply.sendError(version_already_compacted());
|
||||
return Void();
|
||||
|
@ -529,6 +529,10 @@ class ConfigNodeImpl {
|
|||
versionedAnnotationKey(currentGeneration.committedVersion + 1)));
|
||||
|
||||
currentGeneration.committedVersion = req.rollback.get();
|
||||
if (req.rollback.get() < lastCompactedVersion) {
|
||||
self->kvStore->set(
|
||||
KeyValueRef(lastCompactedVersionKey, BinaryWriter::toValue(req.rollback.get(), IncludeVersion())));
|
||||
}
|
||||
// The mutation commit loop below should persist the new generation
|
||||
// to disk, so we don't need to do it here.
|
||||
}
|
||||
|
@ -536,13 +540,15 @@ class ConfigNodeImpl {
|
|||
// committed version and rollforward version.
|
||||
ASSERT_GT(req.mutations[0].version, currentGeneration.committedVersion);
|
||||
wait(commitMutations(self, req.mutations, req.annotations, req.target));
|
||||
|
||||
req.reply.send(Void());
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR static Future<Void> getCommittedVersion(ConfigNodeImpl* self, ConfigFollowerGetCommittedVersionRequest req) {
|
||||
state Version lastCompacted = wait(getLastCompactedVersion(self));
|
||||
ConfigGeneration generation = wait(getGeneration(self));
|
||||
req.reply.send(ConfigFollowerGetCommittedVersionReply{ generation.committedVersion });
|
||||
req.reply.send(ConfigFollowerGetCommittedVersionReply{ lastCompacted, generation.committedVersion });
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
|
|
@ -204,8 +204,9 @@ public:
|
|||
}
|
||||
|
||||
int64_t bestLoadBytes = 0;
|
||||
bool wigglingBestOption = false; // best option contains server in paused wiggle state
|
||||
Optional<Reference<IDataDistributionTeam>> bestOption;
|
||||
std::vector<Reference<IDataDistributionTeam>> randomTeams;
|
||||
std::vector<Reference<TCTeamInfo>> randomTeams;
|
||||
const std::set<UID> completeSources(req.completeSources.begin(), req.completeSources.end());
|
||||
|
||||
// Note: this block does not apply any filters from the request
|
||||
|
@ -253,9 +254,18 @@ public:
|
|||
(!req.teamMustHaveShards ||
|
||||
self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team(
|
||||
self->teams[currentIndex]->getServerIDs(), self->primary)))) {
|
||||
|
||||
// bestOption doesn't contain wiggling SS while current team does. Don't replace bestOption
|
||||
// in this case
|
||||
if (bestOption.present() && !wigglingBestOption &&
|
||||
self->teams[currentIndex]->hasWigglePausedServer()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bestLoadBytes = loadBytes;
|
||||
bestOption = self->teams[currentIndex];
|
||||
bestIndex = currentIndex;
|
||||
wigglingBestOption = self->teams[bestIndex]->hasWigglePausedServer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -266,7 +276,7 @@ public:
|
|||
while (randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT &&
|
||||
nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES) {
|
||||
// If unhealthy team is majority, we may not find an ok dest in this while loop
|
||||
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
|
||||
Reference<TCTeamInfo> dest = deterministicRandom()->randomChoice(self->teams);
|
||||
|
||||
bool ok = dest->isHealthy() && (!req.preferLowerUtilization ||
|
||||
dest->hasHealthyAvailableSpace(self->medianAvailableSpace));
|
||||
|
@ -302,8 +312,16 @@ public:
|
|||
int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty);
|
||||
if (!bestOption.present() || (req.preferLowerUtilization && loadBytes < bestLoadBytes) ||
|
||||
(!req.preferLowerUtilization && loadBytes > bestLoadBytes)) {
|
||||
|
||||
// bestOption doesn't contain wiggling SS while current team does. Don't replace bestOption
|
||||
// in this case
|
||||
if (bestOption.present() && !wigglingBestOption && randomTeams[i]->hasWigglePausedServer()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bestLoadBytes = loadBytes;
|
||||
bestOption = randomTeams[i];
|
||||
wigglingBestOption = randomTeams[i]->hasWigglePausedServer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3617,6 +3635,10 @@ void DDTeamCollection::removeLaggingStorageServer(Key zoneId) {
|
|||
disableFailingLaggingServers.set(false);
|
||||
}
|
||||
|
||||
bool DDTeamCollection::isWigglePausedServer(const UID& server) const {
|
||||
return pauseWiggle && pauseWiggle->get() && wigglingId == server;
|
||||
}
|
||||
|
||||
std::vector<UID> DDTeamCollection::getRandomHealthyTeam(const UID& excludeServer) {
|
||||
std::vector<int> candidates, backup;
|
||||
for (int i = 0; i < teams.size(); ++i) {
|
||||
|
@ -5635,6 +5657,62 @@ public:
|
|||
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR static Future<Void> GetTeam_DeprioritizeWigglePausedTeam() {
|
||||
Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(
|
||||
new PolicyAcross(3, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
|
||||
state int processSize = 5;
|
||||
state int teamSize = 3;
|
||||
state std::unique_ptr<DDTeamCollection> collection = testTeamCollection(teamSize, policy, processSize);
|
||||
GetStorageMetricsReply mid_avail;
|
||||
mid_avail.capacity.bytes = 1000 * 1024 * 1024;
|
||||
mid_avail.available.bytes = 400 * 1024 * 1024;
|
||||
mid_avail.load.bytes = 100 * 1024 * 1024;
|
||||
|
||||
GetStorageMetricsReply high_avail;
|
||||
high_avail.capacity.bytes = 1000 * 1024 * 1024;
|
||||
high_avail.available.bytes = 800 * 1024 * 1024;
|
||||
high_avail.load.bytes = 90 * 1024 * 1024;
|
||||
|
||||
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
|
||||
collection->addTeam(std::set<UID>({ UID(2, 0), UID(3, 0), UID(4, 0) }), true);
|
||||
collection->disableBuildingTeams();
|
||||
collection->setCheckTeamDelay();
|
||||
|
||||
/*
|
||||
* Among server teams that have healthy space available, pick the team that is
|
||||
* least utilized, if the caller says they preferLowerUtilization.
|
||||
*/
|
||||
|
||||
collection->server_info[UID(1, 0)]->setMetrics(mid_avail);
|
||||
collection->server_info[UID(2, 0)]->setMetrics(high_avail);
|
||||
collection->server_info[UID(3, 0)]->setMetrics(high_avail);
|
||||
collection->server_info[UID(4, 0)]->setMetrics(high_avail);
|
||||
|
||||
collection->wigglingId = UID(4, 0);
|
||||
collection->pauseWiggle = makeReference<AsyncVar<bool>>(true);
|
||||
|
||||
bool wantsNewServers = true;
|
||||
bool wantsTrueBest = true;
|
||||
bool preferLowerUtilization = true;
|
||||
bool teamMustHaveShards = false;
|
||||
std::vector<UID> completeSources{ UID(1, 0), UID(2, 0), UID(3, 0) };
|
||||
|
||||
state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
||||
std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam = req.reply.getFuture().get();
|
||||
|
||||
std::set<UID> expectedServers{ UID(1, 0), UID(2, 0), UID(3, 0) };
|
||||
ASSERT(resTeam.first.present());
|
||||
auto servers = resTeam.first.get()->getServerIDs();
|
||||
const std::set<UID> selectedServers(servers.begin(), servers.end());
|
||||
ASSERT(expectedServers == selectedServers);
|
||||
|
||||
return Void();
|
||||
}
|
||||
};
|
||||
|
||||
TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") {
|
||||
|
@ -5696,3 +5774,8 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") {
|
|||
wait(DDTeamCollectionUnitTest::GetTeam_ServerUtilizationNearCutoff());
|
||||
return Void();
|
||||
}
|
||||
|
||||
TEST_CASE("/DataDistribution/GetTeam/DeprioritizeWigglePausedTeam") {
|
||||
wait(DDTeamCollectionUnitTest::GetTeam_DeprioritizeWigglePausedTeam());
|
||||
return Void();
|
||||
}
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue