Merge branch 'main' of github.com:apple/foundationdb into jfu-list-tenants

This commit is contained in:
Jon Fu 2022-04-29 13:16:54 -04:00
commit d953b961b7
130 changed files with 11900 additions and 5493 deletions

View File

@ -10,3 +10,4 @@ set(SRCS
add_library(FDBLibTLS STATIC ${SRCS})
target_link_libraries(FDBLibTLS PUBLIC OpenSSL::SSL boost_target PRIVATE flow)
target_include_directories(FDBLibTLS INTERFACE OpenSSL::SSL boost_target PRIVATE flow)

View File

@ -22,6 +22,9 @@
#include "FDBLibTLS/FDBLibTLSSession.h"
#include "flow/Trace.h"
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/bio.h>
#include <openssl/err.h>
#include <openssl/evp.h>

View File

@ -23,6 +23,9 @@
#include "flow/flow.h"
#include "flow/Trace.h"
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/bio.h>
#include <openssl/err.h>
#include <openssl/pem.h>

View File

@ -20,6 +20,9 @@
#include "FDBLibTLS/FDBLibTLSVerify.h"
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/objects.h>
#include <algorithm>

View File

@ -25,6 +25,9 @@
#include <string.h>
#include <boost/lexical_cast.hpp>
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/objects.h>
#include "fdbrpc/ITLSPlugin.h"

View File

@ -80,10 +80,23 @@ endif()
# The tests don't build on windows
if(NOT WIN32)
set(MAKO_SRCS
test/mako/mako.c
test/mako/mako.h
test/mako/utils.c
test/mako/utils.h)
test/mako/async.hpp
test/mako/async.cpp
test/mako/blob_granules.hpp
test/mako/blob_granules.cpp
test/mako/future.hpp
test/mako/limit.hpp
test/mako/logger.hpp
test/mako/mako.cpp
test/mako/mako.hpp
test/mako/operations.hpp
test/mako/operations.cpp
test/mako/process.hpp
test/mako/shm.hpp
test/mako/stats.hpp
test/mako/time.hpp
test/mako/utils.cpp
test/mako/utils.hpp)
add_subdirectory(test/unit/third_party)
find_package(Threads REQUIRED)
set(UNIT_TEST_SRCS
@ -98,6 +111,11 @@ if(NOT WIN32)
test/unit/fdb_api.cpp
test/unit/fdb_api.hpp)
add_library(fdb_cpp INTERFACE)
target_sources(fdb_cpp INTERFACE test/fdb_api.hpp)
target_include_directories(fdb_cpp INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/test)
target_link_libraries(fdb_cpp INTERFACE fmt::fmt)
set(API_TESTER_SRCS
test/apitester/fdb_c_api_tester.cpp
test/apitester/TesterApiWorkload.cpp
@ -179,7 +197,11 @@ endif()
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
target_link_libraries(mako PRIVATE fdb_c fdbclient)
if (USE_SANITIZER)
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_asan)
else()
target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target)
endif()
if(NOT OPEN_FOR_IDE)
# Make sure that fdb_c.h is compatible with c90
@ -254,6 +276,8 @@ endif()
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
@ -271,6 +295,10 @@ endif()
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests
--blob-granule-local-file-path
@DATA_DIR@/fdbblob/
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER)

View File

@ -37,27 +37,42 @@ private:
enum OpType { OP_INSERT, OP_CLEAR, OP_CLEAR_RANGE, OP_READ, OP_GET_RANGES, OP_LAST = OP_GET_RANGES };
std::vector<OpType> excludedOpTypes;
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
bool seenReadSuccess = false;
void randomReadOp(TTaskFct cont) {
std::string begin = randomKeyName();
std::string end = randomKeyName();
auto results = std::make_shared<std::vector<KeyValue>>();
auto tooOld = std::make_shared<bool>(false);
if (begin > end) {
std::swap(begin, end);
}
execTransaction(
[begin, end, results](auto ctx) {
[this, begin, end, results, tooOld](auto ctx) {
ctx->tx()->setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
KeyValuesResult res = ctx->tx()->readBlobGranules(begin, end, ctx->getBGBasePath());
bool more;
(*results) = res.getKeyValues(&more);
ASSERT(!more);
if (res.getError() != error_code_success) {
if (res.getError() == error_code_blob_granule_transaction_too_old) {
info("BlobGranuleCorrectness::randomReadOp bg too old\n");
ASSERT(!seenReadSuccess);
*tooOld = true;
ctx->done();
} else if (res.getError() != error_code_success) {
ctx->onError(res.getError());
} else {
if (!seenReadSuccess) {
info("BlobGranuleCorrectness::randomReadOp first success\n");
}
seenReadSuccess = true;
ctx->done();
}
},
[this, begin, end, results, cont]() {
[this, begin, end, results, tooOld, cont]() {
if (!*tooOld) {
std::vector<KeyValue> expected = store.getRange(begin, end, store.size(), false);
if (results->size() != expected.size()) {
error(fmt::format("randomReadOp result size mismatch. expected: {} actual: {}",
@ -77,8 +92,8 @@ private:
ASSERT((*results)[i].key == expected[i].key);
if ((*results)[i].value != expected[i].value) {
error(
fmt::format("randomReadOp value mismatch at {}/{}. key: {} expected: {:.80} actual: {:.80}",
error(fmt::format(
"randomReadOp value mismatch at {}/{}. key: {} expected: {:.80} actual: {:.80}",
i,
results->size(),
expected[i].key,
@ -87,6 +102,7 @@ private:
}
ASSERT((*results)[i].value == expected[i].value);
}
}
schedule(cont);
});
}
@ -110,9 +126,11 @@ private:
true);
},
[this, begin, end, results, cont]() {
if (seenReadSuccess) {
ASSERT(results->size() > 0);
ASSERT(results->front().key <= begin);
ASSERT(results->back().value >= end);
}
for (int i = 0; i < results->size(); i++) {
// no empty or inverted ranges

View File

@ -20,12 +20,19 @@
#
import sys
import subprocess
import argparse
import os
from subprocess import Popen, TimeoutExpired
import logging
import signal
from pathlib import Path
import glob
import random
import string
def random_string(len):
    # Return `len` characters drawn uniformly (one random.choice call per
    # character) from ASCII letters and digits.
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(len))
def get_logger():
@ -48,6 +55,14 @@ def initialize_logger_level(logging_level):
logger.setLevel(logging.ERROR)
def dump_client_logs(log_dir):
    # Print the full contents of every client log file found in log_dir,
    # each wrapped in begin/end marker lines for readability in CI output.
    for log_file in glob.glob(os.path.join(log_dir, "*")):
        print(">>>>>>>>>>>>>>>>>>>> Contents of {}:".format(log_file))
        with open(log_file, "r") as f:
            print(f.read())
        print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
def run_tester(args, test_file):
cmd = [args.tester_binary,
"--cluster-file", args.cluster_file,
@ -56,6 +71,12 @@ def run_tester(args, test_file):
cmd += ["--external-client-library", args.external_client_library]
if args.tmp_dir is not None:
cmd += ["--tmp-dir", args.tmp_dir]
log_dir = None
if args.log_dir is not None:
log_dir = Path(args.log_dir).joinpath(random_string(8))
log_dir.mkdir(exist_ok=True)
cmd += ['--log', "--log-dir", str(log_dir)]
if args.blob_granule_local_file_path is not None:
cmd += ["--blob-granule-local-file-path",
args.blob_granule_local_file_path]
@ -63,6 +84,7 @@ def run_tester(args, test_file):
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
ret_code = 1
try:
ret_code = proc.wait(args.timeout)
except TimeoutExpired:
@ -72,15 +94,16 @@ def run_tester(args, test_file):
raise Exception('Unable to run tester (%s)' % e)
if ret_code != 0:
if ret_code < 0:
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
elif ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
ret_code = 1
get_logger().error('\n\'%s\' did not complete succesfully (%s)' %
(cmd[0], reason))
if (log_dir is not None):
dump_client_logs(log_dir)
get_logger().info('')
return ret_code
@ -115,6 +138,8 @@ def parse_args(argv):
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--log-dir', type=str, default=None,
help='The directory for storing logs (default: None)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
parser.add_argument('--tmp-dir', type=str, default=None,

561
bindings/c/test/fdb_api.hpp Normal file
View File

@ -0,0 +1,561 @@
/*
* fdb_api.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDB_API_HPP
#define FDB_API_HPP
#pragma once
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#endif
#include <cassert>
#include <cstdint>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <fmt/format.h>
// introduce the option enums
#include <fdb_c_options.g.h>
namespace fdb {
// hide C API to discourage mixing C/C++ API
namespace native {
#include <foundationdb/fdb_c.h>
}
using ByteString = std::basic_string<uint8_t>;
using BytesRef = std::basic_string_view<uint8_t>;
using CharsRef = std::string_view;
using KeyRef = BytesRef;
using ValueRef = BytesRef;
// View a char buffer as raw bytes without copying.
inline uint8_t const* toBytePtr(char const* ptr) noexcept {
	return static_cast<uint8_t const*>(static_cast<void const*>(ptr));
}
// get bytestring view from charstring: e.g. std::basic_string{_view}<char>
template <template <class...> class StringLike, class Char>
BytesRef toBytesRef(const StringLike<Char>& s) noexcept {
static_assert(sizeof(Char) == 1);
return BytesRef(reinterpret_cast<uint8_t const*>(s.data()), s.size());
}
// get charstring view from bytestring: e.g. std::basic_string{_view}<uint8_t>
template <template <class...> class StringLike, class Char>
CharsRef toCharsRef(const StringLike<Char>& s) noexcept {
static_assert(sizeof(Char) == 1);
return CharsRef(reinterpret_cast<char const*>(s.data()), s.size());
}
[[maybe_unused]] constexpr const bool OverflowCheck = false;
// Narrow a byte-string length to int for the C API, optionally guarding
// against overflow when the compile-time OverflowCheck flag is enabled.
inline int intSize(BytesRef b) {
	const auto length = b.size();
	if constexpr (OverflowCheck) {
		constexpr auto bound = static_cast<size_t>(std::numeric_limits<int>::max());
		if (length > bound)
			throw std::overflow_error("byte strlen goes beyond int bounds");
	}
	return static_cast<int>(length);
}
class Error {
public:
using CodeType = native::fdb_error_t;
Error() noexcept : err(0) {}
explicit Error(CodeType err) noexcept : err(err) {}
char const* what() noexcept { return native::fdb_get_error(err); }
explicit operator bool() const noexcept { return err != 0; }
bool is(CodeType other) const noexcept { return err == other; }
CodeType code() const noexcept { return err; }
bool retryable() const noexcept { return native::fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err) != 0; }
private:
CodeType err;
};
/* Traits of value types held by ready futures.
Holds type and value extraction function. */
/* Traits of value types held by ready futures.
   Holds type and value extraction function. Each trait exposes:
     - Type:    the C++ representation of the future's payload
     - extract: copies the payload out of a READY native future into Type&,
                returning any extraction error as an Error value. */
namespace future_var {
// Future carrying no payload (e.g. commit); extract always succeeds.
struct None {
struct Type {};
static Error extract(native::FDBFuture*, Type&) noexcept { return Error(0); }
};
// Future carrying a 64-bit integer (e.g. read version).
struct Int64 {
using Type = int64_t;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
return Error(native::fdb_future_get_int64(f, &out));
}
};
// Future carrying a key as (pointer, length); the bytes are owned by the
// native future and remain valid only while it is alive.
struct Key {
using Type = std::pair<uint8_t const*, int>;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
auto& [out_key, out_key_length] = out;
return Error(native::fdb_future_get_key(f, &out_key, &out_key_length));
}
};
// Future carrying an optional value: (present, pointer, length).
struct Value {
using Type = std::tuple<bool, uint8_t const*, int>;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
auto& [out_present, out_value, out_value_length] = out;
// fdb_bool_t is converted to a real bool for the caller.
auto out_present_native = native::fdb_bool_t{};
auto err = native::fdb_future_get_value(f, &out_present_native, &out_value, &out_value_length);
out_present = (out_present_native != 0);
return Error(err);
}
};
// Future carrying a C array of strings: (array, count).
struct StringArray {
using Type = std::pair<const char**, int>;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
auto& [out_strings, out_count] = out;
return Error(native::fdb_future_get_string_array(f, &out_strings, &out_count));
}
};
// Future carrying a key-value array: (array, count, more-data-available).
struct KeyValueArray {
using Type = std::tuple<native::FDBKeyValue const*, int, bool>;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
auto& [out_kv, out_count, out_more] = out;
auto out_more_native = native::fdb_bool_t{};
auto err = native::fdb_future_get_keyvalue_array(f, &out_kv, &out_count, &out_more_native);
out_more = (out_more_native != 0);
return Error(err);
}
};
} // namespace future_var
// Compose `preamble` followed by the error's description and raise the
// combined message as a std::runtime_error.
[[noreturn]] inline void throwError(std::string_view preamble, Error err) {
	throw std::runtime_error(std::string(preamble) + err.what());
}
// Highest API version supported by the loaded client library.
inline int maxApiVersion() {
return native::fdb_get_max_api_version();
}
// Select the API version, returning any failure as an Error value.
inline Error selectApiVersionNothrow(int version) {
return Error(native::fdb_select_api_version(version));
}
// Select the API version; throws std::runtime_error on failure.
inline void selectApiVersion(int version) {
if (auto err = selectApiVersionNothrow(version)) {
throwError(fmt::format("ERROR: fdb_select_api_version({}): ", version), err);
}
}
// Thin wrappers over the global fdb network API. setup() must be called
// once after selectApiVersion(); run() then blocks the calling thread
// until stop() is invoked from another thread.
namespace network {
// Set a byte-string-valued network option (Nothrow variant returns Error).
inline Error setOptionNothrow(FDBNetworkOption option, BytesRef str) noexcept {
return Error(native::fdb_network_set_option(option, str.data(), intSize(str)));
}
// Set an integer-valued network option; the value is passed to the C API
// as a pointer to its 8-byte little-endian in-memory representation.
inline Error setOptionNothrow(FDBNetworkOption option, int64_t value) noexcept {
return Error(native::fdb_network_set_option(
option, reinterpret_cast<const uint8_t*>(&value), static_cast<int>(sizeof(value))));
}
// Throwing variant of the byte-string option setter.
inline void setOption(FDBNetworkOption option, BytesRef str) {
if (auto err = setOptionNothrow(option, str)) {
throwError(fmt::format("ERROR: fdb_network_set_option({}): ",
static_cast<std::underlying_type_t<FDBNetworkOption>>(option)),
err);
}
}
// Throwing variant of the integer option setter.
inline void setOption(FDBNetworkOption option, int64_t value) {
if (auto err = setOptionNothrow(option, value)) {
throwError(fmt::format("ERROR: fdb_network_set_option({}, {}): ",
static_cast<std::underlying_type_t<FDBNetworkOption>>(option),
value),
err);
}
}
inline Error setupNothrow() noexcept {
return Error(native::fdb_setup_network());
}
inline void setup() {
if (auto err = setupNothrow())
throwError("ERROR: fdb_network_setup(): ", err);
}
// Runs the network event loop; blocks until stop() is called.
inline Error run() {
return Error(native::fdb_run_network());
}
inline Error stop() {
return Error(native::fdb_stop_network());
}
} // namespace network
class Transaction;
class Database;
// Shared-ownership RAII wrapper around native::FDBResult (the synchronous
// result type returned by e.g. readBlobGranules). Only Transaction can
// construct a non-empty Result.
class Result {
friend class Transaction;
std::shared_ptr<native::FDBResult> r;
// Takes ownership of result; fdb_result_destroy is called when the last
// copy of this Result is destroyed. A null pointer yields an empty Result.
Result(native::FDBResult* result) {
if (result)
r = std::shared_ptr<native::FDBResult>(result, &native::fdb_result_destroy);
}
public:
using KeyValueArray = future_var::KeyValueArray::Type;
// Extract (kv-array, count, more) into out; returns any error as a value.
// The kv pointers remain owned by the underlying native result.
Error getKeyValueArrayNothrow(KeyValueArray& out) const noexcept {
auto out_more_native = native::fdb_bool_t{};
auto& [out_kv, out_count, out_more] = out;
auto err_raw = native::fdb_result_get_keyvalue_array(r.get(), &out_kv, &out_count, &out_more_native);
out_more = out_more_native != 0;
return Error(err_raw);
}
// Throwing variant of getKeyValueArrayNothrow.
KeyValueArray getKeyValueArray() const {
auto ret = KeyValueArray{};
if (auto err = getKeyValueArrayNothrow(ret))
throwError("ERROR: result_get_keyvalue_array(): ", err);
return ret;
}
};
// Shared-ownership RAII wrapper around native::FDBFuture. Copies share the
// same native future; the native handle is destroyed with the last copy.
class Future {
protected:
	friend class Transaction;
	std::shared_ptr<native::FDBFuture> f;

	// Takes ownership of future; a null pointer yields an invalid Future.
	Future(native::FDBFuture* future) {
		if (future)
			f = std::shared_ptr<native::FDBFuture>(future, &native::fdb_future_destroy);
	}

	// wrap any capturing lambda as callback passable to fdb_future_set_callback().
	// destroy after invocation.
	template <class Fn>
	static void callback(native::FDBFuture*, void* param) {
		auto fp = static_cast<Fn*>(param);
		try {
			(*fp)();
		} catch (const std::exception& e) {
			fmt::print(stderr, "ERROR: Exception thrown in user callback: {}", e.what());
		}
		delete fp;
	}

	// set as callback user-defined completion handler of signature void(Future)
	template <class FutureType, class UserFunc>
	void then(UserFunc&& fn) {
		auto cb = [fut = FutureType(*this), fn = std::forward<UserFunc>(fn)]() { fn(fut); };
		using cb_type = std::decay_t<decltype(cb)>;
		auto fp = new cb_type(std::move(cb));
		if (auto err = Error(native::fdb_future_set_callback(f.get(), &callback<cb_type>, fp))) {
			throwError("ERROR: future_set_callback: ", err);
		}
	}

public:
	Future() noexcept : Future(nullptr) {}
	Future(const Future&) noexcept = default;
	Future& operator=(const Future&) noexcept = default;

	// True when this wraps a live native future.
	bool valid() const noexcept { return f != nullptr; }
	explicit operator bool() const noexcept { return valid(); }

	bool ready() const noexcept {
		assert(valid());
		return native::fdb_future_is_ready(f.get()) != 0;
	}

	Error blockUntilReady() const noexcept {
		assert(valid());
		return Error(native::fdb_future_block_until_ready(f.get()));
	}

	Error error() const noexcept {
		assert(valid());
		return Error(native::fdb_future_get_error(f.get()));
	}

	void cancel() noexcept { native::fdb_future_cancel(f.get()); }

	// Extract the ready value per VarTraits; throws on extraction failure.
	// Precondition (asserted): future is valid and holds no error.
	template <class VarTraits>
	typename VarTraits::Type get() const {
		assert(valid());
		assert(!error());
		auto out = typename VarTraits::Type{};
		if (auto err = VarTraits::extract(f.get(), out)) {
			throwError("future_get: ", err);
		}
		return out;
	}

	// Extract the ready value into var, returning any failure as an Error.
	// BUGFIX: previously extracted into a discarded local temporary, so the
	// caller's var was never written; now extracts directly into var.
	template <class VarTraits>
	Error getNothrow(typename VarTraits::Type& var) const noexcept {
		assert(valid());
		assert(!error());
		return VarTraits::extract(f.get(), var);
	}

	// Register fn (signature void(Future)) to run when the future becomes ready.
	template <class UserFunc>
	void then(UserFunc&& fn) {
		then<Future>(std::forward<UserFunc>(fn));
	}
};
// Future whose payload type is fixed by VarTraits (see future_var). Exposes
// type-safe get()/getNothrow()/then() and hides the type-erased base forms.
template <typename VarTraits>
class TypedFuture : public Future {
friend class Future;
friend class Transaction;
using SelfType = TypedFuture<VarTraits>;
using Future::Future;
// hide type-unsafe inherited functions
using Future::get;
using Future::getNothrow;
using Future::then;
// Private: only Future/Transaction may rewrap an untyped Future.
TypedFuture(const Future& f) noexcept : Future(f) {}
public:
using ContainedType = typename VarTraits::Type;
// Downcast back to the type-erased Future (shares the same native handle).
Future eraseType() const noexcept { return static_cast<Future const&>(*this); }
ContainedType get() const { return get<VarTraits>(); }
Error getNothrow(ContainedType& out) const noexcept { return getNothrow<VarTraits>(out); }
// Register fn with signature void(TypedFuture<VarTraits>).
template <class UserFunc>
void then(UserFunc&& fn) {
Future::then<SelfType>(std::forward<UserFunc>(fn));
}
};
// Aggregate mirroring the C API's key selector arguments: a reference key,
// equality flag and offset. Field order matches the FDB_KEYSEL_* macros so
// a selector can be brace-initialized directly from them (see key_select).
struct KeySelector {
const uint8_t* key;
int keyLength;
bool orEqual;
int offset;
};
// Convenience factories for the four canonical key selectors. Each
// FDB_KEYSEL_* macro expands to the (key, length, orEqual, offset) argument
// list; `+ offset` shifts the selector's final offset component.
// NOTE(review): the returned selector borrows `key`'s bytes — the caller
// must keep them alive until the selector is consumed.
namespace key_select {
inline KeySelector firstGreaterThan(KeyRef key, int offset = 0) {
return KeySelector{ FDB_KEYSEL_FIRST_GREATER_THAN(key.data(), intSize(key)) + offset };
}
inline KeySelector firstGreaterOrEqual(KeyRef key, int offset = 0) {
return KeySelector{ FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key.data(), intSize(key)) + offset };
}
inline KeySelector lastLessThan(KeyRef key, int offset = 0) {
return KeySelector{ FDB_KEYSEL_LAST_LESS_THAN(key.data(), intSize(key)) + offset };
}
inline KeySelector lastLessOrEqual(KeyRef key, int offset = 0) {
return KeySelector{ FDB_KEYSEL_LAST_LESS_OR_EQUAL(key.data(), intSize(key)) + offset };
}
} // namespace key_select
// Shared-ownership RAII wrapper around native::FDBTransaction. Created via
// Database::createTransaction(); copies share the same native transaction.
class Transaction {
friend class Database;
std::shared_ptr<native::FDBTransaction> tr;
// Takes ownership of tr_raw; fdb_transaction_destroy runs with the last copy.
explicit Transaction(native::FDBTransaction* tr_raw) {
if (tr_raw)
tr = std::shared_ptr<native::FDBTransaction>(tr_raw, &native::fdb_transaction_destroy);
}
public:
Transaction() noexcept : Transaction(nullptr) {}
Transaction(const Transaction&) noexcept = default;
Transaction& operator=(const Transaction&) noexcept = default;
// True when this wraps a live native transaction.
bool valid() const noexcept { return tr != nullptr; }
explicit operator bool() const noexcept { return valid(); }
// Set an integer-valued transaction option; the value is passed to the
// C API as a pointer to its 8-byte in-memory representation.
Error setOptionNothrow(FDBTransactionOption option, int64_t value) noexcept {
return Error(native::fdb_transaction_set_option(
tr.get(), option, reinterpret_cast<const uint8_t*>(&value), static_cast<int>(sizeof(value))));
}
// Set a byte-string-valued transaction option.
Error setOptionNothrow(FDBTransactionOption option, BytesRef str) noexcept {
return Error(native::fdb_transaction_set_option(tr.get(), option, str.data(), intSize(str)));
}
// Throwing variants of the option setters.
void setOption(FDBTransactionOption option, int64_t value) {
if (auto err = setOptionNothrow(option, value)) {
throwError(fmt::format("transaction_set_option({}, {}) returned error: ",
static_cast<std::underlying_type_t<FDBTransactionOption>>(option),
value),
err);
}
}
void setOption(FDBTransactionOption option, BytesRef str) {
if (auto err = setOptionNothrow(option, str)) {
throwError(fmt::format("transaction_set_option({}) returned error: ",
static_cast<std::underlying_type_t<FDBTransactionOption>>(option)),
err);
}
}
TypedFuture<future_var::Int64> getReadVersion() { return native::fdb_transaction_get_read_version(tr.get()); }
// Committed version is only meaningful after a successful commit().
Error getCommittedVersionNothrow(int64_t& out) {
return Error(native::fdb_transaction_get_committed_version(tr.get(), &out));
}
int64_t getCommittedVersion() {
auto out = int64_t{};
if (auto err = getCommittedVersionNothrow(out)) {
throwError("get_committed_version: ", err);
}
return out;
}
// Resolve a key selector to a concrete key.
TypedFuture<future_var::Key> getKey(KeySelector sel, bool snapshot) {
return native::fdb_transaction_get_key(tr.get(), sel.key, sel.keyLength, sel.orEqual, sel.offset, snapshot);
}
// Read a single key's (optional) value.
TypedFuture<future_var::Value> get(KeyRef key, bool snapshot) {
return native::fdb_transaction_get(tr.get(), key.data(), intSize(key), snapshot);
}
// Usage: tx.getRange(key_select::firstGreaterOrEqual(firstKey), key_select::lastLessThan(lastKey), ...)
// gets key-value pairs in key range [begin, end)
TypedFuture<future_var::KeyValueArray> getRange(KeySelector first,
KeySelector last,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
bool snapshot,
bool reverse) {
return native::fdb_transaction_get_range(tr.get(),
first.key,
first.keyLength,
first.orEqual,
first.offset,
last.key,
last.keyLength,
last.orEqual,
last.offset,
limit,
target_bytes,
mode,
iteration,
snapshot,
reverse);
}
// Synchronously read blob granules for [begin, end); returns a Result
// rather than a Future (the C API call is blocking).
Result readBlobGranules(KeyRef begin,
KeyRef end,
int64_t begin_version,
int64_t read_version,
native::FDBReadBlobGranuleContext context) {
return Result(native::fdb_transaction_read_blob_granules(
tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), begin_version, read_version, context));
}
TypedFuture<future_var::None> commit() { return native::fdb_transaction_commit(tr.get()); }
// Standard retry-loop helper: resolves when the transaction may be retried.
TypedFuture<future_var::None> onError(Error err) { return native::fdb_transaction_on_error(tr.get(), err.code()); }
void reset() { return native::fdb_transaction_reset(tr.get()); }
// Buffered mutations; only durable after a successful commit().
void set(KeyRef key, ValueRef value) {
native::fdb_transaction_set(tr.get(), key.data(), intSize(key), value.data(), intSize(value));
}
void clear(KeyRef key) { native::fdb_transaction_clear(tr.get(), key.data(), intSize(key)); }
void clearRange(KeyRef begin, KeyRef end) {
native::fdb_transaction_clear_range(tr.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
};
// Shared-ownership RAII wrapper around native::FDBDatabase; the factory for
// transactions. Copies share the same native handle.
class Database {
std::shared_ptr<native::FDBDatabase> db;
public:
Database(const Database&) noexcept = default;
Database& operator=(const Database&) noexcept = default;
// Open a database from a cluster file path; throws on failure.
Database(const std::string& cluster_file_path) : db(nullptr) {
auto db_raw = static_cast<native::FDBDatabase*>(nullptr);
if (auto err = Error(native::fdb_create_database(cluster_file_path.c_str(), &db_raw)))
throwError(fmt::format("Failed to create database with '{}': ", cluster_file_path), err);
db = std::shared_ptr<native::FDBDatabase>(db_raw, &native::fdb_database_destroy);
}
// Null database; createTransaction() on it throws.
Database() noexcept : db(nullptr) {}
// Set an integer-valued database option (passed as 8 raw bytes to the C API).
Error setOptionNothrow(FDBDatabaseOption option, int64_t value) noexcept {
return Error(native::fdb_database_set_option(
db.get(), option, reinterpret_cast<const uint8_t*>(&value), static_cast<int>(sizeof(value))));
}
// Set a byte-string-valued database option.
Error setOptionNothrow(FDBDatabaseOption option, BytesRef str) noexcept {
return Error(native::fdb_database_set_option(db.get(), option, str.data(), intSize(str)));
}
// Throwing variants of the option setters.
void setOption(FDBDatabaseOption option, int64_t value) {
if (auto err = setOptionNothrow(option, value)) {
throwError(fmt::format("database_set_option({}, {}) returned error: ",
static_cast<std::underlying_type_t<FDBDatabaseOption>>(option),
value),
err);
}
}
void setOption(FDBDatabaseOption option, BytesRef str) {
if (auto err = setOptionNothrow(option, str)) {
throwError(fmt::format("database_set_option({}) returned error: ",
static_cast<std::underlying_type_t<FDBDatabaseOption>>(option)),
err);
}
}
// Create a new transaction on this database; throws on a null database or
// on a C API failure.
Transaction createTransaction() {
if (!db)
throw std::runtime_error("create_transaction from null database");
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
auto err = Error(native::fdb_database_create_transaction(db.get(), &tx_native));
if (err)
throwError("Failed to create transaction: ", err);
return Transaction(tx_native);
}
};
} // namespace fdb
#endif /*FDB_API_HPP*/

View File

@ -0,0 +1,288 @@
/*
* async.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <boost/asio.hpp>
#include "async.hpp"
#include "future.hpp"
#include "logger.hpp"
#include "operations.hpp"
#include "stats.hpp"
#include "time.hpp"
#include "utils.hpp"
extern thread_local mako::Logger logr;
using namespace fdb;
namespace mako {
// Schedule runOneTick() on the io_context. Capturing shared_from_this()
// keeps this state object alive until the posted task has executed.
void ResumableStateForPopulate::postNextTick() {
boost::asio::post(io_context, [this, state = shared_from_this()]() { runOneTick(); });
}
// One resumable populate step: insert rows from key_checkpoint up to the
// next commit boundary, then commit asynchronously. The commit continuation
// either re-posts this tick (more keys remain / retryable error) or signals
// completion, so at most one commit is ever in flight per state object.
void ResumableStateForPopulate::runOneTick() {
const auto num_commit_every = args.txnspec.ops[OP_INSERT][OP_COUNT];
for (auto i = key_checkpoint; i <= key_end; i++) {
genKey(keystr.data(), KEY_PREFIX, args, i);
randomString(valstr.data(), args.value_length);
tx.set(keystr, valstr);
stats.incrOpCount(OP_INSERT);
// commit at each batch boundary and at the very last key
if (i == key_end || (i - key_begin + 1) % num_commit_every == 0) {
watch_commit.start();
tx.commit().then([this, state = shared_from_this(), i](Future f) {
if (auto err = f.error()) {
// commit failed: run the onError retry protocol; abort ends the
// workload, otherwise re-post the tick to retry from key_checkpoint
logr.printWithLogLevel(err.retryable() ? VERBOSE_WARN : VERBOSE_NONE,
"ERROR",
"commit for populate returned '{}'",
err.what());
tx.onError(err).then([this, state = shared_from_this()](Future f) {
const auto f_rc = handleForOnError(tx, f, "ON_ERROR_FOR_POPULATE");
if (f_rc == FutureRC::ABORT) {
signalEnd();
return;
} else {
postNextTick();
}
});
} else {
// successfully committed
watch_commit.stop();
watch_tx.setStop(watch_commit.getStop());
// sample latencies on every args.sampling-th transaction
if (stats.getOpCount(OP_TRANSACTION) % args.sampling == 0) {
const auto commit_latency = watch_commit.diff();
const auto tx_duration = watch_tx.diff();
stats.addLatency(OP_COMMIT, commit_latency);
stats.addLatency(OP_TRANSACTION, tx_duration);
sample_bins[OP_COMMIT].put(commit_latency);
sample_bins[OP_TRANSACTION].put(tx_duration);
}
stats.incrOpCount(OP_COMMIT);
stats.incrOpCount(OP_TRANSACTION);
tx.reset();
watch_tx.startFromStop();
// advance the resume point past the committed batch
key_checkpoint = i + 1;
if (i != key_end) {
postNextTick();
} else {
logr.debug("Populated {} rows [{}, {}]: {:6.3f} sec",
key_end - key_begin + 1,
key_begin,
key_end,
toDoubleSeconds(watch_total.stop().diff()));
signalEnd();
return;
}
}
});
// leave the loop; the commit continuation re-enters runOneTick()
break;
}
}
}
// Schedule runOneTick() on the io_context. Capturing shared_from_this()
// keeps this state object alive until the posted task has executed.
void ResumableStateForRunWorkload::postNextTick() {
boost::asio::post(io_context, [this, state = shared_from_this()]() { runOneTick(); });
}
// Execute steps of the current operation iterator. Client-side steps that
// return no future are looped over synchronously; a blocking step registers
// a continuation that handles errors (via the onError protocol), updates
// stats, and either continues the iterator or starts the next transaction.
void ResumableStateForRunWorkload::runOneTick() {
assert(iter != OpEnd);
if (iter.step == 0 /* first step */)
prepareKeys(iter.op, key1, key2, args);
watch_step.start();
if (iter.step == 0)
watch_op = Stopwatch(watch_step.getStart());
auto f = Future{};
// to minimize context switch overhead, repeat immediately completed ops
// in a loop, not an async continuation.
repeat_immediate_steps:
f = opTable[iter.op].stepFunction(iter.step)(tx, args, key1, key2, val);
if (!f) {
// immediately completed client-side ops: e.g. set, setrange, clear, clearrange, ...
updateStepStats();
iter = getOpNext(args, iter);
if (iter == OpEnd)
onTransactionSuccess();
else
goto repeat_immediate_steps;
} else {
// step is blocking. register a continuation and return
f.then([this, state = shared_from_this()](Future f) {
if (auto postStepFn = opTable[iter.op].postStepFunction(iter.step))
postStepFn(f, tx, args, key1, key2, val);
if (iter.stepKind() != StepKind::ON_ERROR) {
if (auto err = f.error()) {
// step failed: run the standard onError retry protocol
logr.printWithLogLevel(err.retryable() ? VERBOSE_WARN : VERBOSE_NONE,
"ERROR",
"{}:{} returned '{}'",
iter.opName(),
iter.step,
err.what());
tx.onError(err).then([this, state = shared_from_this()](Future f) {
const auto rc = handleForOnError(tx, f, fmt::format("{}:{}", iter.opName(), iter.step));
if (rc == FutureRC::RETRY) {
stats.incrErrorCount(iter.op);
} else if (rc == FutureRC::CONFLICT) {
stats.incrConflictCount();
} else if (rc == FutureRC::ABORT) {
tx.reset();
signalEnd();
return;
}
// restart this iteration from beginning
iter = getOpBegin(args);
needs_commit = false;
postNextTick();
});
} else {
// async step succeeded
updateStepStats();
iter = getOpNext(args, iter);
if (iter == OpEnd) {
onTransactionSuccess();
} else {
postNextTick();
}
}
} else {
// blob granules op error
auto rc = handleForOnError(tx, f, "BG_ON_ERROR");
if (rc == FutureRC::RETRY) {
stats.incrErrorCount(iter.op);
} else if (rc == FutureRC::CONFLICT) {
stats.incrConflictCount();
} else if (rc == FutureRC::ABORT) {
tx.reset();
// NOTE(review): this abort path bumps stopcount directly while the
// path above calls signalEnd() — confirm signalEnd() is equivalent
// to stopcount.fetch_add(1) or these two paths diverge.
stopcount.fetch_add(1);
return;
}
iter = getOpBegin(args);
needs_commit = false;
// restart this iteration from beginning
postNextTick();
}
});
}
}
// Record timing/counters after a step completed successfully: commit steps
// close the transaction boundary, and the final step of an op records the
// whole-op latency. Latency samples are taken every args.sampling-th
// transaction only.
void ResumableStateForRunWorkload::updateStepStats() {
logr.debug("Step {}:{} succeeded", iter.opName(), iter.step);
// step successful
watch_step.stop();
const auto do_sample = stats.getOpCount(OP_TRANSACTION) % args.sampling == 0;
if (iter.stepKind() == StepKind::COMMIT) {
// reset transaction boundary
const auto step_latency = watch_step.diff();
if (do_sample) {
stats.addLatency(OP_COMMIT, step_latency);
sample_bins[OP_COMMIT].put(step_latency);
}
tx.reset();
stats.incrOpCount(OP_COMMIT);
needs_commit = false;
}
// op completed successfully
if (iter.step + 1 == opTable[iter.op].steps()) {
// ops that mutate require a commit before the transaction can finish
if (opTable[iter.op].needsCommit())
needs_commit = true;
watch_op.setStop(watch_step.getStop());
if (do_sample) {
const auto op_latency = watch_op.diff();
stats.addLatency(iter.op, op_latency);
sample_bins[iter.op].put(op_latency);
}
stats.incrOpCount(iter.op);
}
}
// Called when the op iterator reaches OpEnd. If a commit is still pending
// (mutating ops ran, or commit_get is set), commit asynchronously and handle
// errors through the onError protocol; otherwise just record the transaction
// and either end the workload or start the next iteration.
void ResumableStateForRunWorkload::onTransactionSuccess() {
	if (needs_commit || args.commit_get) {
		// task completed, need to commit before finish
		watch_commit.start();
		tx.commit().then([this, state = shared_from_this()](Future f) {
			if (auto err = f.error()) {
				// commit had errors
				logr.printWithLogLevel(err.retryable() ? VERBOSE_WARN : VERBOSE_NONE,
				                       "ERROR",
				                       "Post-iteration commit returned error: {}",
				                       err.what());
				tx.onError(err).then([this, state = shared_from_this()](Future f) {
					const auto rc = handleForOnError(tx, f, "ON_ERROR");
					if (rc == FutureRC::CONFLICT)
						stats.incrConflictCount();
					else
						stats.incrErrorCount(OP_COMMIT);
					if (rc == FutureRC::ABORT) {
						signalEnd();
						return;
					}
					if (ended()) {
						signalEnd();
					} else {
						// retry the whole iteration from its first op
						iter = getOpBegin(args);
						needs_commit = false;
						postNextTick();
					}
				});
			} else {
				// commit successful
				watch_commit.stop();
				watch_tx.setStop(watch_commit.getStop());
				if (stats.getOpCount(OP_TRANSACTION) % args.sampling == 0) {
					const auto commit_latency = watch_commit.diff();
					const auto tx_duration = watch_tx.diff();
					stats.addLatency(OP_COMMIT, commit_latency);
					// BUGFIX: record the full transaction duration under
					// OP_TRANSACTION. Previously commit_latency was recorded
					// here, inconsistent with the sample bin below and with
					// the populate path, which both use tx_duration.
					stats.addLatency(OP_TRANSACTION, tx_duration);
					sample_bins[OP_COMMIT].put(commit_latency);
					sample_bins[OP_TRANSACTION].put(tx_duration);
				}
				stats.incrOpCount(OP_COMMIT);
				stats.incrOpCount(OP_TRANSACTION);
				tx.reset();
				watch_tx.startFromStop();
				if (ended()) {
					signalEnd();
				} else {
					// start next iteration
					iter = getOpBegin(args);
					postNextTick();
				}
			}
		});
	} else {
		// transaction completed but no need to commit
		watch_tx.stop();
		if (stats.getOpCount(OP_TRANSACTION) % args.sampling == 0) {
			const auto tx_duration = watch_tx.diff();
			stats.addLatency(OP_TRANSACTION, tx_duration);
			sample_bins[OP_TRANSACTION].put(tx_duration);
		}
		stats.incrOpCount(OP_TRANSACTION);
		watch_tx.startFromStop();
		tx.reset();
		if (ended()) {
			signalEnd();
		} else {
			iter = getOpBegin(args);
			// start next iteration
			postNextTick();
		}
	}
}
} // namespace mako

View File

@ -0,0 +1,127 @@
/*
* async.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_ASYNC_HPP
#define MAKO_ASYNC_HPP
#include <atomic>
#include <memory>
#include <boost/asio.hpp>
#include "logger.hpp"
#include "mako.hpp"
#include "shm.hpp"
#include "stats.hpp"
#include "time.hpp"
namespace mako {
// as we don't have coroutines yet, we need to store in heap the complete state of execution,
// such that we can resume exactly where we were from last database op.
// Heap-allocated state machine for the populate (build) phase: since coroutines
// are unavailable, everything needed to resume after each async DB call lives here.
struct ResumableStateForPopulate : std::enable_shared_from_this<ResumableStateForPopulate> {
	Logger logr;
	fdb::Database db;
	fdb::Transaction tx;
	boost::asio::io_context& io_context; // event loop the next tick is posted onto
	Arguments const& args;
	ThreadStatistics& stats;
	std::atomic<int>& stopcount; // incremented once when this task finishes (see signalEnd)
	LatencySampleBinArray sample_bins;
	int key_begin; // first key index this task populates
	int key_end; // last key index for this task (inclusivity decided by runOneTick(); not visible here)
	int key_checkpoint; // presumably the first key of the batch currently in flight — confirm in runOneTick()
	fdb::ByteString keystr; // reusable key buffer, sized to args.key_length
	fdb::ByteString valstr; // reusable value buffer, sized to args.value_length
	Stopwatch watch_tx;
	Stopwatch watch_commit;
	Stopwatch watch_total;
	ResumableStateForPopulate(Logger logr,
	                          fdb::Database db,
	                          fdb::Transaction tx,
	                          boost::asio::io_context& io_context,
	                          Arguments const& args,
	                          ThreadStatistics& stats,
	                          std::atomic<int>& stopcount,
	                          int key_begin,
	                          int key_end)
	  : logr(logr), db(db), tx(tx), io_context(io_context), args(args), stats(stats), stopcount(stopcount),
	    key_begin(key_begin), key_end(key_end), key_checkpoint(key_begin) {
		keystr.resize(args.key_length);
		valstr.resize(args.value_length);
	}
	void runOneTick(); // performs one resumable slice of the populate loop
	void postNextTick(); // schedules runOneTick() on io_context
	void signalEnd() { stopcount.fetch_add(1); }
};
using PopulateStateHandle = std::shared_ptr<ResumableStateForPopulate>;
// Heap-allocated state machine for one asynchronous workload transaction:
// holds the op iterator, reusable key/value buffers and stopwatches needed to
// resume after each async DB call (no coroutines available yet).
struct ResumableStateForRunWorkload : std::enable_shared_from_this<ResumableStateForRunWorkload> {
	Logger logr;
	fdb::Database db;
	fdb::Transaction tx;
	boost::asio::io_context& io_context; // event loop the next tick is posted onto
	Arguments const& args;
	ThreadStatistics& stats;
	std::atomic<int>& stopcount; // incremented once when this task finishes (see signalEnd)
	std::atomic<int> const& signal; // external stop signal; SIGNAL_RED requests shutdown
	int max_iters; // transaction budget for this task, or -1 for unlimited
	OpIterator iter; // current position in the op sequence
	LatencySampleBinArray sample_bins;
	fdb::ByteString key1; // reusable key buffer, sized to args.key_length
	fdb::ByteString key2; // second key buffer (range ops), sized to args.key_length
	fdb::ByteString val; // reusable value buffer, sized to args.value_length
	Stopwatch watch_step;
	Stopwatch watch_op;
	Stopwatch watch_commit;
	Stopwatch watch_tx;
	bool needs_commit; // set when a completed op requires a commit before the iteration ends
	ResumableStateForRunWorkload(Logger logr,
	                             fdb::Database db,
	                             fdb::Transaction tx,
	                             boost::asio::io_context& io_context,
	                             Arguments const& args,
	                             ThreadStatistics& stats,
	                             std::atomic<int>& stopcount,
	                             std::atomic<int> const& signal,
	                             int max_iters,
	                             OpIterator iter)
	  : logr(logr), db(db), tx(tx), io_context(io_context), args(args), stats(stats), stopcount(stopcount),
	    signal(signal), max_iters(max_iters), iter(iter), needs_commit(false) {
		key1.resize(args.key_length);
		key2.resize(args.key_length);
		val.resize(args.value_length);
	}
	void signalEnd() noexcept { stopcount.fetch_add(1); }
	// True when the iteration budget is exhausted or shutdown was signalled.
	// BUGFIX: comparison was inverted (max_iters >= opcount), which made any
	// task with a finite budget report "ended" from its very first check.
	bool ended() noexcept {
		return (max_iters != -1 && max_iters <= stats.getOpCount(OP_TRANSACTION)) || signal.load() == SIGNAL_RED;
	}
	void postNextTick(); // schedules runOneTick() on io_context
	void runOneTick(); // performs one resumable slice of the workload loop
	void updateStepStats(); // records per-step latency/op counters
	void onTransactionSuccess(); // commit (if needed) and wrap up one iteration
};
using RunWorkloadStateHandle = std::shared_ptr<ResumableStateForRunWorkload>;
} // namespace mako
#endif /*MAKO_ASYNC_HPP*/

View File

@ -0,0 +1,116 @@
/*
* blob_granules.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "blob_granules.hpp"
#include "limit.hpp"
#include "logger.hpp"
#include <cstdio>
#include <fdb_api.hpp>
extern thread_local mako::Logger logr;
namespace mako::blob_granules::local_file {
int64_t startLoad(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* userContext) {
FILE* fp;
char full_fname[PATH_MAX]{
0,
};
int loadId;
uint8_t* data;
size_t readSize;
auto context = static_cast<UserContext*>(userContext);
loadId = context->nextId;
if (context->dataById[loadId] != 0) {
logr.error("too many granule file loads at once: {}", MAX_BG_IDS);
return -1;
}
context->nextId = (context->nextId + 1) % MAX_BG_IDS;
int ret = snprintf(full_fname, PATH_MAX, "%s%s", context->bgFilePath, filename);
if (ret < 0 || ret >= PATH_MAX) {
logr.error("BG filename too long: {}{}", context->bgFilePath, filename);
return -1;
}
fp = fopen(full_fname, "r");
if (!fp) {
logr.error("BG could not open file: {}", full_fname);
return -1;
}
// don't seek if offset == 0
if (offset && fseek(fp, offset, SEEK_SET)) {
// if fseek was non-zero, it failed
logr.error("BG could not seek to %{} in file {}", offset, full_fname);
fclose(fp);
return -1;
}
data = new uint8_t[length];
readSize = fread(data, sizeof(uint8_t), length, fp);
fclose(fp);
if (readSize != length) {
logr.error("BG could not read {} bytes from file: {}", length, full_fname);
return -1;
}
context->dataById[loadId] = data;
return loadId;
}
// Returns the buffer previously filled by startLoad() for loadId,
// or null (after logging) if the slot holds no data.
uint8_t* getLoad(int64_t loadId, void* userContext) {
	auto ctx = static_cast<UserContext*>(userContext);
	auto* const buffer = ctx->dataById[loadId];
	if (buffer == nullptr) {
		logr.error("BG loadId invalid for get_load: {}", loadId);
		return nullptr;
	}
	return buffer;
}
// Releases the buffer owned by loadId's slot and clears the slot.
// An empty slot is logged but still "freed" (delete[] on null is a no-op).
void freeLoad(int64_t loadId, void* userContext) {
	auto ctx = static_cast<UserContext*>(userContext);
	uint8_t*& slot = ctx->dataById[loadId];
	if (slot == nullptr) {
		logr.error("BG loadId invalid for free_load: {}", loadId);
	}
	delete[] slot;
	slot = nullptr;
}
// Builds the native read context that wires the local-file callbacks
// (startLoad/getLoad/freeLoad) into a readBlobGranules() call.
fdb::native::FDBReadBlobGranuleContext createApiContext(UserContext& ctx, bool materialize_files) {
	fdb::native::FDBReadBlobGranuleContext api_ctx{};
	api_ctx.userContext = &ctx;
	api_ctx.start_load_f = &startLoad;
	api_ctx.get_load_f = &getLoad;
	api_ctx.free_load_f = &freeLoad;
	api_ctx.debugNoMaterialize = !materialize_files;
	api_ctx.granuleParallelism = 2; // TODO make knob or setting for changing this?
	return api_ctx;
}
} // namespace mako::blob_granules::local_file

View File

@ -0,0 +1,50 @@
/*
* blob_granules.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_BLOB_GRANULES_HPP
#define MAKO_BLOB_GRANULES_HPP
#include <cstdint>
#include <memory>
#include <fdb_api.hpp>
namespace mako::blob_granules::local_file {
constexpr const int MAX_BG_IDS = 1000;
// TODO: could always abstract this into something more generically usable by something other than mako.
// But outside of testing there are likely few use cases for local granules
// Per-call state for loading blob granule files from local disk: a table of
// up to MAX_BG_IDS buffers addressed by the load IDs handed out by startLoad().
struct UserContext {
	char const* bgFilePath; // directory/prefix prepended to granule file names (not owned)
	int nextId; // next load ID to hand out; wraps modulo MAX_BG_IDS
	std::unique_ptr<uint8_t*[]> dataByIdMem; // owning storage for the ID -> buffer table
	uint8_t** dataById; // raw view of dataByIdMem for the C callbacks
	UserContext(char const* filePath)
	  : bgFilePath(filePath), nextId(0), dataByIdMem(new uint8_t*[MAX_BG_IDS]()), dataById(dataByIdMem.get()) {}
	// NOTE(review): releases the table only, not any still-allocated buffers —
	// callers appear expected to freeLoad() everything first; confirm.
	void clear() { dataByIdMem.reset(); }
};
fdb::native::FDBReadBlobGranuleContext createApiContext(UserContext& ctx, bool materialize_files);
} // namespace mako::blob_granules::local_file
#endif /*MAKO_BLOB_GRANULES_HPP*/

View File

@ -0,0 +1,89 @@
/*
* future.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_FUTURE_HPP
#define MAKO_FUTURE_HPP
#include <fdb_api.hpp>
#include <cassert>
#include <string_view>
#include "logger.hpp"
#include "macro.hpp"
extern thread_local mako::Logger logr;
namespace mako {
enum class FutureRC { OK, RETRY, CONFLICT, ABORT };
// Classifies the result of an already-ready on_error() future:
// CONFLICT for not_committed, RETRY for retryable errors (and for success,
// since a successful on_error() means "try the transaction again"),
// ABORT (with tx.reset()) otherwise.
template <class FutureType>
force_inline FutureRC handleForOnError(fdb::Transaction& tx, FutureType& f, std::string_view step) {
	const auto err = f.error();
	if (!err)
		return FutureRC::RETRY;
	if (err.is(1020 /*not_committed*/))
		return FutureRC::CONFLICT;
	if (err.retryable()) {
		logr.warn("Retryable error '{}' found at on_error(), step: {}", err.what(), step);
		return FutureRC::RETRY;
	}
	logr.error("Unretryable error '{}' found at on_error(), step: {}", err.what(), step);
	tx.reset();
	return FutureRC::ABORT;
}
// Blocks until an on_error() future is ready, then classifies it via
// handleForOnError(). Failing to even wait on the future is ABORT.
template <class FutureType>
force_inline FutureRC waitAndHandleForOnError(fdb::Transaction& tx, FutureType& f, std::string_view step) {
	assert(f);
	if (auto err = f.blockUntilReady()) {
		logr.error("'{}' found while waiting for on_error() future, step: {}", err.what(), step);
		return FutureRC::ABORT;
	}
	return handleForOnError(tx, f, step);
}
// wait on any non-immediate tx-related step to complete. Follow up with on_error().
// Blocks on a non-immediate transaction-step future and classifies the result:
// OK on success; otherwise feeds the error to tx.onError() (implicit backoff)
// and lets the follow-up future decide between RETRY / CONFLICT / ABORT.
template <class FutureType>
force_inline FutureRC waitAndHandleError(fdb::Transaction& tx, FutureType& f, std::string_view step) {
	assert(f);
	auto err = fdb::Error{};
	if ((err = f.blockUntilReady())) {
		const auto retry = err.retryable();
		logr.error("{} error '{}' found during step: {}", (retry ? "Retryable" : "Unretryable"), err.what(), step);
		return retry ? FutureRC::RETRY : FutureRC::ABORT;
	}
	err = f.error();
	if (!err)
		return FutureRC::OK;
	if (err.retryable()) {
		logr.warn("step {} returned '{}'", step, err.what());
	} else {
		logr.error("step {} returned '{}'", step, err.what());
	}
	// implicit backoff
	auto follow_up = tx.onError(err);
	// BUGFIX: wait on the on_error() follow-up future, not the original,
	// already-failed future `f`
	return waitAndHandleForOnError(tx, follow_up, step);
}
} // namespace mako
#endif /*MAKO_FUTURE_HPP*/

View File

@ -0,0 +1,32 @@
/*
* limit.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIMIT_HPP
#define LIMIT_HPP
#if defined(__linux__)
#include <linux/limits.h>
#elif defined(__APPLE__)
#include <sys/syslimits.h>
#else
#include <limits.h>
#endif
#endif

View File

@ -0,0 +1,117 @@
/*
* logger.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_LOGGER_HPP
#define MAKO_LOGGER_HPP
#include <fmt/format.h>
#include <cassert>
#include <cstdio>
#include <iterator>
#include <string_view>
#include "process.hpp"
namespace mako {
constexpr const int VERBOSE_NONE = 0; // will still print errors
constexpr const int VERBOSE_DEFAULT = 1; // will print info and work stats
constexpr const int VERBOSE_WARN = 2; // will print expected errors
constexpr const int VERBOSE_DEBUG = 3; // will print everything
template <ProcKind P>
using ProcKindConstant = std::integral_constant<ProcKind, P>;
using MainProcess = ProcKindConstant<ProcKind::MAIN>;
using StatsProcess = ProcKindConstant<ProcKind::STATS>;
using WorkerProcess = ProcKindConstant<ProcKind::WORKER>;
// Lightweight logger that prefixes every line with the emitting process kind
// ([MAIN] / [STATS] / [WORKER<p>] or [WORKER<p>:<t>]) plus a category, and
// filters by verbosity. VERBOSE_NONE ("error") goes to stderr, the rest to stdout.
class Logger {
	ProcKind proc;
	int verbosity{ VERBOSE_DEFAULT };
	int process_id{ -1 }; // worker process index; -1 for MAIN/STATS
	int thread_id{ -1 }; // worker thread index; -1 when logging process-wide
	// Appends the "[WHO] CATEGORY: " prefix to buf (ids printed 1-based).
	void putHeader(fmt::memory_buffer& buf, std::string_view category) {
		if (proc == ProcKind::MAIN) {
			fmt::format_to(std::back_inserter(buf), "[MAIN] {}: ", category);
		} else if (proc == ProcKind::STATS) {
			fmt::format_to(std::back_inserter(buf), "[STATS] {}: ", category);
		} else {
			if (thread_id == -1) {
				fmt::format_to(std::back_inserter(buf), "[WORKER{:3d}] {}: ", process_id + 1, category);
			} else {
				fmt::format_to(
				    std::back_inserter(buf), "[WORKER{:3d}:{:3d}] {}: ", process_id + 1, thread_id + 1, category);
			}
		}
	}
public:
	Logger(MainProcess, int verbosity) noexcept : proc(MainProcess::value), verbosity(verbosity) {}
	Logger(StatsProcess, int verbosity) noexcept : proc(StatsProcess::value), verbosity(verbosity) {}
	Logger(WorkerProcess, int verbosity, int process_id, int thread_id = -1) noexcept
	  : proc(WorkerProcess::value), verbosity(verbosity), process_id(process_id), thread_id(thread_id) {}
	Logger(const Logger&) noexcept = default;
	Logger& operator=(const Logger&) noexcept = default;
	void setVerbosity(int value) noexcept {
		assert(value >= VERBOSE_NONE && value <= VERBOSE_DEBUG);
		verbosity = value;
	}
	// Formats and emits one line if log_level passes the verbosity filter.
	// args are forwarded to fmt::format_to (format string + values).
	template <typename... Args>
	void printWithLogLevel(int log_level, std::string_view header, Args&&... args) {
		assert(log_level >= VERBOSE_NONE && log_level <= VERBOSE_DEBUG);
		if (log_level <= verbosity) {
			const auto fp = log_level == VERBOSE_NONE ? stderr : stdout;
			// 500B inline buffer
			auto buf = fmt::memory_buffer{};
			putHeader(buf, header);
			fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
			fmt::print(fp, "{}\n", std::string_view(buf.data(), buf.size()));
		}
	}
	template <typename... Args>
	void error(Args&&... args) {
		printWithLogLevel(VERBOSE_NONE, "ERROR", std::forward<Args>(args)...);
	}
	template <typename... Args>
	void info(Args&&... args) {
		printWithLogLevel(VERBOSE_DEFAULT, "INFO", std::forward<Args>(args)...);
	}
	template <typename... Args>
	void warn(Args&&... args) {
		printWithLogLevel(VERBOSE_WARN, "WARNING", std::forward<Args>(args)...);
	}
	template <typename... Args>
	void debug(Args&&... args) {
		printWithLogLevel(VERBOSE_DEBUG, "DEBUG", std::forward<Args>(args)...);
	}
};
} // namespace mako
#endif /*MAKO_LOGGER_HPP*/

View File

@ -0,0 +1,32 @@
/*
* macro.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_MACRO_HPP
#define MAKO_MACRO_HPP
#if defined(__GNUG__)
#define force_inline inline __attribute__((__always_inline__))
#elif defined(_MSC_VER)
#define force_inline __forceinline
#else
#error Missing force inline
#endif
#endif /*MAKO_MACRO_HPP*/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,209 +0,0 @@
#ifndef MAKO_H
#define MAKO_H
#pragma once
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#endif
#include <foundationdb/fdb_c.h>
#include <pthread.h>
#include <sys/types.h>
#include <stdbool.h>
#if defined(__linux__)
#include <linux/limits.h>
#elif defined(__APPLE__)
#include <sys/syslimits.h>
#else
#include <limits.h>
#endif
#define VERBOSE_NONE 0
#define VERBOSE_DEFAULT 1
#define VERBOSE_ANNOYING 2
#define VERBOSE_DEBUG 3
#define MODE_INVALID -1
#define MODE_CLEAN 0
#define MODE_BUILD 1
#define MODE_RUN 2
#define FDB_SUCCESS 0
#define FDB_ERROR_RETRY -1
#define FDB_ERROR_ABORT -2
#define FDB_ERROR_CONFLICT -3
#define LAT_BLOCK_SIZE 511 /* size of each block to get detailed latency for each operation */
/* transaction specification */
/* operation identifiers; the per-op stat arrays (ops/errors/latency_*) are indexed by these */
enum Operations {
	OP_GETREADVERSION,
	OP_GET,
	OP_GETRANGE,
	OP_SGET,
	OP_SGETRANGE,
	OP_UPDATE,
	OP_INSERT,
	OP_INSERTRANGE,
	OP_OVERWRITE,
	OP_CLEAR,
	OP_SETCLEAR,
	OP_CLEARRANGE,
	OP_SETCLEARRANGE,
	OP_COMMIT,
	OP_TRANSACTION, /* pseudo-operation - cumulative time for the operation + commit */
	OP_READ_BG,
	MAX_OP /* must be the last item */
};
#define OP_COUNT 0
#define OP_RANGE 1
#define OP_REVERSE 2
/* for long arguments */
/* NOTE(review): presumably the option ids returned by getopt_long — confirm in the option table */
enum Arguments {
	ARG_KEYLEN,
	ARG_VALLEN,
	ARG_TPS,
	ARG_COMMITGET,
	ARG_SAMPLING,
	ARG_VERSION,
	ARG_KNOBS,
	ARG_FLATBUFFERS,
	ARG_LOGGROUP,
	ARG_PREFIXPADDING,
	ARG_TRACE,
	ARG_TRACEPATH,
	ARG_TRACEFORMAT,
	ARG_TPSMAX,
	ARG_TPSMIN,
	ARG_TPSINTERVAL,
	ARG_TPSCHANGE,
	ARG_TXNTRACE,
	ARG_TXNTAGGING,
	ARG_TXNTAGGINGPREFIX,
	ARG_STREAMING_MODE,
	ARG_DISABLE_RYW,
	ARG_CLIENT_THREADS_PER_VERSION,
	ARG_JSON_REPORT,
	ARG_BG_FILE_PATH // if blob granule files are stored locally, mako will read and materialize them if this is set
};
/* NOTE(review): selected via ARG_TPSCHANGE; names suggest sine/square/pulse TPS waveforms — confirm */
enum TPSChangeTypes { TPS_SIN, TPS_SQUARE, TPS_PULSE };
#define KEYPREFIX "mako"
#define KEYPREFIXLEN 4
#define TEMP_DATA_STORE "/tmp/makoTemp"
/* we set mako_txnspec_t and mako_args_t only once in the master process,
* and won't be touched by child processes.
*/
typedef struct {
	/* for each operation, it stores "count", "range" and "reverse" */
	/* inner index is OP_COUNT / OP_RANGE / OP_REVERSE */
	int ops[MAX_OP][3];
} mako_txnspec_t;
#define LOGGROUP_MAX 256
#define KNOB_MAX 256
#define TAGPREFIXLENGTH_MAX 8
#define NUM_CLUSTERS_MAX 3
#define NUM_DATABASES_MAX 10
#define MAX_BG_IDS 1000
/* benchmark parameters */
/* benchmark configuration; written once by the master process (see note above)
 * and read-only in the child processes */
typedef struct {
	int api_version;
	int json;
	int num_processes;
	int num_threads;
	int mode; /* MODE_CLEAN / MODE_BUILD / MODE_RUN */
	int rows; /* is 2 billion enough? */
	int seconds;
	int iteration;
	int tpsmax;
	int tpsmin;
	int tpsinterval;
	int tpschange; /* one of enum TPSChangeTypes */
	int sampling; /* latency recorded for 1 in every `sampling` transactions */
	int key_length;
	int value_length;
	int zipf;
	int commit_get;
	int verbose; /* one of the VERBOSE_* levels */
	mako_txnspec_t txnspec;
	char cluster_files[NUM_CLUSTERS_MAX][PATH_MAX];
	int num_fdb_clusters;
	int num_databases;
	char log_group[LOGGROUP_MAX];
	int prefixpadding;
	int trace;
	char tracepath[PATH_MAX];
	int traceformat; /* 0 - XML, 1 - JSON */
	char knobs[KNOB_MAX];
	uint8_t flatbuffers;
	int txntrace;
	int txntagging;
	char txntagging_prefix[TAGPREFIXLENGTH_MAX];
	FDBStreamingMode streaming_mode;
	int client_threads_per_version;
	int disable_ryw;
	char json_output_path[PATH_MAX];
	bool bg_materialize_files; /* materialize locally stored blob granule files */
	char bg_file_path[PATH_MAX];
} mako_args_t;
/* shared memory */
#define SIGNAL_RED 0
#define SIGNAL_GREEN 1
#define SIGNAL_OFF 2
/* header of the shared-memory region coordinating master/stats/worker processes */
typedef struct {
	int signal; /* SIGNAL_RED / SIGNAL_GREEN / SIGNAL_OFF */
	int readycount;
	double throttle_factor;
	int stopcount;
} mako_shmhdr_t;
/* memory block allocated to each operation when collecting detailed latency */
typedef struct {
	uint64_t data[LAT_BLOCK_SIZE];
	void* next_block; /* presumably the next lat_block_t in a chain, or NULL — typed void* here */
} lat_block_t;
/* aggregate counters; the per-op arrays are indexed by enum Operations */
typedef struct {
	uint64_t xacts;
	uint64_t conflicts;
	uint64_t ops[MAX_OP];
	uint64_t errors[MAX_OP];
	uint64_t latency_samples[MAX_OP];
	uint64_t latency_us_total[MAX_OP];
	uint64_t latency_us_min[MAX_OP];
	uint64_t latency_us_max[MAX_OP];
} mako_stats_t;
/* per-process information */
typedef struct {
	int worker_id; /* index of this worker process */
	pid_t parent_id; /* pid of the parent (master) process */
	mako_args_t* args; /* shared benchmark configuration */
	mako_shmhdr_t* shm; /* shared-memory control block */
	FDBDatabase* databases[NUM_DATABASES_MAX];
} process_info_t;
/* args for threads */
typedef struct {
	int thread_id;
	int database_index; // index of the database to do work to
	int elem_size[MAX_OP]; /* stores the multiple of LAT_BLOCK_SIZE to check the memory allocation of each operation */
	bool is_memory_allocated[MAX_OP]; /* flag specified for each operation, whether the memory was allocated to that
	                                     specific operation */
	lat_block_t* block[MAX_OP]; /* per-operation latency block storage */
	process_info_t* process; /* owning process context, shared by the process's threads */
} thread_args_t;
/* process type */
/* role of the current process: master, forked worker, or stats emitter */
typedef enum { proc_master = 0, proc_worker, proc_stats } proc_type_t;
#endif /* MAKO_H */

View File

@ -0,0 +1,168 @@
/*
* mako.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_HPP
#define MAKO_HPP
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#endif
#include <array>
#include <atomic>
#include <cassert>
#include <chrono>
#include <list>
#include <vector>
#include <string_view>
#include <fdb_api.hpp>
#include <pthread.h>
#include <sys/types.h>
#include <stdbool.h>
#include "limit.hpp"
namespace mako {
constexpr const int MODE_INVALID = -1;
constexpr const int MODE_CLEAN = 0;
constexpr const int MODE_BUILD = 1;
constexpr const int MODE_RUN = 2;
/* for long arguments */
// NOTE(review): presumably the option ids returned by getopt_long — confirm in the option table
enum ArgKind {
	ARG_KEYLEN,
	ARG_VALLEN,
	ARG_TPS,
	ARG_ASYNC,
	ARG_COMMITGET,
	ARG_SAMPLING,
	ARG_VERSION,
	ARG_KNOBS,
	ARG_FLATBUFFERS,
	ARG_LOGGROUP,
	ARG_PREFIXPADDING,
	ARG_TRACE,
	ARG_TRACEPATH,
	ARG_TRACEFORMAT,
	ARG_TPSMAX,
	ARG_TPSMIN,
	ARG_TPSINTERVAL,
	ARG_TPSCHANGE,
	ARG_TXNTRACE,
	ARG_TXNTAGGING,
	ARG_TXNTAGGINGPREFIX,
	ARG_STREAMING_MODE,
	ARG_DISABLE_RYW,
	ARG_CLIENT_THREADS_PER_VERSION,
	ARG_JSON_REPORT,
	ARG_BG_FILE_PATH // if blob granule files are stored locally, mako will read and materialize them if this is set
};
constexpr const int OP_COUNT = 0;
constexpr const int OP_RANGE = 1;
constexpr const int OP_REVERSE = 2;
/* transaction specification */
// operation identifiers; per-op stat/latency containers are indexed by these
enum OpKind {
	OP_GETREADVERSION,
	OP_GET,
	OP_GETRANGE,
	OP_SGET,
	OP_SGETRANGE,
	OP_UPDATE,
	OP_INSERT,
	OP_INSERTRANGE,
	OP_OVERWRITE,
	OP_CLEAR,
	OP_SETCLEAR,
	OP_CLEARRANGE,
	OP_SETCLEARRANGE,
	OP_COMMIT,
	OP_TRANSACTION, /* pseudo-operation - time it takes to run one iteration of ops sequence */
	OP_READ_BG,
	MAX_OP /* must be the last item */
};
// NOTE(review): selected via ARG_TPSCHANGE; names suggest sine/square/pulse TPS waveforms — confirm
enum TPSChangeTypes { TPS_SIN, TPS_SQUARE, TPS_PULSE };
/* we set WorkloadSpec and Arguments only once in the master process,
* and won't be touched by child processes.
*/
struct WorkloadSpec {
	/* for each operation, it stores "count", "range" and "reverse" */
	// inner index is OP_COUNT / OP_RANGE / OP_REVERSE
	int ops[MAX_OP][3];
};
constexpr const int LOGGROUP_MAX = 256;
constexpr const int KNOB_MAX = 256;
constexpr const int TAGPREFIXLENGTH_MAX = 8;
constexpr const int NUM_CLUSTERS_MAX = 3;
constexpr const int NUM_DATABASES_MAX = 10;
constexpr const std::string_view KEY_PREFIX{ "mako" };
constexpr const std::string_view TEMP_DATA_STORE{ "/tmp/makoTemp" };
/* benchmark parameters */
// benchmark configuration; written once by the master process (see note above)
// and treated as read-only by child processes
struct Arguments {
	int api_version;
	int json;
	int num_processes;
	int num_threads;
	int async_xacts; // async transactions per worker process; > 0 enables the non-blocking scheduler
	int mode; // MODE_CLEAN / MODE_BUILD / MODE_RUN
	int rows; /* is 2 billion enough? */
	int row_digits; // presumably decimal digit count of `rows`, cached for key formatting — confirm
	int seconds;
	int iteration;
	int tpsmax;
	int tpsmin;
	int tpsinterval;
	int tpschange; // one of TPSChangeTypes
	int sampling; // latency recorded for 1 in every `sampling` transactions
	int key_length;
	int value_length;
	int zipf;
	int commit_get;
	int verbose; // one of the VERBOSE_* levels
	WorkloadSpec txnspec;
	char cluster_files[NUM_CLUSTERS_MAX][PATH_MAX];
	int num_fdb_clusters;
	int num_databases;
	char log_group[LOGGROUP_MAX];
	int prefixpadding;
	int trace;
	char tracepath[PATH_MAX];
	int traceformat; /* 0 - XML, 1 - JSON */
	char knobs[KNOB_MAX];
	uint8_t flatbuffers;
	int txntrace;
	int txntagging;
	char txntagging_prefix[TAGPREFIXLENGTH_MAX];
	FDBStreamingMode streaming_mode;
	int64_t client_threads_per_version;
	int disable_ryw;
	char json_output_path[PATH_MAX];
	bool bg_materialize_files; // materialize locally stored blob granule files
	char bg_file_path[PATH_MAX];
};
} // namespace mako
#endif /* MAKO_HPP */

View File

@ -53,6 +53,13 @@ Arguments
- | ``-t | --threads <threads>``
| Number of threads per worker process (Default: 1)
| With ``--async_xacts <xacts>`` == 0 (Default), each of the ``<threads>`` operates on a transaction object with blocking API calls
| Otherwise, all of the ``<threads>`` run an asynchronous job scheduler, serving ``<xacts>`` transactions
- | ``--async_xacts <xacts>``
| Number of transactions per worker process to run asynchronously (Default: 0)
| ``<xacts>`` > 0 switches the execution mode to non-blocking (See ``-t | --threads``), with the exception of blob granules API
| Note: throttling options, e.g. ``--tpsmax``, ``--tpsmin``, ``--tpschange``, ``--tpsinterval``, are ignored in asynchronous mode
- | ``-r | --rows <rows>``
| Number of rows initially populated (Default: 100000)

View File

@ -0,0 +1,275 @@
/*
* operations.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "blob_granules.hpp"
#include "operations.hpp"
#include "mako.hpp"
#include "logger.hpp"
#include "utils.hpp"
#include <array>
extern thread_local mako::Logger logr;
namespace mako {
using namespace fdb;
const std::array<Operation, MAX_OP> opTable{
{ { "GRV",
{ { StepKind::READ,
[](Transaction& tx, Arguments const&, ByteString&, ByteString&, ByteString&) {
return tx.getReadVersion().eraseType();
},
[](Future& f, Transaction&, Arguments const&, ByteString&, ByteString&, ByteString&) {
if (f && !f.error()) {
f.get<future_var::Int64>();
}
} } },
1,
false },
{ "GET",
{ { StepKind::READ,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString&) {
return tx.get(key, false /*snapshot*/).eraseType();
},
[](Future& f, Transaction&, Arguments const&, ByteString&, ByteString&, ByteString& val) {
if (f && !f.error()) {
f.get<future_var::Value>();
}
} } },
1,
false },
{ "GETRANGE",
{ { StepKind::READ,
[](Transaction& tx, Arguments const& args, ByteString& begin, ByteString& end, ByteString&) {
return tx
.getRange(key_select::firstGreaterOrEqual(begin),
key_select::lastLessOrEqual(end, 1),
0 /*limit*/,
0 /*target_bytes*/,
args.streaming_mode,
0 /*iteration*/,
false /*snapshot*/,
args.txnspec.ops[OP_GETRANGE][OP_REVERSE])
.eraseType();
},
[](Future& f, Transaction&, Arguments const&, ByteString&, ByteString&, ByteString& val) {
if (f && !f.error()) {
f.get<future_var::KeyValueArray>();
}
} } },
1,
false },
{ "SGET",
{ { StepKind::READ,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString&) {
return tx.get(key, true /*snapshot*/).eraseType();
},
[](Future& f, Transaction&, Arguments const&, ByteString&, ByteString&, ByteString& val) {
if (f && !f.error()) {
f.get<future_var::Value>();
}
} } },
1,
false },
{ "SGETRANGE",
{ {
StepKind::READ,
[](Transaction& tx, Arguments const& args, ByteString& begin, ByteString& end, ByteString&) {
return tx
.getRange(key_select::firstGreaterOrEqual(begin),
key_select::lastLessOrEqual(end, 1),
0 /*limit*/,
0 /*target_bytes*/,
args.streaming_mode,
0 /*iteration*/,
true /*snapshot*/,
args.txnspec.ops[OP_GETRANGE][OP_REVERSE])
.eraseType();
},
[](Future& f, Transaction&, Arguments const&, ByteString&, ByteString&, ByteString& val) {
if (f && !f.error()) {
f.get<future_var::KeyValueArray>();
}
} } },
1,
false },
{ "UPDATE",
{ { StepKind::READ,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString&) {
return tx.get(key, false /*snapshot*/).eraseType();
},
[](Future& f, Transaction&, Arguments const&, ByteString&, ByteString&, ByteString& val) {
if (f && !f.error()) {
f.get<future_var::Value>();
}
} },
{ StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString& value) {
randomString(value.data(), args.value_length);
tx.set(key, value);
return Future();
} } },
2,
true },
{ "INSERT",
{ { StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString& value) {
// key[0..args.key_length] := concat(key_prefix, random_string)
randomString(key.data() + intSize(KEY_PREFIX), args.key_length - intSize(KEY_PREFIX));
randomString(value.data(), args.value_length);
tx.set(key, value);
return Future();
} } },
1,
true },
{ "INSERTRANGE",
{ { StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString& value) {
randomString(value.data(), args.value_length);
// key[0..args.key_length] := concat(prefix, random_string, num[0..range_digits])
const auto range = args.txnspec.ops[OP_INSERTRANGE][OP_RANGE];
assert(range > 0);
const auto range_digits = digits(range);
const auto random_len = args.key_length - intSize(KEY_PREFIX) - range_digits;
randomString(&key[intSize(KEY_PREFIX)], random_len);
for (auto i = 0; i < range; i++) {
numericWithFill(&key[args.key_length - range_digits], range_digits, i);
tx.set(key, value);
}
return Future();
} } },
1,
true },
{ "OVERWRITE",
{ { StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString& value) {
randomString(value.data(), args.value_length);
tx.set(key, value);
return Future();
} } },
1,
true },
{ "CLEAR",
{ { StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString&) {
tx.clear(key);
return Future();
} } },
1,
true },
{ "SETCLEAR",
{ { StepKind::COMMIT,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString& value) {
randomString(&key[KEY_PREFIX.size()], args.key_length - intSize(KEY_PREFIX));
randomString(value.data(), args.value_length);
tx.set(key, value);
return tx.commit().eraseType();
} },
{ StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& key, ByteString&, ByteString&) {
tx.reset(); // assuming commit from step 0 worked.
tx.clear(key); // key should forward unchanged from step 0
return Future();
} } },
2,
true },
{ "CLEARRANGE",
{ { StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& begin, ByteString& end, ByteString&) {
tx.clearRange(begin, end);
return Future();
} } },
1,
true },
{ "SETCLEARRANGE",
{ { StepKind::COMMIT,
[](Transaction& tx, Arguments const& args, ByteString& key_begin, ByteString& key, ByteString& value) {
randomString(value.data(), args.value_length);
// key[0..args.key_length] := concat(prefix, random_string, num[0..range_digits])
const auto range = args.txnspec.ops[OP_SETCLEARRANGE][OP_RANGE];
assert(range > 0);
const auto range_digits = digits(range);
const auto random_len = args.key_length - intSize(KEY_PREFIX) - range_digits;
randomString(&key[KEY_PREFIX.size()], random_len);
for (auto i = 0; i < range; i++) {
numericWithFill(&key[args.key_length - range_digits], range_digits, i);
tx.set(key, value);
if (i == 0)
key_begin.assign(key);
}
return tx.commit().eraseType();
} },
{ StepKind::IMM,
[](Transaction& tx, Arguments const& args, ByteString& begin, ByteString& end, ByteString&) {
tx.reset();
tx.clearRange(begin, end);
return Future();
} } },
2,
true },
{ "COMMIT", { { StepKind::NONE, nullptr } }, 0, false },
{ "TRANSACTION", { { StepKind::NONE, nullptr } }, 0, false },
{ "READBLOBGRANULE",
{ { StepKind::ON_ERROR,
[](Transaction& tx, Arguments const& args, ByteString& begin, ByteString& end, ByteString&) {
auto err = Error{};
err = tx.setOptionNothrow(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, BytesRef());
if (err) {
// Issuing read/writes before disabling RYW results in error.
// Possible malformed workload?
// As workloads execute in sequence, retrying would likely repeat this error.
fmt::print(stderr, "ERROR: TR_OPTION_READ_YOUR_WRITES_DISABLE: {}", err.what());
return Future();
}
// Allocate a separate context per call to avoid multiple threads accessing
auto user_context = blob_granules::local_file::UserContext(args.bg_file_path);
auto api_context = blob_granules::local_file::createApiContext(user_context, args.bg_materialize_files);
auto r = tx.readBlobGranules(begin,
end,
0 /* beginVersion*/,
-2, /* endVersion. -2 (latestVersion) is use txn read version */
api_context);
user_context.clear();
auto out = Result::KeyValueArray{};
err = r.getKeyValueArrayNothrow(out);
if (!err || err.is(2037 /*blob_granule_not_materialized*/))
return Future();
const auto level = (err.is(1020 /*not_committed*/) || err.is(1021 /*commit_unknown_result*/) ||
err.is(1213 /*tag_throttled*/))
? VERBOSE_WARN
: VERBOSE_NONE;
logr.printWithLogLevel(level, "ERROR", "get_keyvalue_array() after readBlobGranules(): {}", err.what());
return tx.onError(err).eraseType();
} } },
1,
false } }
};
} // namespace mako

View File

@ -0,0 +1,140 @@
/*
* operations.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_OPERATIONS_HPP
#define MAKO_OPERATIONS_HPP
#include <fdb_api.hpp>
#include <array>
#include <cassert>
#include <string_view>
#include <tuple>
#include <utility>
#include <vector>
#include "macro.hpp"
#include "mako.hpp"
namespace mako {
// Determines how the future produced by an op step must be handled by the
// transaction loop (whether to block on it, commit after it, etc.).
enum class StepKind {
	NONE, ///< not part of the table: OP_TRANSACTION, OP_COMMIT
	IMM, ///< non-future ops that return immediately: e.g. set, clear_range
	READ, ///< blockable reads: get(), get_range(), get_read_version, ...
	COMMIT, ///< self-explanatory
	ON_ERROR ///< future is a result of tx.on_error()
};
// Pseudo-ops (OP_COMMIT, OP_TRANSACTION) have no concrete steps to execute;
// they exist in the op list for measurement/bookkeeping only.
force_inline bool isAbstractOp(int op) noexcept {
	return (op == OP_TRANSACTION) || (op == OP_COMMIT);
}
using StepFunction = fdb::Future (*)(fdb::Transaction& tx,
Arguments const&,
fdb::ByteString& /*key1*/,
fdb::ByteString& /*key2*/,
fdb::ByteString& /*value*/);
using PostStepFunction = void (*)(fdb::Future&,
fdb::Transaction& tx,
Arguments const&,
fdb::ByteString& /*key1*/,
fdb::ByteString& /*key2*/,
fdb::ByteString& /*value*/);
// One executable step of an operation: how its future is handled (kind),
// the function that issues it, and an optional callback run after the
// future resolves.
struct Step {
	StepKind kind;
	StepFunction step_func_;
	PostStepFunction post_step_func_{ nullptr };
};

// Static description of one benchmark operation: display name, up to two
// executable steps, and whether a commit must eventually follow.
struct Operation {
	std::string_view name_;
	Step steps_[2];
	int num_steps_;
	bool needs_commit_;

	// display name as used in stats/trace output
	std::string_view name() const noexcept { return name_; }

	StepKind stepKind(int step) const noexcept {
		assert(step < steps());
		return steps_[step].kind;
	}

	StepFunction stepFunction(int step) const noexcept { return steps_[step].step_func_; }

	PostStepFunction postStepFunction(int step) const noexcept { return steps_[step].post_step_func_; }

	// how many steps in this op?
	int steps() const noexcept { return num_steps_; }

	// does the op need to commit some time after its final step?
	bool needsCommit() const noexcept { return needs_commit_; }
};
extern const std::array<Operation, MAX_OP> opTable;
// Human-readable name of an op code; empty string for out-of-range codes.
// NOTE(review): returns string_view::data(), which is null-terminated only
// because the names in opTable are string literals — confirm if names ever
// come from elsewhere.
force_inline char const* getOpName(int ops_code) {
	if (ops_code < 0 || ops_code >= MAX_OP)
		return "";
	return opTable[ops_code].name().data();
}
// Position within a workload spec: the operation code, the repetition count
// within that op, and the step index inside the op. Advanced with
// getOpBegin()/getOpNext(); OpEnd marks exhaustion.
struct OpIterator {
	int op, count, step;

	bool operator==(const OpIterator& other) const noexcept {
		return op == other.op && count == other.count && step == other.step;
	}

	bool operator!=(const OpIterator& other) const noexcept { return !(*this == other); }

	// step kind of the step currently pointed to
	StepKind stepKind() const noexcept { return opTable[op].stepKind(step); }

	// name of the op currently pointed to
	char const* opName() const noexcept { return getOpName(op); }
};
constexpr const OpIterator OpEnd = OpIterator{ MAX_OP, -1, -1 };
// First (op, count, step) triple to execute for this workload spec,
// skipping abstract ops and ops configured with a zero count.
force_inline OpIterator getOpBegin(Arguments const& args) noexcept {
	auto op = 0;
	while (op < MAX_OP && (isAbstractOp(op) || args.txnspec.ops[op][OP_COUNT] == 0))
		op++;
	return (op == MAX_OP) ? OpEnd : OpIterator{ op, 0, 0 };
}
// Advance the workload iterator: first through remaining steps of the
// current op, then to the next repetition (count), then on to the next op
// that has work configured. Returns OpEnd when the spec is exhausted.
force_inline OpIterator getOpNext(Arguments const& args, OpIterator current) noexcept {
	// NOTE: binding by reference mutates the local copy 'current' in place below.
	auto& [op, count, step] = current;
	assert(op < MAX_OP && !isAbstractOp(op));
	// more steps remain in the current op?
	if (opTable[op].steps() > step + 1)
		return OpIterator{ op, count, step + 1 };
	count++;
	// search for the next (op, count) with work to do; count resets to 0
	// every time we move past the current op
	for (; op < MAX_OP; op++, count = 0) {
		if (isAbstractOp(op) || args.txnspec.ops[op][OP_COUNT] <= count)
			continue;
		return OpIterator{ op, count, 0 };
	}
	return OpEnd;
}
} // namespace mako
#endif /* MAKO_OPERATIONS_HPP */

View File

@ -0,0 +1,26 @@
/*
* process.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_PROCESS_HPP
#define MAKO_PROCESS_HPP

// Role of a mako process. NOTE(review): semantics inferred from names —
// MAIN presumably coordinates, WORKERs run transactions, STATS reports;
// confirm against the process-spawning code.
enum class ProcKind { MAIN, WORKER, STATS };

#endif /*MAKO_PROCESS_HPP*/

View File

@ -0,0 +1,108 @@
/*
* shm.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_SHM_HPP
#define MAKO_SHM_HPP
#include <atomic>
#include <cassert>
#include <cstdint>
#include "stats.hpp"
/* shared memory */
constexpr const int SIGNAL_RED = 0;
constexpr const int SIGNAL_GREEN = 1;
constexpr const int SIGNAL_OFF = 2;
// controlled, safer access to shared memory
namespace mako::shared_memory {
// Control block at the start of the shared-memory region.
struct Header {
	std::atomic<int> signal = ATOMIC_VAR_INIT(SIGNAL_OFF); // SIGNAL_RED/GREEN/OFF broadcast
	std::atomic<int> readycount = ATOMIC_VAR_INIT(0); // presumably workers that finished setup — confirm
	std::atomic<double> throttle_factor = ATOMIC_VAR_INIT(1.0); // multiplier applied to target rate
	std::atomic<int> stopcount = ATOMIC_VAR_INIT(0); // presumably workers that have stopped — confirm
};

// Layout marker: a Header followed by a flat array of ThreadStatistics.
// The single 'stats' member is the first element; the rest of the array
// lives in the extra bytes allocated per storageSize().
struct LayoutHelper {
	Header hdr;
	ThreadStatistics stats;
};

// Bytes required for one Header plus one ThreadStatistics slot per
// (process, thread) pair. The "- 1" accounts for the slot already inside
// LayoutHelper.
inline size_t storageSize(int num_processes, int num_threads) noexcept {
	assert(num_processes >= 1 && num_threads >= 1);
	return sizeof(LayoutHelper) + sizeof(ThreadStatistics) * ((num_processes * num_threads) - 1);
}
// Typed, non-owning view over the raw shared-memory region: one Header
// followed by a row-major [process][thread] array of ThreadStatistics.
class Access {
	void* base; // start of the region (address of the Header)
	int num_processes;
	int num_threads;

	// address of the stats slot for (process_idx, thread_idx); slots start
	// at LayoutHelper::stats and continue past the end of LayoutHelper
	static inline ThreadStatistics& statsSlot(void* shm_base,
	                                          int num_threads,
	                                          int process_idx,
	                                          int thread_idx) noexcept {
		return (&static_cast<LayoutHelper*>(shm_base)->stats)[process_idx * num_threads + thread_idx];
	}

public:
	Access(void* shm, int num_processes, int num_threads) noexcept
	  : base(shm), num_processes(num_processes), num_threads(num_threads) {}

	Access() noexcept : Access(nullptr, 0, 0) {}
	Access(const Access&) noexcept = default;
	Access& operator=(const Access&) noexcept = default;

	// total bytes this view expects the region to span
	size_t size() const noexcept { return storageSize(num_processes, num_threads); }

	// placement-construct the Header and every stats slot in the raw region.
	// NOTE(review): presumably called once by the process that created the
	// shared memory, before workers attach — confirm against the caller.
	void initMemory() noexcept {
		new (&header()) Header{};
		for (auto i = 0; i < num_processes; i++)
			for (auto j = 0; j < num_threads; j++)
				new (&statsSlot(i, j)) ThreadStatistics();
	}

	Header const& headerConst() const noexcept { return *static_cast<Header const*>(base); }
	Header& header() const noexcept { return *static_cast<Header*>(base); }

	// pointer to the first slot of the flat num_processes * num_threads array
	ThreadStatistics const* statsConstArray() const noexcept {
		return &statsSlot(base, num_threads, 0 /*process_id*/, 0 /*thread_id*/);
	}
	ThreadStatistics* statsArray() const noexcept {
		return &statsSlot(base, num_threads, 0 /*process_id*/, 0 /*thread_id*/);
	}

	ThreadStatistics const& statsConstSlot(int process_idx, int thread_idx) const noexcept {
		return statsSlot(base, num_threads, process_idx, thread_idx);
	}
	ThreadStatistics& statsSlot(int process_idx, int thread_idx) const noexcept {
		return statsSlot(base, num_threads, process_idx, thread_idx);
	}
};
} // namespace mako::shared_memory
#endif /* MAKO_SHM_HPP */

View File

@ -0,0 +1,177 @@
/*
* stats.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_STATS_HPP
#define MAKO_STATS_HPP
#include <array>
#include <cstdint>
#include <cstring>
#include <list>
#include <new>
#include <utility>
#include "operations.hpp"
#include "time.hpp"
namespace mako {
/* rough cap on the number of samples to avoid OOM hindering benchmark */
constexpr const size_t SAMPLE_CAP = 2000000;
/* size of each block to get detailed latency for each operation */
constexpr const size_t LAT_BLOCK_SIZE = 4093;
/* hard cap on the number of sample blocks = 488 */
constexpr const size_t MAX_LAT_BLOCKS = SAMPLE_CAP / LAT_BLOCK_SIZE;
/* memory block allocated to each operation when collecting detailed latency */
class LatencySampleBlock {
uint64_t samples[LAT_BLOCK_SIZE]{
0,
};
uint64_t index{ 0 };
public:
LatencySampleBlock() noexcept = default;
bool full() const noexcept { return index >= LAT_BLOCK_SIZE; }
void put(timediff_t td) {
assert(!full());
samples[index++] = toIntegerMicroseconds(td);
}
// return {data block, number of samples}
std::pair<uint64_t const*, size_t> data() const noexcept { return { samples, index }; }
};
/* collect sampled latencies until OOM is hit */
class LatencySampleBin {
	std::list<LatencySampleBlock> blocks;
	bool noMoreAlloc{ false }; // latched true after the first failed allocation

	// append a fresh block; on bad_alloc, latch noMoreAlloc and return false
	// so sampling degrades gracefully instead of aborting the benchmark
	bool tryAlloc() {
		try {
			blocks.emplace_back();
		} catch (const std::bad_alloc&) {
			noMoreAlloc = true;
			return false;
		}
		return true;
	}

public:
	// pre-allocate the first block so the hot path rarely allocates
	void reserveOneBlock() {
		if (blocks.empty())
			tryAlloc();
	}

	// record one latency sample; silently drops it once the block cap
	// (MAX_LAT_BLOCKS) is reached or an allocation has previously failed
	void put(timediff_t td) {
		if (blocks.empty() || blocks.back().full()) {
			if (blocks.size() >= MAX_LAT_BLOCKS || noMoreAlloc || !tryAlloc())
				return;
		}
		blocks.back().put(td);
	}

	// iterate & apply for each block user function void(uint64_t const*, size_t)
	template <typename Func>
	void forEachBlock(Func&& fn) const {
		for (const auto& block : blocks) {
			auto [ptr, cnt] = block.data();
			fn(ptr, cnt);
		}
	}
};
// Per-thread benchmark counters. alignas(64) — presumably to keep adjacent
// instances (which sit side by side in shared memory) on separate cache
// lines; confirm intent.
class alignas(64) ThreadStatistics {
	uint64_t conflicts;
	uint64_t total_errors;
	uint64_t ops[MAX_OP]; // completed executions per op
	uint64_t errors[MAX_OP]; // failed executions per op
	uint64_t latency_samples[MAX_OP];
	uint64_t latency_us_total[MAX_OP];
	uint64_t latency_us_min[MAX_OP];
	uint64_t latency_us_max[MAX_OP];

public:
	ThreadStatistics() noexcept {
		// all members are uint64_t, so zero-filling the object is well-defined
		memset(this, 0, sizeof(ThreadStatistics));
		// 0xff-fill sets every min to UINT64_MAX so the first sample wins
		memset(latency_us_min, 0xff, sizeof(latency_us_min));
	}

	ThreadStatistics(const ThreadStatistics& other) noexcept = default;
	ThreadStatistics& operator=(const ThreadStatistics& other) noexcept = default;

	uint64_t getConflictCount() const noexcept { return conflicts; }
	uint64_t getOpCount(int op) const noexcept { return ops[op]; }
	uint64_t getErrorCount(int op) const noexcept { return errors[op]; }
	uint64_t getTotalErrorCount() const noexcept { return total_errors; }
	uint64_t getLatencySampleCount(int op) const noexcept { return latency_samples[op]; }
	uint64_t getLatencyUsTotal(int op) const noexcept { return latency_us_total[op]; }
	uint64_t getLatencyUsMin(int op) const noexcept { return latency_us_min[op]; }
	uint64_t getLatencyUsMax(int op) const noexcept { return latency_us_max[op]; }

	// with 'this' as final aggregation, factor in 'other'
	void combine(const ThreadStatistics& other) {
		conflicts += other.conflicts;
		for (auto op = 0; op < MAX_OP; op++) {
			ops[op] += other.ops[op];
			errors[op] += other.errors[op];
			total_errors += other.errors[op]; // summing per-op == other.total_errors
			latency_samples[op] += other.latency_samples[op];
			latency_us_total[op] += other.latency_us_total[op];
			if (latency_us_min[op] > other.latency_us_min[op])
				latency_us_min[op] = other.latency_us_min[op];
			if (latency_us_max[op] < other.latency_us_max[op])
				latency_us_max[op] = other.latency_us_max[op];
		}
	}

	void incrConflictCount() noexcept { conflicts++; }

	// non-commit write operations aren't measured for time.
	void incrOpCount(int op) noexcept { ops[op]++; }

	void incrErrorCount(int op) noexcept {
		total_errors++;
		errors[op]++;
	}

	// record one latency observation for op, updating count/total/min/max
	void addLatency(int op, timediff_t diff) noexcept {
		const auto latency_us = toIntegerMicroseconds(diff);
		latency_samples[op]++;
		latency_us_total[op] += latency_us;
		if (latency_us_min[op] > latency_us)
			latency_us_min[op] = latency_us;
		if (latency_us_max[op] < latency_us)
			latency_us_max[op] = latency_us;
	}
};
using LatencySampleBinArray = std::array<LatencySampleBin, MAX_OP>;
} // namespace mako
#endif /* MAKO_STATS_HPP */

View File

@ -0,0 +1,77 @@
/*
* time.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MAKO_TIME_HPP
#define MAKO_TIME_HPP
#include <chrono>
namespace mako {
/* time measurement helpers */
using std::chrono::steady_clock;
using timepoint_t = decltype(steady_clock::now());
using timediff_t = decltype(std::declval<timepoint_t>() - std::declval<timepoint_t>());

// duration -> seconds as a double (fractional part preserved)
template <typename Duration>
double toDoubleSeconds(Duration duration) {
	using DblSeconds = std::chrono::duration<double>;
	return std::chrono::duration_cast<DblSeconds>(duration).count();
}

// duration -> whole seconds (truncating)
template <typename Duration>
uint64_t toIntegerSeconds(Duration duration) {
	using UIntSeconds = std::chrono::duration<uint64_t>;
	return std::chrono::duration_cast<UIntSeconds>(duration).count();
}

// duration -> whole microseconds (truncating)
template <typename Duration>
uint64_t toIntegerMicroseconds(Duration duration) {
	using UIntMicros = std::chrono::duration<uint64_t, std::micro>;
	return std::chrono::duration_cast<UIntMicros>(duration).count();
}

// timing helpers
// tag type selecting the "start immediately" Stopwatch constructor
struct StartAtCtor {};

// Two-timepoint stopwatch over steady_clock.
class Stopwatch {
	timepoint_t begin_, end_;

public:
	Stopwatch() noexcept : begin_(), end_() {}
	Stopwatch(StartAtCtor) noexcept { start(); }
	Stopwatch(timepoint_t start_time) noexcept : begin_(start_time), end_() {}
	Stopwatch(const Stopwatch&) noexcept = default;
	Stopwatch& operator=(const Stopwatch&) noexcept = default;

	timepoint_t getStart() const noexcept { return begin_; }
	timepoint_t getStop() const noexcept { return end_; }

	void start() noexcept { begin_ = steady_clock::now(); }

	// capture the stop time; returns *this for chaining, e.g. w.stop().diff()
	Stopwatch& stop() noexcept {
		end_ = steady_clock::now();
		return *this;
	}

	Stopwatch& setStop(timepoint_t p_stop) noexcept {
		end_ = p_stop;
		return *this;
	}

	// begin a new measurement from the previous stop point (back-to-back laps)
	void startFromStop() noexcept { begin_ = end_; }

	// elapsed time between start and stop
	auto diff() const noexcept { return end_ - begin_; }
};
} // namespace mako
#endif /* MAKO_TIME_HPP */

View File

@ -1,136 +0,0 @@
#include "utils.h"
#include "mako.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* uniform-distribution random: integer in [low, high], both inclusive */
int urand(int low, int high) {
	double u = rand() / (1.0 + RAND_MAX); /* in [0, 1) */
	int span = high - low + 1;
	return (int)(u * span + low);
}

/* random printable string; len is the buffer size including the trailing
 * null, so len - 1 characters are generated */
void randstr(char* str, int len) {
	char* p = str;
	char* end = str + len - 1;
	while (p < end)
		*p++ = '!' + urand(0, 'z' - '!'); /* generate a char from '!' to 'z' */
	*end = '\0';
}

/* random numeric string; len is the buffer size including the trailing null */
void randnumstr(char* str, int len) {
	char* p = str;
	char* end = str + len - 1;
	while (p < end)
		*p++ = '0' + urand(0, 9); /* generate a digit from '0' to '9' */
	*end = '\0';
}
/* return the first key to be inserted */
int insert_begin(int rows, int p_idx, int t_idx, int total_p, int total_t) {
double interval = (double)rows / total_p / total_t;
return (int)(round(interval * ((p_idx * total_t) + t_idx)));
}
/* return the last key to be inserted */
int insert_end(int rows, int p_idx, int t_idx, int total_p, int total_t) {
double interval = (double)rows / total_p / total_t;
return (int)(round(interval * ((p_idx * total_t) + t_idx + 1) - 1));
}
/* divide val as evenly as possible among total_p * total_t workers; the
 * first (val mod workers) workers receive one extra unit. Returns -1 when
 * the even share is zero and this worker receives nothing. */
int compute_thread_portion(int val, int p_idx, int t_idx, int total_p, int total_t) {
	int workers = total_p * total_t;
	int share = val / total_p / total_t;
	int leftover = val - (share * workers);
	int rank = (p_idx * total_t) + t_idx;
	if (rank < leftover)
		return share + 1;
	if (share == 0)
		return -1;
	return share;
}
/* number of digits in num (num <= 0 yields 0) */
int digits(int num) {
	int count;
	for (count = 0; num > 0; num /= 10)
		count++;
	return count;
}

/* generate a key for a given key number */
/* prefix is "mako" by default; prefixpadding = 1 means the 'x' padding goes
 * in front of the key name rather than trailing it */
/* len is the buffer size, key length + null */
void genkey(char* str, char* prefix, int prefixlen, int prefixpadding, int num, int rows, int len) {
	const int rowdigit = digits(rows);
	const int offset = prefixpadding ? len - (prefixlen + rowdigit) - 1 : 0;
	char numstr[12]; /* an int has at most 10 decimal digits, so rowdigit <= 10 */
	snprintf(numstr, sizeof(numstr), "%.*d", rowdigit, num);
	memset(str, 'x', len);
	memcpy(str + offset, prefix, prefixlen);
	memcpy(str + offset + prefixlen, numstr, rowdigit);
	str[len - 1] = '\0';
}
/* This is another sorting algorithm used to calculate latency parameters */
/* We moved from radix sort to quick sort to avoid extra space used in radix sort */
#if 0
uint64_t get_max(uint64_t arr[], int n) {
uint64_t mx = arr[0];
for (int i = 1; i < n; i++) {
if (arr[i] > mx) {
mx = arr[i];
}
}
return mx;
}
void bucket_data(uint64_t arr[], int n, uint64_t exp) {
// uint64_t output[n];
int i, count[10] = { 0 };
uint64_t* output = (uint64_t*)malloc(sizeof(uint64_t) * n);
for (i = 0; i < n; i++) {
count[(arr[i] / exp) % 10]++;
}
for (i = 1; i < 10; i++) {
count[i] += count[i - 1];
}
for (i = n - 1; i >= 0; i--) {
output[count[(arr[i] / exp) % 10] - 1] = arr[i];
count[(arr[i] / exp) % 10]--;
}
for (i = 0; i < n; i++) {
arr[i] = output[i];
}
free(output);
}
// The main function is to sort arr[] of size n using Radix Sort
void radix_sort(uint64_t* arr, int n) {
// Find the maximum number to know number of digits
uint64_t m = get_max(arr, n);
for (uint64_t exp = 1; m / exp > 0; exp *= 10) bucket_data(arr, n, exp);
}
#endif
/* three-way comparator for uint64_t values, suitable for qsort */
int compare(const void* a, const void* b) {
	uint64_t lhs = *(const uint64_t*)a;
	uint64_t rhs = *(const uint64_t*)b;
	if (lhs < rhs)
		return -1;
	if (lhs > rhs)
		return 1;
	return 0;
}

/* sort arr[0..n) ascending via libc qsort (replaced the old radix sort to
 * avoid its extra working memory) */
void quick_sort(uint64_t* arr, int n) {
	qsort(arr, n, sizeof(arr[0]), compare);
}

View File

@ -0,0 +1,54 @@
/*
* utils.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils.hpp"
#include "mako.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fmt/format.h>
namespace mako {
/* divide val as evenly as possible among total_p * total_t workers; the
 * first (val mod workers) workers receive one extra unit. Returns -1 when
 * the even share is zero and this worker receives nothing. */
int computeThreadPortion(int val, int p_idx, int t_idx, int total_p, int total_t) {
	const int workers = total_p * total_t;
	const int share = val / total_p / total_t;
	const int leftover = val - (share * workers);
	const int rank = (p_idx * total_t) + t_idx;
	if (rank < leftover)
		return share + 1;
	if (share == 0)
		return -1;
	return share;
}
/* number of decimal digits in num (num <= 0 yields 0) */
int digits(int num) {
	auto count = 0;
	for (; num > 0; num /= 10)
		count++;
	return count;
}
} // namespace mako

View File

@ -1,65 +0,0 @@
#ifndef UTILS_H
#define UTILS_H
#pragma once

#include <stdint.h>

/* uniform-distribution random */
/* return a uniform random number between low and high, both inclusive */
int urand(int low, int high);

/* write a random string of the length of (len-1) to memory pointed by str
 * with a null-termination character at str[len-1].
 */
void randstr(char* str, int len);

/* write a random numeric string of the length of (len-1) to memory pointed by str
 * with a null-termination character at str[len-1].
 */
void randnumstr(char* str, int len);

/* given the total number of rows to be inserted,
 * the worker process index p_idx and the thread index t_idx (both 0-based),
 * and the total number of processes, total_p, and threads, total_t,
 * returns the first row number assigned to this partition.
 */
int insert_begin(int rows, int p_idx, int t_idx, int total_p, int total_t);

/* similar to insert_begin, insert_end returns the last row number */
int insert_end(int rows, int p_idx, int t_idx, int total_p, int total_t);

/* divide a value equally among threads */
int compute_thread_portion(int val, int p_idx, int t_idx, int total_p, int total_t);

/* similar to insert_begin/end, compute_thread_tps computes
 * the per-thread target TPS for given configuration.
 */
#define compute_thread_tps(val, p_idx, t_idx, total_p, total_t) \
	compute_thread_portion(val, p_idx, t_idx, total_p, total_t)

/* similar to compute_thread_tps,
 * compute_thread_iters computes the number of iterations.
 */
#define compute_thread_iters(val, p_idx, t_idx, total_p, total_t) \
	compute_thread_portion(val, p_idx, t_idx, total_p, total_t)

/* get the number of digits */
int digits(int num);

/* generate a key for a given key number */
/* prefix is "mako" by default, prefixpadding = 1 means 'x' will be in front rather than trailing the keyname */
/* len is the buffer size, key length + null */
void genkey(char* str, char* prefix, int prefixlen, int prefixpadding, int num, int rows, int len);

#if 0
// The main function is to sort arr[] of size n using Radix Sort
void radix_sort(uint64_t arr[], int n);
void bucket_data(uint64_t arr[], int n, uint64_t exp);
uint64_t get_max(uint64_t arr[], int n);
#endif

// The main function is to sort arr[] of size n using Quick Sort
void quick_sort(uint64_t arr[], int n);
int compare(const void* a, const void* b);

#endif /* UTILS_H */

View File

@ -0,0 +1,195 @@
/*
* utils.hpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTILS_HPP
#define UTILS_HPP
#pragma once
#include "macro.hpp"
#include "mako.hpp"
#include "fdbclient/zipf.h"
#include <cassert>
#include <chrono>
#include <cstdint>
#include <string_view>
#include <type_traits>
#include <fmt/format.h>
namespace mako {
/* uniform-distribution random */
/* return a uniform random number between low and high, both inclusive */
force_inline int urand(int low, int high) {
	const auto span = high - low + 1;
	const auto unit = rand() / (1.0 + RAND_MAX); // in [0, 1)
	return static_cast<int>(unit * span + low);
}
// Row number for the next key: zipfian-distributed when args.zipf is set,
// uniform over [0, args.rows) otherwise.
force_inline int nextKey(Arguments const& args) {
	return args.zipf ? zipfian_next() : urand(0, args.rows - 1);
}
// string_view length as int, for arithmetic against int-typed key lengths
force_inline int intSize(std::string_view sv) {
	return static_cast<int>(sv.size());
}
/* random string */
// Fill str[0..len) with random printable characters in ['!', 'z'].
// No null terminator is written.
template <typename Char>
force_inline void randomString(Char* str, int len) {
	assert(len >= 0);
	auto remaining = len;
	while (remaining-- > 0)
		*str++ = ('!' + urand(0, 'z' - '!')); /* generate a char from '!' to 'z' */
}
/* given the total number of rows to be inserted,
 * the worker process index p_idx and the thread index t_idx (both 0-based),
 * and the total number of processes, total_p, and threads, total_t,
 * returns the first row number assigned to this partition.
 */
force_inline int insertBegin(int rows, int p_idx, int t_idx, int total_p, int total_t) {
	const auto share = (double)rows / total_p / total_t;
	const auto slot = (p_idx * total_t) + t_idx;
	return (int)(round(share * slot));
}

/* similar to insertBegin, insertEnd returns the last row number */
force_inline int insertEnd(int rows, int p_idx, int t_idx, int total_p, int total_t) {
	const auto share = (double)rows / total_p / total_t;
	const auto slot = (p_idx * total_t) + t_idx;
	return (int)(round(share * (slot + 1) - 1));
}
/* divide a value equally among threads */
int computeThreadPortion(int val, int p_idx, int t_idx, int total_p, int total_t);
/* similar to insertBegin/end, computeThreadTps computes
* the per-thread target TPS for given configuration.
*/
#define computeThreadTps(val, p_idx, t_idx, total_p, total_t) computeThreadPortion(val, p_idx, t_idx, total_p, total_t)
/* similar to computeThreadTps,
 * computeThreadIters computes the number of iterations.
 */
#define computeThreadIters(val, p_idx, t_idx, total_p, total_t) \
computeThreadPortion(val, p_idx, t_idx, total_p, total_t)
/* get the number of digits */
int digits(int num);
/* fill memory slice [str, str + len) as stringified, zero-padded num */
// NOTE(review): digits of num beyond len are silently dropped; callers size
// len via digits(), so this shouldn't occur -- confirm before reusing.
template <typename Char>
force_inline void numericWithFill(Char* str, int len, int num) {
	static_assert(sizeof(Char) == 1);
	assert(num >= 0);
	memset(str, '0', len);
	auto pos = len;
	while (num > 0 && pos > 0) {
		str[--pos] = '0' + (num % 10);
		num /= 10;
	}
}
/* generate a key for a given key number */
/* prefix is "mako" by default; args.prefixpadding places the 'x' padding in
 * front of the prefix instead of after the row number */
template <typename Char>
void genKey(Char* str, std::string_view prefix, Arguments const& args, int num) {
	static_assert(sizeof(Char) == 1);
	const auto prefix_len = static_cast<int>(prefix.size());
	const auto offset = args.prefixpadding ? args.key_length - (prefix_len + args.row_digits) : 0;
	memset(str, 'x', args.key_length);
	memcpy(&str[offset], prefix.data(), prefix_len);
	numericWithFill(&str[offset + prefix_len], args.row_digits, num);
}
// Materialize key1 (and key2, when the op is configured with a range) for
// one execution of op. key2's row is clamped to the last row.
template <typename Char>
force_inline void prepareKeys(int op,
                              std::basic_string<Char>& key1,
                              std::basic_string<Char>& key2,
                              Arguments const& args) {
	const auto num1 = nextKey(args);
	genKey(key1.data(), KEY_PREFIX, args, num1);
	const auto range = args.txnspec.ops[op][OP_RANGE];
	if (range > 0) {
		const auto num2 = std::min(num1 + range - 1, args.rows - 1);
		genKey(key2.data(), KEY_PREFIX, args, num2);
	}
}
// invoke user-provided callable when object goes out of scope.
template <typename Func>
class ExitGuard {
	std::decay_t<Func> fn;

public:
	ExitGuard(Func&& fn) : fn(std::forward<Func>(fn)) {}

	~ExitGuard() { fn(); }
};

// invoke user-provided callable when stack unwinds by exception.
// (fires only while std::uncaught_exceptions() is nonzero, i.e. during
// unwinding; does nothing on normal scope exit)
template <typename Func>
class FailGuard {
	std::decay_t<Func> fn;

public:
	FailGuard(Func&& fn) : fn(std::forward<Func>(fn)) {}

	~FailGuard() {
		if (std::uncaught_exceptions()) {
			fn();
		}
	}
};
// trace helpers
// Column widths for the aligned stats tables printed during the run.
constexpr const int STATS_TITLE_WIDTH = 12;
constexpr const int STATS_FIELD_WIDTH = 12;

// left-aligned row title, padded to STATS_TITLE_WIDTH
template <typename Value>
void putTitle(Value&& value) {
	fmt::print("{0: <{1}} ", std::forward<Value>(value), STATS_TITLE_WIDTH);
}

// right-aligned row title, padded to STATS_TITLE_WIDTH
template <typename Value>
void putTitleRight(Value&& value) {
	fmt::print("{0: >{1}} ", std::forward<Value>(value), STATS_TITLE_WIDTH);
}

// '=' separator the width of a title cell
inline void putTitleBar() {
	fmt::print("{0:=<{1}} ", "", STATS_TITLE_WIDTH);
}

// right-aligned data cell, padded to STATS_FIELD_WIDTH
template <typename Value>
void putField(Value&& value) {
	fmt::print("{0: >{1}} ", std::forward<Value>(value), STATS_FIELD_WIDTH);
}

// '=' separator the width of a data cell
inline void putFieldBar() {
	fmt::print("{0:=>{1}} ", "", STATS_FIELD_WIDTH);
}

// right-aligned fixed-precision floating-point data cell
template <typename Value>
void putFieldFloat(Value&& value, int precision) {
	fmt::print("{0: >{1}.{2}f} ", std::forward<Value>(value), STATS_FIELD_WIDTH, precision);
}
} // namespace mako
#endif /* UTILS_HPP */

View File

@ -21,9 +21,25 @@ endif()
include(CheckSymbolExists)
set(DISABLE_TLS OFF CACHE BOOL "Don't try to find OpenSSL and always build without TLS support")
set(USE_WOLFSSL OFF CACHE BOOL "Build against WolfSSL instead of OpenSSL")
set(USE_OPENSSL ON CACHE BOOL "Build against OpenSSL")
if(DISABLE_TLS)
set(WITH_TLS OFF)
else()
if(USE_WOLFSSL)
set(WOLFSSL_USE_STATIC_LIBS TRUE)
find_package(WolfSSL)
if(WOLFSSL_FOUND)
set(CMAKE_REQUIRED_INCLUDES ${WOLFSSL_INCLUDE_DIR})
set(WITH_TLS ON)
add_compile_options(-DHAVE_OPENSSL)
add_compile_options(-DHAVE_WOLFSSL)
else()
message(STATUS "WolfSSL was not found - Will compile without TLS Support")
message(STATUS "You can set WOLFSSL_ROOT_DIR to help cmake find it")
set(WITH_TLS OFF)
endif()
elseif(USE_OPENSSL)
set(OPENSSL_USE_STATIC_LIBS TRUE)
if(WIN32)
set(OPENSSL_MSVC_STATIC_RT ON)
@ -39,6 +55,7 @@ else()
set(WITH_TLS OFF)
endif()
endif()
endif()
################################################################################
# Python Bindings

View File

@ -198,7 +198,7 @@ function(fdb_configure_and_install)
string(TOLOWER "${pkg}" package)
string(TOUPPER "${IN_DESTINATION}" destination)
get_install_dest(${pkg} INCLUDE INCLUDE_DIR)
get_install_dest(${pkg} INCLUDE LIB_DIR)
get_install_dest(${pkg} LIB LIB_DIR)
get_install_dest(${pkg} ${destination} install_path)
string(REGEX REPLACE "\.in$" "" name "${IN_FILE}")
get_filename_component(name "${name}" NAME)

63
cmake/FindWolfSSL.cmake Normal file
View File

@ -0,0 +1,63 @@
# FindWolfSSL
# -----------
# Locate the wolfSSL library and headers.
#
# Hints:
#   WOLFSSL_ROOT_DIR        - install prefix to search first
#   WOLFSSL_USE_STATIC_LIBS - prefer static libraries when set
#
# Results:
#   WOLFSSL_FOUND, WOLFSSL_INCLUDE_DIRS, WOLFSSL_LIBRARIES
#   Imported target WolfSSL, additionally aliased as OpenSSL::SSL and
#   OpenSSL::CRYPTO so the rest of the build can link wolfSSL through the
#   OpenSSL target names it already uses.

# Support preference of static libs by adjusting CMAKE_FIND_LIBRARY_SUFFIXES.
if(WOLFSSL_USE_STATIC_LIBS)
  # CMAKE_FIND_LIBRARY_SUFFIXES is scoped to the includer; save it so our
  # override does not leak into later find_library() calls.
  set(_wolfssl_saved_find_suffixes "${CMAKE_FIND_LIBRARY_SUFFIXES}")
  if(WIN32)
    set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
  else()
    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
  endif()
endif()

find_path(WOLFSSL_ROOT_DIR
  NAMES include/wolfssl/options.h
)
find_path(WOLFSSL_INCLUDE_DIR
  NAMES wolfssl/ssl.h
  PATHS ${WOLFSSL_ROOT_DIR}/include
)
find_library(WOLFSSL_LIBRARY
  NAMES wolfssl
  PATHS ${WOLFSSL_ROOT_DIR}/lib
)

if(WOLFSSL_USE_STATIC_LIBS)
  # Restore the caller's suffix list now that the wolfSSL lookups are done.
  set(CMAKE_FIND_LIBRARY_SUFFIXES "${_wolfssl_saved_find_suffixes}")
  unset(_wolfssl_saved_find_suffixes)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(WolfSSL
  REQUIRED_VARS
    WOLFSSL_LIBRARY
    WOLFSSL_INCLUDE_DIR
  FAIL_MESSAGE
    "Could NOT find WolfSSL"
)

mark_as_advanced(
  WOLFSSL_ROOT_DIR
  WOLFSSL_LIBRARY
  WOLFSSL_INCLUDE_DIR
)

if(WOLFSSL_FOUND)
  message(STATUS "Found wolfssl library: ${WOLFSSL_LIBRARY}")
  message(STATUS "Found wolfssl includes: ${WOLFSSL_INCLUDE_DIR}")
  set(WOLFSSL_INCLUDE_DIRS ${WOLFSSL_INCLUDE_DIR})
  set(WOLFSSL_LIBRARIES ${WOLFSSL_LIBRARY})
  # wolfSSL ships a single library covering both the SSL and crypto pieces, so
  # one imported target backs both OpenSSL:: aliases. GLOBAL is required for
  # the aliases to be visible outside this directory.
  add_library(WolfSSL UNKNOWN IMPORTED GLOBAL)
  add_library(OpenSSL::SSL ALIAS WolfSSL)
  add_library(OpenSSL::CRYPTO ALIAS WolfSSL)
  # Fix: the previous version called target_link_libraries() with
  # ${WOLFSSL_TLS_LIBRARY}/${WOLFSSL_SSL_LIBRARY}/${WOLFSSL_CRYPTO_LIBRARY},
  # none of which this module ever sets, and set the interface include
  # directories twice. The single library is carried by IMPORTED_LOCATION and
  # the includes by INTERFACE_INCLUDE_DIRECTORIES below.
  set_target_properties(WolfSSL PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${WOLFSSL_INCLUDE_DIR}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${WOLFSSL_LIBRARY}")
endif()

View File

@ -48,29 +48,25 @@
---
# name: test_execstack_permissions_libfdb_c[centos-versioned]
'
GNU_STACK 0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 RW 0x0
- /lib64/libfdb_c.so
'
---
# name: test_execstack_permissions_libfdb_c[centos]
'
GNU_STACK 0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 RW 0x0
- /lib64/libfdb_c.so
'
---
# name: test_execstack_permissions_libfdb_c[ubuntu-versioned]
'
GNU_STACK 0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 RW 0x0
- /lib/libfdb_c.so
'
---
# name: test_execstack_permissions_libfdb_c[ubuntu]
'
GNU_STACK 0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 RW 0x0
- /lib/libfdb_c.so
'
---

View File

@ -22,6 +22,7 @@ import pathlib
import pytest
import shlex
import subprocess
import sys
import uuid
from typing import Iterator, List, Optional, Union
@ -29,9 +30,14 @@ from typing import Iterator, List, Optional, Union
def run(args: List[str]) -> str:
print("$ {}".format(" ".join(map(shlex.quote, args))))
result = subprocess.check_output(args).decode("utf-8")
print(result, end="")
return result
result = []
proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while proc.poll() is None:
text = proc.stdout.readline().decode("utf-8")
result.append(text)
sys.stdout.write(text)
assert proc.returncode == 0
return "".join(result)
class Image:
@ -106,7 +112,16 @@ def ubuntu_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]:
for deb in debs:
container.copy_to(deb, "/opt")
container.run(["bash", "-c", "apt-get update"])
container.run(["bash", "-c", "apt-get install --yes binutils"]) # this is for testing libfdb_c execstack permissions
container.run(
["bash", "-c", "apt-get install --yes execstack"]
) # this is for testing libfdb_c execstack permissions
container.run(
[
"bash",
"-c",
"DEBIAN_FRONTEND=noninteractive DEBCONF_NONINTERACTIVE_SEEN=true apt-get install --yes gcc pkg-config cmake",
]
) # this is for testing building client apps
container.run(["bash", "-c", "dpkg -i /opt/*.deb"])
container.run(["bash", "-c", "rm /opt/*.deb"])
image = container.commit()
@ -151,7 +166,12 @@ def centos_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]:
for rpm in rpms:
container.copy_to(rpm, "/opt")
container.run(["bash", "-c", "yum update -y"])
container.run(["bash", "-c", "yum install -y binutils"]) # this is for testing libfdb_c execstack permissions
container.run(
["bash", "-c", "yum install -y prelink"]
) # this is for testing libfdb_c execstack permissions
container.run(
["bash", "-c", "yum install -y gcc pkg-config cmake make"]
) # this is for testing building client apps
container.run(["bash", "-c", "yum install -y /opt/*.rpm"])
container.run(["bash", "-c", "rm /opt/*.rpm"])
image = container.commit()
@ -232,6 +252,70 @@ def test_db_available(linux_container: Container):
linux_container.run(["fdbcli", "--exec", "get x"])
def test_client_app(linux_container: Container):
test_client_app_script = r"""#!/bin/bash
set -euxo pipefail
cat > app.c << EOF
// FDB_API_VERSION doesn't necessarily need to be kept up to date here
#define FDB_API_VERSION 700
#include <foundationdb/fdb_c.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
static void check(fdb_error_t e) {
if (e) {
fprintf(stderr, "%s\n", fdb_get_error(e));
fflush(NULL);
abort();
}
}
int result = 0;
static void callback(FDBFuture* f, void* _ignored) {
check(fdb_stop_network());
}
int main() {
check(fdb_select_api_version(700));
check(fdb_setup_network());
FDBDatabase* db;
check(fdb_create_database(NULL, &db));
FDBTransaction* tr;
check(fdb_database_create_transaction(db, &tr));
FDBFuture* f = fdb_transaction_get_read_version(tr);
check(fdb_future_set_callback(f, callback, NULL));
check(fdb_run_network());
fdb_future_destroy(f);
fdb_transaction_destroy(tr);
fdb_database_destroy(db);
return 0;
}
EOF
cc app.c `pkg-config foundationdb-client --cflags --libs`
./a.out
cat > CMakeLists.txt << EOF
project(app C)
find_package(FoundationDB-Client REQUIRED)
add_executable(app app.c)
target_link_libraries(app PRIVATE fdb_c)
EOF
mkdir build
cd build
cmake ..
make
./app
"""
linux_container.run(["bash", "-c", test_client_app_script])
def test_write(linux_container: Container, snapshot):
linux_container.run(["fdbcli", "--exec", "writemode on; set x y"])
assert snapshot == linux_container.run(["fdbcli", "--exec", "get x"])
@ -243,7 +327,7 @@ def test_execstack_permissions_libfdb_c(linux_container: Container, snapshot):
[
"bash",
"-c",
"readelf -l $(ldconfig -p | grep libfdb_c | awk '{print $(NF)}') | grep -A1 GNU_STACK",
"execstack -q $(ldconfig -p | grep libfdb_c | awk '{print $(NF)}')",
]
)

View File

@ -148,7 +148,7 @@ is equivalent to something like:
tr.set(Tuple.from("class", "class1").pack(), encodeInt(100));
t.commit().join();
} catch (RuntimeException e) {
t = t.onError(e).get();
t = t.onError(e).join();
}
}
@ -290,10 +290,10 @@ This is easy -- we simply add a condition to check that the value is non-zero. L
private static void signup(TransactionContext db, final String s, final String c) {
db.run((Transaction tr) -> {
byte[] rec = Tuple.from("attends", s, c).pack();
if (tr.get(rec).get() != null)
if (tr.get(rec).join() != null)
return null; // already signed up
int seatsLeft = decodeInt(tr.get(Tuple.from("class", c).pack()).get());
int seatsLeft = decodeInt(tr.get(Tuple.from("class", c).pack()).join());
if (seatsLeft == 0)
throw new IllegalStateException("No remaining seats");

View File

@ -1189,7 +1189,6 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
ClusterConnectionFile::lookupClusterFileName(opt.clusterFile);
try {
ccf = makeReference<ClusterConnectionFile>(resolvedClusterFile.first);
wait(ccf->resolveHostnames());
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
throw;

View File

@ -28,16 +28,30 @@
#include "fdbclient/CoordinationInterface.h"
// Determine public IP address by calling the first coordinator.
// Determine public IP address by calling the first available coordinator.
// If fail connecting all coordinators, throw bind_failed().
IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
int size = ccs.coordinators().size() + ccs.hostnames.size();
int index = 0;
loop {
try {
using namespace boost::asio;
io_service ioService;
ip::udp::socket socket(ioService);
ccs.resolveHostnamesBlocking();
const auto& coordAddr = ccs.coordinators()[0];
NetworkAddress coordAddr;
// Try coords first, because they don't need to be resolved.
if (index < ccs.coordinators().size()) {
coordAddr = ccs.coordinators()[index];
} else {
Hostname& h = ccs.hostnames[index - ccs.coordinators().size()];
Optional<NetworkAddress> resolvedAddr = h.resolveBlocking();
if (!resolvedAddr.present()) {
throw lookup_failed();
}
coordAddr = resolvedAddr.get();
}
const auto boostIp = coordAddr.ip.isV6() ? ip::address(ip::address_v6(coordAddr.ip.toV6()))
: ip::address(ip::address_v4(coordAddr.ip.toV4()));
@ -48,8 +62,12 @@ IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
socket.close();
return ip;
} catch (boost::system::system_error e) {
fprintf(stderr, "Error determining public address: %s\n", e.what());
} catch (...) {
++index;
if (index == size) {
fprintf(stderr, "Error determining public address.\n");
throw bind_failed();
}
}
}
}

View File

@ -65,7 +65,6 @@ set(FDBCLIENT_SRCS
GlobalConfig.actor.cpp
GrvProxyInterface.h
HighContentionPrefixAllocator.actor.h
HTTP.actor.cpp
IClientApi.h
IConfigTransaction.cpp
IConfigTransaction.h

View File

@ -21,6 +21,7 @@
#include "fdbclient/Knobs.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "flow/UnitTest.h"
#define init(...) KNOB_FN(__VA_ARGS__, INIT_ATOMIC_KNOB, INIT_KNOB)(__VA_ARGS__)
@ -82,6 +83,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( CHANGE_FEED_CACHE_SIZE, 100000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_SIZE = 1;
init( CHANGE_FEED_POP_TIMEOUT, 5.0 );
init( CHANGE_FEED_STREAM_MIN_BYTES, 1e4 ); if( randomize && BUGGIFY ) CHANGE_FEED_STREAM_MIN_BYTES = 1;
init( TENANT_PREFIX_SIZE_LIMIT, 28 ); ASSERT(TENANT_PREFIX_SIZE_LIMIT >= TenantMapEntry::ROOT_PREFIX_SIZE); // includes 8-byte ID and optional tenant subspace
init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1;
init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1;

View File

@ -81,6 +81,7 @@ public:
int64_t CHANGE_FEED_CACHE_SIZE;
double CHANGE_FEED_POP_TIMEOUT;
int64_t CHANGE_FEED_STREAM_MIN_BYTES;
int64_t TENANT_PREFIX_SIZE_LIMIT;
int MAX_BATCH_SIZE;
double GRV_BATCH_TIMEOUT;

View File

@ -61,61 +61,31 @@ struct ClientLeaderRegInterface {
// - There is no address present more than once
class ClusterConnectionString {
public:
enum ConnectionStringStatus { RESOLVED, RESOLVING, UNRESOLVED };
ClusterConnectionString() {}
ClusterConnectionString(const std::string& connStr);
ClusterConnectionString(const std::string& connectionString);
ClusterConnectionString(const std::vector<NetworkAddress>& coordinators, Key key);
ClusterConnectionString(const std::vector<Hostname>& hosts, Key key);
ClusterConnectionString(const ClusterConnectionString& rhs) { operator=(rhs); }
ClusterConnectionString& operator=(const ClusterConnectionString& rhs) {
// Copy everything except AsyncTrigger resolveFinish.
status = rhs.status;
coords = rhs.coords;
hostnames = rhs.hostnames;
networkAddressToHostname = rhs.networkAddressToHostname;
key = rhs.key;
keyDesc = rhs.keyDesc;
connectionString = rhs.connectionString;
return *this;
}
std::vector<NetworkAddress> const& coordinators() const { return coords; }
void addResolved(const Hostname& hostname, const NetworkAddress& address) {
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
}
Key clusterKey() const { return key; }
Key clusterKeyName() const {
return keyDesc;
} // Returns the "name" or "description" part of the clusterKey (the part before the ':')
std::string toString() const;
static std::string getErrorString(std::string const& source, Error const& e);
Future<Void> resolveHostnames();
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.
void resolveHostnamesBlocking();
// This function derives the member connectionString from the current key, coordinators and hostnames.
void resetConnectionString();
void resetToUnresolved();
void parseKey(const std::string& key);
ConnectionStringStatus status = RESOLVED;
AsyncTrigger resolveFinish;
// This function tries to resolve all hostnames once, and return them with coords.
// Best effort, does not guarantee that the resolves succeed.
Future<std::vector<NetworkAddress>> tryResolveHostnames();
std::vector<NetworkAddress> coords;
std::vector<Hostname> hostnames;
std::unordered_map<NetworkAddress, Hostname> networkAddressToHostname;
private:
void parseConnString();
Key key, keyDesc;
std::string connectionString;
};
FDB_DECLARE_BOOLEAN_PARAM(ConnectionStringNeedsPersisted);
@ -165,12 +135,6 @@ public:
// Signals to the connection record that it was successfully used to connect to a cluster.
void notifyConnected();
ClusterConnectionString::ConnectionStringStatus connectionStringStatus() const;
Future<Void> resolveHostnames();
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.
void resolveHostnamesBlocking();
virtual void addref() = 0;
virtual void delref() = 0;
@ -275,12 +239,21 @@ struct OpenDatabaseCoordRequest {
Standalone<VectorRef<ClientVersionRef>> supportedVersions;
UID knownClientInfoID;
Key clusterKey;
std::vector<Hostname> hostnames;
std::vector<NetworkAddress> coordinators;
ReplyPromise<CachedSerialization<struct ClientDBInfo>> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, issues, supportedVersions, traceLogGroup, knownClientInfoID, clusterKey, coordinators, reply);
serializer(ar,
issues,
supportedVersions,
traceLogGroup,
knownClientInfoID,
clusterKey,
hostnames,
coordinators,
reply);
}
};

View File

@ -20,6 +20,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/NativeAPI.actor.h"
KeyRef keyBetween(const KeyRangeRef& keys) {
int pos = 0; // will be the position of the first difference between keys.begin and keys.end
@ -40,17 +41,15 @@ KeyRef keyBetween(const KeyRangeRef& keys) {
}
void KeySelectorRef::setKey(KeyRef const& key) {
// There are no keys in the database with size greater than KEY_SIZE_LIMIT, so if this key selector has a key
// There are no keys in the database with size greater than the max key size, so if this key selector has a key
// which is large, then we can translate it to an equivalent key selector with a smaller key
if (key.size() >
(key.startsWith(LiteralStringRef("\xff")) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
this->key = key.substr(0,
(key.startsWith(LiteralStringRef("\xff")) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
else
int64_t maxKeySize = getMaxKeySize(key);
if (key.size() > maxKeySize) {
this->key = key.substr(0, maxKeySize + 1);
} else {
this->key = key;
}
}
void KeySelectorRef::setKeyUnlimited(KeyRef const& key) {
this->key = key;

View File

@ -746,6 +746,17 @@ Future<Optional<TenantMapEntry>> createTenantTransaction(Transaction tr, TenantN
state Optional<Value> lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture));
Optional<Value> tenantDataPrefix = wait(safeThreadFutureToFuture(tenantDataPrefixFuture));
if (tenantDataPrefix.present() &&
tenantDataPrefix.get().size() + TenantMapEntry::ROOT_PREFIX_SIZE > CLIENT_KNOBS->TENANT_PREFIX_SIZE_LIMIT) {
TraceEvent(SevWarnAlways, "TenantPrefixTooLarge")
.detail("TenantSubspace", tenantDataPrefix.get())
.detail("TenantSubspaceLength", tenantDataPrefix.get().size())
.detail("RootPrefixLength", TenantMapEntry::ROOT_PREFIX_SIZE)
.detail("MaxTenantPrefixSize", CLIENT_KNOBS->TENANT_PREFIX_SIZE_LIMIT);
throw client_invalid_operation();
}
state TenantMapEntry newTenant(lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0,
tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr);

View File

@ -782,7 +782,7 @@ ACTOR Future<std::vector<ProcessData>> getWorkers(Database cx) {
}
}
ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx) {
ACTOR Future<Optional<ClusterConnectionString>> getConnectionString(Database cx) {
state Transaction tr(cx);
loop {
try {
@ -790,9 +790,8 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx) {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> currentKey = wait(tr.get(coordinatorsKey));
if (!currentKey.present())
return std::vector<NetworkAddress>();
return ClusterConnectionString(currentKey.get().toString()).coordinators();
return Optional<ClusterConnectionString>();
return ClusterConnectionString(currentKey.get().toString());
} catch (Error& e) {
wait(tr.onError(e));
}
@ -801,7 +800,7 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx) {
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
ClusterConnectionString* conn) {
std::vector<NetworkAddress> desiredCoordinators) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
@ -812,47 +811,45 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone deleted this key entirely?
state ClusterConnectionString old(currentKey.get().toString());
wait(old.resolveHostnames());
if (tr->getDatabase()->getConnectionRecord() &&
old.clusterKeyName().toString() !=
tr->getDatabase()->getConnectionRecord()->getConnectionString().clusterKeyName())
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone changed the "name" of the database??
state std::vector<NetworkAddress> oldCoordinators = wait(old.tryResolveHostnames());
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
if (!conn->coords.size()) {
std::vector<NetworkAddress> desiredCoordinatorAddresses = wait(change->getDesiredCoordinators(
if (!desiredCoordinators.size()) {
std::vector<NetworkAddress> _desiredCoordinators = wait(change->getDesiredCoordinators(
tr,
old.coordinators(),
oldCoordinators,
Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(old)),
result));
conn->coords = desiredCoordinatorAddresses;
desiredCoordinators = _desiredCoordinators;
}
if (result != CoordinatorsResult::SUCCESS)
return result;
if (!conn->coordinators().size())
if (!desiredCoordinators.size())
return CoordinatorsResult::INVALID_NETWORK_ADDRESSES;
std::sort(conn->coords.begin(), conn->coords.end());
std::sort(conn->hostnames.begin(), conn->hostnames.end());
std::sort(desiredCoordinators.begin(), desiredCoordinators.end());
std::string newName = change->getDesiredClusterKeyName();
if (newName.empty())
newName = old.clusterKeyName().toString();
if (old.coordinators() == conn->coordinators() && old.clusterKeyName() == newName)
if (oldCoordinators == desiredCoordinators && old.clusterKeyName() == newName)
return CoordinatorsResult::SAME_NETWORK_ADDRESSES;
std::string key(newName + ':' + deterministicRandom()->randomAlphaNumeric(32));
conn->parseKey(key);
conn->resetConnectionString();
state ClusterConnectionString conn(desiredCoordinators,
StringRef(newName + ':' + deterministicRandom()->randomAlphaNumeric(32)));
if (g_network->isSimulated()) {
int i = 0;
int protectedCount = 0;
while ((protectedCount < ((conn->coordinators().size() / 2) + 1)) && (i < conn->coordinators().size())) {
auto process = g_simulator.getProcessByAddress(conn->coordinators()[i]);
while ((protectedCount < ((desiredCoordinators.size() / 2) + 1)) && (i < desiredCoordinators.size())) {
auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]);
auto addresses = process->addresses;
if (!process->isReliable()) {
@ -864,14 +861,14 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
if (addresses.secondaryAddress.present()) {
g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get());
}
TraceEvent("ProtectCoordinator").detail("Address", conn->coordinators()[i]).backtrace();
TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace();
protectedCount++;
i++;
}
}
std::vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(*conn)));
ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(conn)));
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
@ -883,7 +880,7 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
when(wait(waitForAll(leaderServers))) {}
when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; }
}
tr->set(coordinatorsKey, conn->toString());
tr->set(coordinatorsKey, conn.toString());
return Optional<CoordinatorsResult>();
}
@ -909,11 +906,12 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
old.clusterKeyName().toString() != cx->getConnectionRecord()->getConnectionString().clusterKeyName())
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone changed the "name" of the database??
state std::vector<NetworkAddress> oldCoordinators = wait(old.tryResolveHostnames());
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
if (!desiredCoordinators.size()) {
std::vector<NetworkAddress> _desiredCoordinators = wait(change->getDesiredCoordinators(
&tr,
old.coordinators(),
oldCoordinators,
Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(old)),
result));
desiredCoordinators = _desiredCoordinators;
@ -937,7 +935,7 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
if (newName.empty())
newName = old.clusterKeyName().toString();
if (old.coordinators() == desiredCoordinators && old.clusterKeyName() == newName)
if (oldCoordinators == desiredCoordinators && old.clusterKeyName() == newName)
return retries ? CoordinatorsResult::SUCCESS : CoordinatorsResult::SAME_NETWORK_ADDRESSES;
state ClusterConnectionString conn(
@ -1075,10 +1073,17 @@ struct AutoQuorumChange final : IQuorumChange {
std::vector<Future<Optional<LeaderInfo>>> leaderServers;
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++) {
if (coord.clientLeaderServers[i].hostname.present()) {
leaderServers.push_back(retryGetReplyFromHostname(GetLeaderRequest(coord.clusterKey, UID()),
coord.clientLeaderServers[i].hostname.get(),
WLTOKEN_CLIENTLEADERREG_GETLEADER,
TaskPriority::CoordinationReply));
} else {
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
GetLeaderRequest(coord.clusterKey, UID()),
TaskPriority::CoordinationReply));
}
}
Optional<std::vector<Optional<LeaderInfo>>> results =
wait(timeout(getAll(leaderServers), CLIENT_KNOBS->IS_ACCEPTABLE_DELAY));
if (!results.present()) {

View File

@ -56,7 +56,7 @@ struct IQuorumChange : ReferenceCounted<IQuorumChange> {
// Change to use the given set of coordination servers
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
ClusterConnectionString* conn);
std::vector<NetworkAddress> desiredCoordinators);
ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChange> change);
Reference<IQuorumChange> autoQuorumChange(int desired = -1);
Reference<IQuorumChange> noQuorumChange();
@ -146,7 +146,7 @@ ACTOR Future<bool> setHealthyZone(Database cx, StringRef zoneId, double seconds,
ACTOR Future<Void> waitForPrimaryDC(Database cx, StringRef dcId);
// Gets the cluster connection string
ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx);
ACTOR Future<Optional<ClusterConnectionString>> getConnectionString(Database cx);
void schemaCoverage(std::string const& spath, bool covered = true);
bool schemaMatch(json_spirit::mValue const& schema,

View File

@ -77,18 +77,6 @@ void IClusterConnectionRecord::setPersisted() {
connectionStringNeedsPersisted = false;
}
ClusterConnectionString::ConnectionStringStatus IClusterConnectionRecord::connectionStringStatus() const {
return cs.status;
}
Future<Void> IClusterConnectionRecord::resolveHostnames() {
return cs.resolveHostnames();
}
void IClusterConnectionRecord::resolveHostnamesBlocking() {
cs.resolveHostnamesBlocking();
}
std::string ClusterConnectionString::getErrorString(std::string const& source, Error const& e) {
if (e.code() == error_code_connection_string_invalid) {
return format("Invalid connection string `%s: %d %s", source.c_str(), e.code(), e.what());
@ -97,101 +85,19 @@ std::string ClusterConnectionString::getErrorString(std::string const& source, E
}
}
ACTOR Future<Void> resolveHostnamesImpl(ClusterConnectionString* self) {
loop {
if (self->status == ClusterConnectionString::UNRESOLVED) {
self->status = ClusterConnectionString::RESOLVING;
std::vector<Future<Void>> fs;
for (auto const& hostname : self->hostnames) {
fs.push_back(map(INetworkConnections::net()->resolveTCPEndpoint(hostname.host, hostname.service),
[=](std::vector<NetworkAddress> const& addresses) -> Void {
NetworkAddress address =
addresses[deterministicRandom()->randomInt(0, addresses.size())];
address.flags = 0; // Reset the parsed address to public
address.fromHostname = NetworkAddressFromHostname::True;
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
self->addResolved(hostname, address);
return Void();
}));
}
wait(waitForAll(fs));
std::sort(self->coords.begin(), self->coords.end());
if (std::unique(self->coords.begin(), self->coords.end()) != self->coords.end()) {
self->status = ClusterConnectionString::UNRESOLVED;
self->resolveFinish.trigger();
throw connection_string_invalid();
}
self->status = ClusterConnectionString::RESOLVED;
self->resolveFinish.trigger();
break;
} else if (self->status == ClusterConnectionString::RESOLVING) {
wait(self->resolveFinish.onTrigger());
if (self->status == ClusterConnectionString::RESOLVED) {
break;
}
// Otherwise, this means other threads failed on resolve, so here we go back to the loop and try to resolve
// again.
} else {
// status is RESOLVED, nothing to do.
break;
}
}
return Void();
}
Future<Void> ClusterConnectionString::resolveHostnames() {
return resolveHostnamesImpl(this);
}
void ClusterConnectionString::resolveHostnamesBlocking() {
if (status != RESOLVED) {
status = RESOLVING;
for (auto const& hostname : hostnames) {
std::vector<NetworkAddress> addresses =
INetworkConnections::net()->resolveTCPEndpointBlocking(hostname.host, hostname.service);
NetworkAddress address = addresses[deterministicRandom()->randomInt(0, addresses.size())];
address.flags = 0; // Reset the parsed address to public
address.fromHostname = NetworkAddressFromHostname::True;
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
addResolved(hostname, address);
}
std::sort(coords.begin(), coords.end());
if (std::unique(coords.begin(), coords.end()) != coords.end()) {
status = UNRESOLVED;
throw connection_string_invalid();
}
status = RESOLVED;
}
}
void ClusterConnectionString::resetToUnresolved() {
if (status == RESOLVED && hostnames.size() > 0) {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
status = UNRESOLVED;
parseConnString();
}
}
void ClusterConnectionString::resetConnectionString() {
connectionString = toString();
}
void ClusterConnectionString::parseConnString() {
ClusterConnectionString::ClusterConnectionString(const std::string& connectionString) {
auto trimmed = trim(connectionString);
// Split on '@' into key@addrs
int pAt = connectionString.find_first_of('@');
if (pAt == connectionString.npos) {
int pAt = trimmed.find_first_of('@');
if (pAt == trimmed.npos) {
throw connection_string_invalid();
}
std::string key = connectionString.substr(0, pAt);
std::string addrs = connectionString.substr(pAt + 1);
std::string key = trimmed.substr(0, pAt);
std::string addrs = trimmed.substr(pAt + 1);
parseKey(key);
std::set<Hostname> hostnameSet;
std::set<NetworkAddress> addressSet;
std::string curAddr;
for (int p = 0; p <= addrs.size();) {
int pComma = addrs.find_first_of(',', p);
@ -199,31 +105,29 @@ void ClusterConnectionString::parseConnString() {
pComma = addrs.size();
curAddr = addrs.substr(p, pComma - p);
if (Hostname::isHostname(curAddr)) {
Hostname h = Hostname::parse(curAddr);
// Check that there are no duplicate hostnames
if (hostnameSet.find(h) != hostnameSet.end()) {
throw connection_string_invalid();
}
hostnames.push_back(Hostname::parse(curAddr));
hostnameSet.insert(h);
} else {
coords.push_back(NetworkAddress::parse(curAddr));
NetworkAddress n = NetworkAddress::parse(curAddr);
// Check that there are no duplicate addresses
if (addressSet.find(n) != addressSet.end()) {
throw connection_string_invalid();
}
coords.push_back(n);
addressSet.insert(n);
}
p = pComma + 1;
}
if (hostnames.size() > 0) {
status = UNRESOLVED;
}
ASSERT((coords.size() + hostnames.size()) > 0);
std::sort(coords.begin(), coords.end());
// Check that there are no duplicate addresses
if (std::unique(coords.begin(), coords.end()) != coords.end()) {
throw connection_string_invalid();
}
}
ClusterConnectionString::ClusterConnectionString(const std::string& connStr) {
connectionString = trim(connStr);
parseConnString();
}
TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/addresses") {
std::string input;
state std::string input;
{
input = "asdf:2345@1.1.1.1:345";
@ -231,6 +135,15 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/addresses") {
ASSERT(input == cs.toString());
}
{
input = "asdf:2345@1.1.1.1:345,1.1.1.1:345";
try {
ClusterConnectionString cs(input);
} catch (Error& e) {
ASSERT(e.code() == error_code_connection_string_invalid);
}
}
{
input = "0xxdeadbeef:100100100@1.1.1.1:34534,5.1.5.3:23443";
ClusterConnectionString cs(input);
@ -274,20 +187,27 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/addresses") {
}
TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
std::string input;
state std::string input;
{
input = "asdf:2345@localhost:1234";
ClusterConnectionString cs(input);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 1);
ASSERT(input == cs.toString());
}
{
input = "asdf:2345@localhost:1234,localhost:1234";
try {
ClusterConnectionString cs(input);
} catch (Error& e) {
ASSERT(e.code() == error_code_connection_string_invalid);
}
}
{
input = "0xxdeadbeef:100100100@localhost:34534,host-name:23443";
ClusterConnectionString cs(input);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -300,7 +220,6 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -313,7 +232,6 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -321,45 +239,31 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
return Void();
}
TEST_CASE("/fdbclient/MonitorLeader/ConnectionString") {
state std::string connectionString = "TestCluster:0@localhost:1234,host-name:5678";
std::string hn1 = "localhost", port1 = "1234";
state std::string hn2 = "host-name";
state std::string port2 = "5678";
state std::vector<Hostname> hostnames;
TEST_CASE("/fdbclient/MonitorLeader/ConnectionString/hostname") {
std::string connectionString = "TestCluster:0@localhost:1234,host-name:5678";
std::string hn1 = "localhost", port1 = "1234", hn2 = "host-name", port2 = "5678";
std::vector<Hostname> hostnames;
{
hostnames.push_back(Hostname::parse(hn1 + ":" + port1));
hostnames.push_back(Hostname::parse(hn2 + ":" + port2));
NetworkAddress address1 = NetworkAddress::parse("127.0.0.0:1234");
NetworkAddress address2 = NetworkAddress::parse("127.0.0.1:5678");
INetworkConnections::net()->addMockTCPEndpoint(hn1, port1, { address1 });
INetworkConnections::net()->addMockTCPEndpoint(hn2, port2, { address2 });
state ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0"));
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
wait(cs.resolveHostnames());
ASSERT(cs.status == ClusterConnectionString::RESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 2);
ASSERT(cs.toString() == connectionString);
cs.resetToUnresolved();
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0"));
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
ASSERT(cs.toString() == connectionString);
}
INetworkConnections::net()->removeMockTCPEndpoint(hn2, port2);
NetworkAddress address3 = NetworkAddress::parse("127.0.0.0:5678");
INetworkConnections::net()->addMockTCPEndpoint(hn2, port2, { address3 });
{
hostnames.clear();
hostnames.push_back(Hostname::parse(hn1 + ":" + port1));
hostnames.push_back(Hostname::parse(hn1 + ":" + port1));
try {
wait(cs.resolveHostnames());
ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0"));
} catch (Error& e) {
ASSERT(e.code() == error_code_connection_string_invalid);
}
}
return Void();
}
@ -380,6 +284,7 @@ ACTOR Future<std::vector<NetworkAddress>> tryResolveHostnamesImpl(ClusterConnect
allCoordinatorsSet.insert(coord);
}
std::vector<NetworkAddress> allCoordinators(allCoordinatorsSet.begin(), allCoordinatorsSet.end());
std::sort(allCoordinators.begin(), allCoordinators.end());
return allCoordinators;
}
@ -484,17 +389,22 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") {
}
ClusterConnectionString::ClusterConnectionString(const std::vector<NetworkAddress>& servers, Key key)
: status(RESOLVED), coords(servers) {
: coords(servers) {
std::set<NetworkAddress> s(servers.begin(), servers.end());
if (s.size() != servers.size()) {
throw connection_string_invalid();
}
std::string keyString = key.toString();
parseKey(keyString);
resetConnectionString();
}
ClusterConnectionString::ClusterConnectionString(const std::vector<Hostname>& hosts, Key key)
: status(UNRESOLVED), hostnames(hosts) {
ClusterConnectionString::ClusterConnectionString(const std::vector<Hostname>& hosts, Key key) : hostnames(hosts) {
std::set<Hostname> h(hosts.begin(), hosts.end());
if (h.size() != hosts.size()) {
throw connection_string_invalid();
}
std::string keyString = key.toString();
parseKey(keyString);
resetConnectionString();
}
void ClusterConnectionString::parseKey(const std::string& key) {
@ -529,13 +439,11 @@ void ClusterConnectionString::parseKey(const std::string& key) {
std::string ClusterConnectionString::toString() const {
std::string s = key.toString();
s += '@';
for (int i = 0; i < coords.size(); i++) {
if (networkAddressToHostname.find(coords[i]) == networkAddressToHostname.end()) {
for (auto const& coord : coords) {
if (s.find('@') != s.length() - 1) {
s += ',';
}
s += coords[i].toString();
}
s += coord.toString();
}
for (auto const& host : hostnames) {
if (s.find('@') != s.length() - 1) {
@ -547,11 +455,14 @@ std::string ClusterConnectionString::toString() const {
}
ClientCoordinators::ClientCoordinators(Reference<IClusterConnectionRecord> ccr) : ccr(ccr) {
ASSERT(ccr->connectionStringStatus() == ClusterConnectionString::RESOLVED);
ClusterConnectionString cs = ccr->getConnectionString();
for (auto s = cs.coordinators().begin(); s != cs.coordinators().end(); ++s)
clientLeaderServers.push_back(ClientLeaderRegInterface(*s));
clusterKey = cs.clusterKey();
for (auto h : cs.hostnames) {
clientLeaderServers.push_back(ClientLeaderRegInterface(h));
}
for (auto s : cs.coordinators()) {
clientLeaderServers.push_back(ClientLeaderRegInterface(s));
}
}
ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector<NetworkAddress> coordinators)
@ -576,49 +487,32 @@ ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) {
// Nominee is the worker among all workers that are considered as leader by one coordinator
// This function contacts a coordinator coord to ask who is its nominee.
// Note: for coordinators whose NetworkAddress is parsed out of a hostname, a connection failure will cause this actor
// to throw `coordinators_changed()` error
ACTOR Future<Void> monitorNominee(Key key,
ClientLeaderRegInterface coord,
AsyncTrigger* nomineeChange,
Optional<LeaderInfo>* info,
Optional<Hostname> hostname = Optional<Hostname>()) {
Optional<LeaderInfo>* info) {
loop {
state Optional<LeaderInfo> li;
if (coord.getLeader.getEndpoint().getPrimaryAddress().fromHostname) {
state ErrorOr<Optional<LeaderInfo>> rep =
wait(coord.getLeader.tryGetReply(GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
if (rep.isError()) {
// Connecting to nominee failed, most likely due to connection failed.
TraceEvent("MonitorNomineeError")
.error(rep.getError())
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL));
throw coordinators_changed();
if (coord.hostname.present()) {
wait(store(li,
retryGetReplyFromHostname(GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
coord.hostname.get(),
WLTOKEN_CLIENTLEADERREG_GETLEADER,
TaskPriority::CoordinationReply)));
} else {
throw rep.getError();
}
} else if (rep.present()) {
li = rep.get();
}
} else {
Optional<LeaderInfo> tmp =
wait(retryBrokenPromise(coord.getLeader,
wait(store(li,
retryBrokenPromise(coord.getLeader,
GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
li = tmp;
TaskPriority::CoordinationReply)));
}
wait(Future<Void>(Void())); // Make sure we weren't cancelled
TraceEvent("GetLeaderReply")
.suppressFor(1.0)
.detail("Coordinator", coord.getLeader.getEndpoint().getPrimaryAddress())
.detail("Coordinator",
coord.hostname.present() ? coord.hostname.get().toString()
: coord.getLeader.getEndpoint().getPrimaryAddress().toString())
.detail("Nominee", li.present() ? li.get().changeID : UID())
.detail("ClusterKey", key.printable());
@ -687,9 +581,6 @@ Optional<std::pair<LeaderInfo, bool>> getLeader(const std::vector<Optional<Leade
ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> outSerializedLeaderInfo,
MonitorLeaderInfo info) {
loop {
wait(connRecord->resolveHostnames());
wait(info.intermediateConnRecord->resolveHostnames());
state ClientCoordinators coordinators(info.intermediateConnRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
@ -701,14 +592,8 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<IClusterCon
// Ask all coordinators if the worker is considered as a leader (leader nominee) by the coordinator.
actors.reserve(coordinators.clientLeaderServers.size());
for (int i = 0; i < coordinators.clientLeaderServers.size(); i++) {
Optional<Hostname> hostname;
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
coordinators.clientLeaderServers[i].getLeader.getEndpoint().getPrimaryAddress());
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
hostname = r->second;
}
actors.push_back(monitorNominee(
coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i], hostname));
actors.push_back(
monitorNominee(coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i]));
}
allActors = waitForAll(actors);
@ -743,18 +628,7 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<IClusterCon
outSerializedLeaderInfo->set(leader.get().first.serializedInfo);
}
try {
wait(nomineeChange.onTrigger() || allActors);
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorLeaderCoordinatorsChanged").suppressFor(1.0);
connRecord->getConnectionString().resetToUnresolved();
break;
} else {
throw e;
}
}
}
}
}
@ -885,10 +759,10 @@ ACTOR Future<Void> getClientInfoFromLeader(Reference<AsyncVar<Optional<ClusterCo
}
ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
std::vector<Hostname> hostnames,
std::vector<NetworkAddress> coordinators,
ClientData* clientData,
Reference<AsyncVar<Optional<LeaderInfo>>> leaderInfo,
Reference<AsyncVar<Void>> coordinatorsChanged) {
Reference<AsyncVar<Optional<LeaderInfo>>> leaderInfo) {
state std::vector<ClientLeaderRegInterface> clientLeaderServers;
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
@ -896,8 +770,12 @@ ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
state Reference<AsyncVar<Optional<ClusterControllerClientInterface>>> knownLeader(
new AsyncVar<Optional<ClusterControllerClientInterface>>{});
for (auto s = coordinators.begin(); s != coordinators.end(); ++s) {
clientLeaderServers.push_back(ClientLeaderRegInterface(*s));
clientLeaderServers.reserve(hostnames.size() + coordinators.size());
for (auto h : hostnames) {
clientLeaderServers.push_back(ClientLeaderRegInterface(h));
}
for (auto s : coordinators) {
clientLeaderServers.push_back(ClientLeaderRegInterface(s));
}
nominees.resize(clientLeaderServers.size());
@ -936,14 +814,7 @@ ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
leaderInfo->set(leader.get().first);
}
}
try {
wait(nomineeChange.onTrigger() || allActors);
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
coordinatorsChanged->trigger();
}
throw e;
}
}
}
@ -995,7 +866,7 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions,
Key traceLogGroup) {
state ClusterConnectionString cs = info.intermediateConnRecord->getConnectionString();
state std::vector<NetworkAddress> addrs = cs.coordinators();
state int coordinatorsSize = cs.hostnames.size() + cs.coordinators().size();
state int index = 0;
state int successIndex = 0;
state Optional<double> incorrectTime;
@ -1003,15 +874,26 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
state std::vector<CommitProxyInterface> lastCommitProxies;
state std::vector<UID> lastGrvProxyUIDs;
state std::vector<GrvProxyInterface> lastGrvProxies;
state std::vector<ClientLeaderRegInterface> clientLeaderServers;
clientLeaderServers.reserve(coordinatorsSize);
for (const auto& h : cs.hostnames) {
clientLeaderServers.push_back(ClientLeaderRegInterface(h));
}
for (const auto& c : cs.coordinators()) {
clientLeaderServers.push_back(ClientLeaderRegInterface(c));
}
deterministicRandom()->randomShuffle(clientLeaderServers);
deterministicRandom()->randomShuffle(addrs);
loop {
state ClientLeaderRegInterface clientLeaderServer(addrs[index]);
state ClientLeaderRegInterface clientLeaderServer = clientLeaderServers[index];
state OpenDatabaseCoordRequest req;
coordinator->set(clientLeaderServer);
req.clusterKey = cs.clusterKey();
req.hostnames = cs.hostnames;
req.coordinators = cs.coordinators();
req.knownClientInfoID = clientInfo->get().id;
req.supportedVersions = supportedVersions->get();
@ -1040,8 +922,16 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
incorrectTime = Optional<double>();
}
state ErrorOr<CachedSerialization<ClientDBInfo>> rep =
wait(clientLeaderServer.openDatabase.tryGetReply(req, TaskPriority::CoordinationReply));
state ErrorOr<CachedSerialization<ClientDBInfo>> rep;
if (clientLeaderServer.hostname.present()) {
wait(store(rep,
tryGetReplyFromHostname(req,
clientLeaderServer.hostname.get(),
WLTOKEN_CLIENTLEADERREG_OPENDATABASE,
TaskPriority::CoordinationReply)));
} else {
wait(store(rep, clientLeaderServer.openDatabase.tryGetReply(req, TaskPriority::CoordinationReply)));
}
if (rep.present()) {
if (rep.get().read().forward.present()) {
TraceEvent("MonitorProxiesForwarding")
@ -1072,15 +962,10 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
successIndex = index;
} else {
TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator cant talk to cluster controller
if (rep.getError().code() == error_code_coordinators_changed) {
throw coordinators_changed();
}
index = (index + 1) % addrs.size();
TEST(rep.getError().code() == error_code_lookup_failed); // Coordinator hostname resolving failure
index = (index + 1) % coordinatorsSize;
if (index == successIndex) {
wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
// When the client fails talking to all coordinators, we throw coordinators_changed() and let the caller
// re-resolve the connection string and retry.
throw coordinators_changed();
}
}
}
@ -1092,11 +977,8 @@ ACTOR Future<Void> monitorProxies(
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions,
Key traceLogGroup) {
wait(connRecord->get()->resolveHostnames());
state MonitorLeaderInfo info(connRecord->get());
loop {
try {
wait(info.intermediateConnRecord->resolveHostnames());
choose {
when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration(
connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) {
@ -1107,13 +989,5 @@ ACTOR Future<Void> monitorProxies(
info.intermediateConnRecord = connRecord->get();
}
}
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorProxiesCoordinatorsChanged").suppressFor(1.0);
info.intermediateConnRecord->getConnectionString().resetToUnresolved();
} else {
throw e;
}
}
}
}

View File

@ -75,10 +75,10 @@ Future<Void> monitorLeader(Reference<IClusterConnectionRecord> const& connFile,
// nominees, the nominee with the most nomination is the leader, and collects client data from the leader. This function
// also monitors the change of the leader.
Future<Void> monitorLeaderAndGetClientInfo(Key const& clusterKey,
std::vector<Hostname> const& hostnames,
std::vector<NetworkAddress> const& coordinators,
ClientData* const& clientData,
Reference<AsyncVar<Optional<LeaderInfo>>> const& leaderInfo,
Reference<AsyncVar<Void>> const& coordinatorsChanged);
Reference<AsyncVar<Optional<LeaderInfo>>> const& leaderInfo);
Future<Void> monitorProxies(
Reference<AsyncVar<Reference<IClusterConnectionRecord>>> const& connRecord,

View File

@ -18,6 +18,10 @@
* limitations under the License.
*/
#ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h>
#endif
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GenericManagementAPI.actor.h"
@ -2763,6 +2767,11 @@ template <class T>
THREAD_FUNC runSingleAssignmentVarTest(void* arg) {
noUnseed = true;
// This test intentionally leaks memory
#ifdef ADDRESS_SANITIZER
__lsan::ScopedDisabler disableLeakChecks;
#endif
volatile bool* done = (volatile bool*)arg;
try {
for (int i = 0; i < 25; ++i) {

View File

@ -265,11 +265,11 @@ void DatabaseContext::getLatestCommitVersions(const Reference<LocationInfo>& loc
void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) {
MutexHolder mutex(p->mutexLock);
if (v >= p->grvCacheSpace.cachedReadVersion) {
TraceEvent(SevDebug, "CacheReadVersionUpdate")
.detail("Version", v)
.detail("CurTime", t)
.detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
.detail("LastTime", p->grvCacheSpace.lastGrvTime);
//TraceEvent(SevDebug, "CacheReadVersionUpdate")
// .detail("Version", v)
// .detail("CurTime", t)
// .detail("LastVersion", p->grvCacheSpace.cachedReadVersion)
// .detail("LastTime", p->grvCacheSpace.lastGrvTime);
p->grvCacheSpace.cachedReadVersion = v;
if (t > p->grvCacheSpace.lastGrvTime) {
p->grvCacheSpace.lastGrvTime = t;
@ -282,11 +282,11 @@ void DatabaseContext::updateCachedReadVersion(double t, Version v) {
return updateCachedReadVersionShared(t, v, sharedStatePtr);
}
if (v >= cachedReadVersion) {
TraceEvent(SevDebug, "CachedReadVersionUpdate")
.detail("Version", v)
.detail("GrvStartTime", t)
.detail("LastVersion", cachedReadVersion)
.detail("LastTime", lastGrvTime);
//TraceEvent(SevDebug, "CachedReadVersionUpdate")
// .detail("Version", v)
// .detail("GrvStartTime", t)
// .detail("LastVersion", cachedReadVersion)
// .detail("LastTime", lastGrvTime);
cachedReadVersion = v;
// Since the time is based on the start of the request, it's possible that we
// get a newer version with an older time.
@ -5100,10 +5100,10 @@ Future<Optional<Value>> Transaction::get(const Key& key, Snapshot snapshot) {
++trState->cx->transactionGetValueRequests;
// ASSERT (key < allKeys.end);
// There are no keys in the database with size greater than KEY_SIZE_LIMIT
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
// There are no keys in the database with size greater than the max key size
if (key.size() > getMaxReadKeySize(key)) {
return Optional<Value>();
}
auto ver = getReadVersion();
@ -5484,23 +5484,19 @@ Future<Void> Transaction::getRangeStream(const PromiseStream<RangeResult>& resul
void Transaction::addReadConflictRange(KeyRangeRef const& keys) {
ASSERT(!keys.empty());
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// There aren't any keys in the database with size larger than the max key size, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
KeyRef begin = keys.begin;
KeyRef end = keys.end;
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
int64_t beginMaxSize = getMaxReadKeySize(begin);
int64_t endMaxSize = getMaxReadKeySize(end);
if (begin.size() > beginMaxSize) {
begin = begin.substr(0, beginMaxSize + 1);
}
if (end.size() > endMaxSize) {
end = end.substr(0, endMaxSize + 1);
}
KeyRangeRef r = KeyRangeRef(begin, end);
@ -5522,8 +5518,7 @@ void Transaction::makeSelfConflicting() {
void Transaction::set(const KeyRef& key, const ValueRef& value, AddConflictRange addConflictRange) {
++trState->cx->transactionSetMutations;
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (key.size() > getMaxWriteKeySize(key, trState->options.rawAccess))
throw key_too_large();
if (value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)
throw value_too_large();
@ -5544,8 +5539,7 @@ void Transaction::atomicOp(const KeyRef& key,
MutationRef::Type operationType,
AddConflictRange addConflictRange) {
++trState->cx->transactionAtomicMutations;
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (key.size() > getMaxWriteKeySize(key, trState->options.rawAccess))
throw key_too_large();
if (operand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)
throw value_too_large();
@ -5578,20 +5572,16 @@ void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRa
KeyRef begin = range.begin;
KeyRef end = range.end;
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// There aren't any keys in the database with size larger than the max key size, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
int64_t beginMaxSize = getMaxClearKeySize(begin);
int64_t endMaxSize = getMaxClearKeySize(end);
if (begin.size() > beginMaxSize) {
begin = begin.substr(0, beginMaxSize + 1);
}
if (end.size() > endMaxSize) {
end = end.substr(0, endMaxSize + 1);
}
auto r = KeyRangeRef(req.arena, KeyRangeRef(begin, end));
if (r.empty())
@ -5604,10 +5594,10 @@ void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRa
}
void Transaction::clear(const KeyRef& key, AddConflictRange addConflictRange) {
++trState->cx->transactionClearMutations;
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
// There aren't any keys in the database with size larger than the max key size
if (key.size() > getMaxClearKeySize(key)) {
return;
}
auto& req = tr;
auto& t = req.transaction;
@ -5626,24 +5616,19 @@ void Transaction::addWriteConflictRange(const KeyRangeRef& keys) {
auto& req = tr;
auto& t = req.transaction;
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// There aren't any keys in the database with size larger than the max key size, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
KeyRef begin = keys.begin;
KeyRef end = keys.end;
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
int64_t beginMaxSize = getMaxKeySize(begin);
int64_t endMaxSize = getMaxKeySize(end);
if (begin.size() > beginMaxSize) {
begin = begin.substr(0, beginMaxSize + 1);
}
if (end.size() > endMaxSize) {
end = end.substr(0, endMaxSize + 1);
}
KeyRangeRef r = KeyRangeRef(begin, end);
if (r.empty()) {
@ -6942,11 +6927,18 @@ Future<Standalone<StringRef>> Transaction::getVersionstamp() {
}
// Gets the protocol version reported by a coordinator via the protocol info interface
ACTOR Future<ProtocolVersion> getCoordinatorProtocol(NetworkAddressList coordinatorAddresses) {
RequestStream<ProtocolInfoRequest> requestStream{ Endpoint::wellKnown({ coordinatorAddresses },
WLTOKEN_PROTOCOL_INFO) };
ProtocolInfoReply reply = wait(retryBrokenPromise(requestStream, ProtocolInfoRequest{}));
ACTOR Future<ProtocolVersion> getCoordinatorProtocol(
Reference<AsyncVar<Optional<ClientLeaderRegInterface>> const> coordinator) {
state ProtocolInfoReply reply;
if (coordinator->get().get().hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(
ProtocolInfoRequest{}, coordinator->get().get().hostname.get(), WLTOKEN_PROTOCOL_INFO)));
} else {
RequestStream<ProtocolInfoRequest> requestStream(
Endpoint::wellKnown({ coordinator->get().get().getLeader.getEndpoint().addresses }, WLTOKEN_PROTOCOL_INFO));
wait(store(reply, retryBrokenPromise(requestStream, ProtocolInfoRequest{})));
}
return reply.version;
}
@ -6955,8 +6947,16 @@ ACTOR Future<ProtocolVersion> getCoordinatorProtocol(NetworkAddressList coordina
// function will return with an unset result.
// If an expected version is given, this future won't return if the actual protocol version matches the expected version
ACTOR Future<Optional<ProtocolVersion>> getCoordinatorProtocolFromConnectPacket(
NetworkAddress coordinatorAddress,
Reference<AsyncVar<Optional<ClientLeaderRegInterface>> const> coordinator,
Optional<ProtocolVersion> expectedVersion) {
state NetworkAddress coordinatorAddress;
if (coordinator->get().get().hostname.present()) {
Hostname h = coordinator->get().get().hostname.get();
wait(store(coordinatorAddress, h.resolveWithRetry()));
} else {
coordinatorAddress = coordinator->get().get().getLeader.getEndpoint().getPrimaryAddress();
}
state Reference<AsyncVar<Optional<ProtocolVersion>> const> protocolVersion =
FlowTransport::transport().getPeerProtocolAsyncVar(coordinatorAddress);
@ -6991,11 +6991,10 @@ ACTOR Future<ProtocolVersion> getClusterProtocolImpl(
if (!coordinator->get().present()) {
wait(coordinator->onChange());
} else {
Endpoint coordinatorEndpoint = coordinator->get().get().getLeader.getEndpoint();
if (needToConnect) {
// Even though we typically rely on the connect packet to get the protocol version, we need to send some
// request in order to start a connection. This protocol version request serves that purpose.
protocolVersion = getCoordinatorProtocol(coordinatorEndpoint.addresses);
protocolVersion = getCoordinatorProtocol(coordinator);
needToConnect = false;
}
choose {
@ -7011,8 +7010,8 @@ ACTOR Future<ProtocolVersion> getClusterProtocolImpl(
// Older versions of FDB don't have an endpoint to return the protocol version, so we get this info from
// the connect packet
when(Optional<ProtocolVersion> pv = wait(getCoordinatorProtocolFromConnectPacket(
coordinatorEndpoint.getPrimaryAddress(), expectedVersion))) {
when(Optional<ProtocolVersion> pv =
wait(getCoordinatorProtocolFromConnectPacket(coordinator, expectedVersion))) {
if (pv.present()) {
return pv.get();
} else {
@ -8186,15 +8185,21 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion
throw;
}
TraceEvent("ExclusionSafetyCheckCoordinators").log();
wait(cx->getConnectionRecord()->resolveHostnames());
state ClientCoordinators coordinatorList(cx->getConnectionRecord());
state std::vector<Future<Optional<LeaderInfo>>> leaderServers;
leaderServers.reserve(coordinatorList.clientLeaderServers.size());
for (int i = 0; i < coordinatorList.clientLeaderServers.size(); i++) {
if (coordinatorList.clientLeaderServers[i].hostname.present()) {
leaderServers.push_back(retryGetReplyFromHostname(GetLeaderRequest(coordinatorList.clusterKey, UID()),
coordinatorList.clientLeaderServers[i].hostname.get(),
WLTOKEN_CLIENTLEADERREG_GETLEADER,
TaskPriority::CoordinationReply));
} else {
leaderServers.push_back(retryBrokenPromise(coordinatorList.clientLeaderServers[i].getLeader,
GetLeaderRequest(coordinatorList.clusterKey, UID()),
TaskPriority::CoordinationReply));
}
}
// Wait for quorum so we don't dismiss live coordinators as unreachable by acting too fast
choose {
when(wait(smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.0))) {}
@ -9395,3 +9400,21 @@ ACTOR Future<Void> waitPurgeGranulesCompleteActor(Reference<DatabaseContext> db,
Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
int64_t getMaxKeySize(KeyRef const& key) {
return getMaxWriteKeySize(key, true);
}
int64_t getMaxReadKeySize(KeyRef const& key) {
return getMaxKeySize(key);
}
int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess) {
int64_t tenantSize = hasRawAccess ? CLIENT_KNOBS->TENANT_PREFIX_SIZE_LIMIT : 0;
return key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT + tenantSize;
}
int64_t getMaxClearKeySize(KeyRef const& key) {
return getMaxKeySize(key);
}

View File

@ -539,5 +539,19 @@ ACTOR Future<std::vector<std::pair<UID, StorageWiggleValue>>> readStorageWiggleV
bool primary,
bool use_system_priority);
// Returns the maximum legal size of a key. This size will be determined by the prefix of the passed in key
// (system keys have a larger maximum size). This should be used for generic max key size requests.
int64_t getMaxKeySize(KeyRef const& key);
// Returns the maximum legal size of a key that can be read. Keys larger than this will be assumed not to exist.
int64_t getMaxReadKeySize(KeyRef const& key);
// Returns the maximum legal size of a key that can be written. If using raw access, writes to normal keys will
// be allowed to be slighly larger to accommodate the prefix.
int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess);
// Returns the maximum legal size of a key that can be cleared. Keys larger than this will be assumed not to exist.
int64_t getMaxClearKeySize(KeyRef const& key);
#include "flow/unactorcompiler.h"
#endif

View File

@ -59,8 +59,14 @@ class CommitQuorum {
ConfigGeneration generation,
ConfigTransactionInterface cti) {
try {
if (cti.hostname.present()) {
wait(timeoutError(retryGetReplyFromHostname(
self->getCommitRequest(generation), cti.hostname.get(), WLTOKEN_CONFIGTXN_COMMIT),
CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT));
} else {
wait(timeoutError(cti.commit.getReply(self->getCommitRequest(generation)),
CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT));
}
++self->successful;
} catch (Error& e) {
// self might be destroyed if this actor is cancelled
@ -122,9 +128,20 @@ class GetGenerationQuorum {
ACTOR static Future<Void> addRequestActor(GetGenerationQuorum* self, ConfigTransactionInterface cti) {
loop {
try {
ConfigTransactionGetGenerationReply reply = wait(timeoutError(
cti.getGeneration.getReply(ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion }),
state ConfigTransactionGetGenerationReply reply;
if (cti.hostname.present()) {
wait(timeoutError(store(reply,
retryGetReplyFromHostname(
ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion },
cti.hostname.get(),
WLTOKEN_CONFIGTXN_GETGENERATION)),
CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT));
} else {
wait(timeoutError(store(reply,
cti.getGeneration.getReply(
ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion })),
CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT));
}
++self->totalRepliesReceived;
auto gen = reply.generation;
@ -225,9 +242,18 @@ class PaxosConfigTransactionImpl {
state ConfigKey configKey = ConfigKey::decodeKey(key);
loop {
try {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
state Reference<ConfigTransactionInfo> configNodes(
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas()));
state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
state std::vector<ConfigTransactionInterface> readReplicas =
self->getGenerationQuorum.getReadReplicas();
std::vector<Future<Void>> fs;
for (ConfigTransactionInterface& readReplica : readReplicas) {
if (readReplica.hostname.present()) {
fs.push_back(tryInitializeRequestStream(
&readReplica.get, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GET));
}
}
wait(waitForAll(fs));
state Reference<ConfigTransactionInfo> configNodes(new ConfigTransactionInfo(readReplicas));
ConfigTransactionGetReply reply =
wait(timeoutError(basicLoadBalance(configNodes,
&ConfigTransactionInterface::get,
@ -248,9 +274,17 @@ class PaxosConfigTransactionImpl {
}
ACTOR static Future<RangeResult> getConfigClasses(PaxosConfigTransactionImpl* self) {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
state Reference<ConfigTransactionInfo> configNodes(
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas()));
state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
state std::vector<ConfigTransactionInterface> readReplicas = self->getGenerationQuorum.getReadReplicas();
std::vector<Future<Void>> fs;
for (ConfigTransactionInterface& readReplica : readReplicas) {
if (readReplica.hostname.present()) {
fs.push_back(tryInitializeRequestStream(
&readReplica.getClasses, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES));
}
}
wait(waitForAll(fs));
state Reference<ConfigTransactionInfo> configNodes(new ConfigTransactionInfo(readReplicas));
ConfigTransactionGetConfigClassesReply reply =
wait(basicLoadBalance(configNodes,
&ConfigTransactionInterface::getClasses,
@ -264,9 +298,17 @@ class PaxosConfigTransactionImpl {
}
ACTOR static Future<RangeResult> getKnobs(PaxosConfigTransactionImpl* self, Optional<Key> configClass) {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
state Reference<ConfigTransactionInfo> configNodes(
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas()));
state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
state std::vector<ConfigTransactionInterface> readReplicas = self->getGenerationQuorum.getReadReplicas();
std::vector<Future<Void>> fs;
for (ConfigTransactionInterface& readReplica : readReplicas) {
if (readReplica.hostname.present()) {
fs.push_back(tryInitializeRequestStream(
&readReplica.getKnobs, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS));
}
}
wait(waitForAll(fs));
state Reference<ConfigTransactionInfo> configNodes(new ConfigTransactionInfo(readReplicas));
ConfigTransactionGetKnobsReply reply =
wait(basicLoadBalance(configNodes,
&ConfigTransactionInterface::getKnobs,
@ -366,10 +408,13 @@ public:
Future<Void> commit() { return commit(this); }
PaxosConfigTransactionImpl(Database const& cx) : cx(cx) {
auto coordinators = cx->getConnectionRecord()->getConnectionString().coordinators();
ctis.reserve(coordinators.size());
for (const auto& coordinator : coordinators) {
ctis.emplace_back(coordinator);
const ClusterConnectionString& cs = cx->getConnectionRecord()->getConnectionString();
ctis.reserve(cs.hostnames.size() + cs.coordinators().size());
for (const auto& h : cs.hostnames) {
ctis.emplace_back(h);
}
for (const auto& c : cs.coordinators()) {
ctis.emplace_back(c);
}
getGenerationQuorum = GetGenerationQuorum{ ctis };
commitQuorum = CommitQuorum{ ctis };

View File

@ -19,6 +19,7 @@
*/
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/SpecialKeySpace.actor.h"
@ -1578,10 +1579,10 @@ Future<Optional<Value>> ReadYourWritesTransaction::get(const Key& key, Snapshot
if (key >= getMaxReadKey() && key != metadataVersionKey)
return key_outside_legal_range();
// There are no keys in the database with size greater than KEY_SIZE_LIMIT
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
// There are no keys in the database with size greater than the max key size
if (key.size() > getMaxReadKeySize(key)) {
return Optional<Value>();
}
Future<Optional<Value>> result = RYWImpl::readWithConflictRange(this, RYWImpl::GetValueReq(key), snapshot);
reading.add(success(result));
@ -1822,23 +1823,19 @@ void ReadYourWritesTransaction::addReadConflictRange(KeyRangeRef const& keys) {
}
}
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// There aren't any keys in the database with size larger than max key size, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
KeyRef begin = keys.begin;
KeyRef end = keys.end;
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
int64_t beginMaxSize = getMaxReadKeySize(begin);
int64_t endMaxSize = getMaxReadKeySize(end);
if (begin.size() > beginMaxSize) {
begin = begin.substr(0, beginMaxSize + 1);
}
if (end.size() > endMaxSize) {
end = end.substr(0, endMaxSize + 1);
}
KeyRangeRef r = KeyRangeRef(begin, end);
@ -2111,9 +2108,9 @@ void ReadYourWritesTransaction::atomicOp(const KeyRef& key, const ValueRef& oper
if (!isValidMutationType(operationType) || !isAtomicOp((MutationRef::Type)operationType))
throw invalid_mutation_type();
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (key.size() > getMaxWriteKeySize(key, getTransactionState()->options.rawAccess)) {
throw key_too_large();
}
if (operand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)
throw value_too_large();
@ -2218,9 +2215,9 @@ void ReadYourWritesTransaction::set(const KeyRef& key, const ValueRef& value) {
}
// TODO: check transaction size here
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (key.size() > getMaxWriteKeySize(key, getTransactionState()->options.rawAccess)) {
throw key_too_large();
}
if (value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)
throw value_too_large();
@ -2254,23 +2251,19 @@ void ReadYourWritesTransaction::clear(const KeyRangeRef& range) {
return tr.clear(range, addWriteConflict);
}
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// There aren't any keys in the database with size larger than the max key size, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
KeyRef begin = range.begin;
KeyRef end = range.end;
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
int64_t beginMaxSize = getMaxClearKeySize(begin);
int64_t endMaxSize = getMaxClearKeySize(end);
if (begin.size() > beginMaxSize) {
begin = begin.substr(0, beginMaxSize + 1);
}
if (end.size() > endMaxSize) {
end = end.substr(0, endMaxSize + 1);
}
KeyRangeRef r = KeyRangeRef(begin, end);
@ -2300,9 +2293,9 @@ void ReadYourWritesTransaction::clear(const KeyRef& key) {
if (key >= getMaxWriteKey())
throw key_outside_legal_range();
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (key.size() > getMaxClearKeySize(key)) {
return;
}
if (options.readYourWritesDisabled) {
return tr.clear(key, addWriteConflict);
@ -2332,9 +2325,9 @@ Future<Void> ReadYourWritesTransaction::watch(const Key& key) {
if (key >= allKeys.end || (key >= getMaxReadKey() && key != metadataVersionKey && tr.apiVersionAtLeast(300)))
return key_outside_legal_range();
if (key.size() >
(key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (key.size() > getMaxWriteKeySize(key, getTransactionState()->options.rawAccess)) {
return key_too_large();
}
return RYWImpl::watch(this, key);
}
@ -2350,23 +2343,19 @@ void ReadYourWritesTransaction::addWriteConflictRange(KeyRangeRef const& keys) {
}
}
// There aren't any keys in the database with size larger than KEY_SIZE_LIMIT, so if range contains large keys
// There aren't any keys in the database with size larger than the max key size, so if range contains large keys
// we can translate it to an equivalent one with smaller keys
KeyRef begin = keys.begin;
KeyRef end = keys.end;
if (begin.size() >
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
begin = begin.substr(
0,
(begin.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
if (end.size() >
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT))
end = end.substr(
0,
(end.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1);
int64_t beginMaxSize = getMaxKeySize(begin);
int64_t endMaxSize = getMaxKeySize(end);
if (begin.size() > beginMaxSize) {
begin = begin.substr(0, beginMaxSize + 1);
}
if (end.size() > endMaxSize) {
end = end.substr(0, endMaxSize + 1);
}
KeyRangeRef r = KeyRangeRef(begin, end);

View File

@ -25,9 +25,15 @@
#include "fdbclient/sha1/SHA1.h"
#include <time.h>
#include <iomanip>
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/sha.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
#if defined(HAVE_WOLFSSL)
#undef SHA1 // wolfSSL will shadow FDB's SHA1.h
#endif
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string.hpp>

View File

@ -26,7 +26,7 @@
#include "flow/Net2Packet.h"
#include "fdbclient/Knobs.h"
#include "fdbrpc/IRateControl.h"
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "fdbclient/JSONDoc.h"
// Representation of all the things you need to connect to a blob store instance with some credentials.

View File

@ -450,6 +450,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_COMMIT_UPDATES, 2000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1;
init( MAX_PROXY_COMPUTE, 2.0 );
init( MAX_COMPUTE_PER_OPERATION, 0.1 );
init( MAX_COMPUTE_DURATION_LOG_CUTOFF, 0.05 );
init( PROXY_COMPUTE_BUCKETS, 20000 );
init( PROXY_COMPUTE_GROWTH_RATE, 0.01 );
init( TXN_STATE_SEND_AMOUNT, 4 );
@ -541,6 +542,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( CC_ENABLE_ENTIRE_SATELLITE_MONITORING, false );
init( CC_SATELLITE_DEGRADATION_MIN_COMPLAINER, 3 );
init( CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER, 3 );
init( CC_THROTTLE_SINGLETON_RERECRUIT_INTERVAL, 0.5 );
init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;
init( EXPECTED_MASTER_FITNESS, ProcessClass::UnsetFit );

View File

@ -374,6 +374,7 @@ public:
int MAX_COMMIT_UPDATES;
double MAX_PROXY_COMPUTE;
double MAX_COMPUTE_PER_OPERATION;
double MAX_COMPUTE_DURATION_LOG_CUTOFF;
int PROXY_COMPUTE_BUCKETS;
double PROXY_COMPUTE_GROWTH_RATE;
int TXN_STATE_SEND_AMOUNT;
@ -480,6 +481,8 @@ public:
// be determined as degraded worker.
int CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER; // The minimum amount of degraded server in satellite DC to be
// determined as degraded satellite.
double CC_THROTTLE_SINGLETON_RERECRUIT_INTERVAL; // The interval to prevent re-recruiting the same singleton if a
// recruiting fight between two cluster controllers occurs.
// Knobs used to select the best policy (via monte carlo)
int POLICY_RATING_TESTS; // number of tests per policy (in order to compare)

View File

@ -41,9 +41,15 @@ class SimpleConfigTransactionImpl {
if (self->dID.present()) {
TraceEvent("SimpleConfigTransactionGettingReadVersion", self->dID.get());
}
ConfigTransactionGetGenerationRequest req;
ConfigTransactionGetGenerationReply reply =
wait(retryBrokenPromise(self->cti.getGeneration, ConfigTransactionGetGenerationRequest{}));
state ConfigTransactionGetGenerationReply reply;
if (self->cti.hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(ConfigTransactionGetGenerationRequest{},
self->cti.hostname.get(),
WLTOKEN_CONFIGTXN_GETGENERATION)));
} else {
wait(store(reply, retryBrokenPromise(self->cti.getGeneration, ConfigTransactionGetGenerationRequest{})));
}
if (self->dID.present()) {
TraceEvent("SimpleConfigTransactionGotReadVersion", self->dID.get())
.detail("Version", reply.generation.liveVersion);
@ -62,8 +68,15 @@ class SimpleConfigTransactionImpl {
.detail("ConfigClass", configKey.configClass)
.detail("KnobName", configKey.knobName);
}
ConfigTransactionGetReply reply =
wait(retryBrokenPromise(self->cti.get, ConfigTransactionGetRequest{ generation, configKey }));
state ConfigTransactionGetReply reply;
if (self->cti.hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(ConfigTransactionGetRequest{ generation, configKey },
self->cti.hostname.get(),
WLTOKEN_CONFIGTXN_GET)));
} else {
wait(store(reply, retryBrokenPromise(self->cti.get, ConfigTransactionGetRequest{ generation, configKey })));
}
if (self->dID.present()) {
TraceEvent("SimpleConfigTransactionGotValue", self->dID.get())
.detail("Value", reply.value.get().toString());
@ -80,8 +93,17 @@ class SimpleConfigTransactionImpl {
self->getGenerationFuture = getGeneration(self);
}
ConfigGeneration generation = wait(self->getGenerationFuture);
ConfigTransactionGetConfigClassesReply reply =
wait(retryBrokenPromise(self->cti.getClasses, ConfigTransactionGetConfigClassesRequest{ generation }));
state ConfigTransactionGetConfigClassesReply reply;
if (self->cti.hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(ConfigTransactionGetConfigClassesRequest{ generation },
self->cti.hostname.get(),
WLTOKEN_CONFIGTXN_GETCLASSES)));
} else {
wait(store(
reply,
retryBrokenPromise(self->cti.getClasses, ConfigTransactionGetConfigClassesRequest{ generation })));
}
RangeResult result;
for (const auto& configClass : reply.configClasses) {
result.push_back_deep(result.arena(), KeyValueRef(configClass, ""_sr));
@ -94,8 +116,17 @@ class SimpleConfigTransactionImpl {
self->getGenerationFuture = getGeneration(self);
}
ConfigGeneration generation = wait(self->getGenerationFuture);
ConfigTransactionGetKnobsReply reply =
wait(retryBrokenPromise(self->cti.getKnobs, ConfigTransactionGetKnobsRequest{ generation, configClass }));
state ConfigTransactionGetKnobsReply reply;
if (self->cti.hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(ConfigTransactionGetKnobsRequest{ generation, configClass },
self->cti.hostname.get(),
WLTOKEN_CONFIGTXN_GETKNOBS)));
} else {
wait(store(
reply,
retryBrokenPromise(self->cti.getKnobs, ConfigTransactionGetKnobsRequest{ generation, configClass })));
}
RangeResult result;
for (const auto& knobName : reply.knobNames) {
result.push_back_deep(result.arena(), KeyValueRef(knobName, ""_sr));
@ -109,7 +140,11 @@ class SimpleConfigTransactionImpl {
}
wait(store(self->toCommit.generation, self->getGenerationFuture));
self->toCommit.annotation.timestamp = now();
if (self->cti.hostname.present()) {
wait(retryGetReplyFromHostname(self->toCommit, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_COMMIT));
} else {
wait(retryBrokenPromise(self->cti.commit, self->toCommit));
}
self->committed = true;
return Void();
}
@ -126,9 +161,14 @@ class SimpleConfigTransactionImpl {
public:
SimpleConfigTransactionImpl(Database const& cx) : cx(cx) {
auto coordinators = cx->getConnectionRecord()->getConnectionString().coordinators();
const ClusterConnectionString& cs = cx->getConnectionRecord()->getConnectionString();
if (cs.coordinators().size()) {
std::vector<NetworkAddress> coordinators = cs.coordinators();
std::sort(coordinators.begin(), coordinators.end());
cti = ConfigTransactionInterface(coordinators[0]);
} else {
cti = ConfigTransactionInterface(cs.hostnames[0]);
}
}
SimpleConfigTransactionImpl(ConfigTransactionInterface const& cti) : cti(cti) {}

View File

@ -1644,13 +1644,10 @@ void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key
CoordinatorsImpl::CoordinatorsImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> CoordinatorsImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
ACTOR Future<RangeResult> coordinatorsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
state ClusterConnectionString cs = ryw->getDatabase()->getConnectionRecord()->getConnectionString();
state std::vector<NetworkAddress> coordinator_processes = wait(cs.tryResolveHostnames());
RangeResult result;
KeyRef prefix(getKeyRange().begin);
auto cs = ryw->getDatabase()->getConnectionRecord()->getConnectionString();
auto coordinator_processes = cs.coordinators();
Key cluster_decription_key = prefix.withSuffix(LiteralStringRef("cluster_description"));
if (kr.contains(cluster_decription_key)) {
result.push_back_deep(result.arena(), KeyValueRef(cluster_decription_key, cs.clusterKeyName()));
@ -1673,10 +1670,16 @@ Future<RangeResult> CoordinatorsImpl::getRange(ReadYourWritesTransaction* ryw,
return rywGetRange(ryw, kr, result);
}
Future<RangeResult> CoordinatorsImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
KeyRef prefix(getKeyRange().begin);
return coordinatorsGetRangeActor(ryw, prefix, kr);
}
ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
state Reference<IQuorumChange> change;
state ClusterConnectionString
conn; // We don't care about the Key here, it will be overrode in changeQuorumChecker().
state ClusterConnectionString conn; // We don't care about the Key here.
state std::vector<std::string> process_address_or_hostname_strs;
state Optional<std::string> msg;
state int index;
@ -1700,7 +1703,6 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
try {
if (Hostname::isHostname(process_address_or_hostname_strs[index])) {
conn.hostnames.push_back(Hostname::parse(process_address_or_hostname_strs[index]));
conn.status = ClusterConnectionString::ConnectionStringStatus::UNRESOLVED;
} else {
NetworkAddress a = NetworkAddress::parse(process_address_or_hostname_strs[index]);
if (!a.isValid()) {
@ -1717,18 +1719,19 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
if (parse_error) {
std::string error = "ERROR: \'" + process_address_or_hostname_strs[index] +
"\' is not a valid network endpoint address\n";
if (process_address_or_hostname_strs[index].find(":tls") != std::string::npos)
error += " Do not include the `:tls' suffix when naming a process\n";
return ManagementAPIError::toJsonString(false, "coordinators", error);
}
}
}
wait(conn.resolveHostnames());
if (conn.coordinators().size())
change = specifiedQuorumChange(conn.coordinators());
else
std::vector<NetworkAddress> addressesVec = wait(conn.tryResolveHostnames());
if (addressesVec.size() != conn.hostnames.size() + conn.coordinators().size()) {
return ManagementAPIError::toJsonString(false, "coordinators", "One or more hostnames are not resolvable.");
} else if (addressesVec.size()) {
change = specifiedQuorumChange(addressesVec);
} else {
change = noQuorumChange();
}
// check update for cluster_description
Key cluster_decription_key = LiteralStringRef("cluster_description").withPrefix(kr.begin);
@ -1740,19 +1743,18 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
change = nameQuorumChange(entry.second.get().toString(), change);
} else {
// throw the error
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "coordinators", "Cluster description must match [A-Za-z0-9_]+"));
return ManagementAPIError::toJsonString(
false, "coordinators", "Cluster description must match [A-Za-z0-9_]+");
}
}
ASSERT(change.isValid());
TraceEvent(SevDebug, "SKSChangeCoordinatorsStart")
.detail("NewHostnames", conn.hostnames.size() ? describe(conn.hostnames) : "N/A")
.detail("NewAddresses", describe(conn.coordinators()))
.detail("NewAddresses", describe(addressesVec))
.detail("Description", entry.first ? entry.second.get().toString() : "");
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &conn));
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, addressesVec));
TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish")
.detail("Result", r.present() ? static_cast<int>(r.get()) : -1); // -1 means success
@ -1804,9 +1806,10 @@ ACTOR static Future<RangeResult> CoordinatorsAutoImplActor(ReadYourWritesTransac
state ClusterConnectionString old(currentKey.get().toString());
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
std::vector<NetworkAddress> oldCoordinators = wait(old.tryResolveHostnames());
std::vector<NetworkAddress> _desiredCoordinators = wait(autoQuorumChange()->getDesiredCoordinators(
&tr,
old.coordinators(),
oldCoordinators,
Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(old)),
result));

View File

@ -307,24 +307,36 @@ ACTOR Future<Optional<StatusObject>> clientCoordinatorsStatusFetcher(Reference<I
bool* quorum_reachable,
int* coordinatorsFaultTolerance) {
try {
wait(connRecord->resolveHostnames());
state ClientCoordinators coord(connRecord);
state StatusObject statusObj;
state std::vector<Future<Optional<LeaderInfo>>> leaderServers;
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
for (int i = 0; i < coord.clientLeaderServers.size(); i++) {
if (coord.clientLeaderServers[i].hostname.present()) {
leaderServers.push_back(retryGetReplyFromHostname(GetLeaderRequest(coord.clusterKey, UID()),
coord.clientLeaderServers[i].hostname.get(),
WLTOKEN_CLIENTLEADERREG_GETLEADER,
TaskPriority::CoordinationReply));
} else {
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
GetLeaderRequest(coord.clusterKey, UID()),
TaskPriority::CoordinationReply));
}
}
state std::vector<Future<ProtocolInfoReply>> coordProtocols;
coordProtocols.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++) {
if (coord.clientLeaderServers[i].hostname.present()) {
coordProtocols.push_back(retryGetReplyFromHostname(
ProtocolInfoRequest{}, coord.clientLeaderServers[i].hostname.get(), WLTOKEN_PROTOCOL_INFO));
} else {
RequestStream<ProtocolInfoRequest> requestStream{ Endpoint::wellKnown(
{ coord.clientLeaderServers[i].getLeader.getEndpoint().addresses }, WLTOKEN_PROTOCOL_INFO) };
coordProtocols.push_back(retryBrokenPromise(requestStream, ProtocolInfoRequest{}));
}
}
wait(smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.5) &&
smartQuorum(coordProtocols, coordProtocols.size() / 2 + 1, 1.5) ||
@ -337,8 +349,12 @@ ACTOR Future<Optional<StatusObject>> clientCoordinatorsStatusFetcher(Reference<I
int coordinatorsUnavailable = 0;
for (int i = 0; i < leaderServers.size(); i++) {
StatusObject coordStatus;
if (coord.clientLeaderServers[i].hostname.present()) {
coordStatus["address"] = coord.clientLeaderServers[i].hostname.get().toString();
} else {
coordStatus["address"] =
coord.clientLeaderServers[i].getLeader.getEndpoint().getPrimaryAddress().toString();
}
if (leaderServers[i].isReady()) {
coordStatus["reachable"] = true;

View File

@ -48,6 +48,8 @@ struct TenantMapEntry {
int64_t id;
Key prefix;
constexpr static int ROOT_PREFIX_SIZE = sizeof(id);
private:
void initPrefix(KeyRef subspace) {
ASSERT(id >= 0);

View File

@ -24,6 +24,9 @@
*/
#if defined(HAVE_OPENSSL) && !defined(TLS_DISABLED)
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/md5.h>
#elif !defined(_MD5_H)
#define _MD5_H

View File

@ -15,6 +15,7 @@ set(FDBRPC_SRCS
genericactors.actor.h
genericactors.actor.cpp
HealthMonitor.actor.cpp
HTTP.actor.cpp
IAsyncFile.actor.cpp
IPAllowList.cpp
LoadBalance.actor.cpp
@ -28,6 +29,10 @@ set(FDBRPC_SRCS
ReplicationPolicy.cpp
ReplicationTypes.cpp
ReplicationUtils.cpp
RESTClient.h
RESTClient.actor.cpp
RESTUtils.h
RESTUtils.actor.cpp
SimExternalConnection.actor.cpp
SimExternalConnection.h
Stats.actor.cpp

View File

@ -18,10 +18,12 @@
* limitations under the License.
*/
#include "fdbclient/HTTP.h"
#include "fdbrpc/HTTP.h"
#include "fdbclient/md5/md5.h"
#include "fdbclient/libb64/encode.h"
#include <cctype>
#include "flow/actorcompiler.h" // has to be last include
namespace HTTP {

View File

@ -18,6 +18,11 @@
* limitations under the License.
*/
#ifndef FDBRPC_HTTP_H
#define FDBRPC_HTTP_H
#pragma once
#include "flow/flow.h"
#include "flow/Net2Packet.h"
#include "fdbrpc/IRateControl.h"
@ -63,4 +68,27 @@ Future<Reference<Response>> doRequest(Reference<IConnection> const& conn,
int64_t* const& pSent,
Reference<IRateControl> const& recvRate,
const std::string& requestHeader = std::string());
constexpr int HTTP_STATUS_CODE_OK = 200;
constexpr int HTTP_STATUS_CODE_CREATED = 201;
constexpr int HTTP_STATUS_CODE_ACCEPTED = 202;
constexpr int HTTP_STATUS_CODE_NO_CONTENT = 204;
constexpr int HTTP_STATUS_CODE_UNAUTHORIZED = 401;
constexpr int HTTP_STATUS_CODE_NOT_ACCEPTABLE = 406;
constexpr int HTTP_STATUS_CODE_TOO_MANY_REQUESTS = 429;
constexpr int HTTP_STATUS_CODE_INTERNAL_SERVER_ERROR = 500;
constexpr int HTTP_STATUS_CODE_BAD_GATEWAY = 502;
constexpr int HTTP_STATUS_CODE_SERVICE_UNAVAILABLE = 503;
constexpr int HTTP_RETRYAFTER_DELAY_SECS = 300;
const std::string HTTP_VERB_GET = "GET";
const std::string HTTP_VERB_HEAD = "HEAD";
const std::string HTTP_VERB_DELETE = "DELETE";
const std::string HTTP_VERB_TRACE = "TRACE";
const std::string HTTP_VERB_PUT = "PUT";
const std::string HTTP_VERB_POST = "POST";
} // namespace HTTP
#endif

363
fdbrpc/RESTClient.actor.cpp Normal file
View File

@ -0,0 +1,363 @@
/*
* RESTClient.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbrpc/RESTClient.h"
#include "fdbrpc/HTTP.h"
#include "fdbrpc/IRateControl.h"
#include "fdbrpc/RESTUtils.h"
#include "flow/Arena.h"
#include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/Knobs.h"
#include "flow/Net2Packet.h"
#include "flow/flow.h"
#include "flow/network.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include <memory>
#include <unordered_map>
#include "flow/actorcompiler.h" // always the last include
// Render this statistics snapshot as a JSON object for status reporting.
json_spirit::mObject RESTClient::Stats::getJSON() {
	json_spirit::mObject obj;
	obj["host_service"] = host_service;
	obj["requests_failed"] = requests_failed;
	obj["requests_successful"] = requests_successful;
	obj["bytes_sent"] = bytes_sent;
	return obj;
}
// Element-wise difference of two snapshots for the same host/service key;
// the result carries this snapshot's host_service label.
RESTClient::Stats RESTClient::Stats::operator-(const Stats& rhs) {
	Stats delta(host_service);
	delta.requests_failed = requests_failed - rhs.requests_failed;
	delta.requests_successful = requests_successful - rhs.requests_successful;
	delta.bytes_sent = bytes_sent - rhs.bytes_sent;
	return delta;
}
RESTClient::RESTClient() {}
// Construct a client, overriding default knobs with the supplied name -> value map.
// NOTE(review): takes a non-const reference while setKnobs takes const& — confirm
// the asymmetry is intentional (signature is fixed by the header).
RESTClient::RESTClient(std::unordered_map<std::string, int>& knobSettings) {
	knobs.set(knobSettings);
}
// Replace the client's knob settings with the supplied name -> value map.
void RESTClient::setKnobs(const std::unordered_map<std::string, int>& knobSettings) {
	knobs.set(knobSettings);
}
// Return the current knob settings as a name -> value map.
std::unordered_map<std::string, int> RESTClient::getKnobs() const {
	return knobs.get();
}
// Issue one logical HTTP request (with internal retries) against the endpoint in
// 'url', using a pooled connection from 'client'. Returns the response when its
// status code is in 'successCodes'. Retryable failures (connection errors, 500,
// 502, 503, 429) are retried up to min(request_tries, connect_tries) attempts
// with exponential backoff capped at 60s, honoring a numeric Retry-After header.
// Non-retryable outcomes throw: http_not_accepted (406), http_auth_failed (401),
// connection_failed (timeout before connect completed), the original error for
// timed_out/connection_failed/lookup_failed, or http_request_failed otherwise.
ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<RESTClient> client,
                                                       std::string verb,
                                                       HTTP::Headers headers,
                                                       RESTUrl* url,
                                                       std::set<unsigned int> successCodes) {
	state UnsentPacketQueue content;
	state int contentLen = url->body.size();

	// Serialize the request body (if any) into the unsent-packet queue.
	if (url->body.size() > 0) {
		PacketWriter pw(content.getWriteBuffer(url->body.size()), nullptr, Unversioned());
		pw.serializeBytes(url->body);
	}

	// Lazily create the stats entry for this key.
	// NOTE(review): passes url->service twice; the connection-pool key below is
	// built from (url->host, url->service) — confirm the first argument here
	// shouldn't be url->host.
	std::string statsKey = RESTClient::getStatsKey(url->service, url->service);
	auto sItr = client->statsMap.find(statsKey);
	if (sItr == client->statsMap.end()) {
		client->statsMap.emplace(statsKey, std::make_unique<RESTClient::Stats>(statsKey));
	}

	headers["Content-Length"] = format("%d", contentLen);
	headers["Host"] = url->host;

	state int maxTries = std::min(client->knobs.request_tries, client->knobs.connect_tries);
	state int thisTry = 1;
	state double nextRetryDelay = 2.0;
	state Reference<IRateControl> sendReceiveRate = makeReference<Unlimited>();
	// NOTE(review): divides the timeout knob by 60 but the value is passed
	// directly to timeoutError() below — confirm the intended units.
	state double reqTimeout = (client->knobs.request_timeout_secs * 1.0) / 60;
	state RESTConnectionPoolKey connectPoolKey = RESTConnectionPool::getConnectionPoolKey(url->host, url->service);
	state RESTClient::Stats* statsPtr = client->statsMap[statsKey].get();

	loop {
		state Optional<Error> err;
		state Optional<NetworkAddress> remoteAddress;
		state bool connectionEstablished = false;
		state Reference<HTTP::Response> r;

		try {
			// Start connecting (member name 'conectionPool' [sic] comes from the header).
			Future<RESTConnectionPool::ReusableConnection> frconn = client->conectionPool->connect(
			    connectPoolKey, client->knobs.secure_connection, client->knobs.max_connection_life);

			// Finish connecting, do request
			state RESTConnectionPool::ReusableConnection rconn =
			    wait(timeoutError(frconn, client->knobs.connect_timeout));
			connectionEstablished = true;

			remoteAddress = rconn.conn->getPeerAddress();
			Reference<HTTP::Response> _r = wait(timeoutError(HTTP::doRequest(rconn.conn,
			                                                                 verb,
			                                                                 url->resource,
			                                                                 headers,
			                                                                 contentLen > 0 ? &content : nullptr,
			                                                                 contentLen,
			                                                                 sendReceiveRate,
			                                                                 &statsPtr->bytes_sent,
			                                                                 sendReceiveRate),
			                                                 reqTimeout));
			r = _r;

			// Since the response was parsed successfully (which is why we are here) reuse the connection unless we
			// received the "Connection: close" header.
			if (r->headers["Connection"] != "close") {
				client->conectionPool->returnConnection(connectPoolKey, rconn, client->knobs.connection_pool_size);
			}
			rconn.conn.clear();
		} catch (Error& e) {
			// Cancellation must propagate; everything else is recorded for the retry logic.
			if (e.code() == error_code_actor_cancelled) {
				throw;
			}
			err = e;
		}

		// If err is not present then r is valid.
		// If r->code is in successCodes then record the successful request and return r.
		if (!err.present() && successCodes.count(r->code) != 0) {
			statsPtr->requests_successful++;
			return r;
		}

		// Otherwise, this request is considered failed. Update failure count.
		statsPtr->requests_failed++;

		// All errors in err are potentially retryable as well as certain HTTP response codes...
		bool retryable = err.present() || r->code == HTTP::HTTP_STATUS_CODE_INTERNAL_SERVER_ERROR ||
		                 r->code == HTTP::HTTP_STATUS_CODE_BAD_GATEWAY ||
		                 r->code == HTTP::HTTP_STATUS_CODE_SERVICE_UNAVAILABLE ||
		                 r->code == HTTP::HTTP_STATUS_CODE_TOO_MANY_REQUESTS;

		// But only if our previous attempt was not the last allowable try.
		retryable = retryable && (thisTry < maxTries);

		TraceEvent event(SevWarn, retryable ? "RESTClient_FailedRetryable" : "RESTClient_RequestFailed");

		// Attach err to trace event if present, otherwise extract some stuff from the response
		if (err.present()) {
			event.errorUnsuppressed(err.get());
		}
		event.suppressFor(60);
		if (!err.present()) {
			event.detail("ResponseCode", r->code);
		}

		event.detail("ConnectionEstablished", connectionEstablished);

		if (remoteAddress.present())
			event.detail("RemoteEndpoint", remoteAddress.get());
		else
			event.detail("RemoteHost", url->host);

		event.detail("Verb", verb).detail("Resource", url->resource).detail("ThisTry", thisTry);

		// If r is not valid or not code TOO_MANY_REQUESTS then increment the try count.
		// TOO_MANY_REQUEST's will not count against the attempt limit.
		if (!r || r->code != HTTP::HTTP_STATUS_CODE_TOO_MANY_REQUESTS) {
			++thisTry;
		}

		// We will wait delay seconds before the next retry, start with nextRetryDelay.
		double delay = nextRetryDelay;
		// Double but limit the *next* nextRetryDelay.
		nextRetryDelay = std::min(nextRetryDelay * 2, 60.0);

		if (retryable) {
			// If r is valid then obey the Retry-After response header if present.
			if (r) {
				auto iRetryAfter = r->headers.find("Retry-After");
				if (iRetryAfter != r->headers.end()) {
					event.detail("RetryAfterHeader", iRetryAfter->second);
					char* pEnd;
					double retryAfter = strtod(iRetryAfter->second.c_str(), &pEnd);
					if (*pEnd) {
						// If there were other characters then don't trust the parsed value
						retryAfter = HTTP::HTTP_RETRYAFTER_DELAY_SECS;
					}
					// Update delay
					delay = std::max(delay, retryAfter);
				}
			}

			// Log the delay then wait.
			event.detail("RetryDelay", delay);
			wait(::delay(delay));
		} else {
			// We can't retry, so throw something.

			// This error code means the authentication header was not accepted, likely the account or key is wrong.
			if (r && r->code == HTTP::HTTP_STATUS_CODE_NOT_ACCEPTABLE) {
				throw http_not_accepted();
			}

			if (r && r->code == HTTP::HTTP_STATUS_CODE_UNAUTHORIZED) {
				throw http_auth_failed();
			}

			// Recognize and throw specific errors
			if (err.present()) {
				int code = err.get().code();

				// If we get a timed_out error during the connect() phase, we'll call that connection_failed despite
				// the fact that there was technically never a 'connection' to begin with. It differentiates between an
				// active connection timing out vs a connection timing out, though not between an active connection
				// failing vs connection attempt failing.
				// TODO: Add more error types?
				if (code == error_code_timed_out && !connectionEstablished) {
					throw connection_failed();
				}

				if (code == error_code_timed_out || code == error_code_connection_failed ||
				    code == error_code_lookup_failed) {
					throw err.get();
				}
			}

			throw http_request_failed();
		}
	}
}
// Shared helper for verbs that carry a request body (PUT/POST). Normalizes the
// optional caller-supplied headers into a concrete HTTP::Headers value and
// forwards the request to doRequest_impl with the caller's verb and the set of
// HTTP status codes considered successful.
Future<Reference<HTTP::Response>> RESTClient::doPutOrPost(const std::string& verb,
                                                          Optional<HTTP::Headers> optHeaders,
                                                          RESTUrl* url,
                                                          std::set<unsigned int> successCodes) {
	// Start from an empty header set; overlay the caller's headers when provided.
	HTTP::Headers reqHeaders = optHeaders.present() ? optHeaders.get() : HTTP::Headers();
	return doRequest_impl(Reference<RESTClient>::addRef(this), verb, reqHeaders, url, successCodes);
}
// Issues an HTTP POST to 'fullUrl' carrying 'requestBody'.
// Only a 200 (OK) response is treated as success.
Future<Reference<HTTP::Response>> RESTClient::doPost(const std::string& fullUrl,
                                                     const std::string& requestBody,
                                                     Optional<HTTP::Headers> optHeaders) {
	RESTUrl url(fullUrl, requestBody, knobs.secure_connection);
	const std::set<unsigned int> okCodes = { HTTP::HTTP_STATUS_CODE_OK };
	return doPutOrPost(HTTP::HTTP_VERB_POST, optHeaders, &url, okCodes);
}
// Issues an HTTP PUT to 'fullUrl' carrying 'requestBody'.
// Success codes:
//   201 - on successful resource create
//   200 / 204 - if target resource representation was successfully modified
//               with the desired state
Future<Reference<HTTP::Response>> RESTClient::doPut(const std::string& fullUrl,
                                                    const std::string& requestBody,
                                                    Optional<HTTP::Headers> optHeaders) {
	RESTUrl url(fullUrl, requestBody, knobs.secure_connection);
	const std::set<unsigned int> okCodes = { HTTP::HTTP_STATUS_CODE_OK,
		                                     HTTP::HTTP_STATUS_CODE_CREATED,
		                                     HTTP::HTTP_STATUS_CODE_NO_CONTENT };
	return doPutOrPost(HTTP::HTTP_VERB_PUT, optHeaders, &url, okCodes);
}
// Shared helper for body-less verbs (GET/HEAD/DELETE/TRACE). Normalizes the
// optional caller headers and forwards the request to doRequest_impl.
//
// Bug fix: the original implementation ignored 'verb' and always passed
// HTTP::HTTP_VERB_GET to doRequest_impl, so doHead/doDelete/doTrace silently
// performed GET requests. The caller's verb is now forwarded.
Future<Reference<HTTP::Response>> RESTClient::doGetHeadDeleteOrTrace(const std::string& verb,
                                                                     Optional<HTTP::Headers> optHeaders,
                                                                     RESTUrl* url,
                                                                     std::set<unsigned int> successCodes) {
	HTTP::Headers headers;
	if (optHeaders.present()) {
		headers = optHeaders.get();
	}
	// Forward the caller-requested verb (previously hardcoded to HTTP_VERB_GET).
	return doRequest_impl(Reference<RESTClient>::addRef(this), verb, headers, url, successCodes);
}
// Issues an HTTP GET for 'fullUrl'; only a 200 (OK) response counts as success.
Future<Reference<HTTP::Response>> RESTClient::doGet(const std::string& fullUrl, Optional<HTTP::Headers> optHeaders) {
	RESTUrl url(fullUrl, knobs.secure_connection);
	const std::set<unsigned int> okCodes = { HTTP::HTTP_STATUS_CODE_OK };
	return doGetHeadDeleteOrTrace(HTTP::HTTP_VERB_GET, optHeaders, &url, okCodes);
}
// Issues an HTTP HEAD for 'fullUrl'; only a 200 (OK) response counts as success.
Future<Reference<HTTP::Response>> RESTClient::doHead(const std::string& fullUrl, Optional<HTTP::Headers> optHeaders) {
	RESTUrl url(fullUrl, knobs.secure_connection);
	const std::set<unsigned int> okCodes = { HTTP::HTTP_STATUS_CODE_OK };
	return doGetHeadDeleteOrTrace(HTTP::HTTP_VERB_HEAD, optHeaders, &url, okCodes);
}
// Issues an HTTP DELETE for 'fullUrl'.
// Success codes:
//   200 - action has been enacted.
//   202 - action will likely succeed, but has not yet been enacted.
//   204 - action has been enacted, no further information is to be supplied.
Future<Reference<HTTP::Response>> RESTClient::doDelete(const std::string& fullUrl, Optional<HTTP::Headers> optHeaders) {
	RESTUrl url(fullUrl, knobs.secure_connection);
	const std::set<unsigned int> okCodes = { HTTP::HTTP_STATUS_CODE_OK,
		                                     HTTP::HTTP_STATUS_CODE_NO_CONTENT,
		                                     HTTP::HTTP_STATUS_CODE_ACCEPTED };
	return doGetHeadDeleteOrTrace(HTTP::HTTP_VERB_DELETE, optHeaders, &url, okCodes);
}
// Issues an HTTP TRACE for 'fullUrl'; only a 200 (OK) response counts as success.
Future<Reference<HTTP::Response>> RESTClient::doTrace(const std::string& fullUrl, Optional<HTTP::Headers> optHeaders) {
	RESTUrl url(fullUrl, knobs.secure_connection);
	const std::set<unsigned int> okCodes = { HTTP::HTTP_STATUS_CODE_OK };
	return doGetHeadDeleteOrTrace(HTTP::HTTP_VERB_TRACE, optHeaders, &url, okCodes);
}
// No-op referenced from the unit-test driver solely to force the linker to
// retain this translation unit so the TEST_CASEs below get registered.
void forceLinkRESTClientTests() {}
// Validates RESTClient knob handling:
//   1. a default-constructed client reports the flow-knob defaults,
//   2. setKnobs() round-trips updated values through getKnobs(),
//   3. an unrecognized knob name is rejected with rest_invalid_rest_client_knob.
TEST_CASE("fdbrpc/RESTClient") {
	RESTClient r;
	std::unordered_map<std::string, int> knobs = r.getKnobs();
	// Defaults must mirror the corresponding FLOW_KNOBS values.
	ASSERT_EQ(knobs["secure_connection"], RESTClientKnobs::SECURE_CONNECTION);
	ASSERT_EQ(knobs["connection_pool_size"], FLOW_KNOBS->RESTCLIENT_MAX_CONNECTIONPOOL_SIZE);
	ASSERT_EQ(knobs["connect_tries"], FLOW_KNOBS->RESTCLIENT_CONNECT_TRIES);
	ASSERT_EQ(knobs["connect_timeout"], FLOW_KNOBS->RESTCLIENT_CONNECT_TIMEOUT);
	ASSERT_EQ(knobs["max_connection_life"], FLOW_KNOBS->RESTCLIENT_MAX_CONNECTION_LIFE);
	ASSERT_EQ(knobs["request_tries"], FLOW_KNOBS->RESTCLIENT_REQUEST_TRIES);
	ASSERT_EQ(knobs["request_timeout_secs"], FLOW_KNOBS->RESTCLIENT_REQUEST_TIMEOUT_SEC);
	// Bump every knob and verify the updates are observable via getKnobs().
	for (auto& itr : knobs) {
		itr.second++;
	}
	r.setKnobs(knobs);
	std::unordered_map<std::string, int> updated = r.getKnobs();
	for (auto& itr : updated) {
		ASSERT_EQ(knobs[itr.first], itr.second);
	}
	// invalid client knob
	knobs["foo"] = 100;
	try {
		r.setKnobs(knobs);
		ASSERT(false);
	} catch (Error& e) {
		if (e.code() != error_code_rest_invalid_rest_client_knob) {
			throw e;
		}
	}
	return Void();
}

97
fdbrpc/RESTClient.h Normal file
View File

@ -0,0 +1,97 @@
/*
* RESTClient.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBRPC_RESTCLIENT_H
#define FDBRPC_RESTCLIENT_H
#include <memory>
#pragma once
#include "fdbclient/JSONDoc.h"
#include "fdbrpc/HTTP.h"
#include "fdbrpc/RESTUtils.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "flow/Net2Packet.h"
// This interface enables sending REST HTTP requests and receiving REST HTTP responses from a resource identified by a
// URI.
// This interface enables sending REST HTTP requests and receiving REST HTTP
// responses from a resource identified by a URI.
class RESTClient : public ReferenceCounted<RESTClient> {
public:
	// Per-"host:service" request counters; mutated by the request actor and
	// reported via getJSON().
	struct Stats {
		explicit Stats(const std::string& hService)
		  : host_service(hService), requests_successful(0), requests_failed(0), bytes_sent(0) {}
		// Element-wise difference of counters (host_service taken from *this).
		Stats operator-(const Stats& rhs);
		// Resets all counters to zero; host_service is preserved.
		void clear() { requests_failed = requests_successful = bytes_sent = 0; }
		json_spirit::mObject getJSON();

		std::string host_service; // "host:service" identity, see getStatsKey()
		int64_t requests_successful;
		int64_t requests_failed;
		int64_t bytes_sent;
	};

	RESTClientKnobs knobs;
	// NOTE(review): "conectionPool" is misspelled but the name is referenced by
	// the .cpp implementation; renaming would need a coordinated change.
	Reference<RESTConnectionPool> conectionPool;
	// Connection stats maintained per "host:service"
	std::unordered_map<std::string, std::unique_ptr<Stats>> statsMap;

	RESTClient();
	// Construct with knob overrides; throws rest_invalid_rest_client_knob on an
	// unrecognized knob name (see RESTClientKnobs::set).
	explicit RESTClient(std::unordered_map<std::string, int>& params);

	void setKnobs(const std::unordered_map<std::string, int>& knobSettings);
	std::unordered_map<std::string, int> getKnobs() const;

	// Supports common REST APIs.
	// On invocation of below methods, input 'fullUrl' is parsed using RESTUrl interface,
	// RESTConnectionPool is used to leverage cached connection if any for 'host:service' pair. API then leverage
	// HTTP::doRequest to accomplish the specified operation
	Future<Reference<HTTP::Response>> doGet(const std::string& fullUrl,
	                                        Optional<HTTP::Headers> optHeaders = Optional<HTTP::Headers>());
	Future<Reference<HTTP::Response>> doHead(const std::string& fullUrl,
	                                         Optional<HTTP::Headers> optHeaders = Optional<HTTP::Headers>());
	Future<Reference<HTTP::Response>> doDelete(const std::string& fullUrl,
	                                           Optional<HTTP::Headers> optHeaders = Optional<HTTP::Headers>());
	Future<Reference<HTTP::Response>> doTrace(const std::string& fullUrl,
	                                          Optional<HTTP::Headers> optHeaders = Optional<HTTP::Headers>());
	Future<Reference<HTTP::Response>> doPut(const std::string& fullUrl,
	                                        const std::string& requestBody,
	                                        Optional<HTTP::Headers> optHeaders = Optional<HTTP::Headers>());
	Future<Reference<HTTP::Response>> doPost(const std::string& fullUrl,
	                                         const std::string& requestBody,
	                                         Optional<HTTP::Headers> optHeaders = Optional<HTTP::Headers>());

	// Key used for statsMap lookups: "host:service".
	static std::string getStatsKey(const std::string& host, const std::string& service) { return host + ":" + service; }

private:
	// Common implementation for the body-less verbs (GET/HEAD/DELETE/TRACE).
	Future<Reference<HTTP::Response>> doGetHeadDeleteOrTrace(const std::string& verb,
	                                                         Optional<HTTP::Headers> optHeaders,
	                                                         RESTUrl* url,
	                                                         std::set<unsigned int> successCodes);
	// Common implementation for the body-carrying verbs (PUT/POST).
	Future<Reference<HTTP::Response>> doPutOrPost(const std::string& verb,
	                                              Optional<HTTP::Headers> headers,
	                                              RESTUrl* url,
	                                              std::set<unsigned int> successCodes);
};
#endif

276
fdbrpc/RESTUtils.actor.cpp Normal file
View File

@ -0,0 +1,276 @@
/*
* RESTUtils.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbrpc/RESTUtils.h"
#include "flow/flat_buffers.h"
#include "flow/UnitTest.h"
#include <boost/algorithm/string.hpp>
#include "flow/actorcompiler.h" // always the last include
namespace {

// Whitelist of URI schemes understood by RESTClient.
std::unordered_set<std::string> protocols = { "http", "https" };

// True iff 'protocol' is one of the supported schemes.
bool isProtocolSupported(const std::string& protocol) {
	return protocols.count(protocol) > 0;
}

// True iff 'protocol' requires TLS ("https").
// (Name spelling retained as-is; it is referenced later in this file.)
bool isSecurePrototol(const std::string& protocol) {
	return protocol == "https";
}

} // namespace
// Seeds every knob from its flow-knob (or compile-time) default, then registers
// each knob in knobMap under both its long name and its short alias so that
// RESTClientKnobs::set() can update it by either spelling.
RESTClientKnobs::RESTClientKnobs() {
	secure_connection = RESTClientKnobs::SECURE_CONNECTION;
	connection_pool_size = FLOW_KNOBS->RESTCLIENT_MAX_CONNECTIONPOOL_SIZE;
	connect_tries = FLOW_KNOBS->RESTCLIENT_CONNECT_TRIES;
	connect_timeout = FLOW_KNOBS->RESTCLIENT_CONNECT_TIMEOUT;
	max_connection_life = FLOW_KNOBS->RESTCLIENT_MAX_CONNECTION_LIFE;
	request_tries = FLOW_KNOBS->RESTCLIENT_REQUEST_TRIES;
	request_timeout_secs = FLOW_KNOBS->RESTCLIENT_REQUEST_TIMEOUT_SEC;

	// { knob-name-or-alias, pointer-to-member } registration table.
	const std::vector<std::pair<const char*, int*>> registrations = {
		{ "connection_pool_size", &connection_pool_size }, { "pz", &connection_pool_size },
		{ "secure_connection", &secure_connection },       { "sc", &secure_connection },
		{ "connect_tries", &connect_tries },               { "ct", &connect_tries },
		{ "connect_timeout", &connect_timeout },           { "cto", &connect_timeout },
		{ "max_connection_life", &max_connection_life },   { "mcl", &max_connection_life },
		{ "request_tries", &request_tries },               { "rt", &request_tries },
		{ "request_timeout_secs", &request_timeout_secs }, { "rtom", &request_timeout_secs },
	};
	for (const auto& [name, valuePtr] : registrations) {
		knobMap[name] = valuePtr;
	}
}
void RESTClientKnobs::set(const std::unordered_map<std::string, int>& knobSettings) {
TraceEvent trace = TraceEvent("RESTClient_SetKnobs");
for (const auto& itr : knobSettings) {
const auto& kItr = RESTClientKnobs::knobMap.find(itr.first);
if (kItr == RESTClientKnobs::knobMap.end()) {
trace.detail("RESTClient_InvalidKnobName", itr.first);
throw rest_invalid_rest_client_knob();
}
*(kItr->second) = itr.second;
trace.detail(itr.first.c_str(), itr.second);
}
}
// Returns a snapshot of the current knob values keyed by their long names
// (aliases are not included).
std::unordered_map<std::string, int> RESTClientKnobs::get() const {
	return {
		{ "connection_pool_size", connection_pool_size },
		{ "secure_connection", secure_connection },
		{ "connect_tries", connect_tries },
		{ "connect_timeout", connect_timeout },
		{ "max_connection_life", max_connection_life },
		{ "request_tries", request_tries },
		{ "request_timeout_secs", request_timeout_secs },
	};
}
// Produces a usable connection for 'connectKey' ("host","service"): reuses the
// oldest unexpired pooled connection if one exists, otherwise establishes (and
// handshakes) a fresh one with lifetime 'maxConnLife' seconds.
// Throws rest_connectpool_key_not_found if no pool entry exists for the key.
ACTOR Future<RESTConnectionPool::ReusableConnection> connect_impl(Reference<RESTConnectionPool> connectionPool,
                                                                  RESTConnectionPoolKey connectKey,
                                                                  bool isSecure,
                                                                  int maxConnLife) {
	auto poolItr = connectionPool->connectionPoolMap.find(connectKey);
	if (poolItr == connectionPool->connectionPoolMap.end()) {
		throw rest_connectpool_key_not_found();
	}
	// Drain the queue until an unexpired connection is found; expired
	// connections are simply dropped (their references are released).
	while (!poolItr->second.empty()) {
		RESTConnectionPool::ReusableConnection rconn = poolItr->second.front();
		poolItr->second.pop();

		if (rconn.expirationTime > now()) {
			TraceEvent("RESTClient_ReusableConnection")
			    .suppressFor(60)
			    .detail("RemoteEndpoint", rconn.conn->getPeerAddress())
			    .detail("ExpireIn", rconn.expirationTime - now());
			return rconn;
		}
	}

	// No reusable connection: establish a new one and complete the TLS/TCP
	// handshake before handing it out.
	state Reference<IConnection> conn =
	    wait(INetworkConnections::net()->connect(connectKey.first, connectKey.second, isSecure));
	wait(conn->connectHandshake());

	return RESTConnectionPool::ReusableConnection({ conn, now() + maxConnLife });
}
// Public wrapper over connect_impl; see its comment for the reuse semantics.
Future<RESTConnectionPool::ReusableConnection> RESTConnectionPool::connect(RESTConnectionPoolKey connectKey,
                                                                           const bool isSecure,
                                                                           const int maxConnLife) {
	return connect_impl(Reference<RESTConnectionPool>::addRef(this), connectKey, isSecure, maxConnLife);
}
// Returns 'rconn' to the pool for 'connectKey'. The caller's reference is
// always released; ownership either moves into the pool or the connection is
// dropped. Throws rest_connectpool_key_not_found if no pool entry exists.
void RESTConnectionPool::returnConnection(RESTConnectionPoolKey connectKey,
                                          ReusableConnection& rconn,
                                          const int maxConnections) {
	auto poolItr = connectionPoolMap.find(connectKey);
	if (poolItr == connectionPoolMap.end()) {
		throw rest_connectpool_key_not_found();
	}
	// Re-cache only if the connection has not expired and the per-key pool is
	// below its cap; push() appends to the back of the queue (connect_impl
	// consumes from the front, oldest first).
	if (rconn.expirationTime > now() && poolItr->second.size() < maxConnections) {
		poolItr->second.push(rconn);
	}
	// Release the caller's reference regardless of whether it was re-pooled.
	rconn.conn = Reference<IConnection>();
}
// Parses 'fUrl' into host/service/resource/parameter components; the request
// body is left empty. Throws rest_invalid_uri on a malformed URI.
RESTUrl::RESTUrl(const std::string& fUrl, const bool isSecure) {
	parseUrl(fUrl, isSecure);
}

// As above, additionally carrying 'b' as the request body payload.
RESTUrl::RESTUrl(const std::string& fullUrl, const std::string& b, const bool isSecure) : body(b) {
	parseUrl(fullUrl, isSecure);
}
void RESTUrl::parseUrl(const std::string& fullUrl, const bool isSecure) {
// Sample valid URIs
// 1. With 'host' & 'resource' := '<protocol>://<host>/<resource>'
// 2. With 'host', 'service' & 'resource' := '<protocol>://<host>:port/<resource>'
// 3. With 'host', 'service', 'resource' & 'reqParameters' := '<protocol>://<host>:port/<resource>?<parameter-list>'
try {
StringRef t(fullUrl);
StringRef p = t.eat("://");
std::string protocol = p.toString();
boost::algorithm::to_lower(protocol);
if (!isProtocolSupported(protocol)) {
throw format("Invalid REST URI protocol '%s'", protocol.c_str());
}
// Ensure connection secure knob setting matches with the input URI
if ((isSecurePrototol(protocol) && !isSecure) || (!isSecurePrototol(protocol) && isSecure)) {
throw format("Invalid REST URI protocol secure knob '%s'", fullUrl.c_str());
}
// extract 'resource' and optional 'parameter list' if supplied in the URL
uint8_t foundSeparator = 0;
StringRef hostPort = t.eatAny("/?", &foundSeparator);
if (foundSeparator == '/') {
resource = t.eat("?").toString();
reqParameters = t.eat().toString();
}
// hostPort is at least a host or IP address, optionally followed by :portNumber or :serviceName
StringRef hRef(hostPort);
StringRef h = hRef.eat(":");
if (h.size() == 0) {
throw std::string("host cannot be empty");
}
host = h.toString();
service = hRef.eat().toString();
TraceEvent("RESTClient_ParseURI")
.detail("URI", fullUrl)
.detail("Host", host)
.detail("Service", service)
.detail("Resource", resource)
.detail("ReqParameters", reqParameters);
} catch (std::string& err) {
TraceEvent("RESTClient_ParseError").detail("URI", fullUrl).detail("Error", err);
throw rest_invalid_uri();
}
}
// No-op referenced from the unit-test driver solely to force the linker to
// retain this translation unit so the TEST_CASE below gets registered.
void forceLinkRESTUtilsTests() {}
// Exercises RESTUrl parsing: rejection of unsupported protocols, of a
// protocol/secure-knob mismatch, and of an empty host; acceptance of URIs
// with and without a service (port) and with request parameters.
TEST_CASE("fdbrpc/RESTUtils") {
	// invalid protocol
	try {
		std::string uri("httpx://foo/bar");
		RESTUrl r(uri, false);
		ASSERT(false);
	} catch (Error& e) {
		if (e.code() != error_code_rest_invalid_uri) {
			throw e;
		}
	}
	// mismatch protocol and knob values
	try {
		std::string uri("http://foo/bar");
		RESTUrl r(uri, true);
		ASSERT(false);
	} catch (Error& e) {
		if (e.code() != error_code_rest_invalid_uri) {
			throw e;
		}
	}
	// missing host
	try {
		std::string uri("https://:/bar");
		RESTUrl r(uri, true);
		ASSERT(false);
	} catch (Error& e) {
		if (e.code() != error_code_rest_invalid_uri) {
			throw e;
		}
	}
	// valid URI with service
	try {
		std::string uri("https://host:80/foo/bar");
		RESTUrl r(uri, true);
		ASSERT_EQ(r.host.compare("host"), 0);
		ASSERT_EQ(r.service.compare("80"), 0);
		ASSERT_EQ(r.resource.compare("foo/bar"), 0);
	} catch (Error& e) {
		throw e;
	}
	// valid URI with-out service
	try {
		std::string uri("https://host/foo/bar");
		RESTUrl r(uri, true);
		ASSERT_EQ(r.host.compare("host"), 0);
		ASSERT(r.service.empty());
		ASSERT_EQ(r.resource.compare("foo/bar"), 0);
	} catch (Error& e) {
		throw e;
	}
	// valid URI with parameters
	try {
		std::string uri("https://host/foo/bar?param1,param2");
		RESTUrl r(uri, true);
		ASSERT_EQ(r.host.compare("host"), 0);
		ASSERT(r.service.empty());
		ASSERT_EQ(r.resource.compare("foo/bar"), 0);
		ASSERT_EQ(r.reqParameters.compare("param1,param2"), 0);
	} catch (Error& e) {
		throw e;
	}
	// ensure RESTClient::Knob default values and updates
	return Void();
}

113
fdbrpc/RESTUtils.h Normal file
View File

@ -0,0 +1,113 @@
/*
* RESTUtils.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDRPC_REST_UTILS_H
#define FDRPC_REST_UTILS_H
#pragma once
#include "flow/flow.h"
#include "flow/FastRef.h"
#include "flow/Net2Packet.h"
#include <unordered_map>
#include <utility>
// Util interface managing REST active connection pool.
// The interface internally constructs and maintains map {"host:service" -> activeConnection}; any new connection
// request would first access cached connection if possible (not expired), if none exists, it would establish a new
// connection and return to the caller. Caller on accomplishing the task at-hand, should return the connection back to
// the pool.
using RESTConnectionPoolKey = std::pair<std::string, std::string>;
class RESTConnectionPool : public ReferenceCounted<RESTConnectionPool> {
public:
	// A pooled TCP connection plus the absolute time after which it may no
	// longer be reused.
	struct ReusableConnection {
		Reference<IConnection> conn;
		double expirationTime;
	};

	// Maximum number of connections cached in the connection-pool.
	int maxConnPerConnectKey;
	// Per-"host:service" FIFO of cached connections.
	std::map<RESTConnectionPoolKey, std::queue<ReusableConnection>> connectionPoolMap;

	RESTConnectionPool(const int maxConnsPerKey) : maxConnPerConnectKey(maxConnsPerKey) {}

	// Routine is responsible to provide an usable TCP connection object; it reuses an active connection from
	// connection-pool if available, otherwise, establish a new TCP connection
	Future<ReusableConnection> connect(RESTConnectionPoolKey connectKey, const bool isSecure, const int maxConnLife);
	// Returns a connection to the pool (if unexpired and below 'maxConnections')
	// and releases the caller's reference.
	void returnConnection(RESTConnectionPoolKey connectKey, ReusableConnection& conn, const int maxConnections);

	static RESTConnectionPoolKey getConnectionPoolKey(const std::string& host, const std::string& service) {
		return std::make_pair(host, service);
	}
};
// Util interface facilitating management and update for RESTClient knob parameters
// Util interface facilitating management and update for RESTClient knob parameters
struct RESTClientKnobs {
	int connection_pool_size, secure_connection, connect_timeout, connect_tries, max_connection_life, request_tries,
	    request_timeout_secs;

	// Values for the 'secure_connection' knob.
	constexpr static int SECURE_CONNECTION = 1;
	constexpr static int NOT_SECURE_CONNECTION = 0;

	RESTClientKnobs();

	// Applies overrides by long name or alias; throws
	// rest_invalid_rest_client_knob on an unrecognized name.
	void set(const std::unordered_map<std::string, int>& knobSettings);
	// Snapshot of current values keyed by long names.
	std::unordered_map<std::string, int> get() const;

	// Maps each knob name (and its short alias) to the member it updates;
	// populated by the constructor.
	std::unordered_map<std::string, int*> knobMap;

	// Human-readable knob help strings (long name, alias, meaning).
	static std::vector<std::string> getKnobDescriptions() {
		return {
			"connection_pool_size (pz)           Maximum numbers of active connections in the connection-pool",
			"secure_connection (or sc)           Set 1 for secure connection and 0 for insecure connection.",
			"connect_tries (or ct)               Number of times to try to connect for each request.",
			"connect_timeout (or cto)            Number of seconds to wait for a connect request to succeed.",
			"max_connection_life (or mcl)        Maximum number of seconds to use a single TCP connection.",
			"request_tries (or rt)               Number of times to try each request until a parsable HTTP "
			"response other than 429 is received.",
			"request_timeout_secs (or rtom)      Number of seconds to wait for a request to succeed after a "
			"connection is established.",
		};
	}
};
// Util interface facilitating parsing of an input REST 'full_url'
// Util interface facilitating parsing of an input REST 'full_url'
struct RESTUrl {
public:
	// Connection resources - host and port details
	std::string host;
	std::string service; // port number or service name; may be empty
	// resource identified by URI
	std::string resource;
	// optional REST request parameters (text after '?'); may be empty
	std::string reqParameters;
	// Request 'body' payload
	std::string body;

	// Both constructors parse 'fullUrl' immediately and throw rest_invalid_uri
	// on a malformed URI or a protocol/'isSecure' mismatch.
	explicit RESTUrl(const std::string& fullUrl, const bool isSecure);
	explicit RESTUrl(const std::string& fullUrl, const std::string& body, const bool isSecure);

private:
	void parseUrl(const std::string& fullUrl, bool isSecure);
};
#endif

View File

@ -72,6 +72,20 @@ Future<REPLY_TYPE(Req)> retryBrokenPromise(RequestStream<Req, P> to, Req request
}
}
// Best-effort (re)initialization of '*stream' to a well-known endpoint at the
// address resolved from 'hostname'. Returns Void without modifying anything
// when resolution fails.
ACTOR template <class Req>
Future<Void> tryInitializeRequestStream(RequestStream<Req>* stream, Hostname hostname, WellKnownEndpoints token) {
	Optional<NetworkAddress> address = wait(hostname.resolve());
	if (!address.present()) {
		return Void();
	}
	if (stream == nullptr) {
		// NOTE(review): this assigns the allocation to the by-value pointer
		// parameter only — the caller never observes the new stream and the
		// allocation is leaked. Confirm whether any caller passes nullptr here.
		stream = new RequestStream<Req>(Endpoint::wellKnown({ address.get() }, token));
	} else {
		*stream = RequestStream<Req>(Endpoint::wellKnown({ address.get() }, token));
	}
	return Void();
}
ACTOR template <class Req>
Future<ErrorOr<REPLY_TYPE(Req)>> tryGetReplyFromHostname(Req request, Hostname hostname, WellKnownEndpoints token) {
// A wrapper of tryGetReply(request), except that the request is sent to an address resolved from a hostname.

View File

@ -1110,10 +1110,10 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
newPriorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
bool isCoordinator =
(std::find(coordinatorAddresses.begin(), coordinatorAddresses.end(), req.wi.address()) !=
(std::find(coordinatorAddresses.begin(), coordinatorAddresses.end(), w.address()) !=
coordinatorAddresses.end()) ||
(req.wi.secondaryAddress().present() &&
std::find(coordinatorAddresses.begin(), coordinatorAddresses.end(), req.wi.secondaryAddress().get()) !=
(w.secondaryAddress().present() &&
std::find(coordinatorAddresses.begin(), coordinatorAddresses.end(), w.secondaryAddress().get()) !=
coordinatorAddresses.end());
for (auto it : req.incompatiblePeers) {
@ -1933,8 +1933,24 @@ ACTOR Future<Void> handleForcedRecoveries(ClusterControllerData* self, ClusterCo
}
}
ACTOR Future<Void> startDataDistributor(ClusterControllerData* self) {
wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID.
// Spaces out consecutive recruitments of a singleton role: each call to
// newRecruitment() records a recruitment start and returns the delay (>= 0)
// the caller should wait so starts are at least
// CC_THROTTLE_SINGLETON_RERECRUIT_INTERVAL seconds apart.
struct SingletonRecruitThrottler {
	double lastRecruitStart; // time of the most recent recruitment start; -1 = never

	SingletonRecruitThrottler() : lastRecruitStart(-1) {}

	// Records a new recruitment and returns how long to wait before starting it.
	double newRecruitment() {
		double n = now();
		double waitTime =
		    std::max(0.0, (lastRecruitStart + SERVER_KNOBS->CC_THROTTLE_SINGLETON_RERECRUIT_INTERVAL - n));
		lastRecruitStart = n;
		return waitTime;
	}
};
ACTOR Future<Void> startDataDistributor(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
wait(delay(waitTime));
TraceEvent("CCStartDataDistributor", self->id).log();
loop {
@ -2003,6 +2019,7 @@ ACTOR Future<Void> startDataDistributor(ClusterControllerData* self) {
}
ACTOR Future<Void> monitorDataDistributor(ClusterControllerData* self) {
state SingletonRecruitThrottler recruitThrottler;
while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
@ -2019,13 +2036,15 @@ ACTOR Future<Void> monitorDataDistributor(ClusterControllerData* self) {
when(wait(self->recruitDistributor.onChange())) {}
}
} else {
wait(startDataDistributor(self));
wait(startDataDistributor(self, recruitThrottler.newRecruitment()));
}
}
}
ACTOR Future<Void> startRatekeeper(ClusterControllerData* self) {
wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID.
ACTOR Future<Void> startRatekeeper(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
wait(delay(waitTime));
TraceEvent("CCStartRatekeeper", self->id).log();
loop {
@ -2091,6 +2110,7 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData* self) {
}
ACTOR Future<Void> monitorRatekeeper(ClusterControllerData* self) {
state SingletonRecruitThrottler recruitThrottler;
while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
@ -2107,34 +2127,15 @@ ACTOR Future<Void> monitorRatekeeper(ClusterControllerData* self) {
when(wait(self->recruitRatekeeper.onChange())) {}
}
} else {
wait(startRatekeeper(self));
wait(startRatekeeper(self, recruitThrottler.newRecruitment()));
}
}
}
// Acquires the BM lock by getting the next epoch no.
ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
try {
Optional<Value> oldEpoch = wait(tr->get(blobManagerEpochKey));
state int64_t newEpoch = oldEpoch.present() ? decodeBlobManagerEpochValue(oldEpoch.get()) + 1 : 1;
tr->set(blobManagerEpochKey, blobManagerEpochValueFor(newEpoch));
wait(tr->commit());
TraceEvent(SevDebug, "CCNextBlobManagerEpoch", self->id).detail("Epoch", newEpoch);
return newEpoch;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Void> startEncryptKeyProxy(ClusterControllerData* self) {
wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID.
ACTOR Future<Void> startEncryptKeyProxy(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
wait(delay(waitTime));
TraceEvent("CCEKP_Start", self->id).log();
loop {
@ -2208,6 +2209,7 @@ ACTOR Future<Void> startEncryptKeyProxy(ClusterControllerData* self) {
}
ACTOR Future<Void> monitorEncryptKeyProxy(ClusterControllerData* self) {
state SingletonRecruitThrottler recruitThrottler;
loop {
if (self->db.serverInfo->get().encryptKeyProxy.present() && !self->recruitEncryptKeyProxy.get()) {
choose {
@ -2219,13 +2221,36 @@ ACTOR Future<Void> monitorEncryptKeyProxy(ClusterControllerData* self) {
when(wait(self->recruitEncryptKeyProxy.onChange())) {}
}
} else {
wait(startEncryptKeyProxy(self));
wait(startEncryptKeyProxy(self, recruitThrottler.newRecruitment()));
}
}
}
ACTOR Future<Void> startBlobManager(ClusterControllerData* self) {
wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID.
// Acquires the BM lock by getting the next epoch no.
ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
try {
Optional<Value> oldEpoch = wait(tr->get(blobManagerEpochKey));
state int64_t newEpoch = oldEpoch.present() ? decodeBlobManagerEpochValue(oldEpoch.get()) + 1 : 1;
tr->set(blobManagerEpochKey, blobManagerEpochValueFor(newEpoch));
wait(tr->commit());
TraceEvent(SevDebug, "CCNextBlobManagerEpoch", self->id).detail("Epoch", newEpoch);
return newEpoch;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Void> startBlobManager(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
wait(delay(waitTime));
TraceEvent("CCStartBlobManager", self->id).log();
loop {
@ -2322,6 +2347,7 @@ ACTOR Future<Void> watchBlobGranulesConfigKey(ClusterControllerData* self) {
}
ACTOR Future<Void> monitorBlobManager(ClusterControllerData* self) {
state SingletonRecruitThrottler recruitThrottler;
while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
@ -2352,7 +2378,7 @@ ACTOR Future<Void> monitorBlobManager(ClusterControllerData* self) {
}
} else if (self->db.blobGranulesEnabled.get()) {
// if there is no blob manager present but blob granules are now enabled, recruit a BM
wait(startBlobManager(self));
wait(startBlobManager(self, recruitThrottler.newRecruitment()));
} else {
// if there is no blob manager present and blob granules are disabled, wait for a config change
wait(self->db.blobGranulesEnabled.onChange());
@ -2481,12 +2507,11 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
}
}
ACTOR Future<Void> clusterControllerCore(Reference<IClusterConnectionRecord> connRecord,
ClusterControllerFullInterface interf,
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
ConfigDBType configDBType) {
state ServerCoordinators coordinators(connRecord);
state ClusterControllerData self(interf, locality, coordinators);
state ConfigBroadcaster configBroadcaster(coordinators, configDBType);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
@ -2621,7 +2646,7 @@ ACTOR Future<Void> replaceInterface(ClusterControllerFullInterface interf) {
}
}
ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRecord,
ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
@ -2632,10 +2657,9 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
state bool inRole = false;
cci.initEndpoints();
try {
wait(connRecord->resolveHostnames());
// Register as a possible leader; wait to be elected
state Future<Void> leaderFail =
tryBecomeLeader(connRecord, cci, currentCC, hasConnected, asyncPriorityInfo);
tryBecomeLeader(coordinators, cci, currentCC, hasConnected, asyncPriorityInfo);
state Future<Void> shouldReplace = replaceInterface(cci);
while (!currentCC->get().present() || currentCC->get().get() != cci) {
@ -2654,7 +2678,7 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(connRecord, cci, leaderFail, locality, configDBType));
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType));
}
} catch (Error& e) {
if (inRole)
@ -2683,7 +2707,8 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
state bool hasConnected = false;
loop {
try {
wait(clusterController(connRecord, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
ServerCoordinators coordinators(connRecord);
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
hasConnected = true;
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)

View File

@ -537,8 +537,7 @@ ACTOR Future<Void> changeCoordinators(Reference<ClusterRecoveryData> self) {
}
try {
state ClusterConnectionString conn(changeCoordinatorsRequest.newConnectionString.toString());
wait(conn.resolveHostnames());
ClusterConnectionString conn(changeCoordinatorsRequest.newConnectionString.toString());
wait(self->cstate.move(conn));
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)

View File

@ -236,6 +236,105 @@ struct ResolutionRequestBuilder {
}
};
ErrorOr<Optional<TenantMapEntry>> getTenantEntry(ProxyCommitData* commitData,
Optional<TenantNameRef> tenant,
Optional<int64_t> tenantId,
bool logOnFailure) {
if (tenant.present()) {
auto itr = commitData->tenantMap.find(tenant.get());
if (itr == commitData->tenantMap.end()) {
if (logOnFailure) {
TraceEvent(SevWarn, "CommitProxyUnknownTenant", commitData->dbgid).detail("Tenant", tenant.get());
}
return unknown_tenant();
} else if (tenantId.present() && tenantId.get() != itr->second.id) {
if (logOnFailure) {
TraceEvent(SevWarn, "CommitProxyTenantIdMismatch", commitData->dbgid)
.detail("Tenant", tenant.get())
.detail("TenantId", tenantId)
.detail("ExistingId", itr->second.id);
}
return unknown_tenant();
}
return ErrorOr<Optional<TenantMapEntry>>(Optional<TenantMapEntry>(itr->second));
}
return Optional<TenantMapEntry>();
}
bool verifyTenantPrefix(ProxyCommitData* const commitData, const CommitTransactionRequest& req) {
ErrorOr<Optional<TenantMapEntry>> tenantEntry =
getTenantEntry(commitData, req.tenantInfo.name.castTo<TenantNameRef>(), req.tenantInfo.tenantId, true);
if (tenantEntry.isError()) {
return true;
}
if (tenantEntry.get().present()) {
Key tenantPrefix = tenantEntry.get().get().prefix;
for (auto& m : req.transaction.mutations) {
if (m.param1 != metadataVersionKey) {
if (!m.param1.startsWith(tenantPrefix)) {
TraceEvent(SevWarnAlways, "TenantPrefixMismatch")
.suppressFor(60)
.detail("Prefix", tenantPrefix.toHexString())
.detail("Key", m.param1.toHexString());
return false;
}
if (m.type == MutationRef::ClearRange && !m.param2.startsWith(tenantPrefix)) {
TraceEvent(SevWarnAlways, "TenantClearRangePrefixMismatch")
.suppressFor(60)
.detail("Prefix", tenantPrefix.toHexString())
.detail("Key", m.param2.toHexString());
return false;
} else if (m.type == MutationRef::SetVersionstampedKey) {
ASSERT(m.param1.size() >= 4);
uint8_t* key = const_cast<uint8_t*>(m.param1.begin());
int* offset = reinterpret_cast<int*>(&key[m.param1.size() - 4]);
if (*offset < tenantPrefix.size()) {
TraceEvent(SevWarnAlways, "TenantVersionstampInvalidOffset")
.suppressFor(60)
.detail("Prefix", tenantPrefix.toHexString())
.detail("Key", m.param1.toHexString())
.detail("Offset", *offset);
return false;
}
}
}
}
for (auto& rc : req.transaction.read_conflict_ranges) {
if (rc.begin != metadataVersionKey &&
(!rc.begin.startsWith(tenantPrefix) || !rc.end.startsWith(tenantPrefix))) {
TraceEvent(SevWarnAlways, "TenantReadConflictPrefixMismatch")
.suppressFor(60)
.detail("Prefix", tenantPrefix.toHexString())
.detail("BeginKey", rc.begin.toHexString())
.detail("EndKey", rc.end.toHexString());
return false;
}
}
for (auto& wc : req.transaction.write_conflict_ranges) {
if (wc.begin != metadataVersionKey &&
(!wc.begin.startsWith(tenantPrefix) || !wc.end.startsWith(tenantPrefix))) {
TraceEvent(SevWarnAlways, "TenantWriteConflictPrefixMismatch")
.suppressFor(60)
.detail("Prefix", tenantPrefix.toHexString())
.detail("BeginKey", wc.begin.toHexString())
.detail("EndKey", wc.end.toHexString());
return false;
}
}
}
return true;
}
ACTOR Future<Void> commitBatcher(ProxyCommitData* commitData,
PromiseStream<std::pair<std::vector<CommitTransactionRequest>, int>> out,
FutureStream<CommitTransactionRequest> in,
@ -282,6 +381,13 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData* commitData,
.detail("Size", bytes)
.detail("Client", req.reply.getEndpoint().getPrimaryAddress());
}
if (!verifyTenantPrefix(commitData, req)) {
++commitData->stats.txnCommitErrors;
req.reply.sendError(illegal_tenant_access());
continue;
}
++commitData->stats.txnCommitIn;
if (req.debugID.present()) {
@ -450,35 +556,6 @@ ACTOR static Future<ResolveTransactionBatchReply> trackResolutionMetrics(Referen
return reply;
}
ErrorOr<Optional<TenantMapEntry>> getTenantEntry(ProxyCommitData* commitData,
Optional<TenantNameRef> tenant,
Optional<int64_t> tenantId,
bool logOnFailure) {
if (tenant.present()) {
auto itr = commitData->tenantMap.find(tenant.get());
if (itr == commitData->tenantMap.end()) {
if (logOnFailure) {
TraceEvent(SevWarn, "CommitProxyUnknownTenant", commitData->dbgid).detail("Tenant", tenant.get());
}
return unknown_tenant();
} else if (tenantId.present() && tenantId.get() != itr->second.id) {
if (logOnFailure) {
TraceEvent(SevWarn, "CommitProxyTenantIdMismatch", commitData->dbgid)
.detail("Tenant", tenant.get())
.detail("TenantId", tenantId)
.detail("ExistingId", itr->second.id);
}
return unknown_tenant();
}
return ErrorOr<Optional<TenantMapEntry>>(Optional<TenantMapEntry>(itr->second));
}
return Optional<TenantMapEntry>();
}
namespace CommitBatch {
struct CommitBatchContext {
@ -685,6 +762,11 @@ bool canReject(const std::vector<CommitTransactionRequest>& trs) {
return true;
}
double computeReleaseDelay(CommitBatchContext* self, double latencyBucket) {
return std::min(SERVER_KNOBS->MAX_PROXY_COMPUTE,
self->batchOperations * self->pProxyCommitData->commitComputePerOperation[latencyBucket]);
}
ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
state ProxyCommitData* const pProxyCommitData = self->pProxyCommitData;
@ -708,6 +790,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
// Pre-resolution the commits
TEST(pProxyCommitData->latestLocalCommitBatchResolving.get() < localBatchNumber - 1); // Wait for local batch
wait(pProxyCommitData->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber - 1));
pProxyCommitData->stats.computeLatency.addMeasurement(now() - timeStart);
double queuingDelay = g_network->now() - timeStart;
pProxyCommitData->stats.commitBatchQueuingDist->sampleSeconds(queuingDelay);
if ((queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND ||
@ -736,10 +819,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
return Void();
}
self->releaseDelay =
delay(std::min(SERVER_KNOBS->MAX_PROXY_COMPUTE,
self->batchOperations * pProxyCommitData->commitComputePerOperation[latencyBucket]),
TaskPriority::ProxyMasterVersionReply);
self->releaseDelay = delay(computeReleaseDelay(self, latencyBucket), TaskPriority::ProxyMasterVersionReply);
if (debugID.present()) {
g_traceBatch.addEvent(
@ -1385,8 +1465,10 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
self->computeDuration += g_network->timer() - self->computeStart;
if (self->batchOperations > 0) {
double estimatedDelay = computeReleaseDelay(self, self->latencyBucket);
double computePerOperation =
std::min(SERVER_KNOBS->MAX_COMPUTE_PER_OPERATION, self->computeDuration / self->batchOperations);
if (computePerOperation <= pProxyCommitData->commitComputePerOperation[self->latencyBucket]) {
pProxyCommitData->commitComputePerOperation[self->latencyBucket] = computePerOperation;
} else {
@ -1401,6 +1483,20 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
pProxyCommitData->stats.minComputeNS =
std::min<int64_t>(pProxyCommitData->stats.minComputeNS,
1e9 * pProxyCommitData->commitComputePerOperation[self->latencyBucket]);
if (estimatedDelay >= SERVER_KNOBS->MAX_COMPUTE_DURATION_LOG_CUTOFF ||
self->computeDuration >= SERVER_KNOBS->MAX_COMPUTE_DURATION_LOG_CUTOFF) {
TraceEvent(SevInfo, "LongComputeDuration", pProxyCommitData->dbgid)
.suppressFor(10.0)
.detail("EstimatedComputeDuration", estimatedDelay)
.detail("ComputeDuration", self->computeDuration)
.detail("ComputePerOperation", computePerOperation)
.detail("LatencyBucket", self->latencyBucket)
.detail("UpdatedComputePerOperationEstimate",
pProxyCommitData->commitComputePerOperation[self->latencyBucket])
.detail("BatchBytes", self->batchBytes)
.detail("BatchOperations", self->batchOperations);
}
}
pProxyCommitData->stats.processingMutationDist->sampleSeconds(now() - postResolutionQueuing);

View File

@ -26,21 +26,29 @@
#include "fdbserver/LeaderElection.h"
#include "flow/actorcompiler.h" // has to be last include
ACTOR Future<GenerationRegReadReply> waitAndSendRead(RequestStream<GenerationRegReadRequest> to,
GenerationRegReadRequest req) {
ACTOR Future<GenerationRegReadReply> waitAndSendRead(GenerationRegInterface stateServer, GenerationRegReadRequest req) {
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
state GenerationRegReadReply reply = wait(retryBrokenPromise(to, req));
state GenerationRegReadReply reply;
if (stateServer.hostname.present()) {
wait(store(reply, retryGetReplyFromHostname(req, stateServer.hostname.get(), WLTOKEN_GENERATIONREG_READ)));
} else {
wait(store(reply, retryBrokenPromise(stateServer.read, req)));
}
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
return reply;
}
ACTOR Future<UniqueGeneration> waitAndSendWrite(RequestStream<GenerationRegWriteRequest> to,
GenerationRegWriteRequest req) {
ACTOR Future<UniqueGeneration> waitAndSendWrite(GenerationRegInterface stateServer, GenerationRegWriteRequest req) {
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
state UniqueGeneration reply = wait(retryBrokenPromise(to, req));
state UniqueGeneration reply;
if (stateServer.hostname.present()) {
wait(store(reply, retryGetReplyFromHostname(req, stateServer.hostname.get(), WLTOKEN_GENERATIONREG_WRITE)));
} else {
wait(store(reply, retryBrokenPromise(stateServer.write, req)));
}
if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
return reply;
@ -152,7 +160,7 @@ struct CoordinatedStateImpl {
state std::vector<Future<GenerationRegReadReply>> rep_reply;
for (int i = 0; i < replicas.size(); i++) {
Future<GenerationRegReadReply> reply =
waitAndSendRead(replicas[i].read, GenerationRegReadRequest(req.key, req.gen));
waitAndSendRead(replicas[i], GenerationRegReadRequest(req.key, req.gen));
rep_empty_reply.push_back(nonemptyToNever(reply));
rep_reply.push_back(emptyToNever(reply));
self->ac.add(success(reply));
@ -192,8 +200,7 @@ struct CoordinatedStateImpl {
state std::vector<GenerationRegInterface>& replicas = self->coordinators.stateServers;
state std::vector<Future<UniqueGeneration>> wrep_reply;
for (int i = 0; i < replicas.size(); i++) {
Future<UniqueGeneration> reply =
waitAndSendWrite(replicas[i].write, GenerationRegWriteRequest(req.kv, req.gen));
Future<UniqueGeneration> reply = waitAndSendWrite(replicas[i], GenerationRegWriteRequest(req.kv, req.gen));
wrep_reply.push_back(reply);
self->ac.add(success(reply));
}

View File

@ -98,12 +98,16 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local) : Client
}
ServerCoordinators::ServerCoordinators(Reference<IClusterConnectionRecord> ccr) : ClientCoordinators(ccr) {
ASSERT(ccr->connectionStringStatus() == ClusterConnectionString::RESOLVED);
ClusterConnectionString cs = ccr->getConnectionString();
for (auto s = cs.coordinators().begin(); s != cs.coordinators().end(); ++s) {
leaderElectionServers.emplace_back(*s);
stateServers.emplace_back(*s);
configServers.emplace_back(*s);
for (auto h : cs.hostnames) {
leaderElectionServers.emplace_back(h);
stateServers.emplace_back(h);
configServers.emplace_back(h);
}
for (auto s : cs.coordinators()) {
leaderElectionServers.emplace_back(s);
stateServers.emplace_back(s);
configServers.emplace_back(s);
}
}
@ -208,10 +212,8 @@ ACTOR Future<Void> openDatabase(ClientData* db,
int* clientCount,
Reference<AsyncVar<bool>> hasConnectedClients,
OpenDatabaseCoordRequest req,
Future<Void> checkStuck,
Reference<AsyncVar<Void>> coordinatorsChanged) {
Future<Void> checkStuck) {
state ErrorOr<CachedSerialization<ClientDBInfo>> replyContents;
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> clientInfoOnChange = db->clientInfo->onChange();
++(*clientCount);
@ -233,11 +235,6 @@ ACTOR Future<Void> openDatabase(ClientData* db,
clientInfoOnChange = db->clientInfo->onChange();
replyContents = db->clientInfo->get();
}
when(wait(coordinatorsChangedOnChange)) {
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
replyContents = coordinators_changed();
break;
}
when(wait(delayJittered(SERVER_KNOBS->CLIENT_REGISTER_INTERVAL))) {
if (db->clientInfo->get().read().id.isValid()) {
replyContents = db->clientInfo->get();
@ -268,10 +265,7 @@ ACTOR Future<Void> openDatabase(ClientData* db,
ACTOR Future<Void> remoteMonitorLeader(int* clientCount,
Reference<AsyncVar<bool>> hasConnectedClients,
Reference<AsyncVar<Optional<LeaderInfo>>> currentElectedLeader,
ElectionResultRequest req,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state bool coordinatorsChangeDetected = false;
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
ElectionResultRequest req) {
state Future<Void> currentElectedLeaderOnChange = currentElectedLeader->onChange();
++(*clientCount);
hasConnectedClients->set(true);
@ -281,20 +275,11 @@ ACTOR Future<Void> remoteMonitorLeader(int* clientCount,
when(wait(yieldedFuture(currentElectedLeaderOnChange))) {
currentElectedLeaderOnChange = currentElectedLeader->onChange();
}
when(wait(coordinatorsChangedOnChange)) {
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
coordinatorsChangeDetected = true;
break;
}
when(wait(delayJittered(SERVER_KNOBS->CLIENT_REGISTER_INTERVAL))) { break; }
}
}
if (coordinatorsChangeDetected) {
req.reply.sendError(coordinators_changed());
} else {
req.reply.send(currentElectedLeader->get());
}
if (--(*clientCount) == 0) {
hasConnectedClients->set(false);
@ -325,8 +310,6 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
state Reference<AsyncVar<Optional<LeaderInfo>>> currentElectedLeader =
makeReference<AsyncVar<Optional<LeaderInfo>>>();
state LivenessChecker canConnectToLeader(SERVER_KNOBS->COORDINATOR_LEADER_CONNECTION_TIMEOUT);
state Reference<AsyncVar<Void>> coordinatorsChanged = makeReference<AsyncVar<Void>>();
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> hasConnectedClientsOnChange = hasConnectedClients->onChange();
loop choose {
@ -338,14 +321,10 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
} else {
if (!leaderMon.isValid()) {
leaderMon = monitorLeaderAndGetClientInfo(
req.clusterKey, req.coordinators, &clientData, currentElectedLeader, coordinatorsChanged);
req.clusterKey, req.hostnames, req.coordinators, &clientData, currentElectedLeader);
}
actors.add(openDatabase(&clientData,
&clientCount,
hasConnectedClients,
req,
canConnectToLeader.checkStuck(),
coordinatorsChanged));
actors.add(
openDatabase(&clientData, &clientCount, hasConnectedClients, req, canConnectToLeader.checkStuck()));
}
}
when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) {
@ -355,10 +334,9 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
} else {
if (!leaderMon.isValid()) {
leaderMon = monitorLeaderAndGetClientInfo(
req.key, req.coordinators, &clientData, currentElectedLeader, coordinatorsChanged);
req.key, req.hostnames, req.coordinators, &clientData, currentElectedLeader);
}
actors.add(remoteMonitorLeader(
&clientCount, hasConnectedClients, currentElectedLeader, req, coordinatorsChanged));
actors.add(remoteMonitorLeader(&clientCount, hasConnectedClients, currentElectedLeader, req));
}
}
when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) {
@ -499,10 +477,6 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
}
}
when(wait(actors.getResult())) {}
when(wait(coordinatorsChangedOnChange)) {
leaderMon = Future<Void>();
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
}
}
}

View File

@ -153,17 +153,21 @@ struct CandidacyRequest {
struct ElectionResultRequest {
constexpr static FileIdentifier file_identifier = 11815465;
Key key;
std::vector<Hostname> hostnames;
std::vector<NetworkAddress> coordinators;
UID knownLeader;
ReplyPromise<Optional<LeaderInfo>> reply;
ElectionResultRequest() = default;
ElectionResultRequest(Key key, std::vector<NetworkAddress> coordinators, UID knownLeader)
: key(key), coordinators(std::move(coordinators)), knownLeader(knownLeader) {}
ElectionResultRequest(Key key,
std::vector<Hostname> hostnames,
std::vector<NetworkAddress> coordinators,
UID knownLeader)
: key(key), hostnames(std::move(hostnames)), coordinators(std::move(coordinators)), knownLeader(knownLeader) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, key, coordinators, knownLeader, reply);
serializer(ar, key, hostnames, coordinators, knownLeader, reply);
}
};

View File

@ -726,6 +726,7 @@ public:
// .detail("LastAnyUndesired", lastAnyUndesired)
// .detail("AnyWrongConfiguration", anyWrongConfiguration)
// .detail("LastWrongConfiguration", lastWrongConfiguration)
// .detail("ContainsWigglingServer", anyWigglingServer)
// .detail("Recheck", recheck)
// .detail("BadTeam", badTeam)
// .detail("LastZeroHealthy", lastZeroHealthy)
@ -1103,9 +1104,8 @@ public:
if (worstStatus == DDTeamCollection::Status::WIGGLING && invalidWiggleServer(worstAddr, self, server)) {
TraceEvent(SevInfo, "InvalidWiggleServer", self->distributorId)
.detail("Address", worstAddr.toString())
.detail("ProcessId", server->getLastKnownInterface().locality.processId())
.detail("WigglingId", self->wigglingId.present());
self->excludedServers.set(worstAddr, DDTeamCollection::Status::NONE);
.detail("ServerId", server->getId())
.detail("WigglingId", self->wigglingId.present() ? self->wigglingId.get().toString() : "");
worstStatus = DDTeamCollection::Status::NONE;
}
otherChanges.push_back(self->excludedServers.onChange(worstAddr));
@ -1127,10 +1127,9 @@ public:
if (testStatus == DDTeamCollection::Status::WIGGLING &&
invalidWiggleServer(testAddr, self, server)) {
TraceEvent(SevInfo, "InvalidWiggleServer", self->distributorId)
.detail("Address", testAddr.toString())
.detail("ProcessId", server->getLastKnownInterface().locality.processId())
.detail("ValidWigglingId", self->wigglingId.present());
self->excludedServers.set(testAddr, DDTeamCollection::Status::NONE);
.detail("Address", worstAddr.toString())
.detail("ServerId", server->getId())
.detail("WigglingId", self->wigglingId.present() ? self->wigglingId.get().toString() : "");
testStatus = DDTeamCollection::Status::NONE;
}
@ -2052,7 +2051,7 @@ public:
"PerpetualStorageWigglePause",
self->distributorId)
.detail("Primary", self->primary)
.detail("ProcessId", id)
.detail("ServerId", id)
.detail("BestTeamKeepStuckCount", self->bestTeamKeepStuckCount)
.detail("ExtraHealthyTeamCount", extraTeamCount)
.detail("HealthyTeamCount", self->healthyTeamCount);
@ -2065,7 +2064,7 @@ public:
moveFinishFuture = fv;
TraceEvent("PerpetualStorageWiggleStart", self->distributorId)
.detail("Primary", self->primary)
.detail("ProcessId", id)
.detail("ServerId", id)
.detail("ExtraHealthyTeamCount", extraTeamCount)
.detail("HealthyTeamCount", self->healthyTeamCount);
}
@ -2091,7 +2090,7 @@ public:
self->includeStorageServersForWiggle();
TraceEvent("PerpetualStorageWiggleFinish", self->distributorId)
.detail("Primary", self->primary)
.detail("ProcessId", self->wigglingId.get());
.detail("ServerId", self->wigglingId.get());
wait(self->eraseStorageWiggleMap(&metadataMap, self->wigglingId.get()) &&
self->storageWiggler->finishWiggle());
@ -2112,7 +2111,7 @@ public:
self->includeStorageServersForWiggle();
TraceEvent("PerpetualStorageWiggleExitingPause", self->distributorId)
.detail("Primary", self->primary)
.detail("ProcessId", self->wigglingId.get());
.detail("ServerId", self->wigglingId.get());
self->wigglingId.reset();
}

View File

@ -27,44 +27,29 @@
// Keep trying to become a leader by submitting itself to all coordinators.
// Monitor the health of all coordinators at the same time.
// Note: for coordinators whose NetworkAddress is parsed out of a hostname, a connection failure will cause this actor
// to throw `coordinators_changed()` error
ACTOR Future<Void> submitCandidacy(Key key,
LeaderElectionRegInterface coord,
LeaderInfo myInfo,
UID prevChangeID,
AsyncTrigger* nomineeChange,
Optional<LeaderInfo>* nominee,
Optional<Hostname> hostname = Optional<Hostname>()) {
Optional<LeaderInfo>* nominee) {
loop {
state Optional<LeaderInfo> li;
if (coord.candidacy.getEndpoint().getPrimaryAddress().fromHostname) {
state ErrorOr<Optional<LeaderInfo>> rep = wait(coord.candidacy.tryGetReply(
if (coord.hostname.present()) {
wait(store(
li,
retryGetReplyFromHostname(
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
if (rep.isError()) {
// Connecting to nominee failed, most likely due to connection failed.
TraceEvent("SubmitCandadicyError")
.error(rep.getError())
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.candidacy.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(FLOW_KNOBS->HOSTNAME_RECONNECT_INIT_INTERVAL));
throw coordinators_changed();
coord.hostname.get(),
WLTOKEN_LEADERELECTIONREG_CANDIDACY,
TaskPriority::CoordinationReply)));
} else {
throw rep.getError();
}
} else if (rep.present()) {
li = rep.get();
}
} else {
Optional<LeaderInfo> tmp = wait(retryBrokenPromise(
wait(store(
li,
retryBrokenPromise(
coord.candidacy,
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
li = tmp;
TaskPriority::CoordinationReply)));
}
wait(Future<Void>(Void())); // Make sure we weren't cancelled
@ -104,20 +89,26 @@ Future<Void> buggifyDelayedAsyncVar(Reference<AsyncVar<T>>& var) {
ACTOR Future<Void> changeLeaderCoordinators(ServerCoordinators coordinators, Value forwardingInfo) {
std::vector<Future<Void>> forwardRequests;
forwardRequests.reserve(coordinators.leaderElectionServers.size());
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++)
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) {
if (coordinators.leaderElectionServers[i].hostname.present()) {
forwardRequests.push_back(retryGetReplyFromHostname(ForwardRequest(coordinators.clusterKey, forwardingInfo),
coordinators.leaderElectionServers[i].hostname.get(),
WLTOKEN_LEADERELECTIONREG_FORWARD));
} else {
forwardRequests.push_back(retryBrokenPromise(coordinators.leaderElectionServers[i].forward,
ForwardRequest(coordinators.clusterKey, forwardingInfo)));
}
}
int quorum_size = forwardRequests.size() / 2 + 1;
wait(quorum(forwardRequests, quorum_size));
return Void();
}
ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> connRecord,
ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
Value proposedSerializedInterface,
Reference<AsyncVar<Value>> outSerializedLeader,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo) {
state ServerCoordinators coordinators(connRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state LeaderInfo myInfo;
@ -134,6 +125,8 @@ ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> c
wait(delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
nominees.resize(coordinators.leaderElectionServers.size());
myInfo.serializedInfo = proposedSerializedInterface;
outSerializedLeader->set(Value());
@ -141,9 +134,6 @@ ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> c
(SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) ? buggifyDelayedAsyncVar(outSerializedLeader) : Void();
while (!iAmLeader) {
wait(connRecord->resolveHostnames());
coordinators = ServerCoordinators(connRecord);
nominees.resize(coordinators.leaderElectionServers.size());
state Future<Void> badCandidateTimeout;
myInfo.changeID = deterministicRandom()->randomUniqueID();
@ -153,19 +143,12 @@ ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> c
std::vector<Future<Void>> cand;
cand.reserve(coordinators.leaderElectionServers.size());
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) {
Optional<Hostname> hostname;
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
coordinators.leaderElectionServers[i].candidacy.getEndpoint().getPrimaryAddress());
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
hostname = r->second;
}
cand.push_back(submitCandidacy(coordinators.clusterKey,
coordinators.leaderElectionServers[i],
myInfo,
prevChangeID,
&nomineeChange,
&nominees[i],
hostname));
&nominees[i]));
}
candidacies = waitForAll(cand);
@ -220,7 +203,6 @@ ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> c
} else
badCandidateTimeout = Future<Void>();
try {
choose {
when(wait(nomineeChange.onTrigger())) {}
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
@ -231,14 +213,6 @@ ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> c
when(wait(candidacies)) { ASSERT(false); }
when(wait(asyncPriorityInfo->onChange())) { break; }
}
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
connRecord->getConnectionString().resetToUnresolved();
break;
} else {
throw e;
}
}
}
candidacies.cancel();
@ -258,10 +232,17 @@ ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> c
state std::vector<Future<Void>> true_heartbeats;
state std::vector<Future<Void>> false_heartbeats;
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) {
Future<LeaderHeartbeatReply> hb =
retryBrokenPromise(coordinators.leaderElectionServers[i].leaderHeartbeat,
Future<LeaderHeartbeatReply> hb;
if (coordinators.leaderElectionServers[i].hostname.present()) {
hb = retryGetReplyFromHostname(LeaderHeartbeatRequest(coordinators.clusterKey, myInfo, prevChangeID),
coordinators.leaderElectionServers[i].hostname.get(),
WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT,
TaskPriority::CoordinationReply);
} else {
hb = retryBrokenPromise(coordinators.leaderElectionServers[i].leaderHeartbeat,
LeaderHeartbeatRequest(coordinators.clusterKey, myInfo, prevChangeID),
TaskPriority::CoordinationReply);
}
true_heartbeats.push_back(onEqual(hb, LeaderHeartbeatReply{ true }));
false_heartbeats.push_back(onEqual(hb, LeaderHeartbeatReply{ false }));
}

View File

@ -37,7 +37,7 @@ class ServerCoordinators;
// eventually be set. If the return value is cancelled, the candidacy or leadership of the proposedInterface
// will eventually end.
template <class LeaderInterface>
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
LeaderInterface const& proposedInterface,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
bool hasConnected,
@ -50,20 +50,20 @@ Future<Void> changeLeaderCoordinators(ServerCoordinators const& coordinators, Va
#pragma region Implementation
#endif // __INTEL_COMPILER
Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> const& connRecord,
Future<Void> tryBecomeLeaderInternal(ServerCoordinators const& coordinators,
Value const& proposedSerializedInterface,
Reference<AsyncVar<Value>> const& outSerializedLeader,
bool const& hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo);
template <class LeaderInterface>
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
LeaderInterface const& proposedInterface,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo) {
auto serializedInfo = makeReference<AsyncVar<Value>>();
Future<Void> m = tryBecomeLeaderInternal(connRecord,
Future<Void> m = tryBecomeLeaderInternal(coordinators,
ObjectWriter::toValue(proposedInterface, IncludeVersion()),
serializedInfo,
hasConnected,

View File

@ -99,8 +99,17 @@ class GetCommittedVersionQuorum {
// Now roll node forward to match the largest committed version of
// the replies.
state Reference<ConfigFollowerInfo> quorumCfi(new ConfigFollowerInfo(self->replies[target]));
try {
state std::vector<ConfigFollowerInterface> interfs = self->replies[target];
std::vector<Future<Void>> fs;
for (ConfigFollowerInterface& interf : interfs) {
if (interf.hostname.present()) {
fs.push_back(tryInitializeRequestStream(
&interf.getChanges, interf.hostname.get(), WLTOKEN_CONFIGFOLLOWER_GETCHANGES));
}
}
wait(waitForAll(fs));
state Reference<ConfigFollowerInfo> quorumCfi(new ConfigFollowerInfo(interfs));
state Version lastSeenVersion = std::max(
rollback.present() ? rollback.get() : nodeVersion.lastCommitted, self->largestCompactedResponse);
ConfigFollowerGetChangesReply reply =
@ -108,9 +117,21 @@ class GetCommittedVersionQuorum {
&ConfigFollowerInterface::getChanges,
ConfigFollowerGetChangesRequest{ lastSeenVersion, target }),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
wait(timeoutError(cfi.rollforward.getReply(ConfigFollowerRollforwardRequest{
if (cfi.hostname.present()) {
wait(timeoutError(
retryGetReplyFromHostname(
ConfigFollowerRollforwardRequest{
rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations },
cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_ROLLFORWARD),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
} else {
wait(timeoutError(
cfi.rollforward.getReply(ConfigFollowerRollforwardRequest{
rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
}
} catch (Error& e) {
if (e.code() == error_code_transaction_too_old) {
// Seeing this trace is not necessarily a problem. There
@ -129,9 +150,18 @@ class GetCommittedVersionQuorum {
ACTOR static Future<Void> getCommittedVersionActor(GetCommittedVersionQuorum* self, ConfigFollowerInterface cfi) {
try {
ConfigFollowerGetCommittedVersionReply reply =
wait(timeoutError(cfi.getCommittedVersion.getReply(ConfigFollowerGetCommittedVersionRequest{}),
state ConfigFollowerGetCommittedVersionReply reply;
if (cfi.hostname.present()) {
wait(timeoutError(store(reply,
retryGetReplyFromHostname(ConfigFollowerGetCommittedVersionRequest{},
cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_GETCOMMITTEDVERSION)),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
} else {
wait(timeoutError(
store(reply, cfi.getCommittedVersion.getReply(ConfigFollowerGetCommittedVersionRequest{})),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
}
++self->totalRepliesReceived;
self->largestCompactedResponse = std::max(self->largestCompactedResponse, reply.lastCompacted);
@ -279,7 +309,15 @@ class PaxosConfigConsumerImpl {
std::vector<Future<Void>> compactionRequests;
compactionRequests.reserve(compactionRequests.size());
for (const auto& cfi : self->cfis) {
compactionRequests.push_back(cfi.compact.getReply(ConfigFollowerCompactRequest{ compactionVersion }));
if (cfi.hostname.present()) {
compactionRequests.push_back(
retryGetReplyFromHostname(ConfigFollowerCompactRequest{ compactionVersion },
cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_COMPACT));
} else {
compactionRequests.push_back(
cfi.compact.getReply(ConfigFollowerCompactRequest{ compactionVersion }));
}
}
try {
wait(timeoutError(waitForAll(compactionRequests), 1.0));
@ -294,8 +332,18 @@ class PaxosConfigConsumerImpl {
self->resetCommittedVersionQuorum(); // TODO: This seems to fix a segfault, investigate more
try {
state Version committedVersion = wait(getCommittedVersion(self));
state Reference<ConfigFollowerInfo> configNodes(
new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas()));
state std::vector<ConfigFollowerInterface> readReplicas =
self->getCommittedVersionQuorum.getReadReplicas();
std::vector<Future<Void>> fs;
for (ConfigFollowerInterface& readReplica : readReplicas) {
if (readReplica.hostname.present()) {
fs.push_back(tryInitializeRequestStream(&readReplica.getSnapshotAndChanges,
readReplica.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_GETSNAPSHOTANDCHANGES));
}
}
wait(waitForAll(fs));
state Reference<ConfigFollowerInfo> configNodes(new ConfigFollowerInfo(readReplicas));
ConfigFollowerGetSnapshotAndChangesReply reply =
wait(timeoutError(basicLoadBalance(configNodes,
&ConfigFollowerInterface::getSnapshotAndChanges,
@ -349,8 +397,18 @@ class PaxosConfigConsumerImpl {
// returned would be 1.
if (committedVersion > self->lastSeenVersion) {
ASSERT(self->getCommittedVersionQuorum.getReadReplicas().size() >= self->cfis.size() / 2 + 1);
state Reference<ConfigFollowerInfo> configNodes(
new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas()));
state std::vector<ConfigFollowerInterface> readReplicas =
self->getCommittedVersionQuorum.getReadReplicas();
std::vector<Future<Void>> fs;
for (ConfigFollowerInterface& readReplica : readReplicas) {
if (readReplica.hostname.present()) {
fs.push_back(tryInitializeRequestStream(&readReplica.getChanges,
readReplica.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_GETCHANGES));
}
}
wait(waitForAll(fs));
state Reference<ConfigFollowerInfo> configNodes(new ConfigFollowerInfo(readReplicas));
ConfigFollowerGetChangesReply reply = wait(timeoutError(
basicLoadBalance(configNodes,
&ConfigFollowerInterface::getChanges,

View File

@ -73,6 +73,8 @@ struct ProxyStats {
LatencySample commitBatchingWindowSize;
LatencySample computeLatency;
Future<Void> logger;
int64_t maxComputeNS;
@ -126,6 +128,10 @@ struct ProxyStats {
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
computeLatency("ComputeLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
maxComputeNS(0), minComputeNS(1e12),
commitBatchQueuingDist(Histogram::getHistogram(LiteralStringRef("CommitProxy"),
LiteralStringRef("CommitBatchQueuing"),

View File

@ -161,9 +161,8 @@ ACTOR Future<std::vector<WorkerInterface>> getCoordWorkers(Database cx,
if (!coordinators.present()) {
throw operation_failed();
}
state ClusterConnectionString ccs(coordinators.get().toString());
wait(ccs.resolveHostnames());
std::vector<NetworkAddress> coordinatorsAddr = ccs.coordinators();
ClusterConnectionString ccs(coordinators.get().toString());
std::vector<NetworkAddress> coordinatorsAddr = wait(ccs.tryResolveHostnames());
std::set<NetworkAddress> coordinatorsAddrSet;
for (const auto& addr : coordinatorsAddr) {
TraceEvent(SevDebug, "CoordinatorAddress").detail("Addr", addr);

View File

@ -44,15 +44,29 @@ class SimpleConfigConsumerImpl {
loop {
state Version compactionVersion = self->lastSeenVersion;
wait(delayJittered(self->compactionInterval.get()));
if (self->cfi.hostname.present()) {
wait(retryGetReplyFromHostname(ConfigFollowerCompactRequest{ compactionVersion },
self->cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_COMPACT));
} else {
wait(self->cfi.compact.getReply(ConfigFollowerCompactRequest{ compactionVersion }));
}
++self->compactRequest;
broadcaster->compact(compactionVersion);
}
}
ACTOR static Future<Version> getCommittedVersion(SimpleConfigConsumerImpl* self) {
ConfigFollowerGetCommittedVersionReply committedVersionReply =
wait(self->cfi.getCommittedVersion.getReply(ConfigFollowerGetCommittedVersionRequest{}));
state ConfigFollowerGetCommittedVersionReply committedVersionReply;
if (self->cfi.hostname.present()) {
wait(store(committedVersionReply,
retryGetReplyFromHostname(ConfigFollowerGetCommittedVersionRequest{},
self->cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_GETCOMMITTEDVERSION)));
} else {
wait(store(committedVersionReply,
self->cfi.getCommittedVersion.getReply(ConfigFollowerGetCommittedVersionRequest{})));
}
return committedVersionReply.lastCommitted;
}
@ -63,8 +77,18 @@ class SimpleConfigConsumerImpl {
state Version committedVersion = wait(getCommittedVersion(self));
ASSERT_GE(committedVersion, self->lastSeenVersion);
if (committedVersion > self->lastSeenVersion) {
ConfigFollowerGetChangesReply reply = wait(self->cfi.getChanges.getReply(
ConfigFollowerGetChangesRequest{ self->lastSeenVersion, committedVersion }));
state ConfigFollowerGetChangesReply reply;
if (self->cfi.hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(
ConfigFollowerGetChangesRequest{ self->lastSeenVersion, committedVersion },
self->cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_GETCHANGES)));
} else {
wait(store(reply,
self->cfi.getChanges.getReply(
ConfigFollowerGetChangesRequest{ self->lastSeenVersion, committedVersion })));
}
++self->successfulChangeRequest;
for (const auto& versionedMutation : reply.changes) {
TraceEvent te(SevDebug, "ConsumerFetchedMutation", self->id);
@ -96,8 +120,17 @@ class SimpleConfigConsumerImpl {
ACTOR static Future<Void> getSnapshotAndChanges(SimpleConfigConsumerImpl* self, ConfigBroadcaster* broadcaster) {
state Version committedVersion = wait(getCommittedVersion(self));
ConfigFollowerGetSnapshotAndChangesReply reply = wait(
self->cfi.getSnapshotAndChanges.getReply(ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion }));
state ConfigFollowerGetSnapshotAndChangesReply reply;
if (self->cfi.hostname.present()) {
wait(store(reply,
retryGetReplyFromHostname(ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion },
self->cfi.hostname.get(),
WLTOKEN_CONFIGFOLLOWER_GETSNAPSHOTANDCHANGES)));
} else {
wait(store(reply,
self->cfi.getSnapshotAndChanges.getReply(
ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion })));
}
++self->snapshotRequest;
TraceEvent(SevDebug, "ConfigConsumerGotSnapshotAndChanges", self->id)
.detail("SnapshotVersion", reply.snapshotVersion)

View File

@ -1980,8 +1980,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
TEST(useIPv6); // Use IPv6
TEST(!useIPv6); // Use IPv4
// TODO(renxuan): Use hostname 25% of the time, unless it is disabled
bool useHostname = false; // !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
// Use hostname 25% of the time, unless it is disabled
bool useHostname = !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
TEST(useHostname); // Use hostname
TEST(!useHostname); // Use IP address
NetworkAddressFromHostname fromHostname =

View File

@ -831,7 +831,8 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
}
}
for (auto& coordinator : coordinators.ccr->getConnectionString().coordinators()) {
std::vector<NetworkAddress> addressVec = wait(coordinators.ccr->getConnectionString().tryResolveHostnames());
for (const auto& coordinator : addressVec) {
roles.addCoordinatorRole(coordinator);
}
@ -1689,8 +1690,7 @@ static JsonBuilderObject configurationFetcher(Optional<DatabaseConfiguration> co
}
statusObj["excluded_servers"] = excludedServersArr;
}
std::vector<ClientLeaderRegInterface> coordinatorLeaderServers = coordinators.clientLeaderServers;
int count = coordinatorLeaderServers.size();
int count = coordinators.clientLeaderServers.size();
statusObj["coordinators_count"] = count;
} catch (Error&) {
incomplete_reasons->insert("Could not retrieve all configuration status information.");
@ -2505,7 +2505,8 @@ static JsonBuilderArray tlogFetcher(int* logFaultTolerance,
static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration configuration,
ServerCoordinators coordinators,
std::vector<WorkerDetails>& workers,
const std::vector<NetworkAddress>& coordinatorAddresses,
const std::vector<WorkerDetails>& workers,
int extraTlogEligibleZones,
int minStorageReplicasRemaining,
int oldLogFaultTolerance,
@ -2521,11 +2522,11 @@ static JsonBuilderObject faultToleranceStatusFetcher(DatabaseConfiguration confi
int maxCoordinatorFailures = (coordinators.clientLeaderServers.size() - 1) / 2;
std::map<NetworkAddress, StringRef> workerZones;
for (auto& worker : workers) {
for (const auto& worker : workers) {
workerZones[worker.interf.address()] = worker.interf.locality.zoneId().orDefault(LiteralStringRef(""));
}
std::map<StringRef, int> coordinatorZoneCounts;
for (auto& coordinator : coordinators.ccr->getConnectionString().coordinators()) {
for (const auto& coordinator : coordinatorAddresses) {
auto zone = workerZones[coordinator];
coordinatorZoneCounts[zone] += 1;
}
@ -3061,6 +3062,9 @@ ACTOR Future<StatusReply> clusterGetStatus(
state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));
wait(success(primaryDCFO));
std::vector<NetworkAddress> coordinatorAddresses =
wait(coordinators.ccr->getConnectionString().tryResolveHostnames());
int logFaultTolerance = 100;
if (db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
statusObj["logs"] = tlogFetcher(&logFaultTolerance, db, address_workers);
@ -3070,6 +3074,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["fault_tolerance"] =
faultToleranceStatusFetcher(configuration.get(),
coordinators,
coordinatorAddresses,
workers,
extraTlogEligibleZones,
minStorageReplicasRemaining,

View File

@ -859,9 +859,9 @@ std::pair<NetworkAddressList, NetworkAddressList> buildNetworkAddresses(
NetworkAddressList publicNetworkAddresses;
NetworkAddressList listenNetworkAddresses;
connectionRecord.resolveHostnamesBlocking();
auto& coordinators = connectionRecord.getConnectionString().coordinators();
ASSERT(coordinators.size() > 0);
std::vector<Hostname>& hostnames = connectionRecord.getConnectionString().hostnames;
const std::vector<NetworkAddress>& coords = connectionRecord.getConnectionString().coordinators();
ASSERT(hostnames.size() + coords.size() > 0);
for (int ii = 0; ii < publicAddressStrs.size(); ++ii) {
const std::string& publicAddressStr = publicAddressStrs[ii];
@ -930,13 +930,26 @@ std::pair<NetworkAddressList, NetworkAddressList> buildNetworkAddresses(
listenNetworkAddresses.secondaryAddress = currentListenAddress;
}
bool hasSameCoord = std::all_of(coordinators.begin(), coordinators.end(), [&](const NetworkAddress& address) {
bool matchCoordinatorsTls = std::all_of(coords.begin(), coords.end(), [&](const NetworkAddress& address) {
if (address.ip == currentPublicAddress.ip && address.port == currentPublicAddress.port) {
return address.isTLS() == currentPublicAddress.isTLS();
}
return true;
});
if (!hasSameCoord) {
// If true, further check hostnames.
if (matchCoordinatorsTls) {
matchCoordinatorsTls = std::all_of(hostnames.begin(), hostnames.end(), [&](Hostname& hostname) {
Optional<NetworkAddress> resolvedAddress = hostname.resolveBlocking();
if (resolvedAddress.present()) {
NetworkAddress address = resolvedAddress.get();
if (address.ip == currentPublicAddress.ip && address.port == currentPublicAddress.port) {
return address.isTLS() == currentPublicAddress.isTLS();
}
}
return true;
});
}
if (!matchCoordinatorsTls) {
fprintf(stderr,
"ERROR: TLS state of public address %s does not match in coordinator list.\n",
publicAddressStr.c_str());

View File

@ -3455,7 +3455,8 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
tr.setVersion(version);
// TODO: is DefaultPromiseEndpoint the best priority for this?
tr.trState->taskID = TaskPriority::DefaultPromiseEndpoint;
Future<RangeResult> rangeResultFuture = tr.getRange(prefixRange(prefix), Snapshot::True);
Future<RangeResult> rangeResultFuture =
tr.getRange(prefixRange(prefix), GetRangeLimits::ROW_LIMIT_UNLIMITED, Snapshot::True);
// TODO: async in case it needs to read from other servers.
RangeResult rangeResult = wait(rangeResultFuture);
a->dependsOn(rangeResult.arena());

View File

@ -2977,21 +2977,40 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderWithDelayedCandidacyImplOneGenerati
Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> result,
MonitorLeaderInfo info) {
state ClusterConnectionString ccf = info.intermediateConnRecord->getConnectionString();
state std::vector<NetworkAddress> addrs = ccf.coordinators();
ClusterConnectionString cs = info.intermediateConnRecord->getConnectionString();
state int coordinatorsSize = cs.hostnames.size() + cs.coordinators().size();
state ElectionResultRequest request;
state int index = 0;
state int successIndex = 0;
request.key = ccf.clusterKey();
request.coordinators = ccf.coordinators();
state std::vector<LeaderElectionRegInterface> leaderElectionServers;
deterministicRandom()->randomShuffle(addrs);
leaderElectionServers.reserve(coordinatorsSize);
for (const auto& h : cs.hostnames) {
leaderElectionServers.push_back(LeaderElectionRegInterface(h));
}
for (const auto& c : cs.coordinators()) {
leaderElectionServers.push_back(LeaderElectionRegInterface(c));
}
deterministicRandom()->randomShuffle(leaderElectionServers);
request.key = cs.clusterKey();
request.hostnames = cs.hostnames;
request.coordinators = cs.coordinators();
loop {
LeaderElectionRegInterface interf(addrs[index]);
LeaderElectionRegInterface interf = leaderElectionServers[index];
bool usingHostname = interf.hostname.present();
request.reply = ReplyPromise<Optional<LeaderInfo>>();
ErrorOr<Optional<LeaderInfo>> leader = wait(interf.electionResult.tryGetReply(request));
state ErrorOr<Optional<LeaderInfo>> leader;
if (usingHostname) {
wait(store(
leader,
tryGetReplyFromHostname(request, interf.hostname.get(), WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT)));
} else {
wait(store(leader, interf.electionResult.tryGetReply(request)));
}
if (leader.present()) {
if (leader.get().present()) {
if (leader.get().get().forward) {
@ -3027,14 +3046,9 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderWithDelayedCandidacyImplOneGenerati
}
successIndex = index;
} else {
if (leader.isError() && leader.getError().code() == error_code_coordinators_changed) {
info.intermediateConnRecord->getConnectionString().resetToUnresolved();
throw coordinators_changed();
}
index = (index + 1) % addrs.size();
index = (index + 1) % coordinatorsSize;
if (index == successIndex) {
wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
throw coordinators_changed();
}
}
}
@ -3042,22 +3056,11 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderWithDelayedCandidacyImplOneGenerati
ACTOR Future<Void> monitorLeaderWithDelayedCandidacyImplInternal(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> outSerializedLeaderInfo) {
wait(connRecord->resolveHostnames());
state MonitorLeaderInfo info(connRecord);
loop {
try {
wait(info.intermediateConnRecord->resolveHostnames());
MonitorLeaderInfo _info =
wait(monitorLeaderWithDelayedCandidacyImplOneGeneration(connRecord, outSerializedLeaderInfo, info));
info = _info;
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorLeaderWithDelayedCandidacyCoordinatorsChanged").suppressFor(1.0);
info.intermediateConnRecord->getConnectionString().resetToUnresolved();
} else {
throw e;
}
}
}
}
@ -3191,6 +3194,7 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
actors.push_back(serveProcess());
try {
ServerCoordinators coordinators(connRecord);
if (g_network->isSimulated()) {
whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
}

View File

@ -2096,7 +2096,8 @@ struct ConsistencyCheckWorkload : TestWorkload {
return false;
}
state ClusterConnectionString old(currentKey.get().toString());
ClusterConnectionString old(currentKey.get().toString());
state std::vector<NetworkAddress> oldCoordinators = wait(old.tryResolveHostnames());
std::vector<ProcessData> workers = wait(::getWorkers(&tr));
@ -2106,7 +2107,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
}
std::set<Optional<Standalone<StringRef>>> checkDuplicates;
for (const auto& addr : old.coordinators()) {
for (const auto& addr : oldCoordinators) {
auto findResult = addr_locality.find(addr);
if (findResult != addr_locality.end()) {
if (checkDuplicates.count(findResult->second.zoneId())) {

View File

@ -106,6 +106,7 @@ struct CycleWorkload : TestWorkload {
state Transaction tr(cx);
if (deterministicRandom()->random01() >= self->traceParentProbability) {
state Span span("CycleClient"_loc);
// TraceEvent("CycleTracingTransaction", span.context).log();
TraceEvent("CycleTracingTransaction", span.context).log();
tr.setOption(FDBTransactionOptions::SPAN_PARENT,
BinaryWriter::toValue(span.context, Unversioned()));

View File

@ -132,7 +132,7 @@ struct DataLossRecoveryWorkload : TestWorkload {
} else {
tr.clear(key);
}
wait(timeoutError(tr.commit(), 30.0));
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));

View File

@ -329,9 +329,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
for (int j = i; j < end; j++) {
if (deterministicRandom()->random01() < self->initialKeyDensity) {
Key key = self->getKeyForIndex(tenantNum, j);
if (key.size() <= (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT)) {
if (key.size() <= getMaxWriteKeySize(key, false)) {
Value value = self->getRandomValue();
value = value.substr(
0, std::min<int>(value.size(), CLIENT_KNOBS->VALUE_SIZE_LIMIT));
@ -1091,24 +1089,22 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
pos = littleEndian32(*(int32_t*)&value.end()[-4]);
}
contract = {
std::make_pair(error_code_key_too_large,
ExceptionContract::requiredIf(key.size() > (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT))),
contract = { std::make_pair(error_code_key_too_large,
key.size() > getMaxWriteKeySize(key, true) ? ExceptionContract::Always
: key.size() > getMaxWriteKeySize(key, false) ? ExceptionContract::Possible
: ExceptionContract::Never),
std::make_pair(error_code_value_too_large,
ExceptionContract::requiredIf(value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)),
std::make_pair(
error_code_invalid_mutation_type,
ExceptionContract::requiredIf(!isValidMutationType(op) || !isAtomicOp((MutationRef::Type)op))),
std::make_pair(error_code_invalid_mutation_type,
ExceptionContract::requiredIf(!isValidMutationType(op) ||
!isAtomicOp((MutationRef::Type)op))),
std::make_pair(error_code_key_outside_legal_range,
ExceptionContract::requiredIf((key >= workload->getMaxKey(tr)))),
std::make_pair(
error_code_client_invalid_operation,
ExceptionContract::requiredIf(
(op == MutationRef::SetVersionstampedKey && (pos < 0 || pos + 10 > key.size() - 4)) ||
(op == MutationRef::SetVersionstampedValue && (pos < 0 || pos + 10 > value.size() - 4))))
};
std::make_pair(error_code_client_invalid_operation,
ExceptionContract::requiredIf((op == MutationRef::SetVersionstampedKey &&
(pos < 0 || pos + 10 > key.size() - 4)) ||
(op == MutationRef::SetVersionstampedValue &&
(pos < 0 || pos + 10 > value.size() - 4)))) };
}
void callback(Reference<ITransaction> tr) override { tr->atomicOp(key, value, (FDBMutationTypes::Option)op); }
@ -1131,11 +1127,10 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
key = makeKey();
}
value = makeValue();
contract = { std::make_pair(
error_code_key_too_large,
ExceptionContract::requiredIf(key.size() > (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT))),
contract = { std::make_pair(error_code_key_too_large,
key.size() > getMaxWriteKeySize(key, true) ? ExceptionContract::Always
: key.size() > getMaxWriteKeySize(key, false) ? ExceptionContract::Possible
: ExceptionContract::Never),
std::make_pair(error_code_value_too_large,
ExceptionContract::requiredIf(value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)),
std::make_pair(error_code_key_outside_legal_range,
@ -1268,11 +1263,11 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
TestWatch(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference<ITransaction> tr)
: BaseTest(id, workload, "TestWatch") {
key = makeKey();
contract = { std::make_pair(
error_code_key_too_large,
ExceptionContract::requiredIf(key.size() > (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT))),
printf("Watching: %d %s\n", key.size(), printable(key.substr(0, std::min(key.size(), 20))).c_str());
contract = { std::make_pair(error_code_key_too_large,
key.size() > getMaxWriteKeySize(key, true) ? ExceptionContract::Always
: key.size() > getMaxWriteKeySize(key, false) ? ExceptionContract::Possible
: ExceptionContract::Never),
std::make_pair(error_code_watches_disabled, ExceptionContract::Possible),
std::make_pair(error_code_key_outside_legal_range,
ExceptionContract::requiredIf((key >= workload->getMaxKey(tr)))),

View File

@ -541,7 +541,12 @@ struct RemoveServersSafelyWorkload : TestWorkload {
state AddressExclusion coordExcl;
// Exclude a coordinator under buggify, but only if fault tolerance is > 0 and kill set is non-empty already
if (BUGGIFY && toKill.size()) {
std::vector<NetworkAddress> coordinators = wait(getCoordinators(cx));
Optional<ClusterConnectionString> csOptional = wait(getConnectionString(cx));
state std::vector<NetworkAddress> coordinators;
if (csOptional.present()) {
ClusterConnectionString cs = csOptional.get();
wait(store(coordinators, cs.tryResolveHostnames()));
}
if (coordinators.size() > 2) {
auto randomCoordinator = deterministicRandom()->randomChoice(coordinators);
coordExcl = AddressExclusion(randomCoordinator.ip, randomCoordinator.port);

View File

@ -957,9 +957,9 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
boost::split(
process_addresses, coordinator_processes_key.get().toString(), [](char c) { return c == ','; });
ASSERT(process_addresses.size() == cs.coordinators().size() + cs.hostnames.size());
wait(cs.resolveHostnames());
// compare the coordinator process network addresses one by one
for (const auto& network_address : cs.coordinators()) {
std::vector<NetworkAddress> coordinators = wait(cs.tryResolveHostnames());
for (const auto& network_address : coordinators) {
ASSERT(std::find(process_addresses.begin(), process_addresses.end(), network_address.toString()) !=
process_addresses.end());
}
@ -1077,19 +1077,20 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
tx->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> res = wait(tx->get(coordinatorsKey));
ASSERT(res.present()); // Otherwise, database is in a bad state
state ClusterConnectionString csNew(res.get().toString());
wait(csNew.resolveHostnames());
ASSERT(csNew.coordinators().size() == old_coordinators_processes.size() + 1);
ClusterConnectionString csNew(res.get().toString());
// verify the cluster decription
ASSERT(new_cluster_description == csNew.clusterKeyName().toString());
ASSERT(csNew.hostnames.size() + csNew.coordinators().size() ==
old_coordinators_processes.size() + 1);
std::vector<NetworkAddress> newCoordinators = wait(csNew.tryResolveHostnames());
// verify the coordinators' addresses
for (const auto& network_address : csNew.coordinators()) {
for (const auto& network_address : newCoordinators) {
std::string address_str = network_address.toString();
ASSERT(std::find(old_coordinators_processes.begin(),
old_coordinators_processes.end(),
address_str) != old_coordinators_processes.end() ||
new_coordinator_process == address_str);
}
// verify the cluster decription
ASSERT(new_cluster_description == csNew.clusterKeyName().toString());
tx->reset();
} catch (Error& e) {
wait(tx->onError(e));

View File

@ -30,7 +30,7 @@ void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
void forceLinkStreamCipherTests();
void forceLinkBLockCiherTests();
void forceLinkBlobCipherTests();
#endif
void forceLinkParallelStreamTests();
void forceLinkSimExternalConnectionTests();
@ -39,6 +39,8 @@ void forceLinkSimKmsConnectorTests();
void forceLinkIThreadPoolTests();
void forceLinkTokenSignTests();
void forceLinkVersionVectorTests();
void forceLinkRESTClientTests();
void forceLinkRESTUtilsTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@ -88,6 +90,8 @@ struct UnitTestWorkload : TestWorkload {
forceLinkIThreadPoolTests();
forceLinkTokenSignTests();
forceLinkVersionVectorTests();
forceLinkRESTClientTests();
forceLinkRESTUtilsTests();
}
std::string description() const override { return "UnitTests"; }

View File

@ -653,9 +653,7 @@ struct WriteDuringReadWorkload : TestWorkload {
for (int j = i; j < end; j++) {
if (deterministicRandom()->random01() < self->initialKeyDensity) {
Key key = self->getKeyForIndex(j);
if (key.size() <= (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT)) {
if (key.size() <= getMaxWriteKeySize(key, false)) {
Value value = self->getRandomValue();
value =
value.substr(0, std::min<int>(value.size(), CLIENT_KNOBS->VALUE_SIZE_LIMIT));
@ -898,18 +896,10 @@ struct WriteDuringReadWorkload : TestWorkload {
tr.clear(range);
if (!noConflict) {
KeyRangeRef conflict(
range.begin.substr(0,
std::min<int>(range.begin.size(),
(range.begin.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1)),
range.end.substr(0,
std::min<int>(range.end.size(),
(range.end.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1)));
range.begin.substr(
0, std::min<int>(range.begin.size(), getMaxClearKeySize(range.begin) + 1)),
range.end.substr(
0, std::min<int>(range.end.size(), getMaxClearKeySize(range.end) + 1)));
self->addedConflicts.insert(conflict, true);
}
self->memoryDatabase.erase(self->memoryDatabase.lower_bound(range.begin),
@ -922,9 +912,7 @@ struct WriteDuringReadWorkload : TestWorkload {
if (noConflict)
tr.setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
tr.clear(key);
if (!noConflict && key.size() <= (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT)) {
if (!noConflict && key.size() <= getMaxClearKeySize(key)) {
self->addedConflicts.insert(key, true);
}
self->memoryDatabase.erase(key);
@ -936,18 +924,9 @@ struct WriteDuringReadWorkload : TestWorkload {
//TraceEvent("WDRAddWriteConflict").detail("Range", range);
tr.addWriteConflictRange(range);
KeyRangeRef conflict(
range.begin.substr(0,
std::min<int>(range.begin.size(),
(range.begin.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1)),
range.end.substr(0,
std::min<int>(range.end.size(),
(range.end.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1)));
range.begin.substr(
0, std::min<int>(range.begin.size(), getMaxKeySize(range.begin) + 1)),
range.end.substr(0, std::min<int>(range.end.size(), getMaxKeySize(range.end) + 1)));
self->addedConflicts.insert(conflict, true);
} else if (operationType == 8 && !disableDelay) {
double maxTime = 6.0;
@ -991,18 +970,10 @@ struct WriteDuringReadWorkload : TestWorkload {
tr.atomicOp(versionStampKey, value, MutationRef::SetVersionstampedKey);
tr.clear(range);
KeyRangeRef conflict(
range.begin.substr(0,
std::min<int>(range.begin.size(),
(range.begin.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1)),
range.end.substr(0,
std::min<int>(range.end.size(),
(range.end.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT) +
1)));
range.begin.substr(
0, std::min<int>(range.begin.size(), getMaxClearKeySize(range.begin) + 1)),
range.end.substr(
0, std::min<int>(range.end.size(), getMaxClearKeySize(range.end) + 1)));
self->addedConflicts.insert(conflict, true);
self->memoryDatabase.erase(self->memoryDatabase.lower_bound(range.begin),
self->memoryDatabase.lower_bound(range.end));
@ -1043,10 +1014,9 @@ struct WriteDuringReadWorkload : TestWorkload {
tr.setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
tr.atomicOp(key, value, opType);
//TraceEvent("WDRAtomicOpSuccess").detail("Key", key).detail("Value", value.size());
if (!noConflict && key.size() <= (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (!noConflict && key.size() <= getMaxWriteKeySize(key, false)) {
self->addedConflicts.insert(key, true);
}
Optional<Value> existing = self->memoryGet(&self->memoryDatabase, key);
self->memoryDatabase[key] =
self->applyAtomicOp(existing.present() ? Optional<StringRef>(existing.get())
@ -1063,10 +1033,9 @@ struct WriteDuringReadWorkload : TestWorkload {
if (noConflict)
tr.setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
tr.set(key, value);
if (!noConflict && key.size() <= (key.startsWith(systemKeys.begin)
? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
: CLIENT_KNOBS->KEY_SIZE_LIMIT))
if (!noConflict && key.size() <= getMaxWriteKeySize(key, false)) {
self->addedConflicts.insert(key, true);
}
//TraceEvent("WDRSetSuccess").detail("Key", key).detail("Value", value.size());
self->memoryDatabase[key] = value;
}

View File

@ -39,6 +39,9 @@
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#if defined(HAVE_WOLFSSL)
#include <wolfssl/options.h>
#endif
#include <openssl/aes.h>
#include <openssl/engine.h>
#include <openssl/evp.h>

View File

@ -84,6 +84,10 @@ set(FLOW_SRCS
actorcompiler.h
crc32c.h
crc32c.cpp
ppc-asm.h
crc32.S
crc32_wrapper.h
crc32_wrapper.c
error_definitions.h
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
flat_buffers.cpp
@ -172,6 +176,10 @@ if(NOT WITH_TLS)
else()
target_link_libraries(flow PUBLIC OpenSSL::SSL)
target_link_libraries(flow_sampling PUBLIC OpenSSL::SSL)
if(USE_WOLFSSL)
target_include_directories(flow SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl)
target_include_directories(flow_sampling SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl)
endif()
endif()
target_link_libraries(flow PUBLIC Threads::Threads ${CMAKE_DL_LIBS})
target_link_libraries(flow_sampling PUBLIC Threads::Threads ${CMAKE_DL_LIBS})

Some files were not shown because too many files have changed in this diff Show More