Merge remote-tracking branch 'apple/main' into vgasiunas-fdbmonitor-in-python

2022-11-03 10:56:54 +01:00 · 2022-11-03 10:56:54 +01:00 · 423049f0f8
parent 0802bbfbb0 9de72eb675
commit 423049f0f8
95 changed files with 2124 additions and 743 deletions
--- a/.flake8
+++ b/.flake8
@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E266, E501, W503, F403, F401, E711
+ignore = E203, E266, E501, W503, F403, F401, E711, C901
 max-line-length = 79
 max-complexity = 18
 select = B,C,E,F,W,T4,B9
--- a/bindings/bindingtester/tests/api.py
+++ b/bindings/bindingtester/tests/api.py
@ -577,7 +577,7 @@ class ApiTest(Test):
                    key1, key2 = key2, key1

                # TODO: randomize chunkSize but should not exceed 100M(shard limit)
-                chunkSize = 10000000 # 10M
+                chunkSize = 10000000  # 10M
                instructions.push_args(key1, key2, chunkSize)
                instructions.append(op)
                self.add_strings(1)
--- a/bindings/bindingtester/tests/directory.py
+++ b/bindings/bindingtester/tests/directory.py
@ -114,7 +114,7 @@ class DirectoryTest(Test):
            instructions.push_args(layer)
            instructions.push_args(*test_util.with_length(path))
            instructions.append('DIRECTORY_OPEN')
-            self.dir_list.append(self.root.add_child(path, DirectoryStateTreeNode(True, True, has_known_prefix=False, is_partition=(layer==b'partition'))))
+            self.dir_list.append(self.root.add_child(path, DirectoryStateTreeNode(True, True, has_known_prefix=False, is_partition=(layer == b'partition'))))
            # print('%d. Selected %s, dir=%s, dir_id=%s, has_known_prefix=%s, dir_list_len=%d' \
            #       % (len(instructions), 'DIRECTORY_OPEN', repr(self.dir_index), self.dir_list[-1].dir_id, False, len(self.dir_list)-1))

@ -163,8 +163,8 @@ class DirectoryTest(Test):

            elif root_op == 'DIRECTORY_CREATE_LAYER':
                indices = []
-                
-                prefixes = [generate_prefix(require_unique=args.concurrency==1, is_partition=True) for i in range(2)]
+
+                prefixes = [generate_prefix(require_unique=args.concurrency == 1, is_partition=True) for i in range(2)]
                for i in range(2):
                    instructions.push_args(prefixes[i])
                    instructions.push_args(*test_util.with_length(generate_path()))
@ -184,9 +184,9 @@ class DirectoryTest(Test):
                    test_util.blocking_commit(instructions)

                path = generate_path()
-                # Partitions that use the high-contention allocator can result in non-determinism if they fail to commit, 
+                # Partitions that use the high-contention allocator can result in non-determinism if they fail to commit,
                # so we disallow them in comparison tests
-                op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency>1),)
+                op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency > 1),)
                directory_util.push_instruction_and_record_prefix(instructions, op, op_args, path, len(self.dir_list), self.random, self.prefix_log)

                if not op.endswith('_DATABASE') and args.concurrency == 1:
@ -196,14 +196,14 @@ class DirectoryTest(Test):
                if child_entry is None:
                    child_entry = DirectoryStateTreeNode(True, True)

-                child_entry.state.has_known_prefix = False  
+                child_entry.state.has_known_prefix = False
                self.dir_list.append(dir_entry.add_child(path, child_entry))

            elif root_op == 'DIRECTORY_CREATE':
                layer = self.generate_layer()
                is_partition = layer == b'partition'

-                prefix = generate_prefix(require_unique=is_partition and args.concurrency==1, is_partition=is_partition, min_length=0)
+                prefix = generate_prefix(require_unique=is_partition and args.concurrency == 1, is_partition=is_partition, min_length=0)

                # Because allocated prefixes are non-deterministic, we cannot have overlapping
                # transactions that allocate/remove these prefixes in a comparison test
@ -409,7 +409,7 @@ def generate_prefix(require_unique=False, is_partition=False, min_length=1):
        if require_unique:
            min_length = max(min_length, 16)

-        length = random.randint(min_length, min_length+5)
+        length = random.randint(min_length, min_length + 5)
        if length == 0:
            return b''

@ -419,6 +419,6 @@ def generate_prefix(require_unique=False, is_partition=False, min_length=1):
        else:
            return bytes([random.randrange(ord('\x02'), ord('\x14')) for i in range(0, length)])
    else:
-        prefix = fixed_prefix 
+        prefix = fixed_prefix
        generated = prefix[0:random.randrange(min_length, len(prefix))]
        return generated
--- a/bindings/bindingtester/tests/directory_state_tree.py
+++ b/bindings/bindingtester/tests/directory_state_tree.py
@ -1,5 +1,6 @@
 import sys

+
 class TreeNodeState:
    def __init__(self, node, dir_id, is_directory, is_subspace, has_known_prefix, root, is_partition):
        self.dir_id = dir_id
@ -9,10 +10,11 @@ class TreeNodeState:
        self.root = root
        self.is_partition = is_partition

-        self.parents = { node }
+        self.parents = {node}
        self.children = {}
        self.deleted = False

+
 # Represents an element of the directory hierarchy. As a result of various operations (e.g. moves) that
 # may or may not have succeeded, a node can represent multiple possible states.
 class DirectoryStateTreeNode:
@ -25,7 +27,7 @@ class DirectoryStateTreeNode:
    default_directory = None

    # Used for debugging
-    dir_id = 0  
+    dir_id = 0

    @classmethod
    def reset(cls):
@ -62,7 +64,7 @@ class DirectoryStateTreeNode:
        if default is not None:
            default_child = default.state.children.get(subpath[0])

-        self_child = self.state.children.get(subpath[0]) 
+        self_child = self.state.children.get(subpath[0])

        if self_child is None:
            if default_child is None:
@ -143,13 +145,15 @@ class DirectoryStateTreeNode:
        child = self.get_descendent(path)
        if child:
            child._delete_impl()
-    
+
+
 def validate_dir(dir, root):
    if dir.state.is_directory:
        assert dir.state.root == root
    else:
        assert dir.state.root == dir

+
 def run_test():
    all_entries = []

@ -249,11 +253,11 @@ def run_test():
    # Test moving an entry
    assert not entry.state.has_known_prefix
    assert not entry.state.is_subspace
-    assert list(entry.state.children.keys()) == ['1']    
+    assert list(entry.state.children.keys()) == ['1']

    for e in all_entries:
        validate_dir(e, root)

+
 if __name__ == '__main__':
    sys.exit(run_test())
-
--- a/bindings/bindingtester/tests/directory_util.py
+++ b/bindings/bindingtester/tests/directory_util.py
@ -18,7 +18,6 @@
 # limitations under the License.
 #

-import random
 import struct

 import fdb
@ -35,6 +34,7 @@ DEFAULT_DIRECTORY_INDEX = 4
 DEFAULT_DIRECTORY_PREFIX = b'default'
 DIRECTORY_ERROR_STRING = b'DIRECTORY_ERROR'

+
 def setup_directories(instructions, default_path, random):
    # Clients start with the default directory layer in the directory list
    DirectoryStateTreeNode.reset()
--- a/bindings/bindingtester/tests/test_util.py
+++ b/bindings/bindingtester/tests/test_util.py
@ -107,7 +107,7 @@ class RandomGenerator(object):
                user_version = random.randint(0, 0xffff)
                tup.append(fdb.tuple.Versionstamp(tr_version, user_version))
            else:
-                assert false
+                assert False

        return tuple(tup)

--- a/bindings/bindingtester/tests/tuple.py
+++ b/bindings/bindingtester/tests/tuple.py
@ -31,6 +31,7 @@ from bindingtester.tests import test_util

 fdb.api_version(FDB_API_VERSION)

+
 class TupleTest(Test):
    def __init__(self, subspace):
        super(TupleTest, self).__init__(subspace)
@ -44,14 +45,14 @@ class TupleTest(Test):
    def generate(self, args, thread_number):
        instructions = InstructionSet()

-        min_value = -2**self.max_int_bits+1
-        max_value = 2**self.max_int_bits-1
+        min_value = -2**self.max_int_bits + 1
+        max_value = 2**self.max_int_bits - 1

        instructions.append('NEW_TRANSACTION')

        # Test integer encoding
        mutations = 0
-        for i in range(0, self.max_int_bits+1):
+        for i in range(0, self.max_int_bits + 1):
            for sign in [-1, 1]:
                sign_str = '' if sign == 1 else '-'
                for offset in range(-10, 11):
--- a/bindings/c/fdb_c.cpp
+++ b/bindings/c/fdb_c.cpp
@ -21,7 +21,7 @@
 #include "fdbclient/FDBTypes.h"
 #include "flow/ProtocolVersion.h"
 #include <cstdint>
-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #define FDB_INCLUDE_LEGACY_TYPES

 #include "fdbclient/MultiVersionTransaction.h"
@ -905,6 +905,10 @@ extern "C" DLLEXPORT fdb_error_t fdb_transaction_get_committed_version(FDBTransa
 	CATCH_AND_RETURN(*out_version = TXN(tr)->getCommittedVersion(););
 }

+extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_total_cost(FDBTransaction* tr) {
+	return (FDBFuture*)TXN(tr)->getTotalCost().extractPtr();
+}
+
 extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr) {
 	return (FDBFuture*)TXN(tr)->getApproximateSize().extractPtr();
 }
--- a/bindings/c/foundationdb/fdb_c.h
+++ b/bindings/c/foundationdb/fdb_c.h
@ -27,10 +27,10 @@
 #endif

 #if !defined(FDB_API_VERSION)
-#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 720)
+#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 730)
 #elif FDB_API_VERSION < 13
 #error API version no longer supported (upgrade to 13)
-#elif FDB_API_VERSION > 720
+#elif FDB_API_VERSION > 730
 #error Requested API version requires a newer version of this header
 #endif

@ -514,12 +514,14 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_transaction_get_committed_version(F
                                                                               int64_t* out_version);

 /*
- * This function intentionally returns an FDBFuture instead of an integer
- * directly, so that calling this API can see the effect of previous
+ * These functions intentionally return an FDBFuture instead of an integer
+ * directly, so that calling the API can see the effect of previous
 * mutations on the transaction. Specifically, mutations are applied
 * asynchronously by the main thread. In order to see them, this call has to
 * be serviced by the main thread too.
 */
+DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_total_cost(FDBTransaction* tr);
+
 DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr);

 DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_versionstamp(FDBTransaction* tr);
--- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp
+++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp
@ -20,11 +20,14 @@
 #include "TesterApiWorkload.h"
 #include "TesterBlobGranuleUtil.h"
 #include "TesterUtil.h"
+#include <unordered_set>
 #include <memory>
 #include <fmt/format.h>

 namespace FdbApiTester {

+#define BG_API_DEBUG_VERBOSE false
+
 class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload {
 public:
 	ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
@ -35,7 +38,7 @@ public:
 	}

 private:
-	// FIXME: use other new blob granule apis!
+	// FIXME: add tenant support for DB operations
 	enum OpType {
 		OP_INSERT,
 		OP_CLEAR,
@ -51,7 +54,27 @@ private:

 	// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
 	// FIXME: should still guarantee a read succeeds eventually somehow
-	bool seenReadSuccess = false;
+	// FIXME: this needs to be per tenant if tenant ids are set
+	std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
+
+	inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
+
+	inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
+
+	std::string tenantDebugString(std::optional<int> tenantId) {
+		return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
+	}
+
+	void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
+		if (BG_API_DEBUG_VERBOSE) {
+			info(fmt::format("{0}: [{1} - {2}){3}: {4}",
+			                 opName,
+			                 fdb::toCharsRef(begin),
+			                 fdb::toCharsRef(end),
+			                 tenantDebugString(tenantId),
+			                 message));
+		}
+	}

 	void randomReadOp(TTaskFct cont, std::optional<int> tenantId) {
 		fdb::Key begin = randomKeyName();
@ -63,8 +86,10 @@ private:
 		auto results = std::make_shared<std::vector<fdb::KeyValue>>();
 		auto tooOld = std::make_shared<bool>(false);

+		debugOp("Read", begin, end, tenantId, "starting");
+
 		execTransaction(
-		    [this, begin, end, results, tooOld](auto ctx) {
+		    [this, begin, end, tenantId, results, tooOld](auto ctx) {
 			    ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
 			    TesterGranuleContext testerContext(ctx->getBGBasePath());
 			    fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext);
@ -74,8 +99,13 @@ private:
 			    auto out = fdb::Result::KeyValueRefArray{};
 			    fdb::Error err = res.getKeyValueArrayNothrow(out);
 			    if (err.code() == error_code_blob_granule_transaction_too_old) {
-				    info("BlobGranuleCorrectness::randomReadOp bg too old\n");
-				    ASSERT(!seenReadSuccess);
+				    bool previousSuccess = seenReadSuccess(tenantId);
+				    if (previousSuccess) {
+					    error("Read bg too old after read success!\n");
+				    } else {
+					    info("Read bg too old\n");
+				    }
+				    ASSERT(!previousSuccess);
 				    *tooOld = true;
 				    ctx->done();
 			    } else if (err.code() != error_code_success) {
@ -85,10 +115,13 @@ private:
 				    auto& [resVector, out_more] = resCopy;
 				    ASSERT(!out_more);
 				    results.get()->assign(resVector.begin(), resVector.end());
-				    if (!seenReadSuccess) {
-					    info("BlobGranuleCorrectness::randomReadOp first success\n");
+				    bool previousSuccess = seenReadSuccess(tenantId);
+				    if (!previousSuccess) {
+					    info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId)));
+					    setReadSuccess(tenantId);
+				    } else {
+					    debugOp("Read", begin, end, tenantId, "complete");
 				    }
-				    seenReadSuccess = true;
 				    ctx->done();
 			    }
 		    },
@ -97,7 +130,7 @@ private:
 				    std::vector<fdb::KeyValue> expected =
 				        stores[tenantId].getRange(begin, end, stores[tenantId].size(), false);
 				    if (results->size() != expected.size()) {
-					    error(fmt::format("randomReadOp result size mismatch. expected: {} actual: {}",
+					    error(fmt::format("randomReadOp result size mismatch. expected: {0} actual: {1}",
 					                      expected.size(),
 					                      results->size()));
 				    }
@ -105,7 +138,7 @@ private:

 				    for (int i = 0; i < results->size(); i++) {
 					    if ((*results)[i].key != expected[i].key) {
-						    error(fmt::format("randomReadOp key mismatch at {}/{}. expected: {} actual: {}",
+						    error(fmt::format("randomReadOp key mismatch at {0}/{1}. expected: {2} actual: {3}",
 						                      i,
 						                      results->size(),
 						                      fdb::toCharsRef(expected[i].key),
@ -138,6 +171,8 @@ private:
 		}
 		auto results = std::make_shared<std::vector<fdb::KeyRange>>();

+		debugOp("GetGranules", begin, end, tenantId, "starting");
+
 		execTransaction(
 		    [begin, end, results](auto ctx) {
 			    fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType();
@ -149,15 +184,17 @@ private:
 			        },
 			        true);
 		    },
-		    [this, begin, end, results, cont]() {
-			    this->validateRanges(results, begin, end, seenReadSuccess);
+		    [this, begin, end, tenantId, results, cont]() {
+			    debugOp(
+			        "GetGranules", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
+			    this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
 			    schedule(cont);
 		    },
 		    getTenant(tenantId));
 	}

 	void randomSummarizeOp(TTaskFct cont, std::optional<int> tenantId) {
-		if (!seenReadSuccess) {
+		if (!seenReadSuccess(tenantId)) {
 			// tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a
 			// read success
 			schedule(cont);
@ -169,6 +206,9 @@ private:
 			std::swap(begin, end);
 		}
 		auto results = std::make_shared<std::vector<fdb::GranuleSummary>>();
+
+		debugOp("Summarize", begin, end, tenantId, "starting");
+
 		execTransaction(
 		    [begin, end, results](auto ctx) {
 			    fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType();
@ -180,10 +220,11 @@ private:
 			        },
 			        true);
 		    },
-		    [this, begin, end, results, cont]() {
-			    ASSERT(results->size() > 0);
-			    ASSERT(results->front().keyRange.beginKey <= begin);
-			    ASSERT(results->back().keyRange.endKey >= end);
+		    [this, begin, end, tenantId, results, cont]() {
+			    debugOp("Summarize", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
+
+			    // use validateRanges to share validation
+			    auto ranges = std::make_shared<std::vector<fdb::KeyRange>>();

 			    for (int i = 0; i < results->size(); i++) {
 				    // TODO: could do validation of subsequent calls and ensure snapshot version never decreases
@ -191,12 +232,11 @@ private:
 				    ASSERT((*results)[i].snapshotVersion <= (*results)[i].deltaVersion);
 				    ASSERT((*results)[i].snapshotSize > 0);
 				    ASSERT((*results)[i].deltaSize >= 0);
+
+				    ranges->push_back((*results)[i].keyRange);
 			    }

-			    for (int i = 1; i < results->size(); i++) {
-				    // ranges contain entire requested key range
-				    ASSERT((*results)[i].keyRange.beginKey == (*results)[i - 1].keyRange.endKey);
-			    }
+			    this->validateRanges(ranges, begin, end, true);

 			    schedule(cont);
 		    },
@ -208,18 +248,29 @@ private:
 	                    fdb::Key end,
 	                    bool shouldBeRanges) {
 		if (shouldBeRanges) {
+			if (results->size() == 0) {
+				error(fmt::format(
+				    "ValidateRanges: [{0} - {1}): No ranges returned!", fdb::toCharsRef(begin), fdb::toCharsRef(end)));
+			}
 			ASSERT(results->size() > 0);
+			if (results->front().beginKey > begin || results->back().endKey < end) {
+				error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
+				                  fdb::toCharsRef(begin),
+				                  fdb::toCharsRef(end),
+				                  fdb::toCharsRef(results->front().beginKey),
+				                  fdb::toCharsRef(results->back().endKey)));
+			}
 			ASSERT(results->front().beginKey <= begin);
 			ASSERT(results->back().endKey >= end);
 		}
 		for (int i = 0; i < results->size(); i++) {
 			// no empty or inverted ranges
 			if ((*results)[i].beginKey >= (*results)[i].endKey) {
-				error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
-				                  fdb::toCharsRef((*results)[i].beginKey),
-				                  fdb::toCharsRef((*results)[i].endKey),
+				error(fmt::format("ValidateRanges: [{0} - {1}): Empty/inverted range [{2} - {3})",
 				                  fdb::toCharsRef(begin),
-				                  fdb::toCharsRef(end)));
+				                  fdb::toCharsRef(end),
+				                  fdb::toCharsRef((*results)[i].beginKey),
+				                  fdb::toCharsRef((*results)[i].endKey)));
 			}
 			ASSERT((*results)[i].beginKey < (*results)[i].endKey);
 		}
@ -227,16 +278,17 @@ private:
 		for (int i = 1; i < results->size(); i++) {
 			// ranges contain entire requested key range
 			if ((*results)[i].beginKey != (*results)[i].endKey) {
-				error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
-				                  fdb::toCharsRef((*results)[i].beginKey),
-				                  fdb::toCharsRef((*results)[i].endKey),
+				error(fmt::format("ValidateRanges: [{0} - {1}): Non-covereed range [{2} - {3})",
 				                  fdb::toCharsRef(begin),
-				                  fdb::toCharsRef(end)));
+				                  fdb::toCharsRef(end),
+				                  fdb::toCharsRef((*results)[i - 1].endKey),
+				                  fdb::toCharsRef((*results)[i].endKey)));
 			}
 			ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey);
 		}
 	}

+	// TODO: tenant support
 	void randomGetBlobRangesOp(TTaskFct cont) {
 		fdb::Key begin = randomKeyName();
 		fdb::Key end = randomKeyName();
@ -244,6 +296,10 @@ private:
 		if (begin > end) {
 			std::swap(begin, end);
 		}
+		std::optional<int> tenantId = {};
+
+		debugOp("GetBlobRanges", begin, end, tenantId, "starting");
+
 		execOperation(
 		    [begin, end, results](auto ctx) {
 			    fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
@ -252,22 +308,27 @@ private:
 				    ctx->done();
 			    });
 		    },
-		    [this, begin, end, results, cont]() {
-			    this->validateRanges(results, begin, end, seenReadSuccess);
+		    [this, begin, end, tenantId, results, cont]() {
+			    debugOp(
+			        "GetBlobRanges", begin, end, tenantId, fmt::format("complete with {0} ranges", results->size()));
+			    this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
 			    schedule(cont);
 		    },
 		    /* failOnError = */ false);
 	}

+	// TODO: tenant support
 	void randomVerifyOp(TTaskFct cont) {
 		fdb::Key begin = randomKeyName();
 		fdb::Key end = randomKeyName();
+		std::optional<int> tenantId;
 		if (begin > end) {
 			std::swap(begin, end);
 		}

 		auto verifyVersion = std::make_shared<int64_t>(false);
-		// info("Verify op starting");
+
+		debugOp("Verify", begin, end, tenantId, "starting");

 		execOperation(
 		    [begin, end, verifyVersion](auto ctx) {
@ -277,16 +338,15 @@ private:
 				    ctx->done();
 			    });
 		    },
-		    [this, begin, end, verifyVersion, cont]() {
+		    [this, begin, end, tenantId, verifyVersion, cont]() {
+			    debugOp("Verify", begin, end, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
+			    bool previousSuccess = seenReadSuccess(tenantId);
 			    if (*verifyVersion == -1) {
-				    ASSERT(!seenReadSuccess);
-			    } else {
-				    if (!seenReadSuccess) {
-					    info("BlobGranuleCorrectness::randomVerifyOp first success");
-				    }
-				    seenReadSuccess = true;
+				    ASSERT(!previousSuccess);
+			    } else if (!previousSuccess) {
+				    info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId)));
+				    setReadSuccess(tenantId);
 			    }
-			    // info(fmt::format("verify op done @ {}", *verifyVersion));
 			    schedule(cont);
 		    },
 		    /* failOnError = */ false);
--- a/bindings/c/test/apitester/TesterTestSpec.h
+++ b/bindings/c/test/apitester/TesterTestSpec.h
@ -27,7 +27,7 @@
 #include <unordered_map>
 #include <vector>

-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730

 namespace FdbApiTester {

--- a/bindings/c/test/apitester/fdb_c_api_tester.cpp
+++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp
@ -36,7 +36,7 @@ namespace FdbApiTester {

 namespace {

-#define API_VERSION_CLIENT_TMP_DIR 720
+#define API_VERSION_CLIENT_TMP_DIR 730

 enum TesterOptionId {
 	OPT_CONNFILE,
--- a/bindings/c/test/client_memory_test.cpp
+++ b/bindings/c/test/client_memory_test.cpp
@ -18,7 +18,7 @@
 * limitations under the License.
 */

-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #include <foundationdb/fdb_c.h>

 #include "unit/fdb_api.hpp"
--- a/bindings/c/test/fdb_api.hpp
+++ b/bindings/c/test/fdb_api.hpp
@ -23,7 +23,7 @@
 #pragma once

 #ifndef FDB_API_VERSION
-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #endif

 #include <cassert>
@ -716,6 +716,12 @@ public:
 			throwError("Failed to create transaction: ", err);
 		return Transaction(tx_native);
 	}
+
+	TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
+		if (!tenant)
+			throw std::runtime_error("blobbifyRange from null tenant");
+		return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
+	}
 };

 class Database {
--- a/bindings/c/test/mako/mako.cpp
+++ b/bindings/c/test/mako/mako.cpp
@ -283,24 +283,60 @@ int populate(Database db,
 			int batch_size = args.tenant_batch_size;
 			int batches = (args.total_tenants + batch_size - 1) / batch_size;
 			for (int batch = 0; batch < batches; ++batch) {
+				while (1) {
+					for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
+						std::string tenant_str = "tenant" + std::to_string(i);
+						Tenant::createTenant(systemTx, toBytesRef(tenant_str));
+					}
+					auto future_commit = systemTx.commit();
+					const auto rc = waitAndHandleError(systemTx, future_commit, "CREATE_TENANT");
+					if (rc == FutureRC::OK) {
+						// Keep going with reset transaction if commit was successful
+						systemTx.reset();
+						break;
+					} else if (rc == FutureRC::RETRY) {
+						// We want to retry this batch. Transaction is already reset
+					} else {
+						// Abort
+						return -1;
+					}
+				}
+
+				Tenant tenants[batch_size];
+				fdb::TypedFuture<fdb::future_var::Bool> blobbifyResults[batch_size];
+
+				// blobbify tenant ranges explicitly
+				// FIXME: skip if database not configured for blob granules?
 				for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
-					std::string tenant_name = "tenant" + std::to_string(i);
-					Tenant::createTenant(systemTx, toBytesRef(tenant_name));
+					std::string tenant_str = "tenant" + std::to_string(i);
+					BytesRef tenant_name = toBytesRef(tenant_str);
+					tenants[i] = db.openTenant(tenant_name);
+					std::string rangeEnd = "\xff";
+					blobbifyResults[i - (batch * batch_size)] =
+					    tenants[i].blobbifyRange(BytesRef(), toBytesRef(rangeEnd));
 				}
-				auto future_commit = systemTx.commit();
-				const auto rc = waitAndHandleError(systemTx, future_commit, "CREATE_TENANT");
-				if (rc == FutureRC::OK) {
-					// Keep going with reset transaction if commit was successful
-					systemTx.reset();
-				} else if (rc == FutureRC::RETRY) {
-					// We want to retry this batch, so decrement the number
-					// and go back through the loop to get the same value
-					// Transaction is already reset
-					--batch;
-				} else {
-					// Abort
-					return -1;
+
+				for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
+					while (true) {
+						// not technically an operation that's part of systemTx, but it works
+						const auto rc =
+						    waitAndHandleError(systemTx, blobbifyResults[i - (batch * batch_size)], "BLOBBIFY_TENANT");
+						if (rc == FutureRC::OK) {
+							if (!blobbifyResults[i - (batch * batch_size)].get()) {
+								fmt::print("Blobbifying tenant {0} failed!\n", i);
+								return -1;
+							}
+							break;
+						} else if (rc == FutureRC::RETRY) {
+							continue;
+						} else {
+							// Abort
+							return -1;
+						}
+					}
 				}
+
+				systemTx.reset();
 			}
 		} else {
 			std::string last_tenant_name = "tenant" + std::to_string(args.total_tenants - 1);
@ -1261,7 +1297,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 			/* name, has_arg, flag, val */
 			{ "api_version", required_argument, NULL, 'a' },
 			{ "cluster", required_argument, NULL, 'c' },
-			{ "num_databases", optional_argument, NULL, 'd' },
+			{ "num_databases", required_argument, NULL, 'd' },
 			{ "procs", required_argument, NULL, 'p' },
 			{ "threads", required_argument, NULL, 't' },
 			{ "async_xacts", required_argument, NULL, ARG_ASYNC },
@ -1312,6 +1348,17 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 			{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
 			{ NULL, 0, NULL, 0 }
 		};
+
+/* For optional arguments, optarg is only set when the argument is passed as "--option=[ARGUMENT]" but not as
+ "--option [ARGUMENT]". This macro sets optarg in the latter case. See
+ https://cfengine.com/blog/2021/optional-arguments-with-getopt-long/ for a more detailed explanation */
+#define SET_OPT_ARG_IF_PRESENT()                                                                                       \
+	{                                                                                                                  \
+		if (optarg == NULL && optind < argc && argv[optind][0] != '-') {                                               \
+			optarg = argv[optind++];                                                                                   \
+		}                                                                                                              \
+	}
+
 		idx = 0;
 		c = getopt_long(argc, argv, short_options, long_options, &idx);
 		if (c < 0) {
@ -1513,9 +1560,8 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 			args.disable_ryw = 1;
 			break;
 		case ARG_JSON_REPORT:
-			if (optarg == NULL && (argv[optind] == NULL || (argv[optind] != NULL && argv[optind][0] == '-'))) {
-				// if --report_json is the last option and no file is specified
-				// or --report_json is followed by another option
+			SET_OPT_ARG_IF_PRESENT();
+			if (!optarg) {
 				char default_file[] = "mako.json";
 				strncpy(args.json_output_path, default_file, sizeof(default_file));
 			} else {
@ -1526,13 +1572,12 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 			args.bg_materialize_files = true;
 			strncpy(args.bg_file_path, optarg, std::min(sizeof(args.bg_file_path), strlen(optarg) + 1));
 		case ARG_EXPORT_PATH:
-			if (optarg == NULL && (argv[optind] == NULL || (argv[optind] != NULL && argv[optind][0] == '-'))) {
+			SET_OPT_ARG_IF_PRESENT();
+			if (!optarg) {
 				char default_file[] = "sketch_data.json";
 				strncpy(args.stats_export_path, default_file, sizeof(default_file));
 			} else {
-				strncpy(args.stats_export_path,
-				        argv[optind],
-				        std::min(sizeof(args.stats_export_path), strlen(argv[optind]) + 1));
+				strncpy(args.stats_export_path, optarg, std::min(sizeof(args.stats_export_path), strlen(optarg) + 1));
 			}
 			break;
 		case ARG_DISTRIBUTED_TRACER_CLIENT:
--- a/bindings/c/test/mako/mako.hpp
+++ b/bindings/c/test/mako/mako.hpp
@ -22,7 +22,7 @@
 #define MAKO_HPP

 #ifndef FDB_API_VERSION
-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #endif

 #include <array>
--- a/bindings/c/test/test.h
+++ b/bindings/c/test/test.h
@ -29,7 +29,7 @@
 #include <inttypes.h>

 #ifndef FDB_API_VERSION
-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #endif

 #include <foundationdb/fdb_c.h>
--- a/bindings/c/test/unit/disconnected_timeout_tests.cpp
+++ b/bindings/c/test/unit/disconnected_timeout_tests.cpp
@ -20,7 +20,7 @@

 // Unit tests that test the timeouts for a disconnected cluster

-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #include <foundationdb/fdb_c.h>

 #include <chrono>
--- a/bindings/c/test/unit/fdb_api.cpp
+++ b/bindings/c/test/unit/fdb_api.cpp
@ -231,6 +231,10 @@ Int64Future Transaction::get_approximate_size() {
 	return Int64Future(fdb_transaction_get_approximate_size(tr_));
 }

+Int64Future Transaction::get_total_cost() {
+	return Int64Future(fdb_transaction_get_total_cost(tr_));
+}
+
 KeyFuture Transaction::get_versionstamp() {
 	return KeyFuture(fdb_transaction_get_versionstamp(tr_));
 }
--- a/bindings/c/test/unit/fdb_api.hpp
+++ b/bindings/c/test/unit/fdb_api.hpp
@ -39,7 +39,7 @@

 #pragma once

-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #include <foundationdb/fdb_c.h>

 #include <string>
@ -276,6 +276,9 @@ public:
 	// Returns a future which will be set to the approximate transaction size so far.
 	Int64Future get_approximate_size();

+	// Returns a future which will be set to the transaction's total cost so far.
+	Int64Future get_total_cost();
+
 	// Returns a future which will be set to the versionstamp which was used by
 	// any versionstamp operations in the transaction.
 	KeyFuture get_versionstamp();
--- a/bindings/c/test/unit/setup_tests.cpp
+++ b/bindings/c/test/unit/setup_tests.cpp
@ -20,7 +20,7 @@

 // Unit tests for API setup, network initialization functions from the FDB C API.

-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #include <foundationdb/fdb_c.h>
 #include <iostream>
 #include <thread>
--- a/bindings/c/test/unit/unit_tests.cpp
+++ b/bindings/c/test/unit/unit_tests.cpp
@ -21,7 +21,7 @@
 // Unit tests for the FoundationDB C API.

 #include "fdb_c_options.g.h"
-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #include <foundationdb/fdb_c.h>
 #include <assert.h>
 #include <string.h>
@ -1945,6 +1945,30 @@ TEST_CASE("fdb_transaction_get_committed_version") {
 	}
 }

+TEST_CASE("fdb_transaction_get_total_cost") {
+	fdb::Transaction tr(db);
+	while (1) {
+		fdb::ValueFuture f1 = tr.get("foo", /*snapshot*/ false);
+		fdb_error_t err = wait_future(f1);
+		if (err) {
+			fdb::EmptyFuture fOnError = tr.on_error(err);
+			fdb_check(wait_future(fOnError));
+			continue;
+		}
+		fdb::Int64Future f2 = tr.get_total_cost();
+		err = wait_future(f2);
+		if (err) {
+			fdb::EmptyFuture fOnError = tr.on_error(err);
+			fdb_check(wait_future(fOnError));
+			continue;
+		}
+		int64_t cost;
+		fdb_check(f2.get(&cost));
+		CHECK(cost > 0);
+		break;
+	}
+}
+
 TEST_CASE("fdb_transaction_get_approximate_size") {
 	fdb::Transaction tr(db);
 	while (1) {
--- a/bindings/c/test/workloads/SimpleWorkload.cpp
+++ b/bindings/c/test/workloads/SimpleWorkload.cpp
@ -18,7 +18,7 @@
 * limitations under the License.
 */

-#define FDB_API_VERSION 720
+#define FDB_API_VERSION 730
 #include "foundationdb/fdb_c.h"
 #undef DLLEXPORT
 #include "workloads.h"
--- a/design/global-tag-throttling.md
+++ b/design/global-tag-throttling.md
@ -11,16 +11,16 @@ The global tag throttler bases throttling decisions on "quotas" provided by clie
 The global tag throttler cannot throttle tags to a throughput below the reserved quota, and it cannot allow throughput to exceed the total quota.

 ### Cost
-Internally, the units for these quotas are "page costs", computed as follows. The "page cost" of a read operation is computed as:
+Internally, the units for these quotas are bytes. The cost of an operation is rounded up to the nearest multiple of the page size. The cost of a read operation is computed as:

 ```
-readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
+readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
 ```

-The "page cost" of a write operation is computed as:
+The cost of a write operation is computed as:

 ```
-writeCost = SERVER_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR);
+writeCost = CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
 ```

 Here `bytesWritten` includes cleared bytes. The size of range clears is estimated at commit time.
@ -41,12 +41,6 @@ To set the quota through `fdbcli`, run:
 fdbcli> quota set <tag> [reserved_throughput|total_throughput] <bytes_per_second>
 ```

-Note that the quotas are specified in terms of bytes/second, and internally converted to page costs:
-
-```
-page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR)
-```
-
 To clear both reserved and total throughput quotas for a tag, run:

 ```
--- a/documentation/sphinx/source/release-notes/release-notes-630.rst
+++ b/documentation/sphinx/source/release-notes/release-notes-630.rst
@ -2,6 +2,12 @@
 Release Notes
 #############

+6.3.25
+======
+* Fixed a transaction log data corruption bug. `(PR #8558) <https://github.com/apple/foundationdb/pull/8558>`_
+* Fixed a special keyspace ``SpecialKeyRangeAsyncImpl::getRange`` bug. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_
+* Fixed a special keyspace ``ConflictingKeysImpl::getRange`` bug. `(PR #7724) <https://github.com/apple/foundationdb/pull/7724>`_
+
 6.3.24
 ======
 * Fixed a bug where get key location can overload proxies. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_ 
--- a/documentation/sphinx/source/release-notes/release-notes-710.rst
+++ b/documentation/sphinx/source/release-notes/release-notes-710.rst
@ -2,6 +2,24 @@
 Release Notes
 #############

+7.1.25
+======
+* Same as 7.1.24 release with AVX enabled.
+
+7.1.24
+======
+* Released with AVX disabled.
+* Fixed a transaction log data corruption bug. `(PR #8525) <https://github.com/apple/foundationdb/pull/8525>`_, `(PR #8562) <https://github.com/apple/foundationdb/pull/8562>`_, and `(PR #8647) <https://github.com/apple/foundationdb/pull/8647>`_
+* Fixed a rare data race in transaction logs when PEEK_BATCHING_EMPTY_MSG is enabled. `(PR #8660) <https://github.com/apple/foundationdb/pull/8660>`_
+* Changed consistency check to report all corruptions. `(PR #8571) <https://github.com/apple/foundationdb/pull/8571>`_
+* Fixed a rare storage server crashing bug after recovery. `(PR #8468) <https://github.com/apple/foundationdb/pull/8468>`_
+* Added client knob UNLINKONLOAD_FDBCLIB to control deletion of external client libraries. `(PR #8434) <https://github.com/apple/foundationdb/pull/8434>`_
+* Updated the default peer latency degradation percentile to 0.5. `(PR #8370) <https://github.com/apple/foundationdb/pull/8370>`_
+* Made exclusion less pessimistic when warning about low space usage. `(PR #8347) <https://github.com/apple/foundationdb/pull/8347>`_ 
+* Added storage server readrange and update latency metrics. `(PR #8353) <https://github.com/apple/foundationdb/pull/8353>`_
+* Increased the default PEER_DEGRADATION_CONNECTION_FAILURE_COUNT value to 5s. `(PR #8336) <https://github.com/apple/foundationdb/pull/8336>`_
+* Increased RocksDB block cache size. `(PR #8274) <https://github.com/apple/foundationdb/pull/8274>`_
+
 7.1.23
 ======
 * Same as 7.1.22 release with AVX enabled.
--- a/fdbcli/QuotaCommand.actor.cpp
+++ b/fdbcli/QuotaCommand.actor.cpp
@ -43,9 +43,9 @@ Optional<LimitType> parseLimitType(StringRef token) {
 	}
 }

-Optional<double> parseLimitValue(StringRef token) {
+Optional<int64_t> parseLimitValue(StringRef token) {
 	try {
-		return std::stod(token.toString());
+		return std::stol(token.toString());
 	} catch (...) {
 		return {};
 	}
@ -63,9 +63,9 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
 			} else {
 				auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
 				if (limitType == LimitType::TOTAL) {
-					fmt::print("{}\n", quota.totalQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
+					fmt::print("{}\n", quota.totalQuota);
 				} else if (limitType == LimitType::RESERVED) {
-					fmt::print("{}\n", quota.reservedQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
+					fmt::print("{}\n", quota.reservedQuota);
 				}
 			}
 			return Void();
@ -75,7 +75,7 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
 	}
 }

-ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, double value) {
+ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, int64_t value) {
 	state Reference<ITransaction> tr = db->createTransaction();
 	loop {
 		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -89,9 +89,13 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
 			// Internally, costs are stored in terms of pages, but in the API,
 			// costs are specified in terms of bytes
 			if (limitType == LimitType::TOTAL) {
-				quota.totalQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
+				// Round up to nearest page size
+				quota.totalQuota =
+				    ((value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
 			} else if (limitType == LimitType::RESERVED) {
-				quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
+				// Round up to nearest page size
+				quota.reservedQuota =
+				    ((value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
 			}
 			if (!quota.isValid()) {
 				throw invalid_throttle_quota_value();
--- a/fdbclient/BlobGranuleFiles.cpp
+++ b/fdbclient/BlobGranuleFiles.cpp
@ -232,10 +232,10 @@ void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys,
 		    .detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt);
 		throw encrypt_header_metadata_mismatch();
 	}
-	// Validate encryption header 'cipherHeader' details sanity
-	if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() &&
-	      header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() &&
-	      header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) {
+	// Validate encryption header 'cipherText' details sanity
+	if (!(header.cipherTextDetails.baseCipherId == eKeys.textCipherKey->getBaseCipherId() &&
+	      header.cipherTextDetails.encryptDomainId == eKeys.textCipherKey->getDomainId() &&
+	      header.cipherTextDetails.salt == eKeys.textCipherKey->getSalt())) {
 		TraceEvent(SevError, "EncryptionHeader_CipherTextMismatch")
 		    .detail("TextDomainId", eKeys.textCipherKey->getDomainId())
 		    .detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId)
--- a/fdbclient/ClientKnobs.cpp
+++ b/fdbclient/ClientKnobs.cpp
@ -61,7 +61,7 @@ void ClientKnobs::initialize(Randomize randomize) {
 	init( WRONG_SHARD_SERVER_DELAY,                .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
 	init( FUTURE_VERSION_RETRY_DELAY,              .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
 	init( GRV_ERROR_RETRY_DELAY,                   5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01();
-	init( UNKNOWN_TENANT_RETRY_DELAY,              0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
+	init( UNKNOWN_TENANT_RETRY_DELAY,              .01 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = 0.01 + deterministicRandom()->random01();
 	init( REPLY_BYTE_LIMIT,                      80000 );
 	init( DEFAULT_BACKOFF,                         .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();
 	init( DEFAULT_MAX_BACKOFF,                     1.0 );
@ -272,7 +272,7 @@ void ClientKnobs::initialize(Randomize randomize) {
 	init( TAG_THROTTLE_EXPIRATION_INTERVAL,        60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0;
 	init( WRITE_COST_BYTE_FACTOR,                 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
 	init( READ_COST_BYTE_FACTOR,                  16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
-	init( PROXY_MAX_TAG_THROTTLE_DURATION,          5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
+	init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO,            5.0 );

 	// busyness reporting
 	init( BUSYNESS_SPIKE_START_THRESHOLD,         0.100 );
--- a/fdbclient/MultiVersionTransaction.actor.cpp
+++ b/fdbclient/MultiVersionTransaction.actor.cpp
@ -414,6 +414,20 @@ Version DLTransaction::getCommittedVersion() {
 	return version;
 }

+ThreadFuture<int64_t> DLTransaction::getTotalCost() {
+	if (!api->transactionGetTotalCost) {
+		return unsupported_operation();
+	}
+
+	FdbCApi::FDBFuture* f = api->transactionGetTotalCost(tr);
+	return toThreadFuture<int64_t>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
+		int64_t size = 0;
+		FdbCApi::fdb_error_t error = api->futureGetInt64(f, &size);
+		ASSERT(!error);
+		return size;
+	});
+}
+
 ThreadFuture<int64_t> DLTransaction::getApproximateSize() {
 	if (!api->transactionGetApproximateSize) {
 		return unsupported_operation();
@ -950,6 +964,11 @@ void DLApi::init() {
 	                   fdbCPath,
 	                   "fdb_transaction_get_committed_version",
 	                   headerVersion >= 0);
+	loadClientFunction(&api->transactionGetTotalCost,
+	                   lib,
+	                   fdbCPath,
+	                   "fdb_transaction_get_total_cost",
+	                   headerVersion >= ApiVersion::withGetTotalCost().version());
 	loadClientFunction(&api->transactionGetApproximateSize,
 	                   lib,
 	                   fdbCPath,
@ -1486,6 +1505,12 @@ ThreadFuture<SpanContext> MultiVersionTransaction::getSpanContext() {
 	return SpanContext();
 }

+ThreadFuture<int64_t> MultiVersionTransaction::getTotalCost() {
+	auto tr = getTransaction();
+	auto f = tr.transaction ? tr.transaction->getTotalCost() : makeTimeout<int64_t>();
+	return abortableFuture(f, tr.onChange);
+}
+
 ThreadFuture<int64_t> MultiVersionTransaction::getApproximateSize() {
 	auto tr = getTransaction();
 	auto f = tr.transaction ? tr.transaction->getApproximateSize() : makeTimeout<int64_t>();
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@ -3456,6 +3456,8 @@ ACTOR Future<Optional<Value>> getValue(Reference<TransactionState> trState,
 			}
 			trState->cx->getValueCompleted->latency = timer_int() - startTime;
 			trState->cx->getValueCompleted->log();
+			trState->totalCost +=
+			    getReadOperationCost(key.size() + (reply.value.present() ? reply.value.get().size() : 0));

 			if (getValueID.present()) {
 				g_traceBatch.addEvent("GetValueDebug",
@ -4285,6 +4287,7 @@ void getRangeFinished(Reference<TransactionState> trState,
                      RangeResultFamily result) {
 	int64_t bytes = getRangeResultFamilyBytes(result);

+	trState->totalCost += getReadOperationCost(bytes);
 	trState->cx->transactionBytesRead += bytes;
 	trState->cx->transactionKeysRead += result.size();

@ -5767,6 +5770,7 @@ void Transaction::set(const KeyRef& key, const ValueRef& value, AddConflictRange
 	auto r = singleKeyRange(key, req.arena);
 	auto v = ValueRef(req.arena, value);
 	t.mutations.emplace_back(req.arena, MutationRef::SetValue, r.begin, v);
+	trState->totalCost += getWriteOperationCost(key.expectedSize() + value.expectedSize());

 	if (addConflictRange) {
 		t.write_conflict_ranges.push_back(req.arena, r);
@ -5796,6 +5800,7 @@ void Transaction::atomicOp(const KeyRef& key,
 	auto v = ValueRef(req.arena, operand);

 	t.mutations.emplace_back(req.arena, operationType, r.begin, v);
+	trState->totalCost += getWriteOperationCost(key.expectedSize());

 	if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
 		t.write_conflict_ranges.push_back(req.arena, r);
@ -5827,7 +5832,10 @@ void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRa
 		return;

 	t.mutations.emplace_back(req.arena, MutationRef::ClearRange, r.begin, r.end);
-
+	// NOTE: The throttling cost of each clear is assumed to be one page.
+	// This makes computation fast, but can be inaccurate and may
+	// underestimate the cost of large clears.
+	trState->totalCost += CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
 	if (addConflictRange)
 		t.write_conflict_ranges.push_back(req.arena, r);
 }
@ -6240,14 +6248,14 @@ ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Referen
 	state int i = 0;

 	for (; i < transaction->mutations.size(); ++i) {
-		auto* it = &transaction->mutations[i];
+		auto const& mutation = transaction->mutations[i];

-		if (it->type == MutationRef::Type::SetValue || it->isAtomicOp()) {
+		if (mutation.type == MutationRef::Type::SetValue || mutation.isAtomicOp()) {
 			trCommitCosts.opsCount++;
-			trCommitCosts.writeCosts += getWriteOperationCost(it->expectedSize());
-		} else if (it->type == MutationRef::Type::ClearRange) {
+			trCommitCosts.writeCosts += getWriteOperationCost(mutation.expectedSize());
+		} else if (mutation.type == MutationRef::Type::ClearRange) {
 			trCommitCosts.opsCount++;
-			keyRange = KeyRangeRef(it->param1, it->param2);
+			keyRange = KeyRangeRef(mutation.param1, mutation.param2);
 			if (trState->options.expensiveClearCostEstimation) {
 				StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState));
 				trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
@ -7548,12 +7556,11 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
                                                            Optional<Reference<TransactionState>> trState);

 ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
+                                                 TenantInfo tenantInfo,
                                                 KeyRange keys,
                                                 Reference<LocationInfo> locationInfo,
                                                 TenantMapEntry tenantEntry,
                                                 Optional<Reference<TransactionState>> trState) {
-	state TenantInfo tenantInfo =
-	    wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
 	try {
 		WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
 		req.min.bytes = 0;
@ -7562,12 +7569,16 @@ ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
 		    locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
 		return m;
 	} catch (Error& e) {
-		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
+		if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
+			cx->invalidateCache(tenantEntry.prefix, keys);
+			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
+		} else if (e.code() == error_code_unknown_tenant && trState.present() &&
+		           tenantInfo.tenantId != TenantInfo::INVALID_TENANT) {
+			wait(trState.get()->handleUnknownTenant());
+		} else {
 			TraceEvent(SevError, "WaitStorageMetricsError").error(e);
 			throw;
 		}
-		wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
-		cx->invalidateCache(tenantEntry.prefix, keys);

 		StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState));
 		return m;
@ -7598,7 +7609,7 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
 		partBegin = (i == 0) ? keys.begin : locations[i].range.begin;
 		partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
 		fx[i] = doGetStorageMetrics(
-		    cx, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
+		    cx, tenantInfo, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
 	}
 	wait(waitForAll(fx));
 	for (int i = 0; i < nLocs; i++) {
@ -7753,27 +7764,18 @@ ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo
                                                                      StorageMetrics min,
                                                                      StorageMetrics max,
                                                                      StorageMetrics permittedError) {
-	try {
-		Future<StorageMetrics> fx;
-		if (locations.size() > 1) {
-			fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
-		} else {
-			WaitMetricsRequest req(tenantInfo, keys, min, max);
-			fx = loadBalance(locations[0].locations->locations(),
-			                 &StorageServerInterface::waitMetrics,
-			                 req,
-			                 TaskPriority::DataDistribution);
-		}
-		StorageMetrics x = wait(fx);
-		return x;
-	} catch (Error& e) {
-		TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
-		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
-			TraceEvent(SevError, "WaitStorageMetricsError").error(e);
-			throw;
-		}
+	Future<StorageMetrics> fx;
+	if (locations.size() > 1) {
+		fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
+	} else {
+		WaitMetricsRequest req(tenantInfo, keys, min, max);
+		fx = loadBalance(locations[0].locations->locations(),
+		                 &StorageServerInterface::waitMetrics,
+		                 req,
+		                 TaskPriority::DataDistribution);
 	}
-	return Optional<StorageMetrics>();
+	StorageMetrics x = wait(fx);
+	return x;
 }

 ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
@ -7786,9 +7788,9 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
    int expectedShardCount,
    Optional<Reference<TransactionState>> trState) {
 	state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
-	state TenantInfo tenantInfo =
-	    wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
 	loop {
+		state TenantInfo tenantInfo =
+		    wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
 		state std::vector<KeyRangeLocationInfo> locations =
 		    wait(getKeyRangeLocations(cx,
 		                              tenantInfo,
@ -7818,13 +7820,25 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
 			continue;
 		}

-		Optional<StorageMetrics> res =
-		    wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
-		if (res.present()) {
-			return std::make_pair(res, -1);
+		try {
+			Optional<StorageMetrics> res =
+			    wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
+			if (res.present()) {
+				return std::make_pair(res, -1);
+			}
+		} catch (Error& e) {
+			TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
+			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
+				cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
+				wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
+			} else if (e.code() == error_code_unknown_tenant && trState.present() &&
+			           tenantInfo.tenantId != TenantInfo::INVALID_TENANT) {
+				wait(trState.get()->handleUnknownTenant());
+			} else {
+				TraceEvent(SevError, "WaitStorageMetricsError").error(e);
+				throw;
+			}
 		}
-		cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
-		wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
 	}
 }

@ -7994,6 +8008,21 @@ ACTOR Future<TenantMapEntry> blobGranuleGetTenantEntry(Transaction* self,
 	return tme;
 }

+// Tenants are supposed to be unique and therefore can be loaded once.
+// There is an assumption that a tenant exists as long as operations are happening against said tenant.
+ACTOR Future<TenantMapEntry> blobLoadTenantMapEntry(Database* db, Key rangeStartKey, Optional<TenantName> tenantName) {
+	state Transaction tr(*db);
+
+	loop {
+		try {
+			TenantMapEntry tme = wait(blobGranuleGetTenantEntry(&tr, rangeStartKey, tenantName));
+			return tme;
+		} catch (Error& e) {
+			wait(tr.onError(e));
+		}
+	}
+}
+
 Future<Standalone<VectorRef<KeyRef>>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) {
 	return ::getRangeSplitPoints(
 	    trState, keys, chunkSize, readVersion.isValid() && readVersion.isReady() ? readVersion.get() : latestVersion);
@ -8465,7 +8494,6 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
 	state Version readVersionOut = invalidVersion;
 	state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
 	state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize;
-	state bool loadedTenantEntry = false;

 	if (version.present()) {
 		if (version.get() == latestVersion) {
@ -8485,16 +8513,16 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
 		}
 	}

+	if (tenantName.present()) {
+		TenantMapEntry tme = wait(blobLoadTenantMapEntry(&db, range.begin, tenantName));
+		range = range.withPrefix(tme.prefix);
+		curRegion = KeyRangeRef(range.begin, range.begin);
+	}
+
 	loop {
 		if (curRegion.begin >= range.end) {
 			return readVersionOut;
 		}
-		if (tenantName.present() && !loadedTenantEntry) {
-			TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenantName));
-			loadedTenantEntry = true;
-			range = range.withPrefix(tenantEntry.prefix);
-			curRegion = KeyRangeRef(range.begin, range.begin);
-		}
 		loop {
 			try {
 				wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize)));
@ -10618,70 +10646,28 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr,
 	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
 	state Key beginKey = range.begin;

+	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+
 	loop {
-		try {
-			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);

-			state RangeResult results = wait(
-			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
+		state RangeResult results =
+		    wait(krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));

-			blobRanges.arena().dependsOn(results.arena());
-			for (int i = 0; i < results.size() - 1; i++) {
-				if (results[i].value == blobRangeActive) {
-					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
-				}
-				if (blobRanges.size() == batchLimit) {
-					return blobRanges;
-				}
+		blobRanges.arena().dependsOn(results.arena());
+		for (int i = 0; i < results.size() - 1; i++) {
+			if (results[i].value == blobRangeActive) {
+				blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
 			}
-
-			if (!results.more) {
+			if (blobRanges.size() == batchLimit) {
 				return blobRanges;
 			}
-			beginKey = results.back().key;
-		} catch (Error& e) {
-			wait(tr->onError(e));
 		}
-	}
-}

-ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobbifiedRanges(Transaction* tr,
-                                                                     KeyRange range,
-                                                                     int rangeLimit,
-                                                                     Optional<TenantName> tenantName) {
-	state TenantMapEntry tme;
-
-	loop {
-		try {
-			if (tenantName.present()) {
-				wait(store(tme, blobGranuleGetTenantEntry(tr, range.begin, tenantName)));
-				range = range.withPrefix(tme.prefix);
-			}
-			break;
-		} catch (Error& e) {
-			wait(tr->onError(e));
+		if (!results.more) {
+			return blobRanges;
 		}
+		beginKey = results.back().key;
 	}
-
-	state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
-	if (!tenantName.present()) {
-		return blobRanges;
-	}
-
-	// Strip tenant prefix out.
-	state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
-	for (auto& blobRange : blobRanges) {
-		// Filter out blob ranges that span tenants for some reason.
-		if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
-			TraceEvent("ListBlobbifiedRangeSpansTenants")
-			    .suppressFor(/*seconds=*/5)
-			    .detail("Tenant", tenantName.get())
-			    .detail("Range", blobRange);
-			continue;
-		}
-		tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
-	}
-	return tenantBlobRanges;
 }

 ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
@ -10693,7 +10679,6 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
 	state Transaction tr(cx);
 	state Key purgeKey;
 	state KeyRange purgeRange = range;
-	state bool loadedTenantPrefix = false;

 	tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
 	if (purgeVersion == latestVersion) {
@ -10713,23 +10698,22 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
 		throw unsupported_operation();
 	}

+	if (tenant.present()) {
+		TenantMapEntry tme = wait(blobLoadTenantMapEntry(&cx, range.begin, tenant));
+		purgeRange = purgeRange.withPrefix(tme.prefix);
+	}
+
 	loop {
 		try {
 			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
 			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
 			tr.setOption(FDBTransactionOptions::LOCK_AWARE);

-			if (tenant.present() && !loadedTenantPrefix) {
-				TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenant));
-				loadedTenantPrefix = true;
-				purgeRange = purgeRange.withPrefix(tenantEntry.prefix);
-			}
-
 			// must be aligned to blob range(s)
 			state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
-			    getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {});
+			    getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2);
 			state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
-			    getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {});
+			    getBlobRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2);
 			wait(success(blobbifiedBegin) && success(blobbifiedEnd));
 			if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
 			    (!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
@ -10815,7 +10799,11 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
                                     Optional<TenantName> tenantName) {
 	state Database db(cx);
 	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
-	state bool loadedTenantEntry = false;
+
+	if (tenantName.present()) {
+		TenantMapEntry tme = wait(blobLoadTenantMapEntry(&db, range.begin, tenantName));
+		range = range.withPrefix(tme.prefix);
+	}

 	state Value value = active ? blobRangeActive : blobRangeInactive;
 	loop {
@ -10823,13 +10811,6 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
 			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
 			tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

-			if (tenantName.present() && !loadedTenantEntry) {
-				TenantMapEntry tenantEntry =
-				    wait(blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName));
-				loadedTenantEntry = true;
-				range = range.withPrefix(tenantEntry.prefix);
-			}
-
 			Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));

 			if (active) {
@ -10881,10 +10862,41 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer

 	state Database db(cx);
 	state Transaction tr(db);
+	state TenantMapEntry tme;
+	state Standalone<VectorRef<KeyRangeRef>> blobRanges;

-	Standalone<VectorRef<KeyRangeRef>> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName));
+	if (tenantName.present()) {
+		wait(store(tme, blobLoadTenantMapEntry(&db, range.begin, tenantName)));
+		range = range.withPrefix(tme.prefix);
+	}

-	return blobbifiedRanges;
+	loop {
+		try {
+			wait(store(blobRanges, getBlobRanges(&tr, range, rangeLimit)));
+			break;
+		} catch (Error& e) {
+			wait(tr.onError(e));
+		}
+	}
+
+	if (!tenantName.present()) {
+		return blobRanges;
+	}
+
+	// Strip tenant prefix out.
+	state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
+	for (auto& blobRange : blobRanges) {
+		// Filter out blob ranges that span tenants for some reason.
+		if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
+			TraceEvent("ListBlobbifiedRangeSpansTenants")
+			    .suppressFor(/*seconds=*/5)
+			    .detail("Tenant", tenantName.get())
+			    .detail("Range", blobRange);
+			continue;
+		}
+		tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
+	}
+	return tenantBlobRanges;
 }

 Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
--- a/fdbclient/ParallelStream.actor.cpp
+++ b/fdbclient/ParallelStream.actor.cpp
@ -42,7 +42,7 @@ ACTOR static Future<Void> produce(ParallelStream<ParallelStreamTest::TestValue>:
 }

 ACTOR static Future<Void> consume(FutureStream<ParallelStreamTest::TestValue> stream, int expected) {
-	state int next;
+	state int next = 0;
 	try {
 		loop {
 			ParallelStreamTest::TestValue value = waitNext(stream);
--- a/fdbclient/PaxosConfigTransaction.actor.cpp
+++ b/fdbclient/PaxosConfigTransaction.actor.cpp
@ -564,6 +564,10 @@ Version PaxosConfigTransaction::getCommittedVersion() const {
 	return impl->getCommittedVersion();
 }

+int64_t PaxosConfigTransaction::getTotalCost() const {
+	return 0; // no cost is tracked by this implementation; it always reports 0
+}
+
 int64_t PaxosConfigTransaction::getApproximateSize() const {
 	return impl->getApproximateSize();
 }
--- a/fdbclient/ServerKnobs.cpp
+++ b/fdbclient/ServerKnobs.cpp
@ -383,6 +383,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( ROCKSDB_WRITER_THREAD_PRIORITY,                          0 );
 	init( ROCKSDB_BACKGROUND_PARALLELISM,                          4 );
 	init( ROCKSDB_READ_PARALLELISM,                                4 );
+	// If true, do not process and store RocksDB logs
+	init( ROCKSDB_MUTE_LOGS,                                   false );
 	// Use a smaller memtable in simulation to avoid OOMs.
 	int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024;
 	init( ROCKSDB_MEMTABLE_BYTES,                      memtableBytes );
@ -732,9 +734,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( ENFORCE_TAG_THROTTLING_ON_PROXIES,   GLOBAL_TAG_THROTTLING );
 	init( GLOBAL_TAG_THROTTLING_MIN_RATE,                        1.0 );
 	init( GLOBAL_TAG_THROTTLING_FOLDING_TIME,                   10.0 );
-	init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO,            5.0 );
 	init( GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED,                 10 );
 	init( GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER,              240.0 );
+	init( PROXY_MAX_TAG_THROTTLE_DURATION,          5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
+	init( GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL,         60.0 );

 	//Storage Metrics
 	init( STORAGE_METRICS_AVERAGE_INTERVAL,                    120.0 );
@ -943,9 +946,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
 	init( ENCRYPTION_MODE,                             "AES-256-CTR" );
 	init( SIM_KMS_MAX_KEYS,                                     4096 );
 	init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH,                 100000 );
-	init( ENABLE_TLOG_ENCRYPTION,                  ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true;
-	init( ENABLE_STORAGE_SERVER_ENCRYPTION,        ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = !ENABLE_STORAGE_SERVER_ENCRYPTION;
-	init( ENABLE_BLOB_GRANULE_ENCRYPTION,          ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION;
+	init( ENABLE_TLOG_ENCRYPTION,                  ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION ) ENABLE_TLOG_ENCRYPTION = false;
+	init( ENABLE_STORAGE_SERVER_ENCRYPTION,        ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION) ENABLE_STORAGE_SERVER_ENCRYPTION = false;
+	init( ENABLE_BLOB_GRANULE_ENCRYPTION,          ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION) ENABLE_BLOB_GRANULE_ENCRYPTION = false;

 	// encrypt key proxy
 	init( ENABLE_BLOB_GRANULE_COMPRESSION,                     false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); }
--- a/fdbclient/SimpleConfigTransaction.actor.cpp
+++ b/fdbclient/SimpleConfigTransaction.actor.cpp
@ -296,6 +296,10 @@ Version SimpleConfigTransaction::getCommittedVersion() const {
 	return impl->getCommittedVersion();
 }

+int64_t SimpleConfigTransaction::getTotalCost() const {
+	return 0; // no cost is tracked by this implementation; it always reports 0
+}
+
 int64_t SimpleConfigTransaction::getApproximateSize() const {
 	return impl->getApproximateSize();
 }
--- a/fdbclient/ThreadSafeTransaction.cpp
+++ b/fdbclient/ThreadSafeTransaction.cpp
@ -626,6 +626,14 @@ ThreadFuture<SpanContext> ThreadSafeTransaction::getSpanContext() {
 	});
 }

+ThreadFuture<int64_t> ThreadSafeTransaction::getTotalCost() { // thread-safe wrapper around tr->getTotalCost()
+	ISingleThreadTransaction* tr = this->tr; // local copy so the lambda captures the pointer by value, not `this`
+	return onMainThread([tr]() -> Future<int64_t> { // the body below is handed to onMainThread rather than run inline
+		tr->checkDeferredError(); // presumably throws if the transaction holds a deferred error — confirm
+		return tr->getTotalCost(); // the int64_t result is wrapped in a Future by the lambda's return type
+	});
+}
+
 ThreadFuture<int64_t> ThreadSafeTransaction::getApproximateSize() {
 	ISingleThreadTransaction* tr = this->tr;
 	return onMainThread([tr]() -> Future<int64_t> {
--- a/fdbclient/include/fdbclient/ClientKnobs.h
+++ b/fdbclient/include/fdbclient/ClientKnobs.h
@ -262,8 +262,8 @@ public:
 	double TAG_THROTTLE_EXPIRATION_INTERVAL;
 	int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations
 	int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
-	double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
-	                                        // being rejected
+	// Cost multiplier for writes (because write operations are more expensive than reads):
+	double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;

 	// busyness reporting
 	double BUSYNESS_SPIKE_START_THRESHOLD;
--- a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h
+++ b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h
@ -284,7 +284,6 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
 	state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
 	state bool oldReplicationUsesDcId = false;
 	state bool warnPPWGradual = false;
-	state bool warnChangeStorageNoMigrate = false;
 	state bool warnRocksDBIsExperimental = false;
 	state bool warnShardedRocksDBIsExperimental = false;
 	loop {
--- a/fdbclient/include/fdbclient/IClientApi.h
+++ b/fdbclient/include/fdbclient/IClientApi.h
@ -120,6 +120,7 @@ public:
 	// later if they are not really needed.
 	virtual ThreadFuture<VersionVector> getVersionVector() = 0;
 	virtual ThreadFuture<SpanContext> getSpanContext() = 0;
+	virtual ThreadFuture<int64_t> getTotalCost() = 0;
 	virtual ThreadFuture<int64_t> getApproximateSize() = 0;

 	virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
--- a/fdbclient/include/fdbclient/ISingleThreadTransaction.h
+++ b/fdbclient/include/fdbclient/ISingleThreadTransaction.h
@ -101,6 +101,7 @@ public:
 	virtual Version getCommittedVersion() const = 0;
 	virtual VersionVector getVersionVector() const = 0;
 	virtual SpanContext getSpanContext() const = 0;
+	virtual int64_t getTotalCost() const = 0;
 	virtual int64_t getApproximateSize() const = 0;
 	virtual Future<Standalone<StringRef>> getVersionstamp() = 0;
 	virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
--- a/fdbclient/include/fdbclient/MultiVersionTransaction.h
+++ b/fdbclient/include/fdbclient/MultiVersionTransaction.h
@ -377,6 +377,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {

 	FDBFuture* (*transactionCommit)(FDBTransaction* tr);
 	fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, int64_t* outVersion);
+	FDBFuture* (*transactionGetTotalCost)(FDBTransaction* tr);
 	FDBFuture* (*transactionGetApproximateSize)(FDBTransaction* tr);
 	FDBFuture* (*transactionWatch)(FDBTransaction* tr, uint8_t const* keyName, int keyNameLength);
 	FDBFuture* (*transactionOnError)(FDBTransaction* tr, fdb_error_t error);
@ -505,6 +506,7 @@ public:
 	Version getCommittedVersion() override;
 	ThreadFuture<VersionVector> getVersionVector() override;
 	ThreadFuture<SpanContext> getSpanContext() override { return SpanContext(); };
+	ThreadFuture<int64_t> getTotalCost() override;
 	ThreadFuture<int64_t> getApproximateSize() override;

 	void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
@ -732,6 +734,7 @@ public:
 	Version getCommittedVersion() override;
 	ThreadFuture<VersionVector> getVersionVector() override;
 	ThreadFuture<SpanContext> getSpanContext() override;
+	ThreadFuture<int64_t> getTotalCost() override;
 	ThreadFuture<int64_t> getApproximateSize() override;

 	void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
--- a/fdbclient/include/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/include/fdbclient/NativeAPI.actor.h
@ -249,6 +249,9 @@ struct TransactionState : ReferenceCounted<TransactionState> {
 	SpanContext spanContext;
 	UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False;
 	bool readVersionObtainedFromGrvProxy;
+	// Measured by summing the bytes accessed by each read and write operation
+	// after rounding up to the nearest page size and applying a write penalty
+	int64_t totalCost = 0;

 	// Special flag to skip prepending tenant prefix to mutations and conflict ranges
 	// when a dummy, internal transaction gets commited. The sole purpose of commitDummyTransaction() is to
@ -447,6 +450,8 @@ public:
 	// May be called only after commit() returns success
 	Version getCommittedVersion() const { return trState->committedVersion; }

+	int64_t getTotalCost() const { return trState->totalCost; }
+
 	// Will be fulfilled only after commit() returns success
 	[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp();

@ -566,9 +571,16 @@ ACTOR Future<std::vector<CheckpointMetaData>> getCheckpointMetaData(Database cx,
 // Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
 ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion> exclusions);

-// Round up to the nearest page size
+// Measured in bytes, rounded up to the nearest page size. Multiply by fungibility ratio
+// because writes are more expensive than reads.
 inline uint64_t getWriteOperationCost(uint64_t bytes) {
-	return (bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1;
+	return CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR * // ratio knob is a double, so the product is converted to the uint64_t return type
+	       ((bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1); // page count, rounded up; NOTE(review): bytes is unsigned, so bytes == 0 wraps in (bytes - 1) — confirm callers never pass 0
+}
+
+// Measured in bytes, rounded up to the nearest page size.
+inline uint64_t getReadOperationCost(uint64_t bytes) {
+	return ((bytes - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR; // rounds bytes up to a multiple of the factor; NOTE(review): bytes == 0 wraps in (bytes - 1) — confirm callers never pass 0
 }

 // Create a transaction to set the value of system key \xff/conf/perpetual_storage_wiggle. If enable == true, the value
--- a/fdbclient/include/fdbclient/PaxosConfigTransaction.h
+++ b/fdbclient/include/fdbclient/PaxosConfigTransaction.h
@ -64,6 +64,7 @@ public:
 	void clear(KeyRef const&) override;
 	Future<Void> commit() override;
 	Version getCommittedVersion() const override;
+	int64_t getTotalCost() const override;
 	int64_t getApproximateSize() const override;
 	void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
 	Future<Void> onError(Error const& e) override;
--- a/fdbclient/include/fdbclient/ReadYourWrites.h
+++ b/fdbclient/include/fdbclient/ReadYourWrites.h
@ -149,6 +149,7 @@ public:
 	VersionVector getVersionVector() const override { return tr.getVersionVector(); }
 	SpanContext getSpanContext() const override { return tr.getSpanContext(); }

+	int64_t getTotalCost() const override { return tr.getTotalCost(); }
 	int64_t getApproximateSize() const override { return approximateSize; }
 	[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp() override;

--- a/fdbclient/include/fdbclient/ServerKnobs.h
+++ b/fdbclient/include/fdbclient/ServerKnobs.h
@ -316,6 +316,7 @@ public:
 	int64_t ROCKSDB_MEMTABLE_BYTES;
 	bool ROCKSDB_LEVEL_STYLE_COMPACTION;
 	bool ROCKSDB_UNSAFE_AUTO_FSYNC;
+	bool ROCKSDB_MUTE_LOGS;
 	int64_t ROCKSDB_PERIODIC_COMPACTION_SECONDS;
 	int ROCKSDB_PREFIX_LEN;
 	int64_t ROCKSDB_BLOCK_CACHE_SIZE;
@ -629,14 +630,16 @@ public:
 	double GLOBAL_TAG_THROTTLING_MIN_RATE;
 	// Used by global tag throttling counters
 	double GLOBAL_TAG_THROTTLING_FOLDING_TIME;
-	// Cost multiplier for writes (because write operations are more expensive than reads)
-	double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
 	// Maximum number of tags tracked by global tag throttler. Additional tags will be ignored
 	// until some existing tags expire
 	int64_t GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED;
 	// Global tag throttler forgets about throughput from a tag once no new transactions from that
 	// tag have been received for this duration (in seconds):
 	int64_t GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER;
+	// Maximum duration that a transaction can be tag throttled by proxy before being rejected
+	double PROXY_MAX_TAG_THROTTLE_DURATION;
+	// Interval at which latency bands are logged for each tag on grv proxy
+	double GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL;

 	double MAX_TRANSACTIONS_PER_BYTE;

@ -740,7 +743,6 @@ public:
 	int64_t MIN_TAG_READ_PAGES_RATE;
 	int64_t MIN_TAG_WRITE_PAGES_RATE;
 	double TAG_MEASUREMENT_INTERVAL;
-	int64_t READ_COST_BYTE_FACTOR;
 	bool PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS;
 	bool REPORT_DD_METRICS;
 	double DD_METRICS_REPORT_INTERVAL;
--- a/fdbclient/include/fdbclient/SimpleConfigTransaction.h
+++ b/fdbclient/include/fdbclient/SimpleConfigTransaction.h
@ -76,6 +76,7 @@ public:
 	void reset() override;
 	void debugTransaction(UID dID) override;
 	void checkDeferredError() const override;
+	int64_t getTotalCost() const override;
 	int64_t getApproximateSize() const override;
 	void set(KeyRef const&, ValueRef const&) override;
 	void clear(KeyRangeRef const&) override { throw client_invalid_operation(); }
--- a/fdbclient/include/fdbclient/Tenant.h
+++ b/fdbclient/include/fdbclient/Tenant.h
@ -211,6 +211,31 @@ struct TenantMetadata {
 };

 typedef VersionedMap<TenantName, TenantMapEntry> TenantMap;
-class TenantPrefixIndex : public VersionedMap<Key, TenantName>, public ReferenceCounted<TenantPrefixIndex> {};
+
+// A set of tenant names that is generally expected to have one item in it. The set can have more than one item in it
+// during certain periods when the set is being updated (e.g. while restoring a backup), but it is expected to have
+// one item at the end. It is not possible to use the set while it contains more than one item.
+struct TenantNameUniqueSet {
+	std::unordered_set<TenantName> tenantNames;
+
+	// Returns the single tenant name stored in the set
+	// It is an error to call this function if the set holds more than one name
+	TenantName get() const {
+		ASSERT(tenantNames.size() == 1); // also trips on an empty set, not only when several names are present
+		return *tenantNames.begin(); // returned by value (copy of the only element)
+	}
+
+	void insert(TenantName const& name) { tenantNames.insert(name); } // inserting a name that is already present leaves the set unchanged
+
+	// Removes a tenant name from the set. Returns true if the set is now empty.
+	bool remove(TenantName const& name) {
+		auto itr = tenantNames.find(name);
+		ASSERT(itr != tenantNames.end()); // the name must currently be in the set
+		tenantNames.erase(itr);
+		return tenantNames.empty(); // lets the caller decide whether to drop the now-empty entry
+	}
+};
+
+class TenantPrefixIndex : public VersionedMap<Key, TenantNameUniqueSet>, public ReferenceCounted<TenantPrefixIndex> {}; // each key now maps to a TenantNameUniqueSet rather than a single TenantName

 #endif
--- a/fdbclient/include/fdbclient/ThreadSafeTransaction.h
+++ b/fdbclient/include/fdbclient/ThreadSafeTransaction.h
@ -205,6 +205,7 @@ public:
 	Version getCommittedVersion() override;
 	ThreadFuture<VersionVector> getVersionVector() override;
 	ThreadFuture<SpanContext> getSpanContext() override;
+	ThreadFuture<int64_t> getTotalCost() override;
 	ThreadFuture<int64_t> getApproximateSize() override;

 	ThreadFuture<uint64_t> getProtocolVersion();
--- a/fdbrpc/Stats.actor.cpp
+++ b/fdbrpc/Stats.actor.cpp
@ -133,3 +133,50 @@ Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
 	return CounterCollectionImpl::traceCounters(
 	    this, traceEventName, traceEventID, interval, trackLatestName, decorator);
 }
+
+void LatencyBands::insertBand(double value) {
+	bands.emplace(std::make_pair(value, std::make_unique<Counter>(format("Band%f", value), *cc))); // counter is named "Band" + value in %f form and built against *cc, so cc must already be set
+}
+
+FDB_DEFINE_BOOLEAN_PARAM(Filtered);
+
+LatencyBands::LatencyBands(std::string const& name,
+                           UID id,
+                           double loggingInterval,
+                           std::function<void(TraceEvent&)> const& decorator)
+  : name(name), id(id), loggingInterval(loggingInterval), decorator(decorator) {} // only stores its arguments; cc, logger and the counters are created by the first addThreshold()
+
+void LatencyBands::addThreshold(double value) {
+	if (value > 0 && bands.count(value) == 0) { // ignore non-positive thresholds and thresholds that already have a band
+		if (bands.size() == 0) { // first threshold: create the collection, its logger, the filtered counter and the catch-all band
+			ASSERT(!cc && !filteredCount);
+			cc = std::make_unique<CounterCollection>(name, id.toString());
+			logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name, decorator); // decorator is forwarded so callers can add details to each trace event
+			filteredCount = std::make_unique<Counter>("Filtered", *cc);
+			insertBand(std::numeric_limits<double>::infinity()); // gives every finite measurement a band to land in
+		}
+
+		insertBand(value);
+	}
+}
+
+void LatencyBands::addMeasurement(double measurement, int count, Filtered filtered) {
+	if (filtered && filteredCount) { // filtered measurements are tallied separately, once filteredCount exists
+		(*filteredCount) += count; // adds count, not 1, so one call can record several measurements
+	} else if (bands.size() > 0) { // with no thresholds configured the measurement is dropped
+		auto itr = bands.upper_bound(measurement); // first band whose threshold is strictly greater than measurement
+		ASSERT(itr != bands.end()); // holds for any finite measurement because addThreshold() installs an infinity band
+		(*itr->second) += count;
+	}
+}
+
+void LatencyBands::clearBands() {
+	logger = Void(); // replaces the traceCounters future; presumably this stops the periodic logging — confirm
+	bands.clear();
+	filteredCount.reset();
+	cc.reset(); // released last, after the counters that were constructed against it
+}
+
+LatencyBands::~LatencyBands() {
+	clearBands(); // same teardown order as an explicit clearBands() call
+}
--- a/fdbrpc/include/fdbrpc/Stats.h
+++ b/fdbrpc/include/fdbrpc/Stats.h
@ -182,47 +182,12 @@ static void specialCounter(CounterCollection& collection, std::string const& nam
 	new SpecialCounter<F>(collection, name, std::move(f));
 }

+FDB_DECLARE_BOOLEAN_PARAM(Filtered);
+
 class LatencyBands {
-public:
-	LatencyBands(std::string name, UID id, double loggingInterval)
-	  : name(name), id(id), loggingInterval(loggingInterval) {}
-
-	void addThreshold(double value) {
-		if (value > 0 && bands.count(value) == 0) {
-			if (bands.size() == 0) {
-				ASSERT(!cc && !filteredCount);
-				cc = std::make_unique<CounterCollection>(name, id.toString());
-				logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name);
-				filteredCount = std::make_unique<Counter>("Filtered", *cc);
-				insertBand(std::numeric_limits<double>::infinity());
-			}
-
-			insertBand(value);
-		}
-	}
-
-	void addMeasurement(double measurement, bool filtered = false) {
-		if (filtered && filteredCount) {
-			++(*filteredCount);
-		} else if (bands.size() > 0) {
-			auto itr = bands.upper_bound(measurement);
-			ASSERT(itr != bands.end());
-			++(*itr->second);
-		}
-	}
-
-	void clearBands() {
-		logger = Void();
-		bands.clear();
-		filteredCount.reset();
-		cc.reset();
-	}
-
-	~LatencyBands() { clearBands(); }
-
-private:
 	std::map<double, std::unique_ptr<Counter>> bands;
 	std::unique_ptr<Counter> filteredCount;
+	std::function<void(TraceEvent&)> decorator;

 	std::string name;
 	UID id;
@ -231,9 +196,22 @@ private:
 	std::unique_ptr<CounterCollection> cc;
 	Future<Void> logger;

-	void insertBand(double value) {
-		bands.emplace(std::make_pair(value, std::make_unique<Counter>(format("Band%f", value), *cc)));
-	}
+	void insertBand(double value);
+
+public:
+	LatencyBands(
+	    std::string const& name,
+	    UID id,
+	    double loggingInterval,
+	    std::function<void(TraceEvent&)> const& decorator = [](auto&) {});
+
+	LatencyBands(LatencyBands&&) = default;
+	LatencyBands& operator=(LatencyBands&&) = default;
+
+	void addThreshold(double value);
+	void addMeasurement(double measurement, int count = 1, Filtered = Filtered::False);
+	void clearBands();
+	~LatencyBands();
 };

 class LatencySample {
--- a/fdbserver/ApplyMetadataMutation.cpp
+++ b/fdbserver/ApplyMetadataMutation.cpp
@ -27,6 +27,7 @@
 #include "fdbserver/ApplyMetadataMutation.h"
 #include "fdbserver/EncryptionOpsUtils.h"
 #include "fdbserver/IKeyValueStore.h"
+#include "fdbserver/Knobs.h"
 #include "fdbserver/LogProtocolMessage.h"
 #include "fdbserver/LogSystem.h"
 #include "flow/Error.h"
@ -87,9 +88,10 @@ public:

 	ApplyMetadataMutationsImpl(const SpanContext& spanContext_,
 	                           ResolverData& resolverData_,
-	                           const VectorRef<MutationRef>& mutations_)
+	                           const VectorRef<MutationRef>& mutations_,
+	                           const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* cipherKeys_) // stored as a raw pointer; writeMutation() asserts it is non-null before encrypting
 	  : spanContext(spanContext_), dbgid(resolverData_.dbgid), arena(resolverData_.arena), mutations(mutations_),
-	    txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit),
+	    cipherKeys(cipherKeys_), txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit),
 	    confChange(resolverData_.confChanges), logSystem(resolverData_.logSystem), popVersion(resolverData_.popVersion),
 	    keyInfo(resolverData_.keyInfo), storageCache(resolverData_.storageCache),
 	    initialCommit(resolverData_.initialCommit), forResolver(true) {}
@ -132,7 +134,7 @@ private:
 	std::unordered_map<UID, StorageServerInterface>* tssMapping = nullptr;

 	std::map<TenantName, TenantMapEntry>* tenantMap = nullptr;
-	std::unordered_map<int64_t, TenantName>* tenantIdIndex = nullptr;
+	std::unordered_map<int64_t, TenantNameUniqueSet>* tenantIdIndex = nullptr;

 	// true if the mutations were already written to the txnStateStore as part of recovery
 	bool initialCommit = false;
@ -160,11 +162,13 @@ private:

 private:
 	void writeMutation(const MutationRef& m) {
-		if (forResolver || !isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
+		if (!isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) { // plaintext path now depends only on TLOG encryption support; forResolver no longer forces it
 			toCommit->writeTypedMessage(m);
 		} else {
 			ASSERT(cipherKeys != nullptr);
 			Arena arena;
+			CODE_PROBE(!forResolver, "encrypting metadata mutations");
+			CODE_PROBE(forResolver, "encrypting resolver mutations"); // resolver callers reach the encrypted path too after this change
 			toCommit->writeTypedMessage(m.encryptMetadata(*cipherKeys, arena, BlobCipherMetrics::TLOG));
 		}
 	}
@ -669,7 +673,7 @@ private:

 				(*tenantMap)[tenantName] = tenantEntry;
 				if (tenantIdIndex) {
-					(*tenantIdIndex)[tenantEntry.id] = tenantName;
+					(*tenantIdIndex)[tenantEntry.id].insert(tenantName);
 				}
 			}

@ -799,7 +803,7 @@ private:
 				    .detail("Tag", tag.toString())
 				    .detail("Server", decodeServerTagKey(kv.key));
 				if (!forResolver) {
-					logSystem->pop(popVersion, decodeServerTagValue(kv.value));
+					logSystem->pop(popVersion, tag);
 					(*tag_popped)[tag] = popVersion;
 				}
 				ASSERT_WE_THINK(forResolver ^ (tag_popped != nullptr));
@ -807,11 +811,11 @@ private:
 				if (toCommit) {
 					MutationRef privatized = m;
 					privatized.param1 = kv.key.withPrefix(systemKeys.begin, arena);
-					privatized.param2 = keyAfter(kv.key, arena).withPrefix(systemKeys.begin, arena);
+					privatized.param2 = keyAfter(privatized.param1, arena);

 					TraceEvent(SevDebug, "SendingPrivatized_ClearServerTag", dbgid).detail("M", privatized);

-					toCommit->addTag(decodeServerTagValue(kv.value));
+					toCommit->addTag(tag);
 					writeMutation(privatized);
 				}
 			}
@ -1096,7 +1100,11 @@ private:
 					// TODO: O(n) operation, optimize cpu
 					auto itr = startItr;
 					while (itr != endItr) {
-						tenantIdIndex->erase(itr->second.id);
+						auto indexItr = tenantIdIndex->find(itr->second.id);
+						ASSERT(indexItr != tenantIdIndex->end());
+						if (indexItr->second.remove(itr->first)) {
+							tenantIdIndex->erase(indexItr);
+						}
 						itr++;
 					}
 				}
@ -1343,8 +1351,9 @@ void applyMetadataMutations(SpanContext const& spanContext,

 void applyMetadataMutations(SpanContext const& spanContext,
                             ResolverData& resolverData,
-                            const VectorRef<MutationRef>& mutations) {
-	ApplyMetadataMutationsImpl(spanContext, resolverData, mutations).apply();
+                            const VectorRef<MutationRef>& mutations,
+                            const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* pCipherKeys) {
+	ApplyMetadataMutationsImpl(spanContext, resolverData, mutations, pCipherKeys).apply(); // builds a temporary impl via the ResolverData constructor, forwards the cipher-key map pointer, and applies immediately
 }

 void applyMetadataMutations(SpanContext const& spanContext,
--- a/fdbserver/BlobManager.actor.cpp
+++ b/fdbserver/BlobManager.actor.cpp
@ -2776,6 +2776,7 @@ ACTOR Future<Void> haltBlobWorker(Reference<BlobManagerData> bmData, BlobWorkerI
 			if (bmData->iAmReplaced.canBeSet()) {
 				bmData->iAmReplaced.send(Void());
 			}
+			throw;
 		}
 	}

@ -2896,6 +2897,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
 					if (bmData->iAmReplaced.canBeSet()) {
 						bmData->iAmReplaced.send(Void());
 					}
+					throw blob_manager_replaced();
 				}

 				BoundaryEvaluation newEval(rep.continueEpoch,
@ -5299,6 +5301,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
 					fmt::print("BM {} exiting because it is replaced\n", self->epoch);
 				}
 				TraceEvent("BlobManagerReplaced", bmInterf.id()).detail("Epoch", epoch);
+				wait(delay(0.0));
 				break;
 			}
 			when(HaltBlobManagerRequest req = waitNext(bmInterf.haltBlobManager.getFuture())) {
--- a/fdbserver/BlobWorker.actor.cpp
+++ b/fdbserver/BlobWorker.actor.cpp
@ -3549,7 +3549,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
 				continue;
 			}
 			state Reference<GranuleMetadata> metadata = m;
-			state Version granuleBeginVersion = req.beginVersion;
+			// state Version granuleBeginVersion = req.beginVersion;
 			// skip waiting for CF ready for recovery mode
 			if (!isFullRestoreMode()) {
 				choose {
--- a/fdbserver/CommitProxyServer.actor.cpp
+++ b/fdbserver/CommitProxyServer.actor.cpp
@ -892,7 +892,7 @@ Optional<TenantName> getTenantName(ProxyCommitData* commitData, int64_t tenantId
 	if (tenantId != TenantInfo::INVALID_TENANT) {
 		auto itr = commitData->tenantIdIndex.find(tenantId);
 		if (itr != commitData->tenantIdIndex.end()) {
-			return Optional<TenantName>(itr->second);
+			return Optional<TenantName>(itr->second.get());
 		}
 	}

@ -1266,8 +1266,14 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
 	if (self->pProxyCommitData->isEncryptionEnabled) {
 		state EncryptCipherDomainId domainId = tenantId;
 		state MutationRef encryptedMutation;
+		CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::DISABLED,
+		           "using disabled tenant mode");
+		CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::OPTIONAL_TENANT,
+		           "using optional tenant mode");
+		CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::REQUIRED,
+		           "using required tenant mode");

-		if (encryptedMutationOpt->present()) {
+		if (encryptedMutationOpt && encryptedMutationOpt->present()) {
 			CODE_PROBE(true, "using already encrypted mutation");
 			encryptedMutation = encryptedMutationOpt->get();
 			ASSERT(encryptedMutation.isEncrypted());
@ -1299,6 +1305,8 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
 			ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);
 			encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG);
 		}
+		ASSERT(encryptedMutation.isEncrypted());
+		CODE_PROBE(true, "encrypting non-metadata mutations");
 		self->toCommit.writeTypedMessage(encryptedMutation);
 		return encryptedMutation;
 	} else {
@ -1473,12 +1481,12 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 			if (!hasCandidateBackupKeys) {
 				continue;
 			}
-
 			if (m.type != MutationRef::Type::ClearRange) {
 				// Add the mutation to the relevant backup tag
 				for (auto backupName : pProxyCommitData->vecBackupKeys[m.param1]) {
 					// If encryption is enabled make sure the mutation we are writing is also encrypted
 					ASSERT(!self->pProxyCommitData->isEncryptionEnabled || writtenMutation.isEncrypted());
+					CODE_PROBE(writtenMutation.isEncrypted(), "using encrypted backup mutation");
 					self->logRangeMutations[backupName].push_back_deep(self->logRangeMutationsArena, writtenMutation);
 				}
 			} else {
@ -1500,6 +1508,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 					// TODO (Nim): Currently clear ranges are encrypted using the default encryption key, this must be
 					// changed to account for clear ranges which span tenant boundaries
 					if (self->pProxyCommitData->isEncryptionEnabled) {
+						CODE_PROBE(true, "encrypting clear range backup mutation");
 						if (backupMutation.param1 == m.param1 && backupMutation.param2 == m.param2 &&
 						    encryptedMutation.present()) {
 							backupMutation = encryptedMutation.get();
@ -1510,6 +1519,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 							backupMutation =
 							    backupMutation.encrypt(self->cipherKeys, domainId, arena, BlobCipherMetrics::BACKUP);
 						}
+						ASSERT(backupMutation.isEncrypted());
 					}

 					// Add the mutation to the relevant backup tag
@ -1613,14 +1623,25 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
 		                            idempotencyIdSet.param2 = kv.value;
 		                            auto& tags = pProxyCommitData->tagsForKey(kv.key);
 		                            self->toCommit.addTags(tags);
-		                            self->toCommit.writeTypedMessage(idempotencyIdSet);
+		                            if (self->pProxyCommitData->isEncryptionEnabled) {
+			                            CODE_PROBE(true, "encrypting idempotency mutation");
+			                            std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p =
+			                                getEncryptDetailsFromMutationRef(self->pProxyCommitData, idempotencyIdSet);
+			                            Arena arena;
+			                            MutationRef encryptedMutation = idempotencyIdSet.encrypt(
+			                                self->cipherKeys, p.second, arena, BlobCipherMetrics::TLOG);
+			                            self->toCommit.writeTypedMessage(encryptedMutation);
+		                            } else {
+			                            self->toCommit.writeTypedMessage(idempotencyIdSet);
+		                            }
 	                            });
-
-	for (const auto& m : pProxyCommitData->idempotencyClears) {
+	state int i = 0;
+	for (i = 0; i < pProxyCommitData->idempotencyClears.size(); i++) {
+		MutationRef& m = pProxyCommitData->idempotencyClears[i];
 		auto& tags = pProxyCommitData->tagsForKey(m.param1);
 		self->toCommit.addTags(tags);
-		// TODO(nwijetunga): Encrypt these mutations
-		self->toCommit.writeTypedMessage(m);
+		Arena arena;
+		wait(success(writeMutation(self, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, &m, nullptr, &arena)));
 	}
 	pProxyCommitData->idempotencyClears = Standalone<VectorRef<MutationRef>>();

@ -1922,7 +1943,7 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
 			bool filter = self->maxTransactionBytes >
 			              pProxyCommitData->latencyBandConfig.get().commitConfig.maxCommitBytes.orDefault(
 			                  std::numeric_limits<int>::max());
-			pProxyCommitData->stats.commitLatencyBands.addMeasurement(duration, filter);
+			pProxyCommitData->stats.commitLatencyBands.addMeasurement(duration, 1, Filtered(filter));
 		}
 	}

--- a/fdbserver/ConsistencyScan.actor.cpp
+++ b/fdbserver/ConsistencyScan.actor.cpp
@ -382,7 +382,6 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
 	// Note: this may cause some shards to be processed more than once or not at all in a non-quiescent database
 	state int effectiveClientCount = distributed ? clientCount : 1;
 	state int i = clientId * (shardSampleFactor + 1);
-	state int increment = (distributed && !firstClient) ? effectiveClientCount * shardSampleFactor : 1;
 	state int64_t rateLimitForThisRound =
 	    *bytesReadInPrevRound == 0
 	        ? maxRate
--- a/fdbserver/DDShardTracker.actor.cpp
+++ b/fdbserver/DDShardTracker.actor.cpp
@ -272,9 +272,6 @@ ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
 	state double lastLowBandwidthStartTime =
 	    shardMetrics->get().present() ? shardMetrics->get().get().lastLowBandwidthStartTime : now();
 	state int shardCount = shardMetrics->get().present() ? shardMetrics->get().get().shardCount : 1;
-	state ReadBandwidthStatus readBandwidthStatus = shardMetrics->get().present()
-	                                                    ? getReadBandwidthStatus(shardMetrics->get().get().metrics)
-	                                                    : ReadBandwidthStatusNormal;
 	state bool initWithNewMetrics = whenDDInit;
 	wait(delay(0, TaskPriority::DataDistribution));

--- a/fdbserver/DDTeamCollection.actor.cpp
+++ b/fdbserver/DDTeamCollection.actor.cpp
@ -1518,8 +1518,6 @@ public:
 	                                                      ServerStatus* status,
 	                                                      Version addedVersion) {
 		state StorageServerInterface interf = server->getLastKnownInterface();
-		state int targetTeamNumPerServer =
-		    (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2;
 		loop {
 			state bool inHealthyZone = false; // healthChanged actor will be Never() if this flag is true
 			if (self->healthyZone.get().present()) {
--- a/fdbserver/DiskQueue.actor.cpp
+++ b/fdbserver/DiskQueue.actor.cpp
@ -55,7 +55,7 @@ struct StringBuffer {
 	StringBuffer(UID fromFileID) : reserved(0), id(fromFileID) {}

 	int size() const { return str.size(); }
-	StringRef& ref() { return str; }
+	Standalone<StringRef> get() { return str; }
 	void clear() {
 		str = Standalone<StringRef>();
 		reserved = 0;
@ -63,19 +63,19 @@ struct StringBuffer {
 	void clearReserve(int size) {
 		str = Standalone<StringRef>();
 		reserved = size;
-		ref() = StringRef(new (str.arena()) uint8_t[size], 0);
+		str.contents() = StringRef(new (str.arena()) uint8_t[size], 0);
 	}
 	void append(StringRef x) { memcpy(append(x.size()), x.begin(), x.size()); }
 	void* append(int bytes) {
 		ASSERT(str.size() + bytes <= reserved);
 		void* p = const_cast<uint8_t*>(str.end());
-		ref() = StringRef(str.begin(), str.size() + bytes);
+		str.contents() = StringRef(str.begin(), str.size() + bytes);
 		return p;
 	}
 	StringRef pop_front(int bytes) {
 		ASSERT(bytes <= str.size());
 		StringRef result = str.substr(0, bytes);
-		ref() = str.substr(bytes);
+		str.contents() = str.substr(bytes);
 		return result;
 	}
 	void alignReserve(int alignment, int size) {
@ -101,7 +101,7 @@ struct StringBuffer {
 			if (str.size() > 0) {
 				memcpy(p, str.begin(), str.size());
 			}
-			ref() = StringRef(p, str.size());
+			str.contents() = StringRef(p, str.size());
 		}
 	}
 };
@ -196,7 +196,7 @@ public:
 		stallCount.init("RawDiskQueue.StallCount"_sr);
 	}

-	Future<Void> pushAndCommit(StringRef pageData, StringBuffer* pageMem, uint64_t poppedPages) {
+	Future<Void> pushAndCommit(Standalone<StringRef> pageData, StringBuffer* pageMem, uint64_t poppedPages) {
 		return pushAndCommit(this, pageData, pageMem, poppedPages);
 	}

@ -332,13 +332,13 @@ public:
 	}
 #endif

-	Future<Future<Void>> push(StringRef pageData, std::vector<Reference<SyncQueue>>* toSync) {
+	Future<Future<Void>> push(Standalone<StringRef> pageData, std::vector<Reference<SyncQueue>>* toSync) {
 		return push(this, pageData, toSync);
 	}

-	ACTOR static Future<Future<Void>> push(RawDiskQueue_TwoFiles* self,
-	                                       StringRef pageData,
-	                                       std::vector<Reference<SyncQueue>>* toSync) {
+	ACTOR static UNCANCELLABLE Future<Future<Void>> push(RawDiskQueue_TwoFiles* self,
+	                                                     Standalone<StringRef> pageData,
+	                                                     std::vector<Reference<SyncQueue>>* toSync) {
 		// Write the given data (pageData) to the queue files, swapping or extending them if necessary.
 		// Don't do any syncs, but push the modified file(s) onto toSync.
 		ASSERT(self->readingFile == 2);
@ -357,8 +357,9 @@ public:
 					toSync->push_back(self->files[1].syncQueue);
 					/*TraceEvent("RDQWriteAndSwap", this->dbgid).detail("File1name", self->files[1].dbgFilename).detail("File1size", self->files[1].size)
 					    .detail("WritingPos", self->writingPos).detail("WritingBytes", p);*/
-					waitfor.push_back(self->files[1].f->write(pageData.begin(), p, self->writingPos));
-					pageData = pageData.substr(p);
+					waitfor.push_back(uncancellable(
+					    holdWhile(pageData, self->files[1].f->write(pageData.begin(), p, self->writingPos))));
+					pageData.contents() = pageData.substr(p);
 				}

 				self->dbg_file0BeginSeq += self->files[0].size;
@ -426,7 +427,8 @@ public:
 		    .detail("WritingPos", self->writingPos).detail("WritingBytes", pageData.size());*/
 		self->files[1].size = std::max(self->files[1].size, self->writingPos + pageData.size());
 		toSync->push_back(self->files[1].syncQueue);
-		waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos));
+		waitfor.push_back(uncancellable(
+		    holdWhile(pageData, self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos))));
 		self->writingPos += pageData.size();

 		return waitForAllReadyThenThrow(waitfor);
@ -435,7 +437,7 @@ public:
 	// Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem)
 	// that hold the pageData
 	ACTOR static UNCANCELLABLE Future<Void> pushAndCommit(RawDiskQueue_TwoFiles* self,
-	                                                      StringRef pageData,
+	                                                      Standalone<StringRef> pageData,
 	                                                      StringBuffer* pageMem,
 	                                                      uint64_t poppedPages) {
 		state Promise<Void> pushing, committed;
@ -983,7 +985,7 @@ public:

 		lastCommittedSeq = backPage().endSeq();
 		auto f = rawQueue->pushAndCommit(
-		    pushed_page_buffer->ref(), pushed_page_buffer, poppedSeq / sizeof(Page) - lastPoppedSeq / sizeof(Page));
+		    pushed_page_buffer->get(), pushed_page_buffer, poppedSeq / sizeof(Page) - lastPoppedSeq / sizeof(Page));
 		lastPoppedSeq = poppedSeq;
 		pushed_page_buffer = 0;
 		return f;
@ -1179,7 +1181,7 @@ private:
 			Standalone<StringRef> pagedData = wait(readPages(self, start, end));
 			const int startOffset = start % _PAGE_SIZE;
 			const int dataLen = end - start;
-			ASSERT(pagedData.substr(startOffset, dataLen).compare(buffer->ref().substr(0, dataLen)) == 0);
+			ASSERT(pagedData.substr(startOffset, dataLen).compare(buffer->get().substr(0, dataLen)) == 0);
 		} catch (Error& e) {
 			if (e.code() != error_code_io_error) {
 				delete buffer;
@ -1546,9 +1548,9 @@ private:
 	StringBuffer* pushed_page_buffer;
 	Page& backPage() {
 		ASSERT(pushedPageCount());
-		return ((Page*)pushed_page_buffer->ref().end())[-1];
+		return ((Page*)pushed_page_buffer->get().end())[-1];
 	}
-	Page const& backPage() const { return ((Page*)pushed_page_buffer->ref().end())[-1]; }
+	Page const& backPage() const { return ((Page*)pushed_page_buffer->get().end())[-1]; }
 	int pushedPageCount() const { return pushed_page_buffer ? pushed_page_buffer->size() / sizeof(Page) : 0; }

 	// Recovery state
--- a/fdbserver/GlobalTagThrottler.actor.cpp
+++ b/fdbserver/GlobalTagThrottler.actor.cpp
@ -107,7 +107,7 @@ class GlobalTagThrottlerImpl {
 			if (opType == OpType::READ) {
 				readCost.setTotal(newCost);
 			} else {
-				writeCost.setTotal(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * newCost);
+				writeCost.setTotal(CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * newCost);
 			}
 		}

@ -161,12 +161,18 @@ class GlobalTagThrottlerImpl {
 		}
 	};

+	struct StorageServerInfo {
+		Optional<Standalone<StringRef>> zoneId;
+		Optional<double> throttlingRatio;
+	};
+
 	Database db;
 	UID id;
+	int maxFallingBehind{ 0 };
 	uint64_t throttledTagChangeId{ 0 };
 	uint32_t lastBusyTagCount{ 0 };

-	std::unordered_map<UID, Optional<double>> throttlingRatios;
+	std::unordered_map<UID, StorageServerInfo> ssInfos;
 	std::unordered_map<TransactionTag, PerTagStatistics> tagStatistics;
 	std::unordered_map<UID, std::unordered_map<TransactionTag, ThroughputCounters>> throughput;

@ -304,19 +310,20 @@ class GlobalTagThrottlerImpl {
 	// Returns the desired cost for a storage server, based on its current
 	// cost and throttling ratio
 	Optional<double> getLimitingCost(UID storageServerId) const {
-		auto const throttlingRatio = tryGet(throttlingRatios, storageServerId);
-		auto const currentCost = getCurrentCost(storageServerId);
-		if (!throttlingRatio.present() || !currentCost.present() || !throttlingRatio.get().present()) {
+		auto const ssInfo = tryGet(ssInfos, storageServerId);
+		Optional<double> const throttlingRatio = ssInfo.present() ? ssInfo.get().throttlingRatio : Optional<double>{};
+		Optional<double> const currentCost = getCurrentCost(storageServerId);
+		if (!throttlingRatio.present() || !currentCost.present()) {
 			return {};
 		}
-		return throttlingRatio.get().get() * currentCost.get();
+		return throttlingRatio.get() * currentCost.get();
 	}

 	// For a given storage server and tag combination, return the limiting transaction rate.
 	Optional<double> getLimitingTps(UID storageServerId, TransactionTag tag) const {
 		auto const quotaRatio = getQuotaRatio(tag, storageServerId);
-		auto const limitingCost = getLimitingCost(storageServerId);
-		auto const averageTransactionCost = getAverageTransactionCost(tag, storageServerId);
+		Optional<double> const limitingCost = getLimitingCost(storageServerId);
+		Optional<double> const averageTransactionCost = getAverageTransactionCost(tag, storageServerId);
 		if (!limitingCost.present() || !averageTransactionCost.present()) {
 			return {};
 		}
@ -325,14 +332,36 @@ class GlobalTagThrottlerImpl {
 		return limitingCostForTag / averageTransactionCost.get();
 	}

-	// Return the limiting transaction rate, aggregated across all storage servers
+	// Return the limiting transaction rate, aggregated across all storage servers.
+	// The limits from the worst maxFallingBehind zones are
+	// ignored, because we do not want non-workload related issues (e.g. slow disks)
+	// to affect tag throttling. If more than maxFallingBehind zones are at
+	// or near saturation, this indicates that throttling should take place.
 	Optional<double> getLimitingTps(TransactionTag tag) const {
-		Optional<double> result;
-		for (const auto& [id, _] : throttlingRatios) {
-			auto const targetTpsForSS = getLimitingTps(id, tag);
-			result = getMin(result, targetTpsForSS);
+		// TODO: The algorithm for ignoring the worst zones can be made more efficient
+		std::unordered_map<Optional<Standalone<StringRef>>, double> zoneIdToLimitingTps;
+		for (const auto& [id, ssInfo] : ssInfos) {
+			auto const limitingTpsForSS = getLimitingTps(id, tag);
+			if (limitingTpsForSS.present()) {
+				auto it = zoneIdToLimitingTps.find(ssInfo.zoneId);
+				if (it != zoneIdToLimitingTps.end()) {
+					auto& limitingTpsForZone = it->second;
+					limitingTpsForZone = std::min<double>(limitingTpsForZone, limitingTpsForSS.get());
+				} else {
+					zoneIdToLimitingTps[ssInfo.zoneId] = limitingTpsForSS.get();
+				}
+			}
+		}
+		if (zoneIdToLimitingTps.size() <= maxFallingBehind) {
+			return {};
+		} else {
+			std::vector<double> zoneLimits;
+			for (const auto& [_, limit] : zoneIdToLimitingTps) {
+				zoneLimits.push_back(limit);
+			}
+			std::nth_element(zoneLimits.begin(), zoneLimits.begin() + maxFallingBehind, zoneLimits.end());
+			return zoneLimits[maxFallingBehind];
 		}
-		return result;
 	}

 	Optional<double> getTps(TransactionTag tag, LimitType limitType, double averageTransactionCost) const {
@ -408,7 +437,8 @@ class GlobalTagThrottlerImpl {
 	}

 public:
-	GlobalTagThrottlerImpl(Database db, UID id) : db(db), id(id) {}
+	GlobalTagThrottlerImpl(Database db, UID id, int maxFallingBehind)
+	  : db(db), id(id), maxFallingBehind(maxFallingBehind) {}
 	Future<Void> monitorThrottlingChanges() { return monitorThrottlingChanges(this); }
 	void addRequests(TransactionTag tag, int count) {
 		auto it = tagStatistics.find(tag);
@ -492,8 +522,11 @@ public:
 	int64_t manualThrottleCount() const { return 0; }

 	Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const& ss) {
-		throttlingRatios[ss.id] = ss.getThrottlingRatio(SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER,
-		                                                SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER);
+		auto& ssInfo = ssInfos[ss.id];
+		ssInfo.throttlingRatio = ss.getTagThrottlingRatio(SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER,
+		                                                  SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER);
+		ssInfo.zoneId = ss.locality.zoneId();
+
 		for (const auto& busyReadTag : ss.busiestReadTags) {
 			if (tagStatistics.find(busyReadTag.tag) != tagStatistics.end()) {
 				throughput[ss.id][busyReadTag.tag].updateCost(busyReadTag.rate, OpType::READ);
@ -530,7 +563,8 @@ public:
 	uint32_t tagsTracked() const { return tagStatistics.size(); }
 };

-GlobalTagThrottler::GlobalTagThrottler(Database db, UID id) : impl(PImpl<GlobalTagThrottlerImpl>::create(db, id)) {}
+GlobalTagThrottler::GlobalTagThrottler(Database db, UID id, int maxFallingBehind)
+  : impl(PImpl<GlobalTagThrottlerImpl>::create(db, id, maxFallingBehind)) {}

 GlobalTagThrottler::~GlobalTagThrottler() = default;

@ -584,7 +618,7 @@ void GlobalTagThrottler::removeExpiredTags() {
 	return impl->removeExpiredTags();
 }

-namespace GlobalTagThrottlerTesting {
+namespace {

 enum class LimitType { RESERVED, TOTAL };
 enum class OpType { READ, WRITE };
@ -615,12 +649,13 @@ class MockStorageServer {
 	};

 	UID id;
-	double targetCost;
+	// bytes/second that this storage server can handle
+	double capacity;
 	std::map<TransactionTag, Cost> readCosts, writeCosts;
 	Cost totalReadCost, totalWriteCost;

 public:
-	explicit MockStorageServer(UID id, double targetCost) : id(id), targetCost(targetCost) { ASSERT_GT(targetCost, 0); }
+	explicit MockStorageServer(UID id, double capacity) : id(id), capacity(capacity) { ASSERT_GT(capacity, 0); }
 	void addReadCost(TransactionTag tag, double cost) {
 		readCosts[tag] += cost;
 		totalReadCost += cost;
@ -630,8 +665,10 @@ public:
 		totalWriteCost += cost;
 	}

+	void setCapacity(double value) { capacity = value; }
+
 	StorageQueueInfo getStorageQueueInfo() const {
-		StorageQueueInfo result(id, LocalityData{});
+		StorageQueueInfo result(id, LocalityData({}, Value(id.toString()), {}, {}));
 		for (const auto& [tag, readCost] : readCosts) {
 			double fractionalBusyness{ 0.0 }; // unused for global tag throttling
 			result.busiestReadTags.emplace_back(tag, readCost.smoothRate(), fractionalBusyness);
@ -640,7 +677,7 @@ public:
 			double fractionalBusyness{ 0.0 }; // unused for global tag throttling
 			result.busiestWriteTags.emplace_back(tag, writeCost.smoothRate(), fractionalBusyness);
 		}
-		result.lastReply.bytesInput = ((totalReadCost.smoothRate() + totalWriteCost.smoothRate()) / targetCost) *
+		result.lastReply.bytesInput = ((totalReadCost.smoothRate() + totalWriteCost.smoothRate()) / capacity) *
 		                              SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER;
 		return result;
 	}
@ -680,6 +717,8 @@ public:
 		}
 	}

+	void setCapacity(int index, double value) { storageServers[index].setCapacity(value); }
+
 	std::vector<StorageQueueInfo> getStorageQueueInfos() const {
 		std::vector<StorageQueueInfo> result;
 		result.reserve(storageServers.size());
@ -707,12 +746,14 @@ ACTOR Future<Void> runClient(GlobalTagThrottler* globalTagThrottler,
 }

 ACTOR template <class Check>
-Future<Void> monitor(GlobalTagThrottler* globalTagThrottler, Check check) {
+Future<Void> monitorActor(GlobalTagThrottler* globalTagThrottler, Check check) {
 	state int successes = 0;
 	loop {
 		wait(delay(1.0));
 		if (check(*globalTagThrottler)) {
-			if (++successes == 3) {
+			// Wait for 10 consecutive successes so we're certain
+			// that a stable equilibrium has been reached
+			if (++successes == 10) {
 				return Void();
 			}
 		} else {
@ -775,47 +816,55 @@ ACTOR Future<Void> updateGlobalTagThrottler(GlobalTagThrottler* globalTagThrottl
 	}
 }

-} // namespace GlobalTagThrottlerTesting
+} // namespace

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second.
+// Client attempts 5 6-byte read transactions per second.
+// Limit should adjust to allow 100/6 transactions per second.
 TEST_CASE("/GlobalTagThrottler/Simple") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 100.0 / 6.0);
-	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::READ);
+	state Future<Void> monitor =
+	    monitorActor(&globalTagThrottler, [testTag](auto& gtt) { return targetRateIsNear(gtt, testTag, 100.0 / 6.0); });
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second.
+// Client attempts 5 6-byte write transactions per second.
+// Limit should adjust to allow 100/(6*<fungibility_ratio>) transactions per second.
 TEST_CASE("/GlobalTagThrottler/WriteThrottling") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::WRITE);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 100.0 / 6.0);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::WRITE);
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag](auto& gtt) {
+		return targetRateIsNear(gtt, testTag, 100.0 / (6.0 * CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO));
 	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second for each tag.
+// 2 clients each attempt 5 6-byte read transactions per second.
+// Both limits should adjust to allow 100/6 transactions per second.
 TEST_CASE("/GlobalTagThrottler/MultiTagThrottling") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag1 = "sampleTag1"_sr;
 	TransactionTag testTag2 = "sampleTag2"_sr;
@ -824,171 +873,181 @@ TEST_CASE("/GlobalTagThrottler/MultiTagThrottling") {
 	globalTagThrottler.setQuota(testTag2, tagQuotaValue);
 	state std::vector<Future<Void>> futures;
 	state std::vector<Future<Void>> monitorFutures;
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag1, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ));
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag2, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ));
-	futures.push_back(GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
-	state Future<Void> monitor =
-	    GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
-		    return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag1, 100.0 / 6.0) &&
-		           GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag2, 100.0 / 6.0);
-	    });
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag1, 5.0, 6.0, OpType::READ));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag2, 5.0, 6.0, OpType::READ));
+	futures.push_back(updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
+		return targetRateIsNear(gtt, testTag1, 100.0 / 6.0) && targetRateIsNear(gtt, testTag2, 100.0 / 6.0);
+	});
 	wait(timeoutError(waitForAny(futures) || monitor, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second.
+// Client attempts 20 10-byte read transactions per second.
+// Limit should adjust to allow 100/10 transactions per second.
 TEST_CASE("/GlobalTagThrottler/AttemptWorkloadAboveQuota") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 20.0, 10.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 10.0);
-	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, OpType::READ);
+	state Future<Void> monitor =
+	    monitorActor(&globalTagThrottler, [testTag](auto& gtt) { return targetRateIsNear(gtt, testTag, 10.0); });
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second.
+// 2 clients each attempt 5 6-byte transactions per second.
+// Limit should adjust to allow 100/6 transactions per second.
 TEST_CASE("/GlobalTagThrottler/MultiClientThrottling") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> client2 = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 100.0 / 6.0) &&
-		       GlobalTagThrottlerTesting::clientRateIsNear(gtt, testTag, 100.0 / 6.0);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::READ);
+	state Future<Void> client2 = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag](auto& gtt) {
+		return targetRateIsNear(gtt, testTag, 100.0 / 6.0) && clientRateIsNear(gtt, testTag, 100.0 / 6.0);
 	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || client2 || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second.
+// 2 clients each attempt 20 10-byte transactions per second.
+// Target rate should adjust to allow 100/10 transactions per second.
+// Each client is throttled to only perform 100/20 transactions per second.
 TEST_CASE("/GlobalTagThrottler/MultiClientThrottling2") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 20.0, 10.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> client2 = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 20.0, 10.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 10.0) &&
-		       GlobalTagThrottlerTesting::clientRateIsNear(gtt, testTag, 5.0);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, OpType::READ);
+	state Future<Void> client2 = runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag](auto& gtt) {
+		return targetRateIsNear(gtt, testTag, 10.0) && clientRateIsNear(gtt, testTag, 5.0);
 	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

-// Global transaction rate should be 20.0, with a distribution of (5, 15) between the 2 clients
+// 10 storage servers can handle 100 bytes/second each.
+// Total quota set to 100 bytes/second.
+// One client attempts 5 5-byte read transactions per second.
+// Another client attempts 25 5-byte read transactions per second.
+// Target rate should adjust to allow 100/5 transactions per second.
+// This 20 transactions/second limit is split with a distribution of (5, 15) between the 2 clients.
 TEST_CASE("/GlobalTagThrottler/SkewedMultiClientThrottling") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 5.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> client2 = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 25.0, 5.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 20.0) &&
-		       GlobalTagThrottlerTesting::clientRateIsNear(gtt, testTag, 15.0);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 5.0, OpType::READ);
+	state Future<Void> client2 = runClient(&globalTagThrottler, &storageServers, testTag, 25.0, 5.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag](auto& gtt) {
+		return targetRateIsNear(gtt, testTag, 20.0) && clientRateIsNear(gtt, testTag, 15.0);
 	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota is initially set to 100 bytes/second.
+// Client attempts 5 6-byte transactions per second.
 // Test that the tag throttler can reach equilibrium, then adjust to a new equilibrium once the quota is changed
+// Target rate should adjust to allow 100/6 transactions per second.
+// Total quota is modified to 50 bytes/second.
+// Target rate should adjust to allow 50/6 transactions per second.
 TEST_CASE("/GlobalTagThrottler/UpdateQuota") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	state ThrottleApi::TagQuotaValue tagQuotaValue;
 	state TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, "sampleTag1"_sr, 100.0 / 6.0);
-	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(
+	    &globalTagThrottler, [](auto& gtt) { return targetRateIsNear(gtt, "sampleTag1"_sr, 100.0 / 6.0); });
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	tagQuotaValue.totalQuota = 50.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, "sampleTag1"_sr, 50.0 / 6.0);
-	});
+	monitor =
+	    monitorActor(&globalTagThrottler, [](auto& gtt) { return targetRateIsNear(gtt, "sampleTag1"_sr, 50.0 / 6.0); });
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 100 bytes/second each.
+// Total quota is initially set to 100 bytes/second.
+// Client attempts 5 6-byte read transactions per second.
+// Target limit adjusts to allow 100/6 transactions per second.
+// Then Quota is removed.
+// Target limit is removed as a result.
 TEST_CASE("/GlobalTagThrottler/RemoveQuota") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 100);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 100);
 	state ThrottleApi::TagQuotaValue tagQuotaValue;
 	state TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 5.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, "sampleTag1"_sr, 100.0 / 6.0);
-	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(
+	    &globalTagThrottler, [](auto& gtt) { return targetRateIsNear(gtt, "sampleTag1"_sr, 100.0 / 6.0); });
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	globalTagThrottler.removeQuota(testTag);
-	monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, "sampleTag1"_sr, {});
-	});
+	monitor = monitorActor(&globalTagThrottler, [](auto& gtt) { return targetRateIsNear(gtt, "sampleTag1"_sr, {}); });
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 5 bytes/second each.
+// Total quota is set to 100 bytes/second.
+// Client attempts 10 6-byte transactions per second
+// Target is adjusted to 50/6 transactions per second, to match the total capacity of all storage servers.
 TEST_CASE("/GlobalTagThrottler/ActiveThrottling") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 5);
 	state ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 50 / 6.0) && gtt.busyReadTagCount() == 1;
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 10.0, 6.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag](auto& gtt) {
+		return targetRateIsNear(gtt, testTag, 50 / 6.0) && gtt.busyReadTagCount() == 1;
 	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }

+// 10 storage servers can handle 5 bytes/second each.
+// Total quota is set to 50 bytes/second for one tag, 100 bytes/second for another.
+// For each tag, a client attempts to execute 10 6-byte read transactions per second.
+// Target rates are adjusted to utilize the full 50 bytes/second capacity of the
+//   all storage servers. The two tags receive this capacity with a 2:1 ratio,
+//   matching the ratio of their total quotas.
 TEST_CASE("/GlobalTagThrottler/MultiTagActiveThrottling") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 5);
 	state ThrottleApi::TagQuotaValue tagQuotaValue1;
 	state ThrottleApi::TagQuotaValue tagQuotaValue2;
 	TransactionTag testTag1 = "sampleTag1"_sr;
@ -998,24 +1057,26 @@ TEST_CASE("/GlobalTagThrottler/MultiTagActiveThrottling") {
 	globalTagThrottler.setQuota(testTag1, tagQuotaValue1);
 	globalTagThrottler.setQuota(testTag2, tagQuotaValue2);
 	std::vector<Future<Void>> futures;
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag1, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ));
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag2, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ));
-	state Future<Void> monitor =
-	    GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
-		    return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag1, (50 / 6.0) / 3) &&
-		           GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag2, 2 * (50 / 6.0) / 3) &&
-		           gtt.busyReadTagCount() == 2;
-	    });
-	futures.push_back(GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag1, 10.0, 6.0, OpType::READ));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag2, 10.0, 6.0, OpType::READ));
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
+		return targetRateIsNear(gtt, testTag1, (50 / 6.0) / 3) && targetRateIsNear(gtt, testTag2, 2 * (50 / 6.0) / 3) &&
+		       gtt.busyReadTagCount() == 2;
+	});
+	futures.push_back(updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
 	wait(timeoutError(waitForAny(futures) || monitor, 600.0));
 	return Void();
 }

+// 3 storage servers can handle 50 bytes/second each.
+// Total quota is set to 100 bytes/second for each tag.
+// Each client attempts 10 6-byte read transactions per second.
+// This workload is sent to 2 storage servers per client (with an overlap of one storage server).
+// Target rates for both tags are adjusted to 50/6 transactions per second to match the throughput
+//   that the busiest server can handle.
 TEST_CASE("/GlobalTagThrottler/MultiTagActiveThrottling2") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(3, 50);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(3, 50);
 	state ThrottleApi::TagQuotaValue tagQuotaValue1;
 	state ThrottleApi::TagQuotaValue tagQuotaValue2;
 	TransactionTag testTag1 = "sampleTag1"_sr;
@ -1025,23 +1086,27 @@ TEST_CASE("/GlobalTagThrottler/MultiTagActiveThrottling2") {
 	globalTagThrottler.setQuota(testTag1, tagQuotaValue1);
 	globalTagThrottler.setQuota(testTag2, tagQuotaValue2);
 	std::vector<Future<Void>> futures;
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag1, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ, { 0, 1 }));
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag2, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ, { 1, 2 }));
-	state Future<Void> monitor =
-	    GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
-		    return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag1, 50 / 6.0) &&
-		           GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag2, 50 / 6.0) && gtt.busyReadTagCount() == 2;
-	    });
-	futures.push_back(GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag1, 10.0, 6.0, OpType::READ, { 0, 1 }));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag2, 10.0, 6.0, OpType::READ, { 1, 2 }));
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
+		return targetRateIsNear(gtt, testTag1, 50 / 6.0) && targetRateIsNear(gtt, testTag2, 50 / 6.0) &&
+		       gtt.busyReadTagCount() == 2;
+	});
+	futures.push_back(updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
 	wait(timeoutError(waitForAny(futures) || monitor, 600.0));
 	return Void();
 }

+// 3 storage servers can handle 50 bytes/second each.
+// Total quota is set to 100 bytes/second for each tag.
+// One client attempts 10 6-byte read transactions per second, all directed towards a single storage server.
+// Another client, using a different tag, attempts 10 6-byte read transactions per second, split across the
+// other two storage servers.
+// Target rates adjust to 50/6 and 100/6 transactions per second for the two clients, based on the capacities
+//   of the storage servers being accessed.
 TEST_CASE("/GlobalTagThrottler/MultiTagActiveThrottling3") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(3, 50);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(3, 50);
 	state ThrottleApi::TagQuotaValue tagQuotaValue1;
 	state ThrottleApi::TagQuotaValue tagQuotaValue2;
 	TransactionTag testTag1 = "sampleTag1"_sr;
@ -1051,35 +1116,35 @@ TEST_CASE("/GlobalTagThrottler/MultiTagActiveThrottling3") {
 	globalTagThrottler.setQuota(testTag1, tagQuotaValue1);
 	globalTagThrottler.setQuota(testTag2, tagQuotaValue2);
 	std::vector<Future<Void>> futures;
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag1, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ, { 0 }));
-	futures.push_back(GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag2, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ, { 1, 2 }));
-	state Future<Void> monitor =
-	    GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
-		    return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag1, 50 / 6.0) &&
-		           GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag2, 100 / 6.0) && gtt.busyReadTagCount() == 1;
-	    });
-	futures.push_back(GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag1, 10.0, 6.0, OpType::READ, { 0 }));
+	futures.push_back(runClient(&globalTagThrottler, &storageServers, testTag2, 10.0, 6.0, OpType::READ, { 1, 2 }));
+	state Future<Void> monitor = monitorActor(&globalTagThrottler, [testTag1, testTag2](auto& gtt) {
+		return targetRateIsNear(gtt, testTag1, 50 / 6.0) && targetRateIsNear(gtt, testTag2, 100 / 6.0) &&
+		       gtt.busyReadTagCount() == 1;
+	});
+	futures.push_back(updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
 	wait(timeoutError(waitForAny(futures) || monitor, 600.0));
 	return Void();
 }

+// 10 storage servers can serve 5 bytes/second each.
+// Total quota is set to 100 bytes/second.
+// Reserved quota is set to 70 bytes/second.
+// A client attempts to execute 10 6-byte read transactions per second.
+// Despite the storage servers only having capacity to serve 50/6 transactions per second,
+//   the reserved quota will ensure the target rate adjusts to 70/6 transactions per second.
 TEST_CASE("/GlobalTagThrottler/ReservedQuota") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 5);
 	state ThrottleApi::TagQuotaValue tagQuotaValue;
 	TransactionTag testTag = "sampleTag1"_sr;
 	tagQuotaValue.totalQuota = 100.0;
 	tagQuotaValue.reservedQuota = 70.0;
 	globalTagThrottler.setQuota(testTag, tagQuotaValue);
-	state Future<Void> client = GlobalTagThrottlerTesting::runClient(
-	    &globalTagThrottler, &storageServers, testTag, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ);
-	state Future<Void> monitor = GlobalTagThrottlerTesting::monitor(&globalTagThrottler, [testTag](auto& gtt) {
-		return GlobalTagThrottlerTesting::targetRateIsNear(gtt, testTag, 70 / 6.0);
-	});
-	state Future<Void> updater =
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 10.0, 6.0, OpType::READ);
+	state Future<Void> monitor =
+	    monitorActor(&globalTagThrottler, [testTag](auto& gtt) { return targetRateIsNear(gtt, testTag, 70 / 6.0); });
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
 	wait(timeoutError(monitor || client || updater, 600.0));
 	return Void();
 }
@ -1087,17 +1152,13 @@ TEST_CASE("/GlobalTagThrottler/ReservedQuota") {
 // Test that tags are expired iff a sufficient amount of time has passed since the
 // last transaction with that tag
 TEST_CASE("/GlobalTagThrottler/ExpireTags") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 5);
 	TransactionTag testTag = "sampleTag1"_sr;

 	state Future<Void> client =
-	    timeout(GlobalTagThrottlerTesting::runClient(
-	                &globalTagThrottler, &storageServers, testTag, 10.0, 6.0, GlobalTagThrottlerTesting::OpType::READ),
-	            60.0,
-	            Void());
-	state Future<Void> updater = timeout(
-	    GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers), 60.0, Void());
+	    timeout(runClient(&globalTagThrottler, &storageServers, testTag, 10.0, 6.0, OpType::READ), 60.0, Void());
+	state Future<Void> updater = timeout(updateGlobalTagThrottler(&globalTagThrottler, &storageServers), 60.0, Void());
 	wait(client && updater);
 	client.cancel();
 	updater.cancel();
@ -1113,17 +1174,43 @@ TEST_CASE("/GlobalTagThrottler/ExpireTags") {

 // Test that the number of tags tracked does not grow beyond SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED
 TEST_CASE("/GlobalTagThrottler/TagLimit") {
-	state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
-	state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10, 5);
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 0);
+	state StorageServerCollection storageServers(10, 5);
 	std::vector<Future<Void>> futures;
 	for (int i = 0; i < 2 * SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED; ++i) {
 		Arena arena;
 		TransactionTag tag = makeString(8, arena);
 		deterministicRandom()->randomBytes(mutateString(tag), tag.size());
-		futures.push_back(GlobalTagThrottlerTesting::runClient(
-		    &globalTagThrottler, &storageServers, tag, 1.0, 6.0, GlobalTagThrottlerTesting::OpType::READ));
+		futures.push_back(runClient(&globalTagThrottler, &storageServers, tag, 1.0, 6.0, OpType::READ));
 	}
 	wait(timeout(waitForAll(futures), 60.0, Void()));
 	ASSERT_EQ(globalTagThrottler.tagsTracked(), SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED);
 	return Void();
 }
+
+// 9 storage servers can handle 100 bytes/second each.
+// 1 unhealthy storage server can only handle 1 byte/second.
+// Total quota is set to 100 bytes/second.
+// Client attempts 5 6-byte transactions per second.
+// Target rate adjusts to 100/6 transactions per second, ignoring the worst storage server.
+// Then, a second storage server becomes unhealthy and can only handle 1 byte/second.
+// Target rate adjusts down to 1/6 transactions per second, because only one bad zone can be ignored.
+TEST_CASE("/GlobalTagThrottler/IgnoreWorstZone") {
+	state GlobalTagThrottler globalTagThrottler(Database{}, UID{}, 1);
+	state StorageServerCollection storageServers(10, 100);
+	state TransactionTag testTag = "sampleTag1"_sr;
+	storageServers.setCapacity(0, 1);
+	ThrottleApi::TagQuotaValue tagQuotaValue;
+	tagQuotaValue.totalQuota = 100.0;
+	globalTagThrottler.setQuota(testTag, tagQuotaValue);
+	state Future<Void> client = runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, OpType::READ);
+	state Future<Void> monitor = monitorActor(
+	    &globalTagThrottler, [](auto& gtt) { return targetRateIsNear(gtt, "sampleTag1"_sr, 100.0 / 6.0); });
+	state Future<Void> updater = updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
+	wait(timeoutError(monitor || client || updater, 600.0));
+	storageServers.setCapacity(1, 1);
+	monitor =
+	    monitorActor(&globalTagThrottler, [](auto& gtt) { return targetRateIsNear(gtt, "sampleTag1"_sr, 1.0 / 6.0); });
+	wait(timeoutError(monitor || client || updater, 600.0));
+	return Void();
+}
--- a/fdbserver/GrvProxyServer.actor.cpp
+++ b/fdbserver/GrvProxyServer.actor.cpp
@ -27,7 +27,7 @@
 #include "fdbclient/CommitProxyInterface.h"
 #include "fdbclient/GrvProxyInterface.h"
 #include "fdbclient/VersionVector.h"
-#include "fdbserver/GrvProxyTransactionTagThrottler.h"
+#include "fdbserver/GrvProxyTagThrottler.h"
 #include "fdbserver/GrvTransactionRateInfo.h"
 #include "fdbserver/LogSystem.h"
 #include "fdbserver/LogSystemDiskQueueAdapter.h"
@ -62,7 +62,7 @@ struct GrvProxyStats {
 	LatencySample defaultTxnGRVTimeInQueue;
 	LatencySample batchTxnGRVTimeInQueue;

-	// These latency bands and samples ignore latency injected by the GrvProxyTransactionTagThrottler
+	// These latency bands and samples ignore latency injected by the GrvProxyTagThrottler
 	LatencyBands grvLatencyBands;
 	LatencySample grvLatencySample; // GRV latency metric sample of default priority
 	LatencySample grvBatchLatencySample; // GRV latency metric sample of batched priority
@ -183,6 +183,8 @@ struct GrvProxyData {
 	Version version;
 	Version minKnownCommittedVersion; // we should ask master for this version.

+	GrvProxyTagThrottler tagThrottler;
+
 	// Cache of the latest commit versions of storage servers.
 	VersionVector ssVersionVectorCache;

@ -195,6 +197,7 @@ struct GrvProxyData {
 			if (newLatencyBandConfig.present()) {
 				for (auto band : newLatencyBandConfig.get().grvConfig.bands) {
 					stats.grvLatencyBands.addThreshold(band);
+					tagThrottler.addLatencyBandThreshold(band);
 				}
 			}
 		}
@ -213,7 +216,8 @@ struct GrvProxyData {
 	                                dbgid,
 	                                SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
 	                                SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
-	    updateCommitRequests(0), lastCommitTime(0), version(0), minKnownCommittedVersion(invalidVersion) {}
+	    updateCommitRequests(0), lastCommitTime(0), version(0), minKnownCommittedVersion(invalidVersion),
+	    tagThrottler(SERVER_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION) {}
 };

 ACTOR Future<Void> healthMetricsRequestServer(GrvProxyInterface grvProxy,
@ -363,7 +367,6 @@ ACTOR Future<Void> getRate(UID myID,
                           GetHealthMetricsReply* detailedHealthMetricsReply,
                           TransactionTagMap<uint64_t>* transactionTagCounter,
                           PrioritizedTransactionTagMap<ClientTagThrottleLimits>* clientThrottledTags,
-                           GrvProxyTransactionTagThrottler* tagThrottler,
                           GrvProxyStats* stats,
                           GrvProxyData* proxyData) {
 	state Future<Void> nextRequestTimer = Never();
@ -424,7 +427,7 @@ ACTOR Future<Void> getRate(UID myID,
 				*clientThrottledTags = std::move(rep.clientThrottledTags.get());
 			}
 			if (rep.proxyThrottledTags.present()) {
-				tagThrottler->updateRates(rep.proxyThrottledTags.get());
+				proxyData->tagThrottler.updateRates(rep.proxyThrottledTags.get());
 			}
 		}
 		when(wait(leaseTimeout)) {
@ -470,7 +473,7 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
                                               GrvProxyStats* stats,
                                               GrvTransactionRateInfo* batchRateInfo,
                                               TransactionTagMap<uint64_t>* transactionTagCounter,
-                                               GrvProxyTransactionTagThrottler* tagThrottler) {
+                                               GrvProxyTagThrottler* tagThrottler) {
 	getCurrentLineage()->modify(&TransactionLineage::operation) =
 	    TransactionLineage::Operation::GetConsistentReadVersion;
 	loop choose {
@ -721,7 +724,7 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
 		reply.proxyId = grvProxyData->dbgid;
 		reply.proxyTagThrottledDuration = request.proxyTagThrottledDuration;

-		if (!request.tags.empty()) {
+		if (request.isTagged()) {
 			auto& priorityThrottledTags = clientThrottledTags[request.priority];
 			for (auto tag : request.tags) {
 				auto tagItr = priorityThrottledTags.find(tag.first);
@ -824,7 +827,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 	state int64_t batchTransactionCount = 0;
 	state GrvTransactionRateInfo normalRateInfo(10);
 	state GrvTransactionRateInfo batchRateInfo(0);
-	state GrvProxyTransactionTagThrottler tagThrottler;

 	state Deque<GetReadVersionRequest> systemQueue;
 	state Deque<GetReadVersionRequest> defaultQueue;
@ -850,7 +852,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 	                      detailedHealthMetricsReply,
 	                      &transactionTagCounter,
 	                      &clientThrottledTags,
-	                      &tagThrottler,
 	                      &grvProxyData->stats,
 	                      grvProxyData));
 	addActor.send(queueGetReadVersionRequests(db,
@ -865,7 +866,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 	                                          &grvProxyData->stats,
 	                                          &batchRateInfo,
 	                                          &transactionTagCounter,
-	                                          &tagThrottler));
+	                                          &grvProxyData->tagThrottler));

 	while (std::find(db->get().client.grvProxies.begin(), db->get().client.grvProxies.end(), proxy) ==
 	       db->get().client.grvProxies.end()) {
@ -888,7 +889,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
 			elapsed = 1e-15;
 		}

-		tagThrottler.releaseTransactions(elapsed, defaultQueue, batchQueue);
+		grvProxyData->tagThrottler.releaseTransactions(elapsed, defaultQueue, batchQueue);
 		normalRateInfo.startReleaseWindow();
 		batchRateInfo.startReleaseWindow();

--- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp
+++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp
@ -1,5 +1,5 @@
 /*
- * GrvProxyTransactionTagThrottler.actor.cpp
+ * GrvProxyTagThrottler.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
@ -19,21 +19,24 @@
 */

 #include "fdbclient/Knobs.h"
-#include "fdbserver/GrvProxyTransactionTagThrottler.h"
+#include "fdbserver/GrvProxyTagThrottler.h"
+#include "fdbserver/Knobs.h"
 #include "flow/UnitTest.h"
 #include "flow/actorcompiler.h" // must be last include

-uint64_t GrvProxyTransactionTagThrottler::DelayedRequest::lastSequenceNumber = 0;
+uint64_t GrvProxyTagThrottler::DelayedRequest::lastSequenceNumber = 0;

-void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDuration() {
+void GrvProxyTagThrottler::DelayedRequest::updateProxyTagThrottledDuration(LatencyBandsMap& latencyBandsMap) {
 	req.proxyTagThrottledDuration = now() - startTime;
+	auto const& [tag, count] = *req.tags.begin();
+	latencyBandsMap.addMeasurement(tag, req.proxyTagThrottledDuration, count);
 }

-bool GrvProxyTransactionTagThrottler::DelayedRequest::isMaxThrottled() const {
-	return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION;
+bool GrvProxyTagThrottler::DelayedRequest::isMaxThrottled(double maxThrottleDuration) const {
+	return now() - startTime > maxThrottleDuration;
 }

-void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
+void GrvProxyTagThrottler::TagQueue::setRate(double rate) {
 	if (rateInfo.present()) {
 		rateInfo.get().setRate(rate);
 	} else {
@ -41,21 +44,28 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
 	}
 }

-bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const {
-	return !requests.empty() && requests.front().isMaxThrottled();
+bool GrvProxyTagThrottler::TagQueue::isMaxThrottled(double maxThrottleDuration) const {
+	return !requests.empty() && requests.front().isMaxThrottled(maxThrottleDuration);
 }

-void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() {
-	CODE_PROBE(true, "GrvProxyTransactionTagThrottler rejecting requests");
+void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBandsMap) {
+	CODE_PROBE(true, "GrvProxyTagThrottler rejecting requests");
 	while (!requests.empty()) {
 		auto& delayedReq = requests.front();
-		delayedReq.updateProxyTagThrottledDuration();
+		delayedReq.updateProxyTagThrottledDuration(latencyBandsMap);
 		delayedReq.req.reply.sendError(proxy_tag_throttled());
 		requests.pop_front();
 	}
 }

-void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
+GrvProxyTagThrottler::GrvProxyTagThrottler(double maxThrottleDuration)
+  : maxThrottleDuration(maxThrottleDuration),
+    latencyBandsMap("GrvProxyTagThrottler",
+                    deterministicRandom()->randomUniqueID(),
+                    SERVER_KNOBS->GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL,
+                    SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED) {}
+
+void GrvProxyTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
 	for (const auto& [tag, rate] : newRates) {
 		auto it = queues.find(tag);
 		if (it == queues.end()) {
@ -83,15 +93,15 @@ void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> cons
 	}
 }

-void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& req) {
+void GrvProxyTagThrottler::addRequest(GetReadVersionRequest const& req) {
 	ASSERT(req.isTagged());
 	auto const& tag = req.tags.begin()->first;
 	if (req.tags.size() > 1) {
-		// The GrvProxyTransactionTagThrottler assumes that each GetReadVersionRequest
+		// The GrvProxyTagThrottler assumes that each GetReadVersionRequest
 		// has at most one tag. If a transaction uses multiple tags and
 		// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
 		// unexpected behaviour, because only one tag is used for throttling.
-		TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags")
+		TraceEvent(SevWarnAlways, "GrvProxyTagThrottler_MultipleTags")
 		    .suppressFor(1.0)
 		    .detail("NumTags", req.tags.size())
 		    .detail("UsingTag", printable(tag));
@ -99,9 +109,9 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
 	queues[tag].requests.emplace_back(req);
 }

-void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
-                                                          Deque<GetReadVersionRequest>& outBatchPriority,
-                                                          Deque<GetReadVersionRequest>& outDefaultPriority) {
+void GrvProxyTagThrottler::releaseTransactions(double elapsed,
+                                               Deque<GetReadVersionRequest>& outBatchPriority,
+                                               Deque<GetReadVersionRequest>& outDefaultPriority) {
 	// Pointer to a TagQueue with some extra metadata stored alongside
 	struct TagQueueHandle {
 		// Store pointers here to avoid frequent std::unordered_map lookups
@ -159,24 +169,24 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
 			    !tagQueueHandle.queue->rateInfo.get().canStart(*(tagQueueHandle.numReleased), count)) {
 				// Cannot release any more transaction from this tag (don't push the tag queue handle back into
 				// pqOfQueues)
-				CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction");
-				if (tagQueueHandle.queue->isMaxThrottled()) {
+				CODE_PROBE(true, "GrvProxyTagThrottler throttling transaction");
+				if (tagQueueHandle.queue->isMaxThrottled(maxThrottleDuration)) {
 					// Requests in this queue have been throttled too long and errors
 					// should be sent to clients.
-					tagQueueHandle.queue->rejectRequests();
+					tagQueueHandle.queue->rejectRequests(latencyBandsMap);
 				}
 				break;
 			} else {
 				if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) {
 					// Releasing transaction
 					*(tagQueueHandle.numReleased) += count;
-					delayedReq.updateProxyTagThrottledDuration();
+					delayedReq.updateProxyTagThrottledDuration(latencyBandsMap);
 					if (delayedReq.req.priority == TransactionPriority::BATCH) {
 						outBatchPriority.push_back(delayedReq.req);
 					} else if (delayedReq.req.priority == TransactionPriority::DEFAULT) {
 						outDefaultPriority.push_back(delayedReq.req);
 					} else {
-						// Immediate priority transactions should bypass the GrvProxyTransactionTagThrottler
+						// Immediate priority transactions should bypass the GrvProxyTagThrottler
 						ASSERT(false);
 					}
 					tagQueueHandle.queue->requests.pop_front();
@ -184,7 +194,7 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
 						tagQueueHandle.nextSeqNo = tagQueueHandle.queue->requests.front().sequenceNumber;
 					}
 				} else {
-					CODE_PROBE(true, "GrvProxyTransactionTagThrottler switching tags to preserve FIFO");
+					CODE_PROBE(true, "GrvProxyTagThrottler switching tags to preserve FIFO");
 					pqOfQueues.push(tagQueueHandle);
 					break;
 				}
@ -209,11 +219,16 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
 	ASSERT_EQ(transactionsReleased.capacity(), transactionsReleasedInitialCapacity);
 }

-uint32_t GrvProxyTransactionTagThrottler::size() {
+// Registers an additional latency band threshold by forwarding it to latencyBandsMap.
+void GrvProxyTagThrottler::addLatencyBandThreshold(double value) {
+	// Coverage probe for the case where a threshold is added while size() is already non-zero.
+	CODE_PROBE(size() > 0, "GrvProxyTagThrottler adding latency bands while actively throttling");
+	latencyBandsMap.addThreshold(value);
+}
+
+// Returns the number of entries currently held in `queues`.
+uint32_t GrvProxyTagThrottler::size() const {
 	return queues.size();
 }

-ACTOR static Future<Void> mockClient(GrvProxyTransactionTagThrottler* throttler,
+ACTOR static Future<Void> mockClient(GrvProxyTagThrottler* throttler,
                                     TransactionPriority priority,
                                     TagSet tagSet,
                                     int batchSize,
@ -237,7 +252,7 @@ ACTOR static Future<Void> mockClient(GrvProxyTransactionTagThrottler* throttler,
 	}
 }

-ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* throttler) {
+ACTOR static Future<Void> mockFifoClient(GrvProxyTagThrottler* throttler) {
 	state TransactionTagMap<uint32_t> tagSet1;
 	state TransactionTagMap<uint32_t> tagSet2;
 	state std::vector<GetReadVersionRequest> reqs;
@ -279,7 +294,7 @@ ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* thrott
 	return Void();
 }

-ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler) {
+ACTOR static Future<Void> mockServer(GrvProxyTagThrottler* throttler) {
 	state Deque<GetReadVersionRequest> outBatchPriority;
 	state Deque<GetReadVersionRequest> outDefaultPriority;
 	loop {
@ -290,6 +305,7 @@ ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler)
 			outBatchPriority.front().reply.send(GetReadVersionReply{});
 			outBatchPriority.pop_front();
 		}
+		TraceEvent("HERE_ServerProcessing").detail("Size", outDefaultPriority.size());
 		while (!outDefaultPriority.empty()) {
 			outDefaultPriority.front().reply.send(GetReadVersionReply{});
 			outDefaultPriority.pop_front();
@ -313,8 +329,8 @@ static bool isNear(double desired, int64_t actual) {

 // Rate limit set at 10, but client attempts 20 transactions per second.
 // Client should be throttled to only 10 transactions per second.
-TEST_CASE("/GrvProxyTransactionTagThrottler/Simple") {
-	state GrvProxyTransactionTagThrottler throttler;
+TEST_CASE("/GrvProxyTagThrottler/Simple") {
+	state GrvProxyTagThrottler throttler(5.0);
 	state TagSet tagSet;
 	state TransactionTagMap<uint32_t> counters;
 	{
@ -333,8 +349,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Simple") {
 }

 // Clients share the available 30 transaction/second budget
-TEST_CASE("/GrvProxyTransactionTagThrottler/MultiClient") {
-	state GrvProxyTransactionTagThrottler throttler;
+TEST_CASE("/GrvProxyTagThrottler/MultiClient") {
+	state GrvProxyTagThrottler throttler(5.0);
 	state TagSet tagSet;
 	state TransactionTagMap<uint32_t> counters;
 	{
@ -358,8 +374,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/MultiClient") {
 }

 // Test processing GetReadVersionRequests that batch several transactions
-TEST_CASE("/GrvProxyTransactionTagThrottler/Batch") {
-	state GrvProxyTransactionTagThrottler throttler;
+TEST_CASE("/GrvProxyTagThrottler/Batch") {
+	state GrvProxyTagThrottler throttler(5.0);
 	state TagSet tagSet;
 	state TransactionTagMap<uint32_t> counters;
 	{
@ -379,8 +395,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Batch") {
 }

 // Tests cleanup of tags that are no longer throttled.
-TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup1") {
-	GrvProxyTransactionTagThrottler throttler;
+TEST_CASE("/GrvProxyTagThrottler/Cleanup1") {
+	GrvProxyTagThrottler throttler(5.0);
 	for (int i = 0; i < 1000; ++i) {
 		auto const tag = getRandomTag();
 		TransactionTagMap<double> rates;
@ -392,8 +408,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup1") {
 }

 // Tests cleanup of tags once queues have been emptied
-TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {
-	GrvProxyTransactionTagThrottler throttler;
+TEST_CASE("/GrvProxyTagThrottler/Cleanup2") {
+	GrvProxyTagThrottler throttler(5.0);
 	{
 		GetReadVersionRequest req;
 		req.tags["sampleTag"_sr] = 1;
@ -416,8 +432,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {

 // Tests that unthrottled transactions are released in FIFO order, even when they
 // have different tags
-TEST_CASE("/GrvProxyTransactionTagThrottler/Fifo") {
-	state GrvProxyTransactionTagThrottler throttler;
+TEST_CASE("/GrvProxyTagThrottler/Fifo") {
+	state GrvProxyTagThrottler throttler(5.0);
 	state Future<Void> server = mockServer(&throttler);
 	wait(mockFifoClient(&throttler));
 	return Void();
--- a/fdbserver/KeyValueStoreMemory.actor.cpp
+++ b/fdbserver/KeyValueStoreMemory.actor.cpp
@ -836,14 +836,13 @@ private:
 					useDelta = false;

 					auto thisSnapshotEnd = self->log_op(OpSnapshotEnd, StringRef(), StringRef());
-					//TraceEvent("SnapshotEnd", self->id)
-					//	.detail("LastKey", lastKey.present() ? lastKey.get() : "<none>"_sr)
-					//	.detail("CurrentSnapshotEndLoc", self->currentSnapshotEnd)
-					//	.detail("PreviousSnapshotEndLoc", self->previousSnapshotEnd)
-					//	.detail("ThisSnapshotEnd", thisSnapshotEnd)
-					//	.detail("Items", snapItems)
-					//	.detail("CommittedWrites", self->notifiedCommittedWriteBytes.get())
-					//	.detail("SnapshotSize", snapshotBytes);
+					DisabledTraceEvent("SnapshotEnd", self->id)
+					    .detail("CurrentSnapshotEndLoc", self->currentSnapshotEnd)
+					    .detail("PreviousSnapshotEndLoc", self->previousSnapshotEnd)
+					    .detail("ThisSnapshotEnd", thisSnapshotEnd)
+					    .detail("Items", snapItems)
+					    .detail("CommittedWrites", self->notifiedCommittedWriteBytes.get())
+					    .detail("SnapshotSize", snapshotBytes);

 					ASSERT(thisSnapshotEnd >= self->currentSnapshotEnd);
 					self->previousSnapshotEnd = self->currentSnapshotEnd;
--- a/fdbserver/KeyValueStoreRocksDB.actor.cpp
+++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp
@ -48,6 +48,7 @@
 #endif
 #include "fdbclient/SystemData.h"
 #include "fdbserver/CoroFlow.h"
+#include "fdbserver/RocksDBLogForwarder.h"
 #include "flow/ActorCollection.h"
 #include "flow/flow.h"
 #include "flow/IThreadPool.h"
@ -202,6 +203,13 @@ rocksdb::DBOptions SharedRocksDBState::initialDbOptions() {
 	options.statistics->set_stats_level(rocksdb::kExceptHistogramOrTimers);

 	options.db_log_dir = SERVER_KNOBS->LOG_DIRECTORY;
+
+	if (SERVER_KNOBS->ROCKSDB_MUTE_LOGS) {
+		options.info_log = std::make_shared<NullRocksDBLogForwarder>();
+	} else {
+		options.info_log = std::make_shared<RocksDBLogForwarder>(id, options.info_log_level);
+	}
+
 	return options;
 }

--- a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp
+++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp
@ -224,7 +224,7 @@ Error statusToError(const rocksdb::Status& s) {
 	if (s.IsIOError()) {
 		return io_error();
 	} else if (s.IsTimedOut()) {
-		return transaction_too_old();
+		return key_value_store_deadline_exceeded();
 	} else {
 		return unknown_error();
 	}
@ -518,17 +518,10 @@ int readRangeInDb(PhysicalShard* shard, const KeyRangeRef& range, int rowLimit,

 	int accumulatedRows = 0;
 	int accumulatedBytes = 0;
-	// TODO: Pass read timeout.
-	const int readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT;
 	rocksdb::Status s;
-	auto options = getReadOptions();
-	// TODO: define single shard read timeout.
-	const uint64_t deadlineMircos = shard->db->GetEnv()->NowMicros() + readRangeTimeout * 1000000;
-	options.deadline = std::chrono::microseconds(deadlineMircos / 1000000);

 	// When using a prefix extractor, ensure that keys are returned in order even if they cross
 	// a prefix boundary.
-	options.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
 	if (rowLimit >= 0) {
 		ReadIterator readIter = shard->readIterPool->getIterator();
 		auto cursor = readIter.iter;
@ -2023,7 +2016,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 				    .detail("Error", "Read value request timedout")
 				    .detail("Method", "ReadValueAction")
 				    .detail("Timeout value", readValueTimeout);
-				a.result.sendError(transaction_too_old());
+				a.result.sendError(key_value_store_deadline_exceeded());
 				return;
 			}

@ -2101,7 +2094,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 				    .detail("Error", "Read value prefix request timedout")
 				    .detail("Method", "ReadValuePrefixAction")
 				    .detail("Timeout value", readValuePrefixTimeout);
-				a.result.sendError(transaction_too_old());
+				a.result.sendError(key_value_store_deadline_exceeded());
 				return;
 			}

@ -2176,7 +2169,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 				    .detail("Error", "Read range request timedout")
 				    .detail("Method", "ReadRangeAction")
 				    .detail("Timeout value", readRangeTimeout);
-				a.result.sendError(transaction_too_old());
+				a.result.sendError(key_value_store_deadline_exceeded());
 				return;
 			}

@ -2217,6 +2210,15 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 				if (result.size() >= abs(a.rowLimit) || accumulatedBytes >= a.byteLimit) {
 					break;
 				}
+
+				if (timer_monotonic() - a.startTime > readRangeTimeout) {
+					TraceEvent(SevInfo, "ShardedRocksDBTimeout")
+					    .detail("Action", "ReadRange")
+					    .detail("ShardsRead", numShards)
+					    .detail("BytesRead", accumulatedBytes);
+					a.result.sendError(key_value_store_deadline_exceeded());
+					return;
+				}
 			}

 			result.more =
@ -2229,6 +2231,9 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 				double currTime = timer_monotonic();
 				rocksDBMetrics->getReadRangeActionHistogram(threadIndex)->sampleSeconds(currTime - readBeginTime);
 				rocksDBMetrics->getReadRangeLatencyHistogram(threadIndex)->sampleSeconds(currTime - a.startTime);
+				if (a.shardRanges.size() > 1) {
+					TraceEvent(SevInfo, "ShardedRocksDB").detail("ReadRangeShards", a.shardRanges.size());
+				}
 			}

 			sample();
@ -2396,7 +2401,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 		auto* shard = shardManager.getDataShard(key);
 		if (shard == nullptr || !shard->physicalShard->initialized()) {
 			// TODO: read non-exist system key range should not cause an error.
-			TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
+			TraceEvent(SevWarn, "ShardedRocksDB", this->id)
 			    .detail("Detail", "Read non-exist key range")
 			    .detail("ReadKey", key);
 			return Optional<Value>();
@ -2429,7 +2434,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
 		auto* shard = shardManager.getDataShard(key);
 		if (shard == nullptr || !shard->physicalShard->initialized()) {
 			// TODO: read non-exist system key range should not cause an error.
-			TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
+			TraceEvent(SevWarn, "ShardedRocksDB", this->id)
 			    .detail("Detail", "Read non-exist key range")
 			    .detail("ReadKey", key);
 			return Optional<Value>();
@ -2602,7 +2607,6 @@ TEST_CASE("noSim/ShardedRocksDB/Initialization") {

 	state IKeyValueStore* kvStore =
 	    new ShardedRocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID());
-	state ShardedRocksDBKeyValueStore* rocksDB = dynamic_cast<ShardedRocksDBKeyValueStore*>(kvStore);
 	wait(kvStore->init());

 	Future<Void> closed = kvStore->onClosed();
@ -2617,7 +2621,6 @@ TEST_CASE("noSim/ShardedRocksDB/SingleShardRead") {

 	state IKeyValueStore* kvStore =
 	    new ShardedRocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID());
-	state ShardedRocksDBKeyValueStore* rocksDB = dynamic_cast<ShardedRocksDBKeyValueStore*>(kvStore);
 	wait(kvStore->init());

 	KeyRangeRef range("a"_sr, "b"_sr);
--- a/fdbserver/LatencyBandsMap.actor.cpp
+++ b/fdbserver/LatencyBandsMap.actor.cpp
@ -0,0 +1,131 @@
+/*
+ * LatencyBandsMap.actor.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fdbserver/LatencyBandsMap.h"
+#include "flow/UnitTest.h"
+#include "flow/actorcompiler.h" // must be last include
+
+// Holds the actor implementations used by LatencyBandsMap.
+class LatencyBandsMapImpl {
+public:
+	// Runs forever: every 5 seconds, sweeps self->map and erases each entry whose lastUpdated is older
+	// than GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER.
+	ACTOR static Future<Void> expireOldTagsActor(LatencyBandsMap* self) {
+		loop {
+			wait(delay(5.0));
+			auto entry = self->map.begin();
+			while (entry != self->map.end()) {
+				if (now() - entry->second.lastUpdated > SERVER_KNOBS->GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER) {
+					CODE_PROBE(true, "LatencyBandsMap erasing expired tag");
+					// erase() hands back the iterator following the removed entry
+					entry = self->map.erase(entry);
+				} else {
+					++entry;
+				}
+			}
+		}
+	}
+};
+
+// Takes ownership of the given bands (by move) and stamps them with the current time as their last update.
+LatencyBandsMap::ExpirableBands::ExpirableBands(LatencyBands&& bands)
+  : latencyBands(std::move(bands)), lastUpdated(now()) {}
+
+// Looks up the LatencyBands tracked for `tag`, creating an entry on first use and refreshing its
+// lastUpdated timestamp. Returns an empty Optional when the map already holds maxSize entries and
+// `tag` is not among them.
+Optional<LatencyBands*> LatencyBandsMap::getLatencyBands(TransactionTag tag) {
+	bool const atCapacity = map.size() == maxSize;
+	if (atCapacity && map.count(tag) == 0) {
+		CODE_PROBE(true, "LatencyBandsMap reached maxSize");
+		return {};
+	}
+	auto const emplaceResult = map.try_emplace(
+	    tag, LatencyBands(name, id, loggingInterval, [tag](auto& te) { te.detail("Tag", printable(tag)); }));
+	auto& entry = emplaceResult.first->second;
+	bool const isNewEntry = emplaceResult.second;
+	if (isNewEntry) {
+		// A newly inserted entry receives every threshold registered so far.
+		for (auto const& knownThreshold : thresholds) {
+			entry.latencyBands.addThreshold(knownThreshold);
+		}
+	}
+	entry.lastUpdated = now();
+	return &entry.latencyBands;
+}
+
+// Stores the settings passed to each per-tag LatencyBands (name, id, loggingInterval) together with the
+// cap on the number of tracked tags (maxSize), then starts the actor that periodically erases expired tags.
+LatencyBandsMap::LatencyBandsMap(std::string const& name, UID id, double loggingInterval, int maxSize)
+  : name(name), id(id), loggingInterval(loggingInterval), maxSize(maxSize) {
+	// NOTE(review): the future is kept in a member, presumably so the actor is cancelled with this object; confirm in the header.
+	expireOldTags = LatencyBandsMapImpl::expireOldTagsActor(this);
+}
+
+// Records `count` measurements of `value` against the bands for `tag`. The measurement is silently
+// dropped when getLatencyBands() yields nothing for this tag.
+void LatencyBandsMap::addMeasurement(TransactionTag tag, double value, int count) {
+	auto const maybeBands = getLatencyBands(tag);
+	if (!maybeBands.present()) {
+		return;
+	}
+	maybeBands.get()->addMeasurement(value, count);
+}
+
+// Adds a latency threshold: it is remembered for entries created later and applied immediately to
+// every entry already in the map.
+void LatencyBandsMap::addThreshold(double value) {
+	thresholds.push_back(value);
+	for (auto& entry : map) {
+		entry.second.latencyBands.addThreshold(value);
+	}
+}
+
+// Smoke test: registers three thresholds, then feeds 1000 random measurements spread over three tags.
+TEST_CASE("/fdbserver/LatencyBandsMap/Simple") {
+	state LatencyBandsMap latencyBandsMap("TestLatencyBandsMap", deterministicRandom()->randomUniqueID(), 10.0, 100);
+	latencyBandsMap.addThreshold(0.1);
+	latencyBandsMap.addThreshold(0.2);
+	latencyBandsMap.addThreshold(0.4);
+	state Standalone<VectorRef<TransactionTagRef>> tags;
+	tags.push_back_deep(tags.arena(), "a"_sr);
+	tags.push_back_deep(tags.arena(), "b"_sr);
+	tags.push_back_deep(tags.arena(), "c"_sr);
+	state int measurements = 0;
+	while (measurements < 1000) {
+		wait(delayJittered(0.1));
+		// Pick the tag first, then the value, so the random draws happen in a fixed order.
+		auto const tag = deterministicRandom()->randomChoice(tags);
+		latencyBandsMap.addMeasurement(tag, deterministicRandom()->random01());
+		++measurements;
+	}
+	return Void();
+}
+
+// With maxSize set to 2, a measurement for a third distinct tag must not create a third entry.
+TEST_CASE("/fdbserver/LatencyBandsMap/MaxSize") {
+	LatencyBandsMap latencyBandsMap("TestLatencyBandsMap", deterministicRandom()->randomUniqueID(), 10.0, 2);
+	latencyBandsMap.addMeasurement("a"_sr, deterministicRandom()->random01());
+	latencyBandsMap.addMeasurement("b"_sr, deterministicRandom()->random01());
+	// The map already holds two tags, so this measurement for "c" is not tracked.
+	latencyBandsMap.addMeasurement("c"_sr, deterministicRandom()->random01());
+	ASSERT_EQ(latencyBandsMap.size(), 2);
+	return Void();
+}
+
+// Tags that stop receiving measurements are eventually erased by the expiry actor, while tags that
+// keep receiving measurements are retained.
+TEST_CASE("/fdbserver/LatencyBandsMap/Expire") {
+	state LatencyBandsMap latencyBandsMap("TestLatencyBandsMap", deterministicRandom()->randomUniqueID(), 10.0, 100);
+	latencyBandsMap.addMeasurement("a"_sr, deterministicRandom()->random01());
+	latencyBandsMap.addMeasurement("b"_sr, deterministicRandom()->random01());
+	latencyBandsMap.addMeasurement("c"_sr, deterministicRandom()->random01());
+	latencyBandsMap.addThreshold(0.1);
+	latencyBandsMap.addThreshold(0.2);
+	latencyBandsMap.addThreshold(0.4);
+	ASSERT_EQ(latencyBandsMap.size(), 3);
+	state int waitIterations = 0;
+	loop {
+		wait(delay(1.0));
+		// Only "a" and "b" are refreshed each second; "c" is left to age out.
+		latencyBandsMap.addMeasurement("a"_sr, deterministicRandom()->random01());
+		latencyBandsMap.addMeasurement("b"_sr, deterministicRandom()->random01());
+		// Compare with >= rather than ==: an integer counter can step past a non-integral bound without
+		// ever equalling it, which would leave this loop running forever. (The knob is compared against
+		// now() differences in the expiry actor, so it is presumably a double.)
+		if (++waitIterations >= 2 * SERVER_KNOBS->GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER) {
+			break;
+		}
+	}
+	ASSERT_EQ(latencyBandsMap.size(), 2);
+	return Void();
+}
--- a/fdbserver/MoveKeys.actor.cpp
+++ b/fdbserver/MoveKeys.actor.cpp
@ -2401,6 +2401,9 @@ ACTOR Future<Void> cleanUpDataMove(Database occ,
 					    .detail("ReadVersion", tr.getReadVersion().get());

 					if (destId != dataMoveId) {
+						for (const auto& uid : dest) {
+							physicalShardMap[uid].push_back(Shard(rangeIntersectKeys, destId));
+						}
 						TraceEvent(SevVerbose, "CleanUpDataMoveSkipShard", dataMoveId)
 						    .detail("DataMoveID", dataMoveId)
 						    .detail("ShardRange", rangeIntersectKeys)
--- a/fdbserver/Ratekeeper.actor.cpp
+++ b/fdbserver/Ratekeeper.actor.cpp
@ -637,7 +637,7 @@ Ratekeeper::Ratekeeper(UID id, Database db)
                SERVER_KNOBS->TARGET_BW_LAG_BATCH),
    maxVersion(0), blobWorkerTime(now()), unblockedAssignmentTime(now()) {
 	if (SERVER_KNOBS->GLOBAL_TAG_THROTTLING) {
-		tagThrottler = std::make_unique<GlobalTagThrottler>(db, id);
+		tagThrottler = std::make_unique<GlobalTagThrottler>(db, id, SERVER_KNOBS->MAX_MACHINES_FALLING_BEHIND);
 	} else {
 		tagThrottler = std::make_unique<TagThrottler>(db, id);
 	}
@ -695,17 +695,16 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {

 		limitReason_t ssLimitReason = limitReason_t::unlimited;

-		int64_t minFreeSpace =
-		    std::max(SERVER_KNOBS->MIN_AVAILABLE_SPACE,
-		             (int64_t)(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO * ss.smoothTotalSpace.smoothTotal()));
+		int64_t minFreeSpace = std::max(SERVER_KNOBS->MIN_AVAILABLE_SPACE,
+		                                (int64_t)(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO * ss.getSmoothTotalSpace()));

 		worstFreeSpaceStorageServer =
-		    std::min(worstFreeSpaceStorageServer, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace);
+		    std::min(worstFreeSpaceStorageServer, (int64_t)ss.getSmoothFreeSpace() - minFreeSpace);

 		int64_t springBytes = std::max<int64_t>(
-		    1, std::min<int64_t>(limits->storageSpringBytes, (ss.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2));
-		int64_t targetBytes = std::max<int64_t>(
-		    1, std::min(limits->storageTargetBytes, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace));
+		    1, std::min<int64_t>(limits->storageSpringBytes, (ss.getSmoothFreeSpace() - minFreeSpace) * 0.2));
+		int64_t targetBytes =
+		    std::max<int64_t>(1, std::min(limits->storageTargetBytes, (int64_t)ss.getSmoothFreeSpace() - minFreeSpace));
 		if (targetBytes != limits->storageTargetBytes) {
 			if (minFreeSpace == SERVER_KNOBS->MIN_AVAILABLE_SPACE) {
 				ssLimitReason = limitReason_t::storage_server_min_free_space;
@ -716,8 +715,8 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 				TraceEvent("RatekeeperLimitReasonDetails")
 				    .detail("Reason", ssLimitReason)
 				    .detail("SSID", ss.id)
-				    .detail("SSSmoothTotalSpace", ss.smoothTotalSpace.smoothTotal())
-				    .detail("SSSmoothFreeSpace", ss.smoothFreeSpace.smoothTotal())
+				    .detail("SSSmoothTotalSpace", ss.getSmoothTotalSpace())
+				    .detail("SSSmoothFreeSpace", ss.getSmoothFreeSpace())
 				    .detail("TargetBytes", targetBytes)
 				    .detail("LimitsStorageTargetBytes", limits->storageTargetBytes)
 				    .detail("MinFreeSpace", minFreeSpace);
@ -744,7 +743,7 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 			addActor.send(tagThrottler->tryUpdateAutoThrottling(ss));
 		}

-		double inputRate = ss.smoothInputBytes.smoothRate();
+		double inputRate = ss.getSmoothInputBytesRate();
 		// inputRate = std::max( inputRate, actualTps / SERVER_KNOBS->MAX_TRANSACTIONS_PER_BYTE );

 		/*if( deterministicRandom()->random01() < 0.1 ) {
@ -753,15 +752,15 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 		  .detail("MinFreeSpace", minFreeSpace)
 		  .detail("SpringBytes", springBytes)
 		  .detail("TargetBytes", targetBytes)
-		  .detail("SmoothTotalSpaceTotal", ss.smoothTotalSpace.smoothTotal())
-		  .detail("SmoothFreeSpaceTotal", ss.smoothFreeSpace.smoothTotal())
+		  .detail("SmoothTotalSpaceTotal", ss.getSmoothTotalSpace())
+		  .detail("SmoothFreeSpaceTotal", ss.getSmoothFreeSpace())
 		  .detail("LastReplyBytesInput", ss.lastReply.bytesInput)
-		  .detail("SmoothDurableBytesTotal", ss.smoothDurableBytes.smoothTotal())
+		  .detail("SmoothDurableBytesTotal", ss.getSmoothDurableBytes())
 		  .detail("TargetRateRatio", targetRateRatio)
-		  .detail("SmoothInputBytesRate", ss.smoothInputBytes.smoothRate())
+		  .detail("SmoothInputBytesRate", ss.getSmoothInputBytesRate())
 		  .detail("ActualTPS", actualTps)
 		  .detail("InputRate", inputRate)
-		  .detail("VerySmoothDurableBytesRate", ss.verySmoothDurableBytes.smoothRate())
+		  .detail("VerySmoothDurableBytesRate", ss.getVerySmoothDurableBytesRate())
 		  .detail("B", b);
 		  }*/

@ -777,7 +776,7 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 		if (targetRateRatio > 0 && inputRate > 0) {
 			ASSERT(inputRate != 0);
 			double smoothedRate =
-			    std::max(ss.verySmoothDurableBytes.smoothRate(), actualTps / SERVER_KNOBS->MAX_TRANSACTIONS_PER_BYTE);
+			    std::max(ss.getVerySmoothDurableBytesRate(), actualTps / SERVER_KNOBS->MAX_TRANSACTIONS_PER_BYTE);
 			double x = smoothedRate / (inputRate * targetRateRatio);
 			double lim = actualTps * x;
 			if (lim < limitTps) {
@ -790,17 +789,17 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 						    .detail("Reason", limitReason_t::storage_server_write_queue_size)
 						    .detail("FromReason", ssLimitReason)
 						    .detail("SSID", ss.id)
-						    .detail("SSSmoothTotalSpace", ss.smoothTotalSpace.smoothTotal())
+						    .detail("SSSmoothTotalSpace", ss.getSmoothTotalSpace())
 						    .detail("LimitsStorageTargetBytes", limits->storageTargetBytes)
 						    .detail("LimitsStorageSpringBytes", limits->storageSpringBytes)
-						    .detail("SSSmoothFreeSpace", ss.smoothFreeSpace.smoothTotal())
+						    .detail("SSSmoothFreeSpace", ss.getSmoothFreeSpace())
 						    .detail("MinFreeSpace", minFreeSpace)
 						    .detail("SSLastReplyBytesInput", ss.lastReply.bytesInput)
-						    .detail("SSSmoothDurableBytes", ss.smoothDurableBytes.smoothTotal())
+						    .detail("SSSmoothDurableBytes", ss.getSmoothDurableBytes())
 						    .detail("StorageQueue", storageQueue)
 						    .detail("TargetBytes", targetBytes)
 						    .detail("SpringBytes", springBytes)
-						    .detail("SSVerySmoothDurableBytesSmoothRate", ss.verySmoothDurableBytes.smoothRate())
+						    .detail("SSVerySmoothDurableBytesRate", ss.getVerySmoothDurableBytesRate())
 						    .detail("SmoothedRate", smoothedRate)
 						    .detail("X", x)
 						    .detail("ActualTps", actualTps)
@ -839,8 +838,7 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 			continue;
 		}

-		limitingStorageQueueStorageServer =
-		    ss->second->lastReply.bytesInput - ss->second->smoothDurableBytes.smoothTotal();
+		limitingStorageQueueStorageServer = ss->second->lastReply.bytesInput - ss->second->getSmoothDurableBytes();
 		limits->tpsLimit = ss->first;
 		reasonID = storageTpsLimitReverseIndex.begin()->second->id; // Although we aren't controlling based on the worst
 		// SS, we still report it as the limiting process
@ -1086,16 +1084,15 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {

 		limitReason_t tlogLimitReason = limitReason_t::log_server_write_queue;

-		int64_t minFreeSpace =
-		    std::max(SERVER_KNOBS->MIN_AVAILABLE_SPACE,
-		             (int64_t)(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO * tl.smoothTotalSpace.smoothTotal()));
+		int64_t minFreeSpace = std::max(SERVER_KNOBS->MIN_AVAILABLE_SPACE,
+		                                (int64_t)(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO * tl.getSmoothTotalSpace()));

-		worstFreeSpaceTLog = std::min(worstFreeSpaceTLog, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace);
+		worstFreeSpaceTLog = std::min(worstFreeSpaceTLog, (int64_t)tl.getSmoothFreeSpace() - minFreeSpace);

 		int64_t springBytes = std::max<int64_t>(
-		    1, std::min<int64_t>(limits->logSpringBytes, (tl.smoothFreeSpace.smoothTotal() - minFreeSpace) * 0.2));
-		int64_t targetBytes = std::max<int64_t>(
-		    1, std::min(limits->logTargetBytes, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace));
+		    1, std::min<int64_t>(limits->logSpringBytes, (tl.getSmoothFreeSpace() - minFreeSpace) * 0.2));
+		int64_t targetBytes =
+		    std::max<int64_t>(1, std::min(limits->logTargetBytes, (int64_t)tl.getSmoothFreeSpace() - minFreeSpace));
 		if (targetBytes != limits->logTargetBytes) {
 			if (minFreeSpace == SERVER_KNOBS->MIN_AVAILABLE_SPACE) {
 				tlogLimitReason = limitReason_t::log_server_min_free_space;
@ -1106,15 +1103,15 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 				TraceEvent("RatekeeperLimitReasonDetails")
 				    .detail("TLogID", tl.id)
 				    .detail("Reason", tlogLimitReason)
-				    .detail("TLSmoothFreeSpace", tl.smoothFreeSpace.smoothTotal())
-				    .detail("TLSmoothTotalSpace", tl.smoothTotalSpace.smoothTotal())
+				    .detail("TLSmoothFreeSpace", tl.getSmoothFreeSpace())
+				    .detail("TLSmoothTotalSpace", tl.getSmoothTotalSpace())
 				    .detail("LimitsLogTargetBytes", limits->logTargetBytes)
 				    .detail("TargetBytes", targetBytes)
 				    .detail("MinFreeSpace", minFreeSpace);
 			}
 		}

-		int64_t queue = tl.lastReply.bytesInput - tl.smoothDurableBytes.smoothTotal();
+		int64_t queue = tl.lastReply.bytesInput - tl.getSmoothDurableBytes();
 		healthMetrics.tLogQueue[tl.id] = queue;
 		int64_t b = queue - targetBytes;
 		worstStorageQueueTLog = std::max(worstStorageQueueTLog, queue);
@ -1136,14 +1133,14 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 				TraceEvent("RatekeeperLimitReasonDetails")
 				    .detail("TLogID", tl.id)
 				    .detail("Reason", limitReason_t::storage_server_readable_behind)
-				    .detail("TLSmoothFreeSpace", tl.smoothFreeSpace.smoothTotal())
-				    .detail("TLSmoothTotalSpace", tl.smoothTotalSpace.smoothTotal())
+				    .detail("TLSmoothFreeSpace", tl.getSmoothFreeSpace())
+				    .detail("TLSmoothTotalSpace", tl.getSmoothTotalSpace())
 				    .detail("LimitsLogSpringBytes", limits->logSpringBytes)
 				    .detail("LimitsLogTargetBytes", limits->logTargetBytes)
 				    .detail("SpringBytes", springBytes)
 				    .detail("TargetBytes", targetBytes)
 				    .detail("TLLastReplyBytesInput", tl.lastReply.bytesInput)
-				    .detail("TLSmoothDurableBytes", tl.smoothDurableBytes.smoothTotal())
+				    .detail("TLSmoothDurableBytes", tl.getSmoothDurableBytes())
 				    .detail("Queue", queue)
 				    .detail("B", b)
 				    .detail("TargetRateRatio", targetRateRatio)
@ -1155,11 +1152,11 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 			tlogLimitReason = limitReason_t::storage_server_readable_behind;
 		}

-		double inputRate = tl.smoothInputBytes.smoothRate();
+		double inputRate = tl.getSmoothInputBytesRate();

 		if (targetRateRatio > 0) {
 			double smoothedRate =
-			    std::max(tl.verySmoothDurableBytes.smoothRate(), actualTps / SERVER_KNOBS->MAX_TRANSACTIONS_PER_BYTE);
+			    std::max(tl.getVerySmoothDurableBytesRate(), actualTps / SERVER_KNOBS->MAX_TRANSACTIONS_PER_BYTE);
 			double x = smoothedRate / (inputRate * targetRateRatio);
 			if (targetRateRatio < .75) //< FIXME: KNOB for 2.0
 				x = std::max(x, 0.95);
@ -1184,8 +1181,8 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) {
 					    .detail("Reason", limitReason_t::log_server_mvcc_write_bandwidth)
 					    .detail("TLogID", tl.id)
 					    .detail("MinFreeSpace", minFreeSpace)
-					    .detail("TLSmoothFreeSpace", tl.smoothFreeSpace.smoothTotal())
-					    .detail("TLSmoothTotalSpace", tl.smoothTotalSpace.smoothTotal())
+					    .detail("TLSmoothFreeSpace", tl.getSmoothFreeSpace())
+					    .detail("TLSmoothTotalSpace", tl.getSmoothTotalSpace())
 					    .detail("LimitsLogSpringBytes", limits->logSpringBytes)
 					    .detail("LimitsLogTargetBytes", limits->logTargetBytes)
 					    .detail("SpringBytes", springBytes)
@ -1366,7 +1363,7 @@ UpdateCommitCostRequest StorageQueueInfo::refreshCommitCost(double elapsed) {
 	return updateCommitCostRequest;
 }

-Optional<double> StorageQueueInfo::getThrottlingRatio(int64_t storageTargetBytes, int64_t storageSpringBytes) const {
+Optional<double> StorageQueueInfo::getTagThrottlingRatio(int64_t storageTargetBytes, int64_t storageSpringBytes) const {
 	auto const storageQueue = getStorageQueueBytes();
 	if (storageQueue < storageTargetBytes - storageSpringBytes) {
 		return {};
--- a/fdbserver/Resolver.actor.cpp
+++ b/fdbserver/Resolver.actor.cpp
@ -205,6 +205,17 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
 	    req.prevVersion >= 0 ? req.reply.getEndpoint().getPrimaryAddress() : NetworkAddress();
 	state ProxyRequestsInfo& proxyInfo = self->proxyInfoMap[proxyAddress];

+	state std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys;
+	if (isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
+		static const std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> metadataDomains = {
+			{ SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME },
+			{ ENCRYPT_HEADER_DOMAIN_ID, FDB_ENCRYPT_HEADER_DOMAIN_NAME }
+		};
+		std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cks =
+		    wait(getLatestEncryptCipherKeys(db, metadataDomains, BlobCipherMetrics::TLOG));
+		cipherKeys = cks;
+	}
+
 	++self->resolveBatchIn;

 	if (req.debugID.present()) {
@ -351,7 +362,11 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
 				SpanContext spanContext =
 				    req.transactions[t].spanContext.present() ? req.transactions[t].spanContext.get() : SpanContext();

-				applyMetadataMutations(spanContext, *resolverData, req.transactions[t].mutations);
+				applyMetadataMutations(spanContext,
+				                       *resolverData,
+				                       req.transactions[t].mutations,
+				                       isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) ? &cipherKeys
+				                                                                                      : nullptr);
 			}
 			CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery");
 		}
@ -506,8 +521,10 @@ struct TransactionStateResolveContext {
 	}
 };

-ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolveContext* pContext,
-                                                          Reference<AsyncVar<ServerDBInfo> const> db) {
+ACTOR Future<Void> processCompleteTransactionStateRequest(
+    TransactionStateResolveContext* pContext,
+    Reference<AsyncVar<ServerDBInfo> const> db,
+    std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* cipherKeys) {
 	state KeyRange txnKeys = allKeys;
 	state std::map<Tag, UID> tag_uid;

@ -574,7 +591,7 @@ ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolv
 		bool confChanges; // Ignore configuration changes for initial commits.
 		ResolverData resolverData(
 		    pContext->pResolverData->dbgid, pContext->pTxnStateStore, &pContext->pResolverData->keyInfo, confChanges);
-		applyMetadataMutations(SpanContext(), resolverData, mutations);
+		applyMetadataMutations(SpanContext(), resolverData, mutations, cipherKeys);
 	} // loop

 	auto lockedKey = pContext->pTxnStateStore->readValue(databaseLockedKey).get();
@ -615,7 +632,18 @@ ACTOR Future<Void> processTransactionStateRequestPart(TransactionStateResolveCon
 	if (pContext->receivedSequences.size() == pContext->maxSequence) {
 		// Received all components of the txnStateRequest
 		ASSERT(!pContext->processed);
-		wait(processCompleteTransactionStateRequest(pContext, db));
+		state std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys;
+		if (isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
+			static const std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> metadataDomains = {
+				{ SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME },
+				{ ENCRYPT_HEADER_DOMAIN_ID, FDB_ENCRYPT_HEADER_DOMAIN_NAME }
+			};
+			std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cks =
+			    wait(getLatestEncryptCipherKeys(db, metadataDomains, BlobCipherMetrics::TLOG));
+			cipherKeys = cks;
+		}
+		wait(processCompleteTransactionStateRequest(
+		    pContext, db, isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) ? &cipherKeys : nullptr));
 		pContext->processed = true;
 	}

--- a/fdbserver/RocksDBLogForwarder.actor.cpp
+++ b/fdbserver/RocksDBLogForwarder.actor.cpp
@ -0,0 +1,147 @@
+/*
+ * RocksDBLogForwarder.actor.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef SSD_ROCKSDB_EXPERIMENTAL
+#include "fdbserver/RocksDBLogForwarder.h"
+
+#include "flow/network.h"
+#include "flow/Trace.h"
+#include "fdbrpc/simulator.h"
+
+#include "flow/actorcompiler.h" // This must be the last include file
+
+using InfoLogLevel = rocksdb::InfoLogLevel;
+
+namespace {
+
+// Translates a RocksDB info-log level into the trace severity used for the forwarded event.
+Severity getSeverityFromLogLevel(const InfoLogLevel& log_level) {
+	switch (log_level) {
+	case InfoLogLevel::HEADER_LEVEL:
+		return SevVerbose;
+	case InfoLogLevel::DEBUG_LEVEL:
+		return SevDebug;
+	case InfoLogLevel::INFO_LEVEL:
+		return SevInfo;
+	case InfoLogLevel::WARN_LEVEL:
+		return SevWarn;
+	case InfoLogLevel::ERROR_LEVEL:
+	case InfoLogLevel::FATAL_LEVEL: // fatal shares the error severity
+		return SevError;
+	case InfoLogLevel::NUM_INFO_LOG_LEVELS: // never expected as an actual level
+		ASSERT(false);
+	}
+	UNREACHABLE();
+}
+
+} // namespace
+
+namespace details {
+
+// Emits a single RocksDB log record as a "RocksDBLogRecord" TraceEvent.
+void logTraceEvent(const RocksDBLogRecord& record) {
+	TraceEvent event(record.severity, "RocksDBLogRecord", record.uid);
+	event.detail("RocksDBLogTime", record.logReceiveTime);
+
+	// The thread id is streamed into a string before it is attached as a detail.
+	std::stringstream threadIdText;
+	threadIdText << record.threadID;
+	event.detail("RocksDBThreadID", threadIdText.str());
+	// Each key/value pair of the record becomes its own detail field.
+	for (const auto& kv : record.kvPairs) {
+		event.detail(kv.first.c_str(), kv.second);
+	}
+}
+
+ACTOR Future<Void> rocksDBPeriodicallyLogger(RocksDBLogger* pRecords) {
+	loop choose {
+		when(wait(delay(0.1))) { pRecords->consume(); } // every 0.1 s: emit the records cached so far
+	}
+} // the loop itself contains no break or return
+
+RocksDBLogger::RocksDBLogger() // takes the constructing thread as "main" and starts the periodic logger actor
+  : mainThreadId(std::this_thread::get_id()), periodicLogger(rocksDBPeriodicallyLogger(this)) {}
+
+// Hands one record to the logger. On the main thread the record is traced immediately, preceded by anything
+// other threads have cached; on any other thread it is only cached for a later consume().
+void RocksDBLogger::inject(RocksDBLogRecord&& record) {
+	if (std::this_thread::get_id() == mainThreadId) {
+		// Drain the cache first so that records cached earlier are traced before this newer one.
+		consume();
+		// In the main thread, it is *NOT* necessary to cache the record.
+		logTraceEvent(record);
+	} else {
+		const std::lock_guard<std::mutex> lockGuard(recordsMutex);
+		// Move straight into the vector instead of default-constructing an element and move-assigning it.
+		logRecords.push_back(std::move(record));
+	}
+}
+
+void RocksDBLogger::consume() {
+	std::vector<RocksDBLogRecord> drained;
+	{
+		// Hold the mutex only for the swap; the drained records are traced after it is released.
+		const std::lock_guard<std::mutex> lockGuard(recordsMutex);
+		logRecords.swap(drained);
+	}
+	for (const RocksDBLogRecord& entry : drained) {
+		logTraceEvent(entry);
+	}
+}
+
+} // namespace details
+
+RocksDBLogForwarder::RocksDBLogForwarder(const UID& id_, const InfoLogLevel log_level)
+  : rocksdb::Logger(log_level), id(id_), logger() {
+	TraceEvent(SevInfo, "RocksDBLoggerStart", id); // marks the creation of the forwarder with this id
+}
+
+RocksDBLogForwarder::~RocksDBLogForwarder() {
+	TraceEvent(SevInfo, "RocksDBLoggerStop", id); // marks the destruction of the forwarder with this id
+}
+
+void RocksDBLogForwarder::Logv(const char* format, va_list ap) {
+	Logv(InfoLogLevel::INFO_LEVEL, format, ap); // lines without an explicit level are forwarded as INFO_LEVEL
+}
+
+void RocksDBLogForwarder::Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
+	const std::thread::id threadID = std::this_thread::get_id();
+	// FIXME: Restrict the forwarded severity to at most SevWarn in order to keep almost all simulation tests from
+	// failing. This has to be reconsidered.
+	const Severity severity = std::min(getSeverityFromLogLevel(log_level), SevWarn);
+
+	// TODO: Parse the log information into KV pairs. At this stage vsnprintf is used; overlong text is truncated.
+	char buf[1024];
+	if (vsnprintf(buf, sizeof(buf), format, ap) < 0) {
+		buf[0] = '\0'; // formatting failed: buf holds no valid string, so forward an empty text instead
+	}
+	if (severity < SevError) {
+		logger.inject(details::RocksDBLogRecord{ now(), severity, id, threadID, { { "Text", std::string(buf) } } });
+	} else { // NOTE(review): looks unreachable while the SevWarn cap above is in place -- confirm
+		logger.inject(details::RocksDBLogRecord{
+		    now(),
+		    severity,
+		    id,
+		    threadID,
+		    { { "Text", std::string(buf) }, { "OriginalBacktrace", platform::get_backtrace() } } });
+	}
+}
+
+#endif // SSD_ROCKSDB_EXPERIMENTAL
--- a/fdbserver/TenantCache.actor.cpp
+++ b/fdbserver/TenantCache.actor.cpp
@ -137,8 +137,13 @@ public:
 						tenantCache->tenantStorageMap[tenants[i]].usage = size;
 						break;
 					} catch (Error& e) {
-						TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
-						wait(tr.onError(e));
+						if (e.code() == error_code_tenant_not_found) {
+							tenantCache->tenantStorageMap.erase(tenants[i]);
+							break;
+						} else {
+							TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
+							wait(tr.onError(e));
+						}
 					}
 				}
 			}
--- a/fdbserver/TransactionTagCounter.cpp
+++ b/fdbserver/TransactionTagCounter.cpp
@ -18,6 +18,7 @@
 * limitations under the License.
 */

+#include "fdbclient/NativeAPI.actor.h"
 #include "fdbserver/Knobs.h"
 #include "fdbserver/TransactionTagCounter.h"
 #include "flow/Trace.h"
@ -90,9 +91,6 @@ class TransactionTagCounterImpl {
 	std::vector<StorageQueuingMetricsReply::TagInfo> previousBusiestTags;
 	Reference<EventCacheHolder> busiestReadTagEventHolder;

-	// Round up to the nearest page size
-	static int64_t costFunction(int64_t bytes) { return (bytes - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1; }
-
 public:
 	TransactionTagCounterImpl(UID thisServerID)
 	  : thisServerID(thisServerID), topTags(SERVER_KNOBS->SS_THROTTLE_TAGS_TRACKED),
@ -101,7 +99,7 @@ public:
 	void addRequest(Optional<TagSet> const& tags, int64_t bytes) {
 		if (tags.present()) {
 			CODE_PROBE(true, "Tracking transaction tag in counter");
-			double cost = costFunction(bytes);
+			auto const cost = getReadOperationCost(bytes);
 			for (auto& tag : tags.get()) {
 				int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())];
 				topTags.incrementCount(tag, count, cost);
--- a/fdbserver/include/fdbserver/ApplyMetadataMutation.h
+++ b/fdbserver/include/fdbserver/ApplyMetadataMutation.h
@ -144,6 +144,7 @@ inline bool containsMetadataMutation(const VectorRef<MutationRef>& mutations) {
 // Resolver's version
 void applyMetadataMutations(SpanContext const& spanContext,
                            ResolverData& resolverData,
-                            const VectorRef<MutationRef>& mutations);
+                            const VectorRef<MutationRef>& mutations,
+                            const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* pCipherKeys);

 #endif
--- a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h
+++ b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h
@ -1,5 +1,5 @@
 /*
- * GrvProxyTransactionTagThrottler.h
+ * GrvProxyTagThrottler.h
 *
 * This source file is part of the FoundationDB open source project
 *
@ -23,8 +23,9 @@
 #include "fdbclient/CommitProxyInterface.h"
 #include "fdbclient/TagThrottle.actor.h"
 #include "fdbserver/GrvTransactionRateInfo.h"
+#include "fdbserver/LatencyBandsMap.h"

-// GrvProxyTransactionTagThrottler is used to throttle GetReadVersionRequests based on tag quotas
+// GrvProxyTagThrottler is used to throttle GetReadVersionRequests based on tag quotas
 // before they're pushed into priority-partitioned queues.
 //
 // A GrvTransactionRateInfo object and a request queue are maintained for each tag.
@ -33,7 +34,7 @@
 // Between each set of waits, releaseTransactions is run, releasing queued transactions
 // that have passed the tag throttling stage. Transactions that are not yet ready
 // are requeued during releaseTransactions.
-class GrvProxyTransactionTagThrottler {
+class GrvProxyTagThrottler {
 	class DelayedRequest {
 		static uint64_t lastSequenceNumber;
 		double startTime;
@ -45,8 +46,8 @@ class GrvProxyTransactionTagThrottler {
 		explicit DelayedRequest(GetReadVersionRequest const& req)
 		  : req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {}

-		void updateProxyTagThrottledDuration();
-		bool isMaxThrottled() const;
+		void updateProxyTagThrottledDuration(LatencyBandsMap&);
+		bool isMaxThrottled(double maxThrottleDuration) const;
 	};

 	struct TagQueue {
@ -57,14 +58,20 @@ class GrvProxyTransactionTagThrottler {
 		explicit TagQueue(double rate) : rateInfo(rate) {}

 		void setRate(double rate);
-		bool isMaxThrottled() const;
-		void rejectRequests();
+		bool isMaxThrottled(double maxThrottleDuration) const;
+		void rejectRequests(LatencyBandsMap&);
 	};

 	// Track the budgets for each tag
 	TransactionTagMap<TagQueue> queues;
+	double maxThrottleDuration;
+
+	// Track latency bands for each tag
+	LatencyBandsMap latencyBandsMap;

 public:
+	explicit GrvProxyTagThrottler(double maxThrottleDuration);
+
 	// Called with rates received from ratekeeper
 	void updateRates(TransactionTagMap<double> const& newRates);

@ -77,7 +84,9 @@ public:

 	void addRequest(GetReadVersionRequest const&);

+	void addLatencyBandThreshold(double value);
+
 public: // testing
 	// Returns number of tags tracked
-	uint32_t size();
+	uint32_t size() const;
 };
--- a/fdbserver/include/fdbserver/IPageEncryptionKeyProvider.actor.h
+++ b/fdbserver/include/fdbserver/IPageEncryptionKeyProvider.actor.h
@ -392,7 +392,7 @@ private:
 			auto view = tenantPrefixIndex->atLatest();
 			auto itr = view.find(prefix);
 			if (itr != view.end()) {
-				return *itr;
+				return itr->get();
 			}
 		}
 		TraceEvent(SevWarn, "TenantAwareEncryptionKeyProvider_TenantNotFoundForDomain").detail("DomainId", domainId);
--- a/fdbserver/include/fdbserver/LatencyBandsMap.h
+++ b/fdbserver/include/fdbserver/LatencyBandsMap.h
@ -0,0 +1,60 @@
+/*
+ * LatencyBandsMap.h
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "fdbclient/TagThrottle.actor.h"
+#include "fdbrpc/Stats.h"
+#include "fdbserver/Knobs.h"
+
+class LatencyBandsMap {
+	friend class LatencyBandsMapImpl;
+
+	std::string name;
+	UID id;
+	double loggingInterval;
+	int maxSize; // NOTE(review): presumably the cap on the number of entries in `map` -- confirm
+	Future<Void> expireOldTags; // NOTE(review): presumably the actor that evicts stale entries of `map` -- confirm
+
+	struct ExpirableBands {
+		LatencyBands latencyBands;
+		double lastUpdated;
+
+		explicit ExpirableBands(LatencyBands&&);
+	};
+
+	TransactionTagMap<ExpirableBands> map; // latency bands plus their last-update time, one entry per key
+	// Manually added thresholds (does not include "infinite" threshold automatically
+	// added by LatencyBands)
+	std::vector<double> thresholds;
+
+	// Get or create a LatencyBands object stored in map.
+	// Updates the lastUpdated field corresponding to this LatencyBands object.
+	// Returns pointer to this object, or an empty optional if object
+	// cannot be created.
+	Optional<LatencyBands*> getLatencyBands(TransactionTag tag);
+
+public:
+	LatencyBandsMap(std::string const& name, UID id, double loggingInterval, int maxSize);
+
+	void addMeasurement(TransactionTag tag, double measurement, int count = 1);
+	void addThreshold(double value);
+	int size() const { return map.size(); } // number of entries currently stored in map
+};
--- a/fdbserver/include/fdbserver/ProxyCommitData.actor.h
+++ b/fdbserver/include/fdbserver/ProxyCommitData.actor.h
@ -174,7 +174,7 @@ struct ProxyCommitData {
 	UID dbgid;
 	int64_t commitBatchesMemBytesCount;
 	std::map<TenantName, TenantMapEntry> tenantMap;
-	std::unordered_map<int64_t, TenantName> tenantIdIndex;
+	std::unordered_map<int64_t, TenantNameUniqueSet> tenantIdIndex;
 	ProxyStats stats;
 	MasterInterface master;
 	std::vector<ResolverInterface> resolvers;
--- a/fdbserver/include/fdbserver/Ratekeeper.h
+++ b/fdbserver/include/fdbserver/Ratekeeper.h
@ -58,17 +58,19 @@ class StorageQueueInfo {
 	// refresh periodically
 	TransactionTagMap<TransactionCommitCostEstimation> tagCostEst;

+	UID ratekeeperID;
+	Smoother smoothFreeSpace, smoothTotalSpace;
+	Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
+
+	// Currently unused
+	Smoother smoothDurableVersion, smoothLatestVersion;
+
 public:
 	bool valid;
-	UID ratekeeperID;
 	UID id;
 	LocalityData locality;
 	StorageQueuingMetricsReply lastReply;
 	bool acceptingRequests;
-	Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
-	Smoother smoothDurableVersion, smoothLatestVersion;
-	Smoother smoothFreeSpace;
-	Smoother smoothTotalSpace;
 	limitReason_t limitReason;
 	std::vector<StorageQueuingMetricsReply::TagInfo> busiestReadTags, busiestWriteTags;

@ -81,18 +83,34 @@ public:
 	void update(StorageQueuingMetricsReply const&, Smoother& smoothTotalDurableBytes);
 	void addCommitCost(TransactionTagRef tagName, TransactionCommitCostEstimation const& cost);

+	// Accessor methods for Smoothers
+	double getSmoothFreeSpace() const { return smoothFreeSpace.smoothTotal(); }
+	double getSmoothTotalSpace() const { return smoothTotalSpace.smoothTotal(); }
+	double getSmoothDurableBytes() const { return smoothDurableBytes.smoothTotal(); }
+	double getSmoothInputBytesRate() const { return smoothInputBytes.smoothRate(); }
+	double getVerySmoothDurableBytesRate() const { return verySmoothDurableBytes.smoothRate(); }
+
 	// Determine the ratio (limit / current throughput) for throttling based on write queue size
-	Optional<double> getThrottlingRatio(int64_t storageTargetBytes, int64_t storageSpringBytes) const;
+	Optional<double> getTagThrottlingRatio(int64_t storageTargetBytes, int64_t storageSpringBytes) const;
 };

-struct TLogQueueInfo {
-	TLogQueuingMetricsReply lastReply;
-	bool valid;
-	UID id;
+class TLogQueueInfo {
 	Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
 	Smoother smoothFreeSpace;
 	Smoother smoothTotalSpace;

+public:
+	TLogQueuingMetricsReply lastReply;
+	bool valid;
+	UID id;
+
+	// Accessor methods for Smoothers
+	double getSmoothFreeSpace() const { return smoothFreeSpace.smoothTotal(); }
+	double getSmoothTotalSpace() const { return smoothTotalSpace.smoothTotal(); }
+	double getSmoothDurableBytes() const { return smoothDurableBytes.smoothTotal(); }
+	double getSmoothInputBytesRate() const { return smoothInputBytes.smoothRate(); }
+	double getVerySmoothDurableBytesRate() const { return verySmoothDurableBytes.smoothRate(); }
+
 	TLogQueueInfo(UID id);
 	Version getLastCommittedVersion() const { return lastReply.v; }
 	void update(TLogQueuingMetricsReply const& reply, Smoother& smoothTotalDurableBytes);
@ -225,4 +243,4 @@ public:
 	static Future<Void> run(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo> const> dbInfo);
 };

-#endif // FDBSERVER_RATEKEEPER_H
+#endif // FDBSERVER_RATEKEEPER_H
--- a/fdbserver/include/fdbserver/RocksDBLogForwarder.h
+++ b/fdbserver/include/fdbserver/RocksDBLogForwarder.h
@ -0,0 +1,107 @@
+/*
+ * RocksDBLogForwarder.h
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ROCKSDB_LOG_FORWARDER_H__
+#define __ROCKSDB_LOG_FORWARDER_H__
+
+#include <cstdarg>
+#include <thread>
+
+#ifdef SSD_ROCKSDB_EXPERIMENTAL
+#include <rocksdb/env.h>
+#endif
+
+#include "flow/genericactors.actor.h"
+#include "flow/IRandom.h"
+#include "flow/Trace.h"
+
+namespace details {
+
+// Stores a RocksDB log line, transformed into Key/Value pairs
+struct RocksDBLogRecord {
+	double logReceiveTime; // now() at the time Logv received the line
+	Severity severity; // trace severity the line is forwarded with
+	UID uid; // id of the RocksDBLogForwarder that created the record
+	std::thread::id threadID; // thread that called Logv
+	std::vector<std::pair<std::string, std::string>> kvPairs; // detail name/value pairs, e.g. { "Text", <line> }
+};
+
+// Stores RocksDB log lines for further consumption.
+// *NOTE* This logger *MUST* run in a thread that is able to generate TraceEvents, e.g. in the event loop thread.
+class RocksDBLogger {
+	// The mutex that protects log records, as RocksDB is multi-threaded
+	std::mutex recordsMutex;
+
+	// Main thread ID. TraceEvents are only triggered when on the main thread; records injected from any other
+	// thread are cached until they are consumed.
+	const std::thread::id mainThreadId;
+
+	// The cached log records that have not been consumed yet
+	std::vector<RocksDBLogRecord> logRecords;
+
+	// An ACTOR that logs the non-main thread data periodically
+	Future<Void> periodicLogger;
+
+public:
+	// Constructor; the constructing thread is taken as the main thread
+	RocksDBLogger();
+
+	// On the main thread, traces the record immediately; on any other thread, *moves* it to the internal records
+	void inject(RocksDBLogRecord&& record);
+
+	// Traces all the cached records as TraceEvents and empties the cache
+	void consume();
+};
+
+} // namespace details
+
+// A rocksdb::Logger that discards every log line.
+class NullRocksDBLogForwarder : public rocksdb::Logger {
+public:
+	// Both overloads are intentionally empty; `override` lets the compiler verify the signatures.
+	void Logv(const char*, va_list) override {}
+	void Logv(const rocksdb::InfoLogLevel, const char*, va_list) override {}
+};
+
+class RocksDBLogForwarder : public rocksdb::Logger {
+	// The ID of the RocksDB instance
+	const UID id;
+
+	// The cache that stores the logs from RocksDB
+	details::RocksDBLogger logger;
+
+public:
+	// Constructor
+	// id is the UID of the logger
+	// log_level specifies the log level
+	explicit RocksDBLogForwarder(const UID& id,
+	                             const rocksdb::InfoLogLevel log_level = rocksdb::InfoLogLevel::INFO_LEVEL);
+
+	// Destructor
+	~RocksDBLogForwarder() override;
+
+	// Forwards an entry to the logger at INFO_LEVEL
+	void Logv(const char* format, va_list ap) override;
+
+	// Forwards an entry to the logger, with a specified log level
+	void Logv(const rocksdb::InfoLogLevel log_level, const char* format, va_list ap) override;
+};
+
+#endif // __ROCKSDB_LOG_FORWARDER_H__
--- a/fdbserver/include/fdbserver/TagThrottler.h
+++ b/fdbserver/include/fdbserver/TagThrottler.h
@ -79,7 +79,7 @@ class GlobalTagThrottler : public ITagThrottler {
 	PImpl<class GlobalTagThrottlerImpl> impl;

 public:
-	GlobalTagThrottler(Database db, UID id);
+	GlobalTagThrottler(Database db, UID id, int maxFallingBehind);
 	~GlobalTagThrottler();

 	Future<Void> monitorThrottlingChanges() override;
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@ -2001,7 +2001,9 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
 		data->sendErrorWithPenalty(req.reply, e, data->getPenalty());
 	}

-	data->transactionTagCounter.addRequest(req.tags, resultSize);
+	// Key size is not included in "BytesQueried", but still contributes to cost,
+	// so it must be accounted for here.
+	data->transactionTagCounter.addRequest(req.tags, req.key.size() + resultSize);

 	++data->counters.finishedQueries;

@ -2011,7 +2013,7 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
 	if (data->latencyBandConfig.present()) {
 		int maxReadBytes =
 		    data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
-		data->counters.readLatencyBands.addMeasurement(duration, resultSize > maxReadBytes);
+		data->counters.readLatencyBands.addMeasurement(duration, 1, Filtered(resultSize > maxReadBytes));
 	}

 	return Void();
@ -3014,7 +3016,11 @@ ACTOR Future<Void> changeFeedStreamQ(StorageServer* data, ChangeFeedStreamReques
 			req.reply.setByteLimit(std::min((int64_t)req.replyBufferSize, SERVER_KNOBS->CHANGEFEEDSTREAM_LIMIT_BYTES));
 		}

-		wait(delay(0, TaskPriority::DefaultEndpoint));
+		// Change feeds that are not atLatest must have a lower priority than UpdateStorage to not starve it out, and
+		// change feed disk reads generally only happen on blob worker recovery or data movement, so they should be
+		// lower priority. AtLatest change feeds are triggered directly from the SS update loop with no waits, so they
+		// will still be low latency
+		wait(delay(0, TaskPriority::SSSpilledChangeFeedReply));

 		if (DEBUG_CF_TRACE) {
 			TraceEvent(SevDebug, "TraceChangeFeedStreamStart", data->thisServerID)
@ -3935,9 +3941,10 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
 		int maxSelectorOffset =
 		    data->latencyBandConfig.get().readConfig.maxKeySelectorOffset.orDefault(std::numeric_limits<int>::max());
 		data->counters.readLatencyBands.addMeasurement(duration,
-		                                               resultSize > maxReadBytes ||
-		                                                   abs(req.begin.offset) > maxSelectorOffset ||
-		                                                   abs(req.end.offset) > maxSelectorOffset);
+		                                               1,
+		                                               Filtered(resultSize > maxReadBytes ||
+		                                                        abs(req.begin.offset) > maxSelectorOffset ||
+		                                                        abs(req.end.offset) > maxSelectorOffset));
 	}

 	return Void();
@ -4722,7 +4729,9 @@ TEST_CASE("/fdbserver/storageserver/rangeIntersectsAnyTenant") {
 	TenantPrefixIndex index;
 	index.createNewVersion(1);
 	for (auto entry : entries) {
-		index.insert(entry.second.prefix, entry.first);
+		TenantNameUniqueSet nameSet;
+		nameSet.insert(entry.first);
+		index.insert(entry.second.prefix, nameSet);
 	}

 	// Before all tenants
@ -5017,9 +5026,10 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
 		int maxSelectorOffset =
 		    data->latencyBandConfig.get().readConfig.maxKeySelectorOffset.orDefault(std::numeric_limits<int>::max());
 		data->counters.readLatencyBands.addMeasurement(duration,
-		                                               resultSize > maxReadBytes ||
-		                                                   abs(req.begin.offset) > maxSelectorOffset ||
-		                                                   abs(req.end.offset) > maxSelectorOffset);
+		                                               1,
+		                                               Filtered(resultSize > maxReadBytes ||
+		                                                        abs(req.begin.offset) > maxSelectorOffset ||
+		                                                        abs(req.end.offset) > maxSelectorOffset));
 	}

 	return Void();
@ -5337,7 +5347,7 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
 		int maxSelectorOffset =
 		    data->latencyBandConfig.get().readConfig.maxKeySelectorOffset.orDefault(std::numeric_limits<int>::max());
 		data->counters.readLatencyBands.addMeasurement(
-		    duration, resultSize > maxReadBytes || abs(req.sel.offset) > maxSelectorOffset);
+		    duration, 1, Filtered(resultSize > maxReadBytes || abs(req.sel.offset) > maxSelectorOffset));
 	}

 	return Void();
@ -8365,7 +8375,16 @@ bool StorageServer::insertTenant(TenantNameRef tenantName, TenantMapEntry tenant
 		tenantPrefixIndex->createNewVersion(version);

 		tenantMap.insert(tenantName, tenantEntry);
-		tenantPrefixIndex->insert(tenantEntry.prefix, tenantName);
+
+		auto view = tenantPrefixIndex->at(version);
+		auto itr = view.find(tenantEntry.prefix);
+		TenantNameUniqueSet nameSet;
+		if (itr != view.end()) {
+			nameSet = *itr;
+		}
+
+		nameSet.insert(tenantName);
+		tenantPrefixIndex->insert(tenantEntry.prefix, nameSet);

 		TraceEvent("InsertTenant", thisServerID).detail("Tenant", tenantName).detail("Version", version);
 		return true;
@ -8389,9 +8408,17 @@ void StorageServer::clearTenants(TenantNameRef startTenant, TenantNameRef endTen

 		auto view = tenantMap.at(version);
 		for (auto itr = view.lower_bound(startTenant); itr != view.lower_bound(endTenant); ++itr) {
+			auto indexView = tenantPrefixIndex->at(version);
 			// Trigger any watches on the prefix associated with the tenant.
 			watches.triggerRange(itr->prefix, strinc(itr->prefix));
-			tenantPrefixIndex->erase(itr->prefix);
+			auto indexItr = indexView.find(itr->prefix);
+			ASSERT(indexItr != indexView.end());
+			TenantNameUniqueSet nameSet = *indexItr;
+			if (nameSet.remove(itr.key())) {
+				tenantPrefixIndex->erase(itr->prefix);
+			} else {
+				tenantPrefixIndex->insert(itr->prefix, nameSet);
+			}
 			TraceEvent("EraseTenant", thisServerID).detail("Tenant", itr.key()).detail("Version", version);
 		}

@ -8565,6 +8592,11 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
 				} else {
 					MutationRef msg;
 					cloneReader >> msg;
+					if (isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) && !msg.isEncrypted() &&
+					    !(isSingleKeyMutation((MutationRef::Type)msg.type) &&
+					      (backupLogKeys.contains(msg.param1) || (applyLogKeys.contains(msg.param1))))) {
+						ASSERT(false);
+					}
 					if (msg.isEncrypted()) {
 						if (!cipherKeys.present()) {
 							const BlobCipherEncryptHeader* header = msg.encryptionHeader();
@ -8718,6 +8750,11 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
 			} else {
 				MutationRef msg;
 				rd >> msg;
+				if (isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) && !msg.isEncrypted() &&
+				    !(isSingleKeyMutation((MutationRef::Type)msg.type) &&
+				      (backupLogKeys.contains(msg.param1) || (applyLogKeys.contains(msg.param1))))) {
+					ASSERT(false);
+				}
 				if (msg.isEncrypted()) {
 					ASSERT(cipherKeys.present());
 					msg = msg.decrypt(cipherKeys.get(), rd.arena(), BlobCipherMetrics::TLOG);
@ -9872,7 +9909,16 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
 		TenantMapEntry tenantEntry = TenantMapEntry::decode(result.value);

 		data->tenantMap.insert(tenantName, tenantEntry);
-		data->tenantPrefixIndex->insert(tenantEntry.prefix, tenantName);
+
+		auto view = data->tenantPrefixIndex->at(version);
+		auto itr = view.find(tenantEntry.prefix);
+		TenantNameUniqueSet nameSet;
+		if (itr != view.end()) {
+			nameSet = *itr;
+		}
+
+		nameSet.insert(tenantName);
+		data->tenantPrefixIndex->insert(tenantEntry.prefix, nameSet);

 		TraceEvent("RestoringTenant", data->thisServerID)
 		    .detail("Key", tenantMap[tenantMapLoc].key)
@ -10131,8 +10177,14 @@ Future<Void> StorageServerMetrics::waitMetrics(WaitMetricsRequest req, Future<Vo

 ACTOR Future<Void> waitMetricsTenantAware_internal(StorageServer* self, WaitMetricsRequest req) {
 	if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
-		wait(success(waitForVersionNoTooOld(self, latestVersion)));
-		Optional<TenantMapEntry> entry = self->getTenantEntry(latestVersion, req.tenantInfo.get());
+		state Version version = wait(waitForVersionNoTooOld(self, latestVersion));
+		state Optional<TenantMapEntry> entry;
+		try {
+			entry = self->getTenantEntry(version, req.tenantInfo.get());
+		} catch (Error& e) {
+			self->sendErrorWithPenalty(req.reply, e, self->getPenalty());
+			return Void();
+		}
 		Optional<Key> tenantPrefix = entry.map<Key>([](TenantMapEntry e) { return e.prefix; });
 		if (tenantPrefix.present()) {
 			req.keys = req.keys.withPrefix(tenantPrefix.get(), req.arena);
--- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp
+++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp
@ -84,7 +84,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
 			Key startKey(baseKey + "/");
 			Key endKey(baseKey + "/\xff");
 			self->keys.push_back_deep(self->keys.arena(), KeyRangeRef(startKey, endKey));
-			if (deterministicRandom()->random01() < 0.2) {
+			if (deterministicRandom()->random01() < 0.2 && !self->rwImpls.empty()) {
 				self->asyncReadImpls.push_back(std::make_shared<SKSCTestAsyncReadImpl>(KeyRangeRef(startKey, endKey)));
 				cx->specialKeySpace->registerKeyRange(SpecialKeySpace::MODULE::TESTONLY,
 				                                      SpecialKeySpace::IMPLTYPE::READONLY,
@ -106,6 +106,8 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
 				               Value(deterministicRandom()->randomAlphaNumeric(self->valBytes)));
 			}
 		}
+		ASSERT(rwImpls.size() > 0);
+
 		return Void();
 	}
 	ACTOR Future<Void> _start(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
@ -250,6 +252,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
 	}

 	KeyRange randomRWKeyRange() {
+		ASSERT(rwImpls.size() > 0);
 		Key prefix = rwImpls[deterministicRandom()->randomInt(0, rwImpls.size())]->getKeyRange().begin;
 		Key rkey1 = Key(deterministicRandom()->randomAlphaNumeric(deterministicRandom()->randomInt(0, keyBytes)))
 		                .withPrefix(prefix);
--- a/fdbserver/workloads/TransactionCost.actor.cpp
+++ b/fdbserver/workloads/TransactionCost.actor.cpp
@ -0,0 +1,323 @@
+/*
+ * TransactionCost.actor.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fdbclient/ReadYourWrites.h"
+#include "fdbserver/workloads/workloads.actor.h"
+#include "flow/actorcompiler.h"
+
+// Test workload that runs `iterations` randomly selected single-transaction tests, one after
+// another, and asserts after each commit that the transaction's getTotalCost() equals the cost
+// the test declares through expectedFinalCost().
+class TransactionCostWorkload : public TestWorkload {
+	// Number of tests run by start(); overwritten from the "iterations" option in the constructor
+	int iterations{ 1000 };
+	// Prepended to every key built by getKey(); set from the "prefix" option in the constructor
+	Key prefix;
+	// When true, runTest() attaches a debug ID to each test transaction; set from the "debug" option
+	bool debugTransactions{ false };
+
+	// Builds the key for a test: testNumber followed by index, each written through bigEndian64(),
+	// with the workload prefix prepended.
+	Key getKey(uint64_t testNumber, uint64_t index = 0) const {
+		BinaryWriter bw(Unversioned());
+		bw << bigEndian64(testNumber);
+		bw << bigEndian64(index);
+		return bw.toValue().withPrefix(prefix);
+	}
+
+	// Returns makeString(size); callers in this file only vary the size of the value.
+	static Value getValue(uint32_t size) { return makeString(size); }
+
+	// Debug ID for a test's transaction: both UID parts are testNumber << 32.
+	static UID getDebugID(uint64_t testNumber) { return UID(testNumber << 32, testNumber << 32); }
+
+	// Interface implemented by every test case. A test optionally prepares data in setup(),
+	// performs its operations on the supplied transaction in exec(), and reports the total cost
+	// that runTest() asserts after commit via expectedFinalCost().
+	class ITest {
+	protected:
+		// Identifies the test; used to derive its keys (getKey) and its debug ID (getDebugID)
+		uint64_t testNumber;
+		explicit ITest(uint64_t testNumber) : testNumber(testNumber) {}
+
+	public:
+		// Tags the transaction with the debug ID derived from testNumber
+		void debugTransaction(ReadYourWritesTransaction& tr) { tr.debugTransaction(getDebugID(testNumber)); }
+		// Default setup does nothing; tests that need pre-existing data override it
+		virtual Future<Void> setup(TransactionCostWorkload const& workload, Database const&) { return Void(); }
+		// Issues the test's operations on the transaction; runTest() commits afterwards
+		virtual Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction>) = 0;
+		// Value that runTest() compares against getTotalCost() after the commit
+		virtual int64_t expectedFinalCost() const = 0;
+		virtual ~ITest() = default;
+	};
+
+	// Reads the test's key without any setup step; expects a cost of READ_COST_BYTE_FACTOR.
+	class ReadEmptyTest : public ITest {
+	public:
+		explicit ReadEmptyTest(uint64_t testNumber) : ITest(testNumber) {}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			return success(tr->get(workload.getKey(testNumber)));
+		}
+
+		int64_t expectedFinalCost() const override { return CLIENT_KNOBS->READ_COST_BYTE_FACTOR; }
+	};
+
+	// Reads a single value whose size is READ_COST_BYTE_FACTOR (written during setup); expects a
+	// cost of 2 * READ_COST_BYTE_FACTOR.
+	class ReadLargeValueTest : public ITest {
+		// Writes the value in its own transaction, retrying through onError() until commit succeeds
+		ACTOR static Future<Void> setup(TransactionCostWorkload const* workload,
+		                                ReadLargeValueTest* self,
+		                                Database cx) {
+			state Transaction tr(cx);
+			loop {
+				try {
+					tr.set(workload->getKey(self->testNumber), getValue(CLIENT_KNOBS->READ_COST_BYTE_FACTOR));
+					wait(tr.commit());
+					return Void();
+				} catch (Error& e) {
+					wait(tr.onError(e));
+				}
+			}
+		}
+
+	public:
+		explicit ReadLargeValueTest(int64_t testNumber) : ITest(testNumber) {}
+
+		// Forwards to the static actor above
+		Future<Void> setup(TransactionCostWorkload const& workload, Database const& cx) override {
+			return setup(&workload, this, cx);
+		}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			return success(tr->get(workload.getKey(testNumber)));
+		}
+
+		int64_t expectedFinalCost() const override { return 2 * CLIENT_KNOBS->READ_COST_BYTE_FACTOR; }
+	};
+
+	// Sets one key to a value of size 20; expects a cost of
+	// GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * WRITE_COST_BYTE_FACTOR.
+	class WriteTest : public ITest {
+	public:
+		explicit WriteTest(int64_t testNumber) : ITest(testNumber) {}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			tr->set(workload.getKey(testNumber), getValue(20));
+			return Void();
+		}
+
+		int64_t expectedFinalCost() const override {
+			return CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
+		}
+	};
+
+	// Sets one key to a value of size WRITE_COST_BYTE_FACTOR; expects twice the cost of WriteTest.
+	class WriteLargeValueTest : public ITest {
+	public:
+		explicit WriteLargeValueTest(int64_t testNumber) : ITest(testNumber) {}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			tr->set(workload.getKey(testNumber), getValue(CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR));
+			return Void();
+		}
+
+		int64_t expectedFinalCost() const override {
+			return 2 * CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
+		}
+	};
+
+	// Sets ten keys (index 0 through 9) to values of size 20; expects ten times the cost of WriteTest.
+	class WriteMultipleValuesTest : public ITest {
+	public:
+		explicit WriteMultipleValuesTest(int64_t testNumber) : ITest(testNumber) {}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			for (int i = 0; i < 10; ++i) {
+				tr->set(workload.getKey(testNumber, i), getValue(20));
+			}
+			return Void();
+		}
+
+		int64_t expectedFinalCost() const override {
+			return 10 * CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
+		}
+	};
+
+	// Clears the single-key range for the test's key; expects a cost of WRITE_COST_BYTE_FACTOR.
+	// NOTE(review): unlike the write tests above, this expectation is not multiplied by
+	// GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO -- confirm that this is intended.
+	class ClearTest : public ITest {
+	public:
+		explicit ClearTest(int64_t testNumber) : ITest(testNumber) {}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			tr->clear(singleKeyRange(workload.getKey(testNumber)));
+			return Void();
+		}
+
+		int64_t expectedFinalCost() const override { return CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR; }
+	};
+
+	// Range-reads ten keys holding values of size 20 (written during setup); expects a cost of
+	// READ_COST_BYTE_FACTOR for the whole range read.
+	class ReadRangeTest : public ITest {
+		// Writes keys with index 0 through 9 in one transaction, retrying through onError()
+		ACTOR static Future<Void> setup(ReadRangeTest* self, TransactionCostWorkload const* workload, Database cx) {
+			state Transaction tr(cx);
+			loop {
+				try {
+					for (int i = 0; i < 10; ++i) {
+						tr.set(workload->getKey(self->testNumber, i), workload->getValue(20));
+					}
+					wait(tr.commit());
+					return Void();
+				} catch (Error& e) {
+					wait(tr.onError(e));
+				}
+			}
+		}
+
+	public:
+		explicit ReadRangeTest(int64_t testNumber) : ITest(testNumber) {}
+
+		// Forwards to the static actor above
+		Future<Void> setup(TransactionCostWorkload const& workload, Database const& cx) override {
+			return setup(this, &workload, cx);
+		}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			// Range from the key with index 0 up to the key with index 10, with a limit of 10
+			KeyRange const keys = KeyRangeRef(workload.getKey(testNumber, 0), workload.getKey(testNumber, 10));
+			return success(tr->getRange(keys, 10));
+		}
+
+		int64_t expectedFinalCost() const override { return CLIENT_KNOBS->READ_COST_BYTE_FACTOR; }
+	};
+
+	// Issues ten separate point reads of keys holding values of size 20 (written during setup);
+	// expects a cost of 10 * READ_COST_BYTE_FACTOR.
+	class ReadMultipleValuesTest : public ITest {
+		// Writes keys with index 0 through 9 in one transaction, retrying through onError()
+		ACTOR static Future<Void> setup(ReadMultipleValuesTest* self,
+		                                TransactionCostWorkload const* workload,
+		                                Database cx) {
+			state Transaction tr(cx);
+			loop {
+				try {
+					for (int i = 0; i < 10; ++i) {
+						tr.set(workload->getKey(self->testNumber, i), workload->getValue(20));
+					}
+					wait(tr.commit());
+					return Void();
+				} catch (Error& e) {
+					wait(tr.onError(e));
+				}
+			}
+		}
+
+	public:
+		explicit ReadMultipleValuesTest(int64_t testNumber) : ITest(testNumber) {}
+
+		// Forwards to the static actor above
+		Future<Void> setup(TransactionCostWorkload const& workload, Database const& cx) override {
+			return setup(this, &workload, cx);
+		}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			// Start all ten reads before waiting on any of them
+			std::vector<Future<Void>> futures;
+			for (int i = 0; i < 10; ++i) {
+				futures.push_back(success(tr->get(workload.getKey(testNumber, i))));
+			}
+			return waitForAll(futures);
+		}
+
+		int64_t expectedFinalCost() const override { return 10 * CLIENT_KNOBS->READ_COST_BYTE_FACTOR; }
+	};
+
+	// Range-reads ten keys whose values each have size READ_COST_BYTE_FACTOR (written during
+	// setup); expects a cost of 11 * READ_COST_BYTE_FACTOR.
+	class LargeReadRangeTest : public ITest {
+		// Writes keys with index 0 through 9 in one transaction, retrying through onError()
+		ACTOR static Future<Void> setup(LargeReadRangeTest* self,
+		                                TransactionCostWorkload const* workload,
+		                                Database cx) {
+			state Transaction tr(cx);
+			loop {
+				try {
+					for (int i = 0; i < 10; ++i) {
+						tr.set(workload->getKey(self->testNumber, i),
+						       workload->getValue(CLIENT_KNOBS->READ_COST_BYTE_FACTOR));
+					}
+					wait(tr.commit());
+					return Void();
+				} catch (Error& e) {
+					wait(tr.onError(e));
+				}
+			}
+		}
+
+	public:
+		explicit LargeReadRangeTest(int64_t testNumber) : ITest(testNumber) {}
+
+		// Forwards to the static actor above
+		Future<Void> setup(TransactionCostWorkload const& workload, Database const& cx) override {
+			return setup(this, &workload, cx);
+		}
+
+		Future<Void> exec(TransactionCostWorkload const& workload, Reference<ReadYourWritesTransaction> tr) override {
+			// Range from the key with index 0 up to the key with index 10, with a limit of 10
+			KeyRange const keys = KeyRangeRef(workload.getKey(testNumber, 0), workload.getKey(testNumber, 10));
+			return success(tr->getRange(keys, 10));
+		}
+
+		int64_t expectedFinalCost() const override { return 11 * CLIENT_KNOBS->READ_COST_BYTE_FACTOR; }
+	};
+
+	// Creates one test chosen by deterministicRandom()->randomInt(0, 9). Values 0 through 7 select
+	// the tests listed below in order; any other value selects LargeReadRangeTest.
+	static std::unique_ptr<ITest> createRandomTest(int64_t testNumber) {
+		auto const rand = deterministicRandom()->randomInt(0, 9);
+		if (rand == 0) {
+			return std::make_unique<ReadEmptyTest>(testNumber);
+		} else if (rand == 1) {
+			return std::make_unique<ReadLargeValueTest>(testNumber);
+		} else if (rand == 2) {
+			return std::make_unique<ReadMultipleValuesTest>(testNumber);
+		} else if (rand == 3) {
+			return std::make_unique<WriteTest>(testNumber);
+		} else if (rand == 4) {
+			return std::make_unique<WriteLargeValueTest>(testNumber);
+		} else if (rand == 5) {
+			return std::make_unique<WriteMultipleValuesTest>(testNumber);
+		} else if (rand == 6) {
+			return std::make_unique<ClearTest>(testNumber);
+		} else if (rand == 7) {
+			return std::make_unique<ReadRangeTest>(testNumber);
+		} else {
+			return std::make_unique<LargeReadRangeTest>(testNumber);
+		}
+	}
+
+	// Runs a single test: waits for its setup, executes it on a fresh ReadYourWritesTransaction,
+	// commits, and asserts that the transaction's total cost matches the test's expectation.
+	// Any error is passed to tr->onError() and the exec/commit/assert sequence is retried.
+	ACTOR static Future<Void> runTest(TransactionCostWorkload* self, Database cx, ITest* test) {
+		wait(test->setup(*self, cx));
+		state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
+		if (self->debugTransactions) {
+			test->debugTransaction(*tr);
+		}
+		loop {
+			try {
+				wait(test->exec(*self, tr));
+				wait(tr->commit());
+				ASSERT_EQ(tr->getTotalCost(), test->expectedFinalCost());
+				return Void();
+			} catch (Error& e) {
+				wait(tr->onError(e));
+			}
+		}
+	}
+
+	// Runs self->iterations tests strictly one after another; testNumber counts up from 0 and is
+	// handed to each test so that every test works on its own keys.
+	ACTOR static Future<Void> start(TransactionCostWorkload* self, Database cx) {
+		state uint64_t testNumber = 0;
+		// NOTE(review): `f` is not referenced anywhere else in this actor -- it looks unused
+		state Future<Void> f;
+		// Must use shared_ptr because Flow doesn't support perfect forwarding into actors
+		state std::shared_ptr<ITest> test;
+		for (; testNumber < self->iterations; ++testNumber) {
+			test = createRandomTest(testNumber);
+			wait(runTest(self, cx, test.get()));
+		}
+		return Void();
+	}
+
+public:
+	// Reads the "iterations", "prefix" and "debug" options, falling back to 1000,
+	// "transactionCost/" and false respectively.
+	TransactionCostWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
+		iterations = getOption(options, "iterations"_sr, 1000);
+		prefix = getOption(options, "prefix"_sr, "transactionCost/"_sr);
+		debugTransactions = getOption(options, "debug"_sr, false);
+	}
+
+	static constexpr auto NAME = "TransactionCost";
+
+	// No workload-level setup; each test prepares its own data inside runTest()
+	Future<Void> setup(Database const& cx) override { return Void(); }
+
+	// Only the client with clientId 0 runs the tests; every other client returns immediately
+	Future<Void> start(Database const& cx) override { return clientId ? Void() : start(this, cx); }
+
+	// Always reports success; the cost checks are the ASSERT_EQ calls in runTest()
+	Future<bool> check(Database const& cx) override { return true; }
+
+	// This workload reports no metrics
+	void getMetrics(std::vector<PerfMetric>& m) override {}
+};
+
+// File-scope factory instance for this workload.
+// NOTE(review): presumably this is what makes the workload selectable by its NAME from test files -- confirm.
+WorkloadFactory<TransactionCostWorkload> TransactionCostWorkloadFactory;
--- a/flow/ApiVersion.h.cmake
+++ b/flow/ApiVersion.h.cmake
@ -71,6 +71,7 @@ public: // introduced features
    API_VERSION_FEATURE(@FDB_AV_FUTURE_GET_BOOL@, FutureGetBool);
    API_VERSION_FEATURE(@FDB_AV_FUTURE_PROTOCOL_VERSION_API@, FutureProtocolVersionApi);
    API_VERSION_FEATURE(@FDB_AV_TENANT_BLOB_RANGE_API@, TenantBlobRangeApi);
+    API_VERSION_FEATURE(@FDB_AV_GET_TOTAL_COST@, GetTotalCost);
 };

 #endif // FLOW_CODE_API_VERSION_H
--- a/flow/ApiVersions.cmake
+++ b/flow/ApiVersions.cmake
@ -1,5 +1,5 @@
 # API Versions
-set(FDB_AV_LATEST_VERSION                   "720")
+set(FDB_AV_LATEST_VERSION                   "730")

 # Features
 set(FDB_AV_SNAPSHOT_RYW                     "300")
@ -11,4 +11,5 @@ set(FDB_AV_BLOB_RANGE_API                   "720")
 set(FDB_AV_CREATE_DB_FROM_CONN_STRING       "720")
 set(FDB_AV_FUTURE_GET_BOOL                  "720")
 set(FDB_AV_FUTURE_PROTOCOL_VERSION_API      "720")
-set(FDB_AV_TENANT_BLOB_RANGE_API            "720")
+set(FDB_AV_TENANT_BLOB_RANGE_API            "720")
+set(FDB_AV_GET_TOTAL_COST                   "730")
--- a/flow/Net2.actor.cpp
+++ b/flow/Net2.actor.cpp
@ -881,8 +881,8 @@ public:

 		try {
 			Future<Void> onHandshook;
-			ConfigureSSLStream(N2::g_net2->activeTlsPolicy, self->ssl_sock, [self = self](bool verifyOk) {
-				self->has_trusted_peer = verifyOk;
+			ConfigureSSLStream(N2::g_net2->activeTlsPolicy, self->ssl_sock, [conn = self.getPtr()](bool verifyOk) {
+				conn->has_trusted_peer = verifyOk;
 			});

 			// If the background handshakers are not all busy, use one
@ -959,8 +959,8 @@ public:

 		try {
 			Future<Void> onHandshook;
-			ConfigureSSLStream(N2::g_net2->activeTlsPolicy, self->ssl_sock, [self = self](bool verifyOk) {
-				self->has_trusted_peer = verifyOk;
+			ConfigureSSLStream(N2::g_net2->activeTlsPolicy, self->ssl_sock, [conn = self.getPtr()](bool verifyOk) {
+				conn->has_trusted_peer = verifyOk;
 			});

 			// If the background handshakers are not all busy, use one
--- a/flow/include/flow/PriorityMultiLock.actor.h
+++ b/flow/include/flow/PriorityMultiLock.actor.h
@ -42,21 +42,21 @@
 #endif

 // A multi user lock with a concurrent holder limit where waiters request a lock with a priority
-// id and are granted locks based on a total concurrency and relative importants of the priority
-// ids defined.
+// id and are granted locks based on a total concurrency and relative weights of the current active
+// priorities.  Priority ids must start at 0 and be sequential integers.
 //
 // Scheduling logic
 // Let
-// 	 launchLimits[n] = configured amount from the launchLimit vector for priority n
+// 	 weights[n] = configured weight for priority n
 //   waiters[n] = the number of waiters for priority n
 //   runnerCounts[n] = number of runners at priority n
 //
-//   totalActiveLaunchLimits = sum of limits for all priorities with waiters[n] > 0
-//   When waiters[n] becomes == 0, totalActiveLaunchLimits -= launchLimits[n]
-//   When waiters[n] becomes  > 0, totalActiveLaunchLimits += launchLimits[n]
+//   totalPendingWeights = sum of weights for all priorities with waiters[n] > 0
+//   When waiters[n] becomes == 0, totalPendingWeights -= weights[n]
+//   When waiters[n] becomes  > 0, totalPendingWeights += weights[n]
 //
 //   The total capacity of a priority to be considered when launching tasks is
-//     ceil(launchLimits[n] / totalLimits * concurrency)
+//     ceil(weights[n] / totalPendingWeights * concurrency)
 //
 // For improved memory locality the properties mentioned above are stored as priorities[n].<property>
 // in the actual implementation.
@ -80,15 +80,15 @@ public:
 		Promise<Void> promise;
 	};

-	PriorityMultiLock(int concurrency, std::string launchLimits)
-	  : PriorityMultiLock(concurrency, parseStringToVector<int>(launchLimits, ',')) {}
+	PriorityMultiLock(int concurrency, std::string weights)
+	  : PriorityMultiLock(concurrency, parseStringToVector<int>(weights, ',')) {}

-	PriorityMultiLock(int concurrency, std::vector<int> launchLimitsByPriority)
-	  : concurrency(concurrency), available(concurrency), waiting(0), totalActiveLaunchLimits(0), releaseDebugID(0) {
+	PriorityMultiLock(int concurrency, std::vector<int> weightsByPriority)
+	  : concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0), releaseDebugID(0) {

-		priorities.resize(launchLimitsByPriority.size());
+		priorities.resize(weightsByPriority.size());
 		for (int i = 0; i < priorities.size(); ++i) {
-			priorities[i].launchLimit = launchLimitsByPriority[i];
+			priorities[i].weight = weightsByPriority[i];
 		}

 		fRunner = runner(this);
@ -99,17 +99,16 @@ public:
 	Future<Lock> lock(int priority = 0) {
 		Priority& p = priorities[priority];
 		Queue& q = p.queue;
-		Waiter w;

 		// If this priority currently has no waiters
 		if (q.empty()) {
-			// Add this priority's launch limit to totalLimits
-			totalActiveLaunchLimits += p.launchLimit;
+			// Add this priority's weight to the total for priorities with pending work
+			totalPendingWeights += p.weight;

 			// If there are slots available and the priority has capacity then don't make the caller wait
-			if (available > 0 && p.runners < currentCapacity(p.launchLimit)) {
-				// Remove this priority's launch limit from the total since it will remain empty
-				totalActiveLaunchLimits -= p.launchLimit;
+			if (available > 0 && p.runners < currentCapacity(p.weight)) {
+				// Remove this priority's weight from the total since it will remain empty
+				totalPendingWeights -= p.weight;

 				// Return a Lock to the caller
 				Lock lock;
@ -119,6 +118,8 @@ public:
 				return lock;
 			}
 		}
+
+		Waiter w;
 		q.push_back(w);
 		++waiting;

@ -144,7 +145,7 @@ public:
 		}

 		std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d "
-		                       "runnersDone=%d activeLimits=%d ",
+		                       "runnersDone=%d pendingWeights=%d ",
 		                       this,
 		                       concurrency,
 		                       available,
@ -152,7 +153,7 @@ public:
 		                       waiting,
 		                       runners.size(),
 		                       runnersDone,
-		                       totalActiveLaunchLimits);
+		                       totalPendingWeights);

 		for (int i = 0; i < priorities.size(); ++i) {
 			s += format("p%d:{%s} ", i, priorities[i].toString(this).c_str());
@ -196,27 +197,27 @@ private:
 	int available;
 	// Total waiters across all priorities
 	int waiting;
-	// Sum of launch limits for all priorities with 1 or more waiters
-	int totalActiveLaunchLimits;
+	// Sum of weights for all priorities with 1 or more waiters
+	int totalPendingWeights;

 	typedef Deque<Waiter> Queue;

 	struct Priority {
-		Priority() : runners(0), launchLimit(0) {}
+		Priority() : runners(0), weight(0) {}

 		// Queue of waiters at this priority
 		Queue queue;
 		// Number of runners at this priority
 		int runners;
-		// Configured launch limit for this priority
-		int launchLimit;
+		// Configured weight for this priority
+		int weight;

 		std::string toString(const PriorityMultiLock* pml) const {
-			return format("limit=%d run=%d wait=%d cap=%d",
-			              launchLimit,
+			return format("weight=%d run=%d wait=%d cap=%d",
+			              weight,
 			              runners,
 			              queue.size(),
-			              queue.empty() ? 0 : pml->currentCapacity(launchLimit));
+			              queue.empty() ? 0 : pml->currentCapacity(weight));
 		}
 	};

@ -270,10 +271,10 @@ private:

 	// Current maximum running tasks for the specified priority, which must have waiters
 	// or the result is undefined
-	int currentCapacity(int launchLimit) const {
+	int currentCapacity(int weight) const {
 		// The total concurrency allowed for this priority at present is the total concurrency times
-		// priority's launch limit divided by the total launch limits for all priorities with waiters.
-		return ceil((float)launchLimit / totalActiveLaunchLimits * concurrency);
+		// priority's weight divided by the total weights for all priorities with waiters.
+		return ceil((float)weight / totalPendingWeights * concurrency);
 	}

 	ACTOR static Future<Void> runner(PriorityMultiLock* self) {
@ -329,8 +330,7 @@ private:
 					                 priority,
 					                 self->toString().c_str());

-					if (!pPriority->queue.empty() &&
-					    pPriority->runners < self->currentCapacity(pPriority->launchLimit)) {
+					if (!pPriority->queue.empty() && pPriority->runners < self->currentCapacity(pPriority->weight)) {
 						break;
 					}
 				}
@ -340,9 +340,9 @@ private:
 				Waiter w = queue.front();
 				queue.pop_front();

-				// If this priority is now empty, subtract its launch limit from totalLimits
+				// If this priority is now empty, subtract its weight from the total pending weights
 				if (queue.empty()) {
-					self->totalActiveLaunchLimits -= pPriority->launchLimit;
+					self->totalPendingWeights -= pPriority->weight;

 					pml_debug_printf("      emptied priority line %d  priority=%d  %s\n",
 					                 __LINE__,
--- a/flow/include/flow/error_definitions.h
+++ b/flow/include/flow/error_definitions.h
@ -133,6 +133,7 @@ ERROR( incompatible_software_version, 1220, "Current software does not support d
 ERROR( audit_storage_failed, 1221, "Validate storage consistency operation failed" )
 ERROR( audit_storage_exceeded_request_limit, 1222, "Exceeded the max number of allowed concurrent audit storage requests" )
 ERROR( proxy_tag_throttled, 1223, "Exceeded maximum proxy tag throttling duration" )
+ERROR( key_value_store_deadline_exceeded, 1224, "Exceeded maximum time allowed to read or write.")

 // 15xx Platform errors
 ERROR( platform_error, 1500, "Platform error" )
--- a/flow/include/flow/network.h
+++ b/flow/include/flow/network.h
@ -101,6 +101,7 @@ enum class TaskPriority {
 	UpdateStorage = 3000,
 	CompactCache = 2900,
 	TLogSpilledPeekReply = 2800,
+	SSSpilledChangeFeedReply = 2730,
 	BlobWorkerReadChangeFeed = 2720,
 	BlobWorkerUpdateFDB = 2710,
 	BlobWorkerUpdateStorage = 2700,
--- a/flowbench/BenchNet2.actor.cpp
+++ b/flowbench/BenchNet2.actor.cpp
@ -20,6 +20,7 @@

 #include "benchmark/benchmark.h"

+#include "flow/IRandom.h"
 #include "flow/flow.h"
 #include "flow/DeterministicRandom.h"
 #include "flow/network.h"
@ -61,3 +62,32 @@ static void bench_net2(benchmark::State& benchState) {
 }

 BENCHMARK(bench_net2)->Range(1, 1 << 16)->ReportAggregatesOnly(true);
+
+// Readable names for the bool template argument of benchDelay/bench_delay:
+// DELAY benchmarks wait(delay(0)), YIELD benchmarks wait(yield()).
+static constexpr bool DELAY = false;
+static constexpr bool YIELD = true;
+
+// Benchmark body: repeatedly waits on yield() (useYield == true) or delay(0) (useYield == false)
+// for as long as the benchmark state says to keep running, then reports the iteration count as
+// the number of items processed.
+ACTOR template <bool useYield>
+static Future<Void> benchDelay(benchmark::State* benchState) {
+	// Background delays started only to populate the run loop's priority queue while the
+	// benchmark runs. Because of the pre-decrement below, range(0) - 1 delays are started
+	// (none when range(0) is 0 or 1).
+	state int64_t timerCount = benchState->range(0);
+	// Holds the background delay futures for the lifetime of the actor
+	state std::vector<Future<Void>> futures;
+	state DeterministicRandom rand(platform::getRandomSeed());
+	while (--timerCount > 0) {
+		// Each background delay lasts 1.0 + random01() seconds at a random task priority
+		futures.push_back(delay(1.0 + rand.random01(), getRandomTaskPriority(rand)));
+	}
+
+	while (benchState->KeepRunning()) {
+		wait(useYield ? yield() : delay(0));
+	}
+	benchState->SetItemsProcessed(static_cast<long>(benchState->iterations()));
+	return Void();
+}
+
+// Benchmark entry point: starts benchDelay<useYield> through onMainThread() and blocks the
+// calling thread until the returned future is ready.
+template <bool useYield>
+static void bench_delay(benchmark::State& benchState) {
+	onMainThread([&benchState] { return benchDelay<useYield>(&benchState); }).blockUntilReady();
+}
+
+// Register both variants; the Range argument is what benchDelay reads back as range(0)
+BENCHMARK_TEMPLATE(bench_delay, DELAY)->Range(0, 1 << 16)->ReportAggregatesOnly(true);
+BENCHMARK_TEMPLATE(bench_delay, YIELD)->Range(0, 1 << 16)->ReportAggregatesOnly(true);
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -240,6 +240,7 @@ if(WITH_PYTHON)
  add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml)
  add_fdb_test(TEST_FILES rare/Throttling.toml)
  add_fdb_test(TEST_FILES rare/ThroughputQuota.toml)
+  add_fdb_test(TEST_FILES rare/TransactionCost.toml)
  add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml)
  add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml)
  add_fdb_test(TEST_FILES rare/WriteTagThrottling.toml)
--- a/tests/SpecificUnitTest.txt
+++ b/tests/SpecificUnitTest.txt
@ -1,6 +1,7 @@
 testTitle=UnitTests
 startDelay=0
 useDB=false
+runSetup=false

    testName=UnitTests
    maxTestCases=0
--- a/tests/rare/SpecificUnitTests.toml
+++ b/tests/rare/SpecificUnitTests.toml
@ -3,8 +3,9 @@
 testTitle = 'UnitTests'
 useDB = false
 startDelay = 0
+runSetup=false

    [[test.workload]]
    testName = 'UnitTests'
    maxTestCases = 1
-    testsMatching = '/'
+    testsMatching = 'noSim/'
--- a/tests/rare/ThroughputQuota.toml
+++ b/tests/rare/ThroughputQuota.toml
@ -6,8 +6,13 @@ testTitle='ThroughputQuotaTest'
  transactionTag='a'
  totalQuota=1.0

+  [[test.workload]]
+  testName='Status'
+  enableLatencyBands = true
+  testDuration = 60.0
+
  [[test.workload]]
  testName = 'Cycle'
  transactionsPerSecond = 2500.0
-  testDuration = 10.0
+  testDuration = 60.0
  expectedRate = 0
--- a/tests/rare/TransactionCost.toml
+++ b/tests/rare/TransactionCost.toml
@ -0,0 +1,6 @@
+[[test]]
+testTitle = 'TransactionCostTest'
+
+    [[test.workload]]
+    testName = 'TransactionCost'
+    iterations = 1000
--- a/tests/slow/ApiCorrectnessAtomicRestore.toml
+++ b/tests/slow/ApiCorrectnessAtomicRestore.toml
@ -1,3 +1,8 @@
+[[knobs]]
+rocksdb_read_value_timeout=300.0
+rocksdb_read_value_prefix_timeout=300.0
+rocksdb_read_range_timeout=300.0
+
 [[test]]
 testTitle = 'ApiCorrectnessTest'
 clearAfterTest = false