Merge branch 'master' into feature/range-split-points-based-on-size

2020-08-27 10:06:02 -07:00 · 2020-08-27 10:06:02 -07:00 · cd86ca6850
parent daead84fdd d36d61e0ba
commit cd86ca6850
42 changed files with 1259 additions and 492 deletions
--- a/cmake/ConfigureCompiler.cmake
+++ b/cmake/ConfigureCompiler.cmake
@ -215,6 +215,9 @@ else()
  if (USE_AVX512F)
    if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^x86")
      add_compile_options(-mavx512f)
+    elseif(USE_VALGRIND)
+      # NOTE(review): this branch follows the x86 check above, so it is only reached on non-x86 hosts - confirm that is intended.
+      message(STATUS "USE_VALGRIND=ON make USE_AVX512F OFF to satisfy valgrind analysis requirement")
+      set(USE_AVX512F OFF)
    else()
      message(STATUS "USE_AVX512F is supported on x86 or x86_64 only")
      set(USE_AVX512F OFF)
@ -224,6 +227,9 @@ else()
  if (USE_AVX)
    if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^x86")
      add_compile_options(-mavx)
+    elseif(USE_VALGRIND)
+      message(STATUS "USE_VALGRIND=ON make USE_AVX OFF to satisfy valgrind analysis requirement")
+      set(USE_AVX OFF)
    else()
      message(STATUS "USE_AVX is supported on x86 or x86_64 only")
      set(USE_AVX OFF)
--- a/contrib/TraceLogHelper/JsonParser.cs
+++ b/contrib/TraceLogHelper/JsonParser.cs
@ -80,8 +80,7 @@ namespace Magnesium
 				TraceFile = file,
 				DDetails = xEvent.Elements()
 					.Where(a=>a.Name != "Type" && a.Name != "Time" && a.Name != "Machine" && a.Name != "ID" && a.Name != "Severity" && (!rolledEvent || a.Name != "OriginalTime"))
-					// When the key contains a colon character, it gets parsed as a:item
-					.ToDictionary(a=>a.Name.LocalName == "item" ? a.Attribute("item").Value : string.Intern(a.Name.LocalName), a=>(object)a.Value),
+					.ToDictionary(a=>string.Intern(a.Name.LocalName), a=>(object)a.Value),
 				original = keepOriginalElement ? xEvent : null
 			};
 		}
--- a/documentation/sphinx/source/command-line-interface.rst
+++ b/documentation/sphinx/source/command-line-interface.rst
@ -456,16 +456,20 @@ disable

 ``throttle disable auto``

-Disables cluster auto-throttling for busy transaction tags. This does not disable any currently active throttles. To do so, run the following command after disabling auto-throttling::
-
-> throttle off auto
+Disables cluster auto-throttling for busy transaction tags. This may not disable currently active throttles immediately; a delay of a few seconds is expected.

 list
 ^^^^

-``throttle list [LIMIT]``
+``throttle list [throttled|recommended|all] [LIMIT]``

-Prints a list of currently active transaction tag throttles.
+Prints a list of currently active transaction tag throttles, or recommended transaction tag throttles if auto-throttling is disabled.
+
+``throttled`` - list active transaction tag throttles.
+
+``recommended`` - list transaction tag throttles recommended by the ratekeeper, but not active yet.
+
+``all`` - list both active and recommended transaction tag throttles.

 ``LIMIT`` - The number of throttles to print. Defaults to 100.

--- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc
+++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc
@ -313,11 +313,18 @@
         "batch_released_transactions_per_second":0,
         "released_transactions_per_second":0,
         "throttled_tags":{
-            "auto":{
-               "count":0
+            "auto" : {
+                "busy_read" : 0,
+                "busy_write" : 0,
+                "count" : 0
            },
-            "manual":{
-               "count":0
+            "manual" : {
+                "count" : 0
+            },
+            "recommend" : {
+                "busy_read" : 0,
+                "busy_write" : 0,
+                "count" : 0
            }
         },
         "limiting_queue_bytes_storage_server":0,
--- a/documentation/sphinx/source/mr-status.rst
+++ b/documentation/sphinx/source/mr-status.rst
@ -132,3 +132,13 @@ log_server_min_free_space           Log server running out of space (approaching
 log_server_min_free_space_ratio     Log server running out of space (approaching 5% limit).
 storage_server_durability_lag       Storage server durable version falling behind.
 =================================== ====================================================
+
+The JSON path ``cluster.qos.throttled_tags``, when it exists, is an Object containing ``"auto"``, ``"manual"`` and ``"recommend"``.  The possible fields for those objects are listed in the following table:
+
+=================================== ====================================================
+Name                                Description
+=================================== ====================================================
+count                               How many tags are throttled
+busy_read                           How many tags are throttled because of busy read
+busy_write                          How many tags are throttled because of busy write
+=================================== ====================================================
--- a/fdbcli/fdbcli.actor.cpp
+++ b/fdbcli/fdbcli.actor.cpp
@ -2541,6 +2541,16 @@ void throttleGenerator(const char* text, const char *line, std::vector<std::stri
 		const char* opts[] = { "auto", nullptr };
 		arrayGenerator(text, line, opts, lc);
 	}
+	else if(tokens.size() >= 2 && tokencmp(tokens[1], "list")) {
+		if(tokens.size() == 2) {
+			const char* opts[] = { "throttled", "recommended", "all", nullptr };
+			arrayGenerator(text, line, opts, lc);
+		}
+		else if(tokens.size() == 3) {
+			const char* opts[] = {"LIMITS", nullptr};
+			arrayGenerator(text, line, opts, lc);
+		}
+	}
 }

 void fdbcliCompCmd(std::string const& text, std::vector<std::string>& lc) {
@ -2661,6 +2671,14 @@ std::vector<const char*> throttleHintGenerator(std::vector<StringRef> const& tok
 	else if((tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) && tokens.size() == 2) {
 		return { "auto" };
 	}
+	else if(tokens.size() >= 2 && tokencmp(tokens[1], "list")) {
+		if(tokens.size() == 2) {
+			return { "[throttled|recommended|all]", "[LIMITS]" };
+		}
+		else if(tokens.size() == 3 && (tokencmp(tokens[2], "throttled") || tokencmp(tokens[2], "recommended") || tokencmp(tokens[2], "all"))){
+			return {"[LIMITS]"};
+		}
+	}
 	else if(tokens.size() == 2 && inArgument) {
 		return { "[ARGS]" };
 	}
@ -4077,8 +4095,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 						continue;
 					}
 					else if(tokencmp(tokens[1], "list")) {
-						if(tokens.size() > 3) {
-							printf("Usage: throttle list [LIMIT]\n");
+						if(tokens.size() > 4) {
+							printf("Usage: throttle list [throttled|recommended|all] [LIMIT]\n");
 							printf("\n");
 							printf("Lists tags that are currently throttled.\n");
 							printf("The default LIMIT is 100 tags.\n");
@ -4086,36 +4104,72 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 							continue;
 						}

-						state int throttleListLimit = 100;
+						state bool reportThrottled = true;
+						state bool reportRecommended = false;
 						if(tokens.size() >= 3) {
-							char *end;
-							throttleListLimit = std::strtol((const char*)tokens[2].begin(), &end, 10);
-							if ((tokens.size() > 3 && !std::isspace(*end)) || (tokens.size() == 3 && *end != '\0')) {
-								printf("ERROR: failed to parse limit `%s'.\n", printable(tokens[2]).c_str());
+							if(tokencmp(tokens[2], "recommended")) {
+								reportThrottled = false; reportRecommended = true;
+							}
+							else if(tokencmp(tokens[2], "all")){
+								reportThrottled = true; reportRecommended = true;
+							}
+							else if(!tokencmp(tokens[2], "throttled")){
+								printf("ERROR: failed to parse `%s'.\n", printable(tokens[2]).c_str());
 								is_error = true;
 								continue;
 							}
 						}

-						std::vector<TagThrottleInfo> tags = wait(ThrottleApi::getThrottledTags(db, throttleListLimit));
+						state int throttleListLimit = 100;
+						if(tokens.size() >= 4) {
+							char *end;
+							throttleListLimit = std::strtol((const char*)tokens[3].begin(), &end, 10);
+							if ((tokens.size() > 4 && !std::isspace(*end)) || (tokens.size() == 4 && *end != '\0')) {
+								printf("ERROR: failed to parse limit `%s'.\n", printable(tokens[3]).c_str());
+								is_error = true;
+								continue;
+							}
+						}
+
+						state std::vector<TagThrottleInfo> tags;
+						if(reportThrottled && reportRecommended) {
+							wait(store(tags, ThrottleApi::getThrottledTags(db, throttleListLimit, true)));
+						}
+						else if(reportThrottled) {
+							wait(store(tags, ThrottleApi::getThrottledTags(db, throttleListLimit)));
+						}
+						else if(reportRecommended) {
+							wait(store(tags, ThrottleApi::getRecommendedTags(db, throttleListLimit)));
+						}

 						bool anyLogged = false;
 						for(auto itr = tags.begin(); itr != tags.end(); ++itr) {
 							if(itr->expirationTime > now()) {
 								if(!anyLogged) {
 									printf("Throttled tags:\n\n");
-									printf("  Rate (txn/s) | Expiration (s) | Priority  | Type   | Tag\n");
-									printf(" --------------+----------------+-----------+--------+------------------\n");
+									printf("  Rate (txn/s) | Expiration (s) | Priority  | Type   | Reason     |Tag\n");
+									printf(" --------------+----------------+-----------+--------+------------+------\n");
 									
 									anyLogged = true;
 								}

-								printf("  %12d | %13ds | %9s | %6s | %s\n", 
-									(int)(itr->tpsRate), 
-									std::min((int)(itr->expirationTime-now()), (int)(itr->initialDuration)), 
-									transactionPriorityToString(itr->priority, false), 
-									itr->throttleType == TagThrottleType::AUTO ? "auto" : "manual", 
-									itr->tag.toString().c_str());
+								std::string reasonStr = "unset";
+								if(itr->reason == TagThrottledReason::MANUAL){
+									reasonStr = "manual";
+								}
+								else if(itr->reason == TagThrottledReason::BUSY_WRITE) {
+									reasonStr = "busy write";
+								}
+								else if(itr->reason == TagThrottledReason::BUSY_READ) {
+									reasonStr = "busy read";
+								}
+
+								printf("  %12d | %13ds | %9s | %6s | %10s |%s\n", (int)(itr->tpsRate),
+								       std::min((int)(itr->expirationTime - now()), (int)(itr->initialDuration)),
+								       transactionPriorityToString(itr->priority, false),
+								       itr->throttleType == TagThrottleType::AUTO ? "auto" : "manual",
+								       reasonStr.c_str(),
+								       itr->tag.toString().c_str());
 							}
 						}

@ -4124,7 +4178,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 							printf("Usage: throttle list [LIMIT]\n");
 						}
 						if(!anyLogged) {
-							printf("There are no throttled tags\n");
+							printf("There are no %s tags\n", reportThrottled ? "throttled" : "recommended");
 						}
 					}
 					else if(tokencmp(tokens[1], "on")) {	
--- a/fdbclient/AsyncFileBlobStore.actor.h
+++ b/fdbclient/AsyncFileBlobStore.actor.h
@ -225,9 +225,9 @@ private:

 		// Do the upload, and if it fails forward errors to m_error and also stop if anything else sends an error to m_error
 		// Also, hold a releaser for the concurrent upload slot while all that is going on.
-		f->m_parts.back()->etag = holdWhile(std::shared_ptr<FlowLock::Releaser>(new FlowLock::Releaser(f->m_concurrentUploads, 1)),
-									joinErrorGroup(doPartUpload(f, f->m_parts.back().getPtr()), f->m_error)
-								  );
+		auto releaser = std::make_shared<FlowLock::Releaser>(f->m_concurrentUploads, 1);
+		f->m_parts.back()->etag =
+		    holdWhile(std::move(releaser), joinErrorGroup(doPartUpload(f, f->m_parts.back().getPtr()), f->m_error));

 		// Make a new part to write to
 		if(startNew)
--- a/fdbclient/DatabaseContext.h
+++ b/fdbclient/DatabaseContext.h
@ -160,7 +160,8 @@ public:
 	void invalidateCache( const KeyRef&, bool isBackward = false );
 	void invalidateCache( const KeyRangeRef& );

-	bool sampleReadTags();
+	bool sampleReadTags() const;
+	bool sampleOnCost(uint64_t cost) const;

 	Reference<ProxyInfo> getMasterProxies(bool useProvisionalProxies);
 	Future<Reference<ProxyInfo>> getMasterProxiesFuture(bool useProvisionalProxies);
@ -305,6 +306,7 @@ public:
 	Counter transactionsResourceConstrained;
 	Counter transactionsProcessBehind;
 	Counter transactionsThrottled;
+	Counter transactionsExpensiveClearCostEstCount;

 	ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;

@ -337,6 +339,7 @@ public:
 	HealthMetrics healthMetrics;
 	double healthMetricsLastUpdated;
 	double detailedHealthMetricsLastUpdated;
+	Smoother smoothMidShardSize;

 	UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaults;

@ -345,7 +348,7 @@ public:
 	std::vector<std::unique_ptr<SpecialKeyRangeReadImpl>> specialKeySpaceModules;
 	std::unique_ptr<SpecialKeySpace> specialKeySpace;
 	void registerSpecialKeySpaceModule(SpecialKeySpace::MODULE module, SpecialKeySpace::IMPLTYPE type,
-	                                   std::unique_ptr<SpecialKeyRangeReadImpl> impl);
+	                                   std::unique_ptr<SpecialKeyRangeReadImpl> &&impl);

 	static bool debugUseTags;
 	static const std::vector<std::string> debugTransactionTagChoices; 
--- a/fdbclient/Knobs.cpp
+++ b/fdbclient/Knobs.cpp
@ -65,6 +65,8 @@ void ClientKnobs::initialize(bool randomize) {
 	init( BACKOFF_GROWTH_RATE,                     2.0 );
 	init( RESOURCE_CONSTRAINED_MAX_BACKOFF,       30.0 );
 	init( PROXY_COMMIT_OVERHEAD_BYTES,              23 ); //The size of serializing 7 tags (3 primary, 3 remote, 1 log router) + 2 for the tag length
+	init( SHARD_STAT_SMOOTH_AMOUNT,                5.0 );
+	init( INIT_MID_SHARD_BYTES,                 200000 ); if( randomize && BUGGIFY ) INIT_MID_SHARD_BYTES = 40000; // The same value as SERVER_KNOBS->MIN_SHARD_BYTES

 	init( TRANSACTION_SIZE_LIMIT,                  1e7 );
 	init( KEY_SIZE_LIMIT,                          1e4 );
@ -89,6 +91,7 @@ void ClientKnobs::initialize(bool randomize) {
 	init( STORAGE_METRICS_TOO_MANY_SHARDS_DELAY,  15.0 );
 	init( AGGREGATE_HEALTH_METRICS_MAX_STALENESS,  0.5 );
 	init( DETAILED_HEALTH_METRICS_MAX_STALENESS,   5.0 );
+	init( MID_SHARD_SIZE_MAX_STALENESS,           10.0 );
 	init( TAG_ENCODE_KEY_SERVERS,                 true ); if( randomize && BUGGIFY ) TAG_ENCODE_KEY_SERVERS = false;

 	//KeyRangeMap
@ -227,6 +230,9 @@ void ClientKnobs::initialize(bool randomize) {
 	// transaction tags
 	init( MAX_TAGS_PER_TRANSACTION,                   5 );
 	init( MAX_TRANSACTION_TAG_LENGTH,                16 );
+	init( COMMIT_SAMPLE_COST,                       100 ); if( randomize && BUGGIFY ) COMMIT_SAMPLE_COST = 10;
+	init( WRITE_COST_BYTE_FACTOR,                 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
+	init( INCOMPLETE_SHARD_PLUS,                   4096 );
 	init( READ_TAG_SAMPLE_RATE,                    0.01 ); if( randomize && BUGGIFY ) READ_TAG_SAMPLE_RATE = 1.0; // Communicated to clients from cluster
 	init( TAG_THROTTLE_SMOOTHING_WINDOW,            2.0 );
 	init( TAG_THROTTLE_RECHECK_INTERVAL,            5.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_RECHECK_INTERVAL = 0.0;
--- a/fdbclient/Knobs.h
+++ b/fdbclient/Knobs.h
@ -60,6 +60,8 @@ public:
 	double BACKOFF_GROWTH_RATE;
 	double RESOURCE_CONSTRAINED_MAX_BACKOFF;
 	int PROXY_COMMIT_OVERHEAD_BYTES;
+	double SHARD_STAT_SMOOTH_AMOUNT;
+	int INIT_MID_SHARD_BYTES;

 	int TRANSACTION_SIZE_LIMIT;
 	int64_t KEY_SIZE_LIMIT;
@ -85,6 +87,7 @@ public:
 	double STORAGE_METRICS_TOO_MANY_SHARDS_DELAY;
 	double AGGREGATE_HEALTH_METRICS_MAX_STALENESS;
 	double DETAILED_HEALTH_METRICS_MAX_STALENESS;
+	double MID_SHARD_SIZE_MAX_STALENESS;
 	bool TAG_ENCODE_KEY_SERVERS;

 	//KeyRangeMap
@ -214,6 +217,9 @@ public:
 	// transaction tags
 	int MAX_TRANSACTION_TAG_LENGTH;
 	int MAX_TAGS_PER_TRANSACTION;
+	int COMMIT_SAMPLE_COST; // Commits are sampled so that, in expectation, one sample is taken per COMMIT_SAMPLE_COST units of write cost
+	int WRITE_COST_BYTE_FACTOR; // Number of bytes that make up one unit of write cost (see getWriteOperationCost)
+	int INCOMPLETE_SHARD_PLUS; // Estimated size in bytes of a (possibly) partially covered shard when estimating the cost of a clear range
 	double READ_TAG_SAMPLE_RATE; // Communicated to clients from cluster
 	double TAG_THROTTLE_SMOOTHING_WINDOW;
 	double TAG_THROTTLE_RECHECK_INTERVAL;
--- a/fdbclient/MasterProxyInterface.h
+++ b/fdbclient/MasterProxyInterface.h
@ -105,8 +105,11 @@ struct ClientDBInfo {
 	int64_t clientTxnInfoSizeLimit;
 	Optional<Value> forward;
 	double transactionTagSampleRate;
+	double transactionTagSampleCost;

-	ClientDBInfo() : clientTxnInfoSampleRate(std::numeric_limits<double>::infinity()), clientTxnInfoSizeLimit(-1), transactionTagSampleRate(CLIENT_KNOBS->READ_TAG_SAMPLE_RATE) {}
+	ClientDBInfo()
+	  : clientTxnInfoSampleRate(std::numeric_limits<double>::infinity()), clientTxnInfoSizeLimit(-1),
+	    transactionTagSampleRate(CLIENT_KNOBS->READ_TAG_SAMPLE_RATE), transactionTagSampleCost(CLIENT_KNOBS->COMMIT_SAMPLE_COST) {}

 	bool operator == (ClientDBInfo const& r) const { return id == r.id; }
 	bool operator != (ClientDBInfo const& r) const { return id != r.id; }
@ -116,7 +119,8 @@ struct ClientDBInfo {
 		if constexpr (!is_fb_function<Archive>) {
 			ASSERT(ar.protocolVersion().isValid());
 		}
-		serializer(ar, proxies, id, clientTxnInfoSampleRate, clientTxnInfoSizeLimit, forward, transactionTagSampleRate);
+		serializer(ar, proxies, id, clientTxnInfoSampleRate, clientTxnInfoSizeLimit, forward, transactionTagSampleRate,
+		           transactionTagSampleCost);
 	}
 };

@ -155,7 +159,7 @@ struct CommitTransactionRequest : TimedRequest {
 	ReplyPromise<CommitID> reply;
 	uint32_t flags;
 	Optional<UID> debugID;
-	Optional<TransactionCommitCostEstimation> commitCostEstimation;
+	Optional<ClientTrCommitCostEstimation> commitCostEstimation;
 	Optional<TagSet> tagSet;

 	CommitTransactionRequest() : flags(0) {}
@ -184,6 +188,7 @@ struct GetReadVersionReply : public BasicLoadBalancedReply {
 	Version version;
 	bool locked;
 	Optional<Value> metadataVersion;
+	int64_t midShardSize = 0;

 	TransactionTagMap<ClientTagThrottleLimits> tagThrottleInfo;

@ -191,7 +196,7 @@ struct GetReadVersionReply : public BasicLoadBalancedReply {

 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, BasicLoadBalancedReply::recentRequests, version, locked, metadataVersion, tagThrottleInfo);
+		serializer(ar, BasicLoadBalancedReply::recentRequests, version, locked, metadataVersion, tagThrottleInfo, midShardSize);
 	}
 };

--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@ -697,7 +697,7 @@ Future<HealthMetrics> DatabaseContext::getHealthMetrics(bool detailed = false) {
 }

 void DatabaseContext::registerSpecialKeySpaceModule(SpecialKeySpace::MODULE module, SpecialKeySpace::IMPLTYPE type,
-                                                    std::unique_ptr<SpecialKeyRangeReadImpl> impl) {
+                                                    std::unique_ptr<SpecialKeyRangeReadImpl> &&impl) {
 	specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get());
 	specialKeySpaceModules.push_back(std::move(impl));
 }
@ -875,6 +875,7 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
    transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0), latencies(1000), readLatencies(1000),
    commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
    healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal),
+    smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
    specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)) {
 	dbId = deterministicRandom()->randomUniqueID();
 	connected = clientInfo->get().proxies.size() ? Void() : clientInfo->onChange();
@ -895,6 +896,9 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
 	monitorMasterProxiesInfoChange = monitorMasterProxiesChange(clientInfo, &masterProxiesChangeTrigger);
 	clientStatusUpdater.actor = clientStatusUpdateActor(this);
 	cacheListMonitor = monitorCacheList(this);
+
+	smoothMidShardSize.reset(CLIENT_KNOBS->INIT_MID_SHARD_BYTES);
+
 	if (apiVersionAtLeast(700)) {
 		registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::ERRORMSG, SpecialKeySpace::IMPLTYPE::READONLY,
 		                              std::make_unique<SingleSpecialKeyImpl>(
@ -1004,6 +1008,7 @@ DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("T
 	transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc),
 	transactionsResourceConstrained("ResourceConstrained", cc), transactionsThrottled("Throttled", cc), transactionsProcessBehind("ProcessBehind", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
 	GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000),
+	smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
 	internal(false) {}


@ -1096,10 +1101,15 @@ Future<Void> DatabaseContext::onMasterProxiesChanged() {
 	return this->masterProxiesChangeTrigger.onTrigger();
 }

-bool DatabaseContext::sampleReadTags() {
+bool DatabaseContext::sampleReadTags() const {
 	return clientInfo->get().transactionTagSampleRate > 0 && deterministicRandom()->random01() <= clientInfo->get().transactionTagSampleRate;
 }

+bool DatabaseContext::sampleOnCost(uint64_t cost) const { // Randomly decides whether an operation of the given cost should be sampled; the chance grows linearly with cost
+	if(clientInfo->get().transactionTagSampleCost <= 0) return false; // a non-positive sample cost disables sampling (and avoids dividing by zero below)
+	return deterministicRandom()->random01() <= (double)cost / clientInfo->get().transactionTagSampleCost; // sample with probability cost / transactionTagSampleCost (assumes random01() yields a value in [0, 1] - TODO confirm)
+}
+
 int64_t extractIntOption( Optional<StringRef> value, int64_t minValue, int64_t maxValue ) {
 	validateOptionValue(value, true);
 	if( value.get().size() != 8 ) {
@ -1994,7 +2004,8 @@ ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version, Spa
 				         cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion,
 				         GetReadVersionRequest(span.context, 0, TransactionPriority::IMMEDIATE), cx->taskID))) {
 					cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);
-
+					if(v.midShardSize > 0)
+						cx->smoothMidShardSize.setTotal(v.midShardSize);
 					if (v.version >= version)
 						return v.version;
 					// SOMEDAY: Do the wait on the server side, possibly use less expensive source of committed version (causal consistency is not needed for this purpose)
@ -3010,7 +3021,6 @@ void Transaction::clear( const KeyRef& key, bool addConflictRange ) {
 	data[key.size()] = 0;
 	t.mutations.emplace_back(req.arena, MutationRef::ClearRange, KeyRef(data, key.size()),
 	                         KeyRef(data, key.size() + 1));
-
 	if(addConflictRange)
 		t.write_conflict_ranges.emplace_back(req.arena, KeyRef(data, key.size()), KeyRef(data, key.size() + 1));
 }
@ -3293,33 +3303,76 @@ void Transaction::setupWatches() {
 	}
 }

-ACTOR Future<TransactionCommitCostEstimation> estimateCommitCosts(Transaction* self,
-                                                                  CommitTransactionRef* transaction) {
-	state MutationRef* it = transaction->mutations.begin();
-	state MutationRef* end = transaction->mutations.end();
-	state TransactionCommitCostEstimation trCommitCosts;
-	state KeyRange keyRange;
-	for (; it != end; ++it) {
-		if (it->type == MutationRef::Type::SetValue) {
-			trCommitCosts.bytesWrite += it->expectedSize();
-			trCommitCosts.numWrite++;
-		} else if (it->isAtomicOp()) {
-			trCommitCosts.bytesAtomicWrite += it->expectedSize();
-			trCommitCosts.numAtomicWrite++;
-		} else if (it->type == MutationRef::Type::ClearRange) {
-			trCommitCosts.numClear++;
-			keyRange = KeyRange(KeyRangeRef(it->param1, it->param2));
+ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Transaction* self,
+                                                                         CommitTransactionRef const * transaction) {
+	state ClientTrCommitCostEstimation trCommitCosts;
+	state KeyRangeRef keyRange;
+	state int i = 0;
+
+	for (; i < transaction->mutations.size(); ++i) {
+		auto* it = &transaction->mutations[i];
+
+		if (it->type == MutationRef::Type::SetValue || it->isAtomicOp()) {
+			trCommitCosts.opsCount++;
+			trCommitCosts.writeCosts += getWriteOperationCost(it->expectedSize());
+		}
+		else if (it->type == MutationRef::Type::ClearRange) {
+			trCommitCosts.opsCount++;
+			keyRange = KeyRangeRef(it->param1, it->param2);
 			if (self->options.expensiveClearCostEstimation) {
-				StorageMetrics m = wait(self->getStorageMetrics(keyRange, std::numeric_limits<int>::max()));
-				trCommitCosts.bytesClearEst += m.bytes;
+				StorageMetrics m = wait(self->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY));
+				trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
+				trCommitCosts.writeCosts += getWriteOperationCost(m.bytes);
+				++ trCommitCosts.expensiveCostEstCount;
+				++ self->getDatabase()->transactionsExpensiveClearCostEstCount;
 			}
 			else {
-				std::vector<pair<KeyRange, Reference<LocationInfo>>> locations = wait(getKeyRangeLocations(
-					self->getDatabase(), keyRange, std::numeric_limits<int>::max(), false, &StorageServerInterface::getShardState, self->info));
-				trCommitCosts.numClearShards += locations.size();
+				std::vector<pair<KeyRange, Reference<LocationInfo>>> locations =
+				    wait(getKeyRangeLocations(self->getDatabase(), keyRange, CLIENT_KNOBS->TOO_MANY, false,
+				                              &StorageServerInterface::getShardState, self->info));
+				if (locations.empty()) continue;
+
+				uint64_t bytes = 0;
+				if (locations.size() == 1) {
+					bytes = CLIENT_KNOBS->INCOMPLETE_SHARD_PLUS;
+				}
+				else { // small clear on the boundary will hit two shards but be much smaller than the shard size
+					bytes = CLIENT_KNOBS->INCOMPLETE_SHARD_PLUS * 2 +
+					        (locations.size() - 2) * (int64_t)self->getDatabase()->smoothMidShardSize.smoothTotal();
+				}
+
+				trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(bytes));
+				trCommitCosts.writeCosts += getWriteOperationCost(bytes);
 			}
 		}
 	}
+
+	// sample on written bytes
+	if (!self->getDatabase()->sampleOnCost(trCommitCosts.writeCosts))
+		return Optional<ClientTrCommitCostEstimation>();
+
+	// sample clear op: the expectation of #sampledOp is every COMMIT_SAMPLE_COST sample once
+	// we also scale the cost of mutations whose cost is less than COMMIT_SAMPLE_COST as scaledCost = min(COMMIT_SAMPLE_COST, cost)
+	// If we have 4 transactions:
+	// A - 100 1-cost mutations: E[sampled ops] = 1, E[sampled cost] = 100
+	// B - 1 100-cost mutation: E[sampled ops] = 1, E[sampled cost] = 100
+	// C - 50 2-cost mutations: E[sampled ops] = 1, E[sampled cost] = 100
+	// D - 1 150-cost mutation and 150 1-cost mutations: E[sampled ops] = 3, E[sampled cost] = 150cost * 1 + 150 * 100cost * 0.01 = 300
+	ASSERT(trCommitCosts.writeCosts > 0);
+	std::deque<std::pair<int, uint64_t>> newClearIdxCosts;
+	for (const auto& [idx, cost] : trCommitCosts.clearIdxCosts) {
+		if(trCommitCosts.writeCosts >= CLIENT_KNOBS->COMMIT_SAMPLE_COST){
+			double mul = trCommitCosts.writeCosts / std::max(1.0, (double)CLIENT_KNOBS->COMMIT_SAMPLE_COST);
+			if(deterministicRandom()->random01() < cost * mul / trCommitCosts.writeCosts) {
+				newClearIdxCosts.emplace_back(idx, cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost);
+			}
+		}
+		else if(deterministicRandom()->random01() < (double)cost / trCommitCosts.writeCosts){
+			newClearIdxCosts.emplace_back(idx, cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost);
+		}
+	}
+
+	trCommitCosts.clearIdxCosts.swap(newClearIdxCosts);
 	return trCommitCosts;
 }

@ -3339,11 +3392,11 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 					commit_unknown_result()});
 		}

-		if (!req.tagSet.present()) {
-			wait(store(req.transaction.read_snapshot, readVersion));
+		if(req.tagSet.present() && tr->options.priority < TransactionPriority::IMMEDIATE){
+			wait(store(req.transaction.read_snapshot, readVersion) &&
+			     store(req.commitCostEstimation, estimateCommitCosts(tr, &req.transaction)));
 		} else {
-			req.commitCostEstimation = TransactionCommitCostEstimation();
-			wait(store(req.transaction.read_snapshot, readVersion) && store(req.commitCostEstimation.get(), estimateCommitCosts(tr, &req.transaction)));
+			wait(store(req.transaction.read_snapshot, readVersion));
 		}

 		startTime = now();
@ -3668,7 +3721,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
 			if (info.debugID.present()) {
 				TraceEvent(SevInfo, "TransactionBeingTraced")
 					.detail("DebugTransactionID", trLogInfo->identifier)
-					.detail("ServerTraceID", info.debugID.get().first());
+					.detail("ServerTraceID", info.debugID.get());

 			}
 			break;
@ -3704,7 +3757,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
 			if (trLogInfo && !trLogInfo->identifier.empty()) {
 				TraceEvent(SevInfo, "TransactionBeingTraced")
 					.detail("DebugTransactionID", trLogInfo->identifier)
-					.detail("ServerTraceID", info.debugID.get().first());
+					.detail("ServerTraceID", info.debugID.get());
 			}
 			break;

@ -4098,9 +4151,9 @@ Future<Void> Transaction::onError( Error const& e ) {

 	return e;
 }
-ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRangeRef keys);
+ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys);

-ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRangeRef keys, Reference<LocationInfo> locationInfo) {
+ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRange keys, Reference<LocationInfo> locationInfo) {
 	loop {
 		try {
 			WaitMetricsRequest req(keys, StorageMetrics(), StorageMetrics());
@ -4122,7 +4175,7 @@ ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRangeRef keys,
 	}
 }

-ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRangeRef keys) {
+ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) {
 	state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc);
 	vector<pair<KeyRange, Reference<LocationInfo>>> locations = wait(
 	    getKeyRangeLocations(cx, keys, std::numeric_limits<int>::max(), false, &StorageServerInterface::waitMetrics,
--- a/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/NativeAPI.actor.h
@ -360,5 +360,8 @@ ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID sn
 // Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
 ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exclusions);

+inline uint64_t getWriteOperationCost(uint64_t bytes) { // Converts a byte count into write-cost units
+	return bytes / std::max(1, CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) + 1; // one unit per WRITE_COST_BYTE_FACTOR bytes (integer division), plus 1 so even a 0-byte write costs 1; max(1, ...) avoids dividing by a knob value below 1
+}
 #include "flow/unactorcompiler.h"
 #endif
--- a/fdbclient/Schemas.cpp
+++ b/fdbclient/Schemas.cpp
@ -341,11 +341,18 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
         "batch_released_transactions_per_second":0,
         "released_transactions_per_second":0,
         "throttled_tags":{
-            "auto":{
-               "count":0
+            "auto" : {
+                "busy_read" : 0,
+                "busy_write" : 0,
+                "count" : 0
            },
-            "manual":{
-               "count":0
+            "manual" : {
+                "count" : 0
+            },
+            "recommend" : {
+                "busy_read" : 0,
+                "busy_write" : 0,
+                "count" : 0
            }
         },
         "limiting_queue_bytes_storage_server":0,
--- a/fdbclient/TagThrottle.actor.cpp
+++ b/fdbclient/TagThrottle.actor.cpp
@ -110,6 +110,31 @@ TagThrottleValue TagThrottleValue::fromValue(const ValueRef& value) {
 }

 namespace ThrottleApi {
+		// Reads tagThrottleAutoEnabledKey through `tr` and returns true if it holds "1", false if it holds "0".
+		// If the key is absent or holds any other value, the transaction is reset and the read is retried after
+		// CLIENT_KNOBS->DEFAULT_BACKOFF, so this actor only returns once a valid value has been read.
+		// There is no try/catch here: errors from the read are left for the caller to handle.
+		ACTOR Future<bool> getValidAutoEnabled(Transaction* tr, Database db) {
+		state bool result;
+		loop {
+			Optional<Value> value = wait(tr->get(tagThrottleAutoEnabledKey));
+			if(!value.present()) {
+				// Key not written yet: start a fresh transaction and try again after a backoff.
+				tr->reset();
+				wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
+				continue;
+			}
+			else if(value.get() == LiteralStringRef("1")) {
+				result = true;
+			}
+			else if(value.get() == LiteralStringRef("0")) {
+				result = false;
+			}
+			else {
+				// Anything other than "0"/"1" is logged and treated like a missing key.
+				TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", db->dbId).detail("Value", value.get());
+				tr->reset();
+				wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
+				continue;
+			}
+			return result;
+		};
+	}
+
 	void signalThrottleChange(Transaction &tr) {
 		tr.atomicOp(tagThrottleSignalKey, LiteralStringRef("XXXXXXXXXX\x00\x00\x00\x00"), MutationRef::SetVersionstampedValue);
 	}
@ -146,12 +171,16 @@ namespace ThrottleApi {
 		return Void();
 	}

-	ACTOR Future<std::vector<TagThrottleInfo>> getThrottledTags(Database db, int limit) {
+	ACTOR Future<std::vector<TagThrottleInfo>> getThrottledTags(Database db, int limit, bool containsRecommend) {
 		state Transaction tr(db);
-
+		state bool reportAuto = containsRecommend;
 		loop {
 			try {
-				Standalone<RangeResultRef> throttles = wait(tr.getRange(tagThrottleKeys, limit));
+				if (!containsRecommend) {
+					wait(store(reportAuto, getValidAutoEnabled(&tr, db)));
+				}
+				Standalone<RangeResultRef> throttles = wait(tr.getRange(
+					reportAuto ? tagThrottleKeys : KeyRangeRef(tagThrottleKeysPrefix, tagThrottleAutoKeysPrefix), limit));
 				std::vector<TagThrottleInfo> results;
 				for(auto throttle : throttles) {
 					results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key), TagThrottleValue::fromValue(throttle.value)));
@ -164,13 +193,41 @@ namespace ThrottleApi {
 		}
 	}

-	ACTOR Future<Void> throttleTags(Database db, TagSet tags, double tpsRate, double initialDuration, TagThrottleType throttleType, TransactionPriority priority, Optional<double> expirationTime) {
+	// Returns the throttle entries read from [tagThrottleAutoKeysPrefix, tagThrottleKeys.end), passing `limit` to
+	// getRange, but only while auto-throttling is disabled; when it is enabled the result is an empty vector.
+	// NOTE(review): presumably entries under the auto prefix act as recommendations only while auto-throttling
+	// is off -- confirm against the code that writes them.
+	// Errors are handed to tr.onError() and the whole read is retried.
+	ACTOR Future<std::vector<TagThrottleInfo>> getRecommendedTags(Database db, int limit) {
+		state Transaction tr(db);
+		loop {
+			try {
+				bool enableAuto = wait(getValidAutoEnabled(&tr, db));
+				if(enableAuto) {
+					// Auto-throttling is on, so there is nothing to report as a recommendation.
+					return std::vector<TagThrottleInfo>();
+				}
+
+				Standalone<RangeResultRef> throttles = wait(tr.getRange(KeyRangeRef(tagThrottleAutoKeysPrefix, tagThrottleKeys.end), limit));
+				std::vector<TagThrottleInfo> results;
+				for(auto throttle : throttles) {
+					results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key), TagThrottleValue::fromValue(throttle.value)));
+				}
+				return results;
+			}
+			catch(Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	ACTOR Future<Void> throttleTags(Database db, TagSet tags, double tpsRate, double initialDuration,
+                                    TagThrottleType throttleType, TransactionPriority priority, Optional<double> expirationTime,
+                                    Optional<TagThrottledReason> reason) {
 		state Transaction tr(db);
 		state Key key = TagThrottleKey(tags, throttleType, priority).toKey();

 		ASSERT(initialDuration > 0);

-		TagThrottleValue throttle(tpsRate, expirationTime.present() ? expirationTime.get() : 0, initialDuration);
+		if(throttleType == TagThrottleType::MANUAL) {
+			reason = TagThrottledReason::MANUAL;
+		}
+		TagThrottleValue throttle(tpsRate, expirationTime.present() ? expirationTime.get() : 0, initialDuration,
+	                              reason.present() ? reason.get() : TagThrottledReason::UNSET);
 		BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValue()));
 		wr << throttle;
 		state Value value = wr.toValue();
--- a/fdbclient/TagThrottle.h
+++ b/fdbclient/TagThrottle.h
@ -115,6 +115,13 @@ enum class TagThrottleType : uint8_t {
 	AUTO
 };

+// Reason recorded with a tag throttle. Stored in TagThrottleValue, whose serialize() writes it as a uint8_t,
+// so the numeric values of existing enumerators are part of the serialized format.
+enum class TagThrottledReason: uint8_t {
+	UNSET = 0, // default: no reason supplied, or the value was deserialized from a version without this field
+	MANUAL, // assigned by throttleTags() when the throttle type is TagThrottleType::MANUAL
+	BUSY_READ,
+	BUSY_WRITE
+};
+
 struct TagThrottleKey {
 	TagSet tags;
 	TagThrottleType throttleType;
@ -132,17 +139,26 @@ struct TagThrottleValue {
 	double tpsRate;
 	double expirationTime;
 	double initialDuration;
+	TagThrottledReason reason;

-	TagThrottleValue() : tpsRate(0), expirationTime(0), initialDuration(0) {}
-	TagThrottleValue(double tpsRate, double expirationTime, double initialDuration) 
-		: tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration) {}
+	TagThrottleValue() : tpsRate(0), expirationTime(0), initialDuration(0), reason(TagThrottledReason::UNSET) {}
+	TagThrottleValue(double tpsRate, double expirationTime, double initialDuration, TagThrottledReason reason)
+		: tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration), reason(reason) {}

 	static TagThrottleValue fromValue(const ValueRef& value);

 	//To change this serialization, ProtocolVersion::TagThrottleValue must be updated, and downgrades need to be considered
 	template<class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, tpsRate, expirationTime, initialDuration);
+		if(ar.protocolVersion().hasTagThrottleValueReason()) {
+			serializer(ar, tpsRate, expirationTime, initialDuration, reinterpret_cast<uint8_t&>(reason));
+		}
+		else if(ar.protocolVersion().hasTagThrottleValue()) {
+			serializer(ar, tpsRate, expirationTime, initialDuration);
+			if(ar.isDeserializing) {
+			    reason = TagThrottledReason::UNSET;
+			}
+		}
 	}
 };

@ -153,12 +169,13 @@ struct TagThrottleInfo {
 	double tpsRate;
 	double expirationTime;
 	double initialDuration;
+	TagThrottledReason reason;

-	TagThrottleInfo(TransactionTag tag, TagThrottleType throttleType, TransactionPriority priority, double tpsRate, double expirationTime, double initialDuration)
-		: tag(tag), throttleType(throttleType), priority(priority), tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration) {}
+	TagThrottleInfo(TransactionTag tag, TagThrottleType throttleType, TransactionPriority priority, double tpsRate, double expirationTime, double initialDuration, TagThrottledReason reason = TagThrottledReason::UNSET)
+		: tag(tag), throttleType(throttleType), priority(priority), tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration), reason(reason) {}

-	TagThrottleInfo(TagThrottleKey key, TagThrottleValue value) 
-		: throttleType(key.throttleType), priority(key.priority), tpsRate(value.tpsRate), expirationTime(value.expirationTime), initialDuration(value.initialDuration) 
+	TagThrottleInfo(TagThrottleKey key, TagThrottleValue value)
+		: throttleType(key.throttleType), priority(key.priority), tpsRate(value.tpsRate), expirationTime(value.expirationTime), initialDuration(value.initialDuration), reason(value.reason)
 	{
 		ASSERT(key.tags.size() == 1); // Multiple tags per throttle is not currently supported
 		tag = *key.tags.begin();
@ -166,10 +183,12 @@ struct TagThrottleInfo {
 };

 namespace ThrottleApi {
-	Future<std::vector<TagThrottleInfo>> getThrottledTags(Database const& db, int const& limit);
+	Future<std::vector<TagThrottleInfo>> getThrottledTags(Database const& db, int const& limit, bool const& containsRecommend = false);
+	Future<std::vector<TagThrottleInfo>> getRecommendedTags(Database const& db, int const& limit);

 	Future<Void> throttleTags(Database const& db, TagSet const& tags, double const& tpsRate, double const& initialDuration, 
-	                         TagThrottleType const& throttleType, TransactionPriority const& priority, Optional<double> const& expirationTime = Optional<double>());
+	                         TagThrottleType const& throttleType, TransactionPriority const& priority, Optional<double> const& expirationTime = Optional<double>(),
+                              Optional<TagThrottledReason> const& reason = Optional<TagThrottledReason>());

 	Future<bool> unthrottleTags(Database const& db, TagSet const& tags, Optional<TagThrottleType> const& throttleType, Optional<TransactionPriority> const& priority);

@ -187,4 +206,6 @@ using TransactionTagMap = std::unordered_map<TransactionTag, Value, std::hash<Tr
 template<class Value>
 using PrioritizedTransactionTagMap = std::map<TransactionPriority, TransactionTagMap<Value>>;

+template<class Value>
+using UIDTransactionTagMap = std::unordered_map<UID, TransactionTagMap<Value>>;
 #endif
--- a/fdbrpc/ReplicationPolicy.cpp
+++ b/fdbrpc/ReplicationPolicy.cpp
@ -53,29 +53,17 @@ bool IReplicationPolicy::validateFull(

 	if (!solved) {
 		if (validate(totalSolution, fromServers)) {
-			if (g_replicationdebug > 2) {
-				printf("Error: Validate unsolved policy with%3lu also servers and%3lu solution servers\n", alsoServers.size(), solutionSet.size());
-			}
 			valid = false;
 		}
 		else if (validate(fromServers->getGroupEntries(), fromServers)) {
-			if (g_replicationdebug > 2) {
-				printf("Error: Validated unsolved policy with all%5d servers\n", fromServers->size());
-			}
 			valid = false;
 		}
 	}
 	else if (!validate(totalSolution, fromServers)) {
-		if (g_replicationdebug > 2) {
-			printf("Error: Failed to validate solved policy with%3lu also servers and%3lu solution servers\n", alsoServers.size(), solutionSet.size());
-		}
 		valid = false;
 	}
 	else if (solutionSet.empty()) {
 		if (!validate(alsoServers, fromServers)) {
-			if (g_replicationdebug > 2) {
-				printf("Error: Failed to validate policy with only%3lu also servers\n", alsoServers.size());
-			}
 			valid = false;
 		}
 	}
@ -85,14 +73,7 @@ bool IReplicationPolicy::validateFull(
 		totalSolution[lastSolutionIndex] = totalSolution.back();
 		totalSolution.pop_back();
 		for (int index = 0; index < solutionSet.size() && index < totalSolution.size(); index ++) {
-			if (g_replicationdebug > 3) {
-				auto fromServer = fromServers->getRecordViaEntry(missingEntry);
-				printf("Test remove entry:   %s   test:%3d of%3lu\n", fromServers->getEntryInfo(missingEntry).c_str(), index+1, solutionSet.size());
-			}
 			if (validate(totalSolution, fromServers)) {
-				if (g_replicationdebug > 2) {
-					printf("Invalid extra entry: %s\n", fromServers->getEntryInfo(missingEntry).c_str());
-				}
 				valid = false;
 				break;
 			}
@ -119,9 +100,6 @@ bool PolicyOne::selectReplicas(
 		itemsUsed ++;
 		totalUsed ++;
 	}
-	if (g_replicationdebug > 0) {
-		printf("PolicyOne    used:%5d results:%3d from %3d servers\n", totalUsed, itemsUsed, fromServers->size());
-	}
 	return (totalUsed > 0);
 }

@ -205,50 +183,16 @@ bool PolicyAcross::validate(
 		}
 	}
 	if (validMap.size() < _count) {
-		if (g_replicationdebug > 3) {
-			printf("Across too few values:%3lu <%2d key: %-7s policy: %-10s => %s\n", validMap.size(), _count, _attribKey.c_str(), _policy->name().c_str(), _policy->info().c_str());
-		}
 		valid = false;
 	}
 	else {
-		if (g_replicationdebug > 3) {
-			printf("Across check values:%9lu key: %-7s solutions:%2lu count:%2d policy: %-10s => %s\n", validMap.size(), _attribKey.c_str(), solutionSet.size(), _count, _policy->name().c_str(), _policy->info().c_str());
-			for (auto& itValue : validMap) {
-				printf("   value: (%3d) %-10s\n", itValue.first._id, fromServers->valueText(itValue.first).c_str());
-			}
-		}
 		for (auto& itValid : validMap) {
 			// itValid.second is the vector of LocalityEntries that belong to the same locality
 			if (_policy->validate(itValid.second, fromServers)) {
-				if (g_replicationdebug > 4) {
-					printf("Across valid solution: %6lu key: %-7s count:%3d of%3d value: (%3d) %-10s policy: %-10s => "
-					       "%s\n",
-					       itValid.second.size(), _attribKey.c_str(), count + 1, _count, itValid.first._id,
-					       fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(),
-					       _policy->info().c_str());
-					if (g_replicationdebug > 5) {
-						for (auto& entry : itValid.second) {
-							printf("   entry: %s\n", fromServers->getEntryInfo(entry).c_str());
-						}
-					}
-				}
 				count ++;
-			} else if (g_replicationdebug > 4) {
-				printf("Across invalid solution:%5lu key: %-7s value: (%3d) %-10s policy: %-10s => %s\n", itValid.second.size(), _attribKey.c_str(), itValid.first._id, fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(), _policy->info().c_str());
-				if (g_replicationdebug > 5) {
-					for (auto& entry : itValid.second) {
-						printf("   entry: %s\n", fromServers->getEntryInfo(entry).c_str());
-					}
-				}
 			}
 		}
 		if (count < _count) {
-			if (g_replicationdebug > 3) {
-				printf("Across failed solution: %3lu  key: %-7s values:%3lu count: %d=%d policy: %-10s => %s\n", solutionSet.size(), _attribKey.c_str(), validMap.size(),  count, _count, _policy->name().c_str(), _policy->info().c_str());
-				for (auto& entry : solutionSet) {
-					printf("   entry: %s\n", fromServers->getEntryInfo(entry).c_str());
-				}
-			}
 			valid = false;
 		}
 	}
@ -277,9 +221,6 @@ bool PolicyAcross::selectReplicas(
 	_newResults.clear();
 	_addedResults.resize(_arena, 0);

-	if (g_replicationdebug > 0) {
-		printf("Across !also:%4lu key: %-7s policy: %-10s => %s\n", alsoServers.size(), _attribKey.c_str(), _policy->name().c_str(), _policy->info().c_str());
-	}
 	for (auto& alsoServer : alsoServers) {
 		auto value = fromServers->getValueViaGroupKey(alsoServer, groupIndexKey);
 		if (value.present()) {
@ -287,16 +228,6 @@ bool PolicyAcross::selectReplicas(
 			if ((lowerBound == _usedValues.end()) || (*lowerBound != value.get())) {
 				//_selected is a set of processes that have the same indexKey and indexValue (value)
 				_selected = fromServers->restrict(indexKey, value.get());
-				if (g_replicationdebug > 0) {
-					if (_selected->size() > 0) {
-						// entry is the locality entry info (entryValue) from the to-be-selected team member alsoServer
-						printf("Across !select    key: %-7s value: (%3d) %-10s entry: %s\n", _attribKey.c_str(),
-						       value.get()._id, fromServers->valueText(value.get()).c_str(),
-						       fromServers->getEntryInfo(alsoServer).c_str());
-					} else {
-						printf("Across !select    empty\n");
-					}
-				}
 				if (_selected->size()) {
 					// Pass only the also array item which are valid for the value
 					resultsSize = _newResults.size();
@ -321,11 +252,6 @@ bool PolicyAcross::selectReplicas(
 	if ((count < _count) && (_addedResults.size())) {
 		// Sort the added results array
 		std::sort(_addedResults.begin(), _addedResults.end(), PolicyAcross::compareAddedResults);
-
-		if (g_replicationdebug > 0) {
-			printf("Across !add sets  key: %-7s sets:%3d results:%3lu count:%3d of%3d\n", _attribKey.c_str(), _addedResults.size(), _newResults.size(), count, _count);
-		}
-
 		if (g_replicationdebug > 0) {
 			LocalitySet::staticDisplayEntries(fromServers, alsoServers, "also");
 			LocalitySet::staticDisplayEntries(fromServers, results, "results");
@ -334,9 +260,6 @@ bool PolicyAcross::selectReplicas(

 		for (auto& addedResult : _addedResults) {
 			count ++;
-			if (g_replicationdebug > 0) {
-				printf("Across !add set   key: %-7s count:%3d of%3d  results:%3d index:%3d\n", _attribKey.c_str(), count, _count, addedResult.first, addedResult.second);
-			}
 			results.reserve(results.size() + addedResult.first);
 			results.insert(results.end(), _newResults.begin()+addedResult.second, _newResults.begin()+addedResult.second+addedResult.first);
 			if (count >= _count) break;
@ -349,9 +272,6 @@ bool PolicyAcross::selectReplicas(
 	// Cannot find replica from the least used alsoServers, now try to find replicas from all servers
 	// Process the remaining values
 	if (count < _count) {
-		if (g_replicationdebug > 0) {
-			printf("Across items:%4d key: %-7s policy: %-10s => %s  count:%3d of%3d\n", fromServers->size(), _attribKey.c_str(), _policy->name().c_str(), _policy->info().c_str(), count, _count);
-		}
 		int recordIndex;
 		// Use mutable array so that swaps does not affect actual element array
 		auto& mutableArray = fromServers->getMutableEntries();
@ -367,20 +287,8 @@ bool PolicyAcross::selectReplicas(
 				if ((lowerBound == _usedValues.end()) || (*lowerBound != value.get())) {
 					_selected = fromServers->restrict(indexKey, value.get());
 					if (_selected->size()) {
-						if (g_replicationdebug > 5) {
-							printf("Across select:%3d key: %-7s value: (%3d) %-10s entry: %s  index:%4d\n",
-							       fromServers->size() - checksLeft + 1, _attribKey.c_str(), value.get()._id,
-							       fromServers->valueText(value.get()).c_str(),
-							       fromServers->getEntryInfo(entry).c_str(), recordIndex);
-						}
 						if (_policy->selectReplicas(_selected, emptyEntryArray, results))
 						{
-							if (g_replicationdebug > 5) {
-								printf("Across added:%4d key: %-7s value: (%3d) %-10s policy: %-10s => %s needed:%3d\n",
-								       count + 1, _attribKey.c_str(), value.get()._id,
-								       fromServers->valueText(value.get()).c_str(), _policy->name().c_str(),
-								       _policy->info().c_str(), _count);
-							}
 							count ++;
 							if (count >= _count) break;
 							_usedValues.insert(lowerBound, value.get());
@ -395,13 +303,9 @@ bool PolicyAcross::selectReplicas(
 	}
 	// Clear the return array, if not satified
 	if (count < _count) {
-		if (g_replicationdebug > 0) printf("Across result count: %d < %d requested\n", count, _count);
 		results.resize(resultsInit);
 		count = 0;
 	}
-	if (g_replicationdebug > 0) {
-		printf("Across used:%5lu results:%3d from %3d items  key: %-7s  policy: %-10s => %s\n", results.size()-resultsInit, count, fromServers->size(), _attribKey.c_str(), _policy->name().c_str(), _policy->info().c_str());
-	}
 	return (count >= _count);
 }

--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@ -512,10 +512,10 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution( Dat
 				beginKey = keyServers.end()[-1].key;
 				break;
 			} catch (Error& e) {
-				wait( tr.onError(e) );
+				TraceEvent("GetInitialTeamsKeyServersRetry", distributorId).error(e);

+				wait( tr.onError(e) );
 				ASSERT(!succeeded); //We shouldn't be retrying if we have already started modifying result in this loop
-				TraceEvent("GetInitialTeamsKeyServersRetry", distributorId);
 			}
 		}

@ -2075,7 +2075,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 				for (auto& server : serverTeam) {
 					score += server_info[server]->teams.size();
 				}
-				TraceEvent("BuildServerTeams")
+				TraceEvent(SevDebug, "BuildServerTeams")
 				    .detail("Score", score)
 				    .detail("BestScore", bestScore)
 				    .detail("TeamSize", serverTeam.size())
@ -4975,6 +4975,38 @@ ACTOR Future<Void> cacheServerWatcher(Database* db) {
 	}
 }

+// Returns the shardBytes of the element that would sit at index size()/2 if metricVec were sorted by shardBytes
+// (the upper of the two middle elements for an even count), or 0 when metricVec is empty.
+// std::nth_element partially reorders the elements in place.
+// NOTE(review): VectorRef is passed by value; presumably it still refers to the caller's storage, so the
+// caller's element order changes too -- confirm.
+static int64_t getMedianShardSize(VectorRef<DDMetricsRef> metricVec) {
+	// Guard the empty case: indexing the middle element below would otherwise read out of bounds.
+	if (metricVec.empty()) {
+		return 0;
+	}
+	auto median = metricVec.begin() + metricVec.size() / 2;
+	std::nth_element(metricVec.begin(), median, metricVec.end(),
+	                 [](const DDMetricsRef& lhs, const DDMetricsRef& rhs) { return lhs.shardBytes < rhs.shardBytes; });
+	return median->shardBytes;
+}
+
+// Answers one GetDataDistributorMetricsRequest: asks getShardMetricsList for the metrics of req.keys (passing
+// req.shardLimit) and replies on req.reply.
+//  - If the lookup produced an error, that error is sent back to the requester.
+//  - If req.midOnly is false, the reply carries the full list in storageMetricsList.
+//  - If req.midOnly is true, the reply carries only midShardSize: the value from getMedianShardSize(), or 0
+//    when the list is empty.
+// NOTE(review): brokenPromiseToNever presumably turns broken_promise into a future that never fires, in which
+// case no reply would be sent from here -- confirm.
+ACTOR Future<Void> ddGetMetrics(GetDataDistributorMetricsRequest req, PromiseStream<GetMetricsListRequest> getShardMetricsList) {
+	ErrorOr<Standalone<VectorRef<DDMetricsRef>>> result = wait(errorOr(brokenPromiseToNever(
+		getShardMetricsList.getReply(GetMetricsListRequest(req.keys, req.shardLimit)))));
+
+	if(result.isError()) {
+		req.reply.sendError(result.getError());
+	} else {
+		GetDataDistributorMetricsReply rep;
+		if(!req.midOnly) {
+			rep.storageMetricsList = result.get();
+		}
+		else {
+			auto& metricVec = result.get();
+			// The empty list is handled here, so getMedianShardSize is only called with at least one element.
+			if(metricVec.empty()) rep.midShardSize = 0;
+			else {
+				rep.midShardSize = getMedianShardSize(metricVec.contents());
+			}
+		}
+		req.reply.send(rep);
+	}
+
+	return Void();
+}
+
 ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<struct ServerDBInfo>> db ) {
 	state Reference<DataDistributorData> self( new DataDistributorData(db, di.id()) );
 	state Future<Void> collection = actorCollection( self->addActor.getFuture() );
@ -5000,16 +5032,8 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
 				TraceEvent("DataDistributorHalted", di.id()).detail("ReqID", req.requesterID);
 				break;
 			}
-			when ( state GetDataDistributorMetricsRequest req = waitNext(di.dataDistributorMetrics.getFuture()) ) {
-				ErrorOr<Standalone<VectorRef<DDMetricsRef>>> result = wait(errorOr(brokenPromiseToNever(
-				    getShardMetricsList.getReply(GetMetricsListRequest(req.keys, req.shardLimit)))));
-				if ( result.isError() ) {
-					req.reply.sendError(result.getError());
-				} else {
-					GetDataDistributorMetricsReply rep;
-					rep.storageMetricsList = result.get();
-					req.reply.send(rep);
-				}
+			when(GetDataDistributorMetricsRequest req = waitNext(di.dataDistributorMetrics.getFuture())) {
+				actors.add(ddGetMetrics(req, getShardMetricsList));
 			}
 			when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
 				actors.add(ddSnapCreate(snapReq, db));
--- a/fdbserver/DataDistributorInterface.h
+++ b/fdbserver/DataDistributorInterface.h
@ -70,12 +70,13 @@ struct HaltDataDistributorRequest {
 struct GetDataDistributorMetricsReply {
 	constexpr static FileIdentifier file_identifier = 1284337;
 	Standalone<VectorRef<DDMetricsRef>> storageMetricsList;
+	Optional<int64_t> midShardSize;

 	GetDataDistributorMetricsReply() {}

 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar,storageMetricsList);
+		serializer(ar,storageMetricsList, midShardSize);
 	}
 };

@ -84,13 +85,15 @@ struct GetDataDistributorMetricsRequest {
 	KeyRange keys;
 	int shardLimit;
 	ReplyPromise<struct GetDataDistributorMetricsReply> reply;
+	bool midOnly = false;

 	GetDataDistributorMetricsRequest() {}
-	explicit GetDataDistributorMetricsRequest(KeyRange const& keys, const int shardLimit) : keys(keys), shardLimit(shardLimit) {}
+	explicit GetDataDistributorMetricsRequest(KeyRange const& keys, const int shardLimit, bool midOnly = false)
+	  : keys(keys), shardLimit(shardLimit), midOnly(midOnly) {}

 	template<class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, keys, shardLimit, reply);
+		serializer(ar, keys, shardLimit, reply, midOnly);
 	}
 };

--- a/fdbserver/IPager.h
+++ b/fdbserver/IPager.h
@ -105,6 +105,10 @@ public:
 	// Free pageID to be used again after the commit that moves oldestVersion past v
 	virtual void freePage(LogicalPageID pageID, Version v) = 0;

+	// If id is remapped, delete the original as of version v and return the page it was remapped to.  The caller
+	// is then responsible for referencing and deleting the returned page ID.
+	virtual LogicalPageID detachRemappedPage(LogicalPageID id, Version v) = 0;
+
 	// Returns the latest data (regardless of version) for a page by LogicalPageID
 	// The data returned will be the later of
 	//   - the most recent committed atomic
@ -133,7 +137,7 @@ public:

 	virtual StorageBytes getStorageBytes() const = 0;

-	// Count of pages in use by the pager client
+	// Count of pages in use by the pager client (including retained old page versions)
 	virtual Future<int64_t> getUserPageCount() = 0;

 	// Future returned is ready when pager has been initialized from disk and is ready for reads and writes.
--- a/fdbserver/KeyValueStoreRocksDB.actor.cpp
+++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp
@ -2,6 +2,7 @@

 #include <rocksdb/db.h>
 #include <rocksdb/options.h>
+#include <rocksdb/utilities/table_properties_collectors.h>
 #include "flow/flow.h"
 #include "flow/IThreadPool.h"

@ -22,14 +23,23 @@ StringRef toStringRef(rocksdb::Slice s) {
 	return StringRef(reinterpret_cast<const uint8_t*>(s.data()), s.size());
 }

-rocksdb::Options getOptions() {
-	rocksdb::Options options;
-	options.create_if_missing = true;
+rocksdb::ColumnFamilyOptions getCFOptions() {
+	rocksdb::ColumnFamilyOptions options;
+	options.level_compaction_dynamic_level_bytes = true;
+	options.OptimizeLevelStyleCompaction(SERVER_KNOBS->ROCKSDB_MEMTABLE_BYTES);
+	// Compact sstables when there's too much deleted stuff.
+	options.table_properties_collector_factories = { rocksdb::NewCompactOnDeletionCollectorFactory(128, 1) };
 	return options;
 }

-rocksdb::ColumnFamilyOptions getCFOptions() {
-	return {};
+rocksdb::Options getOptions() {
+	rocksdb::Options options({}, getCFOptions());
+	options.avoid_unnecessary_blocking_io = true;
+	options.create_if_missing = true;
+	if (SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM > 0) {
+		options.IncreaseParallelism(SERVER_KNOBS->ROCKSDB_BACKGROUND_PARALLELISM);
+	}
+	return options;
 }

 struct RocksDBKeyValueStore : IKeyValueStore {
@ -119,7 +129,6 @@ struct RocksDBKeyValueStore : IKeyValueStore {

 	struct Reader : IThreadPoolReceiver {
 		DB& db;
-		rocksdb::ReadOptions readOptions;

 		explicit Reader(DB& db) : db(db) {}

@ -141,7 +150,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
 				traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.Before");
 			}
 			rocksdb::PinnableSlice value;
-			auto s = db->Get(readOptions, db->DefaultColumnFamily(), toSlice(a.key), &value);
+			auto s = db->Get({}, db->DefaultColumnFamily(), toSlice(a.key), &value);
 			if (a.debugID.present()) {
 				traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.After");
 				traceBatch.get().dump();
@ -172,7 +181,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
 				traceBatch.get().addEvent("GetValuePrefixDebug", a.debugID.get().first(),
 				                          "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
 			}
-			auto s = db->Get(readOptions, db->DefaultColumnFamily(), toSlice(a.key), &value);
+			auto s = db->Get({}, db->DefaultColumnFamily(), toSlice(a.key), &value);
 			if (a.debugID.present()) {
 				traceBatch.get().addEvent("GetValuePrefixDebug", a.debugID.get().first(),
 				                          "Reader.After"); //.detail("TaskID", g_network->getCurrentTask());
@ -195,33 +204,51 @@ struct RocksDBKeyValueStore : IKeyValueStore {
 			virtual double getTimeEstimate() { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
 		};
 		void action(ReadRangeAction& a) {
-			auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(readOptions));
 			Standalone<RangeResultRef> result;
+			// A zero row or byte limit can only ever yield an empty result, so reply with it and stop here.
+			// Without the return, control falls into the scan below, which opens an iterator and copies a
+			// row into `result` before it ever checks the limits, for a request that was already answered.
+			if (a.rowLimit == 0 || a.byteLimit == 0) {
+				a.result.send(result);
+				return;
+			}
 			int accumulatedBytes = 0;
+			rocksdb::Status s;
 			if (a.rowLimit >= 0) {
+				rocksdb::ReadOptions options;
+				auto endSlice = toSlice(a.keys.end);
+				options.iterate_upper_bound = &endSlice;
+				auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options));
 				cursor->Seek(toSlice(a.keys.begin));
-				while (cursor->Valid() && toStringRef(cursor->key()) < a.keys.end && result.size() < a.rowLimit &&
-				       accumulatedBytes < a.byteLimit) {
+				while (cursor->Valid() && toStringRef(cursor->key()) < a.keys.end) {
 					KeyValueRef kv(toStringRef(cursor->key()), toStringRef(cursor->value()));
 					accumulatedBytes += sizeof(KeyValueRef) + kv.expectedSize();
 					result.push_back_deep(result.arena(), kv);
+					// Calling `cursor->Next()` is potentially expensive, so short-circuit here just in case.
+					if (result.size() >= a.rowLimit || accumulatedBytes >= a.byteLimit) {
+						break;
+					}
 					cursor->Next();
 				}
+				s = cursor->status();
 			} else {
+				rocksdb::ReadOptions options;
+				auto beginSlice = toSlice(a.keys.begin);
+				options.iterate_lower_bound = &beginSlice;
+				auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options));
 				cursor->SeekForPrev(toSlice(a.keys.end));
 				if (cursor->Valid() && toStringRef(cursor->key()) == a.keys.end) {
 					cursor->Prev();
 				}
-
-				while (cursor->Valid() && toStringRef(cursor->key()) >= a.keys.begin && result.size() < -a.rowLimit &&
-				       accumulatedBytes < a.byteLimit) {
+				while (cursor->Valid() && toStringRef(cursor->key()) >= a.keys.begin) {
 					KeyValueRef kv(toStringRef(cursor->key()), toStringRef(cursor->value()));
 					accumulatedBytes += sizeof(KeyValueRef) + kv.expectedSize();
 					result.push_back_deep(result.arena(), kv);
+					// Calling `cursor->Prev()` is potentially expensive, so short-circuit here just in case.
+					if (result.size() >= -a.rowLimit || accumulatedBytes >= a.byteLimit) {
+						break;
+					}
 					cursor->Prev();
 				}
+				s = cursor->status();
 			}
-			auto s = cursor->status();
+
 			if (!s.ok()) {
 				TraceEvent(SevError, "RocksDBError").detail("Error", s.ToString()).detail("Method", "ReadRange");
 			}
--- a/fdbserver/Knobs.cpp
+++ b/fdbserver/Knobs.cpp
@ -93,7 +93,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( PEEK_RESET_INTERVAL,                                 300.0 ); if ( randomize && BUGGIFY ) PEEK_RESET_INTERVAL = 20.0;
 	init( PEEK_MAX_LATENCY,                                      0.5 ); if ( randomize && BUGGIFY ) PEEK_MAX_LATENCY = 0.0;
 	init( PEEK_COUNT_SMALL_MESSAGES,                           false ); if ( randomize && BUGGIFY ) PEEK_COUNT_SMALL_MESSAGES = true;
-	init( PEEK_STATS_INTERVAL,                                  10.0 ); 
+	init( PEEK_STATS_INTERVAL,                                  10.0 );
 	init( PEEK_STATS_SLOW_AMOUNT,                                  0 );
 	init( PEEK_STATS_SLOW_RATIO,                                 0.5 );

@ -236,7 +236,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( DD_VALIDATE_LOCALITY,                                 true ); if( randomize && BUGGIFY ) DD_VALIDATE_LOCALITY = false;
 	init( DD_CHECK_INVALID_LOCALITY_DELAY,                       60  ); if( randomize && BUGGIFY ) DD_CHECK_INVALID_LOCALITY_DELAY = 1 + deterministicRandom()->random01() * 600;
 	init( DD_ENABLE_VERBOSE_TRACING,                           false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true;
-	init( DD_SS_FAILURE_VERSIONLAG,                        250000000 ); 
+	init( DD_SS_FAILURE_VERSIONLAG,                        250000000 );
 	init( DD_SS_ALLOWED_VERSIONLAG,                        200000000 ); if( randomize && BUGGIFY ) { DD_SS_FAILURE_VERSIONLAG = deterministicRandom()->randomInt(15000000, 500000000); DD_SS_ALLOWED_VERSIONLAG = 0.75 * DD_SS_FAILURE_VERSIONLAG; }
 	init( DD_SS_STUCK_TIME_LIMIT,                              300.0 ); if( randomize && BUGGIFY ) { DD_SS_STUCK_TIME_LIMIT = 200.0 + deterministicRandom()->random01() * 100.0; }

@ -308,6 +308,10 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	// KeyValueStoreMemory
 	init( REPLACE_CONTENTS_BYTES,                                1e5 );

+	// KeyValueStoreRocksDB
+	init( ROCKSDB_BACKGROUND_PARALLELISM,                          0 );
+	init( ROCKSDB_MEMTABLE_BYTES,                  512 * 1024 * 1024 );
+
 	// Leader election
 	bool longLeaderElection = randomize && BUGGIFY;
 	init( MAX_NOTIFICATIONS,                                  100000 );
@ -555,9 +559,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( BEHIND_CHECK_COUNT,                                      2 );
 	init( BEHIND_CHECK_VERSIONS,             5 * VERSIONS_PER_SECOND );
 	init( WAIT_METRICS_WRONG_SHARD_CHANCE,   isSimulated ? 1.0 : 0.1 );
-	init( MIN_TAG_PAGES_READ_RATE,                             1.0e4 ); if( randomize && BUGGIFY ) MIN_TAG_PAGES_READ_RATE = 0;
-	init( READ_TAG_MEASUREMENT_INTERVAL,                        30.0 ); if( randomize && BUGGIFY ) READ_TAG_MEASUREMENT_INTERVAL = 1.0;
-	init( OPERATION_COST_BYTE_FACTOR,                          16384 ); if( randomize && BUGGIFY ) OPERATION_COST_BYTE_FACTOR = 4096;
+	init( MIN_TAG_PAGES_RATE,                             1.0e4 ); if( randomize && BUGGIFY ) MIN_TAG_PAGES_RATE = 0;
+	init( TAG_MEASUREMENT_INTERVAL,                        30.0 ); if( randomize && BUGGIFY ) TAG_MEASUREMENT_INTERVAL = 1.0;
+	init( READ_COST_BYTE_FACTOR,                          16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
 	init( PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS,                    true ); if( randomize && BUGGIFY ) PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS = false;

 	//Wait Failure
@ -573,7 +577,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS,         10 );
 	init( TRACE_LOG_PING_TIMEOUT_SECONDS,                        5.0 );
 	init( MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS,             10.0 );
-	init( MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS,             30.0 ); 
+	init( MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS,             30.0 );
 	init( DBINFO_FAILED_DELAY,                                   1.0 );

 	// Test harness
@ -650,13 +654,18 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
 	init( REDWOOD_REMAP_CLEANUP_WINDOW,                           50 );
 	init( REDWOOD_REMAP_CLEANUP_LAG,                             0.1 );
 	init( REDWOOD_LOGGING_INTERVAL,                              5.0 );
-	
+
 	// Server request latency measurement
 	init( LATENCY_SAMPLE_SIZE,                                100000 );
 	init( LATENCY_METRICS_LOGGING_INTERVAL,                     60.0 );

 	// clang-format on

-	if(clientKnobs)
-		clientKnobs->IS_ACCEPTABLE_DELAY = clientKnobs->IS_ACCEPTABLE_DELAY*std::min(MAX_READ_TRANSACTION_LIFE_VERSIONS, MAX_WRITE_TRANSACTION_LIFE_VERSIONS)/(5.0*VERSIONS_PER_SECOND);
+	if(clientKnobs) {
+		clientKnobs->IS_ACCEPTABLE_DELAY =
+		    clientKnobs->IS_ACCEPTABLE_DELAY *
+		    std::min(MAX_READ_TRANSACTION_LIFE_VERSIONS, MAX_WRITE_TRANSACTION_LIFE_VERSIONS) /
+		    (5.0 * VERSIONS_PER_SECOND);
+		clientKnobs->INIT_MID_SHARD_BYTES = MIN_SHARD_BYTES;
+	}
 }
--- a/fdbserver/Knobs.h
+++ b/fdbserver/Knobs.h
@ -243,6 +243,10 @@ public:
 	// KeyValueStoreMemory
 	int64_t REPLACE_CONTENTS_BYTES;

+	// KeyValueStoreRocksDB
+	int ROCKSDB_BACKGROUND_PARALLELISM;
+	int64_t ROCKSDB_MEMTABLE_BYTES;
+
 	// Leader election
 	int MAX_NOTIFICATIONS;
 	int MIN_NOTIFICATIONS;
@ -484,9 +488,9 @@ public:
 	int BEHIND_CHECK_COUNT;
 	int64_t BEHIND_CHECK_VERSIONS;
 	double WAIT_METRICS_WRONG_SHARD_CHANCE;
-	int64_t MIN_TAG_PAGES_READ_RATE;
-	double READ_TAG_MEASUREMENT_INTERVAL;
-	int64_t OPERATION_COST_BYTE_FACTOR;
+	int64_t MIN_TAG_PAGES_RATE;
+	double TAG_MEASUREMENT_INTERVAL;
+	int64_t READ_COST_BYTE_FACTOR;
 	bool PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS;

 	//Wait Failure
--- a/fdbserver/MasterProxyServer.actor.cpp
+++ b/fdbserver/MasterProxyServer.actor.cpp
@ -158,7 +158,7 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
                           GetHealthMetricsReply* detailedHealthMetricsReply,
                           TransactionTagMap<uint64_t>* transactionTagCounter,
                           PrioritizedTransactionTagMap<ClientTagThrottleLimits>* throttledTags,
-                           TransactionTagMap<TransactionCommitCostEstimation>* transactionTagCommitCostEst) {
+                           UIDTransactionTagMap<TransactionCommitCostEstimation>* ssTrTagCommitCost) {
 	state Future<Void> nextRequestTimer = Never();
 	state Future<Void> leaseTimeout = Never();
 	state Future<GetRateInfoReply> reply = Never();
@ -191,9 +191,9 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
 			}
 			reply = brokenPromiseToNever(db->get().ratekeeper.get().getRateInfo.getReply(
 			    GetRateInfoRequest(myID, *inTransactionCount, *inBatchTransactionCount, tagCounts,
-			                       *transactionTagCommitCostEst, detailed)));
+			                       *ssTrTagCommitCost, detailed)));
 			transactionTagCounter->clear();
-			transactionTagCommitCostEst->clear();
+			ssTrTagCommitCost->clear();
 			expectingDetailedReply = detailed;
 		}
 		when ( GetRateInfoReply rep = wait(reply) ) {
@ -405,6 +405,49 @@ struct ResolutionRequestBuilder {
 	}
 };

+ACTOR Future<Void> monitorDDMetricsChanges(int64_t* midShardSize, Reference<AsyncVar<ServerDBInfo>> db) { // Keeps *midShardSize updated from the data distributor's metrics replies; only errors other than timed_out / dd_not_found escape the loop.
+	state Future<Void> nextRequestTimer = Never(); // becomes ready when the next metrics request should be sent
+	state Future<GetDataDistributorMetricsReply> nextReply = Never(); // the outstanding metrics request, if any
+	// Send the first request right away only when a data distributor is already known.
+	if(db->get().distributor.present()) nextRequestTimer = Void();
+	loop {
+		try {
+			choose {
+				when(wait(db->onChange())) { // ServerDBInfo changed: re-check whether a distributor is present
+					if ( db->get().distributor.present() ) {
+						TraceEvent("DataDistributorChanged", db->get().id)
+							.detail("DDID", db->get().distributor.get().id());
+						nextRequestTimer = Void(); // ask the (possibly new) distributor immediately
+					} else {
+						TraceEvent("DataDistributorDied", db->get().id);
+						nextRequestTimer = Never(); // nothing to ask until a distributor shows up again
+					}
+					nextReply = Never(); // stop waiting on any request issued before the change
+				}
+				when(wait(nextRequestTimer)) {
+					nextRequestTimer = Never(); // re-armed only after a reply or a tolerated error
+					if(db->get().distributor.present()) {
+						nextReply = brokenPromiseToNever(db->get().distributor.get().dataDistributorMetrics.getReply(
+						    GetDataDistributorMetricsRequest(normalKeys, CLIENT_KNOBS->TOO_MANY, true))); // NOTE(review): the trailing `true` presumably asks for the mid shard size only - confirm against GetDataDistributorMetricsRequest
+					} else nextReply = Never();
+				}
+				when(GetDataDistributorMetricsReply reply = wait(nextReply)) {
+					nextReply = Never();
+					ASSERT(reply.midShardSize.present());
+					*midShardSize = reply.midShardSize.get(); // publish the new value through the caller-owned pointer
+					nextRequestTimer = delay(CLIENT_KNOBS->MID_SHARD_SIZE_MAX_STALENESS); // schedule the next refresh
+				}
+			}
+		} catch (Error& e) {
+			TraceEvent("DDMidShardSizeUpdateFail").error(e);
+			if(e.code() != error_code_timed_out && e.code() != error_code_dd_not_found) // only these two errors are retried
+				throw ;
+			nextRequestTimer = delay(CLIENT_KNOBS->MID_SHARD_SIZE_MAX_STALENESS); // retry after the staleness interval
+			nextReply = Never();
+		}
+	}
+}
+
 ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std::pair<std::vector<CommitTransactionRequest>, int> > out, FutureStream<CommitTransactionRequest> in, int desiredBytes, int64_t memBytesLimit) {
 	wait(delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher));

@ -943,7 +986,7 @@ void determineCommittedTransactions(CommitBatchContext* self) {
 	self->lockedKey = pProxyCommitData->txnStateStore->readValue(databaseLockedKey).get();
 	self->locked = self->lockedKey.present() && self->lockedKey.get().size();

-	const auto& mustContainSystemKey = pProxyCommitData->txnStateStore->readValue(mustContainSystemMutationsKey).get();
+	const Optional<Value> mustContainSystemKey = pProxyCommitData->txnStateStore->readValue(mustContainSystemMutationsKey).get();
 	if (mustContainSystemKey.present() && mustContainSystemKey.get().size()) {
 		for (int t = 0; t < trs.size(); t++) {
 			if( self->committed[t] == ConflictBatch::TransactionCommitted ) {
@ -1016,6 +1059,8 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 			continue;
 		}

+		state bool checkSample = trs[self->transactionNum].commitCostEstimation.present();
+		state Optional<ClientTrCommitCostEstimation>* trCost = &trs[self->transactionNum].commitCostEstimation;
 		state int mutationNum = 0;
 		state VectorRef<MutationRef>* pMutations = &trs[self->transactionNum].transaction.mutations;
 		for (; mutationNum < pMutations->size(); mutationNum++) {
@ -1038,6 +1083,25 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 			if (isSingleKeyMutation((MutationRef::Type) m.type)) {
 				auto& tags = pProxyCommitData->tagsForKey(m.param1);

+				// sample single key mutation based on cost
+				// in expectation, one mutation is sampled per COMMIT_SAMPLE_COST units of write cost
+				if (checkSample) {
+					double totalCosts = trCost->get().writeCosts;
+					double cost = getWriteOperationCost(m.expectedSize());
+					double mul = std::max(1.0, totalCosts / std::max(1.0, (double)CLIENT_KNOBS->COMMIT_SAMPLE_COST));
+					ASSERT(totalCosts > 0);
+					double prob = mul * cost / totalCosts;
+
+					if(deterministicRandom()->random01() < prob) {
+						for(const auto& ssInfo : pProxyCommitData->keyInfo[m.param1].src_info) {
+							auto id = ssInfo->interf.id();
+							// scale cost
+							cost = cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost;
+							pProxyCommitData->updateSSTagCost(id, trs[self->transactionNum].tagSet.get(), m, cost);
+						}
+					}
+				}
+
 				if(pProxyCommitData->singleKeyMutationEvent->enabled) {
 					KeyRangeRef shard = pProxyCommitData->keyInfo.rangeContaining(m.param1).range();
 					pProxyCommitData->singleKeyMutationEvent->tag1 = (int64_t)tags[0].id;
@ -1066,6 +1130,15 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {

 					ranges.begin().value().populateTags();
 					self->toCommit.addTags(ranges.begin().value().tags);
+
+					// check whether clear is sampled
+					if(checkSample && !trCost->get().clearIdxCosts.empty() && trCost->get().clearIdxCosts[0].first == mutationNum) {
+						for(const auto& ssInfo : ranges.begin().value().src_info) {
+							auto id = ssInfo->interf.id();
+							pProxyCommitData->updateSSTagCost(id, trs[self->transactionNum].tagSet.get(), m, trCost->get().clearIdxCosts[0].second);
+						}
+						trCost->get().clearIdxCosts.pop_front();
+					}
 				}
 				else {
 					TEST(true); //A clear range extends past a shard boundary
@ -1073,6 +1146,15 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 					for (auto r : ranges) {
 						r.value().populateTags();
 						allSources.insert(r.value().tags.begin(), r.value().tags.end());
+
+						// check whether clear is sampled
+						if(checkSample && !trCost->get().clearIdxCosts.empty() && trCost->get().clearIdxCosts[0].first == mutationNum) {
+							for(const auto& ssInfo : r.value().src_info) {
+								auto id = ssInfo->interf.id();
+								pProxyCommitData->updateSSTagCost(id, trs[self->transactionNum].tagSet.get(), m, trCost->get().clearIdxCosts[0].second);
+							}
+							trCost->get().clearIdxCosts.pop_front();
+						}
 					}
 					DEBUG_MUTATION("ProxyCommit", self->commitVersion, m).detail("Dbgid", pProxyCommitData->dbgid).detail("To", allSources).detail("Mutation", m);

@ -1121,6 +1203,11 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
 				}
 			}
 		}
+
+		if(checkSample) {
+			self->pProxyCommitData->stats.txnExpensiveClearCostEstCount +=
+			    trs[self->transactionNum].commitCostEstimation.get().expensiveCostEstCount;
+		}
 	}

 	return Void();
@ -1339,15 +1426,6 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
 		if (self->committed[t] == ConflictBatch::TransactionCommitted && (!self->locked || tr.isLockAware())) {
 			ASSERT_WE_THINK(self->commitVersion != invalidVersion);
 			tr.reply.send(CommitID(self->commitVersion, t, self->metadataVersionAfter));
-
-			// aggregate commit cost estimation if committed
-			ASSERT(tr.commitCostEstimation.present() == tr.tagSet.present());
-			if (tr.tagSet.present()) {
-				TransactionCommitCostEstimation& costEstimation = tr.commitCostEstimation.get();
-				for (auto& tag : tr.tagSet.get()) {
-					pProxyCommitData->transactionTagCommitCostEst[tag] += costEstimation;
-				}
-			}
 		}
 		else if (self->committed[t] == ConflictBatch::TransactionTooOld) {
 			tr.reply.sendError(transaction_too_old());
@ -1440,6 +1518,7 @@ ACTOR Future<Void> commitBatch(

 	context.pProxyCommitData->lastVersionTime = context.startTime;
 	++context.pProxyCommitData->stats.commitBatchIn;
+	context.setupTraceBatch();

 	/////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield)
 	wait(CommitBatch::preresolutionProcessing(&context));
@ -1516,7 +1595,8 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanID parentSpan, Pro
 }

 ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture, std::vector<GetReadVersionRequest> requests,
-                                  ProxyStats* stats, Version minKnownCommittedVersion, PrioritizedTransactionTagMap<ClientTagThrottleLimits> throttledTags) {
+                                  ProxyStats* stats, Version minKnownCommittedVersion,
+                                  PrioritizedTransactionTagMap<ClientTagThrottleLimits> throttledTags, int64_t midShardSize = 0) {
 	GetReadVersionReply _reply = wait(replyFuture);
 	GetReadVersionReply reply = _reply;
 	Version replyVersion = reply.version;
@ -1538,7 +1618,7 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture, std::
 		else {
 			reply.version = replyVersion;
 		}
-
+		reply.midShardSize = midShardSize;
 		reply.tagThrottleInfo.clear();

 		if(!request.tags.empty()) {
@ -1597,9 +1677,12 @@ ACTOR static Future<Void> transactionStarter(
 	state PromiseStream<double> replyTimes;
 	state Span span;

+	state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES;
+	addActor.send(monitorDDMetricsChanges(&midShardSize, db));
+
 	addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo, &batchRateInfo,
 	                      healthMetricsReply, detailedHealthMetricsReply, &transactionTagCounter, &throttledTags,
-	                      &(commitData->transactionTagCommitCostEst)));
+	                      &(commitData->ssTrTagCommitCost)));
 	addActor.send(queueTransactionStartRequests(db, &systemQueue, &defaultQueue, &batchQueue, proxy.getConsistentReadVersion.getFuture(),
 	                                            GRVTimer, &lastGRVTime, &GRVBatchTime, replyTimes.getFuture(), &commitData->stats, &batchRateInfo,
 	                                            &transactionTagCounter));
@ -1712,7 +1795,7 @@ ACTOR static Future<Void> transactionStarter(
 				    span.context, commitData, i, debugID, transactionsStarted[i], systemTransactionsStarted[i],
 				    defaultPriTransactionsStarted[i], batchPriTransactionsStarted[i]);
 				addActor.send(sendGrvReplies(readVersionReply, start[i], &commitData->stats,
-				                             commitData->minKnownCommittedVersion, throttledTags));
+				                             commitData->minKnownCommittedVersion, throttledTags, midShardSize));

 				// for now, base dynamic batching on the time for normal requests (not read_risky)
 				if (i == 0) {
@ -1879,8 +1962,14 @@ ACTOR Future<Void> ddMetricsRequestServer(MasterProxyInterface proxy, Reference<
 		choose {
 			when(state GetDDMetricsRequest req = waitNext(proxy.getDDMetrics.getFuture()))
 			{
-				ErrorOr<GetDataDistributorMetricsReply> reply = wait(errorOr(db->get().distributor.get().dataDistributorMetrics.getReply(GetDataDistributorMetricsRequest(req.keys, req.shardLimit))));
-				if ( reply.isError() ) {
+				if(!db->get().distributor.present()) {
+					req.reply.sendError(dd_not_found());
+					continue;
+				}
+				ErrorOr<GetDataDistributorMetricsReply> reply =
+				    wait(errorOr(db->get().distributor.get().dataDistributorMetrics.getReply(
+				        GetDataDistributorMetricsRequest(req.keys, req.shardLimit))));
+				if (reply.isError()) {
 					req.reply.sendError(reply.getError());
 				} else {
 					GetDDMetricsReply newReply;
@ -2011,7 +2100,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
 		// send a snap request to DD
 		if (!commitData->db->get().distributor.present()) {
 			TraceEvent(SevWarnAlways, "DataDistributorNotPresent").detail("Operation", "SnapRequest");
-			throw operation_failed();
+			throw dd_not_found();
 		}
 		state Future<ErrorOr<Void>> ddSnapReq =
 			commitData->db->get().distributor.get().distributorSnapReq.tryGetReply(DistributorSnapRequest(snapReq.snapPayload, snapReq.snapUID));
--- a/fdbserver/ProxyCommitData.actor.h
+++ b/fdbserver/ProxyCommitData.actor.h
@ -63,6 +63,7 @@ struct ProxyStats {
 	Counter mutations;
 	Counter conflictRanges;
 	Counter keyServerLocationIn, keyServerLocationOut, keyServerLocationErrors;
+	Counter txnExpensiveClearCostEstCount;
 	Version lastCommitVersionAssigned;

 	LatencySample commitLatencySample;
@ -119,6 +120,7 @@ struct ProxyStats {
 	    conflictRanges("ConflictRanges", cc), keyServerLocationIn("KeyServerLocationIn", cc),
 	    keyServerLocationOut("KeyServerLocationOut", cc), keyServerLocationErrors("KeyServerLocationErrors", cc),
 	    lastCommitVersionAssigned(0),
+	    txnExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
 	    commitLatencySample("CommitLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
 	                        SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
 	    grvLatencySample("GRVLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
@ -190,7 +192,7 @@ struct ProxyCommitData {
 	NotifiedDouble lastCommitTime;

 	vector<double> commitComputePerOperation;
-	TransactionTagMap<TransactionCommitCostEstimation> transactionTagCommitCostEst;
+	UIDTransactionTagMap<TransactionCommitCostEstimation> ssTrTagCommitCost;

 	// The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly
 	// more CPU efficient. When a tag related to a storage server does change, we empty out all of these vectors to
@ -199,13 +201,7 @@ struct ProxyCommitData {
 		auto& tags = keyInfo[key].tags;
 		if (!tags.size()) {
 			auto& r = keyInfo.rangeContaining(key).value();
-			for (auto info : r.src_info) {
-				r.tags.push_back(info->tag);
-			}
-			for (auto info : r.dest_info) {
-				r.tags.push_back(info->tag);
-			}
-			uniquify(r.tags);
+			r.populateTags();
 			return r.tags;
 		}
 		return tags;
@ -249,6 +245,18 @@ struct ProxyCommitData {
 		latencyBandConfig = newLatencyBandConfig;
 	}

+	// Records one sampled mutation against storage server `id` for every tag in `tagSet`.
+	// A per-tag entry is looked up (and, assuming map-like operator[], created) for each tag;
+	// its op count and cost sum are only increased when `m` is an atomic op, a SetValue or a ClearRange.
+	void updateSSTagCost(const UID& id, const TagSet& tagSet, MutationRef m, int cost){
+		// try_emplace keeps an existing per-server map and otherwise inserts an empty one
+		auto& perTagCosts =
+		    ssTrTagCommitCost.try_emplace(id, TransactionTagMap<TransactionCommitCostEstimation>()).first->second;
+		const bool countsAsWrite =
+		    m.isAtomicOp() || m.type == MutationRef::Type::SetValue || m.type == MutationRef::Type::ClearRange;
+
+		for(auto& tag: tagSet) {
+			auto& entry = perTagCosts[tag];
+			if(!countsAsWrite) continue;
+			entry.opsSum += 1;
+			entry.costSum += cost;
+		}
+	}
+
 	ProxyCommitData(UID dbgid, MasterInterface master, RequestStream<GetReadVersionRequest> getConsistentReadVersion,
 	                Version recoveryTransactionVersion, RequestStream<CommitTransactionRequest> commit,
 	                Reference<AsyncVar<ServerDBInfo>> db, bool firstProxy)
--- a/fdbserver/Ratekeeper.actor.cpp
+++ b/fdbserver/Ratekeeper.actor.cpp
@ -97,17 +97,21 @@ struct StorageQueueInfo {
 	Smoother smoothTotalSpace;
 	limitReason_t limitReason;

-	Optional<TransactionTag> busiestTag;
-	double busiestTagFractionalBusyness;
-	double busiestTagRate;
+	Optional<TransactionTag> busiestReadTag, busiestWriteTag;
+	double busiestReadTagFractionalBusyness = 0, busiestWriteTagFractionalBusyness = 0;
+	double busiestReadTagRate = 0, busiestWriteTagRate = 0;
+
+	// refresh periodically
+	TransactionTagMap<TransactionCommitCostEstimation> tagCostEst;
+	uint64_t totalWriteCosts = 0;
+	int totalWriteOps = 0;

 	StorageQueueInfo(UID id, LocalityData locality)
 	  : valid(false), id(id), locality(locality), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
 	    smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
 	    smoothDurableVersion(SERVER_KNOBS->SMOOTHING_AMOUNT),
 	    smoothLatestVersion(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT),
-	    smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT), limitReason(limitReason_t::unlimited), busiestTagFractionalBusyness(0),
-		busiestTagRate(0) {
+	    smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT), limitReason(limitReason_t::unlimited) {
 		// FIXME: this is a tacky workaround for a potential uninitialized use in trackStorageServerQueueInfo
 		lastReply.instanceID = -1;
 	}
@ -338,7 +342,7 @@ public:
 		return Optional<ClientTagThrottleLimits>();
 	}

-	PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() {
+	PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates(bool autoThrottlingEnabled) {
 		PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientRates;

 		for(auto tagItr = tagData.begin(); tagItr != tagData.end();) {
@ -401,14 +405,18 @@ public:
 					}

 					tagPresent = true;
-					auto result = clientRates[TransactionPriority::DEFAULT].try_emplace(tagItr->first, adjustedRate, autoItr->second.limits.expiration);
-					if(!result.second && result.first->second.tpsRate > adjustedRate) {
-						result.first->second = ClientTagThrottleLimits(adjustedRate, autoItr->second.limits.expiration);
+					if (autoThrottlingEnabled) {
+						auto result = clientRates[TransactionPriority::DEFAULT].try_emplace(
+						    tagItr->first, adjustedRate, autoItr->second.limits.expiration);
+						if (!result.second && result.first->second.tpsRate > adjustedRate) {
+							result.first->second =
+							    ClientTagThrottleLimits(adjustedRate, autoItr->second.limits.expiration);
+						} else {
+							TEST(true); // Auto throttle overriden by manual throttle
+						}
+						clientRates[TransactionPriority::BATCH][tagItr->first] =
+						    ClientTagThrottleLimits(0, autoItr->second.limits.expiration);
 					}
-					else {
-						TEST(true); // Auto throttle overriden by manual throttle
-					}
-					clientRates[TransactionPriority::BATCH][tagItr->first] = ClientTagThrottleLimits(0, autoItr->second.limits.expiration);
 				}
 				else {
 					ASSERT(autoItr->second.limits.expiration <= now());
@ -481,6 +489,7 @@ public:
 	TransactionTagMap<RkTagThrottleData> autoThrottledTags;
 	TransactionTagMap<std::map<TransactionPriority, RkTagThrottleData>> manualThrottledTags;
 	TransactionTagMap<RkTagData> tagData;
+	uint32_t busyReadTagCount = 0, busyWriteTagCount = 0;
 };

 struct RatekeeperLimits {
@ -546,6 +555,7 @@ struct RatekeeperData {

 	double lastWarning;
 	double lastSSListFetchedTimestamp;
+	double lastBusiestCommitTagPick;

 	RkTagThrottleCollection throttledTags;
 	uint64_t throttledTagChangeId;
@ -565,7 +575,7 @@ struct RatekeeperData {
 	    smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT),
 	    smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
 	    actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")), lastWarning(0), lastSSListFetchedTimestamp(now()),
-	    throttledTagChangeId(0),
+	    throttledTagChangeId(0), lastBusiestCommitTagPick(0),
 	    normalLimits(TransactionPriority::DEFAULT, "", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER,
 	                 SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG,
 	                 SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE,
@ -611,9 +621,9 @@ ACTOR Future<Void> trackStorageServerQueueInfo( RatekeeperData* self, StorageSer
 					myQueueInfo->value.smoothLatestVersion.setTotal(reply.get().version);
 				}

-				myQueueInfo->value.busiestTag = reply.get().busiestTag;
-				myQueueInfo->value.busiestTagFractionalBusyness = reply.get().busiestTagFractionalBusyness;
-				myQueueInfo->value.busiestTagRate = reply.get().busiestTagRate;
+				myQueueInfo->value.busiestReadTag = reply.get().busiestTag;
+				myQueueInfo->value.busiestReadTagFractionalBusyness = reply.get().busiestTagFractionalBusyness;
+				myQueueInfo->value.busiestReadTagRate = reply.get().busiestTagRate;
 			} else {
 				if(myQueueInfo->value.valid) {
 					TraceEvent("RkStorageServerDidNotRespond", self->id)
@ -787,6 +797,8 @@ ACTOR Future<Void> monitorThrottlingChanges(RatekeeperData *self) {
 						TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", self->id).detail("Value", autoThrottlingEnabled.get().get());
 					}
 					self->autoThrottlingEnabled = SERVER_KNOBS->AUTO_TAG_THROTTLING_ENABLED;
+					if(!committed)
+					    tr.set(tagThrottleAutoEnabledKey, LiteralStringRef(self->autoThrottlingEnabled ? "1" : "0"));
 				}

 				RkTagThrottleCollection updatedTagThrottles;
@ -814,6 +826,12 @@ ACTOR Future<Void> monitorThrottlingChanges(RatekeeperData *self) {

 						if(tagKey.throttleType == TagThrottleType::AUTO) {
 							updatedTagThrottles.autoThrottleTag(self->id, tag, 0, tagValue.tpsRate, tagValue.expirationTime);
+							if(tagValue.reason == TagThrottledReason::BUSY_READ){
+								updatedTagThrottles.busyReadTagCount ++;
+							}
+							else if(tagValue.reason == TagThrottledReason::BUSY_WRITE) {
+								updatedTagThrottles.busyWriteTagCount ++;
+							}
 						}
 						else {
 							updatedTagThrottles.manualThrottleTag(self->id, tag, tagKey.priority, tagValue.tpsRate, tagValue.expirationTime, oldLimits);
@ -840,17 +858,82 @@ ACTOR Future<Void> monitorThrottlingChanges(RatekeeperData *self) {
 	}
 }

-void tryAutoThrottleTag(RatekeeperData *self, StorageQueueInfo const& ss) {
-	if(ss.busiestTag.present() && ss.busiestTagFractionalBusyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && ss.busiestTagRate > SERVER_KNOBS->MIN_TAG_COST) {
-		TEST(true); // Transaction tag auto-throttled
-
-		Optional<double> clientRate = self->throttledTags.autoThrottleTag(self->id, ss.busiestTag.get(), ss.busiestTagFractionalBusyness);
-		if(clientRate.present()) {
-			TagSet tags;
-			tags.addTag(ss.busiestTag.get());
-
-			self->addActor.send(ThrottleApi::throttleTags(self->db, tags, clientRate.get(), SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION, TagThrottleType::AUTO, TransactionPriority::DEFAULT, now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION));
+Future<Void> refreshStorageServerCommitCost(RatekeeperData *self) { // Picks each storage server's busiest write tag from the costs accumulated since the previous call, traces it, then clears the accumulated costs. Always returns a ready Void().
+	if(self->lastBusiestCommitTagPick == 0) { // first call: only record the start time, there is no interval to measure yet
+		self->lastBusiestCommitTagPick = now();
+		return Void();
+	}
+	double elapsed = now() - self->lastBusiestCommitTagPick; // time since the previous pick
+	// for each SS, select the tag with the highest op rate in its tagCostEst
+	for(auto it = self->storageQueueInfo.begin(); it != self->storageQueueInfo.end(); ++it) {
+		it->value.busiestWriteTag.reset(); // NOTE(review): busiestWriteTagFractionalBusyness / busiestWriteTagRate keep their previous values when no tag is reported below
+		TransactionTag busiestTag;
+		TransactionCommitCostEstimation maxCost;
+		double maxRate = 0, maxBusyness = 0;
+		for(const auto& [tag, cost] : it->value.tagCostEst) {
+			double rate = cost.getOpsSum() / elapsed; // tags are ranked by op count per elapsed time, not by cost
+			if(rate > maxRate) {
+				busiestTag = tag;
+				maxRate = rate;
+				maxCost = cost;
+			}
 		}
+		if(maxRate > SERVER_KNOBS->MIN_TAG_PAGES_RATE) { // only report a tag whose op rate exceeds the knob
+			it->value.busiestWriteTag = busiestTag;
+			// TraceEvent("RefreshSSCommitCost").detail("TotalWriteCost", it->value.totalWriteCosts).detail("TotalWriteOps",it->value.totalWriteOps);
+			ASSERT(it->value.totalWriteCosts > 0);
+			maxBusyness = double(maxCost.getCostSum()) / it->value.totalWriteCosts; // this tag's share of all write cost recorded for this SS
+			it->value.busiestWriteTagFractionalBusyness = maxBusyness;
+			it->value.busiestWriteTagRate = maxRate;
+		}
+		// traced for every SS; "Reported" tells whether the tag passed the rate threshold above
+		TraceEvent("BusiestWriteTag", it->key)
+			.detail("Elapsed", elapsed)
+			.detail("Tag", printable(busiestTag))
+			.detail("TagOps", maxCost.getOpsSum())
+			.detail("TagCosts", maxCost.getCostSum())
+			.detail("TagRate", maxRate)
+			.detail("TagBusyness", maxBusyness)
+			.detail("Reported", it->value.busiestWriteTag.present())
+			.trackLatest(it->key.toString() + "/BusiestWriteTag");
+
+		// reset statistics so the next call only sees costs accumulated after this point
+		it->value.tagCostEst.clear();
+		it->value.totalWriteOps = 0;
+		it->value.totalWriteCosts = 0;
+	}
+	self->lastBusiestCommitTagPick = now();
+	return Void();
+}
+
+// Auto-throttles `tag` when its busyness exceeds AUTO_THROTTLE_TARGET_TAG_BUSYNESS and its rate exceeds
+// MIN_TAG_COST. If the throttle collection returns a client rate, an AUTO throttle at DEFAULT priority
+// carrying `reason` is submitted through ThrottleApi::throttleTags.
+void tryAutoThrottleTag(RatekeeperData* self, TransactionTag tag, double rate, double busyness, TagThrottledReason reason) {
+	const bool overBusynessTarget = busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS;
+	if (!(overBusynessTarget && rate > SERVER_KNOBS->MIN_TAG_COST)) {
+		return;
+	}
+
+	TEST(true); // Transaction tag auto-throttled
+	Optional<double> throttleRate = self->throttledTags.autoThrottleTag(self->id, tag, busyness);
+	if (!throttleRate.present()) {
+		// no client rate was produced, so there is nothing to submit
+		return;
+	}
+
+	TagSet singleTag;
+	singleTag.addTag(tag);
+
+	self->addActor.send(ThrottleApi::throttleTags(
+	    self->db, singleTag, throttleRate.get(), SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION, TagThrottleType::AUTO,
+	    TransactionPriority::DEFAULT, now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION, reason));
+}
+
+void tryAutoThrottleTag(RatekeeperData* self, StorageQueueInfo& ss, int64_t storageQueue, int64_t storageDurabilityLag) { // Decides from one storage server's queue size and durability lag whether to try auto-throttling its busiest read tag.
+// TODO: reasonable criteria for write saturation should be investigated experimentally; write-tag throttling stays disabled until then
+//	if (ss.busiestWriteTag.present() && storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES &&
+//	    storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS) {
+//		// write-saturated
+//		tryAutoThrottleTag(self, ss.busiestWriteTag.get(), ss.busiestWriteTagRate, ss.busiestWriteTagFractionalBusyness, TagThrottledReason::BUSY_WRITE);
+//	} else
+    if (ss.busiestReadTag.present() &&
+	           (storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES ||
+	           storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS)) {
+		// read saturated: the queue or the durability lag is over its auto-throttle threshold and a busiest read tag is known
+		tryAutoThrottleTag(self, ss.busiestReadTag.get(), ss.busiestReadTagRate, ss.busiestReadTagFractionalBusyness,
+		                   TagThrottledReason::BUSY_READ);
 	}
 }

@ -921,8 +1004,8 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {

 		double targetRateRatio = std::min(( storageQueue - targetBytes + springBytes ) / (double)springBytes, 2.0);

-		if(limits->priority == TransactionPriority::DEFAULT && (storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES || storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS)) {
-			tryAutoThrottleTag(self, ss);
+		if(limits->priority == TransactionPriority::DEFAULT) {
+			tryAutoThrottleTag(self, ss, storageQueue, storageDurabilityLag);
 		}

 		double inputRate = ss.smoothInputBytes.smoothRate();
@ -994,6 +1077,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
 		break;
 	}

+	// Calculate limited durability lag
 	int64_t limitingDurabilityLag = 0;

 	std::set<Optional<Standalone<StringRef>>> ignoredDurabilityLagMachines;
@ -1197,11 +1281,26 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
 			.detail("WorstStorageServerDurabilityLag", worstDurabilityLag)
 			.detail("LimitingStorageServerDurabilityLag", limitingDurabilityLag)
 			.detail("TagsAutoThrottled", self->throttledTags.autoThrottleCount())
+			.detail("TagsAutoThrottledBusyRead", self->throttledTags.busyReadTagCount)
+			.detail("TagsAutoThrottledBusyWrite", self->throttledTags.busyWriteTagCount)
 			.detail("TagsManuallyThrottled", self->throttledTags.manualThrottleCount())
+			.detail("AutoThrottlingEnabled", self->autoThrottlingEnabled)
 			.trackLatest(name);
 	}
 }

+// Folds the per-storage-server, per-tag commit costs in `costEstimation` into the matching
+// StorageQueueInfo entries. Storage servers without an entry in `costEstimation` are left untouched,
+// and entries for servers that are not in storageQueueInfo are ignored.
+static void updateCommitCostEstimation(RatekeeperData* self, UIDTransactionTagMap<TransactionCommitCostEstimation> const& costEstimation) {
+	for(auto ssIt = self->storageQueueInfo.begin(); ssIt != self->storageQueueInfo.end(); ++ssIt) {
+		auto reported = costEstimation.find(ssIt->key);
+		if(reported != costEstimation.end()) {
+			for(const auto& tagAndCost : reported->second) {
+				const auto& tagCost = tagAndCost.second;
+				// accumulate both the per-tag estimate and the per-server totals
+				ssIt->value.tagCostEst[tagAndCost.first] += tagCost;
+				ssIt->value.totalWriteCosts += tagCost.getCostSum();
+				ssIt->value.totalWriteOps += tagCost.getOpsSum();
+			}
+		}
+	}
+}
+
 ACTOR Future<Void> configurationMonitor(RatekeeperData *self) {
 	loop {
 		state ReadYourWritesTransaction tr(self->db);
@ -1244,6 +1343,8 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
 	self.addActor.send( traceRole(Role::RATEKEEPER, rkInterf.id()) );

 	self.addActor.send(monitorThrottlingChanges(&self));
+	RatekeeperData* selfPtr = &self; // let flow compiler capture self
+	self.addActor.send(recurring([selfPtr](){refreshStorageServerCommitCost(selfPtr);}, SERVER_KNOBS->TAG_MEASUREMENT_INTERVAL));

 	TraceEvent("RkTLogQueueSizeParameters", rkInterf.id()).detail("Target", SERVER_KNOBS->TARGET_BYTES_PER_TLOG).detail("Spring", SERVER_KNOBS->SPRING_BYTES_TLOG)
 		.detail("Rate", (SERVER_KNOBS->TARGET_BYTES_PER_TLOG - SERVER_KNOBS->SPRING_BYTES_TLOG) / ((((double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) / SERVER_KNOBS->VERSIONS_PER_SECOND) + 2.0));
@ -1286,10 +1387,6 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
 					for(auto tag : req.throttledTagCounts) {
 						self.throttledTags.addRequests(tag.first, tag.second);
 					}
-					// TODO process commitCostEstimation
-					//	for (const auto &[tagName, cost] : req.throttledTagCommitCostEst) {
-					//
-					//	}
 				}
 				if(p.batchTransactions > 0) {
 					self.smoothBatchReleasedTransactions.addDelta( req.batchReleasedTransactions - p.batchTransactions );
@ -1303,11 +1400,13 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
 				reply.batchTransactionRate = self.batchLimits.tpsLimit / self.proxyInfo.size();
 				reply.leaseDuration = SERVER_KNOBS->METRIC_UPDATE_RATE;

+				updateCommitCostEstimation(&self, req.ssTrTagCommitCost);
+
 				if(p.lastThrottledTagChangeId != self.throttledTagChangeId || now() < p.lastTagPushTime + SERVER_KNOBS->TAG_THROTTLE_PUSH_INTERVAL) {
 					p.lastThrottledTagChangeId = self.throttledTagChangeId;
 					p.lastTagPushTime = now();

-					reply.throttledTags = self.throttledTags.getClientRates();
+					reply.throttledTags = self.throttledTags.getClientRates(self.autoThrottlingEnabled);
 					TEST(reply.throttledTags.present() && reply.throttledTags.get().size() > 0); // Returning tag throttles to a proxy
 				}

--- a/fdbserver/RatekeeperInterface.h
+++ b/fdbserver/RatekeeperInterface.h
@ -76,31 +76,35 @@ struct ClientTagThrottleLimits {
 };

 struct TransactionCommitCostEstimation {
-	int numWrite = 0;
-	int numAtomicWrite = 0;
-	int numClear = 0;
-	int numClearShards = 0;
-	uint64_t bytesWrite = 0;
-	uint64_t bytesAtomicWrite = 0;
-	uint64_t bytesClearEst = 0;
+	// Aggregated commit cost estimate: one running operation count and one running cost total.
+	// Both start at zero and are combined member-wise by operator+=.
+	int opsSum = 0; // number of operations accumulated so far
+	uint64_t costSum = 0; // cost accumulated so far (units are not defined here)
+
+	uint64_t getCostSum() const { return costSum; }
+	int getOpsSum() const { return opsSum; }

 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, bytesWrite, bytesClearEst, bytesAtomicWrite, numWrite, numAtomicWrite, numClear, numClearShards);
+		serializer(ar, opsSum, costSum);
 	}

 	TransactionCommitCostEstimation& operator+=(const TransactionCommitCostEstimation& other) {
-		numWrite += other.numWrite;
-		numAtomicWrite += other.numAtomicWrite;
-		numClear += other.numClear;
-		bytesWrite += other.bytesWrite;
-		bytesAtomicWrite += other.numAtomicWrite;
-		numClearShards += other.numClearShards;
-		bytesClearEst += other.bytesClearEst;
+		opsSum += other.opsSum;
+		costSum += other.costSum;
 		return *this;
 	}
 };

+// Serializable record of commit cost estimation counters.
+// NOTE(review): the meaning of each field is only suggested by its name (operation count,
+// write cost total, per-index clear costs, count of expensive estimates) — confirm against
+// the code that fills this struct in.
+struct ClientTrCommitCostEstimation {
+	int opsCount{ 0 };
+	uint64_t writeCosts{ 0 };
+	std::deque<std::pair<int, uint64_t>> clearIdxCosts;
+	uint32_t expensiveCostEstCount{ 0 };
+
+	// Serializes every field, in declaration order.
+	template <class Archive>
+	void serialize(Archive& ar) {
+		serializer(ar, opsCount, writeCosts, clearIdxCosts, expensiveCostEstCount);
+	}
+};
+
 struct GetRateInfoReply {
 	constexpr static FileIdentifier file_identifier = 7845006;
 	double transactionRate;
@ -123,21 +127,21 @@ struct GetRateInfoRequest {
 	int64_t batchReleasedTransactions;

 	TransactionTagMap<uint64_t> throttledTagCounts;
-	TransactionTagMap<TransactionCommitCostEstimation> throttledTagCommitCostEst;
+	UIDTransactionTagMap<TransactionCommitCostEstimation> ssTrTagCommitCost;
 	bool detailed;
 	ReplyPromise<struct GetRateInfoReply> reply;

 	GetRateInfoRequest() {}
 	GetRateInfoRequest(UID const& requesterID, int64_t totalReleasedTransactions, int64_t batchReleasedTransactions,
 	                   TransactionTagMap<uint64_t> throttledTagCounts,
-	                   TransactionTagMap<TransactionCommitCostEstimation> throttledTagCommitCostEst, bool detailed)
+	                   UIDTransactionTagMap<TransactionCommitCostEstimation> ssTrTagCommitCost, bool detailed)
 	  : requesterID(requesterID), totalReleasedTransactions(totalReleasedTransactions),
 	    batchReleasedTransactions(batchReleasedTransactions), throttledTagCounts(throttledTagCounts),
-	    throttledTagCommitCostEst(throttledTagCommitCostEst), detailed(detailed) {}
+	    ssTrTagCommitCost(ssTrTagCommitCost), detailed(detailed) {}

 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, requesterID, totalReleasedTransactions, batchReleasedTransactions, throttledTagCounts, detailed, reply, throttledTagCommitCostEst);
+		serializer(ar, requesterID, totalReleasedTransactions, batchReleasedTransactions, throttledTagCounts, detailed, reply, ssTrTagCommitCost);
 	}
 };

--- a/fdbserver/RestoreApplier.actor.cpp
+++ b/fdbserver/RestoreApplier.actor.cpp
@ -185,7 +185,7 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
 	state int retries = 0;
 	state double numOps = 0;
 	wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
-	TraceEvent(delayTime > 5 ? SevWarnAlways : SevInfo, "FastRestoreApplierClearRangeMutationsStart", applierID)
+	TraceEvent(delayTime > 5 ? SevWarnAlways : SevDebug, "FastRestoreApplierClearRangeMutationsStart", applierID)
 	    .detail("BatchIndex", batchIndex)
 	    .detail("Ranges", ranges.size())
 	    .detail("DelayTime", delayTime);
@ -296,7 +296,7 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
 	for (auto& key : incompleteStagingKeys) {
 		if (!fValues[i].get().present()) { // Key not exist in DB
 			// if condition: fValues[i].Valid() && fValues[i].isReady() && !fValues[i].isError() &&
-			TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB", applierID)
+			TraceEvent(SevDebug, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB", applierID)
 			    .suppressFor(5.0)
 			    .detail("BatchIndex", batchIndex)
 			    .detail("Key", key.first)
@ -304,7 +304,7 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
 			    .detail("PendingMutations", key.second->second.pendingMutations.size())
 			    .detail("StagingKeyType", getTypeString(key.second->second.type));
 			for (auto& vm : key.second->second.pendingMutations) {
-				TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
+				TraceEvent(SevDebug, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
 				    .detail("PendingMutationVersion", vm.first.toString())
 				    .detail("PendingMutation", vm.second.toString());
 			}
--- a/fdbserver/RestoreController.actor.cpp
+++ b/fdbserver/RestoreController.actor.cpp
@ -300,7 +300,7 @@ ACTOR static Future<Version> processRestoreRequest(Reference<RestoreControllerDa
 	state std::vector<RestoreFileFR> logFiles;
 	state std::vector<RestoreFileFR> allFiles;
 	state Version minRangeVersion = MAX_VERSION;
-	state ActorCollection actors(false);
+	state Future<Void> error = actorCollection(self->addActor.getFuture());

 	self->initBackupContainer(request.url);

@ -356,7 +356,7 @@ ACTOR static Future<Version> processRestoreRequest(Reference<RestoreControllerDa
 		}
 	}

-	actors.add(monitorFinishedVersion(self, request));
+	self->addActor.send(monitorFinishedVersion(self, request));
 	state std::vector<VersionBatch>::iterator versionBatch = versionBatches.begin();
 	for (; versionBatch != versionBatches.end(); versionBatch++) {
 		while (self->runningVersionBatches.get() >= SERVER_KNOBS->FASTRESTORE_VB_PARALLELISM && !releaseVBOutOfOrder) {
@ -378,7 +378,11 @@ ACTOR static Future<Version> processRestoreRequest(Reference<RestoreControllerDa
 		wait(delay(SERVER_KNOBS->FASTRESTORE_VB_LAUNCH_DELAY));
 	}

-	wait(waitForAll(fBatches));
+	try {
+		wait(waitForAll(fBatches) || error);
+	} catch (Error& e) {
+		TraceEvent(SevError, "FastRestoreControllerDispatchVersionBatchesUnexpectedError").error(e);
+	}

 	TraceEvent("FastRestoreController").detail("RestoreToVersion", request.targetVersion);
 	return request.targetVersion;
--- a/fdbserver/RestoreController.actor.h
+++ b/fdbserver/RestoreController.actor.h
@ -149,6 +149,10 @@ struct RestoreControllerData : RestoreRoleData, public ReferenceCounted<RestoreC

 	std::map<UID, double> rolesHeartBeatTime; // Key: role id; Value: most recent time controller receives heart beat

+	// addActor: add to actorCollection so that when an actor has error, the ActorCollection can catch the error.
+	// addActor is used to create the actorCollection when the RestoreController is created
+	PromiseStream<Future<Void>> addActor;
+
 	void addref() { return ReferenceCounted<RestoreControllerData>::addref(); }
 	void delref() { return ReferenceCounted<RestoreControllerData>::delref(); }

--- a/fdbserver/Status.actor.cpp
+++ b/fdbserver/Status.actor.cpp
@ -1745,11 +1745,14 @@ ACTOR static Future<JsonBuilderObject> workloadStatusFetcher(Reference<AsyncVar<
 		state TraceEventFields ratekeeper = wait( timeoutError(rkWorker.interf.eventLogRequest.getReply( EventLogRequest(LiteralStringRef("RkUpdate") ) ), 1.0) );
 		TraceEventFields batchRatekeeper = wait( timeoutError(rkWorker.interf.eventLogRequest.getReply( EventLogRequest(LiteralStringRef("RkUpdateBatch") ) ), 1.0) );

+		bool autoThrottlingEnabled = ratekeeper.getInt("AutoThrottlingEnabled");
 		double tpsLimit = ratekeeper.getDouble("TPSLimit");
 		double batchTpsLimit = batchRatekeeper.getDouble("TPSLimit");
 		double transPerSec = ratekeeper.getDouble("ReleasedTPS");
 		double batchTransPerSec = ratekeeper.getDouble("ReleasedBatchTPS");
 		int autoThrottledTags = ratekeeper.getInt("TagsAutoThrottled");
+		int autoThrottledTagsBusyRead = ratekeeper.getInt("TagsAutoThrottledBusyRead");
+		int autoThrottledTagsBusyWrite = ratekeeper.getInt("TagsAutoThrottledBusyWrite");
 		int manualThrottledTags = ratekeeper.getInt("TagsManuallyThrottled");
 		int ssCount = ratekeeper.getInt("StorageServers");
 		int tlogCount = ratekeeper.getInt("TLogs");
@ -1779,9 +1782,28 @@ ACTOR static Future<JsonBuilderObject> workloadStatusFetcher(Reference<AsyncVar<
 		(*qos)["batch_released_transactions_per_second"] = batchTransPerSec;

 		JsonBuilderObject throttledTagsObj;
-		JsonBuilderObject autoThrottledTagsObj;
-		autoThrottledTagsObj["count"] = autoThrottledTags;
+		JsonBuilderObject autoThrottledTagsObj, recommendThrottleTagsObj;
+		if(autoThrottlingEnabled) {
+			autoThrottledTagsObj["count"] = autoThrottledTags;
+			autoThrottledTagsObj["busy_read"] = autoThrottledTagsBusyRead;
+			autoThrottledTagsObj["busy_write"] = autoThrottledTagsBusyWrite;
+
+			recommendThrottleTagsObj["count"] = 0;
+			recommendThrottleTagsObj["busy_read"] = 0;
+			recommendThrottleTagsObj["busy_write"] = 0;
+		}
+		else {
+			recommendThrottleTagsObj["count"] = autoThrottledTags;
+			recommendThrottleTagsObj["busy_read"] = autoThrottledTagsBusyRead;
+			recommendThrottleTagsObj["busy_write"] = autoThrottledTagsBusyWrite;
+
+			autoThrottledTagsObj["count"] = 0;
+			autoThrottledTagsObj["busy_read"] = 0;
+			autoThrottledTagsObj["busy_write"] = 0;
+		}
+
 		throttledTagsObj["auto"] = autoThrottledTagsObj;
+		throttledTagsObj["recommend"] = recommendThrottleTagsObj;

 		JsonBuilderObject manualThrottledTagsObj;
 		manualThrottledTagsObj["count"] = manualThrottledTags;
--- a/fdbserver/VersionedBTree.actor.cpp
+++ b/fdbserver/VersionedBTree.actor.cpp
@ -92,7 +92,18 @@ std::string toString(LogicalPageID id) {
 	if (id == invalidLogicalPageID) {
 		return "LogicalPageID{invalid}";
 	}
-	return format("LogicalPageID{%" PRId64 "}", id);
+	return format("LogicalPageID{%u}", id);
+}
+
+// Renders a Version for debug output: the text "invalidVersion" for the invalidVersion
+// sentinel, otherwise '@' followed by the version number.
+std::string toString(Version v) {
+	return (v != invalidVersion) ? format("@%" PRId64, v) : std::string("invalidVersion");
+}
+
+// Spells a boolean as the literal text "true" or "false".
+std::string toString(bool b) {
+	if (b) {
+		return "true";
+	}
+	return "false";
 }

 template <typename T>
@ -136,6 +147,11 @@ std::string toString(const Optional<T>& o) {
 	return "<not present>";
 }

+// Formats a pair as "{first, second}", rendering each member with whichever toString()
+// overload matches its type.
+template <typename First, typename Second>
+std::string toString(const std::pair<First, Second>& o) {
+	const auto firstText = toString(o.first);
+	const auto secondText = toString(o.second);
+	return format("{%s, %s}", firstText.c_str(), secondText.c_str());
+}
+
 // A FIFO queue of T stored as a linked list of pages.
 // Main operations are pop(), pushBack(), pushFront(), and flush().
 //
@ -765,6 +781,8 @@ struct RedwoodMetrics {
 		unsigned int lazyClearRequeueExt;
 		unsigned int lazyClearFree;
 		unsigned int lazyClearFreeExt;
+		unsigned int forceUpdate;
+		unsigned int detachChild;
 		double buildStoredPct;
 		double buildFillPct;
 		unsigned int buildItemCount;
@ -797,6 +815,12 @@ struct RedwoodMetrics {
 	unsigned int btreeLeafPreload;
 	unsigned int btreeLeafPreloadExt;

+	// Total number of page operations: every disk write plus every page read,
+	// whether the read was served from cache or from disk.
+	unsigned int pageOps() const {
+		// Each page read is either a cache hit, a probe hit, or a disk read
+		const unsigned int reads = pagerDiskRead + pagerCacheHit + pagerProbeHit;
+		return pagerDiskWrite + reads;
+	}
+
 	double startTime;

 	Level& level(unsigned int level) {
@ -807,9 +831,9 @@ struct RedwoodMetrics {
 		return levels[level - 1];
 	}

-	// This will populate a trace event and/or a string with Redwood metrics.  The string is a
-	// reasonably well formatted page of information
-	void getFields(TraceEvent* e, std::string* s = nullptr) {
+	// This will populate a trace event and/or a string with Redwood metrics.
+	// The string is a reasonably well formatted page of information
+	void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false) {
 		std::pair<const char*, unsigned int> metrics[] = { { "BTreePreload", btreeLeafPreload },
 			                                               { "BTreePreloadExt", btreeLeafPreloadExt },
 			                                               { "", 0 },
@ -837,21 +861,26 @@ struct RedwoodMetrics {
 			                                               { "PagerRemapCopy", pagerRemapCopy },
 			                                               { "PagerRemapSkip", pagerRemapSkip } };
 		double elapsed = now() - startTime;
-		for (auto& m : metrics) {
-			if (*m.first == '\0') {
-				if (s != nullptr) {
-					*s += "\n";
-				}
-			} else {
-				if (s != nullptr) {
-					*s += format("%-15s %-8u %8u/s  ", m.first, m.second, int(m.second / elapsed));
-				}
-				if (e != nullptr) {
+
+		if (e != nullptr) {
+			for (auto& m : metrics) {
+				char c = m.first[0];
+				if(c != 0 && (!skipZeroes || m.second != 0) ) {
 					e->detail(m.first, m.second);
 				}
 			}
 		}

+		if(s != nullptr) {
+			for (auto& m : metrics) {
+				if (*m.first == '\0') {
+					*s += "\n";
+				} else if(!skipZeroes || m.second != 0) {
+					*s += format("%-15s %-8u %8u/s  ", m.first, m.second, int(m.second / elapsed));
+				}
+			}
+		}
+
 		for (int i = 0; i < btreeLevels; ++i) {
 			auto& level = levels[i];
 			std::pair<const char*, unsigned int> metrics[] = {
@ -869,37 +898,44 @@ struct RedwoodMetrics {
 				{ "LazyClear", level.lazyClearFree },
 				{ "LazyClearExt", level.lazyClearFreeExt },
 				{ "", 0 },
+				{ "ForceUpdate", level.forceUpdate },
+				{ "DetachChild", level.detachChild },
+				{ "", 0 },
 				{ "-BldAvgCount", level.pageBuild ? level.buildItemCount / level.pageBuild : 0 },
 				{ "-BldAvgFillPct", level.pageBuild ? level.buildFillPct / level.pageBuild * 100 : 0 },
 				{ "-BldAvgStoredPct", level.pageBuild ? level.buildStoredPct / level.pageBuild * 100 : 0 },
 				{ "", 0 },
 				{ "-ModAvgCount", level.pageModify ? level.modifyItemCount / level.pageModify : 0 },
 				{ "-ModAvgFillPct", level.pageModify ? level.modifyFillPct / level.pageModify * 100 : 0 },
-				{ "-ModAvgStoredPct", level.pageModify ? level.modifyStoredPct / level.pageModify * 100 : 0 }
+				{ "-ModAvgStoredPct", level.pageModify ? level.modifyStoredPct / level.pageModify * 100 : 0 },
+				{ "", 0 },
 			};

+			if(e != nullptr) {
+				for (auto& m : metrics) {
+					char c = m.first[0];
+					if(c != 0 && (!skipZeroes || m.second != 0) ) {
+						e->detail(format("L%d%s", i + 1, m.first + (c == '-' ? 1 : 0)), m.second);
+					}
+				}
+			}
+
 			if (s != nullptr) {
 				*s += format("\nLevel %d\n\t", i + 1);
-			}
-			for (auto& m : metrics) {
-				const char* name = m.first;
-				bool rate = elapsed != 0;
-				if (*name == '-') {
-					++name;
-					rate = false;
-				}

-				if (*name == '\0') {
-					if (s != nullptr) {
+				for (auto& m : metrics) {
+					const char* name = m.first;
+					bool rate = elapsed != 0;
+					if (*name == '-') {
+						++name;
+						rate = false;
+					}
+
+					if (*name == '\0') {
 						*s += "\n\t";
-					}
-				} else {
-					if (s != nullptr) {
+					} else if(!skipZeroes || m.second != 0) {
 						*s += format("%-15s %8u %8u/s  ", name, m.second, rate ? int(m.second / elapsed) : 0);
 					}
-					if (e != nullptr) {
-						e->detail(format("L%d%s", i + 1, name), m.second);
-					}
 				}
 			}
 		}
@ -1124,22 +1160,32 @@ public:
 	};

 	struct RemappedPage {
-		RemappedPage() : version(invalidVersion) {}
-		RemappedPage(Version v, LogicalPageID o, LogicalPageID n) : version(v), originalPageID(o), newPageID(n) {}
+		// Kind of entry; each enumerator's value is the character toString() prints for it.
+		enum Type { NONE = 'N', REMAP = 'R', FREE = 'F', DETACH = 'D' };
+		// All arguments default to their "invalid" sentinels, so RemappedPage() and RemappedPage(v) are both valid.
+		RemappedPage(Version v = invalidVersion, LogicalPageID o = invalidLogicalPageID, LogicalPageID n = invalidLogicalPageID) : version(v), originalPageID(o), newPageID(n) {}

 		Version version;
 		LogicalPageID originalPageID;
 		LogicalPageID newPageID;

-		bool isFree() const {
-			return newPageID == invalidLogicalPageID;
+		// Classifies an entry purely by its newPageID value:
+		// invalidLogicalPageID -> FREE, 0 -> DETACH, anything else -> REMAP.
+		// NONE is never returned here; callers use it to mean "no entry".
+		static Type getTypeOf(LogicalPageID newPageID) {
+			if(newPageID == invalidLogicalPageID) {
+				return FREE;
+			}
+			if(newPageID == 0) {
+				return DETACH;
+			}
+			return REMAP;
+		}
+
+		// Type of this entry, derived from its own newPageID
+		Type getType() const {
+			return getTypeOf(newPageID);
 		}

 		bool operator<(const RemappedPage& rhs) { return version < rhs.version; }

 		std::string toString() const {
-			return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(),
-			              ::toString(newPageID).c_str(), version);
+			return format("RemappedPage(%c: %s -> %s %s}", getType(), ::toString(originalPageID).c_str(),
+			              ::toString(newPageID).c_str(), ::toString(version).c_str());
 		}
 	};

@ -1484,6 +1530,35 @@ public:
 		}
 	}

+	// Detaches pageID from the page it was most recently remapped to, as of version v.
+	// Returns that most recent remap target, or invalidLogicalPageID when pageID has no remap entries.
+	LogicalPageID detachRemappedPage(LogicalPageID pageID, Version v) override {
+		auto remapEntry = remappedPages.find(pageID);
+		if(remapEntry == remappedPages.end()) {
+			// Nothing to detach: the page is not remapped
+			return invalidLogicalPageID;
+		}
+
+		// The highest-version entry holds the ID that pageID was most recently remapped to
+		auto& versionMap = remapEntry->second;
+		auto newest = versionMap.rbegin();
+		const LogicalPageID detachedID = newest->second;
+		ASSERT(RemappedPage::getTypeOf(detachedID) == RemappedPage::REMAP);
+
+		if(newest->first != v) {
+			debug_printf("DWALPager(%s) op=detach originalID=%s newID=%s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(),
+							toString(pageID).c_str(), toString(detachedID).c_str(), v, pLastCommittedHeader->oldestVersion);
+			// Record that, as of v, pageID has been converted to its last remapped location
+			versionMap[v] = 0;
+			remapQueue.pushBack(RemappedPage{ v, pageID, 0 });
+			return detachedID;
+		}
+
+		// The most recent remap was also made at v, so turn that remap into a delete: it is
+		// equivalent to the original page being deleted at v and detachedID being used from then on.
+		debug_printf("DWALPager(%s) op=detachDelete originalID=%s newID=%s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(),
+						toString(pageID).c_str(), toString(detachedID).c_str(), v, pLastCommittedHeader->oldestVersion);
+		newest->second = invalidLogicalPageID;
+		remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID });
+		return detachedID;
+	}
+
 	void freePage(LogicalPageID pageID, Version v) override {
 		// If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone,
 		// so queue it for later deletion
@ -1588,13 +1663,13 @@ public:
 			auto j = i->second.upper_bound(v);
 			if (j != i->second.begin()) {
 				--j;
-				debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(),
+				debug_printf("DWALPager(%s) op=readAtVersionRemapped %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(),
 				             v, toString(j->second).c_str());
 				pageID = j->second;
 				ASSERT(pageID != invalidLogicalPageID);
 			}
 		} else {
-			debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(),
+			debug_printf("DWALPager(%s) op=readAtVersionNotRemapped %s @%" PRId64 " (not remapped)\n", filename.c_str(),
 			             toString(pageID).c_str(), v);
 		}

@ -1623,29 +1698,126 @@ public:
 		return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version);
 	}

-	ACTOR static Future<Void> remapCopyAndFree(DWALPager* self, RemappedPage p, VersionToPageMapT *m, VersionToPageMapT::iterator i) {
-		debug_printf("DWALPager(%s) remapCleanup copyAndFree %s\n", self->filename.c_str(), p.toString().c_str());
+	// Processes one entry p that was popped from the remap queue.  Depending on p's type and on
+	// the next remappedPages entry for the same original page, this copies the remapped page's
+	// contents back to the original page ID, erases the corresponding map entry, and frees the
+	// new page ID and/or the original page ID.  The only wait is the page read done for the copy.
+	ACTOR static Future<Void> removeRemapEntry(DWALPager* self, RemappedPage p, Version oldestRetainedVersion) {
+		// Get iterator to the versioned page map entry for the original page
+		state PageToVersionedMapT::iterator iPageMapPair = self->remappedPages.find(p.originalPageID);
+		// The iterator must be valid, its map must not be empty, and the map must contain an entry for p's version
+		ASSERT(iPageMapPair != self->remappedPages.end());
+		ASSERT(!iPageMapPair->second.empty());
+		state VersionToPageMapT::iterator iVersionPagePair = iPageMapPair->second.find(p.version);
+		ASSERT(iVersionPagePair != iPageMapPair->second.end());

-		// Read the data from the page that the original was mapped to
-		Reference<IPage> data = wait(self->readPage(p.newPageID, false));
+		RemappedPage::Type firstType = p.getType();
+		state RemappedPage::Type secondType;
+		bool secondAfterOldestRetainedVersion = false;
+		state bool deleteAtSameVersion = false;
+		if(p.newPageID == iVersionPagePair->second) {
+			auto nextEntry = iVersionPagePair;
+			++nextEntry;
+			if(nextEntry == iPageMapPair->second.end()) {
+				secondType = RemappedPage::NONE;
+			} else {
+				secondType = RemappedPage::getTypeOf(nextEntry->second);
+				secondAfterOldestRetainedVersion = nextEntry->first >= oldestRetainedVersion;
+			}
+		} else {
+			ASSERT(iVersionPagePair->second == invalidLogicalPageID);
+ 			secondType = RemappedPage::FREE;
+			deleteAtSameVersion = true;
+		}
+		ASSERT(firstType == RemappedPage::REMAP || secondType == RemappedPage::NONE);

-		// Write the data to the original page so it can be read using its original pageID
-		self->updatePage(p.originalPageID, data);
-		++g_redwoodMetrics.pagerRemapCopy;
+		// Scenarios and actions to take:
+		//
+		// The first letter (firstType) is the type of the entry just popped from the remap queue.
+		// The second letter (secondType) is the type of the next item in the queue for the same
+		// original page ID, if present.  If not present, secondType will be NONE.
+		//
+		// Since the next item can be arbitrarily ahead in the queue, secondType is determined by
+		// looking at the remappedPages structure.
+		//
+		// R == Remap    F == Free   D == Detach   | == oldestRetainedVersion
+		//
+		//   R R |  free new ID
+		//   R F |  free new ID if R and F are at different versions
+		//   R D |  do nothing
+		//   R | R  copy new to original ID, free new ID
+		//   R | F  copy new to original ID, free new ID
+		//   R | D  copy new to original ID
+		//   R |    copy new to original ID, free new ID
+		//   F |    free original ID
+		//   D |    free original ID
+		//
+		// Note that remappedPages can gain entries while the read of the new page ID below is pending.
+		//
+		// Special case:  Page is detached while it is being read in removeRemapEntry()
+		//   Initial state:  R |
+		//   Start removeRemapEntry(), intending to copy newID to originalID and free newID
+		//   New state:  R | D
+		//   Read of newID completes.
+		//   Copy new contents over original, do NOT free new ID
+		//   Later popped state:  D |
+		//   free original ID
+		//
+		state bool freeNewID = (firstType == RemappedPage::REMAP && secondType != RemappedPage::DETACH && !deleteAtSameVersion);
+		state bool copyNewToOriginal = (firstType == RemappedPage::REMAP && (secondAfterOldestRetainedVersion || secondType == RemappedPage::NONE));
+		state bool freeOriginalID = (firstType == RemappedPage::FREE || firstType == RemappedPage::DETACH);

-		// Now that the page data has been copied to the original page, the versioned page map entry is no longer
-		// needed and the new page ID can be freed as of the next commit.
-		m->erase(i);
-		self->freeUnmappedPage(p.newPageID, 0);
-		++g_redwoodMetrics.pagerRemapFree;
+		debug_printf("DWALPager(%s) remapCleanup %s secondType=%c mapEntry=%s oldestRetainedVersion=%" PRId64 " \n",
+			self->filename.c_str(), p.toString().c_str(), secondType, ::toString(*iVersionPagePair).c_str(), oldestRetainedVersion);
+
+		if(copyNewToOriginal) {
+			debug_printf("DWALPager(%s) remapCleanup copy %s\n", self->filename.c_str(), p.toString().c_str());
+
+			// Read the data from the page that the original was mapped to
+			Reference<IPage> data = wait(self->readPage(p.newPageID, false, true));
+
+			// Write the data to the original page so it can be read using its original pageID
+			self->updatePage(p.originalPageID, data);
+			++g_redwoodMetrics.pagerRemapCopy;
+		} else if (firstType == RemappedPage::REMAP) {
+			++g_redwoodMetrics.pagerRemapSkip;
+		}
+
+		// Now that the page contents have been copied to the original page, if the corresponding map entry
+		// represented the remap and there wasn't a delete later in the queue at p for the same version then
+		// erase the entry.
+		if(!deleteAtSameVersion) {
+			debug_printf("DWALPager(%s) remapCleanup deleting map entry %s\n", self->filename.c_str(), p.toString().c_str());
+			// Erase the entry and set iVersionPagePair to the next entry or end
+			iVersionPagePair = iPageMapPair->second.erase(iVersionPagePair);
+
+			// If the map is now empty, delete it
+			if(iPageMapPair->second.empty()) {
+				debug_printf("DWALPager(%s) remapCleanup deleting empty map %s\n", self->filename.c_str(), p.toString().c_str());
+				self->remappedPages.erase(iPageMapPair);
+			} else if(freeNewID && secondType == RemappedPage::NONE && iVersionPagePair != iPageMapPair->second.end() && RemappedPage::getTypeOf(iVersionPagePair->second) == RemappedPage::DETACH) {
+				// If we intend to free the new ID and there was no map entry, one could have been added during the wait above.
+				// If so, and if it was a detach operation, then we can't free the new page ID as its lifetime will be managed
+				// by the client starting at some later version.
+				freeNewID = false;
+			}
+		}
+
+		if(freeNewID) {
+			debug_printf("DWALPager(%s) remapCleanup freeNew %s\n", self->filename.c_str(), p.toString().c_str());
+			self->freeUnmappedPage(p.newPageID, 0);
+			++g_redwoodMetrics.pagerRemapFree;
+		}
+
+		if(freeOriginalID) {
+			debug_printf("DWALPager(%s) remapCleanup freeOriginal %s\n", self->filename.c_str(), p.toString().c_str());
+			self->freeUnmappedPage(p.originalPageID, 0);
+			++g_redwoodMetrics.pagerRemapFree;
+		}

 		return Void();
 	}

 	ACTOR static Future<Void> remapCleanup(DWALPager* self) {
-		state ActorCollection copies(true);
+		state ActorCollection tasks(true);
 		state Promise<Void> signal;
-		copies.add(signal.getFuture());
+		tasks.add(signal.getFuture());

 		self->remapCleanupStop = false;

@ -1654,8 +1826,7 @@ public:
 		state Version oldestRetainedVersion = self->effectiveOldestVersion();

 		// Cutoff is the version we can pop to
-		state RemappedPage cutoff;
-		cutoff.version = oldestRetainedVersion - self->remapCleanupWindow;
+		state RemappedPage cutoff(oldestRetainedVersion - self->remapCleanupWindow);

 		// Minimum version we must pop to before obeying stop command.
 		state Version minStopVersion = cutoff.version - (self->remapCleanupWindow * SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_LAG);
@ -1663,46 +1834,15 @@ public:
 		loop {
 			state Optional<RemappedPage> p = wait(self->remapQueue.pop(cutoff));
 			debug_printf("DWALPager(%s) remapCleanup popped %s\n", self->filename.c_str(), ::toString(p).c_str());
+
+			// Stop if we have reached the cutoff version, which is the start of the cleanup coalescing window
 			if (!p.present()) {
 				break;
 			}

-			// Get iterator to the versioned page map entry for the original page
-			auto iPageMapPair = self->remappedPages.find(p.get().originalPageID);
-			// The iterator must be valid and not empty and its first page map entry must match p's version
-			ASSERT(iPageMapPair != self->remappedPages.end());
-			ASSERT(!iPageMapPair->second.empty());
-			auto iVersionPagePair = iPageMapPair->second.begin();
-			ASSERT(iVersionPagePair->first == p.get().version);
-
-			// If this is a free page entry then free the original page ID
-			if(p.get().isFree()) {
-				debug_printf("DWALPager(%s) remapCleanup free %s\n", self->filename.c_str(),
-					p.get().toString().c_str());
-				self->freeUnmappedPage(p.get().originalPageID, 0);
-				++g_redwoodMetrics.pagerRemapFree;
-
-				// There can't be any more entries in the page map after this one so verify that
-				// the map size is 1 and erase the map for p's original page ID.
-				ASSERT(iPageMapPair->second.size() == 1);
-				self->remappedPages.erase(iPageMapPair);
-			}
-			else {
-				// If there is no next page map entry or there is but it is after the oldest retained version
-				// then p must be copied to unmap it.
-				auto iNextVersionPagePair = iVersionPagePair;
-				++iNextVersionPagePair;
-				if(iNextVersionPagePair == iPageMapPair->second.end() || iNextVersionPagePair->first > oldestRetainedVersion) {
-					// Copy the remapped page to the original so it can be freed.
-					copies.add(remapCopyAndFree(self, p.get(), &iPageMapPair->second, iVersionPagePair));
-				}
-				else {
-					debug_printf("DWALPager(%s) remapCleanup skipAndFree %s\n", self->filename.c_str(), p.get().toString().c_str());
-					self->freeUnmappedPage(p.get().newPageID, 0);
-					++g_redwoodMetrics.pagerRemapFree;
-					++g_redwoodMetrics.pagerRemapSkip;
-					iPageMapPair->second.erase(iVersionPagePair);
-				}
+			Future<Void> task = removeRemapEntry(self, p.get(), oldestRetainedVersion);
+			if(!task.isReady()) {
+				tasks.add(task);
 			}

 			// If the stop flag is set and we've reached the minimum stop version according the the allowed lag then stop.
@ -1713,7 +1853,7 @@ public:

 		debug_printf("DWALPager(%s) remapCleanup stopped (stop=%d)\n", self->filename.c_str(), self->remapCleanupStop);
 		signal.send(Void());
-		wait(copies.getResult());
+		wait(tasks.getResult());
 		return Void();
 	}

@ -1889,8 +2029,7 @@ public:
 	Future<int64_t> getUserPageCount() override {
 		return map(getUserPageCount_cleanup(this), [=](Void) {
 			int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries -
-			                    delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages
-								- remapQueue.numEntries;
+			                    delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages;

 			debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64
 			             " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64
@ -2871,6 +3010,38 @@ public:

 	typedef FIFOQueue<LazyClearQueueEntry> LazyClearQueueT;

+	// Small approximate record of which child pages have been updated, kept as a 32-bit filter
+	// indexed by the low 5 bits of each child's page ID plus a count of distinct filter bits set.
+	// NOTE(review): presumably one instance exists per parent page (see ParentInfoMapT) — confirm.
+	struct ParentInfo {
+		ParentInfo() : bits(0), count(0) {}
+
+		// Forgets every recorded update
+		void clear() {
+			count = 0;
+			bits = 0;
+		}
+
+		// Filter bit for a page ID.  The shift is performed on an unsigned 32-bit value so that
+		// selecting bit 31 never shifts into the sign bit of a signed int.
+		static uint32_t mask(LogicalPageID id) {
+			return static_cast<uint32_t>(1) << (id & 31);
+		}
+
+		// Records that child was updated.  count only grows when a previously clear filter bit
+		// is set, so IDs sharing the same low 5 bits are counted once.
+		void pageUpdated(LogicalPageID child) {
+			const uint32_t m = mask(child);
+			if((bits & m) == 0) {
+				bits |= m;
+				++count;
+			}
+		}
+
+		// Returns false only if child was definitely not recorded.  A true result can also be
+		// caused by a different ID that maps to the same filter bit.
+		bool maybeUpdated(LogicalPageID child) const {
+			return (mask(child) & bits) != 0;
+		}
+
+		uint32_t bits;
+		int count;
+	};
+
+	typedef std::unordered_map<LogicalPageID, ParentInfo> ParentInfoMapT;
+
 #pragma pack(push, 1)
 	struct MetaKey {
 		static constexpr int FORMAT_VERSION = 8;
@ -2923,8 +3094,8 @@ public:
 	// durable once the following call to commit() returns
 	void set(KeyValueRef keyValue) override {
 		++g_redwoodMetrics.opSet;
-		++g_redwoodMetrics.opSetKeyBytes += keyValue.key.size();
-		++g_redwoodMetrics.opSetValueBytes += keyValue.value.size();
+		// The byte counters grow by exactly the key and value sizes; only opSet counts calls
+		g_redwoodMetrics.opSetKeyBytes += keyValue.key.size();
+		g_redwoodMetrics.opSetValueBytes += keyValue.value.size();
 		m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value));
 	}

@ -3022,7 +3193,7 @@ public:
 						// If this page is height 2, then the children are leaves so free them directly
 						if (btPage.height == 2) {
 							debug_printf("LazyClear: freeing child %s\n", toString(btChildPageID).c_str());
-							self->freeBtreePage(btChildPageID, v);
+							self->freeBTreePage(btChildPageID, v);
 							freedPages += btChildPageID.size();
 							metrics.lazyClearFree += 1;
 							metrics.lazyClearFreeExt += (btChildPageID.size() - 1);
@ -3041,7 +3212,7 @@ public:

 				// Free the page, now that its children have either been freed or queued
 				debug_printf("LazyClear: freeing queue entry %s\n", toString(entry.pageID).c_str());
-				self->freeBtreePage(entry.pageID, v);
+				self->freeBTreePage(entry.pageID, v);
 				freedPages += entry.pageID.size();
 				metrics.lazyClearFree += 1;
 				metrics.lazyClearFreeExt += entry.pageID.size() - 1;
@ -3146,7 +3317,7 @@ public:
 		return commit_impl(this);
 	}

-	ACTOR static Future<Void> destroyAndCheckSanity_impl(VersionedBTree* self) {
+	ACTOR static Future<Void> clearAllAndCheckSanity_impl(VersionedBTree* self) {
 		ASSERT(g_network->isSimulated());

 		debug_printf("Clearing tree.\n");
@ -3191,7 +3362,7 @@ public:
 		return Void();
 	}

-	Future<Void> destroyAndCheckSanity() { return destroyAndCheckSanity_impl(this); }
+	Future<Void> clearAllAndCheckSanity() { return clearAllAndCheckSanity_impl(this); }

 private:
 	// Represents a change to a single key - set, clear, or atomic op
@ -3412,6 +3583,8 @@ private:
 	Future<Void> m_init;
 	std::string m_name;
 	int m_blockSize;
+	std::unordered_map<LogicalPageID, ParentInfo> parents;
+	ParentInfoMapT childUpdateTracker;

 	// MetaKey changes size so allocate space for it to expand into
 	union {
@ -3603,7 +3776,7 @@ private:
 				// must be rewritten anyway to count for the change in child count or child links.
 				// Free the old IDs, but only once (before the first output record is added).
 				if (records.empty()) {
-					self->freeBtreePage(previousID, v);
+					self->freeBTreePage(previousID, v);
 				}
 				for (p = 0; p < pages.size(); ++p) {
 					LogicalPageID id = wait(self->m_pager->newPageID());
@ -3771,7 +3944,7 @@ private:
 		}
 	}

-	void freeBtreePage(BTreePageIDRef btPageID, Version v) {
+	void freeBTreePage(BTreePageIDRef btPageID, Version v) {
 		// Free individual pages at v
 		for (LogicalPageID id : btPageID) {
 			m_pager->freePage(id, v);
@ -3780,7 +3953,7 @@ private:

 	// Write new version of pageID at version v using page as its data.
 	// Attempts to reuse original id(s) in btPageID, returns BTreePageID.
-	ACTOR static Future<BTreePageIDRef> updateBtreePage(VersionedBTree* self, BTreePageIDRef oldID, Arena* arena,
+	ACTOR static Future<BTreePageIDRef> updateBTreePage(VersionedBTree* self, BTreePageIDRef oldID, Arena* arena,
 	                                                    Reference<IPage> page, Version writeVersion) {
 		state BTreePageIDRef newID;
 		newID.resize(*arena, oldID.size());
@ -3878,19 +4051,23 @@ private:
 		// If the last record in the range has a null link then this will be null.
 		const RedwoodRecordRef* expectedUpperBound;

+		bool inPlaceUpdate;
+
 		// CommitSubtree will call one of the following three functions based on its exit path

 		// Subtree was cleared.
+		// Marks the update as not in-place, flags the children as changed and
+		// resets the expected upper bound to null.
 		void cleared() {
+			inPlaceUpdate = false;
 			childrenChanged = true;
 			expectedUpperBound = nullptr;
 		}

 		// Page was updated in-place through edits and written to maybeNewID
 		void updatedInPlace(BTreePageIDRef maybeNewID, BTreePage* btPage, int capacity) {
+			inPlaceUpdate = true;
 			auto& metrics = g_redwoodMetrics.level(btPage->height);
 			metrics.pageModify += 1;
-			metrics.pageModify += (maybeNewID.size() - 1);
+			metrics.pageModifyExt += (maybeNewID.size() - 1);
 			metrics.modifyFillPct += (double)btPage->size() / capacity;
 			metrics.modifyStoredPct += (double)btPage->kvBytes / capacity;
 			metrics.modifyItemCount += btPage->tree().numItems;
@ -3912,6 +4089,7 @@ private:

 		// writePages() was used to build 1 or more replacement pages.
 		void rebuilt(Standalone<VectorRef<RedwoodRecordRef>> newRecords) {
+			inPlaceUpdate = false;
 			newLinks = newRecords;
 			childrenChanged = true;

@ -3952,14 +4130,15 @@ private:

 	struct InternalPageModifier {
 		InternalPageModifier() {}
-		InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Mirror* m, bool updating)
-		  : btPage(p), m(m), updating(updating), changesMade(false) {}
+		InternalPageModifier(BTreePage* p, BTreePage::BinaryTree::Mirror* m, bool updating, ParentInfo *parentInfo)
+		  : btPage(p), m(m), updating(updating), changesMade(false), parentInfo(parentInfo) {}

 		bool updating;
 		BTreePage* btPage;
 		BTreePage::BinaryTree::Mirror* m;
 		Standalone<VectorRef<RedwoodRecordRef>> rebuild;
 		bool changesMade;
+		ParentInfo *parentInfo;

 		bool empty() const {
 			if (updating) {
@ -4055,6 +4234,13 @@ private:
 				// endpoint.
 				changesMade = true;
 			} else {
+
+				if(u.inPlaceUpdate) {
+					for(auto id : u.decodeLowerBound->getChildPage()) {
+						parentInfo->pageUpdated(id);
+					}
+				}
+
 				keep(u.cBegin, u.cEnd);
 			}

@ -4226,7 +4412,7 @@ private:
 							debug_printf("%s Inserted %s [mutation, boundary start]\n", context.c_str(),
 							             rec.toString().c_str());
 						} else {
-							debug_printf("%s Inserted failed for %s [mutation, boundary start]\n", context.c_str(),
+							debug_printf("%s Insert failed for %s [mutation, boundary start]\n", context.c_str(),
 							             rec.toString().c_str());
 							switchToLinearMerge();
 						}
@ -4339,12 +4525,12 @@ private:
 				// If the tree is now empty, delete the page
 				if (deltaTree.numItems == 0) {
 					update->cleared();
-					self->freeBtreePage(rootID, writeVersion);
+					self->freeBTreePage(rootID, writeVersion);
 					debug_printf("%s Page updates cleared all entries, returning %s\n", context.c_str(),
 					             toString(*update).c_str());
 				} else {
 					// Otherwise update it.
-					BTreePageIDRef newID = wait(self->updateBtreePage(self, rootID, &update->newLinks.arena(),
+					BTreePageIDRef newID = wait(self->updateBTreePage(self, rootID, &update->newLinks.arena(),
 					                                                  page.castTo<IPage>(), writeVersion));

 					update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize);
@ -4357,7 +4543,7 @@ private:
 			// If everything in the page was deleted then this page should be deleted as of the new version
 			if (merged.empty()) {
 				update->cleared();
-				self->freeBtreePage(rootID, writeVersion);
+				self->freeBTreePage(rootID, writeVersion);

 				debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(),
 				             toString(*update).c_str());
@ -4511,7 +4697,7 @@ private:
 									if (btPage->height == 2) {
 										debug_printf("%s: freeing child page in cleared subtree range: %s\n",
 										             context.c_str(), ::toString(rec.getChildPage()).c_str());
-										self->freeBtreePage(rec.getChildPage(), writeVersion);
+										self->freeBTreePage(rec.getChildPage(), writeVersion);
 									} else {
 										debug_printf("%s: queuing subtree deletion cleared subtree range: %s\n",
 										             context.c_str(), ::toString(rec.getChildPage()).c_str());
@ -4547,7 +4733,10 @@ private:
 			wait(waitForAll(recursions));
 			debug_printf("%s Recursions done, processing slice updates.\n", context.c_str());

-			state InternalPageModifier m(btPage, cursor.mirror, tryToUpdate);
+			// Note:  parentInfo could be invalid after a wait and must be re-initialized.
+			// All uses below occur before waits so no reinitialization is done.
+			state ParentInfo *parentInfo = &self->childUpdateTracker[rootID.front()];
+			state InternalPageModifier m(btPage, cursor.mirror, tryToUpdate, parentInfo);

 			// Apply the possible changes for each subtree range recursed to, except the last one.
 			// For each range, the expected next record, if any, is checked against the first boundary
@ -4565,25 +4754,103 @@ private:
 			             context.c_str(), m.changesMade, update->toString().c_str());
 			m.applyUpdate(*slices.back(), m.changesMade ? update->subtreeUpperBound : update->decodeUpperBound);

+			state bool detachChildren = (parentInfo->count > 2);
+			state bool forceUpdate = false;
+
+			if(!m.changesMade && detachChildren) {
+				debug_printf("%s Internal page forced rewrite because at least %d children have been updated in-place.\n", context.c_str(), parentInfo->count);
+				forceUpdate = true;
+				if(!m.updating) {
+					page = self->cloneForUpdate(page);
+					cursor = getCursor(page);
+					btPage = (BTreePage*)page->begin();
+					m.btPage = btPage;
+					m.m = cursor.mirror;
+					m.updating = true;
+				}
+				++g_redwoodMetrics.level(btPage->height).forceUpdate;
+			}
+
 			// If page contents have changed
-			if (m.changesMade) {
-				if ((m.empty())) {
+			if (m.changesMade || forceUpdate) {
+				if (m.empty()) {
 					update->cleared();
 					debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n",
 					             context.c_str(), toString(*update).c_str());
-					self->freeBtreePage(rootID, writeVersion);
+					self->freeBTreePage(rootID, writeVersion);
+					self->childUpdateTracker.erase(rootID.front());
 				} else {
 					if (m.updating) {
-						// Page was updated in place
-						BTreePageIDRef newID = wait(self->updateBtreePage(self, rootID, &update->newLinks.arena(),
+						// Page was updated in place (or being forced to be updated in place to update child page ids)
+						debug_printf("%s Internal page modified in-place tryUpdate=%d forceUpdate=%d detachChildren=%d\n", context.c_str(), tryToUpdate, forceUpdate, detachChildren);
+
+						if(detachChildren) {
+							int detached = 0;
+							cursor.moveFirst();
+							auto &stats = g_redwoodMetrics.level(btPage->height);
+							while(cursor.valid()) {
+								if(cursor.get().value.present()) {
+									for(auto &p : cursor.get().getChildPage()) {
+										if(parentInfo->maybeUpdated(p)) {
+											LogicalPageID newID = self->m_pager->detachRemappedPage(p, writeVersion);
+											if(newID != invalidLogicalPageID) {
+												debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
+												p = newID;
+												++stats.detachChild;
+												++detached;
+											}
+										}
+									}
+								}
+								cursor.moveNext();
+							}
+							parentInfo->clear();
+							if(forceUpdate && detached == 0) {
+								debug_printf("%s No children detached during forced update, returning %s\n", context.c_str(), toString(*update).c_str());
+								return Void();
+							}
+						}
+
+						BTreePageIDRef newID = wait(self->updateBTreePage(self, rootID, &update->newLinks.arena(),
 						                                                  page.castTo<IPage>(), writeVersion));
+						debug_printf(
+							"%s commitSubtree(): Internal page updated in-place at version %s, new contents: %s\n", context.c_str(), toString(writeVersion).c_str(),
+							btPage->toString(false, newID, snapshot->getVersion(), update->decodeLowerBound, update->decodeUpperBound)
+								.c_str());

 						update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize);
 						debug_printf("%s Internal page updated in-place, returning %s\n", context.c_str(),
 						             toString(*update).c_str());
 					} else {
 						// Page was rebuilt, possibly split.
-						debug_printf("%s Internal page modified, creating replacements.\n", context.c_str());
+						debug_printf("%s Internal page could not be modified, rebuilding replacement(s).\n", context.c_str());
+
+						if(detachChildren) {
+							auto &stats = g_redwoodMetrics.level(btPage->height);
+							for(auto &rec : m.rebuild) {
+								if(rec.value.present()) {
+									BTreePageIDRef oldPages = rec.getChildPage();
+									BTreePageIDRef newPages;
+									for(int i = 0; i < oldPages.size(); ++i) {
+										LogicalPageID p = oldPages[i];
+										if(parentInfo->maybeUpdated(p)) {
+											LogicalPageID newID = self->m_pager->detachRemappedPage(p, writeVersion);
+											if(newID != invalidLogicalPageID) {
+												// Rebuild record values reference original page memory so make a copy
+												if(newPages.empty()) {
+													newPages = BTreePageIDRef(m.rebuild.arena(), oldPages);
+													rec.setChildPage(newPages);
+												}
+												debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
+												newPages[i] = newID;
+												++stats.detachChild;
+											}
+										}
+									}
+								}
+							}
+							parentInfo->clear();
+						}

 						Standalone<VectorRef<RedwoodRecordRef>> newChildEntries =
 						    wait(writePages(self, update->subtreeLowerBound, update->subtreeUpperBound, m.rebuild,
@ -4985,7 +5252,7 @@ public:
 		bool isValid() const { return valid; }

 		std::string toString() const {
-			std::string r;
+			std::string r = format("{ptr=%p %s ", this, ::toString(pager->getVersion()).c_str());
 			for (int i = 0; i < path.size(); ++i) {
 				r += format("[%d/%d: %s] ", i + 1, path.size(),
 				            path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage->isLeaf()).c_str()
@ -4994,6 +5261,7 @@ public:
 			if (!valid) {
 				r += " (invalid) ";
 			}
+			r += "}";
 			return r;
 		}

@ -5014,6 +5282,8 @@ public:
 		                      const RedwoodRecordRef& upperBound) {
 			Reference<const IPage>& page = pages[id.front()];
 			if (page.isValid()) {
+				// The pager won't see this access so count it as a cache hit
+				++g_redwoodMetrics.pagerCacheHit;
 				path.push_back(arena, { (BTreePage*)page->begin(), getCursor(page) });
 				return Void();
 			}
@ -6960,24 +7230,23 @@ TEST_CASE("!/redwood/correctness/btree") {
 	state int pageSize =
 	    shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400));

+	state int64_t targetPageOps = shortTest ? 50000 : 1000000;
 	state bool pagerMemoryOnly = shortTest && (deterministicRandom()->random01() < .01);
 	state int maxKeySize = deterministicRandom()->randomInt(1, pageSize * 2);
 	state int maxValueSize = randomSize(pageSize * 25);
 	state int maxCommitSize = shortTest ? 1000 : randomSize(std::min<int>((maxKeySize + maxValueSize) * 20000, 10e6));
-	state int mutationBytesTarget =
-	    shortTest ? 100000 : randomSize(std::min<int>(maxCommitSize * 100, pageSize * 100000));
 	state double clearProbability = deterministicRandom()->random01() * .1;
 	state double clearSingleKeyProbability = deterministicRandom()->random01();
 	state double clearPostSetProbability = deterministicRandom()->random01() * .1;
 	state double coldStartProbability = pagerMemoryOnly ? 0 : (deterministicRandom()->random01() * 0.3);
 	state double advanceOldVersionProbability = deterministicRandom()->random01();
-	state double maxDuration = 60;
 	state int64_t cacheSizeBytes =
 	    pagerMemoryOnly ? 2e9 : (BUGGIFY ? deterministicRandom()->randomInt(1, 10 * pageSize) : 0);
 	state Version versionIncrement = deterministicRandom()->randomInt64(1, 1e8);
 	state Version remapCleanupWindow = deterministicRandom()->randomInt64(0, versionIncrement * 50);

 	printf("\n");
+	printf("targetPageOps: %" PRId64 "\n", targetPageOps);
 	printf("pagerMemoryOnly: %d\n", pagerMemoryOnly);
 	printf("serialTest: %d\n", serialTest);
 	printf("shortTest: %d\n", shortTest);
@ -6985,7 +7254,6 @@ TEST_CASE("!/redwood/correctness/btree") {
 	printf("maxKeySize: %d\n", maxKeySize);
 	printf("maxValueSize: %d\n", maxValueSize);
 	printf("maxCommitSize: %d\n", maxCommitSize);
-	printf("mutationBytesTarget: %d\n", mutationBytesTarget);
 	printf("clearProbability: %f\n", clearProbability);
 	printf("clearSingleKeyProbability: %f\n", clearSingleKeyProbability);
 	printf("clearPostSetProbability: %f\n", clearPostSetProbability);
@ -7000,8 +7268,6 @@ TEST_CASE("!/redwood/correctness/btree") {
 	deleteFile(pagerFile);

 	printf("Initializing...\n");
-	state double startTime = now();
-
 	pager = new DWALPager(pageSize, pagerFile, cacheSizeBytes, remapCleanupWindow, pagerMemoryOnly);
 	state VersionedBTree* btree = new VersionedBTree(pager, pagerFile);
 	wait(btree->init());
@ -7028,14 +7294,12 @@ TEST_CASE("!/redwood/correctness/btree") {
 	state PromiseStream<Version> committedVersions;
 	state Future<Void> verifyTask = verify(btree, committedVersions.getFuture(), &written, &errorCount, serialTest);
 	state Future<Void> randomTask = serialTest ? Void() : (randomReader(btree) || btree->getError());
+	committedVersions.send(lastVer);

 	state Future<Void> commit = Void();
+	state int64_t totalPageOps = 0;

-	while (mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) {
-		if (now() - startTime > 600) {
-			mutationBytesTarget = mutationBytes.get();
-		}
-
+	while (totalPageOps < targetPageOps) {
 		// Sometimes increment the version
 		if (deterministicRandom()->random01() < 0.10) {
 			++version;
@ -7131,14 +7395,12 @@ TEST_CASE("!/redwood/correctness/btree") {
 		}

 		// Commit at end or after this commit's mutation bytes are reached
-		if (mutationBytes.get() >= mutationBytesTarget || mutationBytesThisCommit >= mutationBytesTargetThisCommit) {
+		if (totalPageOps >= targetPageOps || mutationBytesThisCommit >= mutationBytesTargetThisCommit) {
 			// Wait for previous commit to finish
 			wait(commit);
-			printf("Committed.  Next commit %d bytes, %" PRId64
-			       "/%d (%.2f%%)  Stats: Insert %.2f MB/s  ClearedKeys %.2f MB/s  Total %.2f\n",
-			       mutationBytesThisCommit, mutationBytes.get(), mutationBytesTarget,
-			       (double)mutationBytes.get() / mutationBytesTarget * 100,
-			       (keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6, keyBytesCleared.rate() / 1e6,
+			printf("Committed.  Next commit %d bytes, %" PRId64 " bytes.", mutationBytesThisCommit, mutationBytes.get());
+			printf("  Stats:  Insert %.2f MB/s  ClearedKeys %.2f MB/s  Total %.2f\n",
+		          (keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6, keyBytesCleared.rate() / 1e6,
 			       mutationBytes.rate() / 1e6);

 			Version v = version; // Avoid capture of version as a member of *this
@ -7151,8 +7413,12 @@ TEST_CASE("!/redwood/correctness/btree") {
 				                                                                btree->getOldestVersion() + 1));
 			}

-			commit = map(btree->commit(), [=](Void) {
+			commit = map(btree->commit(), [=,&ops=totalPageOps](Void) {
+				// Update pager ops before clearing metrics
+				ops += g_redwoodMetrics.pageOps();
+				printf("PageOps %" PRId64 "/%" PRId64 " (%.2f%%)\n", ops, targetPageOps, ops * 100.0 / targetPageOps);
 				printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str());
+
 				// Notify the background verifier that version is committed and therefore readable
 				committedVersions.send(v);
 				return Void();
@ -7202,6 +7468,7 @@ TEST_CASE("!/redwood/correctness/btree") {
 				committedVersions = PromiseStream<Version>();
 				verifyTask = verify(btree, committedVersions.getFuture(), &written, &errorCount, serialTest);
 				randomTask = randomReader(btree) || btree->getError();
+				committedVersions.send(v);
 			}

 			version += versionIncrement;
@ -7209,7 +7476,7 @@ TEST_CASE("!/redwood/correctness/btree") {
 		}

 		// Check for errors
-		if (errorCount != 0) throw internal_error();
+		ASSERT(errorCount == 0);
 	}

 	debug_printf("Waiting for outstanding commit\n");
@ -7220,11 +7487,18 @@ TEST_CASE("!/redwood/correctness/btree") {
 	wait(verifyTask);

 	// Check for errors
-	if (errorCount != 0) throw internal_error();
+	ASSERT(errorCount == 0);

-	wait(btree->destroyAndCheckSanity());
+	// Reopen pager and btree with a remap cleanup window of 0 to reclaim all old pages
+	state Future<Void> closedFuture = btree->onClosed();
+	btree->close();
+	wait(closedFuture);
+	btree = new VersionedBTree(new DWALPager(pageSize, pagerFile, cacheSizeBytes, 0), pagerFile);
+	wait(btree->init());

-	Future<Void> closedFuture = btree->onClosed();
+	wait(btree->clearAllAndCheckSanity());
+
+	closedFuture = btree->onClosed();
 	btree->close();
 	debug_printf("Closing.\n");
 	wait(closedFuture);
@ -7330,7 +7604,7 @@ TEST_CASE("!/redwood/performance/set") {
 	state int minValueSize = 100;
 	state int maxValueSize = 500;
 	state int minConsecutiveRun = 1;
-	state int maxConsecutiveRun = 10;
+	state int maxConsecutiveRun = 100000;
 	state char firstKeyChar = 'a';
 	state char lastKeyChar = 'm';
 	state Version remapCleanupWindow = SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW;
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@ -477,7 +477,7 @@ public:
 		Optional<TagInfo> previousBusiestTag;

 		int64_t costFunction(int64_t bytes) {
-			return bytes / SERVER_KNOBS->OPERATION_COST_BYTE_FACTOR + 1;
+			// Cost is bytes divided (integer division) by READ_COST_BYTE_FACTOR, plus one.
+			return bytes / SERVER_KNOBS->READ_COST_BYTE_FACTOR + 1;
 		}

 		void addRequest(Optional<TagSet> const& tags, int64_t bytes) {
@ -502,7 +502,7 @@ public:
 			previousBusiestTag.reset();
 			if (intervalStart > 0 && CLIENT_KNOBS->READ_TAG_SAMPLE_RATE > 0 && elapsed > 0) {
 				double rate = busiestTagCount / CLIENT_KNOBS->READ_TAG_SAMPLE_RATE / elapsed;
-				if(rate > SERVER_KNOBS->MIN_TAG_PAGES_READ_RATE) {
+				if(rate > SERVER_KNOBS->MIN_TAG_PAGES_RATE) {
 					previousBusiestTag = TagInfo(busiestTag, rate, (double)busiestTagCount / intervalTotalSampledCount);
 				}

@ -3788,7 +3788,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
 	self->actors.add(traceRole(Role::STORAGE_SERVER, ssi.id()));

 	self->transactionTagCounter.startNewInterval(self->thisServerID);
-	self->actors.add(recurring([&](){ self->transactionTagCounter.startNewInterval(self->thisServerID); }, SERVER_KNOBS->READ_TAG_MEASUREMENT_INTERVAL));
+	self->actors.add(recurring([&](){ self->transactionTagCounter.startNewInterval(self->thisServerID); }, SERVER_KNOBS->TAG_MEASUREMENT_INTERVAL));

 	self->coreStarted.send( Void() );

--- a/flow/Arena.h
+++ b/flow/Arena.h
@ -85,11 +85,12 @@ struct TrackIt {
 class NonCopyable
 {
  protected:
-	NonCopyable () {}
-	~NonCopyable () {} /// Protected non-virtual destructor
-  private:
-	NonCopyable (const NonCopyable &);
-	NonCopyable & operator = (const NonCopyable &);
+	// All special members are protected, so they are reachable only from derived
+	// classes.  Construction, destruction and moves are defaulted; copy
+	// construction and copy assignment are deleted (the removed lines declared
+	// them private instead).
+	NonCopyable()=default;
+	~NonCopyable()=default; /// Protected non-virtual destructor
+	NonCopyable(NonCopyable&&)=default;
+	NonCopyable &operator=(NonCopyable&&)=default;
+	NonCopyable(const NonCopyable&)=delete;
+	NonCopyable &operator=(const NonCopyable &)=delete;
 };

 // An Arena is a custom allocator that consists of a set of ArenaBlocks.  Allocation is performed by bumping a pointer
@ -174,9 +175,7 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted<ArenaBlock>
 	static ArenaBlock* create(int dataSize, Reference<ArenaBlock>& next);
 	void destroy();
 	void destroyLeaf();
-
-private:
-	static void* operator new(size_t s); // not implemented
+	static void* operator new(size_t s)=delete;
 };

 inline void* operator new ( size_t size, Arena& p ) {
--- a/flow/FastAlloc.h
+++ b/flow/FastAlloc.h
@ -118,6 +118,7 @@ public:
 	static volatile int32_t pageCount;
 #endif

+	FastAllocator()=delete;
 private:
 #ifdef VALGRIND
 	static unsigned long vLock;
@ -147,7 +148,6 @@ private:
 	}
 	static void* freelist;

-	FastAllocator();  // not implemented
 	static void initThread();
 	static void getMagazine();
 	static void releaseMagazine(void*);
--- a/flow/IThreadPool.cpp
+++ b/flow/IThreadPool.cpp
@ -71,11 +71,10 @@ class ThreadPool : public IThreadPool, public ReferenceCounted<ThreadPool> {
 		PThreadAction action;
 		ActionWrapper(PThreadAction action) : action(action) {}
 		// HACK: Boost won't use move constructors, so we just assume the last copy made is the one that will be called or cancelled
-		ActionWrapper(ActionWrapper const& r) : action(r.action) { const_cast<ActionWrapper&>(r).action=NULL; }
-		void operator()() { Thread::dispatch(action); action = NULL; }
+		ActionWrapper(ActionWrapper const& r) : action(r.action) { const_cast<ActionWrapper&>(r).action=nullptr; }
+		void operator()() { Thread::dispatch(action); action = nullptr; }
 		~ActionWrapper() { if (action) { action->cancel(); } }
-	private:
-		ActionWrapper &operator=(ActionWrapper const&);
+		ActionWrapper &operator=(ActionWrapper const&)=delete;
 	};
 public:
 	ThreadPool(int stackSize) : dontstop(ios), mode(Run), stackSize(stackSize) {}
--- a/flow/ProtocolVersion.h
+++ b/flow/ProtocolVersion.h
@ -128,7 +128,8 @@ public: // introduced features
 	PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, ReportConflictingKeys);
 	PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, SmallEndpoints);
 	PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, CacheRole);
-	PROTOCOL_VERSION_FEATURE(0x0FDB00B070000000LL, RangeSplit);
+	PROTOCOL_VERSION_FEATURE(0x0FDB00B070010000LL, RangeSplit);
+	PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TagThrottleValueReason);
 };

 // These impact both communications and the deserialization of certain database and IKeyValueStore keys.
--- a/flow/error_definitions.h
+++ b/flow/error_definitions.h
@ -73,6 +73,7 @@ ERROR( connection_idle, 1049, "Connection closed after idle timeout" )
 ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" )
 ERROR( batch_transaction_throttled, 1051, "Batch GRV request rate limit exceeded")
 ERROR( dd_cancelled, 1052, "Data distribution components cancelled")
+ERROR( dd_not_found, 1053, "Data distributor not found")

 ERROR( broken_promise, 1100, "Broken promise" )
 ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" )
--- a/flow/flat_buffers.h
+++ b/flow/flat_buffers.h
@ -36,6 +36,7 @@
 #include <typeinfo>
 #include <typeindex>
 #include <unordered_map>
+#include <deque>
 #include "flow/FileIdentifier.h"
 #include "flow/ObjectSerializerTraits.h"

@ -129,6 +130,33 @@ struct vector_like_traits<std::vector<T, Alloc>> : std::true_type {
 	}
 };

+// vector_like_traits for std::deque: elements are appended through a
+// back_insert_iterator and read back through const iterators.
+template <class T, class Alloc>
+struct vector_like_traits<std::deque<T, Alloc>> : std::true_type {
+	using Deq = std::deque<T, Alloc>;
+	using value_type = typename Deq::value_type;
+	using iterator = typename Deq::const_iterator;
+	using insert_iterator = std::back_insert_iterator<Deq>;
+
+	// Number of elements currently held by v.
+	template <class Context>
+	static size_t num_entries(const Deq& v, Context&) {
+		return v.size();
+	}
+
+	// std::deque has no capacity to reserve, so the requested size is ignored.
+	// The deque is still left empty, as before, but without first constructing
+	// and destroying `size` default elements (which also required T to be
+	// default-constructible).
+	template <class Context>
+	static void reserve(Deq& v, size_t /*size*/, Context&) {
+		v.clear();
+	}
+
+	// Output iterator that appends to the back of v.
+	template <class Context>
+	static insert_iterator insert(Deq& v, Context&) {
+		return std::back_inserter(v);
+	}
+
+	// Iterator to the first element of v.
+	template <class Context>
+	static iterator begin(const Deq& v, Context&) {
+		return v.begin();
+	}
+};
+
 template <class T, size_t N>
 struct vector_like_traits<std::array<T, N>> : std::true_type {
 	using Vec = std::array<T, N>;
--- a/flow/serialize.h
+++ b/flow/serialize.h
@ -31,6 +31,7 @@
 #include "flow/FileIdentifier.h"
 #include "flow/ObjectSerializer.h"
 #include <algorithm>
+#include <deque>

 // Though similar, is_binary_serializable cannot be replaced by std::is_pod, as doing so would prefer
 // memcpy over a defined serialize() method on a POD struct.  As not all of our structs are packed,
@ -183,6 +184,27 @@ inline void load( Archive& ar, std::vector<T>& value ) {
 	ASSERT( ar.protocolVersion().isValid() );
 }

+// Writes a deque as an int element count followed by every element, front to back.
+template <class Archive, class T>
+inline void save( Archive& ar, const std::deque<T>& value ) {
+	ar << (int)value.size();
+	for(const auto& item : value)
+		ar << item;
+	ASSERT( ar.protocolVersion().isValid() );
+}
+
+// Reads a deque in the layout produced by save(): an int element count followed
+// by that many elements.  Any previous contents of value are discarded.
+template <class Archive, class T>
+inline void load( Archive& ar, std::deque<T>& value ) {
+	int s;
+	ar >> s;
+	value.clear();
+	// std::deque has no reserve() member, so elements are simply appended one at
+	// a time and each is deserialized in place at the back.
+	for (int i = 0; i < s; i++) {
+		value.push_back(T());
+		ar >> value.back();
+	}
+	ASSERT( ar.protocolVersion().isValid() );
+}
+
 template <class Archive, class T, size_t N>
 inline void save( Archive& ar, const std::array<T, N>& value ) {
 	for(int ii = 0; ii < N; ++ii)
--- a/tests/restarting/from_5.0.0/StorefrontTestRestart-1.txt
+++ b/tests/restarting/from_5.0.0/StorefrontTestRestart-1.txt
@ -1,11 +1,9 @@
 testTitle=StorefrontTest
 clearAfterTest=false 
-
-    testName=Storefront
-    actorsPerClient=50
-    itemCount=100000
-    maxOrderSize=4
-
-    testName=SaveAndKill
-    restartInfoLocation=simfdb/restartInfo.ini
-    testDuration=10.0
+testName=Storefront
+actorsPerClient=50
+itemCount=100000
+maxOrderSize=4
+testName=SaveAndKill
+restartInfoLocation=simfdb/restartInfo.ini
+testDuration=10.0
--- a/tests/restarting/from_5.0.0/StorefrontTestRestart-2.txt
+++ b/tests/restarting/from_5.0.0/StorefrontTestRestart-2.txt
@ -1,7 +1,6 @@
 testTitle=StorefrontTest
 runSetup=false
-
-    testName=Storefront
-    actorsPerClient=50
-    itemCount=100000
-    maxOrderSize=4
+testName=Storefront
+actorsPerClient=50
+itemCount=100000
+maxOrderSize=4