From f21fcf67ceca82693d8dd2252d458b3326de1d0e Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Tue, 25 Oct 2022 10:59:21 -0700 Subject: [PATCH 01/57] initial commit to allow tenant list filtering in metacluster --- .../sphinx/source/command-line-interface.rst | 4 ++- fdbcli/MetaclusterCommands.actor.cpp | 2 +- fdbcli/TenantCommands.actor.cpp | 23 ++++++++++--- fdbclient/Tenant.cpp | 2 ++ .../fdbclient/MetaclusterManagement.actor.h | 33 +++++++++++++------ 5 files changed, 47 insertions(+), 17 deletions(-) diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index c561379100..a6c60d3f4f 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -475,7 +475,7 @@ Deletes a tenant from the cluster. The tenant must be empty. list ^^^^ -``tenant list [BEGIN] [END] [LIMIT]`` +``tenant list [BEGIN] [END] [LIMIT] [state=,,...]`` Lists the tenants present in the cluster. @@ -485,6 +485,8 @@ Lists the tenants present in the cluster. ``LIMIT`` - the number of tenants to list. Defaults to 100. +``STATE``` - TenantState(s) to filter the list with. Defaults to no filters. 
+ get ^^^ diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index edb25ace2c..652df4645b 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -398,7 +398,7 @@ std::vector metaclusterHintGenerator(std::vector const& }; return std::vector(opts.begin() + std::min(1, tokens.size() - 2), opts.end()); } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { - static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]", "[state=,,...]" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { static std::vector opts = { "", "[JSON]" }; diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index e2be6fac56..73c4e79adb 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -225,11 +225,12 @@ ACTOR Future tenantDeleteCommand(Reference db, std::vector tenantListCommand(Reference db, std::vector tokens) { - if (tokens.size() > 5) { - fmt::print("Usage: tenant list [BEGIN] [END] [LIMIT]\n\n"); + if (tokens.size() > 6) { + fmt::print("Usage: tenant list [BEGIN] [END] [LIMIT] [state=,,...]\n\n"); fmt::print("Lists the tenants in a cluster.\n"); fmt::print("Only tenants in the range BEGIN - END will be printed.\n"); fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); + fmt::print("Optional comma-separated state(s) can be provided to filter the list.\n"); return false; } @@ -243,11 +244,11 @@ ACTOR Future tenantListCommand(Reference db, std::vector= 4) { endTenant = tokens[3]; if (endTenant <= beginTenant) { - fmt::print(stderr, "ERROR: end must be larger than begin"); + fmt::print(stderr, "ERROR: end must be larger than begin\n"); return false; } } - if (tokens.size() == 5) { + if (tokens.size() >= 5) { int n = 0; if (sscanf(tokens[4].toString().c_str(), "%d%n", 
&limit, &n) != 1 || n != tokens[4].size() || limit <= 0) { fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[4].toString().c_str()); @@ -255,6 +256,18 @@ ACTOR Future tenantListCommand(Reference db, std::vector filters; + if (tokens.size() == 6) { // state=ready,registering + if (!tokens[5].startsWith("state="_sr)) { + fmt::print(stderr, "ERROR: state filter must begin with `state='\n"); + return false; + } + auto filterStrings = tokens[5].removePrefix("state="_sr).splitAny(","_sr); + for (auto sref : filterStrings) { + filters.push_back(TenantMapEntry::stringToTenantState(sref.toString())); + } + } + state Key beginTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(beginTenant); state Key endTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(endTenant); state Reference tr = db->createTransaction(); @@ -266,7 +279,7 @@ ACTOR Future tenantListCommand(Reference db, std::vector tenantNames; if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { std::vector> tenants = - wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); + wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit, filters)); for (auto tenant : tenants) { tenantNames.push_back(tenant.first); } diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 2ad1989fd0..e4d27a8be0 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -70,6 +70,7 @@ std::string TenantMapEntry::tenantStateToString(TenantState tenantState) { } TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { + std::transform(stateStr.begin(), stateStr.end(), stateStr.begin(), [](unsigned char c) { return std::tolower(c); }); if (stateStr == "registering") { return TenantState::REGISTERING; } else if (stateStr == "ready") { @@ -103,6 +104,7 @@ std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) } TenantLockState TenantMapEntry::stringToTenantLockState(std::string stateStr) { + std::transform(stateStr.begin(), 
stateStr.end(), stateStr.begin(), [](unsigned char c) { return std::tolower(c); }); if (stateStr == "unlocked") { return TenantLockState::UNLOCKED; } else if (stateStr == "read only") { diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 91a17a8b88..7d930a8d9d 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1555,23 +1555,36 @@ Future deleteTenant(Reference db, TenantName name) { } ACTOR template -Future>> listTenantsTransaction(Transaction tr, - TenantNameRef begin, - TenantNameRef end, - int limit) { +Future>> listTenantsTransaction( + Transaction tr, + TenantNameRef begin, + TenantNameRef end, + int limit, + std::vector filters = std::vector()) { tr->setOption(FDBTransactionOptions::RAW_ACCESS); KeyBackedRangeResult> results = wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); - return results.results; + if (filters.empty()) { + return results.results; + } + std::vector> filterResults; + for (auto pair : results.results) { + if (std::count(filters.begin(), filters.end(), pair.second.tenantState)) { + filterResults.push_back(pair); + } + } + return filterResults; } ACTOR template -Future>> listTenants(Reference db, - TenantName begin, - TenantName end, - int limit) { +Future>> listTenants( + Reference db, + TenantName begin, + TenantName end, + int limit, + std::vector filters = std::vector()) { state Reference tr = db->createTransaction(); loop { @@ -1579,7 +1592,7 @@ Future>> listTenants(Reference tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); std::vector> tenants = - wait(listTenantsTransaction(tr, begin, end, limit)); + wait(listTenantsTransaction(tr, begin, end, limit, filters)); return tenants; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); From 
b17c3fecbbd9a41865ddf3fb270447e6eea9682f Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 26 Oct 2022 14:37:00 -0700 Subject: [PATCH 02/57] add invalid tenant state and assertion in metacluster consistency --- fdbcli/MetaclusterCommands.actor.cpp | 2 +- fdbclient/Tenant.cpp | 4 +++- fdbclient/include/fdbclient/Tenant.h | 12 +++++++++- .../workloads/MetaclusterConsistency.actor.h | 22 +++++++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 652df4645b..068fe75de5 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -397,7 +397,7 @@ std::vector metaclusterHintGenerator(std::vector const& "", "|connection_string=>" }; return std::vector(opts.begin() + std::min(1, tokens.size() - 2), opts.end()); - } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + } else if (tokencmp(tokens[1], "list") && tokens.size() < 6) { static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]", "[state=,,...]" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index e4d27a8be0..8ef4a8b9e3 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -64,6 +64,8 @@ std::string TenantMapEntry::tenantStateToString(TenantState tenantState) { return "renaming to"; case TenantState::ERROR: return "error"; + case TenantState::INVALID: + return "invalid"; default: UNREACHABLE(); } @@ -87,7 +89,7 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { return TenantState::ERROR; } - UNREACHABLE(); + return TenantState::INVALID; } std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) { diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index 87e1731e90..0781ed08a2 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ 
b/fdbclient/include/fdbclient/Tenant.h @@ -49,6 +49,7 @@ typedef Standalone TenantGroupName; // RENAMING_TO - the tenant is being created as a rename from an existing tenant and is awaiting the rename to complete // on the data cluster // ERROR - the tenant is in an error state +// INVALID - Unrecognized state - likely the result of a failed parsing // // A tenant in any configuration is allowed to be removed. Only tenants in the READY or UPDATING_CONFIGURATION phases // can have their configuration updated. A tenant must not exist or be in the REGISTERING phase to be created. To be @@ -57,7 +58,16 @@ typedef Standalone TenantGroupName; // // If an operation fails and the tenant is left in a non-ready state, re-running the same operation is legal. If // successful, the tenant will return to the READY state. -enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, RENAMING_FROM, RENAMING_TO, ERROR }; +enum class TenantState { + REGISTERING, + READY, + REMOVING, + UPDATING_CONFIGURATION, + RENAMING_FROM, + RENAMING_TO, + ERROR, + INVALID +}; // Represents the lock state the tenant could be in. // Can be used in conjunction with the other tenant states above. 
diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index 25f3fcae19..55b6aa863a 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -71,6 +71,20 @@ private: ACTOR static Future loadManagementClusterMetadata(MetaclusterConsistencyCheck* self) { state Reference managementTr = self->managementDb->createTransaction(); state std::vector> tenantList; + state std::vector> tenantListReady; + state std::vector> tenantListOther; + + state std::vector readyFilter; + state std::vector otherFilter; + + readyFilter.push_back(TenantState::READY); + otherFilter.push_back(TenantState::REGISTERING); + otherFilter.push_back(TenantState::REMOVING); + otherFilter.push_back(TenantState::UPDATING_CONFIGURATION); + otherFilter.push_back(TenantState::RENAMING_FROM); + otherFilter.push_back(TenantState::RENAMING_TO); + otherFilter.push_back(TenantState::ERROR); + otherFilter.push_back(TenantState::INVALID); loop { try { @@ -101,6 +115,12 @@ private: store(tenantList, MetaclusterAPI::listTenantsTransaction( managementTr, ""_sr, "\xff\xff"_sr, metaclusterMaxTenants)) && + store(tenantListReady, + MetaclusterAPI::listTenantsTransaction( + managementTr, ""_sr, "\xff\xff"_sr, metaclusterMaxTenants, readyFilter)) && + store(tenantListOther, + MetaclusterAPI::listTenantsTransaction( + managementTr, ""_sr, "\xff\xff"_sr, metaclusterMaxTenants, otherFilter)) && store(self->managementMetadata.tenantGroups, MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange( managementTr, {}, {}, metaclusterMaxTenants)) && @@ -113,6 +133,8 @@ private: } } + ASSERT(tenantListReady.size() + tenantListOther.size() == tenantList.size()); + self->managementMetadata.tenantMap = std::map(tenantList.begin(), tenantList.end()); for (auto t : 
self->managementMetadata.clusterTenantTuples.results) { From 098793893e96a35eeddb1b1f9f309ab049d3b545 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 26 Oct 2022 14:50:59 -0700 Subject: [PATCH 03/57] move hints to correct generator --- fdbcli/MetaclusterCommands.actor.cpp | 4 ++-- fdbcli/TenantCommands.actor.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 068fe75de5..edb25ace2c 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -397,8 +397,8 @@ std::vector metaclusterHintGenerator(std::vector const& "", "|connection_string=>" }; return std::vector(opts.begin() + std::min(1, tokens.size() - 2), opts.end()); - } else if (tokencmp(tokens[1], "list") && tokens.size() < 6) { - static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]", "[state=,,...]" }; + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { static std::vector opts = { "", "[JSON]" }; diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index 73c4e79adb..00bd0e8309 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -626,8 +626,8 @@ std::vector tenantHintGenerator(std::vector const& token } else if (tokencmp(tokens[1], "delete") && tokens.size() < 3) { static std::vector opts = { "" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); - } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { - static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + } else if (tokencmp(tokens[1], "list") && tokens.size() < 6) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]", "[state=,,...]" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if 
(tokencmp(tokens[1], "get") && tokens.size() < 4) { static std::vector opts = { "", "[JSON]" }; From 8a59bc276d52c3299b8e5a1ff7b95403a014de95 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 26 Oct 2022 11:44:42 -0700 Subject: [PATCH 04/57] data operation api (not finished) --- fdbserver/MockGlobalState.actor.cpp | 62 +++++++++++++++++++ fdbserver/include/fdbserver/MockGlobalState.h | 22 +++++++ .../include/fdbserver/StorageMetrics.actor.h | 4 ++ fdbserver/storageserver.actor.cpp | 3 +- 4 files changed, 89 insertions(+), 2 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 8de995fd2b..f95b2a33f6 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -250,6 +250,68 @@ Future MockStorageServer::run() { return serveStorageMetricsRequests(this, ssi); } +void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { + notifyMvccStorageCost(key, bytes); +} + +void MockStorageServer::insert(KeyRef key, int64_t bytes) { + notifyMvccStorageCost(key, bytes); +} + +void MockStorageServer::clear(KeyRef key, int64_t bytes) { + notifyMvccStorageCost(key, bytes); +} + +void MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { + notifyMvccStorageCost(range.begin, range.begin.size() + range.end.size()); + + auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); +} + +void MockStorageServer::get(KeyRef key, int64_t bytes) { + // If the read yields no value, randomly sample the empty read. 
+ int64_t bytesReadPerKSecond = std::max(bytes, SERVER_KNOBS->EMPTY_READ_PENALTY); + metrics.notifyBytesReadPerKSecond(key, bytesReadPerKSecond); +} + +void MockStorageServer::getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { + auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); + // For performance concerns, the cost of a range read is billed to the start key and end key of the + // range. + if (totalByteSize > 0) { + int64_t bytesReadPerKSecond = std::max(totalByteSize, SERVER_KNOBS->EMPTY_READ_PENALTY) / 2; + metrics.notifyBytesReadPerKSecond(range.begin, bytesReadPerKSecond); + metrics.notifyBytesReadPerKSecond(range.end, bytesReadPerKSecond); + } +} + +int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { + int64_t totalByteSize = 0; + auto ranges = serverKeys.intersectingRanges(range); + + // use the beginShardBytes as partial size + if (ranges.begin().begin() < range.begin) { + ranges.pop_front(); + totalByteSize += beginShardBytes; + } + // use the endShardBytes as partial size + if (ranges.end().begin() < range.end) { + totalByteSize += endShardBytes; + } + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + totalByteSize += it->cvalue().shardSize; + } + return totalByteSize; +} + +void MockStorageServer::notifyMvccStorageCost(KeyRef key, int64_t size) { + // update write bandwidth and iops as mock the cost of writing mvcc storage + StorageMetrics s; + s.bytesPerKSecond = mvccStorageBytes(size) / 2; + s.iosPerKSecond = 1; + metrics.notify(key, s); +} + void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index ac984e9069..260478c57e 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ 
b/fdbserver/include/fdbserver/MockGlobalState.h @@ -133,6 +133,23 @@ public: Future run(); + // data operation APIs - change the metrics + + // Set key with a new value, the total bytes change from oldBytes to bytes + void set(KeyRef key, int64_t bytes, int64_t oldBytes); + // Insert key with a new value, the total bytes is `bytes` + void insert(KeyRef key, int64_t bytes); + // Clear key and its value of which the size is bytes + void clear(KeyRef key, int64_t bytes); + // Clear range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` + void clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + + // modify the metrics as like doing an n-bytes read op + // Read key and cause bytes read overhead + void get(KeyRef key, int64_t bytes); + // Read range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` + void getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + protected: void threeWayShardSplitting(KeyRangeRef outerRange, KeyRangeRef innerRange, @@ -140,6 +157,11 @@ protected: bool restrictSize); void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize); + + // Assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` + int64_t estimateRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + // Update the storage metrics as if we write the MVCC storage with a mutation of `size` bytes. 
+ void notifyMvccStorageCost(KeyRef key, int64_t size); }; class MockGlobalStateImpl; diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index dc518cf318..34a2d27dd5 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -228,5 +228,9 @@ Future serveStorageMetricsRequests(ServiceType* self, StorageServerInterfa } } +// For both the mutation log and the versioned map. +inline int mvccStorageBytes(int64_t size) { + return VersionedMap::overheadPerItem * 2 + (MutationRef::OVERHEAD_BYTES + size) * 2; +} #include "flow/unactorcompiler.h" #endif // FDBSERVER_STORAGEMETRICS_H \ No newline at end of file diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 1e337bc4fd..41cc8e701d 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -535,8 +535,7 @@ const int VERSION_OVERHEAD = // overhead for map // For both the mutation log and the versioned map. 
static int mvccStorageBytes(MutationRef const& m) { - return VersionedMap::overheadPerItem * 2 + - (MutationRef::OVERHEAD_BYTES + m.param1.size() + m.param2.size()) * 2; + return mvccStorageBytes(m.param1.size() + m.param2.size()); } struct FetchInjectionInfo { From 334fced5723747bbcbe5b0f7bf0a377710f9d8eb Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 27 Oct 2022 10:59:19 -0700 Subject: [PATCH 05/57] add data api implementations; add more realistic fetchKey implementation; finish randomKeyBetween implementation --- fdbclient/FDBTypes.cpp | 46 +++++++++++ fdbclient/include/fdbclient/FDBTypes.h | 3 + fdbserver/DDTxnProcessor.actor.cpp | 30 ++++++- fdbserver/MockGlobalState.actor.cpp | 82 +++++++++++++++++-- fdbserver/include/fdbserver/MockGlobalState.h | 21 ++++- .../include/fdbserver/StorageMetrics.actor.h | 5 +- fdbserver/storageserver.actor.cpp | 6 +- 7 files changed, 179 insertions(+), 14 deletions(-) diff --git a/fdbclient/FDBTypes.cpp b/fdbclient/FDBTypes.cpp index edd016d391..056fc3430d 100644 --- a/fdbclient/FDBTypes.cpp +++ b/fdbclient/FDBTypes.cpp @@ -50,6 +50,52 @@ KeyRef keyBetween(const KeyRangeRef& keys) { return keys.end; } +Key randomKeyBetween(const KeyRangeRef& keys) { + KeyRef begin = keys.begin; + KeyRef end = keys.end; + ASSERT(begin < end); + if (begin.size() < end.size()) { + // randomly append a char + uint8_t newChar = deterministicRandom()->randomInt(0, end[begin.size()] + 1); + return begin.withSuffix(StringRef(&newChar, 1)); + } + + int pos = 0; // will be the position of the first difference between keys.begin and keys.end + for (; pos < end.size() && pos < CLIENT_KNOBS->KEY_SIZE_LIMIT; pos++) { + if (keys.begin[pos] != keys.end[pos]) { + break; + } + } + ASSERT(pos < end.size()); // otherwise, begin >= end + + // find the lowest char in range begin[pos+1, begin.size()) that is not \xff (255) + int lowest = begin.size() - 1; + for (; lowest > pos; lowest--) { + if (begin[lowest] < 255) { + Key res = begin; + uint8_t* ptr = 
mutateString(res); + *(ptr + lowest) = (uint8_t)deterministicRandom()->randomInt(begin[lowest] + 1, 256); + return res; + } + } + + if (begin[pos] + 1 < end[pos]) { + Key res = begin; + uint8_t* ptr = mutateString(res); + *(ptr + pos) = (uint8_t)deterministicRandom()->randomInt(begin[pos] + 1, end[pos]); + return res; + } + + if (begin.size() + 1 < CLIENT_KNOBS->KEY_SIZE_LIMIT) { + // randomly append a char + uint8_t newChar = deterministicRandom()->randomInt(1, 255); + return begin.withSuffix(StringRef(&newChar, 1)); + } + + // no possible result + return end; +} + void KeySelectorRef::setKey(KeyRef const& key) { // There are no keys in the database with size greater than the max key size, so if this key selector has a key // which is large, then we can translate it to an equivalent key selector with a smaller key diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index ddb6404bb8..a7ed1040ec 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -590,6 +590,9 @@ inline KeyRange prefixRange(KeyRef prefix) { // The returned reference is valid as long as keys is valid. KeyRef keyBetween(const KeyRangeRef& keys); +// Returns a randomKey between keys. If it's impossible, return keys.end. 
+Key randomKeyBetween(const KeyRangeRef& keys); + KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix); struct KeySelectorRef { diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 693e06e949..6a6b7d78dc 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -680,14 +680,32 @@ Future DDTxnProcessor::rawFinishMovement(MoveKeysParams& params, } struct DDMockTxnProcessorImpl { + // return when all status become TRANSFERRED + ACTOR static Future checkFetchingState(DDMockTxnProcessor* self, std::vector ids, KeyRangeRef range) { + loop { + wait(delayJittered(1.0)); + DDMockTxnProcessor* selfP = self; + KeyRangeRef cloneRef; + if (std::all_of(ids.begin(), ids.end(), [selfP, cloneRef](const UID& id) { + auto& server = selfP->mgs->allServers.at(id); + return server.allShardStatusEqual(cloneRef, MockShardStatus::TRANSFERRED) || + server.allShardStatusEqual(cloneRef, MockShardStatus::COMPLETED); + })) { + break; + } + } + if (BUGGIFY_WITH_PROB(0.5)) { + wait(delayJittered(5.0)); + } + return Void(); + } + ACTOR static Future moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) { state std::map tssMapping; self->rawStartMovement(params, tssMapping); ASSERT(tssMapping.empty()); - if (BUGGIFY_WITH_PROB(0.5)) { - wait(delayJittered(5.0)); - } + wait(checkFetchingState(self, params.destinationTeam, params.keys)); self->rawFinishMovement(params, tssMapping); if (!params.dataMovementComplete.isSet()) @@ -877,8 +895,12 @@ void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::mapshardMapping->moveShard(params.keys, destTeams); + auto randomRangeSize = + deterministicRandom()->randomInt64(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES); for (auto& id : params.destinationTeam) { - mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::INFLIGHT, mgs->restrictSize); + auto& server = mgs->allServers.at(id); + server.setShardStatus(params.keys, 
MockShardStatus::INFLIGHT, mgs->restrictSize); + server.signalFetchKeys(params.keys, randomRangeSize); } } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index f95b2a33f6..cbb0a53bf4 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -114,6 +114,43 @@ public: } return Void(); } + + ACTOR static Future serveMockStorageServer(MockStorageServer* self) { + state ActorCollection actors; + loop choose { + when(MockStorageServer::FetchKeysParams params = waitNext(self->fetchKeysRequests.getFuture())) { + if (!self->allShardStatusEqual(params.keys, MockShardStatus::COMPLETED)) { + actors.add(waitFetchKeysFinish(self, params)); + } + } + when(wait(actors.getResult())) { ASSERT(false); } + } + } + ACTOR static Future waitFetchKeysFinish(MockStorageServer* self, MockStorageServer::FetchKeysParams params) { + // between each chunk delay for random time, and finally set the fetchComplete signal. + ASSERT(params.totalRangeBytes > 0); + state int chunkCount = std::ceil(params.totalRangeBytes * 1.0 / SERVER_KNOBS->FETCH_BLOCK_BYTES); + state Key lastKey = params.keys.begin; + + state int i = 0; + for (; i < chunkCount; ++i) { + wait(delayJittered(0.01)); + int remainBytes = (chunkCount == 1 ? 
params.totalRangeBytes : SERVER_KNOBS->FETCH_BLOCK_BYTES); + + while (remainBytes >= lastKey.size()) { + int maxSize = std::min(remainBytes, 130000) + 1; + int randomSize = deterministicRandom()->randomInt(lastKey.size(), maxSize); + + self->availableDiskSpace -= randomSize; + self->byteSampleApplySet(lastKey, randomSize); + remainBytes -= randomSize; + lastKey = randomKeyBetween(KeyRangeRef(lastKey, params.keys.end)); + } + } + + self->setShardStatus(params.keys, MockShardStatus::TRANSFERRED, true); + return Void(); + } }; bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) { @@ -133,7 +170,6 @@ void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status if (ranges.begin().range().contains(range)) { CODE_PROBE(true, "Implicitly split single shard to 3 pieces"); threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize); - return; } if (ranges.begin().begin() < range.begin) { CODE_PROBE(true, "Implicitly split begin range to 2 pieces"); @@ -155,7 +191,8 @@ void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status auto oldStatus = it.value().status; if (isStatusTransitionValid(oldStatus, status)) { it.value() = ShardInfo{ status, newSize }; - } else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) { + } else if (oldStatus == MockShardStatus::COMPLETED && + (status == MockShardStatus::INFLIGHT || status == MockShardStatus::TRANSFERRED)) { CODE_PROBE(true, "Shard already on server"); } else { TraceEvent(SevError, "MockShardStatusTransitionError") @@ -176,6 +213,9 @@ void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange, uint64_t outerRangeSize, bool restrictSize) { ASSERT(outerRange.contains(innerRange)); + if (outerRange == innerRange) { + return; + } Key left = outerRange.begin; // random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid. 
@@ -216,6 +256,7 @@ void MockStorageServer::removeShard(KeyRangeRef range) { auto ranges = serverKeys.containedRanges(range); ASSERT(ranges.begin().range() == range); serverKeys.rawErase(range); + metrics.notifyNotReadable(range); } uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const { @@ -247,7 +288,9 @@ Future MockStorageServer::run() { ssi.initEndpoints(); ssi.startAcceptingRequests(); TraceEvent("MockStorageServerStart").detail("Address", ssi.address()); - return serveStorageMetricsRequests(this, ssi); + addActor(serveStorageMetricsRequests(this, ssi)); + addActor(MockStorageServerImpl::serveMockStorageServer(this)); + return actors.getResult(); } void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { @@ -258,14 +301,14 @@ void MockStorageServer::insert(KeyRef key, int64_t bytes) { notifyMvccStorageCost(key, bytes); } +// TODO: finish clear implementation. Currently the clear operations are not used. void MockStorageServer::clear(KeyRef key, int64_t bytes) { notifyMvccStorageCost(key, bytes); } void MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { notifyMvccStorageCost(range.begin, range.begin.size() + range.end.size()); - - auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); + // auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); } void MockStorageServer::get(KeyRef key, int64_t bytes) { @@ -312,6 +355,35 @@ void MockStorageServer::notifyMvccStorageCost(KeyRef key, int64_t size) { metrics.notify(key, s); } +void MockStorageServer::signalFetchKeys(KeyRangeRef range, int64_t rangeTotalBytes) { + fetchKeysRequests.send({ KeyRange(range), rangeTotalBytes }); +} + +Future MockStorageServer::fetchKeys(const MockStorageServer::FetchKeysParams& param) { + return MockStorageServerImpl::waitFetchKeysFinish(this, param); +} + +void MockStorageServer::byteSampleApplySet(KeyRef key, int64_t kvSize) { + // Update byteSample 
in memory and notify waiting metrics + ByteSampleInfo sampleInfo = isKeyValueInSample(key, kvSize); + auto& byteSample = metrics.byteSample.sample; + + int64_t delta = 0; + auto old = byteSample.find(key); + if (old != byteSample.end()) + delta = -byteSample.getMetric(old); + + if (sampleInfo.inSample) { + delta += sampleInfo.sampledSize; + byteSample.insert(key, sampleInfo.sampledSize); + } else if (old != byteSample.end()) { + byteSample.erase(old); + } + + if (delta) + metrics.notifyBytes(key, delta); +} + void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 260478c57e..16e93ce664 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -35,6 +35,7 @@ enum class MockShardStatus { EMPTY = 0, // data loss COMPLETED, INFLIGHT, + TRANSFERRED, // finish fetch Keys but not change the serverKey mapping. Only can be set by MSS itself. 
UNSET }; @@ -42,8 +43,11 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) { switch (from) { case MockShardStatus::UNSET: case MockShardStatus::EMPTY: - case MockShardStatus::INFLIGHT: return to == MockShardStatus::COMPLETED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; + case MockShardStatus::INFLIGHT: + return to == MockShardStatus::TRANSFERRED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; + case MockShardStatus::TRANSFERRED: + return to == MockShardStatus::COMPLETED; case MockShardStatus::COMPLETED: return to == MockShardStatus::EMPTY; default: @@ -52,8 +56,10 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) { return false; } +class MockStorageServerImpl; class MockStorageServer : public IStorageMetricsService { friend struct MockGlobalStateTester; + friend class MockStorageServerImpl; ActorCollection actors; @@ -66,6 +72,11 @@ public: bool operator!=(const ShardInfo& a) const { return !(a == *this); } }; + struct FetchKeysParams { + KeyRange keys; + int64_t totalRangeBytes; + }; + static constexpr uint64_t DEFAULT_DISK_SPACE = 1000LL * 1024 * 1024 * 1024; // control plane statistics associated with a real storage server @@ -150,7 +161,11 @@ public: // Read range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` void getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + void signalFetchKeys(KeyRangeRef range, int64_t rangeTotalBytes); + protected: + PromiseStream fetchKeysRequests; + void threeWayShardSplitting(KeyRangeRef outerRange, KeyRangeRef innerRange, uint64_t outerRangeSize, @@ -162,6 +177,10 @@ protected: int64_t estimateRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); // Update the storage metrics as if we write the MVCC storage with a mutation of `size` bytes. 
void notifyMvccStorageCost(KeyRef key, int64_t size); + + Future fetchKeys(const FetchKeysParams&); + + void byteSampleApplySet(KeyRef key, int64_t kvSize); }; class MockGlobalStateImpl; diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index 34a2d27dd5..db7524d5f9 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -156,7 +156,10 @@ struct ByteSampleInfo { // Determines whether a key-value pair should be included in a byte sample // Also returns size information about the sample -ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue); +ByteSampleInfo isKeyValueInSample(KeyRef key, int64_t totalKvSize); +inline ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue) { + return isKeyValueInSample(keyValue.key, keyValue.key.size() + keyValue.value.size()); +} class IStorageMetricsService { public: diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 41cc8e701d..35101586f7 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -9931,11 +9931,11 @@ Future StorageServerDisk::restoreDurableState() { // Determines whether a key-value pair should be included in a byte sample // Also returns size information about the sample -ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue) { +ByteSampleInfo isKeyValueInSample(const KeyRef key, int64_t totalKvSize) { + ASSERT(totalKvSize >= key.size()); ByteSampleInfo info; - const KeyRef key = keyValue.key; - info.size = key.size() + keyValue.value.size(); + info.size = totalKvSize; uint32_t a = 0; uint32_t b = 0; From cc61ea6a01005f9a78e90e942bb9ac74038342a8 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 27 Oct 2022 12:50:49 -0700 Subject: [PATCH 06/57] finish the clearrange ops --- fdbserver/MockGlobalState.actor.cpp | 67 +++++++++++++++++-- fdbserver/include/fdbserver/MockGlobalState.h | 7 +- 2 files changed, 66 
insertions(+), 8 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index cbb0a53bf4..6230fa2c99 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -255,7 +255,10 @@ void MockStorageServer::twoWayShardSplitting(KeyRangeRef range, void MockStorageServer::removeShard(KeyRangeRef range) { auto ranges = serverKeys.containedRanges(range); ASSERT(ranges.begin().range() == range); + auto rangeSize = sumRangeSize(range); + availableDiskSpace += rangeSize; serverKeys.rawErase(range); + byteSampleApplyClear(range); metrics.notifyNotReadable(range); } @@ -295,20 +298,26 @@ Future MockStorageServer::run() { void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { notifyMvccStorageCost(key, bytes); + byteSampleApplySet(key, bytes); + auto delta = oldBytes - bytes; + availableDiskSpace += delta; + serverKeys[key].shardSize += delta; } -void MockStorageServer::insert(KeyRef key, int64_t bytes) { - notifyMvccStorageCost(key, bytes); -} - -// TODO: finish clear implementation. Currently the clear operations are not used. 
void MockStorageServer::clear(KeyRef key, int64_t bytes) { notifyMvccStorageCost(key, bytes); + KeyRange sr = singleKeyRange(key); + byteSampleApplyClear(sr); + availableDiskSpace += bytes; + serverKeys[key].shardSize -= bytes; } void MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { notifyMvccStorageCost(range.begin, range.begin.size() + range.end.size()); - // auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); + byteSampleApplyClear(range); + auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); + availableDiskSpace += totalByteSize; + clearRangeTotalBytes(range, beginShardBytes, endShardBytes); } void MockStorageServer::get(KeyRef key, int64_t bytes) { @@ -347,6 +356,25 @@ int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef range, int64_t be return totalByteSize; } +void MockStorageServer::clearRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { + auto ranges = serverKeys.intersectingRanges(range); + + // use the beginShardBytes as partial size + if (ranges.begin().begin() < range.begin) { + auto delta = std::min(ranges.begin().value().shardSize, (uint64_t)beginShardBytes); + ranges.begin().value().shardSize -= delta; + ranges.pop_front(); + } + // use the endShardBytes as partial size + if (ranges.end().begin() < range.end) { + auto delta = std::min(ranges.end().value().shardSize, (uint64_t)endShardBytes); + ranges.end().value().shardSize -= delta; + } + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + it->value().shardSize = 0; + } +} + void MockStorageServer::notifyMvccStorageCost(KeyRef key, int64_t size) { // update write bandwidth and iops as mock the cost of writing mvcc storage StorageMetrics s; @@ -384,6 +412,33 @@ void MockStorageServer::byteSampleApplySet(KeyRef key, int64_t kvSize) { metrics.notifyBytes(key, delta); } +void MockStorageServer::byteSampleApplyClear(KeyRangeRef range) { 
+ // Update byteSample in memory and (eventually) on disk via the mutationLog and notify waiting metrics + + auto& byteSample = metrics.byteSample.sample; + bool any = false; + + if (range.begin < allKeys.end) { + // NotifyBytes should not be called for keys past allKeys.end + KeyRangeRef searchRange = KeyRangeRef(range.begin, std::min(range.end, allKeys.end)); + + auto r = metrics.waitMetricsMap.intersectingRanges(searchRange); + for (auto shard = r.begin(); shard != r.end(); ++shard) { + KeyRangeRef intersectingRange = shard.range() & range; + int64_t bytes = byteSample.sumRange(intersectingRange.begin, intersectingRange.end); + metrics.notifyBytes(shard, -bytes); + any = any || bytes > 0; + } + } + + if (range.end > allKeys.end && byteSample.sumRange(std::max(allKeys.end, range.begin), range.end) > 0) + any = true; + + if (any) { + byteSample.eraseAsync(range.begin, range.end); + } +} + void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 16e93ce664..ed01f26ee0 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -148,8 +148,6 @@ public: // Set key with a new value, the total bytes change from oldBytes to bytes void set(KeyRef key, int64_t bytes, int64_t oldBytes); - // Insert key with a new value, the total bytes is `bytes` - void insert(KeyRef key, int64_t bytes); // Clear key and its value of which the size is bytes void clear(KeyRef key, int64_t bytes); // Clear range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` @@ -175,12 +173,17 @@ protected: // Assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` int64_t estimateRangeTotalBytes(KeyRangeRef range, int64_t 
beginShardBytes, int64_t endShardBytes); + // Decrease the intersecting shard bytes as if delete the data + void clearRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + // Update the storage metrics as if we write the MVCC storage with a mutation of `size` bytes. void notifyMvccStorageCost(KeyRef key, int64_t size); Future fetchKeys(const FetchKeysParams&); void byteSampleApplySet(KeyRef key, int64_t kvSize); + + void byteSampleApplyClear(KeyRangeRef range); }; class MockGlobalStateImpl; From 0cbd1dfccaa268e18e18878314f51cc0287b0f5b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 27 Oct 2022 15:46:48 -0700 Subject: [PATCH 07/57] add comment and MockDDTest base class file --- fdbserver/DDTxnProcessor.actor.cpp | 5 +-- fdbserver/MockGlobalState.actor.cpp | 20 ++++++++-- fdbserver/include/fdbserver/MockGlobalState.h | 22 ++++++---- fdbserver/workloads/MockDDTest.actor.cpp | 40 +++++++++++++++++++ 4 files changed, 73 insertions(+), 14 deletions(-) create mode 100644 fdbserver/workloads/MockDDTest.actor.cpp diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 6a6b7d78dc..7b124bea06 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -680,7 +680,7 @@ Future DDTxnProcessor::rawFinishMovement(MoveKeysParams& params, } struct DDMockTxnProcessorImpl { - // return when all status become TRANSFERRED + // return when all status become FETCHED ACTOR static Future checkFetchingState(DDMockTxnProcessor* self, std::vector ids, KeyRangeRef range) { loop { wait(delayJittered(1.0)); @@ -688,8 +688,7 @@ struct DDMockTxnProcessorImpl { KeyRangeRef cloneRef; if (std::all_of(ids.begin(), ids.end(), [selfP, cloneRef](const UID& id) { auto& server = selfP->mgs->allServers.at(id); - return server.allShardStatusEqual(cloneRef, MockShardStatus::TRANSFERRED) || - server.allShardStatusEqual(cloneRef, MockShardStatus::COMPLETED); + return server.allShardStatusIn(cloneRef, { 
MockShardStatus::FETCHED, MockShardStatus::COMPLETED }); })) { break; } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 6230fa2c99..a862072d51 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -148,7 +148,7 @@ public: } } - self->setShardStatus(params.keys, MockShardStatus::TRANSFERRED, true); + self->setShardStatus(params.keys, MockShardStatus::FETCHED, true); return Void(); } }; @@ -164,6 +164,17 @@ bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus s return true; } +bool MockStorageServer::allShardStatusIn(KeyRangeRef range, const std::set& status) { + auto ranges = serverKeys.intersectingRanges(range); + ASSERT(!ranges.empty()); // at least the range is allKeys + + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + if (!status.count(it->cvalue().status)) + return false; + } + return true; +} + void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) { auto ranges = serverKeys.intersectingRanges(range); ASSERT(!ranges.empty()); @@ -192,7 +203,7 @@ void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status if (isStatusTransitionValid(oldStatus, status)) { it.value() = ShardInfo{ status, newSize }; } else if (oldStatus == MockShardStatus::COMPLETED && - (status == MockShardStatus::INFLIGHT || status == MockShardStatus::TRANSFERRED)) { + (status == MockShardStatus::INFLIGHT || status == MockShardStatus::FETCHED)) { CODE_PROBE(true, "Shard already on server"); } else { TraceEvent(SevError, "MockShardStatusTransitionError") @@ -413,7 +424,7 @@ void MockStorageServer::byteSampleApplySet(KeyRef key, int64_t kvSize) { } void MockStorageServer::byteSampleApplyClear(KeyRangeRef range) { - // Update byteSample in memory and (eventually) on disk via the mutationLog and notify waiting metrics + // Update byteSample and notify waiting metrics auto& byteSample = 
metrics.byteSample.sample; bool any = false; @@ -483,7 +494,8 @@ bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shar // check serverKeys auto& mss = allServers.at(serverId); - if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) { + if (!mss.allShardStatusIn(shard, + { MockShardStatus::INFLIGHT, MockShardStatus::COMPLETED, MockShardStatus::FETCHED })) { return false; } diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index ed01f26ee0..a86d8aeb44 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -35,7 +35,7 @@ enum class MockShardStatus { EMPTY = 0, // data loss COMPLETED, INFLIGHT, - TRANSFERRED, // finish fetch Keys but not change the serverKey mapping. Only can be set by MSS itself. + FETCHED, // finish fetch but not change the serverKey mapping. Only can be set by MSS itself. UNSET }; @@ -45,8 +45,8 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) { case MockShardStatus::EMPTY: return to == MockShardStatus::COMPLETED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; case MockShardStatus::INFLIGHT: - return to == MockShardStatus::TRANSFERRED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; - case MockShardStatus::TRANSFERRED: + return to == MockShardStatus::FETCHED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; + case MockShardStatus::FETCHED: return to == MockShardStatus::COMPLETED; case MockShardStatus::COMPLETED: return to == MockShardStatus::EMPTY; @@ -80,7 +80,7 @@ public: static constexpr uint64_t DEFAULT_DISK_SPACE = 1000LL * 1024 * 1024 * 1024; // control plane statistics associated with a real storage server - uint64_t usedDiskSpace = 0, availableDiskSpace = DEFAULT_DISK_SPACE; + uint64_t totalDiskSpace = DEFAULT_DISK_SPACE, availableDiskSpace = DEFAULT_DISK_SPACE; // In-memory counterpart of the 
`serverKeys` in system keyspace // the value ShardStatus is [InFlight, Completed, Empty] and metrics uint64_t is the shard size, the caveat is the @@ -96,7 +96,8 @@ public: MockStorageServer() = default; MockStorageServer(StorageServerInterface ssi, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) - : usedDiskSpace(usedDiskSpace), availableDiskSpace(availableDiskSpace), ssi(ssi), id(ssi.id()) {} + : totalDiskSpace(usedDiskSpace + availableDiskSpace), availableDiskSpace(availableDiskSpace), ssi(ssi), + id(ssi.id()) {} MockStorageServer(const UID& id, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) : MockStorageServer(StorageServerInterface(id), availableDiskSpace, usedDiskSpace) {} @@ -104,6 +105,7 @@ public: decltype(serverKeys)::Ranges getAllRanges() { return serverKeys.ranges(); } bool allShardStatusEqual(KeyRangeRef range, MockShardStatus status); + bool allShardStatusIn(KeyRangeRef range, const std::set& status); // change the status of range. This function may result in split to make the shard boundary align with range.begin // and range.end. 
In this case, if restrictSize==true, the sum of the split shard size is strictly equal to the old @@ -113,6 +115,7 @@ public: // this function removed an aligned range from server void removeShard(KeyRangeRef range); + // intersecting range size uint64_t sumRangeSize(KeyRangeRef range) const; void addActor(Future future) override; @@ -144,7 +147,7 @@ public: Future run(); - // data operation APIs - change the metrics + // data operation APIs - change the metrics sample, disk space and shard size // Set key with a new value, the total bytes change from oldBytes to bytes void set(KeyRef key, int64_t bytes, int64_t oldBytes); @@ -159,6 +162,7 @@ public: // Read range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` void getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + // trigger the asynchronous fetch keys operation void signalFetchKeys(KeyRangeRef range, int64_t rangeTotalBytes); protected: @@ -179,10 +183,14 @@ protected: // Update the storage metrics as if we write the MVCC storage with a mutation of `size` bytes. void notifyMvccStorageCost(KeyRef key, int64_t size); + // Randomly generate keys and kv size between the fetch range, updating the byte sample. + // Once the fetchKeys return, the shard status will become FETCHED. Future fetchKeys(const FetchKeysParams&); + // Update byte sample as if set a key value pair of which the size is kvSize void byteSampleApplySet(KeyRef key, int64_t kvSize); + // Update byte sample as if clear a whole range void byteSampleApplyClear(KeyRangeRef range); }; @@ -223,7 +231,7 @@ public: * Shard is in-flight. 
* * In mgs.shardMapping,the destination teams is non-empty for a given shard; * * For each MSS belonging to the source teams, mss.serverKeys[shard] = Completed - * * For each MSS belonging to the destination teams, mss.serverKeys[shard] = InFlight|Completed + * * For each MSS belonging to the destination teams, mss.serverKeys[shard] = InFlight | Fetched | Completed * Shard is lost. * * In mgs.shardMapping, the destination teams is empty for the given shard; * * For each MSS belonging to the source teams, mss.serverKeys[shard] = Empty diff --git a/fdbserver/workloads/MockDDTest.actor.cpp b/fdbserver/workloads/MockDDTest.actor.cpp new file mode 100644 index 0000000000..209df66a5e --- /dev/null +++ b/fdbserver/workloads/MockDDTest.actor.cpp @@ -0,0 +1,40 @@ +/* + * MockDDTest.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbserver/DDSharedContext.h" +#include "fdbserver/DDTxnProcessor.h" +#include "fdbserver/MoveKeys.actor.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +struct MockDDTestWorkload { + bool enabled; + double testDuration; + double meanDelay = 0.05; + double maxKeyspace = 0.1; + DDSharedContext ddContext; + + std::shared_ptr mgs; + std::shared_ptr mock; +}; \ No newline at end of file From 11b2c035c0acb19f49c529e712a9d7f6a1bcfb62 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 28 Oct 2022 00:21:54 -0700 Subject: [PATCH 08/57] add unit test for randomKeyBetween --- fdbclient/FDBTypes.cpp | 25 +++++++++++++++++++ .../fdbclient/StorageServerInterface.h | 4 +-- .../include/fdbserver/StorageMetrics.actor.h | 6 ++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/fdbclient/FDBTypes.cpp b/fdbclient/FDBTypes.cpp index 056fc3430d..c104aea5e8 100644 --- a/fdbclient/FDBTypes.cpp +++ b/fdbclient/FDBTypes.cpp @@ -96,6 +96,31 @@ Key randomKeyBetween(const KeyRangeRef& keys) { return end; } +TEST_CASE("/KeyRangeUtil/randomKeyBetween") { + Key begin = "qwert"_sr; + Key end = "qwertyu"_sr; + Key res; + for(int i = 0; i < 10; ++ i) { + res = randomKeyBetween(KeyRangeRef(begin, end)); + ASSERT(res > begin); + ASSERT(res < end); + } + + begin = "q"_sr; + end = "q\x00"_sr; + res = randomKeyBetween(KeyRangeRef(begin, end)); + ASSERT(res == end); + + begin = "aaaaaaa"_sr; + end = "b"_sr; + for(int i = 0; i < 10; ++ i) { + res = randomKeyBetween(KeyRangeRef(begin, end)); + ASSERT(res > begin); + ASSERT(res < end); + } + return Void(); +} + void KeySelectorRef::setKey(KeyRef const& key) { // There are no keys in the database with size greater than the max key size, so if this key selector has a key // which is large, then we can translate it to an equivalent key selector with a smaller key diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index 2358312a4a..cdc79c05c5 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -634,9 +634,9 @@ struct GetShardStateRequest { struct 
StorageMetrics { constexpr static FileIdentifier file_identifier = 13622226; int64_t bytes = 0; // total storage - // FIXME: currently, neither of bytesPerKSecond or iosPerKSecond are actually used in DataDistribution calculations. - // This may change in the future, but this comment is left here to avoid any confusion for the time being. int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s) + + // FIXME: currently, iosPerKSecond is not used in DataDistribution calculations. int64_t iosPerKSecond = 0; int64_t bytesReadPerKSecond = 0; diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index db7524d5f9..553dcaa4b9 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -77,9 +77,9 @@ private: struct StorageServerMetrics { KeyRangeMap>> waitMetricsMap; StorageMetricSample byteSample; - TransientStorageMetricSample iopsSample, - bandwidthSample; // FIXME: iops and bandwidth calculations are not effectively tested, since they aren't - // currently used by data distribution + + // FIXME: iops is not effectively tested, and is not used by data distribution + TransientStorageMetricSample iopsSample, bandwidthSample; TransientStorageMetricSample bytesReadSample; StorageServerMetrics() From 55a3db82b540e1c1f7630557a47bcb6674660aea Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 28 Oct 2022 11:13:21 -0700 Subject: [PATCH 09/57] update the name, comment and discription of write byte sampling; update the calculation of write bandwidth metrics --- fdbclient/ServerKnobs.cpp | 4 +- fdbclient/include/fdbclient/ServerKnobs.h | 2 +- .../fdbclient/StorageServerInterface.h | 20 +++++----- fdbserver/BlobManager.actor.cpp | 10 ++--- fdbserver/BlobWorker.actor.cpp | 2 +- fdbserver/DDShardTracker.actor.cpp | 40 +++++++++---------- fdbserver/DataDistribution.actor.cpp | 6 +-- fdbserver/MockGlobalState.actor.cpp | 12 +++--- 
fdbserver/StorageMetrics.actor.cpp | 30 +++++++------- fdbserver/include/fdbserver/MockGlobalState.h | 4 +- .../include/fdbserver/StorageMetrics.actor.h | 9 +---- fdbserver/storageserver.actor.cpp | 18 +++++---- 12 files changed, 78 insertions(+), 79 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index d729384ebf..c153fb3cc3 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -223,7 +223,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi shards. The bandwidth sample maintained by the storage server needs to be accurate enough to reliably measure this minimum bandwidth. See - BANDWIDTH_UNITS_PER_SAMPLE. If this number is too low, the storage server needs to spend more memory and time on sampling. + BYTES_WRITE_UNITS_PER_SAMPLE. If this number is too low, the storage server needs to spend more memory and time on sampling. */ init( SHARD_SPLIT_BYTES_PER_KSEC, 250 * 1000 * 1000 ); if( buggifySmallBandwidthSplit ) SHARD_SPLIT_BYTES_PER_KSEC = 50 * 1000 * 1000; @@ -743,7 +743,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz! 
init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); - init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); + init( BYTES_WRITE_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes init( READ_HOT_SUB_RANGE_CHUNK_SIZE, 10000000); // 10MB init( EMPTY_READ_PENALTY, 20 ); // 20 bytes diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index d4ba08d518..f782d63e69 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -697,7 +697,7 @@ public: double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; double SPLIT_JITTER_AMOUNT; int64_t IOPS_UNITS_PER_SAMPLE; - int64_t BANDWIDTH_UNITS_PER_SAMPLE; + int64_t BYTES_WRITE_UNITS_PER_SAMPLE; int64_t BYTES_READ_UNITS_PER_SAMPLE; int64_t READ_HOT_SUB_RANGE_CHUNK_SIZE; int64_t EMPTY_READ_PENALTY; diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index cdc79c05c5..2a2442c94b 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -634,7 +634,7 @@ struct GetShardStateRequest { struct StorageMetrics { constexpr static FileIdentifier file_identifier = 13622226; int64_t bytes = 0; // total storage - int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s) + int64_t writeBytesPerKSecond = 0; // network bandwidth (average over 10s) == write bandwidth through any IO devices // FIXME: currently, iosPerKSecond is not used in DataDistribution calculations. 
int64_t iosPerKSecond = 0; @@ -643,33 +643,33 @@ struct StorageMetrics { static const int64_t infinity = 1LL << 60; bool allLessOrEqual(const StorageMetrics& rhs) const { - return bytes <= rhs.bytes && bytesPerKSecond <= rhs.bytesPerKSecond && iosPerKSecond <= rhs.iosPerKSecond && + return bytes <= rhs.bytes && writeBytesPerKSecond <= rhs.writeBytesPerKSecond && iosPerKSecond <= rhs.iosPerKSecond && bytesReadPerKSecond <= rhs.bytesReadPerKSecond; } void operator+=(const StorageMetrics& rhs) { bytes += rhs.bytes; - bytesPerKSecond += rhs.bytesPerKSecond; + writeBytesPerKSecond += rhs.writeBytesPerKSecond; iosPerKSecond += rhs.iosPerKSecond; bytesReadPerKSecond += rhs.bytesReadPerKSecond; } void operator-=(const StorageMetrics& rhs) { bytes -= rhs.bytes; - bytesPerKSecond -= rhs.bytesPerKSecond; + writeBytesPerKSecond -= rhs.writeBytesPerKSecond; iosPerKSecond -= rhs.iosPerKSecond; bytesReadPerKSecond -= rhs.bytesReadPerKSecond; } template void operator*=(F f) { bytes *= f; - bytesPerKSecond *= f; + writeBytesPerKSecond *= f; iosPerKSecond *= f; bytesReadPerKSecond *= f; } - bool allZero() const { return !bytes && !bytesPerKSecond && !iosPerKSecond && !bytesReadPerKSecond; } + bool allZero() const { return !bytes && !writeBytesPerKSecond && !iosPerKSecond && !bytesReadPerKSecond; } template void serialize(Ar& ar) { - serializer(ar, bytes, bytesPerKSecond, iosPerKSecond, bytesReadPerKSecond); + serializer(ar, bytes, writeBytesPerKSecond, iosPerKSecond, bytesReadPerKSecond); } void negate() { operator*=(-1.0); } @@ -697,14 +697,14 @@ struct StorageMetrics { } bool operator==(StorageMetrics const& rhs) const { - return bytes == rhs.bytes && bytesPerKSecond == rhs.bytesPerKSecond && iosPerKSecond == rhs.iosPerKSecond && + return bytes == rhs.bytes && writeBytesPerKSecond == rhs.writeBytesPerKSecond && iosPerKSecond == rhs.iosPerKSecond && bytesReadPerKSecond == rhs.bytesReadPerKSecond; } std::string toString() const { - return format("Bytes: %lld, BPerKSec: %lld, 
iosPerKSec: %lld, BReadPerKSec: %lld", + return format("Bytes: %lld, BWritePerKSec: %lld, iosPerKSec: %lld, BReadPerKSec: %lld", bytes, - bytesPerKSecond, + writeBytesPerKSecond, iosPerKSecond, bytesReadPerKSecond); } diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 84b482cfcf..50c68f328e 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -636,11 +636,11 @@ ACTOR Future splitRange(Reference bmDat // only split on bytes and write rate state StorageMetrics splitMetrics; splitMetrics.bytes = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES; - splitMetrics.bytesPerKSecond = SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; + splitMetrics.writeBytesPerKSecond = SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; if (writeHot) { - splitMetrics.bytesPerKSecond = std::min(splitMetrics.bytesPerKSecond, estimated.bytesPerKSecond / 2); - splitMetrics.bytesPerKSecond = - std::max(splitMetrics.bytesPerKSecond, SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC); + splitMetrics.writeBytesPerKSecond = std::min(splitMetrics.writeBytesPerKSecond, estimated.writeBytesPerKSecond / 2); + splitMetrics.writeBytesPerKSecond = + std::max(splitMetrics.writeBytesPerKSecond, SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC); } splitMetrics.iosPerKSecond = splitMetrics.infinity; splitMetrics.bytesReadPerKSecond = splitMetrics.infinity; @@ -2616,7 +2616,7 @@ ACTOR Future attemptMerges(Reference bmData, wait(bmData->db->getStorageMetrics(std::get<1>(candidates[i]), CLIENT_KNOBS->TOO_MANY)); if (metrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES || - metrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { + metrics.writeBytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { // This granule cannot be merged with any neighbors. 
// If current candidates up to here can be merged, merge them and skip over this one attemptStartMerge(bmData, currentCandidates); diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index fc51c8fae0..f5cb4e2c13 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -1595,7 +1595,7 @@ ACTOR Future granuleCheckMergeCandidate(Reference bwData, // FIXME: maybe separate knob and/or value for write rate? if (currentMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2 || - currentMetrics.bytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { + currentMetrics.writeBytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 2.0)); CODE_PROBE(true, "wait and check later to see if granule got smaller or colder"); continue; diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index 7964915217..b436be9965 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -41,9 +41,9 @@ enum BandwidthStatus { BandwidthStatusLow, BandwidthStatusNormal, BandwidthStatu enum ReadBandwidthStatus { ReadBandwidthStatusNormal, ReadBandwidthStatusHigh }; BandwidthStatus getBandwidthStatus(StorageMetrics const& metrics) { - if (metrics.bytesPerKSecond > SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC) + if (metrics.writeBytesPerKSecond > SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC) return BandwidthStatusHigh; - else if (metrics.bytesPerKSecond < SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) + else if (metrics.writeBytesPerKSecond < SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) return BandwidthStatusLow; return BandwidthStatusNormal; @@ -176,7 +176,7 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize) { bounds.max.bytes = maxShardSize; } - bounds.max.bytesPerKSecond = bounds.max.infinity; + bounds.max.writeBytesPerKSecond = bounds.max.infinity; bounds.max.iosPerKSecond = 
bounds.max.infinity; bounds.max.bytesReadPerKSecond = bounds.max.infinity; @@ -187,14 +187,14 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize) { bounds.min.bytes = maxShardSize / SERVER_KNOBS->SHARD_BYTES_RATIO; } - bounds.min.bytesPerKSecond = 0; + bounds.min.writeBytesPerKSecond = 0; bounds.min.iosPerKSecond = 0; bounds.min.bytesReadPerKSecond = 0; // The permitted error is 1/3 of the general-case minimum bytes (even in the special case where this is the last // shard) bounds.permittedError.bytes = bounds.max.bytes / SERVER_KNOBS->SHARD_BYTES_RATIO / 3; - bounds.permittedError.bytesPerKSecond = bounds.permittedError.infinity; + bounds.permittedError.writeBytesPerKSecond = bounds.permittedError.infinity; bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity; bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity; @@ -222,18 +222,18 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys, std::max(int64_t(bytes - (SERVER_KNOBS->MIN_SHARD_BYTES * 0.1)), (int64_t)0)); bounds.permittedError.bytes = bytes * 0.1; if (bandwidthStatus == BandwidthStatusNormal) { // Not high or low - bounds.max.bytesPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; - bounds.min.bytesPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; - bounds.permittedError.bytesPerKSecond = bounds.min.bytesPerKSecond / 4; + bounds.max.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; + bounds.min.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; + bounds.permittedError.writeBytesPerKSecond = bounds.min.writeBytesPerKSecond / 4; } else if (bandwidthStatus == BandwidthStatusHigh) { // > 10MB/sec for 100MB shard, proportionally lower // for smaller shard, > 200KB/sec no matter what - bounds.max.bytesPerKSecond = bounds.max.infinity; - bounds.min.bytesPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; - bounds.permittedError.bytesPerKSecond = bounds.min.bytesPerKSecond / 4; + 
bounds.max.writeBytesPerKSecond = bounds.max.infinity; + bounds.min.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; + bounds.permittedError.writeBytesPerKSecond = bounds.min.writeBytesPerKSecond / 4; } else if (bandwidthStatus == BandwidthStatusLow) { // < 10KB/sec - bounds.max.bytesPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; - bounds.min.bytesPerKSecond = 0; - bounds.permittedError.bytesPerKSecond = bounds.max.bytesPerKSecond / 4; + bounds.max.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; + bounds.min.writeBytesPerKSecond = 0; + bounds.permittedError.writeBytesPerKSecond = bounds.max.writeBytesPerKSecond / 4; } else { ASSERT(false); } @@ -309,12 +309,12 @@ ACTOR Future trackShardMetrics(DataDistributionTracker::SafeAccessor self, /*TraceEvent("ShardSizeUpdate") .detail("Keys", keys) .detail("UpdatedSize", metrics.metrics.bytes) - .detail("Bandwidth", metrics.metrics.bytesPerKSecond) + .detail("Bandwidth", metrics.metrics.writeBytesPerKSecond) .detail("BandwidthStatus", getBandwidthStatus(metrics)) .detail("BytesLower", bounds.min.bytes) .detail("BytesUpper", bounds.max.bytes) - .detail("BandwidthLower", bounds.min.bytesPerKSecond) - .detail("BandwidthUpper", bounds.max.bytesPerKSecond) + .detail("BandwidthLower", bounds.min.writeBytesPerKSecond) + .detail("BandwidthUpper", bounds.max.writeBytesPerKSecond) .detail("ShardSizePresent", shardSize->get().present()) .detail("OldShardSize", shardSize->get().present() ? shardSize->get().get().metrics.bytes : 0) .detail("TrackerID", trackerID);*/ @@ -882,7 +882,7 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, StorageMetrics splitMetrics; splitMetrics.bytes = shardBounds.max.bytes / 2; - splitMetrics.bytesPerKSecond = + splitMetrics.writeBytesPerKSecond = keys.begin >= keyServersKeys.begin ? 
splitMetrics.infinity : SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; splitMetrics.iosPerKSecond = splitMetrics.infinity; splitMetrics.bytesReadPerKSecond = splitMetrics.infinity; // Don't split by readBandwidth @@ -905,7 +905,7 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, bandwidthStatus == BandwidthStatusHigh ? "High" : bandwidthStatus == BandwidthStatusNormal ? "Normal" : "Low") - .detail("BytesPerKSec", metrics.bytesPerKSecond) + .detail("BytesPerKSec", metrics.writeBytesPerKSecond) .detail("NumShards", numShards); if (numShards > 1) { @@ -1206,7 +1206,7 @@ ACTOR Future shardTracker(DataDistributionTracker::SafeAccessor self, .detail("TrackerID", trackerID) .detail("MaxBytes", self()->maxShardSize->get().get()) .detail("ShardSize", shardSize->get().get().bytes) - .detail("BytesPerKSec", shardSize->get().get().bytesPerKSecond);*/ + .detail("BytesPerKSec", shardSize->get().get().writeBytesPerKSecond);*/ try { loop { diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 1ce76e642c..e71cc51ef5 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -56,12 +56,12 @@ ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() { return ShardSizeBounds{ .max = StorageMetrics{ .bytes = -1, - .bytesPerKSecond = StorageMetrics::infinity, + .writeBytesPerKSecond = StorageMetrics::infinity, .iosPerKSecond = StorageMetrics::infinity, .bytesReadPerKSecond = StorageMetrics::infinity }, - .min = StorageMetrics{ .bytes = -1, .bytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, + .min = StorageMetrics{ .bytes = -1, .writeBytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, .permittedError = StorageMetrics{ .bytes = -1, - .bytesPerKSecond = StorageMetrics::infinity, + .writeBytesPerKSecond = StorageMetrics::infinity, .iosPerKSecond = StorageMetrics::infinity, .bytesReadPerKSecond = StorageMetrics::infinity } }; diff --git 
a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index a862072d51..24d32dfb34 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -308,7 +308,7 @@ Future MockStorageServer::run() { } void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { - notifyMvccStorageCost(key, bytes); + notifyWriteMetrics(key, bytes); byteSampleApplySet(key, bytes); auto delta = oldBytes - bytes; availableDiskSpace += delta; @@ -316,7 +316,7 @@ void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { } void MockStorageServer::clear(KeyRef key, int64_t bytes) { - notifyMvccStorageCost(key, bytes); + notifyWriteMetrics(key, bytes); KeyRange sr = singleKeyRange(key); byteSampleApplyClear(sr); availableDiskSpace += bytes; @@ -324,7 +324,7 @@ void MockStorageServer::clear(KeyRef key, int64_t bytes) { } void MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { - notifyMvccStorageCost(range.begin, range.begin.size() + range.end.size()); + notifyWriteMetrics(range.begin, range.begin.size() + range.end.size()); byteSampleApplyClear(range); auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); availableDiskSpace += totalByteSize; @@ -386,10 +386,10 @@ void MockStorageServer::clearRangeTotalBytes(KeyRangeRef range, int64_t beginSha } } -void MockStorageServer::notifyMvccStorageCost(KeyRef key, int64_t size) { - // update write bandwidth and iops as mock the cost of writing mvcc storage +void MockStorageServer::notifyWriteMetrics(KeyRef key, int64_t size) { + // update write bandwidth and iops as mock the cost of writing a mutation StorageMetrics s; - s.bytesPerKSecond = mvccStorageBytes(size) / 2; + s.writeBytesPerKSecond = size + MutationRef::OVERHEAD_BYTES; s.iosPerKSecond = 1; metrics.notify(key, s); } diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp index ada2301483..89305a968f 
100644 --- a/fdbserver/StorageMetrics.actor.cpp +++ b/fdbserver/StorageMetrics.actor.cpp @@ -75,8 +75,8 @@ KeyRef StorageMetricSample::splitEstimate(KeyRangeRef range, int64_t offset, boo StorageMetrics StorageServerMetrics::getMetrics(KeyRangeRef const& keys) const { StorageMetrics result; result.bytes = byteSample.getEstimate(keys); - result.bytesPerKSecond = - bandwidthSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; + result.writeBytesPerKSecond = + bytesWriteSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; result.iosPerKSecond = iopsSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; result.bytesReadPerKSecond = bytesReadSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; @@ -88,7 +88,7 @@ StorageMetrics StorageServerMetrics::getMetrics(KeyRangeRef const& keys) const { void StorageServerMetrics::notify(KeyRef key, StorageMetrics& metrics) { ASSERT(metrics.bytes == 0); // ShardNotifyMetrics if (g_network->isSimulated()) { - CODE_PROBE(metrics.bytesPerKSecond != 0, "ShardNotifyMetrics bytes"); + CODE_PROBE(metrics.writeBytesPerKSecond != 0, "ShardNotifyMetrics bytes"); CODE_PROBE(metrics.iosPerKSecond != 0, "ShardNotifyMetrics ios"); CODE_PROBE(metrics.bytesReadPerKSecond != 0, "ShardNotifyMetrics bytesRead", probe::decoration::rare); } @@ -97,8 +97,8 @@ void StorageServerMetrics::notify(KeyRef key, StorageMetrics& metrics) { StorageMetrics notifyMetrics; - if (metrics.bytesPerKSecond) - notifyMetrics.bytesPerKSecond = bandwidthSample.addAndExpire(key, metrics.bytesPerKSecond, expire) * + if (metrics.writeBytesPerKSecond) + notifyMetrics.writeBytesPerKSecond = bytesWriteSample.addAndExpire(key, metrics.writeBytesPerKSecond, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; if (metrics.iosPerKSecond) notifyMetrics.iosPerKSecond = iopsSample.addAndExpire(key, metrics.iosPerKSecond, expire) * @@ 
-177,8 +177,8 @@ void StorageServerMetrics::notifyNotReadable(KeyRangeRef keys) { void StorageServerMetrics::poll() { { StorageMetrics m; - m.bytesPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - bandwidthSample.poll(waitMetricsMap, m); + m.writeBytesPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; + bytesWriteSample.poll(waitMetricsMap, m); } { StorageMetrics m; @@ -250,7 +250,7 @@ void StorageServerMetrics::splitMetrics(SplitMetricsRequest req) const { if (remaining.bytes < 2 * minSplitBytes) break; KeyRef key = req.keys.end; - bool hasUsed = used.bytes != 0 || used.bytesPerKSecond != 0 || used.iosPerKSecond != 0; + bool hasUsed = used.bytes != 0 || used.writeBytesPerKSecond != 0 || used.iosPerKSecond != 0; key = getSplitKey(remaining.bytes, estimated.bytes, req.limits.bytes, @@ -276,13 +276,13 @@ void StorageServerMetrics::splitMetrics(SplitMetricsRequest req) const { lastKey, key, hasUsed); - key = getSplitKey(remaining.bytesPerKSecond, - estimated.bytesPerKSecond, - req.limits.bytesPerKSecond, - used.bytesPerKSecond, + key = getSplitKey(remaining.writeBytesPerKSecond, + estimated.writeBytesPerKSecond, + req.limits.writeBytesPerKSecond, + used.writeBytesPerKSecond, req.limits.infinity, req.isLastShard, - bandwidthSample, + bytesWriteSample, SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, lastKey, key, @@ -328,12 +328,12 @@ void StorageServerMetrics::getStorageMetrics(GetStorageMetricsRequest req, rep.available.bytes = sb.available; rep.available.iosPerKSecond = 10e6; - rep.available.bytesPerKSecond = 100e9; + rep.available.writeBytesPerKSecond = 100e9; rep.available.bytesReadPerKSecond = 100e9; rep.capacity.bytes = sb.total; rep.capacity.iosPerKSecond = 10e6; - rep.capacity.bytesPerKSecond = 100e9; + rep.capacity.writeBytesPerKSecond = 100e9; rep.capacity.bytesReadPerKSecond = 100e9; rep.bytesInputRate = bytesInputRate; diff --git a/fdbserver/include/fdbserver/MockGlobalState.h 
b/fdbserver/include/fdbserver/MockGlobalState.h index a86d8aeb44..cf831dccb2 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -180,8 +180,8 @@ protected: // Decrease the intersecting shard bytes as if delete the data void clearRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); - // Update the storage metrics as if we write the MVCC storage with a mutation of `size` bytes. - void notifyMvccStorageCost(KeyRef key, int64_t size); + // Update the storage metrics as if we write a k-v pair of `size` bytes. + void notifyWriteMetrics(KeyRef key, int64_t size); // Randomly generate keys and kv size between the fetch range, updating the byte sample. // Once the fetchKeys return, the shard status will become FETCHED. diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index 553dcaa4b9..b0985ec52a 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -79,12 +79,12 @@ struct StorageServerMetrics { StorageMetricSample byteSample; // FIXME: iops is not effectively tested, and is not used by data distribution - TransientStorageMetricSample iopsSample, bandwidthSample; + TransientStorageMetricSample iopsSample, bytesWriteSample; TransientStorageMetricSample bytesReadSample; StorageServerMetrics() : byteSample(0), iopsSample(SERVER_KNOBS->IOPS_UNITS_PER_SAMPLE), - bandwidthSample(SERVER_KNOBS->BANDWIDTH_UNITS_PER_SAMPLE), + bytesWriteSample(SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE), bytesReadSample(SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE) {} StorageMetrics getMetrics(KeyRangeRef const& keys) const; @@ -230,10 +230,5 @@ Future serveStorageMetricsRequests(ServiceType* self, StorageServerInterfa } } } - -// For both the mutation log and the versioned map. 
-inline int mvccStorageBytes(int64_t size) { - return VersionedMap::overheadPerItem * 2 + (MutationRef::OVERHEAD_BYTES + size) * 2; -} #include "flow/unactorcompiler.h" #endif // FDBSERVER_STORAGEMETRICS_H \ No newline at end of file diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 35101586f7..af7ddeba69 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -533,9 +533,13 @@ const int VERSION_OVERHEAD = sizeof(Reference::PTreeT>)); // versioned map [ x2 for // createNewVersion(version+1) ], 64b // overhead for map -// For both the mutation log and the versioned map. + +// Memory size for storing mutation in the mutation log and the versioned map. static int mvccStorageBytes(MutationRef const& m) { - return mvccStorageBytes(m.param1.size() + m.param2.size()); + // Why * 2: + // - 1 insertion into version map costs 2 nodes in avg; + // - The mutation will be stored in both mutation log and versioned map; + return VersionedMap::overheadPerItem * 2 + m.totalSize() * 2; } struct FetchInjectionInfo { @@ -1960,7 +1964,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { /* StorageMetrics m; - m.bytesPerKSecond = req.key.size() + (v.present() ? v.get().size() : 0); + m.writeBytesPerKSecond = req.key.size() + (v.present() ? v.get().size() : 0); m.iosPerKSecond = 1; data->metrics.notify(req.key, m); */ @@ -5610,7 +5614,7 @@ void applyMutation(StorageServer* self, // m is expected to be in arena already // Clear split keys are added to arena StorageMetrics metrics; - metrics.bytesPerKSecond = mvccStorageBytes(m) / 2; + metrics.writeBytesPerKSecond = m.totalSize(); // comparable to counter.mutationBytes metrics.iosPerKSecond = 1; self->metrics.notify(m.param1, metrics); @@ -10070,12 +10074,12 @@ ACTOR Future waitMetrics(StorageServerMetrics* self, WaitMetricsRequest re // all the messages for one clear or set have been dispatched. 
/*StorageMetrics m = getMetrics( data, req.keys ); - bool b = ( m.bytes != metrics.bytes || m.bytesPerKSecond != metrics.bytesPerKSecond || + bool b = ( m.bytes != metrics.bytes || m.writeBytesPerKSecond != metrics.writeBytesPerKSecond || m.iosPerKSecond != metrics.iosPerKSecond ); if (b) { printf("keys: '%s' - '%s' @%p\n", printable(req.keys.begin).c_str(), printable(req.keys.end).c_str(), this); printf("waitMetrics: desync %d (%lld %lld %lld) != (%lld %lld %lld); +(%lld %lld %lld)\n", - b, m.bytes, m.bytesPerKSecond, m.iosPerKSecond, metrics.bytes, metrics.bytesPerKSecond, - metrics.iosPerKSecond, c.bytes, c.bytesPerKSecond, c.iosPerKSecond); + b, m.bytes, m.writeBytesPerKSecond, m.iosPerKSecond, metrics.bytes, metrics.writeBytesPerKSecond, + metrics.iosPerKSecond, c.bytes, c.writeBytesPerKSecond, c.iosPerKSecond); }*/ } From 004a0f8915af106a73b9ab633bacbc7860a7aac1 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 28 Oct 2022 15:51:08 -0700 Subject: [PATCH 10/57] update data ops definition and comments; add a unit test --- fdbserver/MockGlobalState.actor.cpp | 159 ++++++++++++++++-- fdbserver/ShardsAffectedByTeamFailure.cpp | 10 ++ fdbserver/include/fdbserver/MockGlobalState.h | 33 +++- .../fdbserver/ShardsAffectedByTeamFailure.h | 2 + 4 files changed, 183 insertions(+), 21 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 24d32dfb34..ba855affcf 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -141,7 +141,7 @@ public: int maxSize = std::min(remainBytes, 130000) + 1; int randomSize = deterministicRandom()->randomInt(lastKey.size(), maxSize); - self->availableDiskSpace -= randomSize; + self->usedDiskSpace += randomSize; self->byteSampleApplySet(lastKey, randomSize); remainBytes -= randomSize; lastKey = randomKeyBetween(KeyRangeRef(lastKey, params.keys.end)); @@ -267,7 +267,7 @@ void MockStorageServer::removeShard(KeyRangeRef range) { auto ranges = 
serverKeys.containedRanges(range); ASSERT(ranges.begin().range() == range); auto rangeSize = sumRangeSize(range); - availableDiskSpace += rangeSize; + usedDiskSpace -= rangeSize; serverKeys.rawErase(range); byteSampleApplyClear(range); metrics.notifyNotReadable(range); @@ -310,8 +310,8 @@ Future MockStorageServer::run() { void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { notifyWriteMetrics(key, bytes); byteSampleApplySet(key, bytes); - auto delta = oldBytes - bytes; - availableDiskSpace += delta; + auto delta = bytes - oldBytes; + usedDiskSpace += delta; serverKeys[key].shardSize += delta; } @@ -319,16 +319,17 @@ void MockStorageServer::clear(KeyRef key, int64_t bytes) { notifyWriteMetrics(key, bytes); KeyRange sr = singleKeyRange(key); byteSampleApplyClear(sr); - availableDiskSpace += bytes; + usedDiskSpace -= bytes; serverKeys[key].shardSize -= bytes; } -void MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { +int64_t MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { notifyWriteMetrics(range.begin, range.begin.size() + range.end.size()); byteSampleApplyClear(range); auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); - availableDiskSpace += totalByteSize; + usedDiskSpace -= totalByteSize; clearRangeTotalBytes(range, beginShardBytes, endShardBytes); + return totalByteSize; } void MockStorageServer::get(KeyRef key, int64_t bytes) { @@ -337,8 +338,8 @@ void MockStorageServer::get(KeyRef key, int64_t bytes) { metrics.notifyBytesReadPerKSecond(key, bytesReadPerKSecond); } -void MockStorageServer::getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { - auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); +int64_t MockStorageServer::getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { + int64_t totalByteSize = 
estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); // For performance concerns, the cost of a range read is billed to the start key and end key of the // range. if (totalByteSize > 0) { @@ -346,6 +347,7 @@ void MockStorageServer::getRange(KeyRangeRef range, int64_t beginShardBytes, int metrics.notifyBytesReadPerKSecond(range.begin, bytesReadPerKSecond); metrics.notifyBytesReadPerKSecond(range.end, bytesReadPerKSecond); } + return totalByteSize; } int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { @@ -609,6 +611,107 @@ Future>> MockGlobalState::splitStorageMetrics(const return MockGlobalStateImpl::splitStorageMetrics(this, keys, limit, estimated, minSplitBytes); } +std::vector> MockGlobalState::runAllMockServers() { + std::vector> futures; + futures.reserve(allServers.size()); + for (auto& [id, _] : allServers) { + futures.emplace_back(runMockServer(id)); + } + return futures; +} +Future MockGlobalState::runMockServer(const UID& id) { + auto& server = allServers.at(id); + IFailureMonitor::failureMonitor().setStatus(server.ssi.address(), FailureStatus(false)); + return server.run(); +} + +int64_t MockGlobalState::get(KeyRef key) { + auto ids = shardMapping->getSourceServerIdsFor(key); + int64_t randomBytes = 0; + if (deterministicRandom()->random01() > emptyProb) { + randomBytes = deterministicRandom()->randomInt64(minByteSize, maxByteSize); + } + // randomly choose 1 server + auto id = deterministicRandom()->randomChoice(ids); + allServers.at(id).get(key, randomBytes); + return randomBytes; +} + +int64_t MockGlobalState::getRange(KeyRangeRef range) { + auto ranges = shardMapping->intersectingRanges(range); + int64_t totalSize = 0; + KeyRef begin, end; + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + auto ids = shardMapping->getSourceServerIdsFor(it->begin()); + if (range.begin > it->begin()) { + begin = range.begin; + } + if (range.end < it->end()) { + end = range.end; + 
} + + // randomly choose 1 server + auto id = deterministicRandom()->randomChoice(ids); + int64_t beginSize = deterministicRandom()->randomInt64(0, SERVER_KNOBS->MIN_SHARD_BYTES), + endSize = deterministicRandom()->randomInt64(0, SERVER_KNOBS->MIN_SHARD_BYTES); + totalSize += allServers.at(id).getRange(KeyRangeRef(begin, end), beginSize, endSize); + } + return totalSize; +} + +int64_t MockGlobalState::set(KeyRef key, int valueSize, bool insert) { + auto ids = shardMapping->getSourceServerIdsFor(key); + int64_t oldKvBytes = 0; + insert |= (deterministicRandom()->random01() < emptyProb); + + if (!insert) { + oldKvBytes = key.size() + deterministicRandom()->randomInt64(minByteSize, maxByteSize); + } + + for (auto& id : ids) { + allServers.at(id).set(key, valueSize + key.size(), oldKvBytes); + } + return oldKvBytes; +} + +int64_t MockGlobalState::clear(KeyRef key) { + auto ids = shardMapping->getSourceServerIdsFor(key); + int64_t randomBytes = 0; + if (deterministicRandom()->random01() > emptyProb) { + randomBytes = deterministicRandom()->randomInt64(minByteSize, maxByteSize) + key.size(); + } + + for (auto& id : ids) { + allServers.at(id).clear(key, randomBytes); + } + return randomBytes; +} + +int64_t MockGlobalState::clearRange(KeyRangeRef range) { + auto ranges = shardMapping->intersectingRanges(range); + int64_t totalSize = 0; + KeyRef begin, end; + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + auto ids = shardMapping->getSourceServerIdsFor(it->begin()); + if (range.begin > it->begin()) { + begin = range.begin; + } + if (range.end < it->end()) { + end = range.end; + } + + int64_t beginSize = deterministicRandom()->randomInt64(0, SERVER_KNOBS->MIN_SHARD_BYTES), + endSize = deterministicRandom()->randomInt64(0, SERVER_KNOBS->MIN_SHARD_BYTES); + int64_t lastSize = -1; + for (auto& id : ids) { + int64_t size = allServers.at(id).clearRange(KeyRangeRef(begin, end), beginSize, endSize); + ASSERT(lastSize == size || lastSize == -1); // every server should 
return the same result + } + totalSize += lastSize; + } + return totalSize; +} + TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") { BasicTestConfig testConfig; testConfig.simpleConfig = true; @@ -803,15 +906,12 @@ TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") { state std::shared_ptr mgs = std::make_shared(); mgs->initializeAsEmptyDatabaseMGS(dbConfig); - state ActorCollection actors; - - ActorCollection* ptr = &actors; // get around ACTOR syntax restriction - std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) { - ptr->add(server.second.run()); - IFailureMonitor::failureMonitor().setStatus(server.second.ssi.address(), FailureStatus(false)); + std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [](auto& server) { server.second.metrics.byteSample.sample.insert("something"_sr, 500000); }); + state Future allServerFutures = waitForAll(mgs->runAllMockServers()); + KeyRange testRange = allKeys; ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); std::pair, int> res = @@ -822,3 +922,32 @@ TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") { ASSERT_EQ(res.first.get().bytes, 500000); return Void(); } + +TEST_CASE("/MockGlobalState/MockStorageServer/DataOpsSet") { + BasicTestConfig testConfig; + testConfig.simpleConfig = true; + testConfig.minimumReplication = 1; + testConfig.logAntiQuorum = 0; + DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); + TraceEvent("DataOpsUnitTestConfig").detail("Config", dbConfig.toString()); + state std::shared_ptr mgs = std::make_shared(); + mgs->initializeAsEmptyDatabaseMGS(dbConfig); + state Future allServerFutures = waitForAll(mgs->runAllMockServers()); + + // use data ops + state int64_t setBytes = 0; + setBytes += mgs->set("a"_sr, 1 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + setBytes += mgs->set("b"_sr, 2 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + 
setBytes += mgs->set("c"_sr, 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + for (auto& server : mgs->allServers) { + ASSERT_EQ(server.second.sumRangeSize(KeyRangeRef("a"_sr, "c"_sr)), + 2 + 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE); + ASSERT_EQ(server.second.usedDiskSpace, 3 + 6 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE); + } + ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); + std::pair, int> res = + wait(mgs->waitStorageMetrics(allKeys, bounds.min, bounds.max, bounds.permittedError, 1, 1)); + std::cout << "get result " << res.second << "\n"; + std::cout << "get byte " << res.first.get().bytes << " " << setBytes << "\n"; + return Void(); +} diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp index cc634689ec..bc1b150656 100644 --- a/fdbserver/ShardsAffectedByTeamFailure.cpp +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -246,3 +246,13 @@ void ShardsAffectedByTeamFailure::removeFailedServerForRange(KeyRangeRef keys, c auto ShardsAffectedByTeamFailure::intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges { return shard_teams.intersectingRanges(keyRange); } + +std::vector ShardsAffectedByTeamFailure::getSourceServerIdsFor(KeyRef key) { + auto teamPair = getTeamsFor(key); + std::set res; + auto& srcTeams = teamPair.second.empty() ? 
teamPair.first : teamPair.second; + for (auto& team : srcTeams) { + res.insert(team.servers.begin(), team.servers.end()); + } + return std::vector(res.begin(), res.end()); +} diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index cf831dccb2..071fcb2609 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -80,7 +80,7 @@ public: static constexpr uint64_t DEFAULT_DISK_SPACE = 1000LL * 1024 * 1024 * 1024; // control plane statistics associated with a real storage server - uint64_t totalDiskSpace = DEFAULT_DISK_SPACE, availableDiskSpace = DEFAULT_DISK_SPACE; + uint64_t totalDiskSpace = DEFAULT_DISK_SPACE, usedDiskSpace = DEFAULT_DISK_SPACE; // In-memory counterpart of the `serverKeys` in system keyspace // the value ShardStatus is [InFlight, Completed, Empty] and metrics uint64_t is the shard size, the caveat is the @@ -96,8 +96,7 @@ public: MockStorageServer() = default; MockStorageServer(StorageServerInterface ssi, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) - : totalDiskSpace(usedDiskSpace + availableDiskSpace), availableDiskSpace(availableDiskSpace), ssi(ssi), - id(ssi.id()) {} + : totalDiskSpace(usedDiskSpace + availableDiskSpace), usedDiskSpace(usedDiskSpace), ssi(ssi), id(ssi.id()) {} MockStorageServer(const UID& id, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) : MockStorageServer(StorageServerInterface(id), availableDiskSpace, usedDiskSpace) {} @@ -154,13 +153,15 @@ public: // Clear key and its value of which the size is bytes void clear(KeyRef key, int64_t bytes); // Clear range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` - void clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + // return the total range size + int64_t clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); // modify the metrics as like doing 
an n-bytes read op // Read key and cause bytes read overhead void get(KeyRef key, int64_t bytes); - // Read range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` - void getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + // Read range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes`, + // return the total range size; + int64_t getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); // trigger the asynchronous fetch keys operation void signalFetchKeys(KeyRangeRef range, int64_t rangeTotalBytes); @@ -280,6 +281,26 @@ public: Optional debugID, UseProvisionalProxies useProvisionalProxies, Version version) override; + + // data ops + // MGS finds the shard X contains this key, randomly generates a N-bytes read operation on that shard, which may + // change the read sampling stats of shard X. return the random size of value + int64_t get(KeyRef key); + // For the edge shards contains the range boundaries, randomly do N1 byte and N2 byte read operations. For other + // shards fully within the range, mock a full shard read op. + int64_t getRange(KeyRangeRef range); + // MGS finds the shard X contains this key, mock an N-bytes write to shard X, where N = valueSize + key.size(). + // Return a random number representing the old kv size + int64_t set(KeyRef key, int valueSize, bool insert); + // MGS finds the shard X contains this key, randomly generate an N-byte clear operation. + // Return a random number representing the old kv size + int64_t clear(KeyRef key); + // Similar as getRange, but need to change shardTotalBytes because this is a clear operation. 
+ int64_t clearRange(KeyRangeRef range); + + // convenient shortcuts for test + std::vector> runAllMockServers(); + Future runMockServer(const UID& id); }; #endif // FOUNDATIONDB_MOCKGLOBALSTATE_H diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 7b674510d4..0bb9d00d7b 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -86,6 +86,8 @@ public: std::pair, std::vector> getTeamsFor(KeyRef key); + std::vector getSourceServerIdsFor(KeyRef key); + // Shard boundaries are modified in defineShard and the content of what servers correspond to each shard is a copy // or union of the shards already there void defineShard(KeyRangeRef keys); From 802dce47b6cc5e773c9eb876499c572634e176b5 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 28 Oct 2022 17:12:46 -0700 Subject: [PATCH 11/57] unit test clean; fix some bugs --- fdbserver/MockGlobalState.actor.cpp | 37 ++++++++++--------- fdbserver/include/fdbserver/MockGlobalState.h | 3 +- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index ba855affcf..a1570bea98 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -251,6 +251,9 @@ void MockStorageServer::twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize) { + if (splitPoint == range.begin || !range.contains(splitPoint)) { + return; + } Key left = range.begin; // random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid. 
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, @@ -301,6 +304,8 @@ Future MockStorageServer::run() { Optional>()); ssi.initEndpoints(); ssi.startAcceptingRequests(); + IFailureMonitor::failureMonitor().setStatus(ssi.address(), FailureStatus(false)); + TraceEvent("MockStorageServerStart").detail("Address", ssi.address()); addActor(serveStorageMetricsRequests(this, ssi)); addActor(MockStorageServerImpl::serveMockStorageServer(this)); @@ -620,9 +625,7 @@ std::vector> MockGlobalState::runAllMockServers() { return futures; } Future MockGlobalState::runMockServer(const UID& id) { - auto& server = allServers.at(id); - IFailureMonitor::failureMonitor().setStatus(server.ssi.address(), FailureStatus(false)); - return server.run(); + return allServers.at(id).run(); } int64_t MockGlobalState::get(KeyRef key) { @@ -934,20 +937,20 @@ TEST_CASE("/MockGlobalState/MockStorageServer/DataOpsSet") { mgs->initializeAsEmptyDatabaseMGS(dbConfig); state Future allServerFutures = waitForAll(mgs->runAllMockServers()); - // use data ops - state int64_t setBytes = 0; - setBytes += mgs->set("a"_sr, 1 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); - setBytes += mgs->set("b"_sr, 2 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); - setBytes += mgs->set("c"_sr, 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); - for (auto& server : mgs->allServers) { - ASSERT_EQ(server.second.sumRangeSize(KeyRangeRef("a"_sr, "c"_sr)), - 2 + 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE); - ASSERT_EQ(server.second.usedDiskSpace, 3 + 6 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE); + // insert + { + mgs->set("a"_sr, 1 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + mgs->set("b"_sr, 2 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + mgs->set("c"_sr, 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + for (auto& server : mgs->allServers) { + ASSERT_EQ(server.second.usedDiskSpace, 3 + 6 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE); + } + ShardSizeBounds 
bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); + std::pair, int> res = wait( + mgs->waitStorageMetrics(KeyRangeRef("a"_sr, "c"_sr), bounds.min, bounds.max, bounds.permittedError, 1, 1)); + + int64_t testSize = 2 + 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE; + ASSERT_EQ(res.first.get().bytes, testSize); } - ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); - std::pair, int> res = - wait(mgs->waitStorageMetrics(allKeys, bounds.min, bounds.max, bounds.permittedError, 1, 1)); - std::cout << "get result " << res.second << "\n"; - std::cout << "get byte " << res.first.get().bytes << " " << setBytes << "\n"; return Void(); } diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 071fcb2609..3aa245c19c 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -282,7 +282,8 @@ public: UseProvisionalProxies useProvisionalProxies, Version version) override; - // data ops + // data ops - the key is not accurate, only the shard the key locate in matters. + // MGS finds the shard X contains this key, randomly generates a N-bytes read operation on that shard, which may // change the read sampling stats of shard X. 
return the random size of value int64_t get(KeyRef key); From d2ecc3cb48c78becb5a947e3e143f2a65f6866e4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 31 Oct 2022 09:19:33 -0700 Subject: [PATCH 12/57] handling no-sampled scenario in unit test --- fdbserver/MockGlobalState.actor.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index a1570bea98..28d1d9c7c7 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -947,10 +947,16 @@ TEST_CASE("/MockGlobalState/MockStorageServer/DataOpsSet") { } ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); std::pair, int> res = wait( - mgs->waitStorageMetrics(KeyRangeRef("a"_sr, "c"_sr), bounds.min, bounds.max, bounds.permittedError, 1, 1)); + mgs->waitStorageMetrics(KeyRangeRef("a"_sr, "bc"_sr), bounds.min, bounds.max, bounds.permittedError, 1, 1)); int64_t testSize = 2 + 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE; - ASSERT_EQ(res.first.get().bytes, testSize); + // SOMEDAY: how to integrate with isKeyValueInSample() better? 
+ if (res.first.get().bytes > 0) { + // If sampled + ASSERT_EQ(res.first.get().bytes, testSize); + ASSERT_LT(res.first.get().writeBytesPerKSecond, 0); + ASSERT_LT(res.first.get().iosPerKSecond, 0); + } } return Void(); } From 7442cfa2cb73673235964ec3849dd560e24a47cb Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 31 Oct 2022 09:45:56 -0700 Subject: [PATCH 13/57] format code --- fdbclient/FDBTypes.cpp | 4 ++-- fdbclient/include/fdbclient/StorageServerInterface.h | 8 ++++---- fdbserver/BlobManager.actor.cpp | 3 ++- fdbserver/StorageMetrics.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 9 +++++---- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fdbclient/FDBTypes.cpp b/fdbclient/FDBTypes.cpp index c104aea5e8..e83630596b 100644 --- a/fdbclient/FDBTypes.cpp +++ b/fdbclient/FDBTypes.cpp @@ -100,7 +100,7 @@ TEST_CASE("/KeyRangeUtil/randomKeyBetween") { Key begin = "qwert"_sr; Key end = "qwertyu"_sr; Key res; - for(int i = 0; i < 10; ++ i) { + for (int i = 0; i < 10; ++i) { res = randomKeyBetween(KeyRangeRef(begin, end)); ASSERT(res > begin); ASSERT(res < end); @@ -113,7 +113,7 @@ TEST_CASE("/KeyRangeUtil/randomKeyBetween") { begin = "aaaaaaa"_sr; end = "b"_sr; - for(int i = 0; i < 10; ++ i) { + for (int i = 0; i < 10; ++i) { res = randomKeyBetween(KeyRangeRef(begin, end)); ASSERT(res > begin); ASSERT(res < end); diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index 2a2442c94b..b8ad4523c9 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -643,8 +643,8 @@ struct StorageMetrics { static const int64_t infinity = 1LL << 60; bool allLessOrEqual(const StorageMetrics& rhs) const { - return bytes <= rhs.bytes && writeBytesPerKSecond <= rhs.writeBytesPerKSecond && iosPerKSecond <= rhs.iosPerKSecond && - bytesReadPerKSecond <= rhs.bytesReadPerKSecond; + return bytes <= rhs.bytes && writeBytesPerKSecond <= 
rhs.writeBytesPerKSecond && + iosPerKSecond <= rhs.iosPerKSecond && bytesReadPerKSecond <= rhs.bytesReadPerKSecond; } void operator+=(const StorageMetrics& rhs) { bytes += rhs.bytes; @@ -697,8 +697,8 @@ struct StorageMetrics { } bool operator==(StorageMetrics const& rhs) const { - return bytes == rhs.bytes && writeBytesPerKSecond == rhs.writeBytesPerKSecond && iosPerKSecond == rhs.iosPerKSecond && - bytesReadPerKSecond == rhs.bytesReadPerKSecond; + return bytes == rhs.bytes && writeBytesPerKSecond == rhs.writeBytesPerKSecond && + iosPerKSecond == rhs.iosPerKSecond && bytesReadPerKSecond == rhs.bytesReadPerKSecond; } std::string toString() const { diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 50c68f328e..7e8455f79b 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -638,7 +638,8 @@ ACTOR Future splitRange(Reference bmDat splitMetrics.bytes = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES; splitMetrics.writeBytesPerKSecond = SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; if (writeHot) { - splitMetrics.writeBytesPerKSecond = std::min(splitMetrics.writeBytesPerKSecond, estimated.writeBytesPerKSecond / 2); + splitMetrics.writeBytesPerKSecond = + std::min(splitMetrics.writeBytesPerKSecond, estimated.writeBytesPerKSecond / 2); splitMetrics.writeBytesPerKSecond = std::max(splitMetrics.writeBytesPerKSecond, SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC); } diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp index 89305a968f..ea314eff77 100644 --- a/fdbserver/StorageMetrics.actor.cpp +++ b/fdbserver/StorageMetrics.actor.cpp @@ -99,7 +99,7 @@ void StorageServerMetrics::notify(KeyRef key, StorageMetrics& metrics) { if (metrics.writeBytesPerKSecond) notifyMetrics.writeBytesPerKSecond = bytesWriteSample.addAndExpire(key, metrics.writeBytesPerKSecond, expire) * - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; if 
(metrics.iosPerKSecond) notifyMetrics.iosPerKSecond = iopsSample.addAndExpire(key, metrics.iosPerKSecond, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index af7ddeba69..5c355eadbf 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -10074,12 +10074,13 @@ ACTOR Future waitMetrics(StorageServerMetrics* self, WaitMetricsRequest re // all the messages for one clear or set have been dispatched. /*StorageMetrics m = getMetrics( data, req.keys ); - bool b = ( m.bytes != metrics.bytes || m.writeBytesPerKSecond != metrics.writeBytesPerKSecond || - m.iosPerKSecond != metrics.iosPerKSecond ); if (b) { printf("keys: '%s' - '%s' @%p\n", + bool b = ( m.bytes != metrics.bytes || m.writeBytesPerKSecond != metrics.writeBytesPerKSecond + || m.iosPerKSecond != metrics.iosPerKSecond ); if (b) { printf("keys: '%s' - '%s' @%p\n", printable(req.keys.begin).c_str(), printable(req.keys.end).c_str(), this); printf("waitMetrics: desync %d (%lld %lld %lld) != (%lld %lld %lld); +(%lld %lld %lld)\n", - b, m.bytes, m.writeBytesPerKSecond, m.iosPerKSecond, metrics.bytes, metrics.writeBytesPerKSecond, - metrics.iosPerKSecond, c.bytes, c.writeBytesPerKSecond, c.iosPerKSecond); + b, m.bytes, m.writeBytesPerKSecond, m.iosPerKSecond, metrics.bytes, + metrics.writeBytesPerKSecond, metrics.iosPerKSecond, c.bytes, c.writeBytesPerKSecond, + c.iosPerKSecond); }*/ } From 7ed5a99213181bbd42e650b4951503c4dde64243 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 31 Oct 2022 17:18:43 -0700 Subject: [PATCH 14/57] add setShardStatus unit test and change signatures of methods to const& --- fdbserver/MockGlobalState.actor.cpp | 77 ++++++++++++------- fdbserver/include/fdbserver/MockGlobalState.h | 49 ++++++------ 2 files changed, 74 insertions(+), 52 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 
28d1d9c7c7..a54a0fe494 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -153,7 +153,7 @@ public: } }; -bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) { +bool MockStorageServer::allShardStatusEqual(const KeyRangeRef& range, MockShardStatus status) { auto ranges = serverKeys.intersectingRanges(range); ASSERT(!ranges.empty()); // at least the range is allKeys @@ -164,7 +164,7 @@ bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus s return true; } -bool MockStorageServer::allShardStatusIn(KeyRangeRef range, const std::set& status) { +bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::set& status) { auto ranges = serverKeys.intersectingRanges(range); ASSERT(!ranges.empty()); // at least the range is allKeys @@ -175,7 +175,7 @@ bool MockStorageServer::allShardStatusIn(KeyRangeRef range, const std::set MockStorageServer::run() { return actors.getResult(); } -void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { +void MockStorageServer::set(KeyRef const& key, int64_t bytes, int64_t oldBytes) { notifyWriteMetrics(key, bytes); byteSampleApplySet(key, bytes); auto delta = bytes - oldBytes; @@ -320,7 +320,7 @@ void MockStorageServer::set(KeyRef key, int64_t bytes, int64_t oldBytes) { serverKeys[key].shardSize += delta; } -void MockStorageServer::clear(KeyRef key, int64_t bytes) { +void MockStorageServer::clear(KeyRef const& key, int64_t bytes) { notifyWriteMetrics(key, bytes); KeyRange sr = singleKeyRange(key); byteSampleApplyClear(sr); @@ -328,7 +328,7 @@ void MockStorageServer::clear(KeyRef key, int64_t bytes) { serverKeys[key].shardSize -= bytes; } -int64_t MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { +int64_t MockStorageServer::clearRange(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes) { notifyWriteMetrics(range.begin, range.begin.size() 
+ range.end.size()); byteSampleApplyClear(range); auto totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); @@ -337,13 +337,13 @@ int64_t MockStorageServer::clearRange(KeyRangeRef range, int64_t beginShardBytes return totalByteSize; } -void MockStorageServer::get(KeyRef key, int64_t bytes) { +void MockStorageServer::get(KeyRef const& key, int64_t bytes) { // If the read yields no value, randomly sample the empty read. int64_t bytesReadPerKSecond = std::max(bytes, SERVER_KNOBS->EMPTY_READ_PENALTY); metrics.notifyBytesReadPerKSecond(key, bytesReadPerKSecond); } -int64_t MockStorageServer::getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { +int64_t MockStorageServer::getRange(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes) { int64_t totalByteSize = estimateRangeTotalBytes(range, beginShardBytes, endShardBytes); // For performance concerns, the cost of a range read is billed to the start key and end key of the // range. 
@@ -355,7 +355,7 @@ int64_t MockStorageServer::getRange(KeyRangeRef range, int64_t beginShardBytes, return totalByteSize; } -int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { +int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes) { int64_t totalByteSize = 0; auto ranges = serverKeys.intersectingRanges(range); @@ -374,7 +374,7 @@ int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef range, int64_t be return totalByteSize; } -void MockStorageServer::clearRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes) { +void MockStorageServer::clearRangeTotalBytes(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes) { auto ranges = serverKeys.intersectingRanges(range); // use the beginShardBytes as partial size @@ -393,7 +393,7 @@ void MockStorageServer::clearRangeTotalBytes(KeyRangeRef range, int64_t beginSha } } -void MockStorageServer::notifyWriteMetrics(KeyRef key, int64_t size) { +void MockStorageServer::notifyWriteMetrics(KeyRef const& key, int64_t size) { // update write bandwidth and iops as mock the cost of writing a mutation StorageMetrics s; s.writeBytesPerKSecond = size + MutationRef::OVERHEAD_BYTES; @@ -401,15 +401,17 @@ void MockStorageServer::notifyWriteMetrics(KeyRef key, int64_t size) { metrics.notify(key, s); } -void MockStorageServer::signalFetchKeys(KeyRangeRef range, int64_t rangeTotalBytes) { - fetchKeysRequests.send({ KeyRange(range), rangeTotalBytes }); +void MockStorageServer::signalFetchKeys(const KeyRangeRef& range, int64_t rangeTotalBytes) { + std::cout << "----- signalFetchKeys ---- \n"; + fetchKeysRequests.send({ range, rangeTotalBytes }); + std::cout << "----- signalFetchKeys end ---- \n"; } Future MockStorageServer::fetchKeys(const MockStorageServer::FetchKeysParams& param) { return MockStorageServerImpl::waitFetchKeysFinish(this, param); } -void 
MockStorageServer::byteSampleApplySet(KeyRef key, int64_t kvSize) { +void MockStorageServer::byteSampleApplySet(KeyRef const& key, int64_t kvSize) { // Update byteSample in memory and notify waiting metrics ByteSampleInfo sampleInfo = isKeyValueInSample(key, kvSize); auto& byteSample = metrics.byteSample.sample; @@ -430,7 +432,7 @@ void MockStorageServer::byteSampleApplySet(KeyRef key, int64_t kvSize) { metrics.notifyBytes(key, delta); } -void MockStorageServer::byteSampleApplyClear(KeyRangeRef range) { +void MockStorageServer::byteSampleApplyClear(KeyRangeRef const& range) { // Update byteSample and notify waiting metrics auto& byteSample = metrics.byteSample.sample; @@ -628,7 +630,7 @@ Future MockGlobalState::runMockServer(const UID& id) { return allServers.at(id).run(); } -int64_t MockGlobalState::get(KeyRef key) { +int64_t MockGlobalState::get(KeyRef const& key) { auto ids = shardMapping->getSourceServerIdsFor(key); int64_t randomBytes = 0; if (deterministicRandom()->random01() > emptyProb) { @@ -640,7 +642,7 @@ int64_t MockGlobalState::get(KeyRef key) { return randomBytes; } -int64_t MockGlobalState::getRange(KeyRangeRef range) { +int64_t MockGlobalState::getRange(KeyRangeRef const& range) { auto ranges = shardMapping->intersectingRanges(range); int64_t totalSize = 0; KeyRef begin, end; @@ -662,7 +664,7 @@ int64_t MockGlobalState::getRange(KeyRangeRef range) { return totalSize; } -int64_t MockGlobalState::set(KeyRef key, int valueSize, bool insert) { +int64_t MockGlobalState::set(KeyRef const& key, int valueSize, bool insert) { auto ids = shardMapping->getSourceServerIdsFor(key); int64_t oldKvBytes = 0; insert |= (deterministicRandom()->random01() < emptyProb); @@ -677,7 +679,7 @@ int64_t MockGlobalState::set(KeyRef key, int valueSize, bool insert) { return oldKvBytes; } -int64_t MockGlobalState::clear(KeyRef key) { +int64_t MockGlobalState::clear(KeyRef const& key) { auto ids = shardMapping->getSourceServerIdsFor(key); int64_t randomBytes = 0; if 
(deterministicRandom()->random01() > emptyProb) { @@ -690,7 +692,7 @@ int64_t MockGlobalState::clear(KeyRef key) { return randomBytes; } -int64_t MockGlobalState::clearRange(KeyRangeRef range) { +int64_t MockGlobalState::clearRange(KeyRangeRef const& range) { auto ranges = shardMapping->intersectingRanges(range); int64_t totalSize = 0; KeyRef begin, end; @@ -827,6 +829,25 @@ TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") { return Void(); } +TEST_CASE("/MockGlobalState/MockStorageServer/SetShardStatus") { + BasicTestConfig testConfig; + testConfig.simpleConfig = true; + testConfig.minimumReplication = 1; + testConfig.logAntiQuorum = 0; + DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); + TraceEvent("SetShardStatusUnitTestDbConfig").detail("Config", dbConfig.toString()); + + auto mgs = std::make_shared(); + mgs->initializeAsEmptyDatabaseMGS(dbConfig); + + auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1)); + KeyRange testRange(KeyRangeRef("a"_sr, "b"_sr)); + mss.setShardStatus(testRange, MockShardStatus::INFLIGHT, false); + ASSERT(mss.allShardStatusEqual(testRange, MockShardStatus::INFLIGHT)); + + return Void(); +} + namespace { inline bool locationInfoEqualsToTeam(Reference loc, const std::vector& ids) { return loc->locations()->size() == ids.size() && @@ -954,8 +975,8 @@ TEST_CASE("/MockGlobalState/MockStorageServer/DataOpsSet") { if (res.first.get().bytes > 0) { // If sampled ASSERT_EQ(res.first.get().bytes, testSize); - ASSERT_LT(res.first.get().writeBytesPerKSecond, 0); - ASSERT_LT(res.first.get().iosPerKSecond, 0); + ASSERT_GT(res.first.get().writeBytesPerKSecond, 0); + ASSERT_GT(res.first.get().iosPerKSecond, 0); } } return Void(); diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 3aa245c19c..05a9fdbca3 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -103,19 +103,19 @@ 
public: decltype(serverKeys)::Ranges getAllRanges() { return serverKeys.ranges(); } - bool allShardStatusEqual(KeyRangeRef range, MockShardStatus status); - bool allShardStatusIn(KeyRangeRef range, const std::set& status); + bool allShardStatusEqual(const KeyRangeRef& range, MockShardStatus status); + bool allShardStatusIn(const KeyRangeRef& range, const std::set& status); // change the status of range. This function may result in split to make the shard boundary align with range.begin // and range.end. In this case, if restrictSize==true, the sum of the split shard size is strictly equal to the old // large shard. Otherwise, the size are randomly generated between (min_shard_size, max_shard_size) - void setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize); + void setShardStatus(const KeyRangeRef& range, MockShardStatus status, bool restrictSize); // this function removed an aligned range from server - void removeShard(KeyRangeRef range); + void removeShard(const KeyRangeRef& range); // intersecting range size - uint64_t sumRangeSize(KeyRangeRef range) const; + uint64_t sumRangeSize(const KeyRangeRef& range) const; void addActor(Future future) override; @@ -149,50 +149,51 @@ public: // data operation APIs - change the metrics sample, disk space and shard size // Set key with a new value, the total bytes change from oldBytes to bytes - void set(KeyRef key, int64_t bytes, int64_t oldBytes); + void set(KeyRef const& key, int64_t bytes, int64_t oldBytes); // Clear key and its value of which the size is bytes - void clear(KeyRef key, int64_t bytes); + void clear(KeyRef const& key, int64_t bytes); // Clear range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` // return the total range size - int64_t clearRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + int64_t clearRange(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes); // modify the metrics 
as like doing an n-bytes read op // Read key and cause bytes read overhead - void get(KeyRef key, int64_t bytes); + void get(KeyRef const& key, int64_t bytes); // Read range, assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes`, // return the total range size; - int64_t getRange(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + int64_t getRange(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes); // trigger the asynchronous fetch keys operation - void signalFetchKeys(KeyRangeRef range, int64_t rangeTotalBytes); + void signalFetchKeys(const KeyRangeRef& range, int64_t rangeTotalBytes); protected: PromiseStream fetchKeysRequests; - void threeWayShardSplitting(KeyRangeRef outerRange, - KeyRangeRef innerRange, + void threeWayShardSplitting(const KeyRangeRef& outerRange, + const KeyRangeRef& innerRange, uint64_t outerRangeSize, bool restrictSize); - void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize); + void twoWayShardSplitting(const KeyRangeRef& range, + const KeyRef& splitPoint, uint64_t rangeSize, bool restrictSize); // Assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` - int64_t estimateRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + int64_t estimateRangeTotalBytes(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes); // Decrease the intersecting shard bytes as if delete the data - void clearRangeTotalBytes(KeyRangeRef range, int64_t beginShardBytes, int64_t endShardBytes); + void clearRangeTotalBytes(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes); // Update the storage metrics as if we write a k-v pair of `size` bytes. 
- void notifyWriteMetrics(KeyRef key, int64_t size); + void notifyWriteMetrics(KeyRef const& key, int64_t size); // Randomly generate keys and kv size between the fetch range, updating the byte sample. // Once the fetchKeys return, the shard status will become FETCHED. Future fetchKeys(const FetchKeysParams&); // Update byte sample as if set a key value pair of which the size is kvSize - void byteSampleApplySet(KeyRef key, int64_t kvSize); + void byteSampleApplySet(KeyRef const& key, int64_t kvSize); // Update byte sample as if clear a whole range - void byteSampleApplyClear(KeyRangeRef range); + void byteSampleApplyClear(KeyRangeRef const& range); }; class MockGlobalStateImpl; @@ -286,18 +287,18 @@ public: // MGS finds the shard X contains this key, randomly generates a N-bytes read operation on that shard, which may // change the read sampling stats of shard X. return the random size of value - int64_t get(KeyRef key); + int64_t get(KeyRef const& key); // For the edge shards contains the range boundaries, randomly do N1 byte and N2 byte read operations. For other // shards fully within the range, mock a full shard read op. - int64_t getRange(KeyRangeRef range); + int64_t getRange(KeyRangeRef const& range); // MGS finds the shard X contains this key, mock an N-bytes write to shard X, where N = valueSize + key.size(). // Return a random number representing the old kv size - int64_t set(KeyRef key, int valueSize, bool insert); + int64_t set(KeyRef const& key, int valueSize, bool insert); // MGS finds the shard X contains this key, randomly generate an N-byte clear operation. // Return a random number representing the old kv size - int64_t clear(KeyRef key); + int64_t clear(KeyRef const& key); // Similar as getRange, but need to change shardTotalBytes because this is a clear operation. 
- int64_t clearRange(KeyRangeRef range); + int64_t clearRange(KeyRangeRef const& range); // convenient shortcuts for test std::vector> runAllMockServers(); From 5a4736a574a9e4f815d0c10966619aedf2d05d53 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 31 Oct 2022 23:17:40 -0700 Subject: [PATCH 15/57] fix setShardStatus bug and finish the unit test --- fdbserver/MockGlobalState.actor.cpp | 46 ++++++++++++++----- fdbserver/include/fdbserver/MockGlobalState.h | 12 +++-- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index a54a0fe494..caa9451dc5 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -178,22 +178,25 @@ bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::se void MockStorageServer::setShardStatus(const KeyRangeRef& range, MockShardStatus status, bool restrictSize) { auto ranges = serverKeys.intersectingRanges(range); ASSERT(!ranges.empty()); - if (ranges.begin().range().contains(range)) { + if (ranges.begin().begin() < range.begin && ranges.begin().end() > range.end) { CODE_PROBE(true, "Implicitly split single shard to 3 pieces"); threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize); - } - if (ranges.begin().begin() < range.begin) { - CODE_PROBE(true, "Implicitly split begin range to 2 pieces"); - twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize); - } - if (ranges.end().end() > range.end) { - CODE_PROBE(true, "Implicitly split end range to 2 pieces"); - twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize); + } else { + if (ranges.begin().begin() < range.begin) { + CODE_PROBE(true, "Implicitly split begin range to 2 pieces"); + twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize); + } + if 
(ranges.end().begin() > range.end) { + CODE_PROBE(true, "Implicitly split end range to 2 pieces"); + auto lastRange = ranges.end(); + --lastRange; + twoWayShardSplitting(lastRange.range(), range.end, ranges.end().cvalue().shardSize, restrictSize); + } } ranges = serverKeys.containedRanges(range); // now the boundary must be aligned ASSERT(ranges.begin().begin() == range.begin); - ASSERT(ranges.end().end() == range.end); + ASSERT(ranges.end().begin() == range.end); uint64_t newSize = 0; for (auto it = ranges.begin(); it != ranges.end(); ++it) { newSize += it->cvalue().shardSize; @@ -402,9 +405,7 @@ void MockStorageServer::notifyWriteMetrics(KeyRef const& key, int64_t size) { } void MockStorageServer::signalFetchKeys(const KeyRangeRef& range, int64_t rangeTotalBytes) { - std::cout << "----- signalFetchKeys ---- \n"; fetchKeysRequests.send({ range, rangeTotalBytes }); - std::cout << "----- signalFetchKeys end ---- \n"; } Future MockStorageServer::fetchKeys(const MockStorageServer::FetchKeysParams& param) { @@ -754,11 +755,13 @@ struct MockGlobalStateTester { mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false); auto ranges = mss.serverKeys.containedRanges(outerRange); ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1)); + ASSERT(ranges.begin().cvalue().status == oldStatus); ranges.pop_front(); ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2)); ASSERT(ranges.begin().cvalue().status == oldStatus); ranges.pop_front(); ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end)); + ASSERT(ranges.begin().cvalue().status == oldStatus); ranges.pop_front(); ASSERT(ranges.empty()); } @@ -777,6 +780,7 @@ struct MockGlobalStateTester { mss.twoWayShardSplitting(it->range(), x1, oldSize, false); auto ranges = mss.serverKeys.containedRanges(outerRange); ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1)); + ASSERT(ranges.begin().cvalue().status == oldStatus); ranges.pop_front(); ASSERT(ranges.begin().range() == 
KeyRangeRef(x1, outerRange.end)); ASSERT(ranges.begin().cvalue().status == oldStatus); @@ -841,10 +845,28 @@ TEST_CASE("/MockGlobalState/MockStorageServer/SetShardStatus") { mgs->initializeAsEmptyDatabaseMGS(dbConfig); auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1)); + mss.serverKeys.insert(allKeys, { MockShardStatus::UNSET, 0 }); // manually reset status + + // split to 3 shards [allKeys.begin, a, b, allKeys.end] KeyRange testRange(KeyRangeRef("a"_sr, "b"_sr)); mss.setShardStatus(testRange, MockShardStatus::INFLIGHT, false); ASSERT(mss.allShardStatusEqual(testRange, MockShardStatus::INFLIGHT)); + // [allKeys.begin, a, ac, b, bc, allKeys.end] + testRange = KeyRangeRef("ac"_sr, "bc"_sr); + mss.setShardStatus(testRange, MockShardStatus::INFLIGHT, false); + ASSERT(mss.allShardStatusEqual(testRange, MockShardStatus::INFLIGHT)); + + testRange = KeyRangeRef("b"_sr, "bc"_sr); + mss.setShardStatus(testRange, MockShardStatus::FETCHED, false); + ASSERT(mss.allShardStatusEqual(testRange, MockShardStatus::FETCHED)); + mss.setShardStatus(testRange, MockShardStatus::COMPLETED, false); + ASSERT(mss.allShardStatusEqual(testRange, MockShardStatus::COMPLETED)); + mss.setShardStatus(testRange, MockShardStatus::FETCHED, false); + ASSERT(mss.allShardStatusEqual(testRange, MockShardStatus::COMPLETED)); + + ASSERT(mss.serverKeys.size() == 5); + return Void(); } diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 05a9fdbca3..b9e2125881 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -31,21 +31,25 @@ struct MockGlobalStateTester; +// the status is roughly order by transition order, except for UNSET and EMPTY enum class MockShardStatus { EMPTY = 0, // data loss - COMPLETED, + UNSET, INFLIGHT, FETCHED, // finish fetch but not change the serverKey mapping. Only can be set by MSS itself. 
- UNSET + COMPLETED }; inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) { + if (from == to) + return true; + switch (from) { case MockShardStatus::UNSET: case MockShardStatus::EMPTY: - return to == MockShardStatus::COMPLETED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; + return to >= MockShardStatus::INFLIGHT; case MockShardStatus::INFLIGHT: - return to == MockShardStatus::FETCHED || to == MockShardStatus::INFLIGHT || to == MockShardStatus::EMPTY; + return to == MockShardStatus::FETCHED || to == MockShardStatus::EMPTY; case MockShardStatus::FETCHED: return to == MockShardStatus::COMPLETED; case MockShardStatus::COMPLETED: From a0489330d001b571e18e824e8177e5cf5c50b633 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 1 Nov 2022 14:22:04 -0700 Subject: [PATCH 16/57] fix rawStartMovement defineShard bug --- fdbserver/DDTxnProcessor.actor.cpp | 10 ++- .../fdbserver/ShardsAffectedByTeamFailure.h | 4 +- .../IDDTxnProcessorApiCorrectness.actor.cpp | 78 ++++++++++++------- fdbserver/workloads/MockDDTest.actor.cpp | 30 ++++++- 4 files changed, 88 insertions(+), 34 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 7b124bea06..991f4de95b 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -701,6 +701,10 @@ struct DDMockTxnProcessorImpl { ACTOR static Future moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) { state std::map tssMapping; + // Because SFBTF::Team requires the ID is ordered + std::sort(params.destinationTeam.begin(), params.destinationTeam.end()); + std::sort(params.healthyDestinations.begin(), params.healthyDestinations.end()); + self->rawStartMovement(params, tssMapping); ASSERT(tssMapping.empty()); @@ -892,6 +896,7 @@ void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map destTeams; destTeams.emplace_back(params.destinationTeam, true); + mgs->shardMapping->defineShard(params.keys); 
mgs->shardMapping->moveShard(params.keys, destTeams); auto randomRangeSize = @@ -926,9 +931,12 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::COMPLETED, mgs->restrictSize); } + // remove destination servers from source servers ASSERT_EQ(srcTeams.size(), 0); for (auto& id : srcTeams.front().servers) { - mgs->allServers.at(id).removeShard(params.keys); + if (!std::binary_search(params.destinationTeam.begin(), params.destinationTeam.end(), id)) { + mgs->allServers.at(id).removeShard(params.keys); + } } mgs->shardMapping->finishMove(params.keys); } diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 0bb9d00d7b..326958bbb6 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -36,7 +36,9 @@ public: bool primary; Team() : primary(true) {} - Team(std::vector const& servers, bool primary) : servers(servers), primary(primary) {} + Team(std::vector const& servers, bool primary) : servers(servers), primary(primary) { + ASSERT(std::is_sorted(servers.begin(), servers.end())); + } bool operator<(const Team& r) const { if (servers == r.servers) diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 3b10176103..351c84d97d 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -46,6 +46,10 @@ bool compareShardInfo(const DDShardInfo& a, const DDShardInfo& other) { void verifyInitDataEqual(Reference real, Reference mock) { // Mock DD just care about the team list and server<->key mapping are consistent with the real cluster + if(real->shards.size() != mock->shards.size()) { + std::cout << "real.size: " << real->shards.size() << " mock.size: " << 
mock->shards.size() << "\n"; + ASSERT(false); + } ASSERT(std::equal( real->shards.begin(), real->shards.end(), mock->shards.begin(), mock->shards.end(), compareShardInfo)); std::cout << describe(real->primaryTeams) << " | " << describe(mock->primaryTeams) << "\n"; @@ -189,18 +193,17 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { verifyInitDataEqual(self->realInitDD, mockInitData); - // wait(timeout(reportErrors(self->worker(cx, self), "IDDTxnProcessorApiWorkload"), self->testDuration, - // Void())); + wait(timeout(reportErrors(self->worker(cx, self), "IDDTxnProcessorApiWorkload"), self->testDuration, Void())); // Always set the DD mode back, even if we die with an error TraceEvent("IDDTxnApiTestDoneMoving").log(); - wait(success(setDDMode(cx, 1))); - TraceEvent("IDDTxnApiTestDoneModeSetting").log(); + int oldValue = wait(setDDMode(cx, 1)); + TraceEvent("IDDTxnApiTestDoneModeSetting").detail("OldValue", oldValue); return Void(); } ACTOR static Future testRawMovementApi(IDDTxnProcessorApiWorkload* self) { - state TraceInterval relocateShardInterval("RelocateShard"); + state TraceInterval relocateShardInterval("RelocateShard_TestRawMovementApi"); state FlowLock fl1(1); state FlowLock fl2(1); state std::map emptyTssMapping; @@ -209,32 +212,33 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { params.startMoveKeysParallelismLock = &fl1; params.finishMoveKeysParallelismLock = &fl2; params.relocationIntervalId = relocateShardInterval.pairID; + TraceEvent(SevDebug, relocateShardInterval.begin(), relocateShardInterval.pairID); - // test start - self->mock->testRawStartMovement(params, emptyTssMapping); - wait(self->real->testRawStartMovement(params, emptyTssMapping)); + loop { + params.dataMovementComplete.reset(); + wait(store(params.lock, self->real->takeMoveKeysLock(UID()))); + try { + // test start + self->mock->testRawStartMovement(params, emptyTssMapping); + wait(self->real->testRawStartMovement(params, emptyTssMapping)); - // read initial data again - 
wait(readRealInitialDataDistribution(self)); - mockInitData = self->mock - ->getInitialDataDistribution(self->ddContext.id(), - self->ddContext.lock, - {}, - self->ddContext.ddEnabledState.get(), - SkipDDModeCheck::True) - .get(); + // test finish or started but cancelled movement + if (deterministicRandom()->coinflip()) { + CODE_PROBE(true, "RawMovementApi partial started"); + break; + } - verifyInitDataEqual(self->realInitDD, mockInitData); - - // test finish or started but cancelled movement - if (deterministicRandom()->coinflip()) { - CODE_PROBE(true, "RawMovementApi partial started"); - return Void(); + self->mock->testRawFinishMovement(params, emptyTssMapping); + wait(self->real->testRawFinishMovement(params, emptyTssMapping)); + break; + } catch (Error& e) { + if (e.code() != error_code_movekeys_conflict && e.code() != error_code_operation_failed) + throw; + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); + // Keep trying to get the moveKeysLock + } } - self->mock->testRawFinishMovement(params, emptyTssMapping); - wait(self->real->testRawFinishMovement(params, emptyTssMapping)); - // read initial data again wait(readRealInitialDataDistribution(self)); mockInitData = self->mock @@ -246,6 +250,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { .get(); verifyInitDataEqual(self->realInitDD, mockInitData); + TraceEvent(SevDebug, relocateShardInterval.end(), relocateShardInterval.pairID); return Void(); } @@ -269,7 +274,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { } ACTOR static Future testMoveKeys(IDDTxnProcessorApiWorkload* self) { - state TraceInterval relocateShardInterval("RelocateShard"); + state TraceInterval relocateShardInterval("RelocateShard_TestMoveKeys"); state FlowLock fl1(1); state FlowLock fl2(1); state std::map emptyTssMapping; @@ -278,9 +283,22 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { params.startMoveKeysParallelismLock = &fl1; params.finishMoveKeysParallelismLock = &fl2; params.relocationIntervalId = 
relocateShardInterval.pairID; + TraceEvent(SevDebug, relocateShardInterval.begin(), relocateShardInterval.pairID); - self->mock->moveKeys(params); - wait(self->real->moveKeys(params)); + loop { + params.dataMovementComplete.reset(); + wait(store(params.lock, self->real->takeMoveKeysLock(UID()))); + try { + self->mock->moveKeys(params); + wait(self->real->moveKeys(params)); + break; + } catch (Error& e) { + if (e.code() != error_code_movekeys_conflict && e.code() != error_code_operation_failed) + throw; + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); + // Keep trying to get the moveKeysLock + } + } // read initial data again wait(readRealInitialDataDistribution(self)); @@ -293,7 +311,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { .get(); verifyInitDataEqual(self->realInitDD, mockInitData); - + TraceEvent(SevDebug, relocateShardInterval.end(), relocateShardInterval.pairID); return Void(); } ACTOR Future worker(Database cx, IDDTxnProcessorApiWorkload* self) { diff --git a/fdbserver/workloads/MockDDTest.actor.cpp b/fdbserver/workloads/MockDDTest.actor.cpp index 209df66a5e..c9ca9d1f8e 100644 --- a/fdbserver/workloads/MockDDTest.actor.cpp +++ b/fdbserver/workloads/MockDDTest.actor.cpp @@ -28,13 +28,39 @@ #include "fdbserver/workloads/workloads.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-struct MockDDTestWorkload { +struct MockDDTestWorkload : public TestWorkload { bool enabled; + bool simpleConfig; double testDuration; double meanDelay = 0.05; double maxKeyspace = 0.1; - DDSharedContext ddContext; std::shared_ptr mgs; std::shared_ptr mock; + + MockDDTestWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client + simpleConfig = getOption(options, "simpleConfig"_sr, true); + testDuration = getOption(options, "testDuration"_sr, 10.0); + meanDelay = getOption(options, "meanDelay"_sr, meanDelay); + maxKeyspace = getOption(options, "maxKeyspace"_sr, maxKeyspace); + } + + Future setup(Database const& cx) override { + if (!enabled) + return Void(); + // initialize configuration + BasicTestConfig testConfig; + testConfig.simpleConfig = simpleConfig; + testConfig.minimumReplication = 1; + testConfig.logAntiQuorum = 0; + DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); + + // initialize mgs + mgs = std::make_shared(); + mgs->initializeAsEmptyDatabaseMGS(dbConfig); + mock = std::make_shared(mgs); + + return Void(); + } }; \ No newline at end of file From 13fae7ba8a263351256a0831c78af26727ab043c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 2 Nov 2022 13:45:53 -0700 Subject: [PATCH 17/57] finish add MockDDTrackerShardEvaluatorWorkload --- fdbserver/MockGlobalState.actor.cpp | 10 +- fdbserver/include/fdbserver/MockGlobalState.h | 6 +- .../IDDTxnProcessorApiCorrectness.actor.cpp | 6 +- fdbserver/workloads/MockDDTest.actor.cpp | 147 +++++++++++++++++- 4 files changed, 157 insertions(+), 12 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index caa9451dc5..388b3da93a 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -358,7 +358,9 @@ int64_t MockStorageServer::getRange(KeyRangeRef const& range, int64_t beginShard return totalByteSize; } 
-int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes) { +int64_t MockStorageServer::estimateRangeTotalBytes(KeyRangeRef const& range, + int64_t beginShardBytes, + int64_t endShardBytes) { int64_t totalByteSize = 0; auto ranges = serverKeys.intersectingRanges(range); @@ -635,7 +637,7 @@ int64_t MockGlobalState::get(KeyRef const& key) { auto ids = shardMapping->getSourceServerIdsFor(key); int64_t randomBytes = 0; if (deterministicRandom()->random01() > emptyProb) { - randomBytes = deterministicRandom()->randomInt64(minByteSize, maxByteSize); + randomBytes = deterministicRandom()->randomInt64(minByteSize, maxByteSize + 1); } // randomly choose 1 server auto id = deterministicRandom()->randomChoice(ids); @@ -671,7 +673,7 @@ int64_t MockGlobalState::set(KeyRef const& key, int valueSize, bool insert) { insert |= (deterministicRandom()->random01() < emptyProb); if (!insert) { - oldKvBytes = key.size() + deterministicRandom()->randomInt64(minByteSize, maxByteSize); + oldKvBytes = key.size() + deterministicRandom()->randomInt64(minByteSize, maxByteSize + 1); } for (auto& id : ids) { @@ -684,7 +686,7 @@ int64_t MockGlobalState::clear(KeyRef const& key) { auto ids = shardMapping->getSourceServerIdsFor(key); int64_t randomBytes = 0; if (deterministicRandom()->random01() > emptyProb) { - randomBytes = deterministicRandom()->randomInt64(minByteSize, maxByteSize) + key.size(); + randomBytes = deterministicRandom()->randomInt64(minByteSize, maxByteSize + 1) + key.size(); } for (auto& id : ids) { diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index b9e2125881..4ea121697d 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -179,7 +179,9 @@ protected: bool restrictSize); void twoWayShardSplitting(const KeyRangeRef& range, - const KeyRef& splitPoint, uint64_t rangeSize, bool restrictSize); + 
const KeyRef& splitPoint, + uint64_t rangeSize, + bool restrictSize); // Assuming the first and last shard within the range having size `beginShardBytes` and `endShardBytes` int64_t estimateRangeTotalBytes(KeyRangeRef const& range, int64_t beginShardBytes, int64_t endShardBytes); @@ -218,7 +220,7 @@ public: // user defined parameters for mock workload purpose double emptyProb; // probability of doing an empty read - uint32_t minByteSize, maxByteSize; // the size band of a point data operation + int minByteSize, maxByteSize; // the size band of a point data operation bool restrictSize = true; MockGlobalState() : shardMapping(new ShardsAffectedByTeamFailure) {} diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 351c84d97d..26cd3cf91c 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -46,7 +46,7 @@ bool compareShardInfo(const DDShardInfo& a, const DDShardInfo& other) { void verifyInitDataEqual(Reference real, Reference mock) { // Mock DD just care about the team list and server<->key mapping are consistent with the real cluster - if(real->shards.size() != mock->shards.size()) { + if (real->shards.size() != mock->shards.size()) { std::cout << "real.size: " << real->shards.size() << " mock.size: " << mock->shards.size() << "\n"; ASSERT(false); } @@ -223,7 +223,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { wait(self->real->testRawStartMovement(params, emptyTssMapping)); // test finish or started but cancelled movement - if (deterministicRandom()->coinflip()) { + if (true || deterministicRandom()->coinflip()) { CODE_PROBE(true, "RawMovementApi partial started"); break; } @@ -318,7 +318,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { state double lastTime = now(); state int choice = 0; loop { - choice = deterministicRandom()->randomInt(0, 2); + choice = 
deterministicRandom()->randomInt(0, 1); if (choice == 0) { // test rawStartMovement and rawFinishMovement separately wait(testRawMovementApi(self)); } else if (choice == 1) { // test moveKeys diff --git a/fdbserver/workloads/MockDDTest.actor.cpp b/fdbserver/workloads/MockDDTest.actor.cpp index c9ca9d1f8e..c46643566e 100644 --- a/fdbserver/workloads/MockDDTest.actor.cpp +++ b/fdbserver/workloads/MockDDTest.actor.cpp @@ -33,10 +33,17 @@ struct MockDDTestWorkload : public TestWorkload { bool simpleConfig; double testDuration; double meanDelay = 0.05; - double maxKeyspace = 0.1; + double maxKeyspace = 0.1; // range space + int maxByteSize = 1024, minByteSize = 32; // single point value size. The Key size is fixed to 16 bytes std::shared_ptr mgs; - std::shared_ptr mock; + Reference mock; + + KeyRange getRandomRange(double offset) const { + double len = deterministicRandom()->random01() * this->maxKeyspace; + double pos = offset + deterministicRandom()->random01() * (1.0 - len); + return KeyRangeRef(doubleToTestKey(pos), doubleToTestKey(pos + len)); + } MockDDTestWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client @@ -44,6 +51,8 @@ struct MockDDTestWorkload : public TestWorkload { testDuration = getOption(options, "testDuration"_sr, 10.0); meanDelay = getOption(options, "meanDelay"_sr, meanDelay); maxKeyspace = getOption(options, "maxKeyspace"_sr, maxKeyspace); + maxByteSize = getOption(options, "maxByteSize"_sr, maxByteSize); + minByteSize = getOption(options, "minByteSize"_sr, minByteSize); } Future setup(Database const& cx) override { @@ -58,9 +67,141 @@ struct MockDDTestWorkload : public TestWorkload { // initialize mgs mgs = std::make_shared(); + mgs->maxByteSize = maxByteSize; + mgs->minByteSize = minByteSize; mgs->initializeAsEmptyDatabaseMGS(dbConfig); - mock = std::make_shared(mgs); + mock = makeReference(mgs); return Void(); } +}; + +struct 
MockDDTrackerShardEvaluatorWorkload : public MockDDTestWorkload { + + DDSharedContext ddcx; + + PromiseStream output; + PromiseStream getShardMetrics; + PromiseStream getTopKMetrics; + PromiseStream getShardMetricsList; + PromiseStream> getAverageShardBytes; + + KeyRangeMap shards; + + ActorCollection actors; + uint64_t mockDbSize = 0; + const int keySize = 16; + + // --- test configs --- + + // Each key space is convert from an int N. [N, N+1) represent a key space. So at most we have 2G key spaces + int keySpaceCount = 0; + // 1. fixed -- each key space has fixed size. The size of each key space is calculated as minSpaceKeyCount * + // (minByteSize + 16) ; + // 2. linear -- from 0 to keySpaceCount the size of key space increase by size linearStride, from + // linearStartSize. Each value is fixed to minByteSize; + // 3. random -- each key space can has [minSpaceKeyCount, + // maxSpaceKeyCount] pairs and the size of value varies from [minByteSize, maxByteSize]; + Value keySpaceStrategy = "fixed"_sr; + int minSpaceKeyCount = 1000, maxSpaceKeyCount = 1000; + int linearStride = 10 * (1 << 20), linearStartSize = 10 * (1 << 20); + + MockDDTrackerShardEvaluatorWorkload(WorkloadContext const& wcx) + : MockDDTestWorkload(wcx), ddcx(deterministicRandom()->randomUniqueID()) { + keySpaceCount = getOption(options, "keySpaceCount"_sr, keySpaceCount); + keySpaceStrategy = getOption(options, "keySpaceStrategy"_sr, keySpaceStrategy); + minSpaceKeyCount = getOption(options, "minSpaceKeyCount"_sr, minSpaceKeyCount); + maxSpaceKeyCount = getOption(options, "maxSpaceKeyCount"_sr, maxSpaceKeyCount); + linearStride = getOption(options, "linearStride"_sr, linearStride); + linearStartSize = getOption(options, "linearStartSize"_sr, linearStartSize); + } + + void populateRandomStrategy() { + mockDbSize = 0; + for (int i = 0; i < keySpaceCount; ++i) { + int kCount = deterministicRandom()->randomInt(minSpaceKeyCount, maxSpaceKeyCount); + for (int j = 0; j < kCount; ++j) { + Key k = 
doubleToTestKey(i + deterministicRandom()->random01()); + auto vSize = deterministicRandom()->randomInt(minByteSize, maxByteSize + 1); + mgs->set(k, vSize, true); + mockDbSize += vSize + k.size(); + } + } + } + + void populateLinearStrategy() { + mockDbSize = 0; + auto pSize = minByteSize + keySize; + for (int i = 0; i < keySpaceCount; ++i) { + int kCount = std::ceil((linearStride * i + linearStartSize) * 1.0 / pSize); + for (int j = 0; j < kCount; ++j) { + Key k = doubleToTestKey(i + deterministicRandom()->random01()); + mgs->set(k, minByteSize, true); + } + mockDbSize += pSize * kCount; + } + } + + void populateFixedStrategy() { + auto pSize = minByteSize + keySize; + for (int i = 0; i < keySpaceCount; ++i) { + for (int j = 0; j < minSpaceKeyCount; ++j) { + Key k = doubleToTestKey(i + deterministicRandom()->random01()); + mgs->set(k, minByteSize, true); + } + } + mockDbSize = keySpaceCount * minSpaceKeyCount * pSize; + } + + void populateMgs() { + if (keySpaceStrategy == "linear") { + populateLinearStrategy(); + } else if (keySpaceStrategy == "fixed") { + populateFixedStrategy(); + } else if (keySpaceStrategy == "random") { + populateRandomStrategy(); + } + TraceEvent("PopulateMockGlobalState") + .detail("Strategy", keySpaceStrategy) + .detail("EstimatedDbSize", mockDbSize); + } + + Future setup(Database const& cx) override { + if (!enabled) + return Void(); + MockDDTestWorkload::setup(cx); + // populate mgs before run tracker + populateMgs(); + } + Future start(Database const& cx) override { + if (!enabled) + return Void(); + + // start mock servers + actors.add(waitForAll(mgs->runAllMockServers())); + + // start tracker + Reference initData = + mock->getInitialDataDistribution(ddcx.id(), ddcx.lock, {}, ddcx.ddEnabledState.get(), SkipDDModeCheck::True) + .get(); + Reference physicalShardCollection = makeReference(); + Reference> zeroHealthyTeams = makeReference>(false); + actors.add(dataDistributionTracker(initData, + mock, + output, + 
ddcx.shardsAffectedByTeamFailure, + physicalShardCollection, + getShardMetrics, + getTopKMetrics.getFuture(), + getShardMetricsList, + getAverageShardBytes.getFuture(), + Promise(), + zeroHealthyTeams, + ddcx.id(), + &shards, + &ddcx.trackerCancelled, + {})); + + return timeout(reportErrors(actors.getResult(), "MockDDTrackerShardEvaluatorWorkload"), testDuration, Void()); + } }; \ No newline at end of file From 38bd568e07fe157500198bf701575962cf368693 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 2 Nov 2022 21:43:27 -0700 Subject: [PATCH 18/57] change workload file structure --- .../include/fdbserver/workloads/MockDDTest.h | 46 ++++ fdbserver/workloads/MockDDTest.actor.cpp | 211 +++--------------- .../MockDDTrackerShardEvaluator.actor.cpp | 186 +++++++++++++++ 3 files changed, 263 insertions(+), 180 deletions(-) create mode 100644 fdbserver/include/fdbserver/workloads/MockDDTest.h create mode 100644 fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp diff --git a/fdbserver/include/fdbserver/workloads/MockDDTest.h b/fdbserver/include/fdbserver/workloads/MockDDTest.h new file mode 100644 index 0000000000..133f0b582e --- /dev/null +++ b/fdbserver/include/fdbserver/workloads/MockDDTest.h @@ -0,0 +1,46 @@ +/* + * MockDDTest.g + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FOUNDATIONDB_MOCKDDTEST_H +#define FOUNDATIONDB_MOCKDDTEST_H + +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/DDSharedContext.h" +#include "fdbserver/DDTxnProcessor.h" +#include "fdbserver/MoveKeys.actor.h" +#include "fdbclient/StorageServerInterface.h" + +// other Mock DD workload can derive from this class to use the common settings +struct MockDDTestWorkload : public TestWorkload { + bool enabled; + bool simpleConfig; + double testDuration; + double meanDelay = 0.05; + double maxKeyspace = 0.1; // range space + int maxByteSize = 1024, minByteSize = 32; // single point value size. The Key size is fixed to 16 bytes + + std::shared_ptr mgs; + Reference mock; + + KeyRange getRandomRange(double offset) const; + MockDDTestWorkload(WorkloadContext const& wcx); + Future setup(Database const& cx) override; +}; + +#endif // FOUNDATIONDB_MOCKDDTEST_H diff --git a/fdbserver/workloads/MockDDTest.actor.cpp b/fdbserver/workloads/MockDDTest.actor.cpp index c46643566e..2577a23511 100644 --- a/fdbserver/workloads/MockDDTest.actor.cpp +++ b/fdbserver/workloads/MockDDTest.actor.cpp @@ -18,190 +18,41 @@ * limitations under the License. */ -#include "fdbserver/workloads/workloads.actor.h" -#include "fdbclient/FDBOptions.g.h" -#include "fdbclient/ManagementAPI.actor.h" -#include "fdbserver/DDSharedContext.h" -#include "fdbserver/DDTxnProcessor.h" -#include "fdbserver/MoveKeys.actor.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/workloads/MockDDTest.h" #include "flow/actorcompiler.h" // This must be the last #include. -struct MockDDTestWorkload : public TestWorkload { - bool enabled; - bool simpleConfig; - double testDuration; - double meanDelay = 0.05; - double maxKeyspace = 0.1; // range space - int maxByteSize = 1024, minByteSize = 32; // single point value size. 
The Key size is fixed to 16 bytes +KeyRange MockDDTestWorkload::getRandomRange(double offset) const { + double len = deterministicRandom()->random01() * this->maxKeyspace; + double pos = offset + deterministicRandom()->random01() * (1.0 - len); + return KeyRangeRef(doubleToTestKey(pos), doubleToTestKey(pos + len)); +} - std::shared_ptr mgs; - Reference mock; - - KeyRange getRandomRange(double offset) const { - double len = deterministicRandom()->random01() * this->maxKeyspace; - double pos = offset + deterministicRandom()->random01() * (1.0 - len); - return KeyRangeRef(doubleToTestKey(pos), doubleToTestKey(pos + len)); - } - - MockDDTestWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { - enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client - simpleConfig = getOption(options, "simpleConfig"_sr, true); - testDuration = getOption(options, "testDuration"_sr, 10.0); - meanDelay = getOption(options, "meanDelay"_sr, meanDelay); - maxKeyspace = getOption(options, "maxKeyspace"_sr, maxKeyspace); - maxByteSize = getOption(options, "maxByteSize"_sr, maxByteSize); - minByteSize = getOption(options, "minByteSize"_sr, minByteSize); - } - - Future setup(Database const& cx) override { - if (!enabled) - return Void(); - // initialize configuration - BasicTestConfig testConfig; - testConfig.simpleConfig = simpleConfig; - testConfig.minimumReplication = 1; - testConfig.logAntiQuorum = 0; - DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); - - // initialize mgs - mgs = std::make_shared(); - mgs->maxByteSize = maxByteSize; - mgs->minByteSize = minByteSize; - mgs->initializeAsEmptyDatabaseMGS(dbConfig); - mock = makeReference(mgs); +MockDDTestWorkload::MockDDTestWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client + simpleConfig = getOption(options, "simpleConfig"_sr, true); + testDuration = getOption(options, 
"testDuration"_sr, 10.0); + meanDelay = getOption(options, "meanDelay"_sr, meanDelay); + maxKeyspace = getOption(options, "maxKeyspace"_sr, maxKeyspace); + maxByteSize = getOption(options, "maxByteSize"_sr, maxByteSize); + minByteSize = getOption(options, "minByteSize"_sr, minByteSize); +} +Future MockDDTestWorkload::setup(Database const& cx) { + if (!enabled) return Void(); - } -}; + // initialize configuration + BasicTestConfig testConfig; + testConfig.simpleConfig = simpleConfig; + testConfig.minimumReplication = 1; + testConfig.logAntiQuorum = 0; + DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); -struct MockDDTrackerShardEvaluatorWorkload : public MockDDTestWorkload { + // initialize mgs + mgs = std::make_shared(); + mgs->maxByteSize = maxByteSize; + mgs->minByteSize = minByteSize; + mgs->initializeAsEmptyDatabaseMGS(dbConfig); + mock = makeReference(mgs); - DDSharedContext ddcx; - - PromiseStream output; - PromiseStream getShardMetrics; - PromiseStream getTopKMetrics; - PromiseStream getShardMetricsList; - PromiseStream> getAverageShardBytes; - - KeyRangeMap shards; - - ActorCollection actors; - uint64_t mockDbSize = 0; - const int keySize = 16; - - // --- test configs --- - - // Each key space is convert from an int N. [N, N+1) represent a key space. So at most we have 2G key spaces - int keySpaceCount = 0; - // 1. fixed -- each key space has fixed size. The size of each key space is calculated as minSpaceKeyCount * - // (minByteSize + 16) ; - // 2. linear -- from 0 to keySpaceCount the size of key space increase by size linearStride, from - // linearStartSize. Each value is fixed to minByteSize; - // 3. 
random -- each key space can has [minSpaceKeyCount, - // maxSpaceKeyCount] pairs and the size of value varies from [minByteSize, maxByteSize]; - Value keySpaceStrategy = "fixed"_sr; - int minSpaceKeyCount = 1000, maxSpaceKeyCount = 1000; - int linearStride = 10 * (1 << 20), linearStartSize = 10 * (1 << 20); - - MockDDTrackerShardEvaluatorWorkload(WorkloadContext const& wcx) - : MockDDTestWorkload(wcx), ddcx(deterministicRandom()->randomUniqueID()) { - keySpaceCount = getOption(options, "keySpaceCount"_sr, keySpaceCount); - keySpaceStrategy = getOption(options, "keySpaceStrategy"_sr, keySpaceStrategy); - minSpaceKeyCount = getOption(options, "minSpaceKeyCount"_sr, minSpaceKeyCount); - maxSpaceKeyCount = getOption(options, "maxSpaceKeyCount"_sr, maxSpaceKeyCount); - linearStride = getOption(options, "linearStride"_sr, linearStride); - linearStartSize = getOption(options, "linearStartSize"_sr, linearStartSize); - } - - void populateRandomStrategy() { - mockDbSize = 0; - for (int i = 0; i < keySpaceCount; ++i) { - int kCount = deterministicRandom()->randomInt(minSpaceKeyCount, maxSpaceKeyCount); - for (int j = 0; j < kCount; ++j) { - Key k = doubleToTestKey(i + deterministicRandom()->random01()); - auto vSize = deterministicRandom()->randomInt(minByteSize, maxByteSize + 1); - mgs->set(k, vSize, true); - mockDbSize += vSize + k.size(); - } - } - } - - void populateLinearStrategy() { - mockDbSize = 0; - auto pSize = minByteSize + keySize; - for (int i = 0; i < keySpaceCount; ++i) { - int kCount = std::ceil((linearStride * i + linearStartSize) * 1.0 / pSize); - for (int j = 0; j < kCount; ++j) { - Key k = doubleToTestKey(i + deterministicRandom()->random01()); - mgs->set(k, minByteSize, true); - } - mockDbSize += pSize * kCount; - } - } - - void populateFixedStrategy() { - auto pSize = minByteSize + keySize; - for (int i = 0; i < keySpaceCount; ++i) { - for (int j = 0; j < minSpaceKeyCount; ++j) { - Key k = doubleToTestKey(i + deterministicRandom()->random01()); - 
mgs->set(k, minByteSize, true); - } - } - mockDbSize = keySpaceCount * minSpaceKeyCount * pSize; - } - - void populateMgs() { - if (keySpaceStrategy == "linear") { - populateLinearStrategy(); - } else if (keySpaceStrategy == "fixed") { - populateFixedStrategy(); - } else if (keySpaceStrategy == "random") { - populateRandomStrategy(); - } - TraceEvent("PopulateMockGlobalState") - .detail("Strategy", keySpaceStrategy) - .detail("EstimatedDbSize", mockDbSize); - } - - Future setup(Database const& cx) override { - if (!enabled) - return Void(); - MockDDTestWorkload::setup(cx); - // populate mgs before run tracker - populateMgs(); - } - Future start(Database const& cx) override { - if (!enabled) - return Void(); - - // start mock servers - actors.add(waitForAll(mgs->runAllMockServers())); - - // start tracker - Reference initData = - mock->getInitialDataDistribution(ddcx.id(), ddcx.lock, {}, ddcx.ddEnabledState.get(), SkipDDModeCheck::True) - .get(); - Reference physicalShardCollection = makeReference(); - Reference> zeroHealthyTeams = makeReference>(false); - actors.add(dataDistributionTracker(initData, - mock, - output, - ddcx.shardsAffectedByTeamFailure, - physicalShardCollection, - getShardMetrics, - getTopKMetrics.getFuture(), - getShardMetricsList, - getAverageShardBytes.getFuture(), - Promise(), - zeroHealthyTeams, - ddcx.id(), - &shards, - &ddcx.trackerCancelled, - {})); - - return timeout(reportErrors(actors.getResult(), "MockDDTrackerShardEvaluatorWorkload"), testDuration, Void()); - } -}; \ No newline at end of file + return Void(); +} \ No newline at end of file diff --git a/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp b/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp new file mode 100644 index 0000000000..5988d15c64 --- /dev/null +++ b/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp @@ -0,0 +1,186 @@ +/* + * MockDDTrackerShardEvaluator.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * 
Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/workloads/MockDDTest.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +struct MockDDTrackerShardEvaluatorWorkload : public MockDDTestWorkload { + static constexpr auto NAME = "MockDDTrackerShardEvaluator"; + DDSharedContext ddcx; + + PromiseStream output; + PromiseStream getShardMetrics; + PromiseStream getTopKMetrics; + PromiseStream getShardMetricsList; + PromiseStream> getAverageShardBytes; + + KeyRangeMap shards; + + ActorCollection actors; + uint64_t mockDbSize = 0; + const int keySize = 16; + + std::map rsReasonCounts; + + // --- test configs --- + + // Each key space is convert from an int N. [N, N+1) represent a key space. So at most we have 2G key spaces + int keySpaceCount = 0; + // 1. fixed -- each key space has fixed size. The size of each key space is calculated as minSpaceKeyCount * + // (minByteSize + 16) ; + // 2. linear -- from 0 to keySpaceCount the size of key space increase by size linearStride, from + // linearStartSize. Each value is fixed to minByteSize; + // 3. 
random -- each key space can has [minSpaceKeyCount, + // maxSpaceKeyCount] pairs and the size of value varies from [minByteSize, maxByteSize]; + Value keySpaceStrategy = "fixed"_sr; + int minSpaceKeyCount = 1000, maxSpaceKeyCount = 1000; + int linearStride = 10 * (1 << 20), linearStartSize = 10 * (1 << 20); + + MockDDTrackerShardEvaluatorWorkload(WorkloadContext const& wcx) + : MockDDTestWorkload(wcx), ddcx(deterministicRandom()->randomUniqueID()) { + keySpaceCount = getOption(options, "keySpaceCount"_sr, keySpaceCount); + keySpaceStrategy = getOption(options, "keySpaceStrategy"_sr, keySpaceStrategy); + minSpaceKeyCount = getOption(options, "minSpaceKeyCount"_sr, minSpaceKeyCount); + maxSpaceKeyCount = getOption(options, "maxSpaceKeyCount"_sr, maxSpaceKeyCount); + linearStride = getOption(options, "linearStride"_sr, linearStride); + linearStartSize = getOption(options, "linearStartSize"_sr, linearStartSize); + } + + void populateRandomStrategy() { + mockDbSize = 0; + for (int i = 0; i < keySpaceCount; ++i) { + int kCount = deterministicRandom()->randomInt(minSpaceKeyCount, maxSpaceKeyCount); + for (int j = 0; j < kCount; ++j) { + Key k = doubleToTestKey(i + deterministicRandom()->random01()); + auto vSize = deterministicRandom()->randomInt(minByteSize, maxByteSize + 1); + mgs->set(k, vSize, true); + mockDbSize += vSize + k.size(); + } + } + } + + void populateLinearStrategy() { + mockDbSize = 0; + auto pSize = minByteSize + keySize; + for (int i = 0; i < keySpaceCount; ++i) { + int kCount = std::ceil((linearStride * i + linearStartSize) * 1.0 / pSize); + for (int j = 0; j < kCount; ++j) { + Key k = doubleToTestKey(i + deterministicRandom()->random01()); + mgs->set(k, minByteSize, true); + } + mockDbSize += pSize * kCount; + } + } + + void populateFixedStrategy() { + auto pSize = minByteSize + keySize; + for (int i = 0; i < keySpaceCount; ++i) { + for (int j = 0; j < minSpaceKeyCount; ++j) { + Key k = doubleToTestKey(i + deterministicRandom()->random01()); + 
mgs->set(k, minByteSize, true); + } + } + mockDbSize = keySpaceCount * minSpaceKeyCount * pSize; + } + + void populateMgs() { + // Will the sampling structure become too large? + std::cout << "MGS Populating ...\n"; + if (keySpaceStrategy == "linear") { + populateLinearStrategy(); + } else if (keySpaceStrategy == "fixed") { + populateFixedStrategy(); + } else if (keySpaceStrategy == "random") { + populateRandomStrategy(); + } + uint64_t totalSize = 0; + for (auto& server : mgs->allServers) { + totalSize = server.second.sumRangeSize(allKeys); + } + TraceEvent("PopulateMockGlobalState") + .detail("Strategy", keySpaceStrategy) + .detail("EstimatedDbSize", mockDbSize) + .detail("MGSReportedTotalSize", totalSize); + std::cout << "MGS Populated.\n"; + } + + Future setup(Database const& cx) override { + if (!enabled) + return Void(); + MockDDTestWorkload::setup(cx); + // populate mgs before run tracker + populateMgs(); + return Void(); + } + + ACTOR static Future relocateShardReporter(MockDDTrackerShardEvaluatorWorkload* self, + FutureStream input) { + loop choose { + when(RelocateShard rs = waitNext(input)) { ++self->rsReasonCounts[(int)rs.reason]; } + } + } + + Future start(Database const& cx) override { + if (!enabled) + return Void(); + + // start mock servers + actors.add(waitForAll(mgs->runAllMockServers())); + + // start tracker + Reference initData = + mock->getInitialDataDistribution(ddcx.id(), ddcx.lock, {}, ddcx.ddEnabledState.get(), SkipDDModeCheck::True) + .get(); + Reference physicalShardCollection = makeReference(); + Reference> zeroHealthyTeams = makeReference>(false); + actors.add(dataDistributionTracker(initData, + mock, + output, + ddcx.shardsAffectedByTeamFailure, + physicalShardCollection, + getShardMetrics, + getTopKMetrics.getFuture(), + getShardMetricsList, + getAverageShardBytes.getFuture(), + Promise(), + zeroHealthyTeams, + ddcx.id(), + &shards, + &ddcx.trackerCancelled, + {})); + actors.add(relocateShardReporter(this, output.getFuture())); + + 
return timeout(reportErrors(actors.getResult(), "MockDDTrackerShardEvaluatorWorkload"), testDuration, Void()); + } + + Future check(Database const& cx) override { + std::cout << "Check phase shards count: " << shards.size() << "\n"; + actors.clear(true); + return true; + } + + void getMetrics(std::vector& m) override { + for (auto& p : rsReasonCounts) { + m.push_back(PerfMetric(RelocateReason(p.first).toString(), p.second, Averaged::False)); + } + } +}; + +WorkloadFactory MockDDTrackerShardEvaluatorWorkload; \ No newline at end of file From 8ef0411b32c942dbf10237f22f10c4868a3083a7 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 3 Nov 2022 11:37:55 -0700 Subject: [PATCH 19/57] address code review comments and introduce offset parameter --- .../sphinx/source/command-line-interface.rst | 4 +- fdbcli/TenantCommands.actor.cpp | 76 ++++++++++++++----- fdbclient/include/fdbclient/KeyBackedTypes.h | 4 + .../fdbclient/MetaclusterManagement.actor.h | 62 ++++++++++----- .../workloads/MetaclusterConsistency.actor.h | 22 ------ .../MetaclusterManagementWorkload.actor.cpp | 32 ++++++++ 6 files changed, 136 insertions(+), 64 deletions(-) diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index a6c60d3f4f..14f8eaf1db 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -475,7 +475,7 @@ Deletes a tenant from the cluster. The tenant must be empty. list ^^^^ -``tenant list [BEGIN] [END] [LIMIT] [state=,,...]`` +``tenant list [BEGIN] [END] [limit=LIMIT] [offset=OFFSET] [state=,,...]`` Lists the tenants present in the cluster. @@ -485,6 +485,8 @@ Lists the tenants present in the cluster. ``LIMIT`` - the number of tenants to list. Defaults to 100. +``OFFSET`` - the number of items to skip over, starting from the beginning of the range. Defaults to 0. + ``STATE``` - TenantState(s) to filter the list with. Defaults to no filters. 
get diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index 00bd0e8309..c055878d80 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -87,6 +87,49 @@ parseTenantConfiguration(std::vector const& tokens, int startIndex, b return configParams; } +bool parseTenantListOptions(std::vector const& tokens, + int startIndex, + int& limit, + int& offset, + std::vector& filters) { + for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) { + Optional value; + StringRef token = tokens[tokenNum]; + StringRef param; + bool foundEquals; + param = token.eat("=", &foundEquals); + if (!foundEquals) { + fmt::print(stderr, + "ERROR: invalid option string `{}'. String must specify a value using `='.\n", + param.toString().c_str()); + return false; + } + value = token; + if (tokencmp(param, "limit")) { + limit = std::stoi(value.get().toString()); + if (limit <= 0) { + fmt::print(stderr, "ERROR: invalid limit `{}'\n", token.toString().c_str()); + return false; + } + } else if (tokencmp(param, "offset")) { + offset = std::stoi(value.get().toString()); + if (offset <= 0) { + fmt::print(stderr, "ERROR: invalid offset `{}'\n", token.toString().c_str()); + return false; + } + } else if (tokencmp(param, "state")) { + auto filterStrings = value.get().splitAny(","_sr); + for (auto sref : filterStrings) { + filters.push_back(TenantMapEntry::stringToTenantState(sref.toString())); + } + } else { + fmt::print(stderr, "ERROR: unrecognized parameter `{}'.\n", param.toString().c_str()); + return false; + } + } + return true; +} + Key makeConfigKey(TenantNameRef tenantName, StringRef configName) { return tenantConfigSpecialKeyRange.begin.withSuffix(Tuple().append(tenantName).append(configName).pack()); } @@ -225,18 +268,21 @@ ACTOR Future tenantDeleteCommand(Reference db, std::vector tenantListCommand(Reference db, std::vector tokens) { - if (tokens.size() > 6) { - fmt::print("Usage: tenant list [BEGIN] [END] [LIMIT] 
[state=,,...]\n\n"); + if (tokens.size() > 7) { + fmt::print("Usage: tenant list [BEGIN] [END] [limit=LIMIT] [offset=OFFSET] [state=,,...]\n\n"); fmt::print("Lists the tenants in a cluster.\n"); fmt::print("Only tenants in the range BEGIN - END will be printed.\n"); fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); - fmt::print("Optional comma-separated state(s) can be provided to filter the list.\n"); + fmt::print("Optionally skip over the first OFFSET results (default 0).\n"); + fmt::print("Optional comma-separated tenant state(s) can be provided to filter the list.\n"); return false; } state StringRef beginTenant = ""_sr; state StringRef endTenant = "\xff\xff"_sr; state int limit = 100; + state int offset = 0; + state std::vector filters; if (tokens.size() >= 3) { beginTenant = tokens[2]; @@ -249,25 +295,11 @@ ACTOR Future tenantListCommand(Reference db, std::vector= 5) { - int n = 0; - if (sscanf(tokens[4].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[4].size() || limit <= 0) { - fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[4].toString().c_str()); + if (!parseTenantListOptions(tokens, 4, limit, offset, filters)) { return false; } } - state std::vector filters; - if (tokens.size() == 6) { // state=ready,registering - if (!tokens[5].startsWith("state="_sr)) { - fmt::print(stderr, "ERROR: state filter must begin with `state='\n"); - return false; - } - auto filterStrings = tokens[5].removePrefix("state="_sr).splitAny(","_sr); - for (auto sref : filterStrings) { - filters.push_back(TenantMapEntry::stringToTenantState(sref.toString())); - } - } - state Key beginTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(beginTenant); state Key endTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(endTenant); state Reference tr = db->createTransaction(); @@ -279,7 +311,7 @@ ACTOR Future tenantListCommand(Reference db, std::vector tenantNames; if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) 
{ std::vector> tenants = - wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit, filters)); + wait(MetaclusterAPI::listTenants(db, beginTenant, endTenant, limit, offset, filters)); for (auto tenant : tenants) { tenantNames.push_back(tenant.first); } @@ -626,8 +658,10 @@ std::vector tenantHintGenerator(std::vector const& token } else if (tokencmp(tokens[1], "delete") && tokens.size() < 3) { static std::vector opts = { "" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); - } else if (tokencmp(tokens[1], "list") && tokens.size() < 6) { - static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]", "[state=,,...]" }; + } else if (tokencmp(tokens[1], "list") && tokens.size() < 7) { + static std::vector opts = { + "[BEGIN]", "[END]", "[limit=LIMIT]", "[offset=OFFSET]", "[state=,,...]" + }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { static std::vector opts = { "", "[JSON]" }; diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h b/fdbclient/include/fdbclient/KeyBackedTypes.h index 7446d52484..cb86aef2c9 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -168,6 +168,7 @@ template struct KeyBackedRangeResult { std::vector results; bool more; + Optional readThrough; }; // Convenient read/write access to a single value of type T stored at key @@ -368,6 +369,7 @@ public: rangeResult.results.push_back(PairType(key, val)); } rangeResult.more = kvs.more; + rangeResult.readThrough = kvs.readThrough; return rangeResult; })); } @@ -573,6 +575,7 @@ public: rangeResult.results.push_back(PairType(key, val)); } rangeResult.more = kvs.more; + rangeResult.readThrough = kvs.readThrough; return rangeResult; })); } @@ -660,6 +663,7 @@ public: rangeResult.results.push_back(Codec::unpack(kvs[i].key.removePrefix(prefix))); } rangeResult.more = kvs.more; + rangeResult.readThrough = kvs.readThrough; return 
rangeResult; })); } diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 7d930a8d9d..941cc1338e 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1555,27 +1555,16 @@ Future deleteTenant(Reference db, TenantName name) { } ACTOR template -Future>> listTenantsTransaction( - Transaction tr, - TenantNameRef begin, - TenantNameRef end, - int limit, - std::vector filters = std::vector()) { +Future>> listTenantsTransaction(Transaction tr, + TenantNameRef begin, + TenantNameRef end, + int limit) { tr->setOption(FDBTransactionOptions::RAW_ACCESS); - KeyBackedRangeResult> results = + state KeyBackedRangeResult> results = wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); - if (filters.empty()) { - return results.results; - } - std::vector> filterResults; - for (auto pair : results.results) { - if (std::count(filters.begin(), filters.end(), pair.second.tenantState)) { - filterResults.push_back(pair); - } - } - return filterResults; + return results.results; } ACTOR template @@ -1584,6 +1573,7 @@ Future>> listTenants( TenantName begin, TenantName end, int limit, + int offset = 0, std::vector filters = std::vector()) { state Reference tr = db->createTransaction(); @@ -1591,9 +1581,41 @@ Future>> listTenants( try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); - std::vector> tenants = - wait(listTenantsTransaction(tr, begin, end, limit, filters)); - return tenants; + if (offset == 0 && filters.empty()) { + std::vector> tenants = + wait(listTenantsTransaction(tr, begin, end, limit)); + return tenants; + } + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state KeyBackedRangeResult> results = + wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); + state 
std::vector> filterResults; + state int count = 0; + loop { + for (auto pair : results.results) { + if (filters.empty() || std::count(filters.begin(), filters.end(), pair.second.tenantState)) { + ++count; + if (count > offset) { + filterResults.push_back(pair); + if (count - offset == limit) { + ASSERT(count - offset == filterResults.size()); + return filterResults; + } + } + } + } + if (!results.more) { + return filterResults; + } + if (results.readThrough.present()) { + begin = results.readThrough.get(); + } else { + begin = keyAfter(results.results.back().first); + } + wait(store(results, + ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit))); + } } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); } diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index 55b6aa863a..25f3fcae19 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -71,20 +71,6 @@ private: ACTOR static Future loadManagementClusterMetadata(MetaclusterConsistencyCheck* self) { state Reference managementTr = self->managementDb->createTransaction(); state std::vector> tenantList; - state std::vector> tenantListReady; - state std::vector> tenantListOther; - - state std::vector readyFilter; - state std::vector otherFilter; - - readyFilter.push_back(TenantState::READY); - otherFilter.push_back(TenantState::REGISTERING); - otherFilter.push_back(TenantState::REMOVING); - otherFilter.push_back(TenantState::UPDATING_CONFIGURATION); - otherFilter.push_back(TenantState::RENAMING_FROM); - otherFilter.push_back(TenantState::RENAMING_TO); - otherFilter.push_back(TenantState::ERROR); - otherFilter.push_back(TenantState::INVALID); loop { try { @@ -115,12 +101,6 @@ private: store(tenantList, MetaclusterAPI::listTenantsTransaction( managementTr, ""_sr, 
"\xff\xff"_sr, metaclusterMaxTenants)) && - store(tenantListReady, - MetaclusterAPI::listTenantsTransaction( - managementTr, ""_sr, "\xff\xff"_sr, metaclusterMaxTenants, readyFilter)) && - store(tenantListOther, - MetaclusterAPI::listTenantsTransaction( - managementTr, ""_sr, "\xff\xff"_sr, metaclusterMaxTenants, otherFilter)) && store(self->managementMetadata.tenantGroups, MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange( managementTr, {}, {}, metaclusterMaxTenants)) && @@ -133,8 +113,6 @@ private: } } - ASSERT(tenantListReady.size() + tenantListOther.size() == tenantList.size()); - self->managementMetadata.tenantMap = std::map(tenantList.begin(), tenantList.end()); for (auto t : self->managementMetadata.clusterTenantTuples.results) { diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 87d5a0419f..9022cbc80b 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -392,6 +392,33 @@ struct MetaclusterManagementWorkload : TestWorkload { return Void(); } + ACTOR static Future verifyListFilter(MetaclusterManagementWorkload* self, TenantName tenant) { + try { + state TenantMapEntry checkEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenant)); + state TenantState checkState = checkEntry.tenantState; + state std::vector> tenantList; + state std::vector filters; + filters.push_back(checkState); + wait(store(tenantList, + MetaclusterAPI::listTenants(self->managementDb, ""_sr, "\xff\xff"_sr, 10e6, 0, filters))); + ASSERT(!tenantList.empty()); + bool found = false; + for (auto pair : tenantList) { + ASSERT(pair.second.tenantState == checkState); + if (pair.first == tenant) { + found = true; + } + } + ASSERT(found); + } catch (Error& e) { + if (e.code() != error_code_tenant_not_found) { + TraceEvent(SevError, 
"VerifyListFilterFailure").error(e).detail("Tenant", tenant); + throw; + } + } + return Void(); + } + ACTOR static Future createTenant(MetaclusterManagementWorkload* self) { state TenantName tenant = self->chooseTenantName(); state Optional tenantGroup = self->chooseTenantGroup(); @@ -433,6 +460,7 @@ struct MetaclusterManagementWorkload : TestWorkload { break; } else { retried = true; + wait(verifyListFilter(self, tenant)); } } catch (Error& e) { if (e.code() == error_code_tenant_already_exists && retried && !exists) { @@ -533,6 +561,7 @@ struct MetaclusterManagementWorkload : TestWorkload { break; } else { retried = true; + wait(verifyListFilter(self, tenant)); } } catch (Error& e) { if (e.code() == error_code_tenant_not_found && retried && exists) { @@ -622,6 +651,7 @@ struct MetaclusterManagementWorkload : TestWorkload { if (result.present()) { break; } + wait(verifyListFilter(self, tenant)); } ASSERT(exists); @@ -716,6 +746,8 @@ struct MetaclusterManagementWorkload : TestWorkload { } retried = true; + wait(verifyListFilter(self, tenant)); + wait(verifyListFilter(self, newTenantName)); } catch (Error& e) { // If we retry the rename after it had succeeded, we will get an error that we should ignore if (e.code() == error_code_tenant_not_found && exists && !newTenantExists && retried) { From 96cf3f855b3bd8d33cc3aafe421ce8d7274fe5cb Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 7 Nov 2022 16:47:14 -0800 Subject: [PATCH 20/57] add rawMoveShard function --- fdbserver/DDTxnProcessor.actor.cpp | 4 +- fdbserver/MoveKeys.actor.cpp | 4 +- fdbserver/ShardsAffectedByTeamFailure.cpp | 20 ++++- .../fdbserver/ShardsAffectedByTeamFailure.h | 6 +- .../IDDTxnProcessorApiCorrectness.actor.cpp | 89 ++++++++++++++++--- tests/fast/IDDTxnProcessorApiCorrectness.toml | 4 + 6 files changed, 108 insertions(+), 19 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 991f4de95b..22a66d3f66 100644 --- 
a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -897,7 +897,9 @@ void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map destTeams; destTeams.emplace_back(params.destinationTeam, true); mgs->shardMapping->defineShard(params.keys); - mgs->shardMapping->moveShard(params.keys, destTeams); + auto teamPair = mgs->shardMapping->getTeamsFor(params.keys.begin); + auto& srcTeams = teamPair.second.empty() ? teamPair.first : teamPair.second; + mgs->shardMapping->rawMoveShard(params.keys, srcTeams, destTeams); auto randomRangeSize = deterministicRandom()->randomInt64(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES); diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index cd4d6ac12b..1425ccb30a 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -580,8 +580,8 @@ ACTOR Future logWarningAfter(const char* context, double duration, std::ve // keyServer: map from keys to destination servers // serverKeys: two-dimension map: [servers][keys], value is the servers' state of having the keys: active(not-have), -// complete(already has), ""(). Set keyServers[keys].dest = servers Set serverKeys[servers][keys] = active for each -// subrange of keys that the server did not already have, complete for each subrange that it already has Set +// complete(already has), ""(). Set keyServers[keys].dest = servers. Set serverKeys[servers][keys] = active for each +// subrange of keys that the server did not already have, = complete for each subrange that it already has. 
Set // serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member // of servers OR if the source list is sufficiently degraded) ACTOR static Future startMoveKeys(Database occ, diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp index bc1b150656..d774f658e7 100644 --- a/fdbserver/ShardsAffectedByTeamFailure.cpp +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -107,7 +107,6 @@ void ShardsAffectedByTeamFailure::defineShard(KeyRangeRef keys) { check(); } -// Move keys to destinationTeams by updating shard_teams void ShardsAffectedByTeamFailure::moveShard(KeyRangeRef keys, std::vector destinationTeams) { /*TraceEvent("ShardsAffectedByTeamFailureMove") .detail("KeyBegin", keys.begin) @@ -158,6 +157,25 @@ void ShardsAffectedByTeamFailure::moveShard(KeyRangeRef keys, std::vector check(); } +void ShardsAffectedByTeamFailure::rawMoveShard(KeyRangeRef keys, + const std::vector& srcTeams, + const std::vector& destinationTeams) { + auto it = shard_teams.rangeContaining(keys.begin); + std::vector, std::vector>, KeyRange>> modifiedShards; + ASSERT(it->range() == keys); + + // erase the many teams that were associated with this one shard + for (auto t = it->value().first.begin(); t != it->value().first.end(); ++t) { + erase(*t, it->range()); + } + it.value() = std::make_pair(destinationTeams, srcTeams); + for(auto& team: destinationTeams) { + insert(team, keys); + } + + check(); +} + void ShardsAffectedByTeamFailure::finishMove(KeyRangeRef keys) { auto ranges = shard_teams.containedRanges(keys); for (auto it = ranges.begin(); it != ranges.end(); ++it) { diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 326958bbb6..ca702ee4a4 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -93,8 +93,12 @@ public: // 
Shard boundaries are modified in defineShard and the content of what servers correspond to each shard is a copy // or union of the shards already there void defineShard(KeyRangeRef keys); - // moveShard never change the shard boundary but just change the team value + // moveShard never change the shard boundary but just change the team value. Move keys to destinationTeams by + // updating shard_teams, the old destination teams will be added to new source teams. void moveShard(KeyRangeRef keys, std::vector destinationTeam); + // This function assume keys is exactly a shard in this mapping, this function set the srcTeam and destination + // directly without retaining the old destination team info + void rawMoveShard(KeyRangeRef keys, const std::vector& srcTeams, const std::vector& destinationTeam); // finishMove never change the shard boundary but just clear the old source team value void finishMove(KeyRangeRef keys); // a convenient function for (defineShard, moveShard, finishMove) pipeline diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 26cd3cf91c..aceae94ad7 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -28,13 +28,21 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
#include "fdbclient/VersionedMap.h" +std::string describe(const DDShardInfo& a) { + std::string res = "key: " + a.key.toString() + "\n"; + res += "\tprimarySrc: " + describe(a.primarySrc) + "\n"; + res += "\tprimaryDest: " + describe(a.primaryDest) + "\n"; + res += "\tremoteSrc: " + describe(a.remoteSrc) + "\n"; + res += "\tremoteDest: " + describe(a.remoteDest) + "\n"; + return res; +} bool compareShardInfo(const DDShardInfo& a, const DDShardInfo& other) { // Mock DD just care about the server<->key mapping in DDShardInfo bool result = a.key == other.key && a.hasDest == other.hasDest && a.primaryDest == other.primaryDest && a.primarySrc == other.primarySrc && a.remoteSrc == other.remoteSrc && a.remoteDest == other.remoteDest; if (!result) { - std::cout << a.key.toHexString() << " | " << other.key.toHexString() << "\n"; + std::cout << a.key.toStringView() << " | " << other.key.toStringView() << "\n"; std::cout << a.hasDest << " | " << other.hasDest << "\n"; std::cout << describe(a.primarySrc) << " | " << describe(other.primarySrc) << "\n"; std::cout << describe(a.primaryDest) << " | " << describe(other.primaryDest) << "\n"; @@ -47,15 +55,25 @@ bool compareShardInfo(const DDShardInfo& a, const DDShardInfo& other) { void verifyInitDataEqual(Reference real, Reference mock) { // Mock DD just care about the team list and server<->key mapping are consistent with the real cluster if (real->shards.size() != mock->shards.size()) { - std::cout << "real.size: " << real->shards.size() << " mock.size: " << mock->shards.size() << "\n"; - ASSERT(false); + std::cout << "shardBoundaries: real v.s. 
mock \n"; + for (auto& shard : real->shards) { + std::cout << describe(shard); + } + std::cout << " ------- \n"; + for (auto& shard : mock->shards) { + std::cout << describe(shard); + } } + ASSERT_EQ(real->shards.size(), mock->shards.size()); ASSERT(std::equal( real->shards.begin(), real->shards.end(), mock->shards.begin(), mock->shards.end(), compareShardInfo)); - std::cout << describe(real->primaryTeams) << " | " << describe(mock->primaryTeams) << "\n"; - ASSERT(real->primaryTeams == mock->primaryTeams); + + if (real->primaryTeams != mock->primaryTeams) { + std::cout << describe(real->primaryTeams) << " | " << describe(mock->primaryTeams) << "\n"; + ASSERT(false); + } + ASSERT(real->remoteTeams == mock->remoteTeams); - ASSERT_EQ(real->shards.size(), mock->shards.size()); } // testers expose protected methods @@ -89,6 +107,7 @@ public: struct IDDTxnProcessorApiWorkload : TestWorkload { static constexpr auto NAME = "IDDTxnProcessorApiCorrectness"; bool enabled; + bool testStartOnly; double testDuration; double meanDelay = 0.05; double maxKeyspace = 0.1; @@ -99,12 +118,14 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { std::shared_ptr mock; Reference realInitDD; + std::set boundaries; IDDTxnProcessorApiWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), ddContext(UID()) { enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client testDuration = getOption(options, "testDuration"_sr, 10.0); meanDelay = getOption(options, "meanDelay"_sr, meanDelay); maxKeyspace = getOption(options, "maxKeyspace"_sr, maxKeyspace); + testStartOnly = getOption(options, "testStartOnly"_sr, false); } Future setup(Database const& cx) override { return enabled ? 
_setup(cx, this) : Void(); } @@ -135,13 +156,44 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { throw; } } + self->updateBoundaries(); return Void(); } + // according to boundaries, generate valid ranges for moveKeys operation KeyRange getRandomKeys() const { - double len = deterministicRandom()->random01() * this->maxKeyspace; - double pos = deterministicRandom()->random01() * (1.0 - len); - return KeyRangeRef(doubleToTestKey(pos), doubleToTestKey(pos + len)); + // merge or split operations + Key begin, end; + if (deterministicRandom()->coinflip()) { + // pure move + if (boundaries.size() == 2) { + begin = *boundaries.begin(); + end = *boundaries.rbegin(); + } else { + // merge shard + int a = deterministicRandom()->randomInt(0, boundaries.size() - 1); + int b = deterministicRandom()->randomInt(a + 1, boundaries.size()); + auto it = boundaries.begin(); + std::advance(it, a); + begin = *it; + std::advance(it, b - a); + end = *it; + } + } else { + // split + double start = deterministicRandom()->random01() * this->maxKeyspace; + begin = doubleToTestKey(start); + auto it = boundaries.upper_bound(begin); + ASSERT(it != boundaries.end()); // allKeys.end is larger than any random keys here + + double len = deterministicRandom()->random01() * (1 - maxKeyspace); + end = doubleToTestKey(start + len); + if (end > *it || deterministicRandom()->coinflip()) { + end = *it; + } + } + + return KeyRangeRef(begin, end); } std::vector getRandomTeam() { @@ -158,6 +210,13 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { return result; } + void updateBoundaries() { + boundaries.clear(); + for (auto& shard : realInitDD->shards) { + boundaries.insert(boundaries.end(), shard.key); + } + } + ACTOR Future _setup(Database cx, IDDTxnProcessorApiWorkload* self) { int oldMode = wait(setDDMode(cx, 0)); TraceEvent("IDDTxnApiTestStartModeSetting").detail("OldValue", oldMode).log(); @@ -169,7 +228,6 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { // FIXME: add support for 
generating random teams across DCs ASSERT_EQ(self->ddContext.usableRegions(), 1); wait(readRealInitialDataDistribution(self)); - return Void(); } @@ -212,7 +270,9 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { params.startMoveKeysParallelismLock = &fl1; params.finishMoveKeysParallelismLock = &fl2; params.relocationIntervalId = relocateShardInterval.pairID; - TraceEvent(SevDebug, relocateShardInterval.begin(), relocateShardInterval.pairID); + TraceEvent(SevDebug, relocateShardInterval.begin(), relocateShardInterval.pairID) + .detail("Key", params.keys) + .detail("Dest", params.destinationTeam); loop { params.dataMovementComplete.reset(); @@ -223,7 +283,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { wait(self->real->testRawStartMovement(params, emptyTssMapping)); // test finish or started but cancelled movement - if (true || deterministicRandom()->coinflip()) { + if (self->testStartOnly || deterministicRandom()->coinflip()) { CODE_PROBE(true, "RawMovementApi partial started"); break; } @@ -259,6 +319,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { KeyRange keys = self->getRandomKeys(); std::vector destTeams = self->getRandomTeam(); + std::sort(destTeams.begin(), destTeams.end()); return MoveKeysParams{ deterministicRandom()->randomUniqueID(), keys, destTeams, @@ -317,8 +378,9 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { ACTOR Future worker(Database cx, IDDTxnProcessorApiWorkload* self) { state double lastTime = now(); state int choice = 0; + state int maxChoice = self->testStartOnly ? 
1 : 2; loop { - choice = deterministicRandom()->randomInt(0, 1); + choice = deterministicRandom()->randomInt(0, maxChoice); if (choice == 0) { // test rawStartMovement and rawFinishMovement separately wait(testRawMovementApi(self)); } else if (choice == 1) { // test moveKeys @@ -327,7 +389,6 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { ASSERT(false); } wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); - // Keep trying to get the moveKeysLock } } diff --git a/tests/fast/IDDTxnProcessorApiCorrectness.toml b/tests/fast/IDDTxnProcessorApiCorrectness.toml index b45755e833..2a8ad23ce4 100644 --- a/tests/fast/IDDTxnProcessorApiCorrectness.toml +++ b/tests/fast/IDDTxnProcessorApiCorrectness.toml @@ -2,9 +2,13 @@ generateFearless = false # prevent generating remote dc because in MGS there's no region setting yet disableTss = true # There's no TSS in MGS this prevent the DD operate TSS mapping +[[knobs]] +max_added_sources_multiplier = 0 # set to 0 because it's impossible to make sure SS and mock SS will finish fetch keys at the same time. 
+ [[test]] testTitle = 'IDDTxnProcessorApiCorrectness' [[test.workload]] testName = 'IDDTxnProcessorApiCorrectness' testDuration = 50.0 + testStartOnly = true From fd425db1cfb1e03a860ef1e41a24d981e8cb84ad Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 8 Nov 2022 23:37:58 -0800 Subject: [PATCH 21/57] fix rawStartMovement bugs for merge shard; change the test toml file name --- fdbserver/DDTxnProcessor.actor.cpp | 38 ++++++++++++------- tests/CMakeLists.txt | 2 +- ...l => IDDTxnProcessorRawStartMovement.toml} | 2 +- 3 files changed, 27 insertions(+), 15 deletions(-) rename tests/fast/{IDDTxnProcessorApiCorrectness.toml => IDDTxnProcessorRawStartMovement.toml} (88%) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 22a66d3f66..78e663fceb 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -889,17 +889,29 @@ Future> DDMockTxnProcessor::getWorkers() const { } void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map& tssMapping) { - FlowLock::Releaser releaser(*params.startMoveKeysParallelismLock); - // Add wait(take) would always return immediately because there won’t be parallel rawStart or rawFinish in mock - // world due to the fact the following *mock* transaction code will always finish without coroutine switch. - ASSERT(params.startMoveKeysParallelismLock->take().isReady()); + // There won’t be parallel rawStart or rawFinish in mock world due to the fact the following *mock* transaction code + // will always finish without coroutine switch. + ASSERT(params.startMoveKeysParallelismLock->activePermits() == 0); std::vector destTeams; destTeams.emplace_back(params.destinationTeam, true); - mgs->shardMapping->defineShard(params.keys); - auto teamPair = mgs->shardMapping->getTeamsFor(params.keys.begin); - auto& srcTeams = teamPair.second.empty() ? 
teamPair.first : teamPair.second; - mgs->shardMapping->rawMoveShard(params.keys, srcTeams, destTeams); + // invariant: the splitting and merge operation won't happen at the same moveKeys action. For example, if [a,c) [c, + // e) exists, the params.keys won't be [b, d). + auto intersectRanges = mgs->shardMapping->intersectingRanges(params.keys); + // 1. splitting or just move a range. The new boundary need to be defined in startMovement + if (intersectRanges.begin().range().contains(params.keys)) { + mgs->shardMapping->defineShard(params.keys); + } + // 2. merge ops will coalesce the boundary in finishMovement; + intersectRanges = mgs->shardMapping->intersectingRanges(params.keys); + ASSERT(params.keys.begin == intersectRanges.begin().begin()); + ASSERT(params.keys.end == intersectRanges.end().begin()); + + for (auto it = intersectRanges.begin(); it != intersectRanges.end(); ++it) { + auto teamPair = mgs->shardMapping->getTeamsFor(it->begin()); + auto& srcTeams = teamPair.second.empty() ? teamPair.first : teamPair.second; + mgs->shardMapping->rawMoveShard(it->range(), srcTeams, destTeams); + } auto randomRangeSize = deterministicRandom()->randomInt64(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES); @@ -912,15 +924,14 @@ void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map& tssMapping) { - FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock); - // Add wait(take) would always return immediately because there won’t be parallel rawStart or rawFinish in mock - // world due to the fact the following *mock* transaction code will always finish without coroutine switch. - ASSERT(params.finishMoveKeysParallelismLock->take().isReady()); + // There won’t be parallel rawStart or rawFinish in mock world due to the fact the following *mock* transaction code + // will always finish without coroutine switch. 
+ ASSERT(params.finishMoveKeysParallelismLock->activePermits() == 0); // get source and dest teams auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys); - ASSERT_EQ(destTeams.size(), 0); + ASSERT_EQ(destTeams.size(), 1); // Will the multi-region or dynamic replica make destTeam.size() > 1? if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) { TraceEvent(SevError, "MockRawFinishMovementError") .detail("Reason", "InconsistentDestinations") @@ -941,4 +952,5 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, } } mgs->shardMapping->finishMove(params.keys); + mgs->shardMapping->defineShard(params.keys); // coalesce for merge } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 85f05d1631..c49e58b14c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -170,7 +170,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/MutationLogReaderCorrectness.toml) add_fdb_test(TEST_FILES fast/GetEstimatedRangeSize.toml) add_fdb_test(TEST_FILES fast/GetMappedRange.toml) - add_fdb_test(TEST_FILES fast/IDDTxnProcessorApiCorrectness.toml) + add_fdb_test(TEST_FILES fast/IDDTxnProcessorRawStartMovement.toml) add_fdb_test(TEST_FILES fast/PrivateEndpoints.toml) add_fdb_test(TEST_FILES fast/ProtocolVersion.toml) add_fdb_test(TEST_FILES fast/RandomSelector.toml) diff --git a/tests/fast/IDDTxnProcessorApiCorrectness.toml b/tests/fast/IDDTxnProcessorRawStartMovement.toml similarity index 88% rename from tests/fast/IDDTxnProcessorApiCorrectness.toml rename to tests/fast/IDDTxnProcessorRawStartMovement.toml index 2a8ad23ce4..8bec1e456a 100644 --- a/tests/fast/IDDTxnProcessorApiCorrectness.toml +++ b/tests/fast/IDDTxnProcessorRawStartMovement.toml @@ -11,4 +11,4 @@ testTitle = 'IDDTxnProcessorApiCorrectness' [[test.workload]] testName = 'IDDTxnProcessorApiCorrectness' testDuration = 50.0 - testStartOnly = true + testStartOnly = true # only test startMovement implementation From 
f08b2b86d9b1f8c4fe6c8323ac8e3f6e72b0fe29 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 9 Nov 2022 10:45:19 -0800 Subject: [PATCH 22/57] remove readthrough and have minimum limit for the getRange --- fdbclient/include/fdbclient/KeyBackedTypes.h | 4 ---- .../include/fdbclient/MetaclusterManagement.actor.h | 12 +++++------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h b/fdbclient/include/fdbclient/KeyBackedTypes.h index cb86aef2c9..7446d52484 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -168,7 +168,6 @@ template struct KeyBackedRangeResult { std::vector results; bool more; - Optional readThrough; }; // Convenient read/write access to a single value of type T stored at key @@ -369,7 +368,6 @@ public: rangeResult.results.push_back(PairType(key, val)); } rangeResult.more = kvs.more; - rangeResult.readThrough = kvs.readThrough; return rangeResult; })); } @@ -575,7 +573,6 @@ public: rangeResult.results.push_back(PairType(key, val)); } rangeResult.more = kvs.more; - rangeResult.readThrough = kvs.readThrough; return rangeResult; })); } @@ -663,7 +660,6 @@ public: rangeResult.results.push_back(Codec::unpack(kvs[i].key.removePrefix(prefix))); } rangeResult.more = kvs.more; - rangeResult.readThrough = kvs.readThrough; return rangeResult; })); } diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 941cc1338e..f8467d7e8c 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1589,7 +1589,8 @@ Future>> listTenants( tr->setOption(FDBTransactionOptions::RAW_ACCESS); state KeyBackedRangeResult> results = - wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); + wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( + tr, begin, end, 
std::max(limit + offset, 100))); state std::vector> filterResults; state int count = 0; loop { @@ -1608,13 +1609,10 @@ Future>> listTenants( if (!results.more) { return filterResults; } - if (results.readThrough.present()) { - begin = results.readThrough.get(); - } else { - begin = keyAfter(results.results.back().first); - } + begin = keyAfter(results.results.back().first); wait(store(results, - ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit))); + ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( + tr, begin, end, std::max(limit + offset, 100)))); } } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); From 62b88a07725df0b2251e03a62551f74d11d8fd4c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 9 Nov 2022 11:34:47 -0800 Subject: [PATCH 23/57] wait on lock take --- fdbserver/DDTxnProcessor.actor.cpp | 29 +++++++++++++++---- fdbserver/include/fdbserver/DDTxnProcessor.h | 4 +-- .../IDDTxnProcessorApiCorrectness.actor.cpp | 16 +++++----- tests/CMakeLists.txt | 1 + tests/fast/IDDTxnProcessorMoveKeys.toml | 13 +++++++++ .../fast/IDDTxnProcessorRawStartMovement.toml | 2 +- 6 files changed, 49 insertions(+), 16 deletions(-) create mode 100644 tests/fast/IDDTxnProcessorMoveKeys.toml diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 78e663fceb..9690f7afda 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -705,12 +705,12 @@ struct DDMockTxnProcessorImpl { std::sort(params.destinationTeam.begin(), params.destinationTeam.end()); std::sort(params.healthyDestinations.begin(), params.healthyDestinations.end()); - self->rawStartMovement(params, tssMapping); + wait(self->rawStartMovement(params, tssMapping)); ASSERT(tssMapping.empty()); wait(checkFetchingState(self, params.destinationTeam, params.keys)); - self->rawFinishMovement(params, tssMapping); + wait(self->rawFinishMovement(params, tssMapping)); if 
(!params.dataMovementComplete.isSet()) params.dataMovementComplete.send(Void()); return Void(); @@ -888,10 +888,14 @@ Future> DDMockTxnProcessor::getWorkers() const { return Future>(); } -void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map& tssMapping) { +ACTOR Future rawStartMovement(std::shared_ptr mgs, + MoveKeysParams params, + std::map tssMapping) { // There won’t be parallel rawStart or rawFinish in mock world due to the fact the following *mock* transaction code // will always finish without coroutine switch. ASSERT(params.startMoveKeysParallelismLock->activePermits() == 0); + wait(params.startMoveKeysParallelismLock->take(TaskPriority::DataDistributionLaunch)); + state FlowLock::Releaser releaser(*params.startMoveKeysParallelismLock); std::vector destTeams; destTeams.emplace_back(params.destinationTeam, true); @@ -920,13 +924,22 @@ void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::maprestrictSize); server.signalFetchKeys(params.keys, randomRangeSize); } + return Void(); } -void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, - const std::map& tssMapping) { +Future DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, + std::map& tssMapping) { + return ::rawStartMovement(mgs, params, tssMapping); +} + +ACTOR Future rawFinishMovement(std::shared_ptr mgs, + MoveKeysParams params, + std::map tssMapping) { // There won’t be parallel rawStart or rawFinish in mock world due to the fact the following *mock* transaction code // will always finish without coroutine switch. 
ASSERT(params.finishMoveKeysParallelismLock->activePermits() == 0); + wait(params.finishMoveKeysParallelismLock->take(TaskPriority::DataDistributionLaunch)); + state FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock); // get source and dest teams auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys); @@ -953,4 +966,10 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, } mgs->shardMapping->finishMove(params.keys); mgs->shardMapping->defineShard(params.keys); // coalesce for merge + return Void(); +} + +Future DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, + const std::map& tssMapping) { + return ::rawFinishMovement(mgs, params, tssMapping); } diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 0142c95183..09a9f48160 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -292,9 +292,9 @@ public: Future> getWorkers() const override; protected: - void rawStartMovement(MoveKeysParams& params, std::map& tssMapping); + Future rawStartMovement(MoveKeysParams& params, std::map& tssMapping); - void rawFinishMovement(MoveKeysParams& params, const std::map& tssMapping); + Future rawFinishMovement(MoveKeysParams& params, const std::map& tssMapping); }; #endif // FOUNDATIONDB_DDTXNPROCESSOR_H diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index aceae94ad7..bf5eccfa91 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -80,12 +80,12 @@ void verifyInitDataEqual(Reference real, Reference mgs = nullptr) : DDMockTxnProcessor(mgs) {} - void testRawStartMovement(MoveKeysParams& params, std::map& tssMapping) { - rawStartMovement(params, tssMapping); + Future testRawStartMovement(MoveKeysParams& params, std::map& 
tssMapping) { + return rawStartMovement(params, tssMapping); } - void testRawFinishMovement(MoveKeysParams& params, const std::map& tssMapping) { - rawFinishMovement(params, tssMapping); + Future testRawFinishMovement(MoveKeysParams& params, const std::map& tssMapping) { + return rawFinishMovement(params, tssMapping); } }; @@ -94,12 +94,12 @@ public: explicit DDTxnProcessorTester(Database cx) : DDTxnProcessor(cx) {} Future testRawStartMovement(MoveKeysParams& params, std::map& tssMapping) { - return this->rawStartMovement(params, tssMapping); + return rawStartMovement(params, tssMapping); } Future testRawFinishMovement(MoveKeysParams& params, const std::map& tssMapping) { - return this->rawFinishMovement(params, tssMapping); + return rawFinishMovement(params, tssMapping); } }; @@ -279,7 +279,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { wait(store(params.lock, self->real->takeMoveKeysLock(UID()))); try { // test start - self->mock->testRawStartMovement(params, emptyTssMapping); + wait(self->mock->testRawStartMovement(params, emptyTssMapping)); wait(self->real->testRawStartMovement(params, emptyTssMapping)); // test finish or started but cancelled movement @@ -288,7 +288,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { break; } - self->mock->testRawFinishMovement(params, emptyTssMapping); + wait(self->mock->testRawFinishMovement(params, emptyTssMapping)); wait(self->real->testRawFinishMovement(params, emptyTssMapping)); break; } catch (Error& e) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c49e58b14c..bc850f3333 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -171,6 +171,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/GetEstimatedRangeSize.toml) add_fdb_test(TEST_FILES fast/GetMappedRange.toml) add_fdb_test(TEST_FILES fast/IDDTxnProcessorRawStartMovement.toml) + add_fdb_test(TEST_FILES fast/IDDTxnProcessorMoveKeys.toml IGNORE) add_fdb_test(TEST_FILES fast/PrivateEndpoints.toml) add_fdb_test(TEST_FILES 
fast/ProtocolVersion.toml) add_fdb_test(TEST_FILES fast/RandomSelector.toml) diff --git a/tests/fast/IDDTxnProcessorMoveKeys.toml b/tests/fast/IDDTxnProcessorMoveKeys.toml new file mode 100644 index 0000000000..9dedc67253 --- /dev/null +++ b/tests/fast/IDDTxnProcessorMoveKeys.toml @@ -0,0 +1,13 @@ +[configuration] +generateFearless = false # prevent generating remote dc because in MGS there's no region setting yet +disableTss = true # There's no TSS in MGS this prevent the DD operate TSS mapping + +[[knobs]] +max_added_sources_multiplier = 0 # set to 0 because it's impossible to make sure SS and mock SS will finish fetch keys at the same time. + +[[test]] +testTitle = 'IDDTxnProcessorMoveKeys' + + [[test.workload]] + testName = 'IDDTxnProcessorApiCorrectness' + testDuration = 50.0 diff --git a/tests/fast/IDDTxnProcessorRawStartMovement.toml b/tests/fast/IDDTxnProcessorRawStartMovement.toml index 8bec1e456a..73109583ee 100644 --- a/tests/fast/IDDTxnProcessorRawStartMovement.toml +++ b/tests/fast/IDDTxnProcessorRawStartMovement.toml @@ -6,7 +6,7 @@ disableTss = true # There's no TSS in MGS this prevent the DD operate TSS mappin max_added_sources_multiplier = 0 # set to 0 because it's impossible to make sure SS and mock SS will finish fetch keys at the same time. 
[[test]] -testTitle = 'IDDTxnProcessorApiCorrectness' +testTitle = 'IDDTxnProcessorRawStartMovement' [[test.workload]] testName = 'IDDTxnProcessorApiCorrectness' From 3fb12680e3459e428c3caad699686eb9c5e49995 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 9 Nov 2022 13:15:46 -0800 Subject: [PATCH 24/57] revert the mvccStorageBytes for write sample change and mark it with FIXME --- fdbclient/include/fdbclient/StorageServerInterface.h | 11 ++++++++++- fdbserver/MockGlobalState.actor.cpp | 4 ++-- fdbserver/storageserver.actor.cpp | 9 +++------ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index b8ad4523c9..a1b6e0ce08 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -634,7 +634,7 @@ struct GetShardStateRequest { struct StorageMetrics { constexpr static FileIdentifier file_identifier = 13622226; int64_t bytes = 0; // total storage - int64_t writeBytesPerKSecond = 0; // network bandwidth (average over 10s) == write bandwidth through any IO devices + int64_t writeBytesPerKSecond = 0; // bytes write to SQ // FIXME: currently, iosPerKSecond is not used in DataDistribution calculations. int64_t iosPerKSecond = 0; @@ -1180,4 +1180,13 @@ struct StorageQueuingMetricsRequest { } }; +// Memory size for storing mutation in the mutation log and the versioned map. 
+inline int mvccStorageBytes(int mutationBytes) { + // Why * 2: + // - 1 insertion into version map costs 2 nodes in avg; + // - The mutation will be stored in both mutation log and versioned map; + return VersionedMap::overheadPerItem * 2 + + (mutationBytes + MutationRef::OVERHEAD_BYTES) * 2; +} + #endif diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 388b3da93a..240ff27f6d 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -401,7 +401,8 @@ void MockStorageServer::clearRangeTotalBytes(KeyRangeRef const& range, int64_t b void MockStorageServer::notifyWriteMetrics(KeyRef const& key, int64_t size) { // update write bandwidth and iops as mock the cost of writing a mutation StorageMetrics s; - s.writeBytesPerKSecond = size + MutationRef::OVERHEAD_BYTES; + // FIXME: remove the / 2 and double the related knobs. + s.writeBytesPerKSecond = mvccStorageBytes(size) / 2; s.iosPerKSecond = 1; metrics.notify(key, s); } @@ -1000,7 +1001,6 @@ TEST_CASE("/MockGlobalState/MockStorageServer/DataOpsSet") { // If sampled ASSERT_EQ(res.first.get().bytes, testSize); ASSERT_GT(res.first.get().writeBytesPerKSecond, 0); - ASSERT_GT(res.first.get().iosPerKSecond, 0); } } return Void(); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 266a841ca1..ded64b5d3b 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -534,12 +534,8 @@ const int VERSION_OVERHEAD = // createNewVersion(version+1) ], 64b // overhead for map -// Memory size for storing mutation in the mutation log and the versioned map. 
static int mvccStorageBytes(MutationRef const& m) { - // Why * 2: - // - 1 insertion into version map costs 2 nodes in avg; - // - The mutation will be stored in both mutation log and versioned map; - return VersionedMap::overheadPerItem * 2 + m.totalSize() * 2; + return mvccStorageBytes(m.param1.size() + m.param2.size()); } struct FetchInjectionInfo { @@ -5616,7 +5612,8 @@ void applyMutation(StorageServer* self, // m is expected to be in arena already // Clear split keys are added to arena StorageMetrics metrics; - metrics.writeBytesPerKSecond = m.totalSize(); // comparable to counter.mutationBytes + // FIXME: remove the / 2 and double the related knobs. + metrics.writeBytesPerKSecond = mvccStorageBytes(m) / 2; // comparable to counter.bytesInput / 2 metrics.iosPerKSecond = 1; self->metrics.notify(m.param1, metrics); From 7c9334121a1dba17bfcff0e9c62c598904cda4b7 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 10 Nov 2022 12:51:22 -0800 Subject: [PATCH 25/57] mark MoveKeysParams const& --- fdbserver/DDTxnProcessor.actor.cpp | 8 ++++---- fdbserver/MockGlobalState.actor.cpp | 16 +++++++++++++++- fdbserver/MoveKeys.actor.cpp | 5 +++-- fdbserver/include/fdbserver/DDTxnProcessor.h | 9 +++++---- fdbserver/include/fdbserver/MoveKeys.actor.h | 5 +++-- .../IDDTxnProcessorApiCorrectness.actor.cpp | 14 ++++++++++++-- 6 files changed, 42 insertions(+), 15 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 9690f7afda..614e15679b 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -669,12 +669,12 @@ Future> DDTxnProcessor::getWorkers() const { return ::getWorkers(cx); } -Future DDTxnProcessor::rawStartMovement(MoveKeysParams& params, +Future DDTxnProcessor::rawStartMovement(const MoveKeysParams& params, std::map& tssMapping) { return ::rawStartMovement(cx, params, tssMapping); } -Future DDTxnProcessor::rawFinishMovement(MoveKeysParams& params, +Future 
DDTxnProcessor::rawFinishMovement(const MoveKeysParams& params, const std::map& tssMapping) { return ::rawFinishMovement(cx, params, tssMapping); } @@ -927,7 +927,7 @@ ACTOR Future rawStartMovement(std::shared_ptr mgs, return Void(); } -Future DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, +Future DDMockTxnProcessor::rawStartMovement(const MoveKeysParams& params, std::map& tssMapping) { return ::rawStartMovement(mgs, params, tssMapping); } @@ -969,7 +969,7 @@ ACTOR Future rawFinishMovement(std::shared_ptr mgs, return Void(); } -Future DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, +Future DDMockTxnProcessor::rawFinishMovement(const MoveKeysParams& params, const std::map& tssMapping) { return ::rawFinishMovement(mgs, params, tssMapping); } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 240ff27f6d..aabe9f379c 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -166,6 +166,7 @@ bool MockStorageServer::allShardStatusEqual(const KeyRangeRef& range, MockShardS bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::set& status) { auto ranges = serverKeys.intersectingRanges(range); + TraceEvent("AllShardStatusIn", id).detail("RangesEmpty", ranges.empty()).detail("Range", range); ASSERT(!ranges.empty()); // at least the range is allKeys for (auto it = ranges.begin(); it != ranges.end(); ++it) { @@ -177,7 +178,15 @@ bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::se void MockStorageServer::setShardStatus(const KeyRangeRef& range, MockShardStatus status, bool restrictSize) { auto ranges = serverKeys.intersectingRanges(range); - ASSERT(!ranges.empty()); + TraceEvent("SetShardStatus", id).detail("KeyRange", range).detail("Status", status); + + if (ranges.empty()) { + CODE_PROBE(true, "new shard is adding to server"); + serverKeys.insert(range, ShardInfo{ status, 0 }); + return; + } + + // change the old status 
if (ranges.begin().begin() < range.begin && ranges.begin().end() > range.end) { CODE_PROBE(true, "Implicitly split single shard to 3 pieces"); threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize); @@ -502,6 +511,11 @@ bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef sh } bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) { + TraceEvent(SevDebug, "ServerIsDestForShard") + .detail("ServerId", serverId) + .detail("Keys", shard) + .detail("Contains", allServers.count(serverId)); + if (!allServers.count(serverId)) return false; diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 1425ccb30a..90169e2177 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -2476,7 +2476,8 @@ ACTOR Future cleanUpDataMove(Database occ, return Void(); } -Future rawStartMovement(Database occ, MoveKeysParams& params, std::map& tssMapping) { +Future rawStartMovement(Database occ, + const MoveKeysParams& params, std::map& tssMapping) { if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { return startMoveShards(std::move(occ), params.dataMoveId, @@ -2499,7 +2500,7 @@ Future rawStartMovement(Database occ, MoveKeysParams& params, std::map rawFinishMovement(Database occ, - MoveKeysParams& params, + const MoveKeysParams& params, const std::map& tssMapping) { if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { return finishMoveShards(std::move(occ), diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 09a9f48160..503dcca108 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -225,9 +225,9 @@ public: Future> getWorkers() const override; protected: - Future rawStartMovement(MoveKeysParams& params, std::map& tssMapping); + Future rawStartMovement(const MoveKeysParams& params, std::map& tssMapping); - Future 
rawFinishMovement(MoveKeysParams& params, const std::map& tssMapping); + Future rawFinishMovement(const MoveKeysParams& params, const std::map& tssMapping); }; struct DDMockTxnProcessorImpl; @@ -237,6 +237,7 @@ struct DDMockTxnProcessorImpl; class DDMockTxnProcessor : public IDDTxnProcessor { friend struct DDMockTxnProcessorImpl; +protected: std::shared_ptr mgs; std::vector getDDShardInfos() const; @@ -292,9 +293,9 @@ public: Future> getWorkers() const override; protected: - Future rawStartMovement(MoveKeysParams& params, std::map& tssMapping); + Future rawStartMovement(const MoveKeysParams& params, std::map& tssMapping); - Future rawFinishMovement(MoveKeysParams& params, const std::map& tssMapping); + Future rawFinishMovement(const MoveKeysParams& params, const std::map& tssMapping); }; #endif // FOUNDATIONDB_DDTXNPROCESSOR_H diff --git a/fdbserver/include/fdbserver/MoveKeys.actor.h b/fdbserver/include/fdbserver/MoveKeys.actor.h index 0318a70644..24ed41cdf5 100644 --- a/fdbserver/include/fdbserver/MoveKeys.actor.h +++ b/fdbserver/include/fdbserver/MoveKeys.actor.h @@ -86,10 +86,11 @@ void seedShardServers(Arena& trArena, CommitTransactionRef& tr, std::vector rawStartMovement(Database occ, MoveKeysParams& params, std::map& tssMapping); +Future rawStartMovement(Database occ, + const MoveKeysParams& params, std::map& tssMapping); Future rawFinishMovement(Database occ, - MoveKeysParams& params, + const MoveKeysParams& params, const std::map& tssMapping); // Eventually moves the given keys to the given destination team // Caller is responsible for cancelling it before issuing an overlapping move, diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index bf5eccfa91..8e72072f18 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -260,6 +260,12 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { 
return Void(); } + void verifyServerKeyDest(MoveKeysParams& params) { + // check destination servers + for(auto& id: params.destinationTeam) { + ASSERT(mgs->serverIsDestForShard(id, params.keys)); + } + } ACTOR static Future testRawMovementApi(IDDTxnProcessorApiWorkload* self) { state TraceInterval relocateShardInterval("RelocateShard_TestRawMovementApi"); state FlowLock fl1(1); @@ -282,6 +288,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { wait(self->mock->testRawStartMovement(params, emptyTssMapping)); wait(self->real->testRawStartMovement(params, emptyTssMapping)); + self->verifyServerKeyDest(params); // test finish or started but cancelled movement if (self->testStartOnly || deterministicRandom()->coinflip()) { CODE_PROBE(true, "RawMovementApi partial started"); @@ -344,13 +351,15 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { params.startMoveKeysParallelismLock = &fl1; params.finishMoveKeysParallelismLock = &fl2; params.relocationIntervalId = relocateShardInterval.pairID; - TraceEvent(SevDebug, relocateShardInterval.begin(), relocateShardInterval.pairID); + TraceEvent(SevDebug, relocateShardInterval.begin(), relocateShardInterval.pairID) + .detail("Key", params.keys) + .detail("Dest", params.destinationTeam); loop { params.dataMovementComplete.reset(); wait(store(params.lock, self->real->takeMoveKeysLock(UID()))); try { - self->mock->moveKeys(params); + wait(self->mock->moveKeys(params)); wait(self->real->moveKeys(params)); break; } catch (Error& e) { @@ -375,6 +384,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { TraceEvent(SevDebug, relocateShardInterval.end(), relocateShardInterval.pairID); return Void(); } + ACTOR Future worker(Database cx, IDDTxnProcessorApiWorkload* self) { state double lastTime = now(); state int choice = 0; From 4691a352151082a97b5e53ba94f2d426a7def9f3 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 10 Nov 2022 13:11:24 -0800 Subject: [PATCH 26/57] format code --- fdbserver/MoveKeys.actor.cpp | 3 ++- 
fdbserver/ShardsAffectedByTeamFailure.cpp | 2 +- fdbserver/include/fdbserver/DDTxnProcessor.h | 6 ++++-- fdbserver/include/fdbserver/MoveKeys.actor.h | 3 ++- fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp | 5 +++-- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 90169e2177..1979173fdf 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -2477,7 +2477,8 @@ ACTOR Future cleanUpDataMove(Database occ, } Future rawStartMovement(Database occ, - const MoveKeysParams& params, std::map& tssMapping) { + const MoveKeysParams& params, + std::map& tssMapping) { if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { return startMoveShards(std::move(occ), params.dataMoveId, diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp index d774f658e7..b8ab69bab7 100644 --- a/fdbserver/ShardsAffectedByTeamFailure.cpp +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -169,7 +169,7 @@ void ShardsAffectedByTeamFailure::rawMoveShard(KeyRangeRef keys, erase(*t, it->range()); } it.value() = std::make_pair(destinationTeams, srcTeams); - for(auto& team: destinationTeams) { + for (auto& team : destinationTeams) { insert(team, keys); } diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 503dcca108..d350bda61f 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -227,7 +227,8 @@ public: protected: Future rawStartMovement(const MoveKeysParams& params, std::map& tssMapping); - Future rawFinishMovement(const MoveKeysParams& params, const std::map& tssMapping); + Future rawFinishMovement(const MoveKeysParams& params, + const std::map& tssMapping); }; struct DDMockTxnProcessorImpl; @@ -295,7 +296,8 @@ public: protected: Future rawStartMovement(const MoveKeysParams& params, std::map& tssMapping); - Future 
rawFinishMovement(const MoveKeysParams& params, const std::map& tssMapping); + Future rawFinishMovement(const MoveKeysParams& params, + const std::map& tssMapping); }; #endif // FOUNDATIONDB_DDTXNPROCESSOR_H diff --git a/fdbserver/include/fdbserver/MoveKeys.actor.h b/fdbserver/include/fdbserver/MoveKeys.actor.h index 24ed41cdf5..ed027a29fa 100644 --- a/fdbserver/include/fdbserver/MoveKeys.actor.h +++ b/fdbserver/include/fdbserver/MoveKeys.actor.h @@ -87,7 +87,8 @@ void seedShardServers(Arena& trArena, CommitTransactionRef& tr, std::vector rawStartMovement(Database occ, - const MoveKeysParams& params, std::map& tssMapping); + const MoveKeysParams& params, + std::map& tssMapping); Future rawFinishMovement(Database occ, const MoveKeysParams& params, diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 8e72072f18..e7909e9813 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -84,7 +84,8 @@ public: return rawStartMovement(params, tssMapping); } - Future testRawFinishMovement(MoveKeysParams& params, const std::map& tssMapping) { + Future testRawFinishMovement(MoveKeysParams& params, + const std::map& tssMapping) { return rawFinishMovement(params, tssMapping); } }; @@ -262,7 +263,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { void verifyServerKeyDest(MoveKeysParams& params) { // check destination servers - for(auto& id: params.destinationTeam) { + for (auto& id : params.destinationTeam) { ASSERT(mgs->serverIsDestForShard(id, params.keys)); } } From 93fb151e6c403fa7ad64df4659b45b155ed516fa Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 10 Nov 2022 14:29:01 -0800 Subject: [PATCH 27/57] add fdbcli error handling, remove invalid state and change some offset logic --- fdbcli/TenantCommands.actor.cpp | 11 ++++++++--- fdbclient/Tenant.cpp | 4 +--- 
.../include/fdbclient/MetaclusterManagement.actor.h | 11 ++++++++--- fdbclient/include/fdbclient/Tenant.h | 12 +----------- .../MetaclusterManagementWorkload.actor.cpp | 12 +++++++++--- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index c055878d80..daacb80fbd 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -113,14 +113,19 @@ bool parseTenantListOptions(std::vector const& tokens, } } else if (tokencmp(param, "offset")) { offset = std::stoi(value.get().toString()); - if (offset <= 0) { + if (offset < 0) { fmt::print(stderr, "ERROR: invalid offset `{}'\n", token.toString().c_str()); return false; } } else if (tokencmp(param, "state")) { auto filterStrings = value.get().splitAny(","_sr); - for (auto sref : filterStrings) { - filters.push_back(TenantMapEntry::stringToTenantState(sref.toString())); + try { + for (auto sref : filterStrings) { + filters.push_back(TenantMapEntry::stringToTenantState(sref.toString())); + } + } catch (Error& e) { + fmt::print(stderr, "ERROR: unrecognized tenant state(s) `{}'.\n", value.get().toString()); + return false; } } else { fmt::print(stderr, "ERROR: unrecognized parameter `{}'.\n", param.toString().c_str()); diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 8ef4a8b9e3..b863a4ff85 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -64,8 +64,6 @@ std::string TenantMapEntry::tenantStateToString(TenantState tenantState) { return "renaming to"; case TenantState::ERROR: return "error"; - case TenantState::INVALID: - return "invalid"; default: UNREACHABLE(); } @@ -89,7 +87,7 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { return TenantState::ERROR; } - return TenantState::INVALID; + throw invalid_option(); } std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) { diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h 
b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index f8467d7e8c..019849aba8 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1581,9 +1581,14 @@ Future>> listTenants( try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); - if (offset == 0 && filters.empty()) { - std::vector> tenants = - wait(listTenantsTransaction(tr, begin, end, limit)); + if (filters.empty()) { + state std::vector> tenants; + wait(store(tenants, listTenantsTransaction(tr, begin, end, limit + offset))); + if (offset >= tenants.size()) { + tenants.clear(); + } else if (offset > 0) { + tenants.erase(tenants.begin(), tenants.begin() + offset); + } return tenants; } tr->setOption(FDBTransactionOptions::RAW_ACCESS); diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index 5dcfb8ce8a..d2b1e34c40 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -49,7 +49,6 @@ typedef Standalone TenantGroupName; // RENAMING_TO - the tenant is being created as a rename from an existing tenant and is awaiting the rename to complete // on the data cluster // ERROR - the tenant is in an error state -// INVALID - Unrecognized state - likely the result of a failed parsing // // A tenant in any configuration is allowed to be removed. Only tenants in the READY or UPDATING_CONFIGURATION phases // can have their configuration updated. A tenant must not exist or be in the REGISTERING phase to be created. To be @@ -58,16 +57,7 @@ typedef Standalone TenantGroupName; // // If an operation fails and the tenant is left in a non-ready state, re-running the same operation is legal. If // successful, the tenant will return to the READY state. 
-enum class TenantState { - REGISTERING, - READY, - REMOVING, - UPDATING_CONFIGURATION, - RENAMING_FROM, - RENAMING_TO, - ERROR, - INVALID -}; +enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, RENAMING_FROM, RENAMING_TO, ERROR }; // Represents the lock state the tenant could be in. // Can be used in conjunction with the other tenant states above. diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 9022cbc80b..de6f81b9dc 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -396,12 +396,18 @@ struct MetaclusterManagementWorkload : TestWorkload { try { state TenantMapEntry checkEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenant)); state TenantState checkState = checkEntry.tenantState; - state std::vector> tenantList; state std::vector filters; filters.push_back(checkState); - wait(store(tenantList, + state std::vector> tenantList; + // Possible to have changed state between now and the getTenant call above + state TenantMapEntry checkEntry2; + wait(store(checkEntry2, MetaclusterAPI::getTenant(self->managementDb, tenant)) && + store(tenantList, MetaclusterAPI::listTenants(self->managementDb, ""_sr, "\xff\xff"_sr, 10e6, 0, filters))); - ASSERT(!tenantList.empty()); + if (tenantList.empty()) { + ASSERT(checkEntry2.tenantState != checkState); + return Void(); + } bool found = false; for (auto pair : tenantList) { ASSERT(pair.second.tenantState == checkState); From 1816e5caa8be85fc26503f7ddc5c86cf490d8101 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 10 Nov 2022 15:22:01 -0800 Subject: [PATCH 28/57] setup the MGS after each test call --- fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp 
b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index e7909e9813..04b73e9a09 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -319,6 +319,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { verifyInitDataEqual(self->realInitDD, mockInitData); TraceEvent(SevDebug, relocateShardInterval.end(), relocateShardInterval.pairID); + self->mock->setupMockGlobalState(self->realInitDD); // in case SS remove or recruit return Void(); } @@ -383,6 +384,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { verifyInitDataEqual(self->realInitDD, mockInitData); TraceEvent(SevDebug, relocateShardInterval.end(), relocateShardInterval.pairID); + self->mock->setupMockGlobalState(self->realInitDD); // in case SS remove or recruit return Void(); } From 23706c957b00b85a74840cacf71f735dbe524877 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Fri, 30 Sep 2022 10:39:43 -0700 Subject: [PATCH 29/57] Use DDSketch for Sample Data. 
--- fdbclient/NativeAPI.actor.cpp | 12 +- fdbclient/ServerKnobs.cpp | 4 +- .../include/fdbclient/BlobWorkerCommon.h | 12 +- fdbclient/include/fdbclient/DatabaseContext.h | 6 +- fdbclient/include/fdbclient/ServerKnobs.h | 4 +- fdbrpc/FlowTransport.actor.cpp | 4 +- fdbrpc/include/fdbrpc/AsyncFileKAIO.actor.h | 6 +- fdbrpc/include/fdbrpc/DDSketch.h | 311 ++++++++++++++++++ fdbrpc/include/fdbrpc/FlowTransport.h | 6 +- fdbrpc/include/fdbrpc/Stats.h | 35 +- fdbrpc/include/fdbrpc/TSSComparison.h | 17 +- fdbserver/BlobWorker.actor.cpp | 5 +- fdbserver/EncryptKeyProxy.actor.cpp | 6 +- fdbserver/GrvProxyServer.actor.cpp | 10 +- fdbserver/KeyValueStoreRocksDB.actor.cpp | 6 +- fdbserver/TLogServer.actor.cpp | 2 +- fdbserver/VersionedBTree.actor.cpp | 2 +- .../include/fdbserver/ProxyCommitData.actor.h | 8 +- .../workloads/ReadWriteWorkload.actor.h | 3 +- fdbserver/masterserver.actor.cpp | 6 +- fdbserver/storageserver.actor.cpp | 22 +- fdbserver/workloads/AtomicOps.actor.cpp | 1 - .../workloads/BackgroundSelectors.actor.cpp | 1 - fdbserver/workloads/BulkLoad.actor.cpp | 7 +- fdbserver/workloads/BulkSetup.actor.cpp | 74 +++++ fdbserver/workloads/DDBalance.actor.cpp | 6 +- .../workloads/FastTriggeredWatches.actor.cpp | 1 - fdbserver/workloads/FileSystem.actor.cpp | 8 +- fdbserver/workloads/IndexScan.actor.cpp | 1 - fdbserver/workloads/LowLatency.actor.cpp | 1 - fdbserver/workloads/Mako.actor.cpp | 6 +- fdbserver/workloads/MemoryLifetime.actor.cpp | 1 - fdbserver/workloads/MetricLogging.actor.cpp | 1 - fdbserver/workloads/QueuePush.actor.cpp | 6 +- fdbserver/workloads/RYWDisable.actor.cpp | 1 - fdbserver/workloads/RYWPerformance.actor.cpp | 1 - fdbserver/workloads/ReadAfterWrite.actor.cpp | 6 +- .../workloads/ReadHotDetection.actor.cpp | 2 +- fdbserver/workloads/ReadWrite.actor.cpp | 6 +- fdbserver/workloads/SkewedReadWrite.actor.cpp | 4 +- fdbserver/workloads/SnapTest.actor.cpp | 1 - fdbserver/workloads/StreamingRead.actor.cpp | 6 +- fdbserver/workloads/Throughput.actor.cpp | 7 +- 
fdbserver/workloads/Unreadable.actor.cpp | 1 - fdbserver/workloads/VersionStamp.actor.cpp | 1 - fdbserver/workloads/WatchAndWait.actor.cpp | 1 - fdbserver/workloads/Watches.actor.cpp | 8 +- .../WatchesSameKeyCorrectness.actor.cpp | 1 - fdbserver/workloads/WriteBandwidth.actor.cpp | 8 +- .../workloads/WriteTagThrottling.actor.cpp | 9 +- flow/Knobs.cpp | 5 +- flow/include/flow/Knobs.h | 5 +- flowbench/BenchSamples.cpp | 54 +++ tests/CMakeLists.txt | 1 + 54 files changed, 572 insertions(+), 157 deletions(-) create mode 100644 fdbrpc/include/fdbrpc/DDSketch.h create mode 100644 fdbserver/workloads/BulkSetup.actor.cpp diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 0345bfee39..17c5d749f5 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -578,7 +578,7 @@ void traceTSSErrors(const char* name, UID tssId, const std::unordered_map& sample) { +void traceSSOrTSSPercentiles(TraceEvent& ev, const std::string name, DDSketch& sample) { ev.detail(name + "Mean", sample.mean()); // don't log the larger percentiles unless we actually have enough samples to log the accurate percentile instead of // the largest sample in this window @@ -595,8 +595,8 @@ void traceSSOrTSSPercentiles(TraceEvent& ev, const std::string name, ContinuousS void traceTSSPercentiles(TraceEvent& ev, const std::string name, - ContinuousSample& ssSample, - ContinuousSample& tssSample) { + DDSketch& ssSample, + DDSketch& tssSample) { ASSERT(ssSample.getPopulationSize() == tssSample.getPopulationSize()); ev.detail(name + "Count", ssSample.getPopulationSize()); if (ssSample.getPopulationSize() > 0) { @@ -1534,12 +1534,12 @@ DatabaseContext::DatabaseContext(Reference resnapshotLock, Reference deltaWritesLock, double sampleLoggingInterval, - int fileOpLatencySampleSize, - int requestLatencySampleSize) + int fileOpLatencySketchAccuracy, + int requestLatencySketchAccuracy) : cc("BlobWorkerStats", id.toString()), s3PutReqs("S3PutReqs", cc), 
s3GetReqs("S3GetReqs", cc), s3DeleteReqs("S3DeleteReqs", cc), @@ -95,10 +95,10 @@ struct BlobWorkerStats { forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), - snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySampleSize), - deltaBlobWriteLatencySample("DeltaBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySampleSize), - reSnapshotLatencySample("GranuleResnapshotMetrics", id, sampleLoggingInterval, fileOpLatencySampleSize), - readLatencySample("GranuleReadLatencyMetrics", id, sampleLoggingInterval, requestLatencySampleSize), + snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy), + deltaBlobWriteLatencySample("DeltaBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy), + reSnapshotLatencySample("GranuleResnapshotMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy), + readLatencySample("GranuleReadLatencyMetrics", id, sampleLoggingInterval, requestLatencySketchAccuracy), estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index f28da0399a..32b4a7c153 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -42,8 +42,8 @@ #include "fdbrpc/MultiInterface.h" #include "flow/TDMetric.actor.h" #include "fdbclient/EventTypes.actor.h" -#include "fdbrpc/ContinuousSample.h" #include "fdbrpc/Smoother.h" +#include "fdbrpc/DDSketch.h" class StorageServerInfo : 
public ReferencedInterface { public: @@ -565,7 +565,7 @@ public: Counter bgReadRowsCleared; Counter bgReadRowsInserted; Counter bgReadRowsUpdated; - ContinuousSample bgLatencies, bgGranulesPerRequest; + DDSketch bgLatencies, bgGranulesPerRequest; // Change Feed metrics. Omit change feed metrics from logging if not used bool usedAnyChangeFeeds; @@ -577,7 +577,7 @@ public: Counter feedPops; Counter feedPopsFallback; - ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, + DDSketch latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit; int outstandingWatches; diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index da85f88625..9b5c2b939e 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -920,8 +920,8 @@ public: std::string REDWOOD_PRIORITY_LAUNCHS; // Server request latency measurement - int LATENCY_SAMPLE_SIZE; - int FILE_LATENCY_SAMPLE_SIZE; + double LATENCY_SKETCH_ACCURACY; + double FILE_LATENCY_SKETCH_ACCURACY; double LATENCY_METRICS_LOGGING_INTERVAL; // Cluster recovery diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 8bb476efc9..23e6de902e 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -878,11 +878,11 @@ Peer::Peer(TransportData* transport, NetworkAddress const& destination) : transport(transport), destination(destination), compatible(true), outgoingConnectionIdle(true), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), peerReferences(-1), bytesReceived(0), bytesSent(0), lastDataPacketSentTime(now()), outstandingReplies(0), - pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), lastLoggedTime(0.0), + pingLatencies(destination.isPublic() ? 
FLOW_KNOBS->PING_SKETCH_ACCURACY : 0.1), lastLoggedTime(0.0), lastLoggedBytesReceived(0), lastLoggedBytesSent(0), timeoutCount(0), protocolVersion(Reference>>(new AsyncVar>())), connectOutgoingCount(0), connectIncomingCount(0), connectFailedCount(0), - connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1) { + connectLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SKETCH_ACCURACY : 0.1) { IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false)); } diff --git a/fdbrpc/include/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/include/fdbrpc/AsyncFileKAIO.actor.h index 40a84e6d9e..4925990342 100644 --- a/fdbrpc/include/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/include/fdbrpc/AsyncFileKAIO.actor.h @@ -62,15 +62,15 @@ public: LatencySample readLatencySample = { "AsyncFileKAIOReadLatency", UID(), FLOW_KNOBS->KAIO_LATENCY_LOGGING_INTERVAL, - FLOW_KNOBS->KAIO_LATENCY_SAMPLE_SIZE }; + FLOW_KNOBS->KAIO_LATENCY_SKETCH_ACCURACY }; LatencySample writeLatencySample = { "AsyncFileKAIOWriteLatency", UID(), FLOW_KNOBS->KAIO_LATENCY_LOGGING_INTERVAL, - FLOW_KNOBS->KAIO_LATENCY_SAMPLE_SIZE }; + FLOW_KNOBS->KAIO_LATENCY_SKETCH_ACCURACY }; LatencySample syncLatencySample = { "AsyncFileKAIOSyncLatency", UID(), FLOW_KNOBS->KAIO_LATENCY_LOGGING_INTERVAL, - FLOW_KNOBS->KAIO_LATENCY_SAMPLE_SIZE }; + FLOW_KNOBS->KAIO_LATENCY_SKETCH_ACCURACY }; }; static AsyncFileKAIOMetrics& getMetrics() { diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h new file mode 100644 index 0000000000..2bbe350ab8 --- /dev/null +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -0,0 +1,311 @@ +/* + * DDSketch.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DDSKETCH_H +#define DDSKETCH_H +#include +#include +#include +#pragma once + +#include +#include +#include +#include +#include "flow/Error.h" +#include "flow/UnitTest.h" + +// A namespace for fast log() computation. +namespace fastLogger { +// Basically, the goal is to compute log(x)/log(r). +// For double, it is represented as 2^e*(1+s) (0<=s<1), so our goal becomes +// e*log(2)/log(r)*log(1+s), and we approximate log(1+s) with a cubic function. +// See more details on Datadog's paper, or CubicallyInterpolatedMapping.java in +// https://github.com/DataDog/sketches-java/ +inline const double correctingFactor = 1.00988652862227438516; // = 7 / (10 * log(2)); +constexpr inline const double A = 6.0 / 35.0, B = -3.0 / 5.0, C = 10.0 / 7.0; + +inline double fastlog(double value) { + int e; + double s = frexp(value, &e); + s = s * 2 - 1; + return ((A * s + B) * s + C) * s + e - 1; +} + +inline double reverseLog(double index) { + long exponent = floor(index); + // Derived from Cardano's formula + double d0 = B * B - 3 * A * C; + double d1 = 2 * B * B * B - 9 * A * B * C - 27 * A * A * (index - exponent); + double p = cbrt((d1 - sqrt(d1 * d1 - 4 * d0 * d0 * d0)) / 2); + double significandPlusOne = -(B + p + d0 / p) / (3 * A) + 1; + return ldexp(significandPlusOne / 2, exponent + 1); +} +}; // namespace fastLogger + +// DDSketch for non-negative numbers (those < EPS = 10^-18 are +// treated as 0, and huge numbers (>1/EPS) fail ASSERT). This is the base +// class without a concrete log() implementation. 
+template +class DDSketchBase { + + static constexpr T defaultMin() { return std::numeric_limits::max(); } + + static constexpr T defaultMax() { + if constexpr (std::is_floating_point_v) { + return -std::numeric_limits::max(); + } else { + return std::numeric_limits::min(); + } + } + +public: + explicit DDSketchBase(double errorGuarantee) + : errorGuarantee(errorGuarantee), populationSize(0), zeroPopulationSize(0), minValue(defaultMin()), + maxValue(defaultMax()), sum(T()) {} + + DDSketchBase& addSample(T sample) { + // Call it addSample for now, while it is not a sample anymore + if (!populationSize) + minValue = maxValue = sample; + + if (sample <= EPS) { + zeroPopulationSize++; + } else { + int index = static_cast(this)->getIndex(sample); + assert(index >= 0 && index < buckets.size()); + buckets[index]++; + } + + populationSize++; + sum += sample; + maxValue = std::max(maxValue, sample); + minValue = std::min(minValue, sample); + return *this; + } + + double mean() const { + if (populationSize == 0) + return 0; + return (double)sum / populationSize; + } + + T median() { return percentile(0.5); } + + T percentile(double percentile) { + assert(percentile >= 0 && percentile <= 1); + + if (populationSize == 0) + return T(); + uint64_t targetPercentilePopulation = percentile * (populationSize - 1); + // Now find the tPP-th (0-indexed) element + if (targetPercentilePopulation < zeroPopulationSize) + return T(0); + + int index = -1; + [[maybe_unused]] bool found = false; + if (percentile <= 0.5) { // count up + uint64_t count = zeroPopulationSize; + for (size_t i = 0; i < buckets.size(); i++) { + if (targetPercentilePopulation < count + buckets[i]) { + // count + buckets[i] = # of numbers so far (from the rightmost to + // this bucket, inclusive), so if target is in this bucket, it should + // means tPP < cnt + bck[i] + found = true; + index = i; + break; + } + count += buckets[i]; + } + } else { // and count down + uint64_t count = 0; + for (auto rit = 
buckets.rbegin(); rit != buckets.rend(); rit++) { + if (targetPercentilePopulation + count + *rit >= populationSize) { + // cnt + bkt[i] is # of numbers to the right of this bucket (incl.), + // so if target is not in this bucket (i.e., to the left of this + // bucket), it would be as right as the left bucket's rightmost + // number, so we would have tPP + cnt + bkt[i] < total population (tPP + // is 0-indexed), that means target is in this bucket if this + // condition is not satisfied. + found = true; + index = std::distance(rit, buckets.rend()) - 1; + break; + } + count += *rit; + } + } + assert(found); + return static_cast(this)->getValue(index); + } + + T min() const { return minValue; } + T max() const { return maxValue; } + + void clear() { + std::fill(buckets.begin(), buckets.end(), 0); + populationSize = zeroPopulationSize = 0; + sum = 0; + minValue = defaultMin(); + maxValue = defaultMax(); + } + + uint64_t getPopulationSize() const { return populationSize; } + + double getErrorGuarantee() const { return errorGuarantee; } + + size_t getBucketSize() const { return buckets.size(); } + + DDSketchBase& mergeWith(const DDSketchBase& anotherSketch) { + // Must have the same guarantee + assert(fabs(errorGuarantee - anotherSketch.errorGuarantee) < EPS && + anotherSketch.buckets.size() == buckets.size()); + for (size_t i = 0; i < anotherSketch.buckets.size(); i++) { + buckets[i] += anotherSketch.buckets[i]; + } + populationSize += anotherSketch.populationSize; + zeroPopulationSize += anotherSketch.zeroPopulationSize; + minValue = std::min(minValue, anotherSketch.minValue); + maxValue = std::max(maxValue, anotherSketch.maxValue); + sum += anotherSketch.sum; + return *this; + } + + constexpr static double EPS = 1e-18; // smaller numbers are considered as 0 +protected: + double errorGuarantee; // As defined in the paper + + uint64_t populationSize, zeroPopulationSize; // we need to separately count 0s + std::vector buckets; + T minValue, maxValue, sum; + void 
setBucketSize(int capacity) { buckets.resize(capacity, 0); } +}; + +// DDSketch with fast log implementation for float numbers +template +class DDSketch : public DDSketchBase, T> { +public: + explicit DDSketch(double errorGuarantee = 0.1) + : DDSketchBase, T>(errorGuarantee), gamma((1.0 + errorGuarantee) / (1.0 - errorGuarantee)), + multiplier(fastLogger::correctingFactor * log(2) / log(gamma)) { + offset = getIndex(1.0 / DDSketchBase, T>::EPS); + this->setBucketSize(2 * offset); + } + + int getIndex(T sample) { + static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); + return ceil(fastLogger::fastlog(sample) * multiplier) + offset; + } + + T getValue(int index) { return fastLogger::reverseLog((index - offset) / multiplier) * 2.0 / (1 + gamma); } + +private: + double gamma, multiplier; + int offset = 0; +}; + +// DDSketch with log. Slow; only use this when the others don't work. +template +class DDSketchSlow : public DDSketchBase, T> { +public: + DDSketchSlow(double errorGuarantee = 0.1) + : DDSketchBase, T>(errorGuarantee), gamma((1.0 + errorGuarantee) / (1.0 - errorGuarantee)), + logGamma(log(gamma)) { + offset = getIndex(1.0 / DDSketchBase, T>::EPS) + 5; + this->setBucketSize(2 * offset); + } + + int getIndex(T sample) { return ceil(log(sample) / logGamma) + offset; } + + T getValue(int index) { return (T)(2.0 * pow(gamma, (index - offset)) / (1 + gamma)); } + +private: + double gamma, logGamma; + int offset = 0; +}; + +// DDSketch for unsigned int. Faster than the float version. Fixed accuracy. +class DDSketchFastUnsigned : public DDSketchBase { +public: + DDSketchFastUnsigned() : DDSketchBase(errorGuarantee) { this->setBucketSize(129); } + + int getIndex(unsigned sample) { + __uint128_t v = sample; + v *= v; + v *= v; // sample^4 + uint64_t low = (uint64_t)v, high = (uint64_t)(v >> 64); + + return 128 - (high == 0 ? ((low == 0 ? 
64 : __builtin_clzll(low)) + 64) : __builtin_clzll(high)); + } + + unsigned getValue(int index) { + double r = 1, g = gamma; + while (index) { // quick power method for power(gamma, index) + if (index & 1) + r *= g; + g *= g; + index >>= 1; + } + // 2.0 * pow(gamma, index) / (1 + gamma) is what we need + return (unsigned)(2.0 * r / (1 + gamma) + 0.5); // round to nearest int + } + +private: + constexpr static double errorGuarantee = 0.08642723372; + // getIndex basically calc floor(log_2(x^4)) + 1, + // which is almost ceil(log_2(x^4)) as it only matters when x is a power of 2, + // and it does not change the error bound. Original sketch asks for + // ceil(log_r(x)), so we know r = pow(2, 1/4) = 1.189207115. And r = (1 + eG) + // / (1 - eG) so eG = 0.08642723372. + constexpr static double gamma = 1.189207115; +}; + +#endif + +TEST_CASE("/fdbrpc/ddsketch/accuracy") { + + int TRY = 100, SIZE = 1e6; + const int totalPercentiles = 7; + double targetPercentiles[totalPercentiles] = { .0001, .01, .1, .50, .90, .99, .9999 }; + double stat[totalPercentiles] = { 0 }; + for (int t = 0; t < TRY; t++) { + DDSketch dd; + std::vector nums; + for (int i = 0; i < SIZE; i++) { + static double a = 1, b = 1; // a skewed distribution + auto y = deterministicRandom()->random01(); + auto num = b / pow(1 - y, 1 / a); + nums.push_back(num); + dd.addSample(num); + } + std::sort(nums.begin(), nums.end()); + for (int percentID = 0; percentID < totalPercentiles; percentID++) { + double percentile = targetPercentiles[percentID]; + double ground = nums[percentile * (SIZE - 1)], ddvalue = dd.percentile(percentile); + double relativeError = fabs(ground - ddvalue) / ground; + stat[percentID] += relativeError; + } + } + + for (int percentID = 0; percentID < totalPercentiles; percentID++) { + printf("%.4lf per, relative error %.4lf\n", targetPercentiles[percentID], stat[percentID] / TRY); + } + + return Void(); +} diff --git a/fdbrpc/include/fdbrpc/FlowTransport.h 
b/fdbrpc/include/fdbrpc/FlowTransport.h index 0f0c3a52e5..d8b4652e20 100644 --- a/fdbrpc/include/fdbrpc/FlowTransport.h +++ b/fdbrpc/include/fdbrpc/FlowTransport.h @@ -24,7 +24,7 @@ #include -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbrpc/HealthMonitor.h" #include "flow/genericactors.actor.h" #include "flow/network.h" @@ -159,7 +159,7 @@ struct Peer : public ReferenceCounted { int64_t bytesSent; double lastDataPacketSentTime; int outstandingReplies; - ContinuousSample pingLatencies; + DDSketch pingLatencies; double lastLoggedTime; int64_t lastLoggedBytesReceived; int64_t lastLoggedBytesSent; @@ -171,7 +171,7 @@ struct Peer : public ReferenceCounted { int connectOutgoingCount; int connectIncomingCount; int connectFailedCount; - ContinuousSample connectLatencies; + DDSketch connectLatencies; Promise disconnect; explicit Peer(TransportData* transport, NetworkAddress const& destination); diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index 2247299580..46d334c6e0 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -38,7 +38,7 @@ MyCounters() : foo("foo", cc), bar("bar", cc), baz("baz", cc) {} #include #include "flow/flow.h" #include "flow/TDMetric.actor.h" -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" struct ICounter { // All counters have a name and value @@ -216,40 +216,39 @@ public: class LatencySample { public: - LatencySample(std::string name, UID id, double loggingInterval, int sampleSize) - : name(name), id(id), sampleStart(now()), sample(sampleSize), - latencySampleEventHolder(makeReference(id.toString() + "/" + name)) { + LatencySample(std::string name, UID id, double loggingInterval, double accuracy) + : name(name), id(id), sampleStart(now()), sketch(accuracy) { logger = recurring([this]() { logSample(); }, loggingInterval); } - void addMeasurement(double measurement) { sample.addSample(measurement); } + void addMeasurement(double measurement) { 
sketch.addSample(measurement); } private: std::string name; UID id; double sampleStart; - ContinuousSample sample; + DDSketch sketch; Future logger; Reference latencySampleEventHolder; void logSample() { TraceEvent(name.c_str(), id) - .detail("Count", sample.getPopulationSize()) + .detail("Count", sketch.getPopulationSize()) .detail("Elapsed", now() - sampleStart) - .detail("Min", sample.min()) - .detail("Max", sample.max()) - .detail("Mean", sample.mean()) - .detail("Median", sample.median()) - .detail("P25", sample.percentile(0.25)) - .detail("P90", sample.percentile(0.9)) - .detail("P95", sample.percentile(0.95)) - .detail("P99", sample.percentile(0.99)) - .detail("P99.9", sample.percentile(0.999)) - .trackLatest(latencySampleEventHolder->trackingKey); + .detail("Min", sketch.min()) + .detail("Max", sketch.max()) + .detail("Mean", sketch.mean()) + .detail("Median", sketch.median()) + .detail("P25", sketch.percentile(0.25)) + .detail("P90", sketch.percentile(0.9)) + .detail("P95", sketch.percentile(0.95)) + .detail("P99", sketch.percentile(0.99)) + .detail("P99.9", sketch.percentile(0.999)) + .trackLatest(id.toString() + "/" + name); - sample.clear(); + sketch.clear(); sampleStart = now(); } }; diff --git a/fdbrpc/include/fdbrpc/TSSComparison.h b/fdbrpc/include/fdbrpc/TSSComparison.h index 3c0765c948..7fcc84499b 100644 --- a/fdbrpc/include/fdbrpc/TSSComparison.h +++ b/fdbrpc/include/fdbrpc/TSSComparison.h @@ -25,7 +25,6 @@ #ifndef FDBRPC_TSS_COMPARISON_H #define FDBRPC_TSS_COMPARISON_H -#include "fdbrpc/ContinuousSample.h" #include "fdbrpc/Stats.h" // refcounted + noncopyable because both DatabaseContext and individual endpoints share ownership @@ -48,15 +47,15 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { Counter mismatches; // We could probably just ignore getKey as it's seldom used? 
- ContinuousSample SSgetValueLatency; - ContinuousSample SSgetKeyLatency; - ContinuousSample SSgetKeyValuesLatency; - ContinuousSample SSgetMappedKeyValuesLatency; + DDSketch SSgetValueLatency; + DDSketch SSgetKeyLatency; + DDSketch SSgetKeyValuesLatency; + DDSketch SSgetMappedKeyValuesLatency; - ContinuousSample TSSgetValueLatency; - ContinuousSample TSSgetKeyLatency; - ContinuousSample TSSgetKeyValuesLatency; - ContinuousSample TSSgetMappedKeyValuesLatency; + DDSketch TSSgetValueLatency; + DDSketch TSSgetKeyLatency; + DDSketch TSSgetKeyValuesLatency; + DDSketch TSSgetMappedKeyValuesLatency; std::unordered_map ssErrorsByCode; std::unordered_map tssErrorsByCode; diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index ccd7284fe1..157c34bf72 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -303,8 +303,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted { resnapshotLock, deltaWritesLock, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->FILE_LATENCY_SAMPLE_SIZE, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->FILE_LATENCY_SKETCH_ACCURACY, + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), isEncryptionEnabled(isEncryptionOpSupported(EncryptOperationType::BLOB_GRANULE_ENCRYPTION)) {} bool managerEpochOk(int64_t epoch) { @@ -1750,7 +1750,6 @@ bool granuleCanRetry(const Error& e) { case error_code_http_request_failed: case error_code_connection_failed: case error_code_lookup_failed: // dns - case error_code_platform_error: // injected faults return true; default: return false; diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index 7eb8b264a5..37c085db98 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -244,15 +244,15 @@ public: kmsLookupByIdsReqLatency("EKPKmsLookupByIdsReqLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), 
kmsLookupByDomainIdsReqLatency("EKPKmsLookupByDomainIdsReqLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), kmsBlobMetadataReqLatency("EKPKmsBlobMetadataReqLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE) {} + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY) {} EncryptBaseCipherDomainIdKeyIdCacheKey getBaseCipherDomainIdKeyIdCacheKey( const EncryptCipherDomainId domainId, diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index ebca499738..49fbe4445b 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -117,20 +117,20 @@ struct GrvProxyStats { defaultTxnGRVTimeInQueue("DefaultTxnGRVTimeInQueue", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), batchTxnGRVTimeInQueue("BatchTxnGRVTimeInQueue", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), grvLatencyBands("GRVLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY), grvLatencySample("GRVLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), grvBatchLatencySample("GRVBatchLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), recentRequests(0), lastBucketBegin(now()), bucketInterval(FLOW_KNOBS->BASIC_LOAD_BALANCE_UPDATE_RATE / FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS), grvConfirmEpochLiveDist( @@ -215,7 +215,7 @@ struct GrvProxyData { versionVectorSizeOnGRVReply("VersionVectorSizeOnGRVReply", dbgid, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), updateCommitRequests(0), lastCommitTime(0), 
version(0), minKnownCommittedVersion(invalidVersion), tagThrottler(SERVER_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION) {} }; diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 5bf5efd1d8..490f2bfa4b 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -111,15 +111,15 @@ SharedRocksDBState::SharedRocksDBState(UID id) readOptions(initialReadOptions()), commitLatency(LatencySample("RocksDBCommitLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE)), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY)), commitQueueLatency(LatencySample("RocksDBCommitQueueLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE)), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY)), dbWriteLatency(LatencySample("RocksDBWriteLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE)) {} + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY)) {} rocksdb::ColumnFamilyOptions SharedRocksDBState::initialCfOptions() { rocksdb::ColumnFamilyOptions options; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index dd2e83e360..1c5ddcab0c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1825,7 +1825,7 @@ Future tLogPeekMessages(PromiseType replyPromise, UID ssID = nondeterministicRandom()->randomUniqueID(); std::string s = "BlockingPeekLatencies-" + reqTag.toString(); logData->blockingPeekLatencies.try_emplace( - reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE); + reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SKETCH_ACCURACY); } LatencySample& sample = logData->blockingPeekLatencies.at(reqTag); sample.addMeasurement(latency); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 24c9f47cbe..30c80c128a 100644 --- 
a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -21,7 +21,7 @@ #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/Tuple.h" -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbrpc/simulator.h" #include "fdbserver/DeltaTree.h" #include "fdbserver/IKeyValueStore.h" diff --git a/fdbserver/include/fdbserver/ProxyCommitData.actor.h b/fdbserver/include/fdbserver/ProxyCommitData.actor.h index 379f13bc51..d8db57a650 100644 --- a/fdbserver/include/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/include/fdbserver/ProxyCommitData.actor.h @@ -121,20 +121,20 @@ struct ProxyStats { commitLatencySample("CommitLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), commitLatencyBands("CommitLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY), commitBatchingEmptyMessageRatio("CommitBatchingEmptyMessageRatio", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), commitBatchingWindowSize("CommitBatchingWindowSize", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), computeLatency("ComputeLatency", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), maxComputeNS(0), minComputeNS(1e12), commitBatchQueuingDist( Histogram::getHistogram("CommitProxy"_sr, "CommitBatchQueuing"_sr, Histogram::Unit::microseconds)), diff --git a/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h b/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h index 89bc36f393..5323235795 100644 --- a/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h +++ b/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h @@ -25,6 +25,7 @@ #elif 
!defined(FDBSERVER_READWRITEWORKLOAD_ACTOR_H) #define FDBSERVER_READWRITEWORKLOAD_ACTOR_H +#include "fdbrpc/DDSketch.h" #include "fdbserver/workloads/workloads.actor.h" #include "flow/TDMetric.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. @@ -75,7 +76,7 @@ struct ReadWriteCommon : KVWorkload { EventMetricHandle transactionFailureMetric; EventMetricHandle readMetric; PerfIntCounter aTransactions, bTransactions, retries; - ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies; + DDSketch latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies; double readLatencyTotal; int readLatencyCount; std::vector periodicMetrics; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 69a02e2bc2..2a1f0ee94b 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -102,17 +102,17 @@ struct MasterData : NonCopyable, ReferenceCounted { versionVectorTagUpdates("VersionVectorTagUpdates", dbgid, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), waitForPrevCommitRequests("WaitForPrevCommitRequests", cc), nonWaitForPrevCommitRequests("NonWaitForPrevCommitRequests", cc), versionVectorSizeOnCVReply("VersionVectorSizeOnCVReply", dbgid, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), waitForPrevLatencies("WaitForPrevLatencies", dbgid, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), addActor(addActor) { logger = cc.traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "MasterMetrics"); if (forceRecovery && !myInterface.locality.dcId().present()) { diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index a6bb04860b..db5c63f9e8 100644 --- 
a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1266,48 +1266,48 @@ public: readLatencySample("ReadLatencyMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), readKeyLatencySample("GetKeyMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), readValueLatencySample("GetValueMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), readRangeLatencySample("GetRangeMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), readVersionWaitSample("ReadVersionWaitMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), readQueueWaitSample("ReadQueueWaitMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY), mappedRangeSample("GetMappedRangeMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), mappedRangeRemoteSample("GetMappedRangeRemoteMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), mappedRangeLocalSample("GetMappedRangeLocalMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), kvReadRangeLatencySample("KVGetRangeMetrics", 
self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), updateLatencySample("UpdateLatencyMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE) { + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY) { specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; }); specialCounter(cc, "Version", [self]() { return self->version.get(); }); specialCounter(cc, "StorageVersion", [self]() { return self->storageVersion(); }); diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index e679dce9f9..662d4d30ae 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" diff --git a/fdbserver/workloads/BackgroundSelectors.actor.cpp b/fdbserver/workloads/BackgroundSelectors.actor.cpp index 2b55db49ad..5f255535d3 100644 --- a/fdbserver/workloads/BackgroundSelectors.actor.cpp +++ b/fdbserver/workloads/BackgroundSelectors.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" diff --git a/fdbserver/workloads/BulkLoad.actor.cpp b/fdbserver/workloads/BulkLoad.actor.cpp index cc5447c321..684108d811 100644 --- a/fdbserver/workloads/BulkLoad.actor.cpp +++ b/fdbserver/workloads/BulkLoad.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -34,11 +34,10 @@ struct BulkLoadWorkload : TestWorkload { std::vector> clients; PerfIntCounter transactions, retries; - ContinuousSample latencies; + DDSketch latencies; BulkLoadWorkload(WorkloadContext const& wcx) - : TestWorkload(wcx), clientCount(wcx.clientCount), transactions("Transactions"), retries("Retries"), - latencies(2000) { + : TestWorkload(wcx), clientCount(wcx.clientCount), transactions("Transactions"), retries("Retries"), latencies() { testDuration = getOption(options, "testDuration"_sr, 10.0); actorCount = getOption(options, "actorCount"_sr, 20); writesPerTransaction = getOption(options, "writesPerTransaction"_sr, 10); diff --git a/fdbserver/workloads/BulkSetup.actor.cpp b/fdbserver/workloads/BulkSetup.actor.cpp new file mode 100644 index 0000000000..59389cf0f7 --- /dev/null +++ b/fdbserver/workloads/BulkSetup.actor.cpp @@ -0,0 +1,74 @@ +/* + * BulkSetup.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbserver/TesterInterface.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/workloads/BulkSetup.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +struct BulkSetupWorkload : TestWorkload { + + std::vector tenantNames; + int nodeCount; + double transactionsPerSecond; + Key keyPrefix; + + BulkSetupWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + transactionsPerSecond = getOption(options, "transactionsPerSecond"_sr, 5000.0) / clientCount; + nodeCount = getOption(options, "nodeCount"_sr, transactionsPerSecond * clientCount); + keyPrefix = unprintable(getOption(options, "keyPrefix"_sr, LiteralStringRef("")).toString()); + std::vector tenants = getOption(options, "tenants"_sr, std::vector()); + for (std::string tenant : tenants) { + tenantNames.push_back(TenantName(tenant)); + } + } + + std::string description() const override { return "BulkSetup"; } + + void getMetrics(std::vector& m) override {} + + Key keyForIndex(int n) { return key(n); } + Key key(int n) { return doubleToTestKey((double)n / nodeCount, keyPrefix); } + Value value(int n) { return doubleToTestKey(n, keyPrefix); } + + Standalone operator()(int n) { return KeyValueRef(key(n), value((n + 1) % nodeCount)); } + + Future start(Database const& cx) override { + return bulkSetup(cx, + this, + nodeCount, + Promise(), + false, + 0.0, + 1e12, + std::vector(), + Promise>>(), + 0, + 0.1, + 0, + 0, + this->tenantNames); + } + + Future check(Database const& cx) override { return true; } +}; + +WorkloadFactory BulkSetupWorkloadFactory("BulkSetup"); diff --git a/fdbserver/workloads/DDBalance.actor.cpp b/fdbserver/workloads/DDBalance.actor.cpp index efd3dbfb84..95fe5143b1 100644 --- a/fdbserver/workloads/DDBalance.actor.cpp +++ b/fdbserver/workloads/DDBalance.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -33,10 +33,10 @@ struct DDBalanceWorkload : TestWorkload { std::vector> clients; PerfIntCounter bin_shifts, operations, retries; - ContinuousSample latencies; + DDSketch latencies; DDBalanceWorkload(WorkloadContext const& wcx) - : TestWorkload(wcx), bin_shifts("Bin_Shifts"), operations("Operations"), retries("Retries"), latencies(2000) { + : TestWorkload(wcx), bin_shifts("Bin_Shifts"), operations("Operations"), retries("Retries"), latencies() { testDuration = getOption(options, "testDuration"_sr, 10.0); binCount = getOption(options, "binCount"_sr, 1000); writesPerTransaction = getOption(options, "writesPerTransaction"_sr, 1); diff --git a/fdbserver/workloads/FastTriggeredWatches.actor.cpp b/fdbserver/workloads/FastTriggeredWatches.actor.cpp index 32ba9ed1a6..b689adc3cf 100644 --- a/fdbserver/workloads/FastTriggeredWatches.actor.cpp +++ b/fdbserver/workloads/FastTriggeredWatches.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbclient/ReadYourWrites.h" diff --git a/fdbserver/workloads/FileSystem.actor.cpp b/fdbserver/workloads/FileSystem.actor.cpp index 722c1c59ac..b0cfbdeedc 100644 --- a/fdbserver/workloads/FileSystem.actor.cpp +++ b/fdbserver/workloads/FileSystem.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -33,8 +33,8 @@ struct FileSystemWorkload : TestWorkload { std::vector> clients; PerfIntCounter queries, writes; - ContinuousSample latencies; - ContinuousSample writeLatencies; + DDSketch latencies; + DDSketch writeLatencies; class FileSystemOp { public: @@ -44,7 +44,7 @@ struct FileSystemWorkload : TestWorkload { }; FileSystemWorkload(WorkloadContext const& wcx) - : TestWorkload(wcx), queries("Queries"), writes("Latency"), latencies(2500), writeLatencies(1000) { + : TestWorkload(wcx), queries("Queries"), writes("Latency"), latencies(), writeLatencies() { testDuration = getOption(options, "testDuration"_sr, 10.0); transactionsPerSecond = getOption(options, "transactionsPerSecond"_sr, 5000.0) / clientCount; double allowedLatency = getOption(options, "allowedLatency"_sr, 0.250); diff --git a/fdbserver/workloads/IndexScan.actor.cpp b/fdbserver/workloads/IndexScan.actor.cpp index b0297a1c31..d5f8a57db4 100644 --- a/fdbserver/workloads/IndexScan.actor.cpp +++ b/fdbserver/workloads/IndexScan.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" diff --git a/fdbserver/workloads/LowLatency.actor.cpp b/fdbserver/workloads/LowLatency.actor.cpp index 65ea46e750..3ac468ad9a 100644 --- a/fdbserver/workloads/LowLatency.actor.cpp +++ b/fdbserver/workloads/LowLatency.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/IKnobCollection.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" diff --git a/fdbserver/workloads/Mako.actor.cpp b/fdbserver/workloads/Mako.actor.cpp index ff30bbb2fd..7f18cd67b0 100644 --- a/fdbserver/workloads/Mako.actor.cpp +++ b/fdbserver/workloads/Mako.actor.cpp @@ -63,7 +63,7 @@ struct MakoWorkload : TestWorkload { // used for periodically tracing std::vector periodicMetrics; // store latency of each operation with sampling - std::vector> opLatencies; + std::vector> opLatencies; // key used to store checkSum for given key range std::vector csKeys; // key prefix of for all generated keys @@ -142,7 +142,7 @@ struct MakoWorkload : TestWorkload { parseOperationsSpec(); for (int i = 0; i < MAX_OP; ++i) { // initilize per-operation latency record - opLatencies.push_back(ContinuousSample(rowCount / sampleSize)); + opLatencies.push_back(DDSketch()); // initialize per-operation counter opCounters.push_back(PerfIntCounter(opNames[i])); } @@ -658,7 +658,7 @@ struct MakoWorkload : TestWorkload { return Void(); } ACTOR template - static Future logLatency(Future f, ContinuousSample* opLatencies) { + static Future logLatency(Future f, DDSketch* opLatencies) { state double opBegin = timer(); wait(success(f)); opLatencies->addSample(timer() - opBegin); diff --git a/fdbserver/workloads/MemoryLifetime.actor.cpp b/fdbserver/workloads/MemoryLifetime.actor.cpp index a206fbb7d2..8eb59b37a3 100644 --- a/fdbserver/workloads/MemoryLifetime.actor.cpp +++ b/fdbserver/workloads/MemoryLifetime.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "flow/DeterministicRandom.h" diff --git a/fdbserver/workloads/MetricLogging.actor.cpp b/fdbserver/workloads/MetricLogging.actor.cpp index 817727a6c7..4b3ce6b97e 100644 --- a/fdbserver/workloads/MetricLogging.actor.cpp +++ b/fdbserver/workloads/MetricLogging.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "flow/TDMetric.actor.h" diff --git a/fdbserver/workloads/QueuePush.actor.cpp b/fdbserver/workloads/QueuePush.actor.cpp index c7963a6b29..36d511d970 100644 --- a/fdbserver/workloads/QueuePush.actor.cpp +++ b/fdbserver/workloads/QueuePush.actor.cpp @@ -19,7 +19,7 @@ */ #include -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -38,10 +38,10 @@ struct QueuePushWorkload : TestWorkload { std::vector> clients; PerfIntCounter transactions, retries; - ContinuousSample commitLatencies, GRVLatencies; + DDSketch commitLatencies, GRVLatencies; QueuePushWorkload(WorkloadContext const& wcx) - : TestWorkload(wcx), transactions("Transactions"), retries("Retries"), commitLatencies(2000), GRVLatencies(2000) { + : TestWorkload(wcx), transactions("Transactions"), retries("Retries"), commitLatencies(), GRVLatencies() { testDuration = getOption(options, "testDuration"_sr, 10.0); actorCount = getOption(options, "actorCount"_sr, 50); diff --git a/fdbserver/workloads/RYWDisable.actor.cpp b/fdbserver/workloads/RYWDisable.actor.cpp index 6d9d6a67bd..cbd5dfc818 100644 --- a/fdbserver/workloads/RYWDisable.actor.cpp +++ b/fdbserver/workloads/RYWDisable.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbclient/ReadYourWrites.h" diff --git a/fdbserver/workloads/RYWPerformance.actor.cpp b/fdbserver/workloads/RYWPerformance.actor.cpp index 21683439b7..ca45b0d482 100644 --- a/fdbserver/workloads/RYWPerformance.actor.cpp +++ b/fdbserver/workloads/RYWPerformance.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbclient/ReadYourWrites.h" diff --git a/fdbserver/workloads/ReadAfterWrite.actor.cpp b/fdbserver/workloads/ReadAfterWrite.actor.cpp index 219f30975f..ed080722d0 100644 --- a/fdbserver/workloads/ReadAfterWrite.actor.cpp +++ b/fdbserver/workloads/ReadAfterWrite.actor.cpp @@ -25,8 +25,6 @@ #include "flow/genericactors.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. -static constexpr int SAMPLE_SIZE = 10000; - // If the log->storage propagation delay is longer than 1 second, then it's likely that our read // will see a `future_version` error from the storage server. We need to retry the read until // a value is returned, or a different error is thrown. 
@@ -51,9 +49,9 @@ struct ReadAfterWriteWorkload : KVWorkload { static constexpr auto NAME = "ReadAfterWrite"; double testDuration; - ContinuousSample propagationLatency; + DDSketch propagationLatency; - ReadAfterWriteWorkload(WorkloadContext const& wcx) : KVWorkload(wcx), propagationLatency(SAMPLE_SIZE) { + ReadAfterWriteWorkload(WorkloadContext const& wcx) : KVWorkload(wcx), propagationLatency() { testDuration = getOption(options, "testDuration"_sr, 10.0); } diff --git a/fdbserver/workloads/ReadHotDetection.actor.cpp b/fdbserver/workloads/ReadHotDetection.actor.cpp index 7779d3a6b2..0bae939bb4 100644 --- a/fdbserver/workloads/ReadHotDetection.actor.cpp +++ b/fdbserver/workloads/ReadHotDetection.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 1571e084e5..6d2e37b003 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -23,7 +23,7 @@ #include #include "fdbclient/FDBTypes.h" -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h" @@ -200,7 +200,7 @@ struct ReadWriteCommonImpl { } } ACTOR static Future logLatency(Future> f, - ContinuousSample* latencies, + DDSketch* latencies, double* totalLatency, int* latencyCount, EventMetricHandle readMetric, @@ -220,7 +220,7 @@ struct ReadWriteCommonImpl { return Void(); } ACTOR static Future logLatency(Future f, - ContinuousSample* latencies, + DDSketch* latencies, double* totalLatency, int* latencyCount, EventMetricHandle readMetric, diff --git a/fdbserver/workloads/SkewedReadWrite.actor.cpp 
b/fdbserver/workloads/SkewedReadWrite.actor.cpp index 20fcf24233..ba37e7ac42 100644 --- a/fdbserver/workloads/SkewedReadWrite.actor.cpp +++ b/fdbserver/workloads/SkewedReadWrite.actor.cpp @@ -22,7 +22,7 @@ #include #include -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h" @@ -389,4 +389,4 @@ TEST_CASE("/KVWorkload/methods/ParseKeyForIndex") { ASSERT(parse == idx); } return Void(); -} \ No newline at end of file +} diff --git a/fdbserver/workloads/SnapTest.actor.cpp b/fdbserver/workloads/SnapTest.actor.cpp index e69c5ab9c2..f1256e6f5f 100644 --- a/fdbserver/workloads/SnapTest.actor.cpp +++ b/fdbserver/workloads/SnapTest.actor.cpp @@ -23,7 +23,6 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/ReadYourWrites.h" #include "fdbclient/SystemData.h" -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/SimpleIni.h" #include "fdbserver/Status.actor.h" #include "fdbserver/TesterInterface.actor.h" diff --git a/fdbserver/workloads/StreamingRead.actor.cpp b/fdbserver/workloads/StreamingRead.actor.cpp index da8656d2b3..3031f9bf56 100644 --- a/fdbserver/workloads/StreamingRead.actor.cpp +++ b/fdbserver/workloads/StreamingRead.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -37,11 +37,11 @@ struct StreamingReadWorkload : TestWorkload { std::vector> clients; PerfIntCounter transactions, readKeys; PerfIntCounter readValueBytes; - ContinuousSample latencies; + DDSketch latencies; StreamingReadWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), transactions("Transactions"), readKeys("Keys Read"), readValueBytes("Value Bytes Read"), - latencies(2000) { + latencies() { testDuration = getOption(options, "testDuration"_sr, 10.0); actorCount = getOption(options, "actorCount"_sr, 20); readsPerTransaction = getOption(options, "readsPerTransaction"_sr, 10); diff --git a/fdbserver/workloads/Throughput.actor.cpp b/fdbserver/workloads/Throughput.actor.cpp index fdee65878d..b5cde15078 100644 --- a/fdbserver/workloads/Throughput.actor.cpp +++ b/fdbserver/workloads/Throughput.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h" @@ -189,12 +189,11 @@ struct MeasureSinglePeriod : IMeasurer { double delay, duration; double startT; - ContinuousSample totalLatency, grvLatency, rowReadLatency, commitLatency; + DDSketch totalLatency, grvLatency, rowReadLatency, commitLatency; ITransactor::Stats stats; // totalled over the period MeasureSinglePeriod(double delay, double duration) - : delay(delay), duration(duration), totalLatency(2000), grvLatency(2000), rowReadLatency(2000), - commitLatency(2000) {} + : delay(delay), duration(duration), totalLatency(), grvLatency(), rowReadLatency(), commitLatency() {} Future start() override { startT = now(); diff --git a/fdbserver/workloads/Unreadable.actor.cpp b/fdbserver/workloads/Unreadable.actor.cpp index 5f541e647f..3436990a30 100644 --- a/fdbserver/workloads/Unreadable.actor.cpp +++ b/fdbserver/workloads/Unreadable.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" diff --git a/fdbserver/workloads/VersionStamp.actor.cpp b/fdbserver/workloads/VersionStamp.actor.cpp index 82e7d38c47..b542e3ad04 100644 --- a/fdbserver/workloads/VersionStamp.actor.cpp +++ b/fdbserver/workloads/VersionStamp.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" diff --git a/fdbserver/workloads/WatchAndWait.actor.cpp b/fdbserver/workloads/WatchAndWait.actor.cpp index 4dfd852422..bea4f6be23 100644 --- a/fdbserver/workloads/WatchAndWait.actor.cpp +++ b/fdbserver/workloads/WatchAndWait.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" diff --git a/fdbserver/workloads/Watches.actor.cpp b/fdbserver/workloads/Watches.actor.cpp index f2bafae6b0..7175cf2565 100644 --- a/fdbserver/workloads/Watches.actor.cpp +++ b/fdbserver/workloads/Watches.actor.cpp @@ -18,15 +18,13 @@ * limitations under the License. */ -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "flow/DeterministicRandom.h" #include "fdbserver/workloads/workloads.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-const int sampleSize = 10000; - struct WatchesWorkload : TestWorkload { static constexpr auto NAME = "Watches"; @@ -34,10 +32,10 @@ struct WatchesWorkload : TestWorkload { double testDuration; std::vector> clients; PerfIntCounter cycles; - ContinuousSample cycleLatencies; + DDSketch cycleLatencies; std::vector nodeOrder; - WatchesWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), cycles("Cycles"), cycleLatencies(sampleSize) { + WatchesWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), cycles("Cycles"), cycleLatencies() { testDuration = getOption(options, "testDuration"_sr, 600.0); nodes = getOption(options, "nodeCount"_sr, 100); extraPerNode = getOption(options, "extraPerNode"_sr, 1000); diff --git a/fdbserver/workloads/WatchesSameKeyCorrectness.actor.cpp b/fdbserver/workloads/WatchesSameKeyCorrectness.actor.cpp index 01e74303cd..04380a7472 100644 --- a/fdbserver/workloads/WatchesSameKeyCorrectness.actor.cpp +++ b/fdbserver/workloads/WatchesSameKeyCorrectness.actor.cpp @@ -18,7 +18,6 @@ * limitations under the License. 
*/ -#include "fdbrpc/ContinuousSample.h" #include "fdbclient/ReadYourWrites.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" diff --git a/fdbserver/workloads/WriteBandwidth.actor.cpp b/fdbserver/workloads/WriteBandwidth.actor.cpp index 4446ab676a..75bab9720c 100644 --- a/fdbserver/workloads/WriteBandwidth.actor.cpp +++ b/fdbserver/workloads/WriteBandwidth.actor.cpp @@ -20,7 +20,7 @@ #include -#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/DDSketch.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h" @@ -37,11 +37,11 @@ struct WriteBandwidthWorkload : KVWorkload { std::vector> clients; PerfIntCounter transactions, retries; - ContinuousSample commitLatencies, GRVLatencies; + DDSketch commitLatencies, GRVLatencies; WriteBandwidthWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), loadTime(0.0), transactions("Transactions"), retries("Retries"), commitLatencies(2000), - GRVLatencies(2000) { + : KVWorkload(wcx), loadTime(0.0), transactions("Transactions"), retries("Retries"), commitLatencies(), + GRVLatencies() { testDuration = getOption(options, "testDuration"_sr, 10.0); keysPerTransaction = getOption(options, "keysPerTransaction"_sr, 100); valueString = std::string(maxValueBytes, '.'); diff --git a/fdbserver/workloads/WriteTagThrottling.actor.cpp b/fdbserver/workloads/WriteTagThrottling.actor.cpp index 043bae5e0f..8f53d360d3 100644 --- a/fdbserver/workloads/WriteTagThrottling.actor.cpp +++ b/fdbserver/workloads/WriteTagThrottling.actor.cpp @@ -26,7 +26,6 @@ #include "fdbclient/TagThrottle.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. -constexpr int SAMPLE_SIZE = 10000; // workload description: // This workload aims to test whether we can throttling some bad clients that doing penetrating write on write hot-spot // range. There are several good clientActor just randomly do read and write ops in transaction. 
Also, some bad @@ -41,8 +40,8 @@ struct WriteTagThrottlingWorkload : KVWorkload { int badActorTrNum = 0, badActorRetries = 0, badActorTooOldRetries = 0, badActorCommitFailedRetries = 0; int goodActorThrottleRetries = 0, badActorThrottleRetries = 0; double badActorTotalLatency = 0.0, goodActorTotalLatency = 0.0; - ContinuousSample badActorReadLatency, goodActorReadLatency; - ContinuousSample badActorCommitLatency, goodActorCommitLatency; + DDSketch badActorReadLatency, goodActorReadLatency; + DDSketch badActorCommitLatency, goodActorCommitLatency; // Test configuration // KVWorkload::actorCount int goodActorPerClient, badActorPerClient; @@ -64,8 +63,8 @@ struct WriteTagThrottlingWorkload : KVWorkload { static constexpr int MIN_TRANSACTION_TAG_LENGTH = 2; WriteTagThrottlingWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), badActorReadLatency(SAMPLE_SIZE), goodActorReadLatency(SAMPLE_SIZE), - badActorCommitLatency(SAMPLE_SIZE), goodActorCommitLatency(SAMPLE_SIZE) { + : KVWorkload(wcx), badActorReadLatency(), goodActorReadLatency(), badActorCommitLatency(), + goodActorCommitLatency() { testDuration = getOption(options, "testDuration"_sr, 120.0); badOpRate = getOption(options, "badOpRate"_sr, 0.9); numWritePerTr = getOption(options, "numWritePerTr"_sr, 1); diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 06b4e07355..08bf96e529 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -112,8 +112,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT, 3600.0 ); init( INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING, 5.0 ); init( PING_LOGGING_INTERVAL, 3.0 ); - init( PING_SAMPLE_AMOUNT, 100 ); - init( NETWORK_CONNECT_SAMPLE_AMOUNT, 100 ); + init( PING_SKETCH_ACCURACY, 0.1 ); init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 ); init( TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT, 9.0 ); @@ -168,7 +167,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( MIN_SUBMIT, 10 ); init( 
SQLITE_DISK_METRIC_LOGGING_INTERVAL, 5.0 ); init( KAIO_LATENCY_LOGGING_INTERVAL, 30.0 ); - init( KAIO_LATENCY_SAMPLE_SIZE, 30000 ); + init( KAIO_LATENCY_SKETCH_ACCURACY, 0.01 ); init( PAGE_WRITE_CHECKSUM_HISTORY, 0 ); if( randomize && BUGGIFY ) PAGE_WRITE_CHECKSUM_HISTORY = 10000000; init( DISABLE_POSIX_KERNEL_AIO, 0 ); diff --git a/flow/include/flow/Knobs.h b/flow/include/flow/Knobs.h index d0e40dd67f..0ba1e3b4ff 100644 --- a/flow/include/flow/Knobs.h +++ b/flow/include/flow/Knobs.h @@ -176,8 +176,7 @@ public: int ACCEPT_BATCH_SIZE; double INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING; double PING_LOGGING_INTERVAL; - int PING_SAMPLE_AMOUNT; - int NETWORK_CONNECT_SAMPLE_AMOUNT; + double PING_SKETCH_ACCURACY; int TLS_CERT_REFRESH_DELAY_SECONDS; double TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT; @@ -231,7 +230,7 @@ public: int MIN_SUBMIT; double SQLITE_DISK_METRIC_LOGGING_INTERVAL; double KAIO_LATENCY_LOGGING_INTERVAL; - int KAIO_LATENCY_SAMPLE_SIZE; + double KAIO_LATENCY_SKETCH_ACCURACY; int PAGE_WRITE_CHECKSUM_HISTORY; int DISABLE_POSIX_KERNEL_AIO; diff --git a/flowbench/BenchSamples.cpp b/flowbench/BenchSamples.cpp index 687a371048..1ac9e034d2 100644 --- a/flowbench/BenchSamples.cpp +++ b/flowbench/BenchSamples.cpp @@ -22,8 +22,62 @@ #include "flow/IRandom.h" #include "flowbench/GlobalData.h" #include "fdbrpc/Stats.h" +#include "fdbrpc/DDSketch.h" +#include "fdbrpc/ContinuousSample.h" #include "flow/Histogram.h" +static void bench_ddsketchUnsigned(benchmark::State& state) { + DDSketchFastUnsigned dds; + InputGenerator data(1e6, []() { return deterministicRandom()->randomInt64(0, 1e9); }); + + for (auto _ : state) { + dds.addSample(data.next()); + } + + state.SetItemsProcessed(state.iterations()); +} +// DDSketchFastUnsigned has a fixed error margin (~8%) +BENCHMARK(bench_ddsketchUnsigned)->ReportAggregatesOnly(true); + +static void bench_ddsketchInt(benchmark::State& state) { + DDSketch dds((double)state.range(0) / 100); + InputGenerator data(1e6, []() { return 
deterministicRandom()->randomInt64(0, 1e9); }); + + for (auto _ : state) { + dds.addSample(data.next()); + } + + state.SetItemsProcessed(state.iterations()); +} +// Try with 10%, 5% and 1% error margins +BENCHMARK(bench_ddsketchInt)->Arg(10)->Arg(5)->Arg(1)->ReportAggregatesOnly(true); + +static void bench_ddsketchDouble(benchmark::State& state) { + DDSketch dds((double)state.range(0) / 100); + InputGenerator data(1e6, []() { return deterministicRandom()->randomInt64(0, 1e9); }); + + for (auto _ : state) { + dds.addSample(data.next()); + } + + state.SetItemsProcessed(state.iterations()); +} +// Try with 10%, 5% and 1% error margins +BENCHMARK(bench_ddsketchDouble)->Arg(10)->Arg(5)->Arg(1)->ReportAggregatesOnly(true); + +static void bench_ddsketchLatency(benchmark::State& state) { + DDSketch dds((double)state.range(0) / 100); + InputGenerator data(1e6, []() { return deterministicRandom()->random01() * 2.0; }); + + for (auto _ : state) { + dds.addSample(data.next()); + } + + state.SetItemsProcessed(state.iterations()); +} +// Try with 10%, 5% and 1% error margins +BENCHMARK(bench_ddsketchLatency)->Arg(10)->Arg(5)->Arg(1)->ReportAggregatesOnly(true); + static void bench_continuousSampleInt(benchmark::State& state) { ContinuousSample cs(state.range(0)); InputGenerator data(1e6, []() { return deterministicRandom()->randomInt64(0, 1e9); }); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a72fc06645..d62ea5fb14 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -56,6 +56,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES BlobManagerUnit.toml) add_fdb_test(TEST_FILES ConsistencyCheck.txt IGNORE) add_fdb_test(TEST_FILES DDMetricsExclude.txt IGNORE) + add_fdb_test(TEST_FILES DDSketch.txt IGNORE) add_fdb_test(TEST_FILES DataDistributionMetrics.txt IGNORE) add_fdb_test(TEST_FILES DiskDurability.txt IGNORE) add_fdb_test(TEST_FILES FileSystem.txt IGNORE) From dfc5a3a78a9ce7868366273157489adb85b21633 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 3 Oct 
2022 12:44:41 -0700 Subject: [PATCH 30/57] Default errorGuarantee -> 1% --- fdbrpc/include/fdbrpc/DDSketch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h index 2bbe350ab8..a5c2105da4 100644 --- a/fdbrpc/include/fdbrpc/DDSketch.h +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -201,7 +201,7 @@ protected: template class DDSketch : public DDSketchBase, T> { public: - explicit DDSketch(double errorGuarantee = 0.1) + explicit DDSketch(double errorGuarantee = 0.01) : DDSketchBase, T>(errorGuarantee), gamma((1.0 + errorGuarantee) / (1.0 - errorGuarantee)), multiplier(fastLogger::correctingFactor * log(2) / log(gamma)) { offset = getIndex(1.0 / DDSketchBase, T>::EPS); From d59b6f3f47152ddf6a457151445fd8c8888b7d1b Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Sat, 12 Nov 2022 13:59:05 -0800 Subject: [PATCH 31/57] merge errors --- fdbrpc/include/fdbrpc/Stats.h | 2 +- fdbserver/BlobWorker.actor.cpp | 1 + fdbserver/workloads/BulkSetup.actor.cpp | 74 ------------------------- 3 files changed, 2 insertions(+), 75 deletions(-) delete mode 100644 fdbserver/workloads/BulkSetup.actor.cpp diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index 46d334c6e0..897705cfbf 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -246,7 +246,7 @@ private: .detail("P95", sketch.percentile(0.95)) .detail("P99", sketch.percentile(0.99)) .detail("P99.9", sketch.percentile(0.999)) - .trackLatest(id.toString() + "/" + name); + .trackLatest(latencySampleEventHolder->trackingKey); sketch.clear(); sampleStart = now(); diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index e7a262c2b8..d88e37704b 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -1752,6 +1752,7 @@ bool granuleCanRetry(const Error& e) { case error_code_http_request_failed: case error_code_connection_failed: case error_code_lookup_failed: // 
dns + case error_code_platform_error: // injected faults return true; default: return false; diff --git a/fdbserver/workloads/BulkSetup.actor.cpp b/fdbserver/workloads/BulkSetup.actor.cpp deleted file mode 100644 index 59389cf0f7..0000000000 --- a/fdbserver/workloads/BulkSetup.actor.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * BulkSetup.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "fdbclient/NativeAPI.actor.h" -#include "fdbserver/TesterInterface.actor.h" -#include "fdbserver/workloads/workloads.actor.h" -#include "fdbserver/workloads/BulkSetup.actor.h" -#include "flow/actorcompiler.h" // This must be the last #include. 
- -struct BulkSetupWorkload : TestWorkload { - - std::vector tenantNames; - int nodeCount; - double transactionsPerSecond; - Key keyPrefix; - - BulkSetupWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { - transactionsPerSecond = getOption(options, "transactionsPerSecond"_sr, 5000.0) / clientCount; - nodeCount = getOption(options, "nodeCount"_sr, transactionsPerSecond * clientCount); - keyPrefix = unprintable(getOption(options, "keyPrefix"_sr, LiteralStringRef("")).toString()); - std::vector tenants = getOption(options, "tenants"_sr, std::vector()); - for (std::string tenant : tenants) { - tenantNames.push_back(TenantName(tenant)); - } - } - - std::string description() const override { return "BulkSetup"; } - - void getMetrics(std::vector& m) override {} - - Key keyForIndex(int n) { return key(n); } - Key key(int n) { return doubleToTestKey((double)n / nodeCount, keyPrefix); } - Value value(int n) { return doubleToTestKey(n, keyPrefix); } - - Standalone operator()(int n) { return KeyValueRef(key(n), value((n + 1) % nodeCount)); } - - Future start(Database const& cx) override { - return bulkSetup(cx, - this, - nodeCount, - Promise(), - false, - 0.0, - 1e12, - std::vector(), - Promise>>(), - 0, - 0.1, - 0, - 0, - this->tenantNames); - } - - Future check(Database const& cx) override { return true; } -}; - -WorkloadFactory BulkSetupWorkloadFactory("BulkSetup"); From 7f33b0fa70d49948bf749f53105c07acc24048c2 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Sat, 12 Nov 2022 14:09:31 -0800 Subject: [PATCH 32/57] clang-format --- fdbclient/NativeAPI.actor.cpp | 69 +++++++++++++------ .../include/fdbclient/BlobWorkerCommon.h | 5 +- fdbclient/include/fdbclient/DatabaseContext.h | 3 +- fdbserver/TLogServer.actor.cpp | 31 ++++++--- 4 files changed, 75 insertions(+), 33 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index a927f31690..70b1c261a3 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ 
-1016,7 +1016,9 @@ ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, proxiesChangeTrigger->trigger(); } } - when(wait(actors.getResult())) { UNSTOPPABLE_ASSERT(false); } + when(wait(actors.getResult())) { + UNSTOPPABLE_ASSERT(false); + } } } } @@ -1534,17 +1536,16 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT), + feedPopsFallback("FeedPopsFallback", ccFeed), latencies(), readLatencies(), commitLatencies(), GRVLatencies(), + mutationsPerCommit(), bytesPerCommit(), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), + cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), + transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), + coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), + detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { @@ -3422,7 +3423,9 @@ ACTOR Future> getValue(Reference trState, std::vector{ transaction_too_old(), future_version() }); } choose { - when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } + when(wait(trState->cx->connectionFileChanged())) { + throw transaction_too_old(); + } when(GetValueReply _reply = wait(loadBalance( trState->cx.getPtr(), locationInfo.locations, @@ -3569,7 +3572,9 @@ ACTOR Future getKey(Reference trState, state GetKeyReply reply; try { choose { - when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } + when(wait(trState->cx->connectionFileChanged())) { + throw transaction_too_old(); + } when(GetKeyReply _reply = wait(loadBalance( trState->cx.getPtr(), locationInfo.locations, @@ -3733,7 +3738,9 @@ ACTOR Future 
watchValue(Database cx, Reference p TaskPriority::DefaultPromiseEndpoint))) { resp = r; } - when(wait(cx->connectionRecord ? cx->connectionRecord->onChange() : Never())) { wait(Never()); } + when(wait(cx->connectionRecord ? cx->connectionRecord->onChange() : Never())) { + wait(Never()); + } } if (watchValueID.present()) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); @@ -4052,7 +4059,9 @@ Future getExactRange(Reference trState, state GetKeyValuesFamilyReply rep; try { choose { - when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } + when(wait(trState->cx->connectionFileChanged())) { + throw transaction_too_old(); + } when(GetKeyValuesFamilyReply _rep = wait(loadBalance( trState->cx.getPtr(), locations[shard].locations, @@ -4951,7 +4960,9 @@ ACTOR Future getRangeStreamFragment(Reference trState, return Void(); } - when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { rep = _rep; } + when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { + rep = _rep; + } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error& e) { @@ -5444,7 +5455,9 @@ ACTOR Future watch(Reference watch, loop { choose { // NativeAPI watchValue future finishes or errors - when(wait(watch->watchFuture)) { break; } + when(wait(watch->watchFuture)) { + break; + } when(wait(cx->connectionFileChanged())) { CODE_PROBE(true, "Recreated a watch after switch"); @@ -7029,7 +7042,9 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa state Future onProxiesChanged = cx->onProxiesChanged(); choose { - when(wait(onProxiesChanged)) { onProxiesChanged = cx->onProxiesChanged(); } + when(wait(onProxiesChanged)) { + onProxiesChanged = cx->onProxiesChanged(); + } when(GetReadVersionReply v = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies( flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES)), @@ -7455,7 +7470,9 @@ ACTOR Future getClusterProtocolImpl( 
needToConnect = false; } choose { - when(wait(coordinator->onChange())) { needToConnect = true; } + when(wait(coordinator->onChange())) { + needToConnect = true; + } when(ProtocolVersion pv = wait(protocolVersion)) { if (!expectedVersion.present() || expectedVersion.get() != pv) { @@ -9015,8 +9032,12 @@ ACTOR Future> getCheckpointMetaData(Database cx, } choose { - when(wait(cx->connectionFileChanged())) { cx->invalidateCache(KeyRef(), keys); } - when(wait(waitForAll(futures))) { break; } + when(wait(cx->connectionFileChanged())) { + cx->invalidateCache(KeyRef(), keys); + } + when(wait(waitForAll(futures))) { + break; + } when(wait(delay(timeout))) { TraceEvent("GetCheckpointTimeout").detail("Range", keys).detail("Version", version); } @@ -9663,8 +9684,12 @@ ACTOR Future changeFeedWhenAtLatest(Reference self, Versio // only allowed to use empty versions if you're caught up Future waitEmptyVersion = (self->notAtLatest.get() == 0) ? changeFeedWaitLatest(self, version) : Never(); choose { - when(wait(waitEmptyVersion)) { break; } - when(wait(lastReturned)) { break; } + when(wait(waitEmptyVersion)) { + break; + } + when(wait(lastReturned)) { + break; + } when(wait(self->refresh.getFuture())) {} when(wait(self->notAtLatest.onChange())) {} } diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 617f167111..6f44682ec9 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -95,7 +95,10 @@ struct BlobWorkerStats { forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), - snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy), + 
snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics", + id, + sampleLoggingInterval, + fileOpLatencySketchAccuracy), deltaBlobWriteLatencySample("DeltaBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy), reSnapshotLatencySample("GranuleResnapshotMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy), readLatencySample("GranuleReadLatencyMetrics", id, sampleLoggingInterval, requestLatencySketchAccuracy), diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index b068157fbf..e7c0d4329f 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -578,8 +578,7 @@ public: Counter feedPops; Counter feedPopsFallback; - DDSketch latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, - bytesPerCommit; + DDSketch latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit; int outstandingWatches; int maxOutstandingWatches; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 1c5ddcab0c..d9796923e3 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1445,7 +1445,9 @@ ACTOR Future updateStorage(TLogData* self) { ACTOR Future updateStorageLoop(TLogData* self) { wait(delay(0, TaskPriority::UpdateStorage)); - loop { wait(updateStorage(self)); } + loop { + wait(updateStorage(self)); + } } void commitMessages(TLogData* self, @@ -1606,7 +1608,9 @@ ACTOR Future waitForMessagesForTag(Reference self, Tag reqTag, Ve // we want the caller to finish first, otherwise the data structure it is building might not be complete wait(delay(0.0)); } - when(wait(delay(timeout))) { self->blockingPeekTimeouts += 1; } + when(wait(delay(timeout))) { + self->blockingPeekTimeouts += 1; + } } return Void(); } @@ -1824,8 +1828,11 @@ Future tLogPeekMessages(PromiseType replyPromise, if (logData->blockingPeekLatencies.find(reqTag) == 
logData->blockingPeekLatencies.end()) { UID ssID = nondeterministicRandom()->randomUniqueID(); std::string s = "BlockingPeekLatencies-" + reqTag.toString(); - logData->blockingPeekLatencies.try_emplace( - reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SKETCH_ACCURACY); + logData->blockingPeekLatencies.try_emplace(reqTag, + s, + ssID, + SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, + SERVER_KNOBS->LATENCY_SKETCH_ACCURACY); } LatencySample& sample = logData->blockingPeekLatencies.at(reqTag); sample.addMeasurement(latency); @@ -2788,7 +2795,9 @@ ACTOR Future pullAsyncData(TLogData* self, while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait(r ? r->getMore(TaskPriority::TLogCommit) : Never())) { break; } + when(wait(r ? r->getMore(TaskPriority::TLogCommit) : Never())) { + break; + } when(wait(dbInfoChange)) { if (logData->logSystem->get()) { r = logData->logSystem->get()->peek(logData->logId, tagAt, endVersion, tags, true); @@ -3269,7 +3278,9 @@ ACTOR Future restorePersistentState(TLogData* self, choose { when(wait(updateStorage(self))) {} - when(wait(allRemoved)) { throw worker_removed(); } + when(wait(allRemoved)) { + throw worker_removed(); + } } } } else { @@ -3280,7 +3291,9 @@ ACTOR Future restorePersistentState(TLogData* self, } } } - when(wait(allRemoved)) { throw worker_removed(); } + when(wait(allRemoved)) { + throw worker_removed(); + } } } } catch (Error& e) { @@ -3626,7 +3639,9 @@ ACTOR Future tLog(IKeyValueStore* persistentData, forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID)); } } - when(wait(error)) { throw internal_error(); } + when(wait(error)) { + throw internal_error(); + } when(wait(activeSharedChange)) { if (activeSharedTLog->get() == tlogId) { TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); From 351525c189b63db4ccdd9f2f3266a66781df6f4c Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Sat, 12 Nov 
2022 14:53:23 -0800 Subject: [PATCH 33/57] merge issue --- fdbrpc/include/fdbrpc/Stats.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index 897705cfbf..2765916b53 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -217,7 +217,8 @@ public: class LatencySample { public: LatencySample(std::string name, UID id, double loggingInterval, double accuracy) - : name(name), id(id), sampleStart(now()), sketch(accuracy) { + : name(name), id(id), sampleStart(now()), sketch(accuracy), + latencySampleEventHolder(makeReference(id.toString() + "/" + name)) { logger = recurring([this]() { logSample(); }, loggingInterval); } From 4783e9fd721fd1be432893b9acbc5a2fb6237218 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 14 Nov 2022 09:49:58 -0800 Subject: [PATCH 34/57] fix types --- fdbclient/include/fdbclient/BlobWorkerCommon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 6f44682ec9..9ff3b3b30e 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -75,8 +75,8 @@ struct BlobWorkerStats { Reference resnapshotLock, Reference deltaWritesLock, double sampleLoggingInterval, - int fileOpLatencySketchAccuracy, - int requestLatencySketchAccuracy) + double fileOpLatencySketchAccuracy, + double requestLatencySketchAccuracy) : cc("BlobWorkerStats", id.toString()), s3PutReqs("S3PutReqs", cc), s3GetReqs("S3GetReqs", cc), s3DeleteReqs("S3DeleteReqs", cc), From 92dccdd9eb1bc923dca702718731e39d7633377e Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 14 Nov 2022 09:50:13 -0800 Subject: [PATCH 35/57] experimental changes to fix out of bounds errors --- fdbrpc/include/fdbrpc/DDSketch.h | 30 ++++++++++++++++++------------ fdbrpc/include/fdbrpc/Stats.h | 4 ++++ 2 files changed, 22 insertions(+), 12 
deletions(-) diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h index a5c2105da4..551c087169 100644 --- a/fdbrpc/include/fdbrpc/DDSketch.h +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -89,9 +89,13 @@ public: if (sample <= EPS) { zeroPopulationSize++; } else { - int index = static_cast(this)->getIndex(sample); + size_t index = static_cast(this)->getIndex(sample); assert(index >= 0 && index < buckets.size()); - buckets[index]++; + try { + buckets.at(index)++; + } catch (std::out_of_range const& e) { + fmt::print(stderr, "ERROR: Invalid DDSketch bucket index ({}) at {}/{} for sample: {}\n", e.what(), index, buckets.size(), sample); + } } populationSize++; @@ -119,7 +123,7 @@ public: if (targetPercentilePopulation < zeroPopulationSize) return T(0); - int index = -1; + size_t index = 0; [[maybe_unused]] bool found = false; if (percentile <= 0.5) { // count up uint64_t count = zeroPopulationSize; @@ -152,6 +156,7 @@ public: } } assert(found); + if (!found) return -1; return static_cast(this)->getValue(index); } @@ -194,7 +199,7 @@ protected: uint64_t populationSize, zeroPopulationSize; // we need to separately count 0s std::vector buckets; T minValue, maxValue, sum; - void setBucketSize(int capacity) { buckets.resize(capacity, 0); } + void setBucketSize(size_t capacity) { buckets.resize(capacity, 0); } }; // DDSketch with fast log implementation for float numbers @@ -204,20 +209,21 @@ public: explicit DDSketch(double errorGuarantee = 0.01) : DDSketchBase, T>(errorGuarantee), gamma((1.0 + errorGuarantee) / (1.0 - errorGuarantee)), multiplier(fastLogger::correctingFactor * log(2) / log(gamma)) { + assert(errorGuarantee > 0); offset = getIndex(1.0 / DDSketchBase, T>::EPS); this->setBucketSize(2 * offset); } - int getIndex(T sample) { + size_t getIndex(T sample) { static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); return ceil(fastLogger::fastlog(sample) * multiplier) + offset; } - T getValue(int 
index) { return fastLogger::reverseLog((index - offset) / multiplier) * 2.0 / (1 + gamma); } + T getValue(size_t index) { return fastLogger::reverseLog((index - offset) / multiplier) * 2.0 / (1 + gamma); } private: double gamma, multiplier; - int offset = 0; + size_t offset = 0; }; // DDSketch with log. Slow and only use this when others doesn't work. @@ -231,13 +237,13 @@ public: this->setBucketSize(2 * offset); } - int getIndex(T sample) { return ceil(log(sample) / logGamma) + offset; } + size_t getIndex(T sample) { return ceil(log(sample) / logGamma) + offset; } - T getValue(int index) { return (T)(2.0 * pow(gamma, (index - offset)) / (1 + gamma)); } + T getValue(size_t index) { return (T)(2.0 * pow(gamma, (index - offset)) / (1 + gamma)); } private: double gamma, logGamma; - int offset = 0; + size_t offset = 0; }; // DDSketch for unsigned int. Faster than the float version. Fixed accuracy. @@ -245,7 +251,7 @@ class DDSketchFastUnsigned : public DDSketchBase public: DDSketchFastUnsigned() : DDSketchBase(errorGuarantee) { this->setBucketSize(129); } - int getIndex(unsigned sample) { + size_t getIndex(unsigned sample) { __uint128_t v = sample; v *= v; v *= v; // sample^4 @@ -254,7 +260,7 @@ public: return 128 - (high == 0 ? ((low == 0 ? 
64 : __builtin_clzll(low)) + 64) : __builtin_clzll(high)); } - unsigned getValue(int index) { + unsigned getValue(size_t index) { double r = 1, g = gamma; while (index) { // quick power method for power(gamma, index) if (index & 1) diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index 2765916b53..c0564780bb 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -219,6 +219,10 @@ public: LatencySample(std::string name, UID id, double loggingInterval, double accuracy) : name(name), id(id), sampleStart(now()), sketch(accuracy), latencySampleEventHolder(makeReference(id.toString() + "/" + name)) { + assert(accuracy > 0); + if (accuracy <= 0) { + fmt::print(stderr, "ERROR: LatencySample {} has invalid accuracy ({})", name, accuracy); + } logger = recurring([this]() { logSample(); }, loggingInterval); } From 0d4915f5ea82690d5353f85cb3d22611258e4314 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 14 Nov 2022 09:57:27 -0800 Subject: [PATCH 36/57] extra error check case --- fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index de6f81b9dc..b2ac5ab39b 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -415,7 +415,7 @@ struct MetaclusterManagementWorkload : TestWorkload { found = true; } } - ASSERT(found); + ASSERT(found || checkEntry2.tenantState != checkState); } catch (Error& e) { if (e.code() != error_code_tenant_not_found) { TraceEvent(SevError, "VerifyListFilterFailure").error(e).detail("Tenant", tenant); From 34b8c5eb2b0fb7b1a35fc7633e86a0c85f683f5c Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 14 Nov 2022 10:47:45 -0800 Subject: [PATCH 37/57] ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE -> 
ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY --- fdbclient/BlobCipher.cpp | 10 +++++----- flow/Knobs.cpp | 2 +- flow/include/flow/Knobs.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp index 24bbf3ceb6..1eb338f13c 100644 --- a/fdbclient/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -57,11 +57,11 @@ BlobCipherMetrics::CounterSet::CounterSet(CounterCollection& cc, std::string nam getCipherKeysLatency(name + "GetCipherKeysLatency", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, - FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY), getLatestCipherKeysLatency(name + "GetLatestCipherKeysLatency", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, - FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE) {} + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY) {} BlobCipherMetrics::BlobCipherMetrics() : cc("BlobCipher"), cipherKeyCacheHit("CipherKeyCacheHit", cc), cipherKeyCacheMiss("CipherKeyCacheMiss", cc), @@ -71,15 +71,15 @@ BlobCipherMetrics::BlobCipherMetrics() getCipherKeysLatency("GetCipherKeysLatency", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, - FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY), getLatestCipherKeysLatency("GetLatestCipherKeysLatency", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, - FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY), getBlobMetadataLatency("GetBlobMetadataLatency", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, - FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY), counterSets({ CounterSet(cc, "TLog"), CounterSet(cc, "KVMemory"), CounterSet(cc, "KVRedwood"), diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 08bf96e529..7d6132bc6c 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -302,7 
+302,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { if ( randomize && BUGGIFY) { ENCRYPT_KEY_REFRESH_INTERVAL = deterministicRandom()->randomInt(2, 10); } init( TOKEN_CACHE_SIZE, 100 ); init( ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, 5.0 ); - init( ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE, 1000 ); + init( ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY, 0.01 ); // Refer to EncryptUtil::EncryptAuthTokenAlgo for more details init( ENCRYPT_HEADER_AUTH_TOKEN_ENABLED, true ); if ( randomize && BUGGIFY ) { ENCRYPT_HEADER_AUTH_TOKEN_ENABLED = !ENCRYPT_HEADER_AUTH_TOKEN_ENABLED; } init( ENCRYPT_HEADER_AUTH_TOKEN_ALGO, 1 ); if ( randomize && BUGGIFY ) { ENCRYPT_HEADER_AUTH_TOKEN_ALGO = getRandomAuthTokenAlgo(); } diff --git a/flow/include/flow/Knobs.h b/flow/include/flow/Knobs.h index 0ba1e3b4ff..ee163c1038 100644 --- a/flow/include/flow/Knobs.h +++ b/flow/include/flow/Knobs.h @@ -364,7 +364,7 @@ public: int64_t ENCRYPT_CIPHER_KEY_CACHE_TTL; int64_t ENCRYPT_KEY_REFRESH_INTERVAL; double ENCRYPT_KEY_CACHE_LOGGING_INTERVAL; - double ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE; + double ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY; bool ENCRYPT_HEADER_AUTH_TOKEN_ENABLED; int ENCRYPT_HEADER_AUTH_TOKEN_ALGO; From 2c889c411a4103876b85a6148da3e73b5da36910 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 14 Nov 2022 10:51:04 -0800 Subject: [PATCH 38/57] add assert of errorGuarantee --- fdbrpc/include/fdbrpc/DDSketch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h index 551c087169..88096b56ce 100644 --- a/fdbrpc/include/fdbrpc/DDSketch.h +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -79,7 +79,9 @@ class DDSketchBase { public: explicit DDSketchBase(double errorGuarantee) : errorGuarantee(errorGuarantee), populationSize(0), zeroPopulationSize(0), minValue(defaultMin()), - maxValue(defaultMax()), sum(T()) {} + maxValue(defaultMax()), sum(T()) { + ASSERT(errorGuarantee > 0 && 
errorGuarantee < 1); + } DDSketchBase& addSample(T sample) { // Call it addSample for now, while it is not a sample anymore From 40fa959f8dfab3433e124eb21e8788b7ed255c75 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 14 Nov 2022 10:51:20 -0800 Subject: [PATCH 39/57] assert -> ASSERT --- fdbrpc/include/fdbrpc/DDSketch.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h index 88096b56ce..68fa63651f 100644 --- a/fdbrpc/include/fdbrpc/DDSketch.h +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -92,7 +92,7 @@ public: zeroPopulationSize++; } else { size_t index = static_cast(this)->getIndex(sample); - assert(index >= 0 && index < buckets.size()); + ASSERT(index >= 0 && index < buckets.size()); try { buckets.at(index)++; } catch (std::out_of_range const& e) { @@ -116,7 +116,7 @@ public: T median() { return percentile(0.5); } T percentile(double percentile) { - assert(percentile >= 0 && percentile <= 1); + ASSERT(percentile >= 0 && percentile <= 1); if (populationSize == 0) return T(); @@ -157,7 +157,7 @@ public: count += *rit; } } - assert(found); + ASSERT(found); if (!found) return -1; return static_cast(this)->getValue(index); } @@ -181,7 +181,7 @@ public: DDSketchBase& mergeWith(const DDSketchBase& anotherSketch) { // Must have the same guarantee - assert(fabs(errorGuarantee - anotherSketch.errorGuarantee) < EPS && + ASSERT(fabs(errorGuarantee - anotherSketch.errorGuarantee) < EPS && anotherSketch.buckets.size() == buckets.size()); for (size_t i = 0; i < anotherSketch.buckets.size(); i++) { buckets[i] += anotherSketch.buckets[i]; @@ -211,13 +211,13 @@ public: explicit DDSketch(double errorGuarantee = 0.01) : DDSketchBase, T>(errorGuarantee), gamma((1.0 + errorGuarantee) / (1.0 - errorGuarantee)), multiplier(fastLogger::correctingFactor * log(2) / log(gamma)) { - assert(errorGuarantee > 0); + ASSERT(errorGuarantee > 0); offset = getIndex(1.0 / DDSketchBase, T>::EPS); 
this->setBucketSize(2 * offset); } size_t getIndex(T sample) { - static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); + static_ASSERT(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); return ceil(fastLogger::fastlog(sample) * multiplier) + offset; } From 4254429e73f2a1623b7785334ec54b995e3c5430 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Mon, 14 Nov 2022 11:15:35 -0800 Subject: [PATCH 40/57] capitlization --- fdbrpc/include/fdbrpc/DDSketch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h index 68fa63651f..d657ef9f23 100644 --- a/fdbrpc/include/fdbrpc/DDSketch.h +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -217,7 +217,7 @@ public: } size_t getIndex(T sample) { - static_ASSERT(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); + static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); return ceil(fastLogger::fastlog(sample) * multiplier) + offset; } From 47a8cebfb3e1ad168d2128aabc2a7fadd961815a Mon Sep 17 00:00:00 2001 From: Kevin Hoxha Date: Mon, 14 Nov 2022 11:41:06 -0800 Subject: [PATCH 41/57] ddsketch: Make sure that all ctors use 0 < error < 1 --- fdbclient/NativeAPI.actor.cpp | 62 ++++++------------- fdbrpc/include/fdbrpc/DDSketch.h | 13 +++- fdbrpc/include/fdbrpc/TSSComparison.h | 6 +- .../workloads/ReadWriteWorkload.actor.h | 8 +-- 4 files changed, 35 insertions(+), 54 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 70b1c261a3..8198318154 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1016,9 +1016,7 @@ ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, proxiesChangeTrigger->trigger(); } } - when(wait(actors.getResult())) { - UNSTOPPABLE_ASSERT(false); - } + when(wait(actors.getResult())) { 
UNSTOPPABLE_ASSERT(false); } } } } @@ -1839,13 +1837,13 @@ DatabaseContext::DatabaseContext(const Error& err) ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG), bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG), bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG), - bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000), - usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed), + bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(), bgGranulesPerRequest(), usedAnyChangeFeeds(false), + ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed), - feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000), - GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), sharedStatePtr(nullptr), - transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), + feedPopsFallback("FeedPopsFallback", ccFeed), latencies(), readLatencies(), commitLatencies(), GRVLatencies(), + mutationsPerCommit(), bytesPerCommit(), sharedStatePtr(nullptr), transactionTracingSample(false), + smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} // Static constructor used by server processes to create a DatabaseContext @@ -3423,9 +3421,7 @@ ACTOR Future> getValue(Reference trState, std::vector{ transaction_too_old(), future_version() }); } choose { - when(wait(trState->cx->connectionFileChanged())) { - throw transaction_too_old(); - } + when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); 
} when(GetValueReply _reply = wait(loadBalance( trState->cx.getPtr(), locationInfo.locations, @@ -3572,9 +3568,7 @@ ACTOR Future getKey(Reference trState, state GetKeyReply reply; try { choose { - when(wait(trState->cx->connectionFileChanged())) { - throw transaction_too_old(); - } + when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyReply _reply = wait(loadBalance( trState->cx.getPtr(), locationInfo.locations, @@ -3738,9 +3732,7 @@ ACTOR Future watchValue(Database cx, Reference p TaskPriority::DefaultPromiseEndpoint))) { resp = r; } - when(wait(cx->connectionRecord ? cx->connectionRecord->onChange() : Never())) { - wait(Never()); - } + when(wait(cx->connectionRecord ? cx->connectionRecord->onChange() : Never())) { wait(Never()); } } if (watchValueID.present()) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); @@ -4059,9 +4051,7 @@ Future getExactRange(Reference trState, state GetKeyValuesFamilyReply rep; try { choose { - when(wait(trState->cx->connectionFileChanged())) { - throw transaction_too_old(); - } + when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyValuesFamilyReply _rep = wait(loadBalance( trState->cx.getPtr(), locations[shard].locations, @@ -4960,9 +4950,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, return Void(); } - when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { - rep = _rep; - } + when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { rep = _rep; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error& e) { @@ -5455,9 +5443,7 @@ ACTOR Future watch(Reference watch, loop { choose { // NativeAPI watchValue future finishes or errors - when(wait(watch->watchFuture)) { - break; - } + when(wait(watch->watchFuture)) { break; } when(wait(cx->connectionFileChanged())) { CODE_PROBE(true, "Recreated a watch after switch"); @@ -7042,9 +7028,7 @@ ACTOR Future 
getConsistentReadVersion(SpanContext parentSpa state Future onProxiesChanged = cx->onProxiesChanged(); choose { - when(wait(onProxiesChanged)) { - onProxiesChanged = cx->onProxiesChanged(); - } + when(wait(onProxiesChanged)) { onProxiesChanged = cx->onProxiesChanged(); } when(GetReadVersionReply v = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies( flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES)), @@ -7470,9 +7454,7 @@ ACTOR Future getClusterProtocolImpl( needToConnect = false; } choose { - when(wait(coordinator->onChange())) { - needToConnect = true; - } + when(wait(coordinator->onChange())) { needToConnect = true; } when(ProtocolVersion pv = wait(protocolVersion)) { if (!expectedVersion.present() || expectedVersion.get() != pv) { @@ -9032,12 +9014,8 @@ ACTOR Future> getCheckpointMetaData(Database cx, } choose { - when(wait(cx->connectionFileChanged())) { - cx->invalidateCache(KeyRef(), keys); - } - when(wait(waitForAll(futures))) { - break; - } + when(wait(cx->connectionFileChanged())) { cx->invalidateCache(KeyRef(), keys); } + when(wait(waitForAll(futures))) { break; } when(wait(delay(timeout))) { TraceEvent("GetCheckpointTimeout").detail("Range", keys).detail("Version", version); } @@ -9684,12 +9662,8 @@ ACTOR Future changeFeedWhenAtLatest(Reference self, Versio // only allowed to use empty versions if you're caught up Future waitEmptyVersion = (self->notAtLatest.get() == 0) ? 
changeFeedWaitLatest(self, version) : Never(); choose { - when(wait(waitEmptyVersion)) { - break; - } - when(wait(lastReturned)) { - break; - } + when(wait(waitEmptyVersion)) { break; } + when(wait(lastReturned)) { break; } when(wait(self->refresh.getFuture())) {} when(wait(self->notAtLatest.onChange())) {} } diff --git a/fdbrpc/include/fdbrpc/DDSketch.h b/fdbrpc/include/fdbrpc/DDSketch.h index 68fa63651f..d17508622e 100644 --- a/fdbrpc/include/fdbrpc/DDSketch.h +++ b/fdbrpc/include/fdbrpc/DDSketch.h @@ -96,7 +96,12 @@ public: try { buckets.at(index)++; } catch (std::out_of_range const& e) { - fmt::print(stderr, "ERROR: Invalid DDSketch bucket index ({}) at {}/{} for sample: {}\n", e.what(), index, buckets.size(), sample); + fmt::print(stderr, + "ERROR: Invalid DDSketch bucket index ({}) at {}/{} for sample: {}\n", + e.what(), + index, + buckets.size(), + sample); } } @@ -158,7 +163,8 @@ public: } } ASSERT(found); - if (!found) return -1; + if (!found) + return -1; return static_cast(this)->getValue(index); } @@ -213,11 +219,12 @@ public: multiplier(fastLogger::correctingFactor * log(2) / log(gamma)) { ASSERT(errorGuarantee > 0); offset = getIndex(1.0 / DDSketchBase, T>::EPS); + ASSERT(offset > 0); this->setBucketSize(2 * offset); } size_t getIndex(T sample) { - static_ASSERT(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); + static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); return ceil(fastLogger::fastlog(sample) * multiplier) + offset; } diff --git a/fdbrpc/include/fdbrpc/TSSComparison.h b/fdbrpc/include/fdbrpc/TSSComparison.h index 7fcc84499b..e3b20cb6e5 100644 --- a/fdbrpc/include/fdbrpc/TSSComparison.h +++ b/fdbrpc/include/fdbrpc/TSSComparison.h @@ -105,9 +105,9 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { TSSMetrics() : cc("TSSClientMetrics"), requests("Requests", cc), streamComparisons("StreamComparisons", cc), ssErrors("SSErrors", cc), tssErrors("TSSErrors", 
cc), tssTimeouts("TSSTimeouts", cc), - mismatches("Mismatches", cc), SSgetValueLatency(1000), SSgetKeyLatency(1000), SSgetKeyValuesLatency(1000), - SSgetMappedKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), - TSSgetKeyValuesLatency(1000), TSSgetMappedKeyValuesLatency(1000) {} + mismatches("Mismatches", cc), SSgetValueLatency(), SSgetKeyLatency(), SSgetKeyValuesLatency(), + SSgetMappedKeyValuesLatency(), TSSgetValueLatency(), TSSgetKeyLatency(), TSSgetKeyValuesLatency(), + TSSgetMappedKeyValuesLatency() {} }; template diff --git a/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h b/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h index 5323235795..12ff6a1844 100644 --- a/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h +++ b/fdbserver/include/fdbserver/workloads/ReadWriteWorkload.actor.h @@ -47,7 +47,7 @@ DESCR struct ReadMetric { // Common ReadWrite test settings struct ReadWriteCommon : KVWorkload { - static constexpr int sampleSize = 10000; + static constexpr double sampleError = 0.01; friend struct ReadWriteCommonImpl; // general test setting @@ -88,9 +88,9 @@ struct ReadWriteCommon : KVWorkload { explicit ReadWriteCommon(WorkloadContext const& wcx) : KVWorkload(wcx), totalReadsMetric("ReadWrite.TotalReads"_sr), totalRetriesMetric("ReadWrite.TotalRetries"_sr), - aTransactions("A Transactions"), bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), - readLatencies(sampleSize), commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), - readLatencyTotal(0), readLatencyCount(0), loadTime(0.0), clientBegin(0) { + aTransactions("A Transactions"), bTransactions("B Transactions"), retries("Retries"), latencies(sampleError), + readLatencies(sampleError), commitLatencies(sampleError), GRVLatencies(sampleError), + fullReadLatencies(sampleError), readLatencyTotal(0), readLatencyCount(0), loadTime(0.0), clientBegin(0) { 
transactionSuccessMetric.init("ReadWrite.SuccessfulTransaction"_sr); transactionFailureMetric.init("ReadWrite.FailedTransaction"_sr); From 458daa33925e7d908a7c1123a23deabcfc47dcf1 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 14 Nov 2022 11:47:32 -0800 Subject: [PATCH 42/57] Apply suggestions from code review Co-authored-by: Trevor Clinkenbeard --- fdbserver/include/fdbserver/workloads/MockDDTest.h | 2 +- fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp | 2 +- fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fdbserver/include/fdbserver/workloads/MockDDTest.h b/fdbserver/include/fdbserver/workloads/MockDDTest.h index 133f0b582e..ec7a449a6f 100644 --- a/fdbserver/include/fdbserver/workloads/MockDDTest.h +++ b/fdbserver/include/fdbserver/workloads/MockDDTest.h @@ -1,5 +1,5 @@ /* - * MockDDTest.g + * MockDDTest.h * * This source file is part of the FoundationDB open source project * diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 04b73e9a09..2e364054a7 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -261,7 +261,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { return Void(); } - void verifyServerKeyDest(MoveKeysParams& params) { + void verifyServerKeyDest(MoveKeysParams& params) const { // check destination servers for (auto& id : params.destinationTeam) { ASSERT(mgs->serverIsDestForShard(id, params.keys)); diff --git a/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp b/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp index 5988d15c64..f0d7a542bb 100644 --- a/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp +++ b/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp @@ -177,8 +177,8 @@ struct MockDDTrackerShardEvaluatorWorkload : public 
MockDDTestWorkload { } void getMetrics(std::vector& m) override { - for (auto& p : rsReasonCounts) { - m.push_back(PerfMetric(RelocateReason(p.first).toString(), p.second, Averaged::False)); + for (const auto& [reason, count] : rsReasonCounts) { + m.push_back(PerfMetric(RelocateReason(reason).toString(), count, Averaged::False)); } } }; From 410b4375d2ae1aa1483d3ddef2ca61a72a732e97 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 14 Nov 2022 13:08:56 -0800 Subject: [PATCH 43/57] change fdbcli_tests.py --- fdbcli/tests/fdbcli_tests.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fdbcli/tests/fdbcli_tests.py b/fdbcli/tests/fdbcli_tests.py index da52842d23..3ba5e54da2 100755 --- a/fdbcli/tests/fdbcli_tests.py +++ b/fdbcli/tests/fdbcli_tests.py @@ -771,7 +771,7 @@ def tenant_list(logger): output = run_fdbcli_command('tenant list') assert output == '1. tenant\n 2. tenant2' - output = run_fdbcli_command('tenant list a z 1') + output = run_fdbcli_command('tenant list a z limit=1') assert output == '1. 
tenant' output = run_fdbcli_command('tenant list a tenant2') @@ -786,9 +786,15 @@ def tenant_list(logger): output = run_fdbcli_command_and_get_error('tenant list b a') assert output == 'ERROR: end must be larger than begin' - output = run_fdbcli_command_and_get_error('tenant list a b 12x') + output = run_fdbcli_command_and_get_error('tenant list a b limit=12x') assert output == 'ERROR: invalid limit `12x\'' + output = run_fdbcli_command_and_get_error('tenant list a b offset=13y') + assert output == 'ERROR: invalid offset `13y\'' + + output = run_fdbcli_command_and_get_error('tenant list a b state=14z') + assert output == 'ERROR: unrecognized tenant state(s) `14z\'' + @enable_logging() def tenant_get(logger): setup_tenants(['tenant', 'tenant2 tenant_group=tenant_group2']) From f997e737589030ef36e954f866f29d8f24918f0e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 14 Nov 2022 12:31:13 -0800 Subject: [PATCH 44/57] rename variable and solve some light comments --- fdbclient/FDBTypes.cpp | 2 +- fdbclient/ServerKnobs.cpp | 4 +- fdbclient/include/fdbclient/ServerKnobs.h | 2 +- .../fdbclient/StorageServerInterface.h | 18 ++++----- fdbserver/BlobManager.actor.cpp | 12 +++--- fdbserver/BlobWorker.actor.cpp | 2 +- fdbserver/DDShardTracker.actor.cpp | 40 +++++++++---------- fdbserver/DDTxnProcessor.actor.cpp | 2 +- fdbserver/DataDistribution.actor.cpp | 6 +-- fdbserver/MockGlobalState.actor.cpp | 20 +++++----- fdbserver/StorageMetrics.actor.cpp | 27 +++++++------ .../fdbserver/DataDistribution.actor.h | 1 + .../include/fdbserver/StorageMetrics.actor.h | 2 +- .../include/fdbserver/workloads/MockDDTest.h | 4 +- fdbserver/storageserver.actor.cpp | 11 ++--- .../IDDTxnProcessorApiCorrectness.actor.cpp | 12 +++--- .../MockDDTrackerShardEvaluator.actor.cpp | 4 +- 17 files changed, 88 insertions(+), 81 deletions(-) diff --git a/fdbclient/FDBTypes.cpp b/fdbclient/FDBTypes.cpp index e83630596b..9e9401df65 100644 --- a/fdbclient/FDBTypes.cpp +++ b/fdbclient/FDBTypes.cpp @@ -66,7 
+66,7 @@ Key randomKeyBetween(const KeyRangeRef& keys) { break; } } - ASSERT(pos < end.size()); // otherwise, begin >= end + ASSERT_LT(pos, end.size()); // otherwise, begin >= end // find the lowest char in range begin[pos+1, begin.size()) that is not \xff (255) int lowest = begin.size() - 1; diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index d0662bc5ce..a29efbe76a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -224,7 +224,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi shards. The bandwidth sample maintained by the storage server needs to be accurate enough to reliably measure this minimum bandwidth. See - BYTES_WRITE_UNITS_PER_SAMPLE. If this number is too low, the storage server needs to spend more memory and time on sampling. + BYTES_WRITTEN_UNITS_PER_SAMPLE. If this number is too low, the storage server needs to spend more memory and time on sampling. */ init( SHARD_SPLIT_BYTES_PER_KSEC, 250 * 1000 * 1000 ); if( buggifySmallBandwidthSplit ) SHARD_SPLIT_BYTES_PER_KSEC = 50 * 1000 * 1000; @@ -757,7 +757,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz! 
init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); - init( BYTES_WRITE_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); + init( BYTES_WRITTEN_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes init( READ_HOT_SUB_RANGE_CHUNK_SIZE, 10000000); // 10MB init( EMPTY_READ_PENALTY, 20 ); // 20 bytes diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 3064a09c01..6ebee8765b 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -706,7 +706,7 @@ public: double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; double SPLIT_JITTER_AMOUNT; int64_t IOPS_UNITS_PER_SAMPLE; - int64_t BYTES_WRITE_UNITS_PER_SAMPLE; + int64_t BYTES_WRITTEN_UNITS_PER_SAMPLE; int64_t BYTES_READ_UNITS_PER_SAMPLE; int64_t READ_HOT_SUB_RANGE_CHUNK_SIZE; int64_t EMPTY_READ_PENALTY; diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index a1b6e0ce08..5b27b20776 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -634,7 +634,7 @@ struct GetShardStateRequest { struct StorageMetrics { constexpr static FileIdentifier file_identifier = 13622226; int64_t bytes = 0; // total storage - int64_t writeBytesPerKSecond = 0; // bytes write to SQ + int64_t bytesWrittenPerKSecond = 0; // bytes write to SQ // FIXME: currently, iosPerKSecond is not used in DataDistribution calculations. 
int64_t iosPerKSecond = 0; @@ -643,33 +643,33 @@ struct StorageMetrics { static const int64_t infinity = 1LL << 60; bool allLessOrEqual(const StorageMetrics& rhs) const { - return bytes <= rhs.bytes && writeBytesPerKSecond <= rhs.writeBytesPerKSecond && + return bytes <= rhs.bytes && bytesWrittenPerKSecond <= rhs.bytesWrittenPerKSecond && iosPerKSecond <= rhs.iosPerKSecond && bytesReadPerKSecond <= rhs.bytesReadPerKSecond; } void operator+=(const StorageMetrics& rhs) { bytes += rhs.bytes; - writeBytesPerKSecond += rhs.writeBytesPerKSecond; + bytesWrittenPerKSecond += rhs.bytesWrittenPerKSecond; iosPerKSecond += rhs.iosPerKSecond; bytesReadPerKSecond += rhs.bytesReadPerKSecond; } void operator-=(const StorageMetrics& rhs) { bytes -= rhs.bytes; - writeBytesPerKSecond -= rhs.writeBytesPerKSecond; + bytesWrittenPerKSecond -= rhs.bytesWrittenPerKSecond; iosPerKSecond -= rhs.iosPerKSecond; bytesReadPerKSecond -= rhs.bytesReadPerKSecond; } template void operator*=(F f) { bytes *= f; - writeBytesPerKSecond *= f; + bytesWrittenPerKSecond *= f; iosPerKSecond *= f; bytesReadPerKSecond *= f; } - bool allZero() const { return !bytes && !writeBytesPerKSecond && !iosPerKSecond && !bytesReadPerKSecond; } + bool allZero() const { return !bytes && !bytesWrittenPerKSecond && !iosPerKSecond && !bytesReadPerKSecond; } template void serialize(Ar& ar) { - serializer(ar, bytes, writeBytesPerKSecond, iosPerKSecond, bytesReadPerKSecond); + serializer(ar, bytes, bytesWrittenPerKSecond, iosPerKSecond, bytesReadPerKSecond); } void negate() { operator*=(-1.0); } @@ -697,14 +697,14 @@ struct StorageMetrics { } bool operator==(StorageMetrics const& rhs) const { - return bytes == rhs.bytes && writeBytesPerKSecond == rhs.writeBytesPerKSecond && + return bytes == rhs.bytes && bytesWrittenPerKSecond == rhs.bytesWrittenPerKSecond && iosPerKSecond == rhs.iosPerKSecond && bytesReadPerKSecond == rhs.bytesReadPerKSecond; } std::string toString() const { return format("Bytes: %lld, BWritePerKSec: %lld, 
iosPerKSec: %lld, BReadPerKSec: %lld", bytes, - writeBytesPerKSecond, + bytesWrittenPerKSecond, iosPerKSecond, bytesReadPerKSecond); } diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 140705be74..243bd44ac4 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -636,12 +636,12 @@ ACTOR Future splitRange(Reference bmDat // only split on bytes and write rate state StorageMetrics splitMetrics; splitMetrics.bytes = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES; - splitMetrics.writeBytesPerKSecond = SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; + splitMetrics.bytesWrittenPerKSecond = SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; if (writeHot) { - splitMetrics.writeBytesPerKSecond = - std::min(splitMetrics.writeBytesPerKSecond, estimated.writeBytesPerKSecond / 2); - splitMetrics.writeBytesPerKSecond = - std::max(splitMetrics.writeBytesPerKSecond, SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC); + splitMetrics.bytesWrittenPerKSecond = + std::min(splitMetrics.bytesWrittenPerKSecond, estimated.bytesWrittenPerKSecond / 2); + splitMetrics.bytesWrittenPerKSecond = + std::max(splitMetrics.bytesWrittenPerKSecond, SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC); } splitMetrics.iosPerKSecond = splitMetrics.infinity; splitMetrics.bytesReadPerKSecond = splitMetrics.infinity; @@ -2617,7 +2617,7 @@ ACTOR Future attemptMerges(Reference bmData, wait(bmData->db->getStorageMetrics(std::get<1>(candidates[i]), CLIENT_KNOBS->TOO_MANY)); if (metrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES || - metrics.writeBytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { + metrics.bytesWrittenPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { // This granule cannot be merged with any neighbors. 
// If current candidates up to here can be merged, merge them and skip over this one attemptStartMerge(bmData, currentCandidates); diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index 4797730254..5e81bd5e72 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -1632,7 +1632,7 @@ ACTOR Future granuleCheckMergeCandidate(Reference bwData, // FIXME: maybe separate knob and/or value for write rate? if (currentMetrics.bytes >= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2 || - currentMetrics.writeBytesPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { + currentMetrics.bytesWrittenPerKSecond >= SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) { wait(delayJittered(SERVER_KNOBS->BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 2.0)); CODE_PROBE(true, "wait and check later to see if granule got smaller or colder"); continue; diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index 7457ba81b4..d879cf7cc1 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -41,9 +41,9 @@ enum BandwidthStatus { BandwidthStatusLow, BandwidthStatusNormal, BandwidthStatu enum ReadBandwidthStatus { ReadBandwidthStatusNormal, ReadBandwidthStatusHigh }; BandwidthStatus getBandwidthStatus(StorageMetrics const& metrics) { - if (metrics.writeBytesPerKSecond > SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC) + if (metrics.bytesWrittenPerKSecond > SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC) return BandwidthStatusHigh; - else if (metrics.writeBytesPerKSecond < SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) + else if (metrics.bytesWrittenPerKSecond < SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC) return BandwidthStatusLow; return BandwidthStatusNormal; @@ -176,7 +176,7 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize) { bounds.max.bytes = maxShardSize; } - bounds.max.writeBytesPerKSecond = bounds.max.infinity; + bounds.max.bytesWrittenPerKSecond = bounds.max.infinity; 
bounds.max.iosPerKSecond = bounds.max.infinity; bounds.max.bytesReadPerKSecond = bounds.max.infinity; @@ -187,14 +187,14 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize) { bounds.min.bytes = maxShardSize / SERVER_KNOBS->SHARD_BYTES_RATIO; } - bounds.min.writeBytesPerKSecond = 0; + bounds.min.bytesWrittenPerKSecond = 0; bounds.min.iosPerKSecond = 0; bounds.min.bytesReadPerKSecond = 0; // The permitted error is 1/3 of the general-case minimum bytes (even in the special case where this is the last // shard) bounds.permittedError.bytes = bounds.max.bytes / SERVER_KNOBS->SHARD_BYTES_RATIO / 3; - bounds.permittedError.writeBytesPerKSecond = bounds.permittedError.infinity; + bounds.permittedError.bytesWrittenPerKSecond = bounds.permittedError.infinity; bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity; bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity; @@ -222,18 +222,18 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys, std::max(int64_t(bytes - (SERVER_KNOBS->MIN_SHARD_BYTES * 0.1)), (int64_t)0)); bounds.permittedError.bytes = bytes * 0.1; if (bandwidthStatus == BandwidthStatusNormal) { // Not high or low - bounds.max.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; - bounds.min.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; - bounds.permittedError.writeBytesPerKSecond = bounds.min.writeBytesPerKSecond / 4; + bounds.max.bytesWrittenPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; + bounds.min.bytesWrittenPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; + bounds.permittedError.bytesWrittenPerKSecond = bounds.min.bytesWrittenPerKSecond / 4; } else if (bandwidthStatus == BandwidthStatusHigh) { // > 10MB/sec for 100MB shard, proportionally lower // for smaller shard, > 200KB/sec no matter what - bounds.max.writeBytesPerKSecond = bounds.max.infinity; - bounds.min.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; - 
bounds.permittedError.writeBytesPerKSecond = bounds.min.writeBytesPerKSecond / 4; + bounds.max.bytesWrittenPerKSecond = bounds.max.infinity; + bounds.min.bytesWrittenPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC; + bounds.permittedError.bytesWrittenPerKSecond = bounds.min.bytesWrittenPerKSecond / 4; } else if (bandwidthStatus == BandwidthStatusLow) { // < 10KB/sec - bounds.max.writeBytesPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; - bounds.min.writeBytesPerKSecond = 0; - bounds.permittedError.writeBytesPerKSecond = bounds.max.writeBytesPerKSecond / 4; + bounds.max.bytesWrittenPerKSecond = SERVER_KNOBS->SHARD_MIN_BYTES_PER_KSEC; + bounds.min.bytesWrittenPerKSecond = 0; + bounds.permittedError.bytesWrittenPerKSecond = bounds.max.bytesWrittenPerKSecond / 4; } else { ASSERT(false); } @@ -306,12 +306,12 @@ ACTOR Future trackShardMetrics(DataDistributionTracker::SafeAccessor self, /*TraceEvent("ShardSizeUpdate") .detail("Keys", keys) .detail("UpdatedSize", metrics.metrics.bytes) - .detail("Bandwidth", metrics.metrics.writeBytesPerKSecond) + .detail("WriteBandwidth", metrics.metrics.bytesWrittenPerKSecond) .detail("BandwidthStatus", getBandwidthStatus(metrics)) .detail("BytesLower", bounds.min.bytes) .detail("BytesUpper", bounds.max.bytes) - .detail("BandwidthLower", bounds.min.writeBytesPerKSecond) - .detail("BandwidthUpper", bounds.max.writeBytesPerKSecond) + .detail("WriteBandwidthLower", bounds.min.bytesWrittenPerKSecond) + .detail("WriteBandwidthUpper", bounds.max.bytesWrittenPerKSecond) .detail("ShardSizePresent", shardSize->get().present()) .detail("OldShardSize", shardSize->get().present() ? shardSize->get().get().metrics.bytes : 0) .detail("TrackerID", trackerID);*/ @@ -879,7 +879,7 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, StorageMetrics splitMetrics; splitMetrics.bytes = shardBounds.max.bytes / 2; - splitMetrics.writeBytesPerKSecond = + splitMetrics.bytesWrittenPerKSecond = keys.begin >= keyServersKeys.begin ? 
splitMetrics.infinity : SERVER_KNOBS->SHARD_SPLIT_BYTES_PER_KSEC; splitMetrics.iosPerKSecond = splitMetrics.infinity; splitMetrics.bytesReadPerKSecond = splitMetrics.infinity; // Don't split by readBandwidth @@ -902,7 +902,7 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, bandwidthStatus == BandwidthStatusHigh ? "High" : bandwidthStatus == BandwidthStatusNormal ? "Normal" : "Low") - .detail("BytesPerKSec", metrics.writeBytesPerKSecond) + .detail("BytesWrittenPerKSec", metrics.bytesWrittenPerKSecond) .detail("NumShards", numShards); if (numShards > 1) { @@ -1203,7 +1203,7 @@ ACTOR Future shardTracker(DataDistributionTracker::SafeAccessor self, .detail("TrackerID", trackerID) .detail("MaxBytes", self()->maxShardSize->get().get()) .detail("ShardSize", shardSize->get().get().bytes) - .detail("BytesPerKSec", shardSize->get().get().writeBytesPerKSecond);*/ + .detail("BytesPerKSec", shardSize->get().get().bytesWrittenPerKSecond);*/ try { loop { diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 3382caa76b..313ba6baa2 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -709,7 +709,7 @@ struct DDMockTxnProcessorImpl { loop { wait(delayJittered(1.0)); DDMockTxnProcessor* selfP = self; - KeyRangeRef cloneRef; + KeyRangeRef cloneRef = range; if (std::all_of(ids.begin(), ids.end(), [selfP, cloneRef](const UID& id) { auto& server = selfP->mgs->allServers.at(id); return server.allShardStatusIn(cloneRef, { MockShardStatus::FETCHED, MockShardStatus::COMPLETED }); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 0732dec1da..c3d008218f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -56,12 +56,12 @@ ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() { return ShardSizeBounds{ .max = StorageMetrics{ .bytes = -1, - .writeBytesPerKSecond = StorageMetrics::infinity, + .bytesWrittenPerKSecond = 
StorageMetrics::infinity, .iosPerKSecond = StorageMetrics::infinity, .bytesReadPerKSecond = StorageMetrics::infinity }, - .min = StorageMetrics{ .bytes = -1, .writeBytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, + .min = StorageMetrics{ .bytes = -1, .bytesWrittenPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, .permittedError = StorageMetrics{ .bytes = -1, - .writeBytesPerKSecond = StorageMetrics::infinity, + .bytesWrittenPerKSecond = StorageMetrics::infinity, .iosPerKSecond = StorageMetrics::infinity, .bytesReadPerKSecond = StorageMetrics::infinity } }; diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index aabe9f379c..b3b2c05d15 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -320,7 +320,7 @@ Future MockStorageServer::run() { TraceEvent("MockStorageServerStart").detail("Address", ssi.address()); addActor(serveStorageMetricsRequests(this, ssi)); - addActor(MockStorageServerImpl::serveMockStorageServer(this)); + // addActor(MockStorageServerImpl::serveMockStorageServer(this)); return actors.getResult(); } @@ -411,13 +411,15 @@ void MockStorageServer::notifyWriteMetrics(KeyRef const& key, int64_t size) { // update write bandwidth and iops as mock the cost of writing a mutation StorageMetrics s; // FIXME: remove the / 2 and double the related knobs. 
- s.writeBytesPerKSecond = mvccStorageBytes(size) / 2; + s.bytesWrittenPerKSecond = mvccStorageBytes(size) / 2; s.iosPerKSecond = 1; metrics.notify(key, s); } void MockStorageServer::signalFetchKeys(const KeyRangeRef& range, int64_t rangeTotalBytes) { - fetchKeysRequests.send({ range, rangeTotalBytes }); + if (!allShardStatusEqual(range, MockShardStatus::COMPLETED)) { + actors.add(MockStorageServerImpl::waitFetchKeysFinish(this, { range, rangeTotalBytes })); + } } Future MockStorageServer::fetchKeys(const MockStorageServer::FetchKeysParams& param) { @@ -999,22 +1001,22 @@ TEST_CASE("/MockGlobalState/MockStorageServer/DataOpsSet") { // insert { - mgs->set("a"_sr, 1 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); - mgs->set("b"_sr, 2 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); - mgs->set("c"_sr, 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE, true); + mgs->set("a"_sr, 1 * SERVER_KNOBS->BYTES_WRITTEN_UNITS_PER_SAMPLE, true); + mgs->set("b"_sr, 2 * SERVER_KNOBS->BYTES_WRITTEN_UNITS_PER_SAMPLE, true); + mgs->set("c"_sr, 3 * SERVER_KNOBS->BYTES_WRITTEN_UNITS_PER_SAMPLE, true); for (auto& server : mgs->allServers) { - ASSERT_EQ(server.second.usedDiskSpace, 3 + 6 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE); + ASSERT_EQ(server.second.usedDiskSpace, 3 + 6 * SERVER_KNOBS->BYTES_WRITTEN_UNITS_PER_SAMPLE); } ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); std::pair, int> res = wait( mgs->waitStorageMetrics(KeyRangeRef("a"_sr, "bc"_sr), bounds.min, bounds.max, bounds.permittedError, 1, 1)); - int64_t testSize = 2 + 3 * SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE; + int64_t testSize = 2 + 3 * SERVER_KNOBS->BYTES_WRITTEN_UNITS_PER_SAMPLE; // SOMEDAY: how to integrate with isKeyValueInSample() better? 
if (res.first.get().bytes > 0) { // If sampled ASSERT_EQ(res.first.get().bytes, testSize); - ASSERT_GT(res.first.get().writeBytesPerKSecond, 0); + ASSERT_GT(res.first.get().bytesWrittenPerKSecond, 0); } } return Void(); diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp index ea314eff77..c947c46049 100644 --- a/fdbserver/StorageMetrics.actor.cpp +++ b/fdbserver/StorageMetrics.actor.cpp @@ -75,7 +75,7 @@ KeyRef StorageMetricSample::splitEstimate(KeyRangeRef range, int64_t offset, boo StorageMetrics StorageServerMetrics::getMetrics(KeyRangeRef const& keys) const { StorageMetrics result; result.bytes = byteSample.getEstimate(keys); - result.writeBytesPerKSecond = + result.bytesWrittenPerKSecond = bytesWriteSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; result.iosPerKSecond = iopsSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; result.bytesReadPerKSecond = @@ -88,7 +88,7 @@ StorageMetrics StorageServerMetrics::getMetrics(KeyRangeRef const& keys) const { void StorageServerMetrics::notify(KeyRef key, StorageMetrics& metrics) { ASSERT(metrics.bytes == 0); // ShardNotifyMetrics if (g_network->isSimulated()) { - CODE_PROBE(metrics.writeBytesPerKSecond != 0, "ShardNotifyMetrics bytes"); + CODE_PROBE(metrics.bytesWrittenPerKSecond != 0, "ShardNotifyMetrics bytes"); CODE_PROBE(metrics.iosPerKSecond != 0, "ShardNotifyMetrics ios"); CODE_PROBE(metrics.bytesReadPerKSecond != 0, "ShardNotifyMetrics bytesRead", probe::decoration::rare); } @@ -97,9 +97,10 @@ void StorageServerMetrics::notify(KeyRef key, StorageMetrics& metrics) { StorageMetrics notifyMetrics; - if (metrics.writeBytesPerKSecond) - notifyMetrics.writeBytesPerKSecond = bytesWriteSample.addAndExpire(key, metrics.writeBytesPerKSecond, expire) * - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; + if (metrics.bytesWrittenPerKSecond) + notifyMetrics.bytesWrittenPerKSecond = + 
bytesWriteSample.addAndExpire(key, metrics.bytesWrittenPerKSecond, expire) * + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; if (metrics.iosPerKSecond) notifyMetrics.iosPerKSecond = iopsSample.addAndExpire(key, metrics.iosPerKSecond, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; @@ -177,7 +178,7 @@ void StorageServerMetrics::notifyNotReadable(KeyRangeRef keys) { void StorageServerMetrics::poll() { { StorageMetrics m; - m.writeBytesPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; + m.bytesWrittenPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; bytesWriteSample.poll(waitMetricsMap, m); } { @@ -250,7 +251,7 @@ void StorageServerMetrics::splitMetrics(SplitMetricsRequest req) const { if (remaining.bytes < 2 * minSplitBytes) break; KeyRef key = req.keys.end; - bool hasUsed = used.bytes != 0 || used.writeBytesPerKSecond != 0 || used.iosPerKSecond != 0; + bool hasUsed = used.bytes != 0 || used.bytesWrittenPerKSecond != 0 || used.iosPerKSecond != 0; key = getSplitKey(remaining.bytes, estimated.bytes, req.limits.bytes, @@ -276,10 +277,10 @@ void StorageServerMetrics::splitMetrics(SplitMetricsRequest req) const { lastKey, key, hasUsed); - key = getSplitKey(remaining.writeBytesPerKSecond, - estimated.writeBytesPerKSecond, - req.limits.writeBytesPerKSecond, - used.writeBytesPerKSecond, + key = getSplitKey(remaining.bytesWrittenPerKSecond, + estimated.bytesWrittenPerKSecond, + req.limits.bytesWrittenPerKSecond, + used.bytesWrittenPerKSecond, req.limits.infinity, req.isLastShard, bytesWriteSample, @@ -328,12 +329,12 @@ void StorageServerMetrics::getStorageMetrics(GetStorageMetricsRequest req, rep.available.bytes = sb.available; rep.available.iosPerKSecond = 10e6; - rep.available.writeBytesPerKSecond = 100e9; + rep.available.bytesWrittenPerKSecond = 100e9; rep.available.bytesReadPerKSecond = 100e9; rep.capacity.bytes = sb.total; rep.capacity.iosPerKSecond = 10e6; - 
rep.capacity.writeBytesPerKSecond = 100e9; + rep.capacity.bytesWrittenPerKSecond = 100e9; rep.capacity.bytesReadPerKSecond = 100e9; rep.bytesInputRate = bytesInputRate; diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 14fd6b6334..3636411a57 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -83,6 +83,7 @@ public: } operator int() const { return (int)value; } constexpr static int8_t typeCount() { return (int)__COUNT; } + bool operator<(const RelocateReason& reason) { return (int)value < (int)reason.value; } private: Value value; diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index 0fb2ab3fa1..69024ea27b 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -86,7 +86,7 @@ struct StorageServerMetrics { StorageServerMetrics() : byteSample(0), iopsSample(SERVER_KNOBS->IOPS_UNITS_PER_SAMPLE), - bytesWriteSample(SERVER_KNOBS->BYTES_WRITE_UNITS_PER_SAMPLE), + bytesWriteSample(SERVER_KNOBS->BYTES_WRITTEN_UNITS_PER_SAMPLE), bytesReadSample(SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE) {} StorageMetrics getMetrics(KeyRangeRef const& keys) const; diff --git a/fdbserver/include/fdbserver/workloads/MockDDTest.h b/fdbserver/include/fdbserver/workloads/MockDDTest.h index ec7a449a6f..b14f65f7e2 100644 --- a/fdbserver/include/fdbserver/workloads/MockDDTest.h +++ b/fdbserver/include/fdbserver/workloads/MockDDTest.h @@ -39,8 +39,10 @@ struct MockDDTestWorkload : public TestWorkload { Reference mock; KeyRange getRandomRange(double offset) const; - MockDDTestWorkload(WorkloadContext const& wcx); Future setup(Database const& cx) override; + +protected: + MockDDTestWorkload(WorkloadContext const& wcx); }; #endif // FOUNDATIONDB_MOCKDDTEST_H diff --git a/fdbserver/storageserver.actor.cpp 
b/fdbserver/storageserver.actor.cpp index 30656166f2..2726709f90 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -2118,7 +2118,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { /* StorageMetrics m; - m.writeBytesPerKSecond = req.key.size() + (v.present() ? v.get().size() : 0); + m.bytesWrittenPerKSecond = req.key.size() + (v.present() ? v.get().size() : 0); m.iosPerKSecond = 1; data->metrics.notify(req.key, m); */ @@ -5793,7 +5793,7 @@ void applyMutation(StorageServer* self, // Clear split keys are added to arena StorageMetrics metrics; // FIXME: remove the / 2 and double the related knobs. - metrics.writeBytesPerKSecond = mvccStorageBytes(m) / 2; // comparable to counter.bytesInput / 2 + metrics.bytesWrittenPerKSecond = mvccStorageBytes(m) / 2; // comparable to counter.bytesInput / 2 metrics.iosPerKSecond = 1; self->metrics.notify(m.param1, metrics); @@ -10290,12 +10290,13 @@ ACTOR Future waitMetrics(StorageServerMetrics* self, WaitMetricsRequest re // all the messages for one clear or set have been dispatched. 
/*StorageMetrics m = getMetrics( data, req.keys ); - bool b = ( m.bytes != metrics.bytes || m.writeBytesPerKSecond != metrics.writeBytesPerKSecond + bool b = ( m.bytes != metrics.bytes || m.bytesWrittenPerKSecond != + metrics.bytesWrittenPerKSecond || m.iosPerKSecond != metrics.iosPerKSecond ); if (b) { printf("keys: '%s' - '%s' @%p\n", printable(req.keys.begin).c_str(), printable(req.keys.end).c_str(), this); printf("waitMetrics: desync %d (%lld %lld %lld) != (%lld %lld %lld); +(%lld %lld %lld)\n", - b, m.bytes, m.writeBytesPerKSecond, m.iosPerKSecond, metrics.bytes, - metrics.writeBytesPerKSecond, metrics.iosPerKSecond, c.bytes, c.writeBytesPerKSecond, + b, m.bytes, m.bytesWrittenPerKSecond, m.iosPerKSecond, metrics.bytes, + metrics.bytesWrittenPerKSecond, metrics.iosPerKSecond, c.bytes, c.bytesWrittenPerKSecond, c.iosPerKSecond); }*/ diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 2e364054a7..8b42750d3c 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -300,7 +300,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { wait(self->real->testRawFinishMovement(params, emptyTssMapping)); break; } catch (Error& e) { - if (e.code() != error_code_movekeys_conflict && e.code() != error_code_operation_failed) + if (e.code() != error_code_movekeys_conflict) throw; wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); // Keep trying to get the moveKeysLock @@ -327,12 +327,12 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { state MoveKeysLock lock = wait(takeMoveKeysLock(self->real->context(), UID())); KeyRange keys = self->getRandomKeys(); - std::vector destTeams = self->getRandomTeam(); - std::sort(destTeams.begin(), destTeams.end()); + std::vector destTeam = self->getRandomTeam(); + std::sort(destTeam.begin(), destTeam.end()); return MoveKeysParams{ 
deterministicRandom()->randomUniqueID(), keys, - destTeams, - destTeams, + destTeam, + destTeam, lock, Promise(), nullptr, @@ -365,7 +365,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { wait(self->real->moveKeys(params)); break; } catch (Error& e) { - if (e.code() != error_code_movekeys_conflict && e.code() != error_code_operation_failed) + if (e.code() != error_code_movekeys_conflict) throw; wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); // Keep trying to get the moveKeysLock diff --git a/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp b/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp index f0d7a542bb..9d007e7419 100644 --- a/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp +++ b/fdbserver/workloads/MockDDTrackerShardEvaluator.actor.cpp @@ -37,7 +37,7 @@ struct MockDDTrackerShardEvaluatorWorkload : public MockDDTestWorkload { uint64_t mockDbSize = 0; const int keySize = 16; - std::map rsReasonCounts; + std::map rsReasonCounts; // --- test configs --- @@ -133,7 +133,7 @@ struct MockDDTrackerShardEvaluatorWorkload : public MockDDTestWorkload { ACTOR static Future relocateShardReporter(MockDDTrackerShardEvaluatorWorkload* self, FutureStream input) { loop choose { - when(RelocateShard rs = waitNext(input)) { ++self->rsReasonCounts[(int)rs.reason]; } + when(RelocateShard rs = waitNext(input)) { ++self->rsReasonCounts[rs.reason]; } } } From 89b38624979546827003f6b68d9c62297bdf0d9e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 14 Nov 2022 13:11:48 -0800 Subject: [PATCH 45/57] fix randomKeyBetween bug --- fdbclient/FDBTypes.cpp | 7 +- fdbserver/MockGlobalState.actor.cpp | 64 +++++++++++-------- fdbserver/include/fdbserver/MockGlobalState.h | 4 -- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/fdbclient/FDBTypes.cpp b/fdbclient/FDBTypes.cpp index 9e9401df65..0fff0611e4 100644 --- a/fdbclient/FDBTypes.cpp +++ b/fdbclient/FDBTypes.cpp @@ -51,12 +51,17 @@ KeyRef keyBetween(const KeyRangeRef& keys) { } Key 
randomKeyBetween(const KeyRangeRef& keys) { + if (keys.empty() || keys.singleKeyRange()) { + return keys.end; + } + KeyRef begin = keys.begin; KeyRef end = keys.end; ASSERT(begin < end); if (begin.size() < end.size()) { // randomly append a char - uint8_t newChar = deterministicRandom()->randomInt(0, end[begin.size()] + 1); + uint8_t maxChar = end[begin.size()] > 0 ? end[begin.size()] : end[begin.size()] + 1; + uint8_t newChar = deterministicRandom()->randomInt(0, maxChar); return begin.withSuffix(StringRef(&newChar, 1)); } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index b3b2c05d15..b4bdaf2918 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -115,36 +115,50 @@ public: return Void(); } - ACTOR static Future serveMockStorageServer(MockStorageServer* self) { - state ActorCollection actors; - loop choose { - when(MockStorageServer::FetchKeysParams params = waitNext(self->fetchKeysRequests.getFuture())) { - if (!self->allShardStatusEqual(params.keys, MockShardStatus::COMPLETED)) { - actors.add(waitFetchKeysFinish(self, params)); - } - } - when(wait(actors.getResult())) { ASSERT(false); } - } - } + // Randomly generate keys and kv size between the fetch range, updating the byte sample. + // Once the fetchKeys return, the shard status will become FETCHED. ACTOR static Future waitFetchKeysFinish(MockStorageServer* self, MockStorageServer::FetchKeysParams params) { // between each chunk delay for random time, and finally set the fetchComplete signal. ASSERT(params.totalRangeBytes > 0); state int chunkCount = std::ceil(params.totalRangeBytes * 1.0 / SERVER_KNOBS->FETCH_BLOCK_BYTES); + state int64_t currentTotal = 0; state Key lastKey = params.keys.begin; state int i = 0; - for (; i < chunkCount; ++i) { + for (; i < chunkCount && currentTotal < params.totalRangeBytes; ++i) { wait(delayJittered(0.01)); - int remainBytes = (chunkCount == 1 ? 
params.totalRangeBytes : SERVER_KNOBS->FETCH_BLOCK_BYTES); + int remainedBytes = (chunkCount == 1 ? params.totalRangeBytes : SERVER_KNOBS->FETCH_BLOCK_BYTES); - while (remainBytes >= lastKey.size()) { - int maxSize = std::min(remainBytes, 130000) + 1; + while (remainedBytes >= lastKey.size()) { + Key nextKey; + // try 10 times + for (int j = 0; j < 10; j++) { + nextKey = randomKeyBetween(KeyRangeRef(lastKey, params.keys.end)); + if (nextKey < params.keys.end) + break; + } + + // NOTE: in this case, we accumulate the bytes on lastKey on purpose (shall we?) + if (nextKey == params.keys.end) { + auto bytes = params.totalRangeBytes - currentTotal; + self->byteSampleApplySet(lastKey, bytes); + self->usedDiskSpace += bytes; + currentTotal = params.totalRangeBytes; + TraceEvent(SevWarn, "MockFetchKeysInaccurateSample") + .detail("Range", params.keys) + .detail("LastKey", lastKey) + .detail("Size", bytes); + break; // break the most outside loop + } + + int maxSize = std::min(remainedBytes, 130000) + 1; int randomSize = deterministicRandom()->randomInt(lastKey.size(), maxSize); - self->usedDiskSpace += randomSize; + currentTotal += randomSize; + self->byteSampleApplySet(lastKey, randomSize); - remainBytes -= randomSize; - lastKey = randomKeyBetween(KeyRangeRef(lastKey, params.keys.end)); + remainedBytes -= randomSize; + lastKey = nextKey; } } @@ -214,16 +228,15 @@ void MockStorageServer::setShardStatus(const KeyRangeRef& range, MockShardStatus auto oldStatus = it.value().status; if (isStatusTransitionValid(oldStatus, status)) { it.value() = ShardInfo{ status, newSize }; - } else if (oldStatus == MockShardStatus::COMPLETED && + } else if ((oldStatus == MockShardStatus::COMPLETED || oldStatus == MockShardStatus::FETCHED) && (status == MockShardStatus::INFLIGHT || status == MockShardStatus::FETCHED)) { CODE_PROBE(true, "Shard already on server"); } else { - TraceEvent(SevError, "MockShardStatusTransitionError") + TraceEvent(SevError, "MockShardStatusTransitionError", id) 
.detail("From", oldStatus) .detail("To", status) - .detail("ID", id) - .detail("KeyBegin", range.begin.toHexString()) - .detail("KeyEnd", range.begin.toHexString()); + .detail("KeyBegin", range.begin) + .detail("KeyEnd", range.begin); } } serverKeys.coalesce(range); @@ -320,7 +333,6 @@ Future MockStorageServer::run() { TraceEvent("MockStorageServerStart").detail("Address", ssi.address()); addActor(serveStorageMetricsRequests(this, ssi)); - // addActor(MockStorageServerImpl::serveMockStorageServer(this)); return actors.getResult(); } @@ -422,10 +434,6 @@ void MockStorageServer::signalFetchKeys(const KeyRangeRef& range, int64_t rangeT } } -Future MockStorageServer::fetchKeys(const MockStorageServer::FetchKeysParams& param) { - return MockStorageServerImpl::waitFetchKeysFinish(this, param); -} - void MockStorageServer::byteSampleApplySet(KeyRef const& key, int64_t kvSize) { // Update byteSample in memory and notify waiting metrics ByteSampleInfo sampleInfo = isKeyValueInSample(key, kvSize); diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 4ea121697d..21be352e48 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -191,10 +191,6 @@ protected: // Update the storage metrics as if we write a k-v pair of `size` bytes. void notifyWriteMetrics(KeyRef const& key, int64_t size); - // Randomly generate keys and kv size between the fetch range, updating the byte sample. - // Once the fetchKeys return, the shard status will become FETCHED. 
- Future fetchKeys(const FetchKeysParams&); - // Update byte sample as if set a key value pair of which the size is kvSize void byteSampleApplySet(KeyRef const& key, int64_t kvSize); From a93eda244fdb3c41d8bcfb291137d3068cf52141 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 14 Nov 2022 16:09:31 -0800 Subject: [PATCH 46/57] sscanf on argument parsing --- fdbcli/TenantCommands.actor.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index daacb80fbd..0de598f75b 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -106,14 +106,18 @@ bool parseTenantListOptions(std::vector const& tokens, } value = token; if (tokencmp(param, "limit")) { - limit = std::stoi(value.get().toString()); - if (limit <= 0) { + int limit = 0; + int n = 0; + if (sscanf(value.get().toString().c_str(), "%d%n", &limit, &n) != 1 || n != value.get().size() || + limit <= 0) { fmt::print(stderr, "ERROR: invalid limit `{}'\n", token.toString().c_str()); return false; } } else if (tokencmp(param, "offset")) { - offset = std::stoi(value.get().toString()); - if (offset < 0) { + int offset = 0; + int n = 0; + if (sscanf(value.get().toString().c_str(), "%d%n", &offset, &n) != 1 || n != value.get().size() || + offset < 0) { fmt::print(stderr, "ERROR: invalid offset `{}'\n", token.toString().c_str()); return false; } From b699ba4c23a6681becfa91e7aa955e79980bb1b8 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 14 Nov 2022 14:25:54 -0800 Subject: [PATCH 47/57] Increase memtable and writebuffer size for rocksdb simulation test memtable and writebuffer size are too small in simulation, which causes thousands of sst files and at least 6 levels of ssts. Both make compaction slower in simulation and contribute to timeout errors.
After increasing the size, failure rate (timeout failures) when we only run rocksdb and sharded rocksdb engines in simulation drops from 10 out of 332339 tests to 10 out of 497532 tests. For apple dev who wants to look into the joshua details, before the change, joshua ensemble id is 20221111-223720-mengxudebugrocks-505ede1c55664ddf after the change, joshua ensemble id is 20221114-192042-mengxurocksdebugknobchange-1e4c047d112e9a38 --- fdbclient/ServerKnobs.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 431278ee22..af9639cf0a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -390,19 +390,22 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // If true, do not process and store RocksDB logs init( ROCKSDB_MUTE_LOGS, true ); // Use a smaller memtable in simulation to avoid OOMs. - int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024; + // TODO: change it to bigger value. it was 32K when timeout + int64_t memtableBytes = isSimulated ? 1024 * 1024 : 512 * 1024 * 1024; init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); init( ROCKSDB_LEVEL_STYLE_COMPACTION, true ); init( ROCKSDB_UNSAFE_AUTO_FSYNC, false ); init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 ); init( ROCKSDB_PREFIX_LEN, 0 ); // If rocksdb block cache size is 0, the default 8MB is used. - int64_t blockCacheSize = isSimulated ? 0 : 1024 * 1024 * 1024 /* 1GB */; + int64_t blockCacheSize = isSimulated ? 16 * 1024 * 1024 : 1024 * 1024 * 1024 /* 1GB */; init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); - init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 ); - init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 
5.0 : 200.0 ); + // // In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have + // very high load and single read thread cannot process all the load within the timeouts. + init( ROCKSDB_READ_VALUE_TIMEOUT, 200.0 ); if (isSimulated) ROCKSDB_READ_VALUE_TIMEOUT = 5 * 60; + init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 200.0 ); if (isSimulated) ROCKSDB_READ_VALUE_PREFIX_TIMEOUT = 5 * 60; + init( ROCKSDB_READ_RANGE_TIMEOUT, 200.0 ); if (isSimulated) ROCKSDB_READ_RANGE_TIMEOUT = 5 * 60; init( ROCKSDB_READ_QUEUE_WAIT, 1.0 ); init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 ); init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 ); From 68eb129c71329837934c5d82265298add723ff31 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 14 Nov 2022 16:17:49 -0800 Subject: [PATCH 48/57] RocksDB:Use knob to control readValueTimeout value in simulation --- fdbclient/ServerKnobs.cpp | 4 ++-- fdbserver/KeyValueStoreRocksDB.actor.cpp | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index af9639cf0a..8b32417aff 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -390,7 +390,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // If true, do not process and store RocksDB logs init( ROCKSDB_MUTE_LOGS, true ); // Use a smaller memtable in simulation to avoid OOMs. - // TODO: change it to bigger value. it was 32K when timeout int64_t memtableBytes = isSimulated ? 1024 * 1024 : 512 * 1024 * 1024; init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); init( ROCKSDB_LEVEL_STYLE_COMPACTION, true ); @@ -401,7 +400,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi int64_t blockCacheSize = isSimulated ? 
16 * 1024 * 1024 : 1024 * 1024 * 1024 /* 1GB */; init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - // // In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have + // ROCKSDB_READ_VALUE_TIMEOUT, ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, ROCKSDB_READ_RANGE_TIMEOUT knobs: + // In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have // very high load and single read thread cannot process all the load within the timeouts. init( ROCKSDB_READ_VALUE_TIMEOUT, 200.0 ); if (isSimulated) ROCKSDB_READ_VALUE_TIMEOUT = 5 * 60; init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 200.0 ); if (isSimulated) ROCKSDB_READ_VALUE_PREFIX_TIMEOUT = 5 * 60; diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index f2525b49a2..94e07b0798 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -1402,17 +1402,11 @@ struct RocksDBKeyValueStore : IKeyValueStore { ThreadReturnPromiseStream>* metricPromiseStream) : id(id), db(db), cf(cf), sharedState(sharedState), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), metricPromiseStream(metricPromiseStream), threadIndex(threadIndex) { - if (g_network->isSimulated()) { - // In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have - // very high load and single read thread cannot process all the load within the timeouts. 
- readValueTimeout = 5 * 60; - readValuePrefixTimeout = 5 * 60; - readRangeTimeout = 5 * 60; - } else { - readValueTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_TIMEOUT; - readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT; - readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT; - } + + readValueTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_TIMEOUT; + readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT; + readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT; + if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) { // Enable perf context on the same thread with the db thread rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex); From 2985b255fa508484c8758cc7038382386f102386 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 14 Nov 2022 16:39:13 -0800 Subject: [PATCH 49/57] remove unnecessary redeclaration --- fdbcli/TenantCommands.actor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index 0de598f75b..52cc94001d 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -106,7 +106,6 @@ bool parseTenantListOptions(std::vector const& tokens, } value = token; if (tokencmp(param, "limit")) { - int limit = 0; int n = 0; if (sscanf(value.get().toString().c_str(), "%d%n", &limit, &n) != 1 || n != value.get().size() || limit <= 0) { @@ -114,7 +113,6 @@ bool parseTenantListOptions(std::vector const& tokens, return false; } } else if (tokencmp(param, "offset")) { - int offset = 0; int n = 0; if (sscanf(value.get().toString().c_str(), "%d%n", &offset, &n) != 1 || n != value.get().size() || offset < 0) { From 2f53c6ebd8182f4f971a2b3559ff78a94e4ddbb1 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Tue, 15 Nov 2022 10:32:19 -0800 Subject: [PATCH 50/57] remove extra test check --- fdbcli/tests/fdbcli_tests.py | 2 +- fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp | 4 ---- 2 files changed, 1 insertion(+), 5 
deletions(-) diff --git a/fdbcli/tests/fdbcli_tests.py b/fdbcli/tests/fdbcli_tests.py index 3ba5e54da2..a647752123 100755 --- a/fdbcli/tests/fdbcli_tests.py +++ b/fdbcli/tests/fdbcli_tests.py @@ -793,7 +793,7 @@ def tenant_list(logger): assert output == 'ERROR: invalid offset `13y\'' output = run_fdbcli_command_and_get_error('tenant list a b state=14z') - assert output == 'ERROR: unrecognized tenant state(s) `14z\'' + assert output == 'ERROR: unrecognized tenant state(s) `14z\'.' @enable_logging() def tenant_get(logger): diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index b2ac5ab39b..56c988243f 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -404,10 +404,6 @@ struct MetaclusterManagementWorkload : TestWorkload { wait(store(checkEntry2, MetaclusterAPI::getTenant(self->managementDb, tenant)) && store(tenantList, MetaclusterAPI::listTenants(self->managementDb, ""_sr, "\xff\xff"_sr, 10e6, 0, filters))); - if (tenantList.empty()) { - ASSERT(checkEntry2.tenantState != checkState); - return Void(); - } bool found = false; for (auto pair : tenantList) { ASSERT(pair.second.tenantState == checkState); From 70129c85cb25f9bbeda8da50e49a39614eb32345 Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Tue, 15 Nov 2022 12:20:28 -0800 Subject: [PATCH 51/57] formatting --- fdbrpc/include/fdbrpc/Stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index c0564780bb..5cbb2cccd0 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -218,7 +218,7 @@ class LatencySample { public: LatencySample(std::string name, UID id, double loggingInterval, double accuracy) : name(name), id(id), sampleStart(now()), sketch(accuracy), - latencySampleEventHolder(makeReference(id.toString() + "/" + name)) { + 
latencySampleEventHolder(makeReference(id.toString() + "/" + name)) { assert(accuracy > 0); if (accuracy <= 0) { fmt::print(stderr, "ERROR: LatencySample {} has invalid accuracy ({})", name, accuracy); From 214db4d17ee8ec38bb7b4b865dc29d8c2a7b2ddb Mon Sep 17 00:00:00 2001 From: Sam Gwydir Date: Tue, 15 Nov 2022 13:38:55 -0800 Subject: [PATCH 52/57] formatting --- fdbserver/TLogServer.actor.cpp | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index d9796923e3..f85eea5bba 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1445,9 +1445,7 @@ ACTOR Future updateStorage(TLogData* self) { ACTOR Future updateStorageLoop(TLogData* self) { wait(delay(0, TaskPriority::UpdateStorage)); - loop { - wait(updateStorage(self)); - } + loop { wait(updateStorage(self)); } } void commitMessages(TLogData* self, @@ -1608,9 +1606,7 @@ ACTOR Future waitForMessagesForTag(Reference self, Tag reqTag, Ve // we want the caller to finish first, otherwise the data structure it is building might not be complete wait(delay(0.0)); } - when(wait(delay(timeout))) { - self->blockingPeekTimeouts += 1; - } + when(wait(delay(timeout))) { self->blockingPeekTimeouts += 1; } } return Void(); } @@ -2795,9 +2791,7 @@ ACTOR Future pullAsyncData(TLogData* self, while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait(r ? r->getMore(TaskPriority::TLogCommit) : Never())) { - break; - } + when(wait(r ? 
r->getMore(TaskPriority::TLogCommit) : Never())) { break; } when(wait(dbInfoChange)) { if (logData->logSystem->get()) { r = logData->logSystem->get()->peek(logData->logId, tagAt, endVersion, tags, true); @@ -3278,9 +3272,7 @@ ACTOR Future restorePersistentState(TLogData* self, choose { when(wait(updateStorage(self))) {} - when(wait(allRemoved)) { - throw worker_removed(); - } + when(wait(allRemoved)) { throw worker_removed(); } } } } else { @@ -3291,9 +3283,7 @@ ACTOR Future restorePersistentState(TLogData* self, } } } - when(wait(allRemoved)) { - throw worker_removed(); - } + when(wait(allRemoved)) { throw worker_removed(); } } } } catch (Error& e) { @@ -3639,9 +3629,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID)); } } - when(wait(error)) { - throw internal_error(); - } + when(wait(error)) { throw internal_error(); } when(wait(activeSharedChange)) { if (activeSharedTLog->get() == tlogId) { TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); From 8971b5907c946a9e384dedae3301f2c4031c647f Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 16 Nov 2022 13:18:40 -0800 Subject: [PATCH 53/57] add comments; mark some methods const --- fdbserver/DDTxnProcessor.actor.cpp | 1 + fdbserver/MockGlobalState.actor.cpp | 4 ++-- fdbserver/include/fdbserver/MockGlobalState.h | 4 ++-- fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp | 5 ++++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 313ba6baa2..9907f22784 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -984,6 +984,7 @@ ACTOR Future rawFinishMovement(std::shared_ptr mgs, // remove destination servers from source servers ASSERT_EQ(srcTeams.size(), 0); for (auto& id : srcTeams.front().servers) { + // the only caller moveKeys will always make sure the UID are sorted if 
(!std::binary_search(params.destinationTeam.begin(), params.destinationTeam.end(), id)) { mgs->allServers.at(id).removeShard(params.keys); } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index c94207975d..0959fa77b1 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -167,7 +167,7 @@ public: } }; -bool MockStorageServer::allShardStatusEqual(const KeyRangeRef& range, MockShardStatus status) { +bool MockStorageServer::allShardStatusEqual(const KeyRangeRef& range, MockShardStatus status) const { auto ranges = serverKeys.intersectingRanges(range); ASSERT(!ranges.empty()); // at least the range is allKeys @@ -178,7 +178,7 @@ bool MockStorageServer::allShardStatusEqual(const KeyRangeRef& range, MockShardS return true; } -bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::set& status) { +bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::set& status) const { auto ranges = serverKeys.intersectingRanges(range); ASSERT(!ranges.empty()); // at least the range is allKeys diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 21be352e48..4a4ff34fec 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -107,8 +107,8 @@ public: decltype(serverKeys)::Ranges getAllRanges() { return serverKeys.ranges(); } - bool allShardStatusEqual(const KeyRangeRef& range, MockShardStatus status); - bool allShardStatusIn(const KeyRangeRef& range, const std::set& status); + bool allShardStatusEqual(const KeyRangeRef& range, MockShardStatus status) const; + bool allShardStatusIn(const KeyRangeRef& range, const std::set& status) const; // change the status of range. This function may result in split to make the shard boundary align with range.begin // and range.end. 
In this case, if restrictSize==true, the sum of the split shard size is strictly equal to the old diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 5518cefa62..80248b09b4 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -319,7 +319,10 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { verifyInitDataEqual(self->realInitDD, mockInitData); TraceEvent(SevDebug, relocateShardInterval.end(), relocateShardInterval.pairID); - self->mock->setupMockGlobalState(self->realInitDD); // in case SS remove or recruit + // The simulator have chances generating a scenario when after the first setupMockGlobalState call, there is a + // new storage server join the cluster, there's no way for mock DD to know the new storage server without + // calling setupMockGlobalState again. + self->mock->setupMockGlobalState(self->realInitDD); return Void(); } From c6ebdd8ae8037815991e4e79ed2a53b8cd38d9cc Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 16 Nov 2022 15:36:55 -0800 Subject: [PATCH 54/57] Send error when LogRouterPeekPopped happens Otherwise, the remote tlog won't get a response and the parallel peek requests will never be cleared, blocking subsequent peeks. As a result, remote tlog will no longer be able to pop the log router, which in turn can no longer peek tlogs. The whole remote side will become blocked. 
--- fdbserver/LogRouter.actor.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 51796f9fc0..6309b47094 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -533,14 +533,19 @@ Future logRouterPeekMessages(PromiseType replyPromise, // kills logRouterPeekStream actor, otherwise that actor becomes stuck throw operation_obsolete(); } - replyPromise.send(Never()); - if (reqSequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); - } + if (std::is_same>::value) { + replyPromise.sendError(operation_obsolete()); + } else { + replyPromise.send(Never()); } + + /*if (reqSequence.present()) { + auto& trackerData = self->peekTracker[peekId]; + auto& sequenceData = trackerData.sequence_version[sequence + 1]; + if (!sequenceData.isSet()) { + sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); + } + }*/ return Void(); } From f285a91f6c440c26d4490bfc34545e6122640de8 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 17 Nov 2022 11:36:36 -0800 Subject: [PATCH 55/57] Add more debug events --- fdbserver/LogRouter.actor.cpp | 35 +++++++++------ fdbserver/LogSystemPeekCursor.actor.cpp | 58 ++++++++++++++++++++----- fdbserver/TLogServer.actor.cpp | 3 ++ 3 files changed, 72 insertions(+), 24 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 6309b47094..3c0cc68a70 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -28,6 +28,7 @@ #include "flow/ActorCollection.h" #include "flow/Arena.h" #include "flow/Histogram.h" +#include "flow/Trace.h" #include "flow/network.h" #include "flow/DebugTrace.h" #include "flow/actorcompiler.h" // This must be the last #include. 
@@ -448,6 +449,14 @@ Future logRouterPeekMessages(PromiseType replyPromise, state int sequence = -1; state UID peekId; + DebugLogTraceEvent("LogRouterPeek0", self->dbgid) + .detail("ReturnIfBlocked", reqReturnIfBlocked) + .detail("Tag", reqTag.toString()) + .detail("Seq", reqSequence.present() ? reqSequence.get().second : -1) + .detail("SeqCursor", reqSequence.present() ? reqSequence.get().first : UID()) + .detail("Ver", self->version.get()) + .detail("Begin", reqBegin); + if (reqSequence.present()) { try { peekId = reqSequence.get().first; @@ -481,6 +490,13 @@ Future logRouterPeekMessages(PromiseType replyPromise, reqOnlySpilled = prevPeekData.second; wait(yield()); } catch (Error& e) { + DebugLogTraceEvent("LogRouterPeekError", self->dbgid) + .error(e) + .detail("Tag", reqTag.toString()) + .detail("Seq", reqSequence.present() ? reqSequence.get().second : -1) + .detail("SeqCursor", reqSequence.present() ? reqSequence.get().first : UID()) + .detail("Begin", reqBegin); + if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { replyPromise.sendError(e); return Void(); @@ -490,12 +506,6 @@ Future logRouterPeekMessages(PromiseType replyPromise, } } - DebugLogTraceEvent("LogRouterPeek0", self->dbgid) - .detail("ReturnIfBlocked", reqReturnIfBlocked) - .detail("Tag", reqTag.toString()) - .detail("Ver", self->version.get()) - .detail("Begin", reqBegin); - if (reqReturnIfBlocked && self->version.get() < reqBegin) { replyPromise.sendError(end_of_stream()); if (reqSequence.present()) { @@ -528,24 +538,22 @@ Future logRouterPeekMessages(PromiseType replyPromise, TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid) .detail("Begin", reqBegin) .detail("Popped", poppedVer) + .detail("Tag", reqTag.toString()) + .detail("Seq", reqSequence.present() ? reqSequence.get().second : -1) + .detail("SeqCursor", reqSequence.present() ? 
reqSequence.get().first : UID()) .detail("Start", self->startVersion); if (std::is_same>::value) { // kills logRouterPeekStream actor, otherwise that actor becomes stuck throw operation_obsolete(); } if (std::is_same>::value) { + // Send error to avoid a race condition that the peer is really retrying, + // otherwise, the peer could be blocked forever. replyPromise.sendError(operation_obsolete()); } else { replyPromise.send(Never()); } - /*if (reqSequence.present()) { - auto& trackerData = self->peekTracker[peekId]; - auto& sequenceData = trackerData.sequence_version[sequence + 1]; - if (!sequenceData.isSet()) { - sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled)); - } - }*/ return Void(); } @@ -686,6 +694,7 @@ ACTOR Future logRouterPop(LogRouterData* self, TLogPopRequest req) { if (!tagData) { tagData = self->createTagData(req.tag, req.to, req.durableKnownCommittedVersion); } else if (req.to > tagData->popped) { + DebugLogTraceEvent("LogRouterPop", self->dbgid).detail("Tag", req.tag.toString()).detail("PopVersion", req.to); tagData->popped = req.to; tagData->durableKnownCommittedVersion = req.durableKnownCommittedVersion; wait(tagData->eraseMessagesBefore(req.to, self, TaskPriority::TLogPop)); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 4a0164aac7..d636e71d1f 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -62,6 +62,8 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(Referenceresults.minKnownCommittedVersion = 0; DebugLogTraceEvent(SevDebug, "SPC_Starting", randomID) .detail("Tag", tag.toString()) + .detail("Parallel", parallelGetMore) + .detail("Interf", interf && interf->get().present() ? 
interf->get().id() : UID()) .detail("UsePeekStream", usePeekStream) .detail("Begin", begin) .detail("End", end); @@ -111,7 +113,9 @@ bool ILogSystem::ServerPeekCursor::hasMessage() const { } void ILogSystem::ServerPeekCursor::nextMessage() { - //TraceEvent("SPC_NextMessage", randomID).detail("MessageVersion", messageVersion.toString()); + DebugLogTraceEvent("SPC_NextMessage", randomID) + .detail("Tag", tag.toString()) + .detail("MessageVersion", messageVersion.toString()); ASSERT(hasMsg); if (rd.empty()) { messageVersion.reset(std::min(results.end, end.version)); @@ -143,11 +147,13 @@ void ILogSystem::ServerPeekCursor::nextMessage() { rd.rewind(); rd.readBytes(messageAndTags.getHeaderSize()); hasMsg = true; - //TraceEvent("SPC_NextMessageB", randomID).detail("MessageVersion", messageVersion.toString()); + DebugLogTraceEvent("SPC_NextMessageB", randomID) + .detail("Tag", tag.toString()) + .detail("MessageVersion", messageVersion.toString()); } StringRef ILogSystem::ServerPeekCursor::getMessage() { - //TraceEvent("SPC_GetMessage", randomID); + DebugLogTraceEvent("SPC_GetMessage", randomID).detail("Tag", tag.toString()); StringRef message = messageAndTags.getMessageWithoutTags(); rd.readBytes(message.size()); // Consumes the message. return message; @@ -260,6 +266,14 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, } loop { + DebugLogTraceEvent("SPC_GetMoreP", self->randomID) + .detail("Tag", self->tag.toString()) + .detail("Has", self->hasMessage()) + .detail("Begin", self->messageVersion.version) + .detail("Parallel", self->parallelGetMore) + .detail("Seq", self->sequence) + .detail("Sizes", self->futureResults.size()) + .detail("Interf", self->interf->get().present() ? 
self->interf->get().id() : UID()); state Version expectedBegin = self->messageVersion.version; try { if (self->parallelGetMore || self->onlySpilled) { @@ -294,7 +308,12 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, expectedBegin = res.end; self->futureResults.pop_front(); updateCursorWithReply(self, res); - //TraceEvent("SPC_GetMoreB", self->randomID).detail("Has", self->hasMessage()).detail("End", res.end).detail("Popped", res.popped.present() ? res.popped.get() : 0); + DebugLogTraceEvent("SPC_GetMoreReply", self->randomID) + .detail("Has", self->hasMessage()) + .detail("Tag", self->tag.toString()) + .detail("End", res.end) + .detail("Size", self->futureResults.size()) + .detail("Popped", res.popped.present() ? res.popped.get() : 0); return Void(); } when(wait(self->interfaceChanged)) { @@ -306,11 +325,17 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, } } } catch (Error& e) { + DebugLogTraceEvent("PeekCursorError", self->randomID) + .error(e) + .detail("Tag", self->tag.toString()) + .detail("Begin", self->messageVersion.version) + .detail("Interf", self->interf->get().present() ? self->interf->get().id() : UID()); + if (e.code() == error_code_end_of_stream) { self->end.reset(self->messageVersion.version); return Void(); } else if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) { - TraceEvent("PeekCursorTimedOut", self->randomID).error(e); + TraceEvent ev("PeekCursorTimedOut", self->randomID); // We *should* never get timed_out(), as it means the TLog got stuck while handling a parallel peek, // and thus we've likely just wasted 10min. 
// timed_out() is sent by cleanupPeekTrackers as value PEEK_TRACKER_EXPIRATION_TIME @@ -326,6 +351,11 @@ ACTOR Future serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, self->randomID = deterministicRandom()->randomUniqueID(); self->sequence = 0; self->futureResults.clear(); + ev.error(e) + .detail("Tag", self->tag.toString()) + .detail("Begin", self->messageVersion.version) + .detail("NewID", self->randomID) + .detail("Interf", self->interf->get().present() ? self->interf->get().id() : UID()); } else { throw e; } @@ -415,7 +445,11 @@ ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri taskID)) : Never())) { updateCursorWithReply(self, res); - //TraceEvent("SPC_GetMoreB", self->randomID).detail("Has", self->hasMessage()).detail("End", res.end).detail("Popped", res.popped.present() ? res.popped.get() : 0); + DebugLogTraceEvent("SPC_GetMoreB", self->randomID) + .detail("Tag", self->tag.toString()) + .detail("Has", self->hasMessage()) + .detail("End", res.end) + .detail("Popped", res.popped.present() ? 
res.popped.get() : 0); return Void(); } when(wait(self->interf->onChange())) { self->onlySpilled = false; } @@ -431,11 +465,13 @@ ACTOR Future serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri } Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { - // TraceEvent("SPC_GetMore", randomID) - // .detail("HasMessage", hasMessage()) - // .detail("More", !more.isValid() || more.isReady()) - // .detail("MessageVersion", messageVersion.toString()) - // .detail("End", end.toString()); + DebugLogTraceEvent("SPC_GetMore", randomID) + .detail("Tag", tag.toString()) + .detail("HasMessage", hasMessage()) + .detail("More", !more.isValid() || more.isReady()) + .detail("Parallel", parallelGetMore) + .detail("MessageVersion", messageVersion.toString()) + .detail("End", end.toString()); if (hasMessage() && !parallelGetMore) return Void(); if (!more.isValid() || more.isReady()) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 428ebc79a8..793f01628e 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -2187,6 +2187,9 @@ ACTOR Future doQueueCommit(TLogData* self, if (logData->logSystem->get() && (!logData->isPrimary || logData->logRouterPoppedVersion < logData->logRouterPopToVersion)) { logData->logRouterPoppedVersion = ver; + DebugLogTraceEvent("LogPop", self->dbgid) + .detail("Tag", logData->remoteTag.toString()) + .detail("Version", knownCommittedVersion); logData->logSystem->get()->pop(ver, logData->remoteTag, knownCommittedVersion, logData->locality); } From d865e77f062f6aab9458b4e229400956cbe8c2ff Mon Sep 17 00:00:00 2001 From: neethuhaneesha Date: Thu, 17 Nov 2022 15:39:22 -0800 Subject: [PATCH 56/57] RocksDB 7.7.3 version upgrade (#8858) --- cmake/CompileRocksDB.cmake | 6 +++--- fdbserver/KeyValueStoreRocksDB.actor.cpp | 9 +++------ fdbserver/KeyValueStoreShardedRocksDB.actor.cpp | 9 +++------ fdbserver/RocksDBCheckpointUtils.actor.cpp | 6 +++--- 4 files changed, 12 insertions(+), 18 
deletions(-) diff --git a/cmake/CompileRocksDB.cmake b/cmake/CompileRocksDB.cmake index 3fdea389ab..f257443c80 100644 --- a/cmake/CompileRocksDB.cmake +++ b/cmake/CompileRocksDB.cmake @@ -1,6 +1,6 @@ # FindRocksDB -find_package(RocksDB 6.27.3) +find_package(RocksDB 7.7.3) include(ExternalProject) @@ -49,8 +49,8 @@ if(ROCKSDB_FOUND) ${BINARY_DIR}/librocksdb.a) else() ExternalProject_Add(rocksdb - URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz - URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 + URL https://github.com/facebook/rocksdb/archive/refs/tags/v7.7.3.tar.gz + URL_HASH SHA256=b8ac9784a342b2e314c821f6d701148912215666ac5e9bdbccd93cf3767cb611 CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index aaaf8dd807..510dd5029b 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -68,12 +68,9 @@ #ifdef SSD_ROCKSDB_EXPERIMENTAL -// Enforcing rocksdb version to be 6.27.3 or greater. -static_assert(ROCKSDB_MAJOR >= 6, "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert(ROCKSDB_MAJOR == 6 ? ROCKSDB_MINOR >= 27 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); +// Enforcing rocksdb version to be 7.7.3. +static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3), + "Unsupported rocksdb version. 
Update the rocksdb to 7.7.3 version"); namespace { using rocksdb::BackgroundErrorReason; diff --git a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp index 53edcc1d95..83f99fa3ab 100644 --- a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp @@ -41,12 +41,9 @@ #ifdef SSD_ROCKSDB_EXPERIMENTAL -// Enforcing rocksdb version to be 6.27.3 or greater. -static_assert(ROCKSDB_MAJOR >= 6, "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert(ROCKSDB_MAJOR == 6 ? ROCKSDB_MINOR >= 27 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); +// Enforcing rocksdb version to be 7.7.3. +static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3), + "Unsupported rocksdb version. Update the rocksdb to 7.7.3 version"); const std::string rocksDataFolderSuffix = "-data"; const std::string METADATA_SHARD_ID = "kvs-metadata"; diff --git a/fdbserver/RocksDBCheckpointUtils.actor.cpp b/fdbserver/RocksDBCheckpointUtils.actor.cpp index 53f41085e8..006a67aefc 100644 --- a/fdbserver/RocksDBCheckpointUtils.actor.cpp +++ b/fdbserver/RocksDBCheckpointUtils.actor.cpp @@ -43,9 +43,9 @@ #include "flow/actorcompiler.h" // has to be last include #ifdef SSD_ROCKSDB_EXPERIMENTAL -// Enforcing rocksdb version to be 6.22.1 or greater. -static_assert(ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR >= 22 && ROCKSDB_PATCH >= 1, - "Unsupported rocksdb version. Update the rocksdb to at least 6.22.1 version"); +// Enforcing rocksdb version to be 7.7.3. +static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3), + "Unsupported rocksdb version. 
Update the rocksdb to 7.7.3 version"); namespace { From f6612ebd00d3391d2a675890470e54be11c97cfd Mon Sep 17 00:00:00 2001 From: Zhe Wu Date: Thu, 17 Nov 2022 10:56:50 -0800 Subject: [PATCH 57/57] When selecting physical shard and it is not available for the remote team, retry selecting a new dst physical shard --- fdbserver/DDRelocationQueue.actor.cpp | 14 ++++++++++---- fdbserver/DDShardTracker.actor.cpp | 12 +++++++----- .../include/fdbserver/DataDistribution.actor.h | 12 ++++++------ 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index ea5eef4848..0716a3d86f 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -1548,14 +1548,20 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, if (enableShardMove && tciIndex == 1) { ASSERT(physicalShardIDCandidate != UID().first() && physicalShardIDCandidate != anonymousShardId.first()); - Optional remoteTeamWithPhysicalShard = + std::pair, bool> remoteTeamWithPhysicalShard = self->physicalShardCollection->tryGetAvailableRemoteTeamWith( physicalShardIDCandidate, metrics, debugID); - // TODO: when we know that `physicalShardIDCandidate` exists, remote team must also exists. - if (remoteTeamWithPhysicalShard.present()) { + if (!remoteTeamWithPhysicalShard.second) { + // Physical shard with `physicalShardIDCandidate` is not available. Retry selecting new + // dst physical shard. 
+ self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++; + foundTeams = false; + break; + } + if (remoteTeamWithPhysicalShard.first.present()) { // Exists a remoteTeam in the mapping that has the physicalShardIDCandidate // use the remoteTeam with the physicalShard as the bestTeam - req = GetTeamRequest(remoteTeamWithPhysicalShard.get().servers); + req = GetTeamRequest(remoteTeamWithPhysicalShard.first.get().servers); } } diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index fddbf25f14..69bb6c853a 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -1756,7 +1756,7 @@ InOverSizePhysicalShard PhysicalShardCollection::isInOverSizePhysicalShard(KeyRa } // May return a problematic remote team -Optional PhysicalShardCollection::tryGetAvailableRemoteTeamWith( +std::pair, bool> PhysicalShardCollection::tryGetAvailableRemoteTeamWith( uint64_t inputPhysicalShardID, StorageMetrics const& moveInMetrics, uint64_t debugID) { @@ -1764,10 +1764,10 @@ Optional PhysicalShardCollection::tryGetAvail ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD); ASSERT(inputPhysicalShardID != anonymousShardId.first() && inputPhysicalShardID != UID().first()); if (physicalShardInstances.count(inputPhysicalShardID) == 0) { - return Optional(); + return { Optional(), true }; } if (!checkPhysicalShardAvailable(inputPhysicalShardID, moveInMetrics)) { - return Optional(); + return { Optional(), false }; } for (auto team : physicalShardInstances[inputPhysicalShardID].teams) { if (team.primary == false) { @@ -1777,10 +1777,12 @@ Optional PhysicalShardCollection::tryGetAvail .detail("TeamSize", team.servers.size()) .detail("PhysicalShardsOfTeam", convertIDsToString(teamPhysicalShardIDs[team])) .detail("DebugID", debugID);*/ - return team; + return { team, true }; } } - UNREACHABLE(); + // In this case, the physical shard may not be populated in the remote region yet, e.g., we are making a + // 
configuration change to turn a single region cluster into HA mode. + return { Optional(), true }; } // The update of PhysicalShardToTeams, Collection, keyRangePhysicalShardIDMap should be atomic diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 9054ab55a3..40143c3109 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -284,12 +284,12 @@ public: const std::unordered_set& excludedPhysicalShards, uint64_t debugID); - // Step 2: get a remote team which has the input physical shard - // Return empty if no such remote team - // May return a problematic remote team, and re-selection is required for this case - Optional tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID, - StorageMetrics const& moveInMetrics, - uint64_t debugID); + // Step 2: get a remote team which has the input physical shard. + // Second field in the returned pair indicates whether this physical shard is available or not. + // Return empty if no such remote team. + // May return a problematic remote team, and re-selection is required for this case. + std::pair, bool> + tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID, StorageMetrics const& moveInMetrics, uint64_t debugID); // Invariant: // (1) If forceToUseNewPhysicalShard is set, use the bestTeams selected by getTeam(), and create a new physical // shard for the teams