Merge branch 'main' of https://github.com/apple/foundationdb into readaware

Xiaoxi Wang 2022-03-03 11:15:18 -08:00
commit a28b0f4361
147 changed files with 3979 additions and 2520 deletions

View File

@ -22,7 +22,7 @@ Contributing to FoundationDB can be in contributions to the code base, sharing y
### Binary downloads
Developers interested in using FoundationDB can get started by downloading and installing a binary package. Please see the [downloads page](https://www.foundationdb.org/download/) for a list of available packages.
Developers interested in using FoundationDB can get started by downloading and installing a binary package. Please see the [downloads page](https://github.com/apple/foundationdb/releases) for a list of available packages.
### Compiling from source
@ -181,4 +181,4 @@ Under Windows, only Visual Studio with ClangCl is supported
1. `mkdir build && cd build`
1. `cmake -G "Visual Studio 16 2019" -A x64 -T ClangCl <PATH_TO_FOUNDATIONDB_SOURCE>`
1. `msbuild /p:Configuration=Release foundationdb.sln`
1. To increase build performance, use `/p:UseMultiToolTask=true` and `/p:CL_MPCount=<NUMBER_OF_PARALLEL_JOBS>`
1. To increase build performance, use `/p:UseMultiToolTask=true` and `/p:CL_MPCount=<NUMBER_OF_PARALLEL_JOBS>`

View File

@ -18,6 +18,8 @@ endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
set(cpu "aarch64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le|powerpc64le)")
set(cpu "ppc64le")
endif()
set(IS_ARM_MAC NO)
@ -49,7 +51,7 @@ endif()
add_dependencies(fdb_c fdb_c_generated fdb_c_options)
add_dependencies(fdbclient fdb_c_options)
add_dependencies(fdbclient_sampling fdb_c_options)
target_link_libraries(fdb_c PUBLIC $<BUILD_INTERFACE:fdbclient>)
target_link_libraries(fdb_c PRIVATE $<BUILD_INTERFACE:fdbclient>)
if(APPLE)
set(symbols ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.symbols)
add_custom_command(OUTPUT ${symbols}
@ -121,9 +123,9 @@ if(NOT WIN32 AND NOT IS_ARM_MAC)
strip_debug_symbols(fdb_c_ryw_benchmark)
strip_debug_symbols(fdb_c_txn_size_test)
endif()
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c)
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads)
add_dependencies(fdb_c_setup_tests doctest)
add_dependencies(fdb_c_unit_tests doctest)
@ -134,14 +136,14 @@ if(NOT WIN32 AND NOT IS_ARM_MAC)
target_include_directories(fdb_c_unit_tests_version_510 PUBLIC ${DOCTEST_INCLUDE_DIR})
target_include_directories(disconnected_timeout_unit_tests PUBLIC ${DOCTEST_INCLUDE_DIR})
target_link_libraries(fdb_c_setup_tests PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_unit_tests PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_unit_tests PRIVATE fdb_c Threads::Threads fdbclient)
target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow)
target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
target_link_libraries(mako PRIVATE fdb_c)
target_link_libraries(mako PRIVATE fdb_c fdbclient)
if(NOT OPEN_FOR_IDE)
# Make sure that fdb_c.h is compatible with c90

View File

@ -59,9 +59,10 @@ def write_windows_asm(asmfile, functions):
def write_unix_asm(asmfile, functions, prefix):
if cpu != "aarch64":
if cpu != "aarch64" and cpu!= "ppc64le":
asmfile.write(".intel_syntax noprefix\n")
i = 0
if os == 'linux' or os == 'freebsd':
asmfile.write("\n.data\n")
for f in functions:
@ -70,8 +71,13 @@ def write_unix_asm(asmfile, functions, prefix):
if os == 'linux' or os == 'freebsd':
asmfile.write("\n.text\n")
for f in functions:
if cpu == "ppc64le":
asmfile.write("\n.LC%d:\n" % (i))
asmfile.write("\t.quad \tfdb_api_ptr_%s\n" % (f))
asmfile.write("\t.align 2\n")
i = i + 1
asmfile.write("\t.global %s\n\t.type %s, @function\n" % (f, f))
i = 0
for f in functions:
asmfile.write("\n.globl %s%s\n" % (prefix, f))
if cpu == 'aarch64' and os == 'osx':
@ -118,6 +124,46 @@ def write_unix_asm(asmfile, functions, prefix):
assert False, '{} not supported for Arm yet'.format(os)
asmfile.write("\tldr x8, [x8]\n")
asmfile.write("\tbr x8\n")
elif cpu == "ppc64le":
asmfile.write("\n.LCF%d:\n" % (i))
asmfile.write("\taddis 2,12,.TOC.-.LCF%d@ha\n" % (i))
asmfile.write("\taddi 2,2,.TOC.-.LCF%d@l\n" % (i))
asmfile.write("\tmflr 0\n")
asmfile.write("\tstd 31, -8(1)\n")
asmfile.write("\tstd 0,16(1)\n")
asmfile.write("\tstdu 1,-192(1)\n")
#asmfile.write("\tstd 2,24(1)\n")
asmfile.write("\taddis 11,2,.LC%d@toc@ha\n" % (i))
asmfile.write("\tld 11,.LC%d@toc@l(11)\n" % (i))
asmfile.write("\tld 12,0(11)\n")
asmfile.write("\tstd 2,24(1)\n")
asmfile.write("\tlwa 11,344(1)\n")
asmfile.write("\tmtctr 12\n")
asmfile.write("\tstd 11,152(1)\n")
asmfile.write("\tlwa 11,352(1)\n")
asmfile.write("\tstd 11,160(1)\n")
asmfile.write("\tlwa 11,336(1)\n")
asmfile.write("\tstd 11,144(1)\n")
asmfile.write("\tlwa 11,328(1)\n")
asmfile.write("\tstd 11,136(1)\n")
asmfile.write("\tlwa 11,320(1)\n")
asmfile.write("\tstd 11,128(1)\n")
asmfile.write("\tlwa 11,312(1)\n")
asmfile.write("\tstd 11,120(1)\n")
asmfile.write("\tlwa 11,304(1)\n")
asmfile.write("\tstd 11,112(1)\n")
asmfile.write("\tld 11,296(1)\n")
asmfile.write("\tstd 11,104(1)\n")
asmfile.write("\tlwa 11,288(1)\n")
asmfile.write("\tstd 11,96(1)\n")
asmfile.write("\tbctrl\n")
asmfile.write("\tld 2,24(1)\n")
asmfile.write("\taddi 1,1,192\n")
asmfile.write("\tld 0,16(1)\n")
asmfile.write("\tld 31, -8(1)\n")
asmfile.write("\tmtlr 0\n")
asmfile.write("\tblr\n")
i = i + 1
else:
asmfile.write(
"\tmov r11, qword ptr [%sfdb_api_ptr_%s@GOTPCREL+rip]\n" % (prefix, f))

View File

@ -1,17 +1,18 @@
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#if defined(__linux__)
#include <linux/limits.h>
@ -615,7 +616,7 @@ int64_t granule_start_load(const char* filename,
// don't seek if offset == 0
if (offset && fseek(fp, offset, SEEK_SET)) {
// if fseek was non-zero, it failed
fprintf(stderr, "ERROR: BG could not seek to %ld in file %s\n", offset, full_fname);
fprintf(stderr, "ERROR: BG could not seek to %" PRId64 " in file %s\n", offset, full_fname);
fclose(fp);
return -1;
}
@ -625,7 +626,7 @@ int64_t granule_start_load(const char* filename,
fclose(fp);
if (readSize != length) {
fprintf(stderr, "ERROR: BG could not read %ld bytes from file: %s\n", length, full_fname);
fprintf(stderr, "ERROR: BG could not read %" PRId64 " bytes from file: %s\n", length, full_fname);
return -1;
}
@ -636,7 +637,7 @@ int64_t granule_start_load(const char* filename,
uint8_t* granule_get_load(int64_t loadId, void* userContext) {
BGLocalFileContext* context = (BGLocalFileContext*)userContext;
if (context->data_by_id[loadId] == 0) {
fprintf(stderr, "ERROR: BG loadId invalid for get_load: %ld\n", loadId);
fprintf(stderr, "ERROR: BG loadId invalid for get_load: %" PRId64 "\n", loadId);
return 0;
}
return context->data_by_id[loadId];
@ -645,7 +646,7 @@ uint8_t* granule_get_load(int64_t loadId, void* userContext) {
void granule_free_load(int64_t loadId, void* userContext) {
BGLocalFileContext* context = (BGLocalFileContext*)userContext;
if (context->data_by_id[loadId] == 0) {
fprintf(stderr, "ERROR: BG loadId invalid for free_load: %ld\n", loadId);
fprintf(stderr, "ERROR: BG loadId invalid for free_load: %" PRId64 "\n", loadId);
}
free(context->data_by_id[loadId]);
context->data_by_id[loadId] = 0;
@ -1119,7 +1120,7 @@ int run_workload(FDBTransaction* transaction,
if (tracetimer == dotrace) {
fdb_error_t err;
tracetimer = 0;
snprintf(traceid, 32, "makotrace%019ld", total_xacts);
snprintf(traceid, 32, "makotrace%019" PRId64, total_xacts);
fprintf(debugme, "DEBUG: txn tracing %s\n", traceid);
err = fdb_transaction_set_option(transaction,
FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER,
@ -1283,7 +1284,7 @@ void* worker_thread(void* thread_args) {
}
fprintf(debugme,
"DEBUG: worker_id:%d (%d) thread_id:%d (%d) database_index:%lu (tid:%lu)\n",
"DEBUG: worker_id:%d (%d) thread_id:%d (%d) database_index:%lu (tid:%" PRIu64 ")\n",
worker_id,
args->num_processes,
thread_id,
@ -1350,6 +1351,11 @@ void* worker_thread(void* thread_args) {
char str2[1000];
sprintf(str2, "%s%d", TEMP_DATA_STORE, *parent_id);
rc = mkdir(str2, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
if (rc < 0) {
int ec = errno;
fprintf(stderr, "Failed to make directory: %s because %s\n", str2, strerror(ec));
goto failExit;
}
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 || op == OP_COMMIT || op == OP_TRANSACTION) {
FILE* fp;
@ -1357,6 +1363,11 @@ void* worker_thread(void* thread_args) {
strcat(file_name, str2);
get_stats_file_name(file_name, worker_id, thread_id, op);
fp = fopen(file_name, "w");
if (!fp) {
int ec = errno;
fprintf(stderr, "Failed to open file: %s because %s\n", file_name, strerror(ec));
goto failExit;
}
lat_block_t* temp_block = ((thread_args_t*)thread_args)->block[op];
if (is_memory_allocated[op]) {
size = stats->latency_samples[op] / LAT_BLOCK_SIZE;
@ -1376,11 +1387,11 @@ void* worker_thread(void* thread_args) {
fclose(fp);
}
}
__sync_fetch_and_add(stopcount, 1);
}
/* fall through */
failExit:
__sync_fetch_and_add(stopcount, 1);
for (op = 0; op < MAX_OP; op++) {
lat_block_t* curr = ((thread_args_t*)thread_args)->block[op];
lat_block_t* prev = NULL;
@ -2240,9 +2251,9 @@ void print_stats(mako_args_t* args, mako_stats_t* stats, struct timespec* now, s
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0) {
uint64_t ops_total_diff = ops_total[op] - ops_total_prev[op];
printf("%" STR(STATS_FIELD_WIDTH) "lu ", ops_total_diff);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", ops_total_diff);
if (fp) {
fprintf(fp, "\"%s\": %lu,", get_ops_name(op), ops_total_diff);
fprintf(fp, "\"%s\": %" PRIu64 ",", get_ops_name(op), ops_total_diff);
}
errors_diff[op] = errors_total[op] - errors_total_prev[op];
print_err = (errors_diff[op] > 0);
@ -2270,7 +2281,7 @@ void print_stats(mako_args_t* args, mako_stats_t* stats, struct timespec* now, s
printf("%" STR(STATS_TITLE_WIDTH) "s ", "Errors");
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", errors_diff[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", errors_diff[op]);
if (fp) {
fprintf(fp, ",\"errors\": %.2f", conflicts_diff);
}
@ -2419,10 +2430,10 @@ void print_report(mako_args_t* args,
break;
}
}
printf("Total Xacts: %8lu\n", totalxacts);
printf("Total Conflicts: %8lu\n", conflicts);
printf("Total Errors: %8lu\n", totalerrors);
printf("Overall TPS: %8lu\n\n", totalxacts * 1000000000 / duration_nsec);
printf("Total Xacts: %8" PRIu64 "\n", totalxacts);
printf("Total Conflicts: %8" PRIu64 "\n", conflicts);
printf("Total Errors: %8" PRIu64 "\n", totalerrors);
printf("Overall TPS: %8" PRIu64 "\n\n", totalxacts * 1000000000 / duration_nsec);
if (fp) {
fprintf(fp, "\"results\": {");
@ -2430,10 +2441,10 @@ void print_report(mako_args_t* args,
fprintf(fp, "\"totalProcesses\": %d,", args->num_processes);
fprintf(fp, "\"totalThreads\": %d,", args->num_threads);
fprintf(fp, "\"targetTPS\": %d,", args->tpsmax);
fprintf(fp, "\"totalXacts\": %lu,", totalxacts);
fprintf(fp, "\"totalConflicts\": %lu,", conflicts);
fprintf(fp, "\"totalErrors\": %lu,", totalerrors);
fprintf(fp, "\"overallTPS\": %lu,", totalxacts * 1000000000 / duration_nsec);
fprintf(fp, "\"totalXacts\": %" PRIu64 ",", totalxacts);
fprintf(fp, "\"totalConflicts\": %" PRIu64 ",", conflicts);
fprintf(fp, "\"totalErrors\": %" PRIu64 ",", totalerrors);
fprintf(fp, "\"overallTPS\": %" PRIu64 ",", totalxacts * 1000000000 / duration_nsec);
}
/* per-op stats */
@ -2446,14 +2457,14 @@ void print_report(mako_args_t* args,
}
for (op = 0; op < MAX_OP; op++) {
if ((args->txnspec.ops[op][OP_COUNT] > 0 && op != OP_TRANSACTION) || op == OP_COMMIT) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", ops_total[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", ops_total[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), ops_total[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), ops_total[op]);
}
}
}
@ -2475,14 +2486,14 @@ void print_report(mako_args_t* args,
first_op = 1;
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 && op != OP_TRANSACTION) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", errors_total[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", errors_total[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), errors_total[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), errors_total[op]);
}
}
}
@ -2500,7 +2511,7 @@ void print_report(mako_args_t* args,
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 || op == OP_TRANSACTION || op == OP_COMMIT) {
if (lat_total[op]) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_samples[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_samples[op]);
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
}
@ -2510,7 +2521,7 @@ void print_report(mako_args_t* args,
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_samples[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_samples[op]);
}
}
}
@ -2527,14 +2538,14 @@ void print_report(mako_args_t* args,
if (lat_min[op] == -1) {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
} else {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_min[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_min[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_min[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_min[op]);
}
}
}
@ -2550,14 +2561,14 @@ void print_report(mako_args_t* args,
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 || op == OP_TRANSACTION || op == OP_COMMIT) {
if (lat_total[op]) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_total[op] / lat_samples[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_total[op] / lat_samples[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_total[op] / lat_samples[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_total[op] / lat_samples[op]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2577,14 +2588,14 @@ void print_report(mako_args_t* args,
if (lat_max[op] == 0) {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
} else {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_max[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_max[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_max[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_max[op]);
}
}
}
@ -2635,14 +2646,14 @@ void print_report(mako_args_t* args,
} else {
median = (dataPoints[op][num_points[op] / 2] + dataPoints[op][num_points[op] / 2 - 1]) >> 1;
}
printf("%" STR(STATS_FIELD_WIDTH) "lu ", median);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", median);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), median);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), median);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2665,14 +2676,14 @@ void print_report(mako_args_t* args,
}
if (lat_total[op]) {
point_95pct = ((float)(num_points[op]) * 0.95) - 1;
printf("%" STR(STATS_FIELD_WIDTH) "lu ", dataPoints[op][point_95pct]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", dataPoints[op][point_95pct]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), dataPoints[op][point_95pct]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), dataPoints[op][point_95pct]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2695,14 +2706,14 @@ void print_report(mako_args_t* args,
}
if (lat_total[op]) {
point_99pct = ((float)(num_points[op]) * 0.99) - 1;
printf("%" STR(STATS_FIELD_WIDTH) "lu ", dataPoints[op][point_99pct]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", dataPoints[op][point_99pct]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), dataPoints[op][point_99pct]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), dataPoints[op][point_99pct]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2725,14 +2736,14 @@ void print_report(mako_args_t* args,
}
if (lat_total[op]) {
point_99_9pct = ((float)(num_points[op]) * 0.999) - 1;
printf("%" STR(STATS_FIELD_WIDTH) "lu ", dataPoints[op][point_99_9pct]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", dataPoints[op][point_99_9pct]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), dataPoints[op][point_99_9pct]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), dataPoints[op][point_99_9pct]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");

View File

@ -67,25 +67,25 @@ void runTests(struct ResultSet* rs) {
fdb_transaction_set(tr, keys[i], KEY_SIZE, valueStr, VALUE_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
fdb_transaction_set(tr, keys[i], KEY_SIZE, valueStr, VALUE_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
fdb_transaction_clear(tr, keys[i], KEY_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
fdb_transaction_clear_range(tr, keys[i], KEY_SIZE, keys[i + 1], KEY_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
for (j = 0; j + 1 < i; j++) {

View File

@ -18,6 +18,7 @@ set(SRCS
add_flow_target(STATIC_LIBRARY NAME fdb_flow SRCS ${SRCS})
target_link_libraries(fdb_flow PUBLIC fdb_c)
target_link_libraries(fdb_flow PUBLIC fdbclient)
target_include_directories(fdb_flow PUBLIC
"${CMAKE_CURRENT_BINARY_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}"

View File

@ -154,6 +154,7 @@ endif()
set_target_properties(java_workloads PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb")
target_link_libraries(java_workloads PUBLIC fdb_c ${JNI_LIBRARIES})
target_link_libraries(java_workloads PRIVATE flow) # mostly for boost
target_include_directories(java_workloads PUBLIC ${JNI_INCLUDE_DIRS})
set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.8" "-target" "1.8" "-XDignore.symbol.file")
@ -228,6 +229,8 @@ if(NOT OPEN_FOR_IDE)
else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(lib_destination "linux/aarch64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(lib_destination "linux/ppc64le")
else()
set(lib_destination "linux/amd64")
endif()

View File

@ -182,7 +182,7 @@ public class JNIUtil {
private static OS getRunningOS() {
String osname = System.getProperty("os.name").toLowerCase();
String arch = System.getProperty("os.arch");
if (!arch.equals("amd64") && !arch.equals("x86_64") && !arch.equals("aarch64")) {
if (!arch.equals("amd64") && !arch.equals("x86_64") && !arch.equals("aarch64") && !arch.equals("ppc64le")) {
throw new IllegalStateException("Unknown or unsupported arch: " + arch);
}
if (osname.startsWith("windows")) {

View File

@ -219,7 +219,7 @@ else()
endif()
if(STATIC_LINK_LIBCXX)
if (NOT USE_LIBCXX AND NOT APPLE)
add_link_options(-static-libstdc++ -static-libgcc)
add_link_options(-static-libstdc++ -static-libgcc)
endif()
endif()
# # Instruction sets we require to be supported by the CPU
@ -309,7 +309,7 @@ else()
if (PROFILE_INSTR_GENERATE)
message(FATAL_ERROR "Can't set both PROFILE_INSTR_GENERATE and PROFILE_INSTR_USE")
endif()
add_compile_options(-Wno-error=profile-instr-out-of-date)
add_compile_options(-Wno-error=profile-instr-out-of-date -Wno-error=profile-instr-unprofiled)
add_compile_options(-fprofile-instr-use=${PROFILE_INSTR_USE})
add_link_options(-fprofile-instr-use=${PROFILE_INSTR_USE})
endif()
@ -349,6 +349,9 @@ else()
add_compile_options(-march=armv8.2-a+crc+simd)
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
add_compile_options(-m64 -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS)
endif()
# Check whether we can use dtrace probes
include(CheckSymbolExists)
check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE)

View File

@ -303,7 +303,9 @@ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
"/etc/rc.d/init.d"
"/usr/lib/pkgconfig"
"/usr/lib/foundationdb"
"/usr/lib/cmake")
"/usr/lib/cmake"
"/usr/lib/foundationdb-${FDB_VERSION}/etc/foundationdb"
)
set(CPACK_RPM_DEBUGINFO_PACKAGE ${GENERATE_DEBUG_PACKAGES})
#set(CPACK_RPM_BUILD_SOURCE_FDB_INSTALL_DIRS_PREFIX /usr/src)
set(CPACK_RPM_COMPONENT_INSTALL ON)

View File

@ -1,5 +1,6 @@
add_subdirectory(fmt-8.0.1)
if(NOT WIN32)
add_subdirectory(debug_determinism)
add_subdirectory(monitoring)
add_subdirectory(TraceLogHelper)
add_subdirectory(TestHarness)

View File

@ -0,0 +1,5 @@
add_library(debug_determinism STATIC debug_determinism.cpp)
# So that we can link to libfdb_c.so. Not strictly necessary but convenient for use with our
# TRACE_PC_GUARD_INSTRUMENTATION_LIB cmake option
target_compile_options(debug_determinism PRIVATE -fPIC)

View File

@ -0,0 +1,45 @@
Utilities for debugging unseed mismatches for foundationdb simulation tests.
99/100 times the source of the nondeterminism is use of uninitialized memory and
what you want to do is build with `-DUSE_VALGRIND=ON` and run simulations under
valgrind.
Common sources of nondeterminism, and specialized tools to find them:
1. Use of uninitialized memory (use valgrind!)
1. Memory errors (use valgrind and/or asan)
1. Undefined behavior (use ubsan. You can also try _GLIBCXX_DEBUG)
If it's none of these, then it's time to try this technique. Look for:
1. Call to some kind of "get current time" function that's not in `INetwork`
1. Depending on the relative ordering of allocated memory. E.g. Using heap-allocated pointers as keys in a `std::map`.
1. Inspecting something about the current state of the system (e.g. free disk space)
1. Depending on iteration order of an unordered map
# Quickstart
Set these cmake flags
```
-DTRACE_PC_GUARD_INSTRUMENTATION_LIB=$BUILDDIR/lib/libdebug_determinism.a
```
and change `#define DEBUG_DETERMINISM 0` to `#define DEBUG_DETERMINISM 1` in
flow/Platform.h. This disables several known sources of nondeterminism that
don't affect unseeds.
For reasons I don't fully understand, it appears that sqlite exhibits some
nondeterminism if you don't add `#define SQLITE_OMIT_LOOKASIDE` to the top of
fdbserver/sqlite/sqlite3.amalgamation.c, so you probably want to do that too.
Now when you run an fdbserver simulation, it will write a file `out.bin` in the
current directory which contains the sequence of edges in the control flow graph
that were encountered during the simulation. If you rename `out.bin` to `in.bin`
and then re-run, the simulation will validate that the sequence of edges is the
same as the last run. If it's not, then the simulation will enter an infinite
loop at the first difference and print a message. Then you probably want to
attach gdb to the process and investigate from there.
You'll need to make sure you delete the `simfdb` folder before each run, because
otherwise you'll take a different codepath for deleting the `simfdb` folder at
the beginning of simulation.
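Purely as a hedged editorial sketch (this helper is hypothetical and not part of the commit above), the two guard traces could also be compared offline instead of waiting for the in-process check to hit its infinite loop. The sketch assumes each record in `in.bin`/`out.bin` is one raw native-endian `uint32_t` guard id, exactly as `__sanitizer_cov_trace_pc_guard` writes it with `fwrite`, and it reports the index of the first diverging edge.

```
#!/usr/bin/env python3
# Hypothetical offline helper (not part of this commit): compare two
# guard-edge traces and report the first point of divergence.
import struct
import sys

def read_guards(path):
    # Each record is a single 4-byte native-endian uint32_t guard id,
    # as written by __sanitizer_cov_trace_pc_guard in debug_determinism.cpp.
    with open(path, "rb") as f:
        data = f.read()
    count = len(data) // 4
    return struct.unpack("%dI" % count, data[: count * 4])

def first_divergence(ref, cur):
    for i, (a, b) in enumerate(zip(ref, cur)):
        if a != b:
            return i, a, b
    if len(ref) != len(cur):
        # One trace is a strict prefix of the other.
        return min(len(ref), len(cur)), None, None
    return None

if __name__ == "__main__":
    ref = read_guards(sys.argv[1] if len(sys.argv) > 1 else "in.bin")
    cur = read_guards(sys.argv[2] if len(sys.argv) > 2 else "out.bin")
    d = first_divergence(ref, cur)
    if d is None:
        print("traces are identical (%d edges)" % len(ref))
    else:
        print("first divergence at edge index %d: %s vs %s" % d)
```

Usage would be along the lines of `python3 compare_traces.py in.bin out.bin`; the reported index can then guide where to break when attaching gdb to the looping process.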

View File

@ -0,0 +1,52 @@
#include <stdint.h>
#include <stdio.h>
namespace {
FILE* out = nullptr;
FILE* in = nullptr;
void loop_forever() {
// Try to convince the optimizer not to optimize away this loop
static volatile uint64_t x = 0;
for (;;) {
++x;
}
}
} // namespace
// This callback is inserted by the compiler as a module constructor
// into every DSO. 'start' and 'stop' correspond to the
// beginning and end of the section with the guards for the entire
// binary (executable or DSO). The callback will be called at least
// once per DSO and may be called multiple times with the same parameters.
extern "C" void __sanitizer_cov_trace_pc_guard_init(uint32_t* start, uint32_t* stop) {
in = fopen("in.bin", "r");
out = fopen("out.bin", "w");
static uint64_t N; // Counter for the guards.
if (start == stop || *start)
return; // Initialize only once.
for (uint32_t* x = start; x < stop; x++) {
*x = ++N; // Guards should start from 1.
}
}
// This callback is inserted by the compiler on every edge in the
// control flow (some optimizations apply).
// Typically, the compiler will emit the code like this:
// if(*guard)
// __sanitizer_cov_trace_pc_guard(guard);
// But for large functions it will emit a simple call:
// __sanitizer_cov_trace_pc_guard(guard);
extern "C" void __sanitizer_cov_trace_pc_guard(uint32_t* guard) {
if (!guard) {
return;
}
fwrite(guard, 1, sizeof(*guard), out);
if (in) {
uint32_t theirs;
fread(&theirs, 1, sizeof(theirs), in);
if (*guard != theirs) {
printf("Non-determinism detected\n");
loop_forever();
}
}
}

contrib/generate_profile.sh — new executable file (37 lines)
View File

@ -0,0 +1,37 @@
#!/bin/bash
if [ $# -eq 0 ] || [ $# -gt 2 ]
then
echo "Usage: generate_profile.sh Path_Of_Foundation_Build_Directory Storage_Engine"
exit 1
fi
fdbdir=$1
storage_engine='ssd'
if [ $# -eq 2 ]
then
storage_engine=$2
fi
export LD_LIBRARY_PATH=$fdbdir/lib:$LD_LIBRARY_PATH
export FDB_CLUSTER_FILE=$fdbdir/fdb.cluster
export LLVM_PROFILE_FILE=$fdbdir/sandbox/fdb-%p.profraw
$fdbdir/bin/fdbmonitor --conffile $fdbdir/sandbox/foundationdb.conf --lockfile $fdbdir/sandbox/fdbmonitor.pid &
# This profile will be ignored
export LLVM_PROFILE_FILE=$fdbdir/sandbox/cli-%m.profraw
$fdbdir/bin/fdbcli -C $fdbdir/fdb.cluster --exec "configure new $storage_engine single"
export LLVM_PROFILE_FILE=$fdbdir/sandbox/mako-build-%m.profraw
$fdbdir/bin/mako -p 64 -t 1 --keylen 32 --vallen 16 --mode build --rows 10000 --trace --trace_format json
export LLVM_PROFILE_FILE=$fdbdir/sandbox/mako-run-%m.profraw
$fdbdir/bin/mako -p 1 -t 2 --keylen 32 --vallen 16 --mode run --rows 10000 --transaction grvg7i2gr1:48cr1:48 --seconds 60 --trace $fdbdir/sandbox/logs --trace_format json
# Shutdown fdbserver to trigger profile dumping
fdbmonitor_pid=$(cat $fdbdir/sandbox/fdbmonitor.pid)
fdbserver_pid=$(cat /proc/$fdbmonitor_pid/task/$fdbmonitor_pid/children)
gdb --batch --eval-command 'call (void)exit(0)' --pid $fdbserver_pid
# Clean up
kill -9 $fdbmonitor_pid
# Profile for server
llvm-profdata merge -output=$fdbdir/fdb.profdata $fdbdir/sandbox/fdb-*.profraw
# Profile for client
llvm-profdata merge -output=$fdbdir/mako.profdata $fdbdir/sandbox/mako-*.profraw

View File

@ -147,7 +147,7 @@ def centos_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]:
container = None
image = None
try:
container = Container("centos", initd=True)
container = Container("centos:7", initd=True)
for rpm in rpms:
container.copy_to(rpm, "/opt")
container.run(["bash", "-c", "yum update -y"])
@ -237,10 +237,6 @@ def test_write(linux_container: Container, snapshot):
assert snapshot == linux_container.run(["fdbcli", "--exec", "get x"])
def test_fdbcli_help_text(linux_container: Container, snapshot):
assert snapshot == linux_container.run(["fdbcli", "--help"])
def test_execstack_permissions_libfdb_c(linux_container: Container, snapshot):
linux_container.run(["ldconfig"])
assert snapshot == linux_container.run(

View File

@ -2,8 +2,16 @@
Release Notes
#############
6.3.24
======
* Fixed a bug where get key location can overload proxies. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_
* Added a mechanism that can reduce the number of empty peek replies by not always returning an empty peek reply immediately. `(PR #6413) <https://github.com/apple/foundationdb/pull/6413>`_
* Enabled TLS support for Windows. `(PR #6193) <https://github.com/apple/foundationdb/pull/6193>`_
* Fixed a bug where a shard gets merged too soon. `(PR #6115) <https://github.com/apple/foundationdb/pull/6115>`_
6.3.23
======
* Added AWS v4 header support for backup. `(PR #6025) <https://github.com/apple/foundationdb/pull/6025>`_
* Fixed a bug where the remoteDCIsHealthy logic was not guarded by CC_ENABLE_WORKER_HEALTH_MONITOR, which may prevent HA failback. `(PR #6106) <https://github.com/apple/foundationdb/pull/6106>`_
* Fixed a race condition with updating the coordinated state and updating the master registration. `(PR #6088) <https://github.com/apple/foundationdb/pull/6088>`_
* Changed dbinfo broadcast to be explicitly requested by the worker registration message. `(PR #6073) <https://github.com/apple/foundationdb/pull/6073>`_

View File

@ -19,6 +19,7 @@
* limitations under the License.
*/
#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include "flow/flow.h"
#include "flow/Platform.h"
#include "flow/DeterministicRandom.h"
@ -413,7 +414,7 @@ ACTOR Future<Void> logThroughput(int64_t* v, Key* next) {
loop {
state int64_t last = *v;
wait(delay(1));
printf("throughput: %ld bytes/s, next: %s\n", *v - last, printable(*next).c_str());
fmt::print("throughput: {} bytes/s, next: {}\n", *v - last, printable(*next).c_str());
}
}

View File

@ -1690,7 +1690,7 @@ ACTOR Future<Void> cleanupStatus(Reference<ReadYourWritesTransaction> tr,
readMore = true;
} catch (Error& e) {
// If doc can't be parsed or isn't alive, delete it.
TraceEvent(SevWarn, "RemovedDeadBackupLayerStatus").detail("Key", docs[i].key).error(e, true);
TraceEvent(SevWarn, "RemovedDeadBackupLayerStatus").errorUnsuppressed(e).detail("Key", docs[i].key);
tr->clear(docs[i].key);
// If limit is 1 then read more.
if (limit == 1)
@ -2754,7 +2754,7 @@ ACTOR Future<Void> queryBackup(const char* name,
reportBackupQueryError(operationId,
result,
errorMessage =
format("the specified restorable version %ld is not valid", restoreVersion));
format("the specified restorable version %lld is not valid", restoreVersion));
return Void();
}
Optional<RestorableFileSet> fileSet = wait(bc->getRestoreSet(restoreVersion, keyRangesFilter));
@ -3081,7 +3081,7 @@ static void addKeyRange(std::string optionValue, Standalone<VectorRef<KeyRangeRe
// Too many keys
default:
fprintf(stderr, "ERROR: Invalid key range identified with %ld keys", tokens.size());
fmt::print(stderr, "ERROR: Invalid key range identified with {} keys", tokens.size());
throw invalid_option_value();
break;
}
@ -3887,9 +3887,9 @@ int main(int argc, char* argv[]) {
} else {
fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", knobName.c_str(), e.what());
TraceEvent(SevError, "FailedToSetKnob")
.error(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString))
.error(e);
.detail("Value", printable(knobValueString));
throw;
}
}

View File

@ -19,7 +19,7 @@
*/
#include "boost/lexical_cast.hpp"
#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/IClientApi.h"
@ -40,7 +40,7 @@ ACTOR Future<bool> advanceVersionCommandActor(Reference<IDatabase> db, std::vect
} else {
state Version v;
int n = 0;
if (sscanf(tokens[1].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[1].size()) {
if (sscanf(tokens[1].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[1].size()) {
printUsage(tokens[0]);
return false;
} else {
@ -53,7 +53,7 @@ ACTOR Future<bool> advanceVersionCommandActor(Reference<IDatabase> db, std::vect
tr->set(advanceVersionSpecialKey, boost::lexical_cast<std::string>(v));
wait(safeThreadFutureToFuture(tr->commit()));
} else {
printf("Current read version is %ld\n", rv);
fmt::print("Current read version is {}\n", rv);
return true;
}
} catch (Error& e) {

View File

@ -115,7 +115,7 @@ ACTOR Future<bool> changeFeedCommandActor(Database localDb, std::vector<StringRe
Version end = std::numeric_limits<Version>::max();
if (tokens.size() > 3) {
int n = 0;
if (sscanf(tokens[3].toString().c_str(), "%ld%n", &begin, &n) != 1 || n != tokens[3].size()) {
if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &begin, &n) != 1 || n != tokens[3].size()) {
printUsage(tokens[0]);
return false;
}
@ -168,7 +168,7 @@ ACTOR Future<bool> changeFeedCommandActor(Database localDb, std::vector<StringRe
}
Version v;
int n = 0;
if (sscanf(tokens[3].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[3].size()) {
if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[3].size()) {
printUsage(tokens[0]);
return false;
} else {

View File

@ -176,7 +176,7 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
case ConfigurationResult::STORAGE_MIGRATION_DISABLED:
fprintf(stderr,
"ERROR: Storage engine type cannot be changed because "
"storage_migration_mode=disabled.\n");
"storage_migration_type=disabled.\n");
fprintf(stderr,
"Type `configure perpetual_storage_wiggle=1 storage_migration_type=gradual' to enable gradual "
"migration with the perpetual wiggle, or `configure "

View File

@ -65,13 +65,14 @@ ACTOR Future<bool> changeCoordinators(Reference<IDatabase> db, std::vector<Strin
state StringRef new_cluster_description;
state std::string auto_coordinators_str;
StringRef nameTokenBegin = LiteralStringRef("description=");
for (auto tok = tokens.begin() + 1; tok != tokens.end(); ++tok)
for (auto tok = tokens.begin() + 1; tok != tokens.end(); ++tok) {
if (tok->startsWith(nameTokenBegin)) {
new_cluster_description = tok->substr(nameTokenBegin.size());
std::copy(tok + 1, tokens.end(), tok);
tokens.resize(tokens.size() - 1);
break;
}
}
state bool automatic = tokens.size() == 2 && tokens[1] == LiteralStringRef("auto");
state Reference<ITransaction> tr = db->createTransaction();
@ -96,17 +97,32 @@ ACTOR Future<bool> changeCoordinators(Reference<IDatabase> db, std::vector<Strin
tr->set(fdb_cli::coordinatorsProcessSpecialKey, auto_coordinators_str);
} else if (tokens.size() > 1) {
state std::set<NetworkAddress> new_coordinators_addresses;
state std::vector<std::string> newAddresslist;
state std::set<Hostname> new_coordinators_hostnames;
state std::vector<std::string> newCoordinatorslist;
state std::vector<StringRef>::iterator t;
for (t = tokens.begin() + 1; t != tokens.end(); ++t) {
try {
auto const& addr = NetworkAddress::parse(t->toString());
if (new_coordinators_addresses.count(addr)) {
fprintf(stderr, "ERROR: passed redundant coordinators: `%s'\n", addr.toString().c_str());
return true;
if (Hostname::isHostname(t->toString())) {
// We do not resolve hostnames here. We commit them as is.
const auto& hostname = Hostname::parse(t->toString());
if (new_coordinators_hostnames.count(hostname)) {
fprintf(stderr,
"ERROR: passed redundant coordinators: `%s'\n",
hostname.toString().c_str());
return true;
}
new_coordinators_hostnames.insert(hostname);
newCoordinatorslist.push_back(hostname.toString());
} else {
const auto& addr = NetworkAddress::parse(t->toString());
if (new_coordinators_addresses.count(addr)) {
fprintf(
stderr, "ERROR: passed redundant coordinators: `%s'\n", addr.toString().c_str());
return true;
}
new_coordinators_addresses.insert(addr);
newCoordinatorslist.push_back(addr.toString());
}
new_coordinators_addresses.insert(addr);
newAddresslist.push_back(addr.toString());
} catch (Error& e) {
if (e.code() == error_code_connection_string_invalid) {
fprintf(
@ -116,12 +132,12 @@ ACTOR Future<bool> changeCoordinators(Reference<IDatabase> db, std::vector<Strin
throw;
}
}
std::string new_addresses_str = boost::algorithm::join(newAddresslist, ", ");
tr->set(fdb_cli::coordinatorsProcessSpecialKey, new_addresses_str);
std::string new_coordinators_str = boost::algorithm::join(newCoordinatorslist, ", ");
tr->set(fdb_cli::coordinatorsProcessSpecialKey, new_coordinators_str);
}
wait(safeThreadFutureToFuture(tr->commit()));
// commit should always fail here
// if coordinators are changed, we should get commit_unknown() error
// If the commit succeeds, the coordinators change and the commit will fail with commit_unknown_result().
ASSERT(false);
} catch (Error& e) {
state Error err(e);

View File

@ -59,7 +59,7 @@ ACTOR Future<Void> includeLocalities(Reference<IDatabase> db,
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
TraceEvent("IncludeLocalitiesError").error(e, true);
TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e);
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
@ -99,7 +99,7 @@ ACTOR Future<Void> includeServers(Reference<IDatabase> db, std::vector<AddressEx
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
TraceEvent("IncludeServersError").error(e, true);
TraceEvent("IncludeServersError").errorUnsuppressed(e);
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}

View File

@ -705,12 +705,12 @@ void printStatus(StatusObjectReader statusObj,
}
}
outputString += format(
" %s log epoch: %ld begin: %ld end: %s, missing "
" %s log epoch: %lld begin: %lld end: %s, missing "
"log interfaces(id,address): %s\n",
current ? "Current" : "Old",
epoch,
beginVersion,
endVersion == invalidVersion ? "(unknown)" : format("%ld", endVersion).c_str(),
endVersion == invalidVersion ? "(unknown)" : format("%lld", endVersion).c_str(),
missing_log_interfaces.c_str());
}
}

View File

@ -1014,9 +1014,9 @@ struct CLIOptions {
} else {
fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", knobName.c_str(), e.what());
TraceEvent(SevError, "FailedToSetKnob")
.error(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString))
.error(e);
.detail("Value", printable(knobValueString));
exit_code = FDB_EXIT_ERROR;
}
}
@ -1157,7 +1157,6 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
state bool writeMode = false;
state std::string clusterConnectString;
state std::map<Key, std::pair<Value, ClientLeaderRegInterface>> address_interface;
state FdbOptions globalOptions;
@ -1171,6 +1170,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
ClusterConnectionFile::lookupClusterFileName(opt.clusterFile);
try {
ccf = makeReference<ClusterConnectionFile>(resolvedClusterFile.first);
wait(ccf->resolveHostnames());
} catch (Error& e) {
fprintf(stderr, "%s\n", ClusterConnectionFile::getErrorString(resolvedClusterFile, e).c_str());
return 1;
@ -1615,7 +1615,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
} else {
Version v = wait(makeInterruptable(
safeThreadFutureToFuture(getTransaction(db, tr, options, intrans)->getReadVersion())));
printf("%ld\n", v);
fmt::print("{}\n", v);
}
continue;
}

View File

@ -28,6 +28,7 @@
#include "fdbclient/CoordinationInterface.h"
// Determine public IP address by calling the first coordinator.
IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
try {
using namespace boost::asio;
@ -35,6 +36,7 @@ IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
io_service ioService;
ip::udp::socket socket(ioService);
ccs.resolveHostnamesBlocking();
const auto& coordAddr = ccs.coordinators()[0];
const auto boostIp = coordAddr.ip.isV6() ? ip::address(ip::address_v6(coordAddr.ip.toV6()))
: ip::address(ip::address_v4(coordAddr.ip.toV4()));

View File

@ -305,9 +305,9 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
throw;
TraceEvent m(SevWarn, "BackupContainer");
m.error(e);
m.detail("Description", "Invalid container specification. See help.");
m.detail("URL", url);
m.error(e);
if (e.code() == error_code_backup_invalid_url)
m.detail("LastOpenError", lastOpenError);
@ -360,10 +360,9 @@ ACTOR Future<std::vector<std::string>> listContainers_impl(std::string baseURL)
throw;
TraceEvent m(SevWarn, "BackupContainer");
m.error(e);
m.detail("Description", "Invalid backup container URL prefix. See help.");
m.detail("URL", baseURL);
m.error(e);
if (e.code() == error_code_backup_invalid_url)
m.detail("LastOpenError", IBackupContainer::lastOpenError);

View File

@ -1149,8 +1149,8 @@ public:
keyFile = _keyFile;
} catch (Error& e) {
TraceEvent(SevWarnAlways, "FailedToOpenEncryptionKeyFile")
.detail("FileName", encryptionKeyFileName)
.error(e);
.error(e)
.detail("FileName", encryptionKeyFileName);
throw e;
}
int bytesRead = wait(keyFile->read(cipherKey->data(), cipherKey->size(), 0));
@ -1377,8 +1377,8 @@ ACTOR static Future<KeyRange> getSnapshotFileKeyRange_impl(Reference<BackupConta
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "BackupContainerGetSnapshotFileKeyRangeConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "BackupContainerGetSnapshotFileKeyRangeUnexpectedError").error(e);
@ -1549,9 +1549,9 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
throw;
TraceEvent m(SevWarn, "BackupContainer");
m.error(e);
m.detail("Description", "Invalid container specification. See help.");
m.detail("URL", url);
m.error(e);
if (e.code() == error_code_backup_invalid_url)
m.detail("LastOpenError", lastOpenError);

View File

@ -86,6 +86,8 @@ void ClientKnobs::initialize(Randomize randomize) {
init( LOCATION_CACHE_EVICTION_SIZE, 600000 );
init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3;
init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 );
init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 );
init( GET_RANGE_SHARD_LIMIT, 2 );
init( WARM_RANGE_SHARD_LIMIT, 100 );

View File

@ -86,6 +86,8 @@ public:
// When locationCache in DatabaseContext gets to be this size, items will be evicted
int LOCATION_CACHE_EVICTION_SIZE;
int LOCATION_CACHE_EVICTION_SIZE_SIM;
double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD;
double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL;
int GET_RANGE_SHARD_LIMIT;
int WARM_RANGE_SHARD_LIMIT;

View File

@ -27,7 +27,7 @@ ConfigKey ConfigKeyRef::decodeKey(KeyRef const& key) {
try {
tuple = Tuple::unpack(key);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "FailedToUnpackConfigKey").detail("Key", printable(key)).error(e);
TraceEvent(SevWarnAlways, "FailedToUnpackConfigKey").error(e).detail("Key", printable(key));
throw invalid_config_db_key();
}
if (tuple.size() != 2) {
@ -96,7 +96,7 @@ public:
struct ToStringFunc {
std::string operator()(int v) const { return format("int:%d", v); }
std::string operator()(int64_t v) const { return format("int64_t:%ld", v); }
std::string operator()(int64_t v) const { return format("int64_t:%lld", v); }
std::string operator()(bool v) const { return format("bool:%d", v); }
std::string operator()(ValueRef v) const { return "string:" + v.toString(); }
std::string operator()(double v) const { return format("double:%lf", v); }

View File

@ -58,13 +58,28 @@ struct ClientLeaderRegInterface {
// - There is no address present more than once
class ClusterConnectionString {
public:
enum ConnectionStringStatus { RESOLVED, RESOLVING, UNRESOLVED };
ClusterConnectionString() {}
ClusterConnectionString(const std::string& connStr);
ClusterConnectionString(const std::vector<NetworkAddress>& coordinators, Key key);
ClusterConnectionString(const std::vector<Hostname>& hosts, Key key);
ClusterConnectionString(const ClusterConnectionString& rhs) { operator=(rhs); }
ClusterConnectionString& operator=(const ClusterConnectionString& rhs) {
// Copy everything except AsyncTrigger resolveFinish.
status = rhs.status;
coords = rhs.coords;
hostnames = rhs.hostnames;
networkAddressToHostname = rhs.networkAddressToHostname;
key = rhs.key;
keyDesc = rhs.keyDesc;
connectionString = rhs.connectionString;
return *this;
}
std::vector<NetworkAddress> const& coordinators() const { return coords; }
void addResolved(Hostname hostname, NetworkAddress address) {
void addResolved(const Hostname& hostname, const NetworkAddress& address) {
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
}
@ -78,16 +93,20 @@ public:
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.
void resolveHostnamesBlocking();
void resetToUnresolved();
// This function derives the member connectionString from the current key, coordinators and hostnames.
void resetConnectionString();
bool hasUnresolvedHostnames = false;
void resetToUnresolved();
void parseKey(const std::string& key);
ConnectionStringStatus status = RESOLVED;
AsyncTrigger resolveFinish;
std::vector<NetworkAddress> coords;
std::vector<Hostname> hostnames;
std::unordered_map<NetworkAddress, Hostname> networkAddressToHostname;
private:
void parseConnString();
void parseKey(const std::string& key);
std::unordered_map<NetworkAddress, Hostname> networkAddressToHostname;
Key key, keyDesc;
std::string connectionString;
};
@ -139,7 +158,7 @@ public:
// Signals to the connection record that it was successfully used to connect to a cluster.
void notifyConnected();
bool hasUnresolvedHostnames() const;
ClusterConnectionString::ConnectionStringStatus connectionStringStatus() const;
Future<Void> resolveHostnames();
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.

View File

@ -2142,7 +2142,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase {
wait(tr->commit());
break;
} catch (Error& e) {
TraceEvent("SetDestUidOrBeginVersionError").error(e, true);
TraceEvent("SetDestUidOrBeginVersionError").errorUnsuppressed(e);
wait(tr->onError(e));
}
}
@ -2907,7 +2907,7 @@ public:
TraceEvent("DBA_Abort").detail("CommitVersion", tr->getCommittedVersion());
break;
} catch (Error& e) {
TraceEvent("DBA_AbortError").error(e, true);
TraceEvent("DBA_AbortError").errorUnsuppressed(e);
wait(tr->onError(e));
}
}

View File

@ -198,6 +198,11 @@ struct ChangeFeedData : ReferenceCounted<ChangeFeedData> {
ChangeFeedData() : notAtLatest(1) {}
};
struct EndpointFailureInfo {
double startTime = 0;
double lastRefreshTime = 0;
};
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
static DatabaseContext* allocateOnForeignThread() {
@ -241,6 +246,14 @@ public:
void invalidateCache(const KeyRef&, Reverse isBackward = Reverse::False);
void invalidateCache(const KeyRangeRef&);
// Records that `endpoint` is failed on a healthy server.
void setFailedEndpointOnHealthyServer(const Endpoint& endpoint);
// Updates `endpoint` refresh time if the `endpoint` is a failed endpoint. If not, this does nothing.
void updateFailedEndpointRefreshTime(const Endpoint& endpoint);
Optional<EndpointFailureInfo> getEndpointFailureInfo(const Endpoint& endpoint);
void clearFailedEndpointOnHealthyServer(const Endpoint& endpoint);
bool sampleReadTags() const;
bool sampleOnCost(uint64_t cost) const;
@ -394,6 +407,7 @@ public:
// Cache of location information
int locationCacheSize;
CoalescedKeyRangeMap<Reference<LocationInfo>> locationCache;
std::unordered_map<Endpoint, EndpointFailureInfo> failedEndpointsOnHealthyServersInfo;
std::map<UID, StorageServerInfo*> server_interf;
std::map<UID, BlobWorkerInterface> blobWorker_interf; // blob workers don't change endpoints for the same ID

View File

@ -1204,10 +1204,12 @@ struct ReadBlobGranuleContext {
struct StorageMetadataType {
constexpr static FileIdentifier file_identifier = 732123;
// when the SS is initialized
uint64_t createdTime; // comes from Platform::timer_int()
uint64_t createdTime; // comes from currentTime()
StorageMetadataType() : createdTime(0) {}
StorageMetadataType(uint64_t t) : createdTime(t) {}
static uint64_t currentTime() { return g_network->timer() * 1e9; }
// To change this serialization, ProtocolVersion::StorageMetadata must be updated, and downgrades need
// to be considered
template <class Ar>

View File

@ -87,7 +87,7 @@ std::string secondsToTimeFormat(int64_t seconds) {
else if (seconds >= 60)
return format("%.2f minute(s)", seconds / 60.0);
else
return format("%ld second(s)", seconds);
return format("%lld second(s)", seconds);
}
const Key FileBackupAgent::keyLastRestorable = LiteralStringRef("last_restorable");
@ -4407,9 +4407,9 @@ public:
break;
} catch (Error& e) {
TraceEvent(numTries > 50 ? SevError : SevInfo, "FastRestoreToolSubmitRestoreRequestsMayFail")
.error(e)
.detail("Reason", "DB is not properly locked")
.detail("ExpectedLockID", randomUID)
.error(e);
.detail("ExpectedLockID", randomUID);
numTries++;
wait(tr->onError(e));
}
@ -4443,8 +4443,8 @@ public:
break;
} catch (Error& e) {
TraceEvent(numTries > 50 ? SevError : SevInfo, "FastRestoreToolSubmitRestoreRequestsRetry")
.detail("RestoreIndex", restoreIndex)
.error(e);
.error(e)
.detail("RestoreIndex", restoreIndex);
numTries++;
wait(tr->onError(e));
}
@ -5183,7 +5183,7 @@ public:
else
statusText += "The initial snapshot is still running.\n";
statusText += format("\nDetails:\n LogBytes written - %ld\n RangeBytes written - %ld\n "
statusText += format("\nDetails:\n LogBytes written - %lld\n RangeBytes written - %lld\n "
"Last complete log version and timestamp - %s, %s\n "
"Last complete snapshot version and timestamp - %s, %s\n "
"Current Snapshot start version and timestamp - %s, %s\n "
@ -5800,9 +5800,9 @@ ACTOR static Future<Void> transformDatabaseContents(Database cx,
break;
} catch (Error& e) {
TraceEvent("FastRestoreWorkloadTransformDatabaseContentsGetAllKeys")
.error(e)
.detail("Index", i)
.detail("RestoreRange", restoreRanges[i])
.error(e);
.detail("RestoreRange", restoreRanges[i]);
oldData = Standalone<VectorRef<KeyValueRef>>(); // clear the vector
wait(tr.onError(e));
}

View File

@ -448,6 +448,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
err = http_bad_request_id();
TraceEvent(SevError, "HTTPRequestFailedIDMismatch")
.error(err.get())
.detail("DebugID", conn->getDebugID())
.detail("RemoteAddress", conn->getPeerAddress())
.detail("Verb", verb)
@ -456,8 +457,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
.detail("ResponseCode", r->code)
.detail("ResponseContentLen", r->contentLen)
.detail("RequestIDSent", requestID)
.detail("RequestIDReceived", responseID)
.error(err.get());
.detail("RequestIDReceived", responseID);
}
}
@ -501,7 +501,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
contentLen,
total_sent);
}
event.error(e);
event.errorUnsuppressed(e);
throw;
}
}

View File

@ -169,7 +169,7 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
} else if (value == "gradual") {
type = StorageMigrationType::GRADUAL;
} else {
printf("Error: Only disabled|aggressive|gradual are valid for storage_migration_mode.\n");
printf("Error: Only disabled|aggressive|gradual are valid for storage_migration_type.\n");
return out;
}
out[p + key] = format("%d", type);
@ -772,7 +772,7 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx) {
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators) {
ClusterConnectionString* conn) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
@ -783,44 +783,47 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone deleted this key entirely?
state ClusterConnectionString old(currentKey.get().toString());
wait(old.resolveHostnames());
if (tr->getDatabase()->getConnectionRecord() &&
old.clusterKeyName().toString() !=
tr->getDatabase()->getConnectionRecord()->getConnectionString().clusterKeyName())
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone changed the "name" of the database??
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
if (!desiredCoordinators->size()) {
std::vector<NetworkAddress> _desiredCoordinators = wait(change->getDesiredCoordinators(
if (!conn->coords.size()) {
std::vector<NetworkAddress> desiredCoordinatorAddresses = wait(change->getDesiredCoordinators(
tr,
old.coordinators(),
Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(old)),
result));
*desiredCoordinators = _desiredCoordinators;
conn->coords = desiredCoordinatorAddresses;
}
if (result != CoordinatorsResult::SUCCESS)
return result;
if (!desiredCoordinators->size())
if (!conn->coordinators().size())
return CoordinatorsResult::INVALID_NETWORK_ADDRESSES;
std::sort(desiredCoordinators->begin(), desiredCoordinators->end());
std::sort(conn->coords.begin(), conn->coords.end());
std::sort(conn->hostnames.begin(), conn->hostnames.end());
std::string newName = change->getDesiredClusterKeyName();
if (newName.empty())
newName = old.clusterKeyName().toString();
if (old.coordinators() == *desiredCoordinators && old.clusterKeyName() == newName)
if (old.coordinators() == conn->coordinators() && old.clusterKeyName() == newName)
return CoordinatorsResult::SAME_NETWORK_ADDRESSES;
state ClusterConnectionString conn(*desiredCoordinators,
StringRef(newName + ':' + deterministicRandom()->randomAlphaNumeric(32)));
std::string key(newName + ':' + deterministicRandom()->randomAlphaNumeric(32));
conn->parseKey(key);
conn->resetConnectionString();
if (g_network->isSimulated()) {
int i = 0;
int protectedCount = 0;
while ((protectedCount < ((desiredCoordinators->size() / 2) + 1)) && (i < desiredCoordinators->size())) {
auto process = g_simulator.getProcessByAddress((*desiredCoordinators)[i]);
while ((protectedCount < ((conn->coordinators().size() / 2) + 1)) && (i < conn->coordinators().size())) {
auto process = g_simulator.getProcessByAddress(conn->coordinators()[i]);
auto addresses = process->addresses;
if (!process->isReliable()) {
@ -832,14 +835,14 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
if (addresses.secondaryAddress.present()) {
g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get());
}
TraceEvent("ProtectCoordinator").detail("Address", (*desiredCoordinators)[i]).backtrace();
TraceEvent("ProtectCoordinator").detail("Address", conn->coordinators()[i]).backtrace();
protectedCount++;
i++;
}
}
std::vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(conn)));
ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(*conn)));
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
@ -851,7 +854,7 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
when(wait(waitForAll(leaderServers))) {}
when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; }
}
tr->set(coordinatorsKey, conn.toString());
tr->set(coordinatorsKey, conn->toString());
return Optional<CoordinatorsResult>();
}
@ -1273,7 +1276,7 @@ ACTOR Future<Void> excludeServers(Database cx, std::vector<AddressExclusion> ser
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeServersError").error(e, true);
TraceEvent("ExcludeServersError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1285,7 +1288,7 @@ ACTOR Future<Void> excludeServers(Database cx, std::vector<AddressExclusion> ser
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeServersError").error(e, true);
TraceEvent("ExcludeServersError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1336,7 +1339,7 @@ ACTOR Future<Void> excludeLocalities(Database cx, std::unordered_set<std::string
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeLocalitiesError").error(e, true);
TraceEvent("ExcludeLocalitiesError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1348,7 +1351,7 @@ ACTOR Future<Void> excludeLocalities(Database cx, std::unordered_set<std::string
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeLocalitiesError").error(e, true);
TraceEvent("ExcludeLocalitiesError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1392,7 +1395,7 @@ ACTOR Future<Void> includeServers(Database cx, std::vector<AddressExclusion> ser
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeServersError").error(e, true);
TraceEvent("IncludeServersError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1449,7 +1452,7 @@ ACTOR Future<Void> includeServers(Database cx, std::vector<AddressExclusion> ser
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeServersError").error(e, true);
TraceEvent("IncludeServersError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1487,7 +1490,7 @@ ACTOR Future<Void> includeLocalities(Database cx, std::vector<std::string> local
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeLocalitiesError").error(e, true);
TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1535,7 +1538,7 @@ ACTOR Future<Void> includeLocalities(Database cx, std::vector<std::string> local
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeLocalitiesError").error(e, true);
TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1907,7 +1910,7 @@ ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UI
TraceEvent("SnapCreateSucceeded").detail("snapUID", snapUID);
return Void();
} catch (Error& e) {
TraceEvent(SevWarn, "SnapCreateFailed").detail("snapUID", snapUID).error(e);
TraceEvent(SevWarn, "SnapCreateFailed").error(e).detail("snapUID", snapUID);
throw;
}
}
@ -2198,7 +2201,7 @@ ACTOR Future<Void> advanceVersion(Database cx, Version v) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned()));
wait(tr.commit());
} else {
printf("Current read version is %ld\n", rv);
fmt::print("Current read version is {}\n", rv);
return Void();
}
} catch (Error& e) {

View File

@ -56,7 +56,7 @@ struct IQuorumChange : ReferenceCounted<IQuorumChange> {
// Change to use the given set of coordination servers
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators);
ClusterConnectionString* conn);
ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChange> change);
Reference<IQuorumChange> autoQuorumChange(int desired = -1);
Reference<IQuorumChange> noQuorumChange();

View File

@ -77,8 +77,8 @@ void IClusterConnectionRecord::setPersisted() {
connectionStringNeedsPersisted = false;
}
bool IClusterConnectionRecord::hasUnresolvedHostnames() const {
return cs.hasUnresolvedHostnames;
ClusterConnectionString::ConnectionStringStatus IClusterConnectionRecord::connectionStringStatus() const {
return cs.status;
}
Future<Void> IClusterConnectionRecord::resolveHostnames() {
@ -98,39 +98,56 @@ std::string ClusterConnectionString::getErrorString(std::string const& source, E
}
ACTOR Future<Void> resolveHostnamesImpl(ClusterConnectionString* self) {
std::vector<Future<Void>> fs;
for (auto const& hostName : self->hostnames) {
fs.push_back(map(INetworkConnections::net()->resolveTCPEndpoint(hostName.host, hostName.service),
[=](std::vector<NetworkAddress> const& addresses) -> Void {
NetworkAddress addr = addresses[deterministicRandom()->randomInt(0, addresses.size())];
addr.flags = 0; // Reset the parsed address to public
addr.fromHostname = NetworkAddressFromHostname::True;
if (hostName.isTLS) {
addr.flags |= NetworkAddress::FLAG_TLS;
}
self->addResolved(hostName, addr);
return Void();
}));
loop {
if (self->status == ClusterConnectionString::UNRESOLVED) {
self->status = ClusterConnectionString::RESOLVING;
std::vector<Future<Void>> fs;
for (auto const& hostname : self->hostnames) {
fs.push_back(map(INetworkConnections::net()->resolveTCPEndpoint(hostname.host, hostname.service),
[=](std::vector<NetworkAddress> const& addresses) -> Void {
NetworkAddress address =
addresses[deterministicRandom()->randomInt(0, addresses.size())];
address.flags = 0; // Reset the parsed address to public
address.fromHostname = NetworkAddressFromHostname::True;
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
self->addResolved(hostname, address);
return Void();
}));
}
wait(waitForAll(fs));
std::sort(self->coords.begin(), self->coords.end());
if (std::unique(self->coords.begin(), self->coords.end()) != self->coords.end()) {
self->status = ClusterConnectionString::UNRESOLVED;
self->resolveFinish.trigger();
throw connection_string_invalid();
}
self->status = ClusterConnectionString::RESOLVED;
self->resolveFinish.trigger();
break;
} else if (self->status == ClusterConnectionString::RESOLVING) {
wait(self->resolveFinish.onTrigger());
if (self->status == ClusterConnectionString::RESOLVED) {
break;
}
// Otherwise, another thread's resolution attempt failed, so go back to the loop and try to resolve again.
} else {
// status is RESOLVED, nothing to do.
break;
}
}
wait(waitForAll(fs));
std::sort(self->coords.begin(), self->coords.end());
if (std::unique(self->coords.begin(), self->coords.end()) != self->coords.end()) {
throw connection_string_invalid();
}
self->hasUnresolvedHostnames = false;
return Void();
}
Future<Void> ClusterConnectionString::resolveHostnames() {
if (!hasUnresolvedHostnames) {
return Void();
} else {
return resolveHostnamesImpl(this);
}
return resolveHostnamesImpl(this);
}
void ClusterConnectionString::resolveHostnamesBlocking() {
if (hasUnresolvedHostnames) {
if (status != RESOLVED) {
status = RESOLVING;
for (auto const& hostname : hostnames) {
std::vector<NetworkAddress> addresses =
INetworkConnections::net()->resolveTCPEndpointBlocking(hostname.host, hostname.service);
@ -140,14 +157,14 @@ void ClusterConnectionString::resolveHostnamesBlocking() {
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
addResolved(hostname, address);
}
std::sort(coords.begin(), coords.end());
if (std::unique(coords.begin(), coords.end()) != coords.end()) {
status = UNRESOLVED;
throw connection_string_invalid();
}
hasUnresolvedHostnames = false;
status = RESOLVED;
}
}
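
The hunks above replace the boolean hasUnresolvedHostnames with a three-state ConnectionStringStatus (UNRESOLVED, RESOLVING, RESOLVED). The following is a minimal stand-alone C++ sketch of that state machine, not part of the commit; the ConnString struct and the fake DNS lookup are illustrative stand-ins, while the real transitions live in ClusterConnectionString::resolveHostnames*() and use flow futures.

#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <string>
#include <vector>

enum class ResolveStatus { UNRESOLVED, RESOLVING, RESOLVED };

struct ConnString {
    ResolveStatus status = ResolveStatus::UNRESOLVED;
    std::vector<std::string> hostnames; // unresolved "host:port" entries
    std::vector<std::string> coords;    // resolved "ip:port" entries

    void resolveOnce() {
        if (status != ResolveStatus::UNRESOLVED)
            return; // already RESOLVING or RESOLVED; nothing to do in this sketch
        status = ResolveStatus::RESOLVING;
        for (const auto& h : hostnames)
            coords.push_back("10.0.0.1:" + h.substr(h.find(':') + 1)); // fake DNS lookup
        std::sort(coords.begin(), coords.end());
        if (std::unique(coords.begin(), coords.end()) != coords.end()) {
            // Two hostnames resolved to the same address: roll back so a later attempt can retry.
            coords.clear();
            status = ResolveStatus::UNRESOLVED;
            throw std::runtime_error("connection_string_invalid");
        }
        status = ResolveStatus::RESOLVED;
    }
};

int main() {
    ConnString cs;
    cs.hostnames = { "coord1.example.com:4500", "coord2.example.com:4501" };
    cs.resolveOnce();
    assert(cs.status == ResolveStatus::RESOLVED && cs.coords.size() == 2);
    return 0;
}
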
@ -156,11 +173,15 @@ void ClusterConnectionString::resetToUnresolved() {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
hasUnresolvedHostnames = true;
status = UNRESOLVED;
parseConnString();
}
}
void ClusterConnectionString::resetConnectionString() {
connectionString = toString();
}
void ClusterConnectionString::parseConnString() {
// Split on '@' into key@addrs
int pAt = connectionString.find_first_of('@');
@ -184,7 +205,9 @@ void ClusterConnectionString::parseConnString() {
}
p = pComma + 1;
}
hasUnresolvedHostnames = hostnames.size() > 0;
if (hostnames.size() > 0) {
status = UNRESOLVED;
}
ASSERT((coords.size() + hostnames.size()) > 0);
std::sort(coords.begin(), coords.end());
@ -256,7 +279,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
{
input = "asdf:2345@localhost:1234";
ClusterConnectionString cs(input);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 1);
ASSERT(input == cs.toString());
}
@ -264,7 +287,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
{
input = "0xxdeadbeef:100100100@localhost:34534,host-name:23443";
ClusterConnectionString cs(input);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -277,7 +300,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -290,7 +313,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -314,16 +337,16 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString") {
INetworkConnections::net()->addMockTCPEndpoint(hn2, port2, { address2 });
state ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0"));
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
wait(cs.resolveHostnames());
ASSERT(!cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::RESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 2);
ASSERT(cs.toString() == connectionString);
cs.resetToUnresolved();
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
ASSERT(cs.toString() == connectionString);
@ -422,29 +445,17 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") {
}
ClusterConnectionString::ClusterConnectionString(const std::vector<NetworkAddress>& servers, Key key)
: coords(servers) {
: status(RESOLVED), coords(servers) {
std::string keyString = key.toString();
parseKey(keyString);
connectionString = keyString + "@";
for (int i = 0; i < coords.size(); i++) {
if (i) {
connectionString += ',';
}
connectionString += coords[i].toString();
}
resetConnectionString();
}
ClusterConnectionString::ClusterConnectionString(const std::vector<Hostname>& hosts, Key key)
: hasUnresolvedHostnames(true), hostnames(hosts) {
: status(UNRESOLVED), hostnames(hosts) {
std::string keyString = key.toString();
parseKey(keyString);
connectionString = keyString + "@";
for (int i = 0; i < hostnames.size(); i++) {
if (i) {
connectionString += ',';
}
connectionString += hostnames[i].toString();
}
resetConnectionString();
}
void ClusterConnectionString::parseKey(const std::string& key) {
@ -497,6 +508,7 @@ std::string ClusterConnectionString::toString() const {
}
ClientCoordinators::ClientCoordinators(Reference<IClusterConnectionRecord> ccr) : ccr(ccr) {
ASSERT(ccr->connectionStringStatus() == ClusterConnectionString::RESOLVED);
ClusterConnectionString cs = ccr->getConnectionString();
for (auto s = cs.coordinators().begin(); s != cs.coordinators().end(); ++s)
clientLeaderServers.push_back(ClientLeaderRegInterface(*s));
@ -525,15 +537,44 @@ ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) {
// A nominee is the worker that a given coordinator currently considers the leader.
// This function contacts the coordinator `coord` to ask who its nominee is.
// Note: for coordinators whose NetworkAddress was parsed out of a hostname, a connection failure will cause this actor
// to throw a `coordinators_changed()` error
ACTOR Future<Void> monitorNominee(Key key,
ClientLeaderRegInterface coord,
AsyncTrigger* nomineeChange,
Optional<LeaderInfo>* info) {
Optional<LeaderInfo>* info,
Optional<Hostname> hostname = Optional<Hostname>()) {
loop {
state Optional<LeaderInfo> li =
wait(retryBrokenPromise(coord.getLeader,
GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
state Optional<LeaderInfo> li;
if (coord.getLeader.getEndpoint().getPrimaryAddress().fromHostname) {
state ErrorOr<Optional<LeaderInfo>> rep =
wait(coord.getLeader.tryGetReply(GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
if (rep.isError()) {
// Connecting to the nominee failed, most likely due to a connection failure.
TraceEvent("MonitorNomineeError")
.error(rep.getError())
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// Delay 50 milliseconds to prevent a tight resolving loop caused by an outdated DNS cache
wait(delay(0.05));
throw coordinators_changed();
} else {
throw rep.getError();
}
} else if (rep.present()) {
li = rep.get();
}
} else {
Optional<LeaderInfo> tmp =
wait(retryBrokenPromise(coord.getLeader,
GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
li = tmp;
}
wait(Future<Void>(Void())); // Make sure we weren't cancelled
TraceEvent("GetLeaderReply")
@ -608,53 +649,74 @@ Optional<std::pair<LeaderInfo, bool>> getLeader(const std::vector<Optional<Leade
ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> outSerializedLeaderInfo,
MonitorLeaderInfo info) {
state ClientCoordinators coordinators(info.intermediateConnRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state Future<Void> allActors;
nominees.resize(coordinators.clientLeaderServers.size());
std::vector<Future<Void>> actors;
// Ask each coordinator whether it considers the worker a leader (i.e. its leader nominee).
actors.reserve(coordinators.clientLeaderServers.size());
for (int i = 0; i < coordinators.clientLeaderServers.size(); i++)
actors.push_back(
monitorNominee(coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i]));
allActors = waitForAll(actors);
loop {
Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
TraceEvent("MonitorLeaderChange")
.detail("NewLeader", leader.present() ? leader.get().first.changeID : UID(1, 1));
if (leader.present()) {
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnRecord->getConnectionString().toString())
.trackLatest("MonitorLeaderForwarding");
info.intermediateConnRecord = connRecord->makeIntermediateRecord(
ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;
}
if (connRecord != info.intermediateConnRecord) {
if (!info.hasConnected) {
TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection")
.detail("ClusterFile", connRecord->toString())
.detail("StoredConnectionString", connRecord->getConnectionString().toString())
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setAndPersistConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
wait(connRecord->resolveHostnames());
wait(info.intermediateConnRecord->resolveHostnames());
state ClientCoordinators coordinators(info.intermediateConnRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state Future<Void> allActors;
info.hasConnected = true;
connRecord->notifyConnected();
nominees.resize(coordinators.clientLeaderServers.size());
outSerializedLeaderInfo->set(leader.get().first.serializedInfo);
state std::vector<Future<Void>> actors;
// Ask each coordinator whether it considers the worker a leader (i.e. its leader nominee).
actors.reserve(coordinators.clientLeaderServers.size());
for (int i = 0; i < coordinators.clientLeaderServers.size(); i++) {
Optional<Hostname> hostname;
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
coordinators.clientLeaderServers[i].getLeader.getEndpoint().getPrimaryAddress());
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
hostname = r->second;
}
actors.push_back(monitorNominee(
coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i], hostname));
}
allActors = waitForAll(actors);
loop {
Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
TraceEvent("MonitorLeaderChange")
.detail("NewLeader", leader.present() ? leader.get().first.changeID : UID(1, 1));
if (leader.present()) {
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnRecord->getConnectionString().toString())
.trackLatest("MonitorLeaderForwarding");
info.intermediateConnRecord = connRecord->makeIntermediateRecord(
ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;
}
if (connRecord != info.intermediateConnRecord) {
if (!info.hasConnected) {
TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection")
.detail("ClusterFile", connRecord->toString())
.detail("StoredConnectionString", connRecord->getConnectionString().toString())
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setAndPersistConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
info.hasConnected = true;
connRecord->notifyConnected();
outSerializedLeaderInfo->set(leader.get().first.serializedInfo);
}
try {
wait(nomineeChange.onTrigger() || allActors);
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorLeaderCoordinatorsChanged").suppressFor(1.0);
connRecord->getConnectionString().resetToUnresolved();
break;
} else {
throw e;
}
}
}
wait(nomineeChange.onTrigger() || allActors);
}
}
@ -774,8 +836,8 @@ ACTOR Future<Void> getClientInfoFromLeader(Reference<AsyncVar<Optional<ClusterCo
when(ClientDBInfo ni =
wait(brokenPromiseToNever(knownLeader->get().get().clientInterface.openDatabase.getReply(req)))) {
TraceEvent("GetClientInfoFromLeaderGotClientInfo", knownLeader->get().get().clientInterface.id())
.detail("CommitProxy0", ni.commitProxies.size() ? ni.commitProxies[0].id() : UID())
.detail("GrvProxy0", ni.grvProxies.size() ? ni.grvProxies[0].id() : UID())
.detail("CommitProxy0", ni.commitProxies.size() ? ni.commitProxies[0].address().toString() : "")
.detail("GrvProxy0", ni.grvProxies.size() ? ni.grvProxies[0].address().toString() : "")
.detail("ClientID", ni.id);
clientData->clientInfo->set(CachedSerialization<ClientDBInfo>(ni));
}
@ -787,7 +849,8 @@ ACTOR Future<Void> getClientInfoFromLeader(Reference<AsyncVar<Optional<ClusterCo
ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
std::vector<NetworkAddress> coordinators,
ClientData* clientData,
Reference<AsyncVar<Optional<LeaderInfo>>> leaderInfo) {
Reference<AsyncVar<Optional<LeaderInfo>>> leaderInfo,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state std::vector<ClientLeaderRegInterface> clientLeaderServers;
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
@ -835,7 +898,14 @@ ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
leaderInfo->set(leader.get().first);
}
}
wait(nomineeChange.onTrigger() || allActors);
try {
wait(nomineeChange.onTrigger() || allActors);
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
coordinatorsChanged->trigger();
}
throw e;
}
}
}
@ -964,9 +1034,15 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
successIndex = index;
} else {
TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator can't talk to cluster controller
if (rep.getError().code() == error_code_coordinators_changed) {
throw coordinators_changed();
}
index = (index + 1) % addrs.size();
if (index == successIndex) {
wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
// When the client fails to talk to all coordinators, we throw coordinators_changed() and let the caller
// re-resolve the connection string and retry.
throw coordinators_changed();
}
}
}
@ -978,16 +1054,27 @@ ACTOR Future<Void> monitorProxies(
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions,
Key traceLogGroup) {
wait(connRecord->get()->resolveHostnames());
state MonitorLeaderInfo info(connRecord->get());
loop {
choose {
when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration(
connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) {
info = _info;
try {
wait(info.intermediateConnRecord->resolveHostnames());
choose {
when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration(
connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) {
info = _info;
}
when(wait(connRecord->onChange())) {
info.hasConnected = false;
info.intermediateConnRecord = connRecord->get();
}
}
when(wait(connRecord->onChange())) {
info.hasConnected = false;
info.intermediateConnRecord = connRecord->get();
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorProxiesCoordinatorsChanged").suppressFor(1.0);
info.intermediateConnRecord->getConnectionString().resetToUnresolved();
} else {
throw e;
}
}
}
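
The change above makes monitorProxies re-resolve coordinator hostnames before each generation and treat coordinators_changed as a signal to reset the connection string and retry. A small stand-alone sketch of that retry shape, not from the commit, using plain C++ exceptions instead of flow errors and with stand-in types, might look like this:

#include <iostream>
#include <stdexcept>

struct CoordinatorsChanged : std::runtime_error {
    CoordinatorsChanged() : std::runtime_error("coordinators_changed") {}
};

// Stand-in for the connection record; the real code uses IClusterConnectionRecord and flow actors.
struct ConnRecord {
    int attempts = 0;
    void resolveHostnames() { /* DNS lookup of any coordinator hostnames */ }
    void resetToUnresolved() { /* drop cached addresses so the next pass re-resolves */ }
    void monitorOneGeneration() {
        if (++attempts == 1) // pretend the coordinators move once before the client catches up
            throw CoordinatorsChanged();
    }
};

int main() {
    ConnRecord rec;
    for (;;) {
        try {
            rec.resolveHostnames();     // re-resolve before each generation
            rec.monitorOneGeneration(); // throws when the coordinators change
            break;                      // a real client keeps monitoring forever
        } catch (const CoordinatorsChanged&) {
            std::cout << "coordinators changed; re-resolving\n";
            rec.resetToUnresolved();    // next iteration resolves the new hostnames
        }
    }
    return 0;
}
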

View File

@ -74,10 +74,11 @@ Future<Void> monitorLeader(Reference<IClusterConnectionRecord> const& connFile,
// This is one place where the leader election algorithm is run. The coordinator contacts all coordinators to collect
// nominees, the nominee with the most nominations is the leader, and it collects client data from the leader. This
// function also monitors changes of the leader.
Future<Void> monitorLeaderAndGetClientInfo(Value const& key,
Future<Void> monitorLeaderAndGetClientInfo(Key const& clusterKey,
std::vector<NetworkAddress> const& coordinators,
ClientData* const& clientData,
Reference<AsyncVar<Optional<LeaderInfo>>> const& leaderInfo);
Reference<AsyncVar<Optional<LeaderInfo>>> const& leaderInfo,
Reference<AsyncVar<Void>> const& coordinatorsChanged);
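
As the comment above describes, each coordinator reports a nominee and the nominee with the most nominations wins. Below is a simplified stand-alone sketch of that plurality count, not from the commit; UID and pickLeader are stand-ins, and the real getLeader() in MonitorLeader.actor.cpp additionally handles tie-breaking and tracks whether the winner holds a quorum.

#include <cstdint>
#include <map>
#include <optional>
#include <vector>

using UID = std::uint64_t; // stand-in for the real changeID type

// Return the nominee named by the largest number of coordinators, if any.
std::optional<UID> pickLeader(const std::vector<std::optional<UID>>& nominees) {
    std::map<UID, int> votes;
    for (const auto& n : nominees)
        if (n.has_value())
            ++votes[*n];
    std::optional<UID> best;
    int bestCount = 0;
    for (const auto& [id, count] : votes) {
        if (count > bestCount) {
            best = id;
            bestCount = count;
        }
    }
    return best;
}

int main() {
    std::vector<std::optional<UID>> nominees = { UID(7), UID(7), std::nullopt, UID(9) };
    return pickLeader(nominees) == UID(7) ? 0 : 1;
}
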
Future<Void> monitorProxies(
Reference<AsyncVar<Reference<IClusterConnectionRecord>>> const& connRecord,

View File

@ -1202,9 +1202,9 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
// but we may not see trace logs from this client until a successful connection
// is established.
TraceEvent(SevWarnAlways, "FailedToInitializeExternalClient")
.error(e)
.detail("LibraryPath", client->libPath)
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.detail("ClusterFilePath", clusterFilePath);
}
}
});
@ -1218,9 +1218,9 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
} catch (Error& e) {
// This connection is discarded
TraceEvent(SevWarnAlways, "FailedToCreateLegacyDatabaseConnection")
.error(e)
.detail("LibraryPath", client->libPath)
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.detail("ClusterFilePath", clusterFilePath);
}
}
});
@ -1360,8 +1360,8 @@ ThreadFuture<Void> MultiVersionDatabase::DatabaseState::monitorProtocolVersion()
}
TraceEvent("ErrorGettingClusterProtocolVersion")
.detail("ExpectedProtocolVersion", expected)
.error(cv.getError());
.error(cv.getError())
.detail("ExpectedProtocolVersion", expected);
}
ProtocolVersion clusterVersion =
@ -1409,10 +1409,10 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion
newDb = client->api->createDatabase(clusterFilePath.c_str());
} catch (Error& e) {
TraceEvent(SevWarnAlways, "MultiVersionClientFailedToCreateDatabase")
.error(e)
.detail("LibraryPath", client->libPath)
.detail("External", client->external)
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.detail("ClusterFilePath", clusterFilePath);
// Put the client in a disconnected state until the version changes again
updateDatabase(Reference<IDatabase>(), Reference<ClientInfo>());
@ -1486,8 +1486,8 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
// We can't create a new database to monitor the cluster version. This means we will continue using the
// previous one, which should hopefully continue to work.
TraceEvent(SevWarnAlways, "FailedToCreateDatabaseForVersionMonitoring")
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.error(e)
.detail("ClusterFilePath", clusterFilePath);
}
}
} else {
@ -1499,8 +1499,8 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
// We can't create a new database to monitor the cluster version. This means we will continue using the
// previous one, which should hopefully continue to work.
TraceEvent(SevWarnAlways, "FailedToCreateDatabaseForVersionMonitoring")
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.error(e)
.detail("ClusterFilePath", clusterFilePath);
}
}

View File

@ -732,16 +732,18 @@ Future<Void> attemptGRVFromOldProxies(std::vector<GrvProxyInterface> oldProxies,
ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
Reference<AsyncVar<ClientDBInfo> const> clientDBInfo,
AsyncTrigger* proxyChangeTrigger) {
AsyncTrigger* proxiesChangeTrigger) {
state std::vector<CommitProxyInterface> curCommitProxies;
state std::vector<GrvProxyInterface> curGrvProxies;
state ActorCollection actors(false);
state Future<Void> clientDBInfoOnChange = clientDBInfo->onChange();
curCommitProxies = clientDBInfo->get().commitProxies;
curGrvProxies = clientDBInfo->get().grvProxies;
loop {
choose {
when(wait(clientDBInfo->onChange())) {
when(wait(clientDBInfoOnChange)) {
clientDBInfoOnChange = clientDBInfo->onChange();
if (clientDBInfo->get().commitProxies != curCommitProxies ||
clientDBInfo->get().grvProxies != curGrvProxies) {
// This condition is a bit complicated. Here we want to verify that we're unable to receive a read
@ -758,7 +760,7 @@ ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
}
curCommitProxies = clientDBInfo->get().commitProxies;
curGrvProxies = clientDBInfo->get().grvProxies;
proxyChangeTrigger->trigger();
proxiesChangeTrigger->trigger();
}
}
when(wait(actors.getResult())) { UNSTOPPABLE_ASSERT(false); }
@ -1596,6 +1598,32 @@ void DatabaseContext::invalidateCache(const KeyRangeRef& keys) {
locationCache.insert(KeyRangeRef(begin, end), Reference<LocationInfo>());
}
void DatabaseContext::setFailedEndpointOnHealthyServer(const Endpoint& endpoint) {
if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) {
failedEndpointsOnHealthyServersInfo[endpoint] =
EndpointFailureInfo{ .startTime = now(), .lastRefreshTime = now() };
}
}
void DatabaseContext::updateFailedEndpointRefreshTime(const Endpoint& endpoint) {
if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) {
// The endpoint is not failed. Nothing to update.
return;
}
failedEndpointsOnHealthyServersInfo[endpoint].lastRefreshTime = now();
}
Optional<EndpointFailureInfo> DatabaseContext::getEndpointFailureInfo(const Endpoint& endpoint) {
if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) {
return Optional<EndpointFailureInfo>();
}
return failedEndpointsOnHealthyServersInfo[endpoint];
}
void DatabaseContext::clearFailedEndpointOnHealthyServer(const Endpoint& endpoint) {
failedEndpointsOnHealthyServersInfo.erase(endpoint);
}
Future<Void> DatabaseContext::onProxiesChanged() const {
return this->proxiesChangeTrigger.onTrigger();
}
@ -2449,6 +2477,35 @@ ACTOR Future<std::pair<KeyRange, Reference<LocationInfo>>> getKeyLocation_intern
}
}
// Checks whether `endpoint` has failed on an otherwise healthy server. Returns true if we need to refresh the location
// cache for the endpoint.
bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) {
if (IFailureMonitor::failureMonitor().onlyEndpointFailed(endpoint)) {
// This endpoint is failed, but the server is still healthy. There are two cases in which this can happen:
// - There was a recent bounce in the cluster and the endpoints on the SSes got updated.
// - The SS failed and was terminated on a server, but the server itself is still running.
// To account for the first case, we invalidate the cache and issue GetKeyLocation requests to the proxy to
// update the cache with the new SS endpoints. However, if the failure is caused by the second case, the
// requested key location will continue to be the failed endpoint until the data movement is finished, and
// every read will generate a GetKeyLocation request to the proxies (and still get the failed endpoint
// back), which may overload the proxies and slow down data movement. Therefore, we only refresh the
// location cache for a short period of time; after the initial grace period during which we keep retrying
// the key location resolution, we slow down and refresh only once every
// `LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL`.
cx->setFailedEndpointOnHealthyServer(endpoint);
const auto& failureInfo = cx->getEndpointFailureInfo(endpoint);
ASSERT(failureInfo.present());
if (now() - failureInfo.get().startTime < CLIENT_KNOBS->LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD ||
now() - failureInfo.get().lastRefreshTime > CLIENT_KNOBS->LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL) {
cx->updateFailedEndpointRefreshTime(endpoint);
return true;
}
} else {
cx->clearFailedEndpointOnHealthyServer(endpoint);
}
return false;
}
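
The timing rule in checkOnlyEndpointFailed can be isolated as follows. This is a stand-alone illustration with hypothetical knob values, not the DatabaseContext implementation:

#include <cassert>

struct FailureInfo {
    double startTime;       // when the endpoint was first seen as failed
    double lastRefreshTime; // last time we refreshed the location cache for it
};

// Returns true if the location cache should be refreshed at time `now`.
bool shouldRefresh(const FailureInfo& info, double now, double gracePeriod, double retryInterval) {
    if (now - info.startTime < gracePeriod)
        return true;                                    // initial grace period: refresh every time
    return now - info.lastRefreshTime > retryInterval;  // afterwards: at most once per interval
}

int main() {
    FailureInfo info{ /*startTime=*/100.0, /*lastRefreshTime=*/100.0 };
    assert(shouldRefresh(info, 101.0, /*gracePeriod=*/5.0, /*retryInterval=*/60.0)); // in grace period
    assert(!shouldRefresh(info, 110.0, 5.0, 60.0)); // grace period over, refreshed recently
    assert(shouldRefresh(info, 200.0, 5.0, 60.0));  // retry interval elapsed
    return 0;
}
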
template <class F>
Future<std::pair<KeyRange, Reference<LocationInfo>>> getKeyLocation(Database const& cx,
Key const& key,
@ -2463,14 +2520,19 @@ Future<std::pair<KeyRange, Reference<LocationInfo>>> getKeyLocation(Database con
return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward);
}
bool onlyEndpointFailedAndNeedRefresh = false;
for (int i = 0; i < ssi.second->size(); i++) {
if (IFailureMonitor::failureMonitor().onlyEndpointFailed(ssi.second->get(i, member).getEndpoint())) {
cx->invalidateCache(key);
ssi.second.clear();
return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward);
if (checkOnlyEndpointFailed(cx, ssi.second->get(i, member).getEndpoint())) {
onlyEndpointFailedAndNeedRefresh = true;
}
}
if (onlyEndpointFailedAndNeedRefresh) {
cx->invalidateCache(key);
// Refresh the cache with a new getKeyLocations made to proxies.
return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward);
}
return ssi;
}
@ -2553,21 +2615,21 @@ Future<std::vector<std::pair<KeyRange, Reference<LocationInfo>>>> getKeyRangeLoc
bool foundFailed = false;
for (const auto& [range, locInfo] : locations) {
bool onlyEndpointFailed = false;
bool onlyEndpointFailedAndNeedRefresh = false;
for (int i = 0; i < locInfo->size(); i++) {
if (IFailureMonitor::failureMonitor().onlyEndpointFailed(locInfo->get(i, member).getEndpoint())) {
onlyEndpointFailed = true;
break;
if (checkOnlyEndpointFailed(cx, locInfo->get(i, member).getEndpoint())) {
onlyEndpointFailedAndNeedRefresh = true;
}
}
if (onlyEndpointFailed) {
if (onlyEndpointFailedAndNeedRefresh) {
cx->invalidateCache(range.begin);
foundFailed = true;
}
}
if (foundFailed) {
// Refresh the cache with a new getKeyRangeLocations made to proxies.
return getKeyRangeLocations_internal(cx, keys, limit, reverse, spanID, debugID, useProvisionalProxies);
}
@ -5095,7 +5157,7 @@ ACTOR static Future<Void> commitDummyTransaction(Reference<TransactionState> trS
return Void();
} catch (Error& e) {
TraceEvent("CommitDummyTransactionError")
.error(e, true)
.errorUnsuppressed(e)
.detail("Key", range.begin)
.detail("Retries", retries);
wait(tr.onError(e));
@ -5713,9 +5775,10 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanID parentSpan,
loop {
try {
state GetReadVersionRequest req(span.context, transactionCount, priority, flags, tags, debugID);
state Future<Void> onProxiesChanged = cx->onProxiesChanged();
choose {
when(wait(cx->onProxiesChanged())) {}
when(wait(onProxiesChanged)) { onProxiesChanged = cx->onProxiesChanged(); }
when(GetReadVersionReply v =
wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies(
flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES)),
@ -6846,7 +6909,7 @@ ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID sn
}
}
} catch (Error& e) {
TraceEvent("SnapCreateError").detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID).error(e);
TraceEvent("SnapCreateError").error(e).detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID);
throw;
}
}
@ -6874,13 +6937,14 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent("ExclusionSafetyCheckError")
.error(e)
.detail("NumExclusion", exclusions.size())
.detail("Exclusions", describe(exclusions))
.error(e);
.detail("Exclusions", describe(exclusions));
}
throw;
}
TraceEvent("ExclusionSafetyCheckCoordinators").log();
wait(cx->getConnectionRecord()->resolveHostnames());
state ClientCoordinators coordinatorList(cx->getConnectionRecord());
state std::vector<Future<Optional<LeaderInfo>>> leaderServers;
leaderServers.reserve(coordinatorList.clientLeaderServers.size());

View File

@ -2585,7 +2585,7 @@ void ReadYourWritesTransaction::debugLogRetries(Optional<Error> error) {
{
TraceEvent trace = TraceEvent("LongTransaction");
if (error.present())
trace.error(error.get(), true);
trace.errorUnsuppressed(error.get());
if (!transactionDebugInfo->transactionName.empty())
trace.detail("TransactionName", transactionDebugInfo->transactionName);
trace.detail("Elapsed", elapsed).detail("Retries", retries).detail("Committed", committed);

View File

@ -500,7 +500,7 @@ ACTOR Future<Optional<json_spirit::mObject>> tryReadJSONFile(std::string path) {
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
TraceEvent(SevWarn, errorEventType).error(e).suppressFor(60).detail("File", path);
TraceEvent(SevWarn, errorEventType).errorUnsuppressed(e).suppressFor(60).detail("File", path);
}
return Optional<json_spirit::mObject>();
@ -744,7 +744,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
// Attach err to trace event if present, otherwise extract some stuff from the response
if (err.present()) {
event.error(err.get());
event.errorUnsuppressed(err.get());
}
event.suppressFor(60);
if (!err.present()) {
@ -954,7 +954,7 @@ ACTOR Future<Void> listObjectsStream_impl(Reference<S3BlobStoreEndpoint> bstore,
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
TraceEvent(SevWarn, "S3BlobStoreEndpointListResultParseError")
.error(e)
.errorUnsuppressed(e)
.suppressFor(60)
.detail("Resource", fullResource);
throw http_bad_response();
@ -1080,7 +1080,7 @@ ACTOR Future<std::vector<std::string>> listBuckets_impl(Reference<S3BlobStoreEnd
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
TraceEvent(SevWarn, "S3BlobStoreEndpointListBucketResultParseError")
.error(e)
.errorUnsuppressed(e)
.suppressFor(60)
.detail("Resource", fullResource);
throw http_bad_response();

View File

@ -103,6 +103,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TLOG_POP_BATCH_SIZE, 1000 ); if ( randomize && BUGGIFY ) TLOG_POP_BATCH_SIZE = 10;
init( TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE, 250e6 );
init( ENABLE_DETAILED_TLOG_POP_TRACE, false ); if ( randomize && BUGGIFY ) ENABLE_DETAILED_TLOG_POP_TRACE = true;
init( PEEK_BATCHING_EMPTY_MSG, false ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG = true;
init( PEEK_BATCHING_EMPTY_MSG_INTERVAL, 0.001 ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.01;
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
@ -362,7 +364,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true );
init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 );
// Leader election
bool longLeaderElection = randomize && BUGGIFY;
@ -579,6 +582,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MIN_AVAILABLE_SPACE, 1e8 );
init( MIN_AVAILABLE_SPACE_RATIO, 0.05 );
init( MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER, 0.01 );
init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 );
init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 );

View File

@ -106,6 +106,8 @@ public:
double PUSH_STATS_SLOW_AMOUNT;
double PUSH_STATS_SLOW_RATIO;
int TLOG_POP_BATCH_SIZE;
bool PEEK_BATCHING_EMPTY_MSG;
double PEEK_BATCHING_EMPTY_MSG_INTERVAL;
// Data distribution queue
double HEALTH_POLL_TIME;
@ -293,6 +295,8 @@ public:
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable RocksDB perf context metrics. May cause performance overhead
double ROCKSDB_PERFCONTEXT_SAMPLE_RATE;
// Leader election
int MAX_NOTIFICATIONS;
@ -525,6 +529,7 @@ public:
int64_t MIN_AVAILABLE_SPACE;
double MIN_AVAILABLE_SPACE_RATIO;
double MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER;
double TARGET_AVAILABLE_SPACE_RATIO;
double AVAILABLE_SPACE_UPDATE_DELAY;

View File

@ -1628,8 +1628,9 @@ Future<RangeResult> CoordinatorsImpl::getRange(ReadYourWritesTransaction* ryw, K
ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
state Reference<IQuorumChange> change;
state std::vector<NetworkAddress> addressesVec;
state std::vector<std::string> process_address_strs;
state ClusterConnectionString
conn; // We don't care about the Key here; it will be overridden in changeQuorumChecker().
state std::vector<std::string> process_address_or_hostname_strs;
state Optional<std::string> msg;
state int index;
state bool parse_error = false;
@ -1640,38 +1641,45 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
if (processes_entry.first) {
ASSERT(processes_entry.second.present()); // no clear should be seen here
auto processesStr = processes_entry.second.get().toString();
boost::split(process_address_strs, processesStr, [](char c) { return c == ','; });
if (!process_address_strs.size()) {
boost::split(process_address_or_hostname_strs, processesStr, [](char c) { return c == ','; });
if (!process_address_or_hostname_strs.size()) {
return ManagementAPIError::toJsonString(
false,
"coordinators",
"New coordinators\' processes are empty, please specify new processes\' network addresses with format "
"\"IP:PORT,IP:PORT,...,IP:PORT\"");
"\"IP:PORT,IP:PORT,...,IP:PORT\" or \"HOSTNAME:PORT,HOSTNAME:PORT,...,HOSTNAME:PORT\"");
}
for (index = 0; index < process_address_strs.size(); index++) {
for (index = 0; index < process_address_or_hostname_strs.size(); index++) {
try {
auto a = NetworkAddress::parse(process_address_strs[index]);
if (!a.isValid())
parse_error = true;
else
addressesVec.push_back(a);
if (Hostname::isHostname(process_address_or_hostname_strs[index])) {
conn.hostnames.push_back(Hostname::parse(process_address_or_hostname_strs[index]));
conn.status = ClusterConnectionString::ConnectionStringStatus::UNRESOLVED;
} else {
NetworkAddress a = NetworkAddress::parse(process_address_or_hostname_strs[index]);
if (!a.isValid()) {
parse_error = true;
} else {
conn.coords.push_back(a);
}
}
} catch (Error& e) {
TraceEvent(SevDebug, "SpecialKeysNetworkParseError").error(e);
parse_error = true;
}
if (parse_error) {
std::string error =
"ERROR: \'" + process_address_strs[index] + "\' is not a valid network endpoint address\n";
if (process_address_strs[index].find(":tls") != std::string::npos)
std::string error = "ERROR: \'" + process_address_or_hostname_strs[index] +
"\' is not a valid network endpoint address\n";
if (process_address_or_hostname_strs[index].find(":tls") != std::string::npos)
error += " Do not include the `:tls' suffix when naming a process\n";
return ManagementAPIError::toJsonString(false, "coordinators", error);
}
}
}
if (addressesVec.size())
change = specifiedQuorumChange(addressesVec);
wait(conn.resolveHostnames());
if (conn.coordinators().size())
change = specifiedQuorumChange(conn.coordinators());
else
change = noQuorumChange();
@ -1693,10 +1701,11 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
ASSERT(change.isValid());
TraceEvent(SevDebug, "SKSChangeCoordinatorsStart")
.detail("NewAddresses", describe(addressesVec))
.detail("NewHostnames", conn.hostnames.size() ? describe(conn.hostnames) : "N/A")
.detail("NewAddresses", describe(conn.coordinators()))
.detail("Description", entry.first ? entry.second.get().toString() : "");
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &addressesVec));
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &conn));
TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish")
.detail("Result", r.present() ? static_cast<int>(r.get()) : -1); // -1 means success

View File

@ -306,6 +306,7 @@ ACTOR Future<Optional<StatusObject>> clientCoordinatorsStatusFetcher(Reference<I
bool* quorum_reachable,
int* coordinatorsFaultTolerance) {
try {
wait(connRecord->resolveHostnames());
state ClientCoordinators coord(connRecord);
state StatusObject statusObj;

View File

@ -35,3 +35,13 @@ add_custom_target(start_sandbox
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
configure_file(${CMAKE_SOURCE_DIR}/contrib/generate_profile.sh
${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -260,7 +260,7 @@ public:
std::string currentFilename =
(wrappedFile.isReady() && !wrappedFile.isError()) ? wrappedFile.get()->getFilename() : actualFilename;
currentProcess->machine->openFiles.erase(currentFilename);
//TraceEvent("AsyncFileNonDurableOpenError").error(e, true).detail("Filename", filename).detail("Address", currentProcess->address).detail("Addr", g_simulator.getCurrentProcess()->address);
//TraceEvent("AsyncFileNonDurableOpenError").errorUnsuppressed(e).detail("Filename", filename).detail("Address", currentProcess->address).detail("Addr", g_simulator.getCurrentProcess()->address);
wait(g_simulator.onProcess(currentProcess, currentTaskID));
throw err;
}

View File

@ -732,13 +732,13 @@ ACTOR Future<Void> connectionKeeper(Reference<Peer> self,
if (self->compatible) {
TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID())
.error(e, true)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
} else {
TraceEvent(
ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID())
.error(e, true)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
}
@ -783,7 +783,7 @@ ACTOR Future<Void> connectionKeeper(Reference<Peer> self,
if (self->peerReferences <= 0 && self->reliable.empty() && self->unsent.empty() &&
self->outstandingReplies == 0) {
TraceEvent("PeerDestroy").error(e).suppressFor(1.0).detail("PeerAddr", self->destination);
TraceEvent("PeerDestroy").errorUnsuppressed(e).suppressFor(1.0).detail("PeerAddr", self->destination);
self->connect.cancel();
self->transport->peers.erase(self->destination);
self->transport->orderedAddresses.erase(self->destination);
@ -1330,10 +1330,12 @@ ACTOR static Future<Void> connectionIncoming(TransportData* self, Reference<ICon
}
return Void();
} catch (Error& e) {
TraceEvent("IncomingConnectionError", conn->getDebugID())
.error(e)
.suppressFor(1.0)
.detail("FromAddress", conn->getPeerAddress());
if (e.code() != error_code_actor_cancelled) {
TraceEvent("IncomingConnectionError", conn->getDebugID())
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("FromAddress", conn->getPeerAddress());
}
conn->close();
return Void();
}

View File

@ -29,12 +29,12 @@ void HealthMonitor::reportPeerClosed(const NetworkAddress& peerAddress) {
}
void HealthMonitor::purgeOutdatedHistory() {
for (auto it = peerClosedHistory.begin(); it != peerClosedHistory.end();) {
if (it->first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) {
auto& count = peerClosedNum[it->second];
while (!peerClosedHistory.empty()) {
auto const& p = peerClosedHistory.front();
if (p.first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) {
auto& count = peerClosedNum[p.second];
--count;
ASSERT(count >= 0);
++it; // Increment before pop_front to avoid iterator invalidation
peerClosedHistory.pop_front();
} else {
break;

View File

@ -1123,11 +1123,9 @@ public:
}
}
ACTOR static Future<Void> runLoop(Sim2* self) {
state ISimulator::ProcessInfo* callingMachine = self->currentProcess;
static void runLoop(Sim2* self) {
ISimulator::ProcessInfo* callingMachine = self->currentProcess;
while (!self->isStopped) {
wait(self->net2->yield(TaskPriority::DefaultYield));
self->mutex.enter();
if (self->tasks.size() == 0) {
self->mutex.leave();
@ -1144,18 +1142,13 @@ public:
self->yielded = false;
}
self->currentProcess = callingMachine;
self->net2->stop();
for (auto& fn : self->stopCallbacks) {
fn();
}
return Void();
}
// Implement ISimulator interface
void run() override {
Future<Void> loopFuture = runLoop(this);
net2->run();
}
void run() override { runLoop(this); }
ProcessInfo* newProcess(const char* name,
IPAddress ip,
uint16_t port,
@ -2094,7 +2087,7 @@ public:
t.action.send(Void());
ASSERT(this->currentProcess == t.machine);
} catch (Error& e) {
TraceEvent(SevError, "UnhandledSimulationEventError").error(e, true);
TraceEvent(SevError, "UnhandledSimulationEventError").errorUnsuppressed(e);
killProcess(t.machine, KillInstantly);
}

View File

@ -1101,10 +1101,10 @@ ACTOR Future<Void> backupWorker(BackupInterface interf,
try {
wait(done);
} catch (Error& e) {
TraceEvent("BackupWorkerShutdownError", self.myId).error(e, true);
TraceEvent("BackupWorkerShutdownError", self.myId).errorUnsuppressed(e);
}
}
TraceEvent("BackupWorkerTerminated", self.myId).error(err, true);
TraceEvent("BackupWorkerTerminated", self.myId).errorUnsuppressed(err);
if (err.code() != error_code_actor_cancelled && err.code() != error_code_worker_removed) {
throw err;
}

View File

@ -843,8 +843,8 @@ ACTOR Future<Void> monitorBlobWorkerStatus(BlobManagerData* bmData, BlobWorkerIn
}
// TODO change back from SevError?
TraceEvent(SevError, "BWStatusMonitoringFailed", bmData->id)
.detail("BlobWorkerID", bwInterf.id())
.error(e);
.error(e)
.detail("BlobWorkerID", bwInterf.id());
throw e;
}
}
@ -877,7 +877,7 @@ ACTOR Future<Void> monitorBlobWorker(BlobManagerData* bmData, BlobWorkerInterfac
printf("BM got unexpected error %s monitoring BW %s\n", e.name(), bwInterf.id().toString().c_str());
}
// TODO change back from SevError?
TraceEvent(SevError, "BWMonitoringFailed", bmData->id).detail("BlobWorkerID", bwInterf.id()).error(e);
TraceEvent(SevError, "BWMonitoringFailed", bmData->id).error(e).detail("BlobWorkerID", bwInterf.id());
throw e;
}
@ -1152,7 +1152,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
}
}
} catch (Error& err) {
TraceEvent("BlobManagerDied", bmInterf.id()).error(err, true);
TraceEvent("BlobManagerDied", bmInterf.id()).errorUnsuppressed(err);
}
return Void();
}

View File

@ -1589,7 +1589,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->keyRange.end.printable().c_str(),
e.name());
}
TraceEvent(SevWarn, "GranuleFileUpdaterError", bwData->id).detail("Granule", metadata->keyRange).error(e);
TraceEvent(SevWarn, "GranuleFileUpdaterError", bwData->id).error(e).detail("Granule", metadata->keyRange);
if (granuleCanRetry(e)) {
// explicitly cancel all outstanding write futures BEFORE updating promise stream, to ensure they
@ -2621,7 +2621,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
if (BW_DEBUG) {
printf("Blob worker got error %s. Exiting...\n", e.name());
}
TraceEvent("BlobWorkerDied", self->id).error(e, true);
TraceEvent("BlobWorkerDied", self->id).errorUnsuppressed(e);
}
wait(self->granuleMetadata.clearAsync());

View File

@ -90,6 +90,7 @@ set(FDBSERVER_SRCS
QuietDatabase.actor.cpp
QuietDatabase.h
RadixTree.h
Ratekeeper.h
Ratekeeper.actor.cpp
RatekeeperInterface.h
RecoveryState.h
@ -130,6 +131,8 @@ set(FDBSERVER_SRCS
storageserver.actor.cpp
TagPartitionedLogSystem.actor.cpp
TagPartitionedLogSystem.actor.h
TagThrottler.actor.cpp
TagThrottler.h
template_fdb.h
TCInfo.actor.cpp
TCInfo.h

View File

@ -296,7 +296,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
TraceEvent(SevWarn, "DetectedFailedRecovery", cluster->id).detail("OldMaster", iMaster.id());
} catch (Error& e) {
state Error err = e;
TraceEvent("CCWDB", cluster->id).error(e, true).detail("Master", iMaster.id());
TraceEvent("CCWDB", cluster->id).errorUnsuppressed(e).detail("Master", iMaster.id());
if (e.code() != error_code_actor_cancelled)
wait(delay(0.0));
@ -313,7 +313,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
TEST(err.code() == error_code_restart_cluster_controller); // Terminated due to cluster-controller restart.
if (cluster->shouldCommitSuicide || err.code() == error_code_coordinators_changed) {
TraceEvent("ClusterControllerTerminate", cluster->id).error(err, true);
TraceEvent("ClusterControllerTerminate", cluster->id).errorUnsuppressed(err);
throw restart_cluster_controller();
}
@ -427,10 +427,10 @@ void checkOutstandingStorageRequests(ClusterControllerData* self) {
} catch (Error& e) {
if (e.code() == error_code_no_more_servers) {
TraceEvent(SevWarn, "RecruitStorageNotAvailable", self->id)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("OutstandingReq", i)
.detail("IsCriticalRecruitment", req.first.criticalRecruitment)
.error(e);
.detail("IsCriticalRecruitment", req.first.criticalRecruitment);
} else {
TraceEvent(SevError, "RecruitStorageError", self->id).error(e);
throw;
@ -464,9 +464,9 @@ void checkOutstandingBlobWorkerRequests(ClusterControllerData* self) {
} catch (Error& e) {
if (e.code() == error_code_no_more_servers) {
TraceEvent(SevWarn, "RecruitBlobWorkerNotAvailable", self->id)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("OutstandingReq", i)
.error(e);
.detail("OutstandingReq", i);
} else {
TraceEvent(SevError, "RecruitBlobWorkerError", self->id).error(e);
throw;
@ -876,8 +876,8 @@ void clusterRecruitStorage(ClusterControllerData* self, RecruitStorageRequest re
if (e.code() == error_code_no_more_servers) {
self->outstandingStorageRequests.emplace_back(req, now() + SERVER_KNOBS->RECRUITMENT_TIMEOUT);
TraceEvent(SevWarn, "RecruitStorageNotAvailable", self->id)
.detail("IsCriticalRecruitment", req.criticalRecruitment)
.error(e);
.error(e)
.detail("IsCriticalRecruitment", req.criticalRecruitment);
} else {
TraceEvent(SevError, "RecruitStorageError", self->id).error(e);
throw; // Any other error will bring down the cluster controller
@ -2599,6 +2599,7 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
state bool hasConnected = false;
loop {
try {
wait(connRecord->resolveHostnames());
ServerCoordinators coordinators(connRecord);
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
} catch (Error& e) {

View File

@ -1896,8 +1896,8 @@ public:
throw;
}
TraceEvent(SevWarn, "AttemptingRecruitmentInRemoteDc", id)
.detail("SetPrimaryDesired", setPrimaryDesired)
.error(e);
.error(e)
.detail("SetPrimaryDesired", setPrimaryDesired);
auto reply = findWorkersForConfigurationFromDC(req, regions[1].dcId, checkGoodRecruitment);
if (!setPrimaryDesired) {
std::vector<Optional<Key>> dcPriority;

View File

@ -673,7 +673,9 @@ ACTOR Future<Void> changeCoordinators(Reference<ClusterRecoveryData> self) {
}
try {
wait(self->cstate.move(ClusterConnectionString(changeCoordinatorsRequest.newConnectionString.toString())));
state ClusterConnectionString conn(changeCoordinatorsRequest.newConnectionString.toString());
wait(conn.resolveHostnames());
wait(self->cstate.move(conn));
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
changeCoordinatorsRequest.reply.sendError(e);

View File

@ -1769,17 +1769,17 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
wait(throwErrorOr(ddSnapReq));
} catch (Error& e) {
TraceEvent("SnapCommitProxy_DDSnapResponseError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled*/);
.detail("SnapUID", snapReq.snapUID);
throw e;
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("SnapCommitProxy_SnapReqError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled*/);
.detail("SnapUID", snapReq.snapUID);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -2188,7 +2188,7 @@ ACTOR Future<Void> commitProxyServer(CommitProxyInterface proxy,
whitelistBinPaths);
wait(core || checkRemoved(db, req.recoveryCount, proxy));
} catch (Error& e) {
TraceEvent("CommitProxyTerminated", proxy.id()).error(e, true);
TraceEvent("CommitProxyTerminated", proxy.id()).errorUnsuppressed(e);
if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped &&
e.code() != error_code_tlog_failed && e.code() != error_code_coordinators_changed &&

View File

@ -61,7 +61,7 @@ class WriteToTransactionEnvironment {
Version lastWrittenVersion{ 0 };
static Value longToValue(int64_t v) {
auto s = format("%ld", v);
auto s = format("%lld", v);
return StringRef(reinterpret_cast<uint8_t const*>(s.c_str()), s.size());
}

View File

@ -96,6 +96,7 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local) : Client
}
ServerCoordinators::ServerCoordinators(Reference<IClusterConnectionRecord> ccr) : ClientCoordinators(ccr) {
ASSERT(ccr->connectionStringStatus() == ClusterConnectionString::RESOLVED);
ClusterConnectionString cs = ccr->getConnectionString();
for (auto s = cs.coordinators().begin(); s != cs.coordinators().end(); ++s) {
leaderElectionServers.emplace_back(*s);
@ -205,8 +206,11 @@ ACTOR Future<Void> openDatabase(ClientData* db,
int* clientCount,
Reference<AsyncVar<bool>> hasConnectedClients,
OpenDatabaseCoordRequest req,
Future<Void> checkStuck) {
Future<Void> checkStuck,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state ErrorOr<CachedSerialization<ClientDBInfo>> replyContents;
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> clientInfoOnChange = db->clientInfo->onChange();
++(*clientCount);
hasConnectedClients->set(true);
@ -223,7 +227,15 @@ ACTOR Future<Void> openDatabase(ClientData* db,
replyContents = failed_to_progress();
break;
}
when(wait(yieldedFuture(db->clientInfo->onChange()))) { replyContents = db->clientInfo->get(); }
when(wait(yieldedFuture(clientInfoOnChange))) {
clientInfoOnChange = db->clientInfo->onChange();
replyContents = db->clientInfo->get();
}
when(wait(coordinatorsChangedOnChange)) {
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
replyContents = coordinators_changed();
break;
}
when(wait(delayJittered(SERVER_KNOBS->CLIENT_REGISTER_INTERVAL))) {
if (db->clientInfo->get().read().id.isValid()) {
replyContents = db->clientInfo->get();
@ -254,18 +266,33 @@ ACTOR Future<Void> openDatabase(ClientData* db,
ACTOR Future<Void> remoteMonitorLeader(int* clientCount,
Reference<AsyncVar<bool>> hasConnectedClients,
Reference<AsyncVar<Optional<LeaderInfo>>> currentElectedLeader,
ElectionResultRequest req) {
ElectionResultRequest req,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state bool coordinatorsChangeDetected = false;
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> currentElectedLeaderOnChange = currentElectedLeader->onChange();
++(*clientCount);
hasConnectedClients->set(true);
while (!currentElectedLeader->get().present() || req.knownLeader == currentElectedLeader->get().get().changeID) {
choose {
when(wait(yieldedFuture(currentElectedLeader->onChange()))) {}
when(wait(yieldedFuture(currentElectedLeaderOnChange))) {
currentElectedLeaderOnChange = currentElectedLeader->onChange();
}
when(wait(coordinatorsChangedOnChange)) {
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
coordinatorsChangeDetected = true;
break;
}
when(wait(delayJittered(SERVER_KNOBS->CLIENT_REGISTER_INTERVAL))) { break; }
}
}
req.reply.send(currentElectedLeader->get());
if (coordinatorsChangeDetected) {
req.reply.sendError(coordinators_changed());
} else {
req.reply.send(currentElectedLeader->get());
}
if (--(*clientCount) == 0) {
hasConnectedClients->set(false);
@ -296,6 +323,9 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
state Reference<AsyncVar<Optional<LeaderInfo>>> currentElectedLeader =
makeReference<AsyncVar<Optional<LeaderInfo>>>();
state LivenessChecker canConnectToLeader(SERVER_KNOBS->COORDINATOR_LEADER_CONNECTION_TIMEOUT);
state Reference<AsyncVar<Void>> coordinatorsChanged = makeReference<AsyncVar<Void>>();
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> hasConnectedClientsOnChange = hasConnectedClients->onChange();
loop choose {
when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) {
@ -306,10 +336,14 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
} else {
if (!leaderMon.isValid()) {
leaderMon = monitorLeaderAndGetClientInfo(
req.clusterKey, req.coordinators, &clientData, currentElectedLeader);
req.clusterKey, req.coordinators, &clientData, currentElectedLeader, coordinatorsChanged);
}
actors.add(
openDatabase(&clientData, &clientCount, hasConnectedClients, req, canConnectToLeader.checkStuck()));
actors.add(openDatabase(&clientData,
&clientCount,
hasConnectedClients,
req,
canConnectToLeader.checkStuck(),
coordinatorsChanged));
}
}
when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) {
@ -318,10 +352,11 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
req.reply.send(currentElectedLeader->get());
} else {
if (!leaderMon.isValid()) {
leaderMon =
monitorLeaderAndGetClientInfo(req.key, req.coordinators, &clientData, currentElectedLeader);
leaderMon = monitorLeaderAndGetClientInfo(
req.key, req.coordinators, &clientData, currentElectedLeader, coordinatorsChanged);
}
actors.add(remoteMonitorLeader(&clientCount, hasConnectedClients, currentElectedLeader, req));
actors.add(remoteMonitorLeader(
&clientCount, hasConnectedClients, currentElectedLeader, req, coordinatorsChanged));
}
}
when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) {
@ -454,13 +489,18 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
notify.pop_front();
}
}
when(wait(hasConnectedClients->onChange())) {
when(wait(hasConnectedClientsOnChange)) {
hasConnectedClientsOnChange = hasConnectedClients->onChange();
if (!hasConnectedClients->get() && !nextInterval.isValid()) {
TraceEvent("LeaderRegisterUnneeded").detail("Key", key);
return Void();
}
}
when(wait(actors.getResult())) {}
when(wait(coordinatorsChangedOnChange)) {
leaderMon = Future<Void>();
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
}
}
}
@ -756,7 +796,7 @@ ACTOR Future<Void> coordinationServer(std::string dataFolder,
store.getError() || configDatabaseServer);
throw internal_error();
} catch (Error& e) {
TraceEvent("CoordinationServerError", myID).error(e, true);
TraceEvent("CoordinationServerError", myID).errorUnsuppressed(e);
throw;
}
}
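The hunks above introduce a shared AsyncVar<Void> signal (coordinatorsChanged) and consistently cache each onChange() future in a state variable, re-arming it only after it fires; presumably this is so a change that arrives while another when() branch is being serviced is not dropped, and so clients waiting in openDatabase/remoteMonitorLeader can be answered with coordinators_changed() instead of hanging on stale coordinators. A minimal flow-style sketch of the re-arm pattern (illustrative names only; assumes flow's ACTOR compiler, AsyncVar, choose/when, and delay are available):

ACTOR Future<Void> watchSignal(Reference<AsyncVar<Void>> signal) {
	// Take the onChange() future once, keep it in a state variable across wait()s,
	// and re-create it only after it fires.
	state Future<Void> signalOnChange = signal->onChange();
	loop {
		choose {
			when(wait(signalOnChange)) {
				signalOnChange = signal->onChange(); // re-arm for the next change
				// ... react to the change (e.g. reply with coordinators_changed()) ...
			}
			when(wait(delay(1.0))) {
				// periodic work; the cached future is not recreated here, so a change
				// that fires while this branch runs is still observed next iteration
			}
		}
	}
}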

View File

@ -178,7 +178,7 @@ class WorkPool final : public IThreadPool, public ReferenceCounted<WorkPool<Thre
stopped.send(Void());
return;
} catch (Error& e) {
TraceEvent("WorkPoolError").error(e, true);
TraceEvent("WorkPoolError").errorUnsuppressed(e);
error.sendError(e);
} catch (...) {
TraceEvent("WorkPoolError").log();
@ -256,10 +256,10 @@ public:
pool->queueLock.enter();
TraceEvent("WorkPool_Stop")
.errorUnsuppressed(e)
.detail("Workers", pool->workers.size())
.detail("Idle", pool->idle.size())
.detail("Work", pool->work.size())
.error(e, true);
.detail("Work", pool->work.size());
for (uint32_t i = 0; i < pool->work.size(); i++)
pool->work[i]->cancel(); // What if cancel() does something to this?

View File

@ -154,7 +154,7 @@ class WorkPool final : public IThreadPool, public ReferenceCounted<WorkPool<Thre
stopped.send(Void());
return;
} catch (Error& e) {
TraceEvent("WorkPoolError").error(e, true);
TraceEvent("WorkPoolError").errorUnsuppressed(e);
error.sendError(e);
} catch (...) {
TraceEvent("WorkPoolError").log();
@ -232,10 +232,10 @@ public:
pool->queueLock.enter();
TraceEvent("WorkPool_Stop")
.errorUnsuppressed(e)
.detail("Workers", pool->workers.size())
.detail("Idle", pool->idle.size())
.detail("Work", pool->work.size())
.error(e, true);
.detail("Work", pool->work.size());
for (uint32_t i = 0; i < pool->work.size(); i++)
pool->work[i]->cancel(); // What if cancel() does something to this?
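Throughout this diff, TraceEvent(...).error(e, true) — the old includeCancelled form — is replaced with .errorUnsuppressed(e), which appears to be the clearer-named equivalent: the error is attached even when it would normally be suppressed (for example actor_cancelled). The call sites are also consistently reordered so the error is attached before any detail fields. A small illustration of the target shape (event name and detail fields are hypothetical):

TraceEvent("WorkerTaskFailed", workerId)     // hypothetical event name and UID
    .errorUnsuppressed(e)                    // was: .error(e, true /*includeCancelled*/)
    .detail("Task", taskName)                // detail fields follow the error
    .detail("Attempt", attempt);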

File diff suppressed because it is too large

View File

@ -171,6 +171,7 @@ typedef AsyncMap<UID, ServerStatus> ServerStatusMap;
class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
friend class DDTeamCollectionImpl;
friend class DDTeamCollectionUnitTest;
enum class Status { NONE = 0, WIGGLING = 1, EXCLUDED = 2, FAILED = 3 };
@ -521,6 +522,37 @@ class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
void noHealthyTeams() const;
// To enable verbose debug info, set shouldPrint to true
void traceAllInfo(bool shouldPrint = false) const;
// Check if the server belongs to a machine; if not, create the machine.
// Establish the two-way link between server and machine
Reference<TCMachineInfo> checkAndCreateMachine(Reference<TCServerInfo> server);
// Group storage servers (processes) based on their machineId in LocalityData
// All created machines are healthy
// Return the number of healthy servers we grouped into machines
int constructMachinesFromServers();
// Create machineTeamsToBuild number of machine teams
// No operation if machineTeamsToBuild is 0
// Note: The creation of machine teams should not depend on server teams:
// No matter how server teams are created, we will create the same set of machine teams;
// We should never use the server team number when building machine teams.
//
// The five steps to create each machine team are documented in the function.
// Reuse ReplicationPolicy's selectReplicas func to select the machine team.
// Return the number of added machine teams
int addBestMachineTeams(int machineTeamsToBuild);
// Sanity check the property of teams in unit test
// Return true if all server teams belong to machine teams
bool sanityCheckTeams() const;
void disableBuildingTeams() { doBuildTeams = false; }
void setCheckTeamDelay() { this->checkTeamDelay = Void(); }
public:
Database cx;
@ -595,39 +627,6 @@ public:
void addTeam(std::set<UID> const& team, bool isInitialTeam) { addTeam(team.begin(), team.end(), isInitialTeam); }
// FIXME: Public for testing only
void disableBuildingTeams() { doBuildTeams = false; }
// FIXME: Public for testing only
void setCheckTeamDelay() { this->checkTeamDelay = Void(); }
// FIXME: Public for testing only
// Group storage servers (process) based on their machineId in LocalityData
// All created machines are healthy
// Return The number of healthy servers we grouped into machines
int constructMachinesFromServers();
// FIXME: Public for testing only
// To enable verbose debug info, set shouldPrint to true
void traceAllInfo(bool shouldPrint = false) const;
// FIXME: Public for testing only
// Create machineTeamsToBuild number of machine teams
// No operation if machineTeamsToBuild is 0
// Note: The creation of machine teams should not depend on server teams:
// No matter how server teams will be created, we will create the same set of machine teams;
// We should never use server team number in building machine teams.
//
// Five steps to create each machine team, which are document in the function
// Reuse ReplicationPolicy selectReplicas func to select machine team
// return number of added machine teams
int addBestMachineTeams(int machineTeamsToBuild);
// FIXME: Public for testing only
// Sanity check the property of teams in unit test
// Return true if all server teams belong to machine teams
bool sanityCheckTeams() const;
// Create server teams based on machine teams
// Before the number of machine teams reaches the threshold, build a machine team for each server team
// When it reaches the threshold, first try to build a server team with existing machine teams; if failed,
@ -642,11 +641,6 @@ public:
bool removeTeam(Reference<TCTeamInfo> team);
// FIXME: Public for testing only
// Check if the server belongs to a machine; if not, create the machine.
// Establish the two-direction link between server and machine
Reference<TCMachineInfo> checkAndCreateMachine(Reference<TCServerInfo> server);
void removeTSS(UID removedServer);
void removeServer(UID removedServer);
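The header change above moves the test-only helpers (checkAndCreateMachine, constructMachinesFromServers, addBestMachineTeams, sanityCheckTeams, and the FIXME-marked setters) out of the public section; they remain reachable from tests through the friend class DDTeamCollectionUnitTest declaration. A standalone sketch of that friend-class-for-tests pattern, with hypothetical names:

#include <cassert>

class TeamCollection {
	friend class TeamCollectionUnitTest; // grants the unit test access to private helpers
	int machineCount = 0;
	int constructMachines() { return ++machineCount; } // private: not part of the production API
public:
	// ... production API only ...
};

class TeamCollectionUnitTest {
public:
	static void run() {
		TeamCollection tc;
		assert(tc.constructMachines() == 1); // legal because of the friend declaration
	}
};

int main() {
	TeamCollectionUnitTest::run();
}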

View File

@ -865,7 +865,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
}
bool ddEnabled = wait(isDataDistributionEnabled(cx, ddEnabledState));
TraceEvent("DataDistributionMoveKeysConflict").detail("DataDistributionEnabled", ddEnabled).error(err);
TraceEvent("DataDistributionMoveKeysConflict").error(err).detail("DataDistributionEnabled", ddEnabled);
if (ddEnabled) {
throw err;
}
@ -891,7 +891,7 @@ Future<Void> sendSnapReq(RequestStream<Req> stream, Req req, Error e) {
ErrorOr<REPLY_TYPE(Req)> reply = wait(stream.tryGetReply(req));
if (reply.isError()) {
TraceEvent("SnapDataDistributor_ReqError")
.error(reply.getError(), true)
.errorUnsuppressed(reply.getError())
.detail("ConvertedErrorType", e.what())
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
throw e;
@ -1012,9 +1012,9 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
} catch (Error& err) {
state Error e = err;
TraceEvent("SnapDataDistributor_SnapReqExit")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */);
.detail("SnapUID", snapReq.snapUID);
if (e.code() == error_code_snap_storage_failed || e.code() == error_code_snap_tlog_failed ||
e.code() == error_code_operation_cancelled || e.code() == error_code_snap_disable_tlog_pop_failed) {
// enable tlog pop on local tlog nodes
@ -1072,9 +1072,9 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq,
}
} catch (Error& e) {
TraceEvent("SnapDDCreateError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */);
.detail("SnapUID", snapReq.snapUID);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -1251,10 +1251,10 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
}
} catch (Error& err) {
if (normalDataDistributorErrors().count(err.code()) == 0) {
TraceEvent("DataDistributorError", di.id()).error(err, true);
TraceEvent("DataDistributorError", di.id()).errorUnsuppressed(err);
throw err;
}
TraceEvent("DataDistributorDied", di.id()).error(err, true);
TraceEvent("DataDistributorDied", di.id()).errorUnsuppressed(err);
}
return Void();

View File

@ -1265,10 +1265,12 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
}
}
} catch (Error& e) {
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime);
TraceEvent(relocateShardInterval.end(), distributorId)
.errorUnsuppressed(e)
.detail("Duration", now() - startTime);
if (now() - startTime > 600) {
TraceEvent(SevWarnAlways, "RelocateShardTooLong")
.error(e, true)
.errorUnsuppressed(e)
.detail("Duration", now() - startTime)
.detail("Dest", describe(destIds))
.detail("Src", describe(rd.src));
@ -1540,8 +1542,8 @@ ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionInde
traceEvent.detail("ResetCount", resetCount);
tr.reset();
} catch (Error& e) {
traceEvent.error(
e, true); // Log actor_cancelled because it's not legal to suppress an event that's initialized
// Log actor_cancelled because it's not legal to suppress an event that's initialized
traceEvent.errorUnsuppressed(e);
wait(tr.onError(e));
}
@ -1655,8 +1657,8 @@ ACTOR Future<Void> BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex)
traceEvent.detail("ResetCount", resetCount);
tr.reset();
} catch (Error& e) {
traceEvent.error(
e, true); // Log actor_cancelled because it's not legal to suppress an event that's initialized
// Log actor_cancelled because it's not legal to suppress an event that's initialized
traceEvent.errorUnsuppressed(e);
wait(tr.onError(e));
}

View File

@ -492,7 +492,9 @@ public:
delete pageMem;
TEST(true); // push error
TEST(2 == syncFiles.size()); // push spanning both files error
TraceEvent(SevError, "RDQPushAndCommitError", dbgid).error(e, true).detail("InitialFilename0", filename);
TraceEvent(SevError, "RDQPushAndCommitError", dbgid)
.errorUnsuppressed(e)
.detail("InitialFilename0", filename);
if (errorPromise.canBeSet())
errorPromise.sendError(e);
@ -612,7 +614,7 @@ public:
.detail("File0", self->filename(0));
} catch (Error& e) {
TraceEvent(SevError, "DiskQueueShutdownError", self->dbgid)
.error(e, true)
.errorUnsuppressed(e)
.detail("Reason", e.code() == error_code_platform_error ? "could not delete database" : "unknown");
error = e;
}
@ -731,7 +733,7 @@ public:
} catch (Error& e) {
bool ok = e.code() == error_code_file_not_found;
TraceEvent(ok ? SevInfo : SevError, "RDQReadFirstAndLastPagesError", self->dbgid)
.error(e, true)
.errorUnsuppressed(e)
.detail("File0Name", self->files[0].dbgFilename);
if (!self->error.isSet())
self->error.sendError(e);
@ -804,7 +806,7 @@ public:
} catch (Error& e) {
TEST(true); // Read next page error
TraceEvent(SevError, "RDQReadNextPageError", self->dbgid)
.error(e, true)
.errorUnsuppressed(e)
.detail("File0Name", self->files[0].dbgFilename);
if (!self->error.isSet())
self->error.sendError(e);

View File

@ -58,7 +58,7 @@ ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface,
}
}
} catch (Error& e) {
TraceEvent("EKP_Terminated", ekpInterface.id()).error(e, true);
TraceEvent("EKP_Terminated", ekpInterface.id()).errorUnsuppressed(e);
}
return Void();

View File

@ -991,7 +991,7 @@ ACTOR Future<Void> grvProxyServer(GrvProxyInterface proxy,
state Future<Void> core = grvProxyServerCore(proxy, req.master, req.masterLifetime, db);
wait(core || checkRemoved(db, req.recoveryCount, proxy));
} catch (Error& e) {
TraceEvent("GrvProxyTerminated", proxy.id()).error(e, true);
TraceEvent("GrvProxyTerminated", proxy.id()).errorUnsuppressed(e);
if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped &&
e.code() != error_code_tlog_failed && e.code() != error_code_coordinators_changed &&

View File

@ -634,7 +634,7 @@ private:
} catch (Error& e) {
bool ok = e.code() == error_code_operation_cancelled || e.code() == error_code_file_not_found ||
e.code() == error_code_disk_adapter_reset;
TraceEvent(ok ? SevInfo : SevError, "ErrorDuringRecovery", dbgid).error(e, true);
TraceEvent(ok ? SevInfo : SevError, "ErrorDuringRecovery", dbgid).errorUnsuppressed(e);
if (e.code() != error_code_disk_adapter_reset) {
throw e;
}

View File

@ -11,6 +11,8 @@
#include <rocksdb/version.h>
#include <rocksdb/utilities/table_properties_collectors.h>
#include <rocksdb/rate_limiter.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/c.h>
#if defined __has_include
#if __has_include(<liburing.h>)
#include <liburing.h>
@ -312,6 +314,271 @@ private:
uint64_t iteratorsReuseCount;
};
class PerfContextMetrics {
public:
PerfContextMetrics();
void reset();
void set(int index);
void log(bool ignoreZeroMetric);
private:
std::vector<std::tuple<const char*, int, std::vector<uint64_t>>> metrics;
uint64_t getRocksdbPerfcontextMetric(int metric);
};
PerfContextMetrics::PerfContextMetrics() {
metrics = {
{ "UserKeyComparisonCount", rocksdb_user_key_comparison_count, {} },
{ "BlockCacheHitCount", rocksdb_block_cache_hit_count, {} },
{ "BlockReadCount", rocksdb_block_read_count, {} },
{ "BlockReadByte", rocksdb_block_read_byte, {} },
{ "BlockReadTime", rocksdb_block_read_time, {} },
{ "BlockChecksumTime", rocksdb_block_checksum_time, {} },
{ "BlockDecompressTime", rocksdb_block_decompress_time, {} },
{ "GetReadBytes", rocksdb_get_read_bytes, {} },
{ "MultigetReadBytes", rocksdb_multiget_read_bytes, {} },
{ "IterReadBytes", rocksdb_iter_read_bytes, {} },
{ "InternalKeySkippedCount", rocksdb_internal_key_skipped_count, {} },
{ "InternalDeleteSkippedCount", rocksdb_internal_delete_skipped_count, {} },
{ "InternalRecentSkippedCount", rocksdb_internal_recent_skipped_count, {} },
{ "InternalMergeCount", rocksdb_internal_merge_count, {} },
{ "GetSnapshotTime", rocksdb_get_snapshot_time, {} },
{ "GetFromMemtableTime", rocksdb_get_from_memtable_time, {} },
{ "GetFromMemtableCount", rocksdb_get_from_memtable_count, {} },
{ "GetPostProcessTime", rocksdb_get_post_process_time, {} },
{ "GetFromOutputFilesTime", rocksdb_get_from_output_files_time, {} },
{ "SeekOnMemtableTime", rocksdb_seek_on_memtable_time, {} },
{ "SeekOnMemtableCount", rocksdb_seek_on_memtable_count, {} },
{ "NextOnMemtableCount", rocksdb_next_on_memtable_count, {} },
{ "PrevOnMemtableCount", rocksdb_prev_on_memtable_count, {} },
{ "SeekChildSeekTime", rocksdb_seek_child_seek_time, {} },
{ "SeekChildSeekCount", rocksdb_seek_child_seek_count, {} },
{ "SeekMinHeapTime", rocksdb_seek_min_heap_time, {} },
{ "SeekMaxHeapTime", rocksdb_seek_max_heap_time, {} },
{ "SeekInternalSeekTime", rocksdb_seek_internal_seek_time, {} },
{ "FindNextUserEntryTime", rocksdb_find_next_user_entry_time, {} },
{ "WriteWalTime", rocksdb_write_wal_time, {} },
{ "WriteMemtableTime", rocksdb_write_memtable_time, {} },
{ "WriteDelayTime", rocksdb_write_delay_time, {} },
{ "WritePreAndPostProcessTime", rocksdb_write_pre_and_post_process_time, {} },
{ "DbMutexLockNanos", rocksdb_db_mutex_lock_nanos, {} },
{ "DbConditionWaitNanos", rocksdb_db_condition_wait_nanos, {} },
{ "MergeOperatorTimeNanos", rocksdb_merge_operator_time_nanos, {} },
{ "ReadIndexBlockNanos", rocksdb_read_index_block_nanos, {} },
{ "ReadFilterBlockNanos", rocksdb_read_filter_block_nanos, {} },
{ "NewTableBlockIterNanos", rocksdb_new_table_block_iter_nanos, {} },
{ "NewTableIteratorNanos", rocksdb_new_table_iterator_nanos, {} },
{ "BlockSeekNanos", rocksdb_block_seek_nanos, {} },
{ "FindTableNanos", rocksdb_find_table_nanos, {} },
{ "BloomMemtableHitCount", rocksdb_bloom_memtable_hit_count, {} },
{ "BloomMemtableMissCount", rocksdb_bloom_memtable_miss_count, {} },
{ "BloomSstHitCount", rocksdb_bloom_sst_hit_count, {} },
{ "BloomSstMissCount", rocksdb_bloom_sst_miss_count, {} },
{ "KeyLockWaitTime", rocksdb_key_lock_wait_time, {} },
{ "KeyLockWaitCount", rocksdb_key_lock_wait_count, {} },
{ "EnvNewSequentialFileNanos", rocksdb_env_new_sequential_file_nanos, {} },
{ "EnvNewRandomAccessFileNanos", rocksdb_env_new_random_access_file_nanos, {} },
{ "EnvNewWritableFileNanos", rocksdb_env_new_writable_file_nanos, {} },
{ "EnvReuseWritableFileNanos", rocksdb_env_reuse_writable_file_nanos, {} },
{ "EnvNewRandomRwFileNanos", rocksdb_env_new_random_rw_file_nanos, {} },
{ "EnvNewDirectoryNanos", rocksdb_env_new_directory_nanos, {} },
{ "EnvFileExistsNanos", rocksdb_env_file_exists_nanos, {} },
{ "EnvGetChildrenNanos", rocksdb_env_get_children_nanos, {} },
{ "EnvGetChildrenFileAttributesNanos", rocksdb_env_get_children_file_attributes_nanos, {} },
{ "EnvDeleteFileNanos", rocksdb_env_delete_file_nanos, {} },
{ "EnvCreateDirNanos", rocksdb_env_create_dir_nanos, {} },
{ "EnvCreateDirIfMissingNanos", rocksdb_env_create_dir_if_missing_nanos, {} },
{ "EnvDeleteDirNanos", rocksdb_env_delete_dir_nanos, {} },
{ "EnvGetFileSizeNanos", rocksdb_env_get_file_size_nanos, {} },
{ "EnvGetFileModificationTimeNanos", rocksdb_env_get_file_modification_time_nanos, {} },
{ "EnvRenameFileNanos", rocksdb_env_rename_file_nanos, {} },
{ "EnvLinkFileNanos", rocksdb_env_link_file_nanos, {} },
{ "EnvLockFileNanos", rocksdb_env_lock_file_nanos, {} },
{ "EnvUnlockFileNanos", rocksdb_env_unlock_file_nanos, {} },
{ "EnvNewLoggerNanos", rocksdb_env_new_logger_nanos, {} },
};
for (auto& [name, metric, vals] : metrics) { // readers, then writer
for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) {
vals.push_back(0); // add reader
}
vals.push_back(0); // add writer
}
}
void PerfContextMetrics::reset() {
rocksdb::get_perf_context()->Reset();
}
void PerfContextMetrics::set(int index) {
for (auto& [name, metric, vals] : metrics) {
vals[index] = getRocksdbPerfcontextMetric(metric);
}
}
void PerfContextMetrics::log(bool ignoreZeroMetric) {
TraceEvent e("RocksDBPerfContextMetrics");
e.setMaxEventLength(20000);
for (auto& [name, metric, vals] : metrics) {
uint64_t s = 0;
for (auto& v : vals) {
s = s + v;
}
if (ignoreZeroMetric && s == 0)
continue;
e.detail("Sum" + (std::string)name, s);
for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) {
if (vals[i] != 0)
e.detail("RD" + std::to_string(i) + name, vals[i]);
}
if (vals[SERVER_KNOBS->ROCKSDB_READ_PARALLELISM] != 0)
e.detail("WR" + (std::string)name, vals[SERVER_KNOBS->ROCKSDB_READ_PARALLELISM]);
}
}
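PerfContextMetrics keeps one accumulator slot per read thread plus a final slot for the writer; log() emits the sum across all slots for each metric, then the non-zero per-thread values, and can skip metrics whose sum is zero. A small plain-C++ sketch of that aggregation shape (hypothetical output format; the real code emits TraceEvent details rather than printing):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// slots[0..N-2] hold reader-thread values, slots[N-1] holds the writer's value.
void logMetric(const std::string& name, const std::vector<uint64_t>& slots, bool ignoreZero) {
	uint64_t sum = 0;
	for (uint64_t v : slots)
		sum += v;
	if (ignoreZero && sum == 0)
		return; // this metric never fired on any thread
	std::printf("Sum%s=%llu\n", name.c_str(), (unsigned long long)sum);
	for (size_t i = 0; i + 1 < slots.size(); ++i)
		if (slots[i] != 0)
			std::printf("RD%zu%s=%llu\n", i, name.c_str(), (unsigned long long)slots[i]);
	if (!slots.empty() && slots.back() != 0)
		std::printf("WR%s=%llu\n", name.c_str(), (unsigned long long)slots.back());
}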
uint64_t PerfContextMetrics::getRocksdbPerfcontextMetric(int metric) {
switch (metric) {
case rocksdb_user_key_comparison_count:
return rocksdb::get_perf_context()->user_key_comparison_count;
case rocksdb_block_cache_hit_count:
return rocksdb::get_perf_context()->block_cache_hit_count;
case rocksdb_block_read_count:
return rocksdb::get_perf_context()->block_read_count;
case rocksdb_block_read_byte:
return rocksdb::get_perf_context()->block_read_byte;
case rocksdb_block_read_time:
return rocksdb::get_perf_context()->block_read_time;
case rocksdb_block_checksum_time:
return rocksdb::get_perf_context()->block_checksum_time;
case rocksdb_block_decompress_time:
return rocksdb::get_perf_context()->block_decompress_time;
case rocksdb_get_read_bytes:
return rocksdb::get_perf_context()->get_read_bytes;
case rocksdb_multiget_read_bytes:
return rocksdb::get_perf_context()->multiget_read_bytes;
case rocksdb_iter_read_bytes:
return rocksdb::get_perf_context()->iter_read_bytes;
case rocksdb_internal_key_skipped_count:
return rocksdb::get_perf_context()->internal_key_skipped_count;
case rocksdb_internal_delete_skipped_count:
return rocksdb::get_perf_context()->internal_delete_skipped_count;
case rocksdb_internal_recent_skipped_count:
return rocksdb::get_perf_context()->internal_recent_skipped_count;
case rocksdb_internal_merge_count:
return rocksdb::get_perf_context()->internal_merge_count;
case rocksdb_get_snapshot_time:
return rocksdb::get_perf_context()->get_snapshot_time;
case rocksdb_get_from_memtable_time:
return rocksdb::get_perf_context()->get_from_memtable_time;
case rocksdb_get_from_memtable_count:
return rocksdb::get_perf_context()->get_from_memtable_count;
case rocksdb_get_post_process_time:
return rocksdb::get_perf_context()->get_post_process_time;
case rocksdb_get_from_output_files_time:
return rocksdb::get_perf_context()->get_from_output_files_time;
case rocksdb_seek_on_memtable_time:
return rocksdb::get_perf_context()->seek_on_memtable_time;
case rocksdb_seek_on_memtable_count:
return rocksdb::get_perf_context()->seek_on_memtable_count;
case rocksdb_next_on_memtable_count:
return rocksdb::get_perf_context()->next_on_memtable_count;
case rocksdb_prev_on_memtable_count:
return rocksdb::get_perf_context()->prev_on_memtable_count;
case rocksdb_seek_child_seek_time:
return rocksdb::get_perf_context()->seek_child_seek_time;
case rocksdb_seek_child_seek_count:
return rocksdb::get_perf_context()->seek_child_seek_count;
case rocksdb_seek_min_heap_time:
return rocksdb::get_perf_context()->seek_min_heap_time;
case rocksdb_seek_max_heap_time:
return rocksdb::get_perf_context()->seek_max_heap_time;
case rocksdb_seek_internal_seek_time:
return rocksdb::get_perf_context()->seek_internal_seek_time;
case rocksdb_find_next_user_entry_time:
return rocksdb::get_perf_context()->find_next_user_entry_time;
case rocksdb_write_wal_time:
return rocksdb::get_perf_context()->write_wal_time;
case rocksdb_write_memtable_time:
return rocksdb::get_perf_context()->write_memtable_time;
case rocksdb_write_delay_time:
return rocksdb::get_perf_context()->write_delay_time;
case rocksdb_write_pre_and_post_process_time:
return rocksdb::get_perf_context()->write_pre_and_post_process_time;
case rocksdb_db_mutex_lock_nanos:
return rocksdb::get_perf_context()->db_mutex_lock_nanos;
case rocksdb_db_condition_wait_nanos:
return rocksdb::get_perf_context()->db_condition_wait_nanos;
case rocksdb_merge_operator_time_nanos:
return rocksdb::get_perf_context()->merge_operator_time_nanos;
case rocksdb_read_index_block_nanos:
return rocksdb::get_perf_context()->read_index_block_nanos;
case rocksdb_read_filter_block_nanos:
return rocksdb::get_perf_context()->read_filter_block_nanos;
case rocksdb_new_table_block_iter_nanos:
return rocksdb::get_perf_context()->new_table_block_iter_nanos;
case rocksdb_new_table_iterator_nanos:
return rocksdb::get_perf_context()->new_table_iterator_nanos;
case rocksdb_block_seek_nanos:
return rocksdb::get_perf_context()->block_seek_nanos;
case rocksdb_find_table_nanos:
return rocksdb::get_perf_context()->find_table_nanos;
case rocksdb_bloom_memtable_hit_count:
return rocksdb::get_perf_context()->bloom_memtable_hit_count;
case rocksdb_bloom_memtable_miss_count:
return rocksdb::get_perf_context()->bloom_memtable_miss_count;
case rocksdb_bloom_sst_hit_count:
return rocksdb::get_perf_context()->bloom_sst_hit_count;
case rocksdb_bloom_sst_miss_count:
return rocksdb::get_perf_context()->bloom_sst_miss_count;
case rocksdb_key_lock_wait_time:
return rocksdb::get_perf_context()->key_lock_wait_time;
case rocksdb_key_lock_wait_count:
return rocksdb::get_perf_context()->key_lock_wait_count;
case rocksdb_env_new_sequential_file_nanos:
return rocksdb::get_perf_context()->env_new_sequential_file_nanos;
case rocksdb_env_new_random_access_file_nanos:
return rocksdb::get_perf_context()->env_new_random_access_file_nanos;
case rocksdb_env_new_writable_file_nanos:
return rocksdb::get_perf_context()->env_new_writable_file_nanos;
case rocksdb_env_reuse_writable_file_nanos:
return rocksdb::get_perf_context()->env_reuse_writable_file_nanos;
case rocksdb_env_new_random_rw_file_nanos:
return rocksdb::get_perf_context()->env_new_random_rw_file_nanos;
case rocksdb_env_new_directory_nanos:
return rocksdb::get_perf_context()->env_new_directory_nanos;
case rocksdb_env_file_exists_nanos:
return rocksdb::get_perf_context()->env_file_exists_nanos;
case rocksdb_env_get_children_nanos:
return rocksdb::get_perf_context()->env_get_children_nanos;
case rocksdb_env_get_children_file_attributes_nanos:
return rocksdb::get_perf_context()->env_get_children_file_attributes_nanos;
case rocksdb_env_delete_file_nanos:
return rocksdb::get_perf_context()->env_delete_file_nanos;
case rocksdb_env_create_dir_nanos:
return rocksdb::get_perf_context()->env_create_dir_nanos;
case rocksdb_env_create_dir_if_missing_nanos:
return rocksdb::get_perf_context()->env_create_dir_if_missing_nanos;
case rocksdb_env_delete_dir_nanos:
return rocksdb::get_perf_context()->env_delete_dir_nanos;
case rocksdb_env_get_file_size_nanos:
return rocksdb::get_perf_context()->env_get_file_size_nanos;
case rocksdb_env_get_file_modification_time_nanos:
return rocksdb::get_perf_context()->env_get_file_modification_time_nanos;
case rocksdb_env_rename_file_nanos:
return rocksdb::get_perf_context()->env_rename_file_nanos;
case rocksdb_env_link_file_nanos:
return rocksdb::get_perf_context()->env_link_file_nanos;
case rocksdb_env_lock_file_nanos:
return rocksdb::get_perf_context()->env_lock_file_nanos;
case rocksdb_env_unlock_file_nanos:
return rocksdb::get_perf_context()->env_unlock_file_nanos;
case rocksdb_env_new_logger_nanos:
return rocksdb::get_perf_context()->env_new_logger_nanos;
default:
break;
}
return 0;
}
ACTOR Future<Void> refreshReadIteratorPool(std::shared_ptr<ReadIteratorPool> readIterPool) {
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
loop {
@ -336,6 +603,7 @@ ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetc
}
ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> statistics,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
rocksdb::DB* db,
std::shared_ptr<ReadIteratorPool> readIterPool) {
state std::vector<std::tuple<const char*, uint32_t, uint64_t>> tickerStats = {
@ -431,6 +699,10 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
stat = readIterPool->numTimesReadIteratorsReused();
e.detail("NumTimesReadIteratorsReused", stat - readIteratorPoolStats["NumTimesReadIteratorsReused"]);
readIteratorPoolStats["NumTimesReadIteratorsReused"] = stat;
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
perfContextMetrics->log(true);
}
}
}
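The metric logger reports cumulative RocksDB counters as deltas since the previous report (note the "stat - readIteratorPoolStats[...]" pattern above) and, when the perf-context knob is on, also flushes the per-thread perf-context slots. A plain-C++ sketch of the delta-since-last-report idea (hypothetical metric source):

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

struct DeltaLogger {
	std::map<std::string, uint64_t> lastReported;

	// Counters are cumulative; report only how much each one grew since the last call.
	void report(const std::map<std::string, uint64_t>& current) {
		for (const auto& [name, value] : current) {
			uint64_t delta = value - lastReported[name];
			std::printf("%s=%llu\n", name.c_str(), (unsigned long long)delta);
			lastReported[name] = value;
		}
	}
};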
@ -458,6 +730,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
struct Writer : IThreadPoolReceiver {
DB& db;
UID id;
std::shared_ptr<rocksdb::RateLimiter> rateLimiter;
Reference<Histogram> commitLatencyHistogram;
@ -466,9 +739,16 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Reference<Histogram> writeHistogram;
Reference<Histogram> deleteCompactRangeHistogram;
std::shared_ptr<ReadIteratorPool> readIterPool;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
int threadIndex;
explicit Writer(DB& db, UID id, std::shared_ptr<ReadIteratorPool> readIterPool)
: db(db), id(id), readIterPool(readIterPool),
explicit Writer(DB& db,
UID id,
std::shared_ptr<ReadIteratorPool> readIterPool,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
int threadIndex)
: db(db), id(id), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics),
threadIndex(threadIndex),
rateLimiter(SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC > 0
? rocksdb::NewGenericRateLimiter(
SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, // rate_bytes_per_sec
@ -491,7 +771,13 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Histogram::Unit::microseconds)),
deleteCompactRangeHistogram(Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP,
ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM,
Histogram::Unit::microseconds)) {}
Histogram::Unit::microseconds)) {
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
// Enable perf context on the same thread as the db thread
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
perfContextMetrics->reset();
}
}
~Writer() override {
if (db) {
@ -542,11 +828,11 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// The current thread and the main thread are the same when the code runs in simulation.
// blockUntilReady() would deadlock in that case, so the metricsLogger is called directly.
a.metrics = rocksDBMetricLogger(options.statistics, db, readIterPool) &&
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
} else {
onMainThread([&] {
a.metrics = rocksDBMetricLogger(options.statistics, db, readIterPool) &&
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
return Future<bool>(true);
}).blockUntilReady();
@ -586,6 +872,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
};
void action(CommitAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double commitBeginTime;
if (a.getHistograms) {
commitBeginTime = timer_monotonic();
@ -632,6 +924,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
commitActionHistogram->sampleSeconds(currTime - commitBeginTime);
commitLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
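Each Writer/Reader action decides up front whether to capture perf-context data for this particular operation: the knob must be enabled and a random draw must fall under ROCKSDB_PERFCONTEXT_SAMPLE_RATE; the counters are reset before the work and stored into the thread's slot afterwards. A standalone plain-C++ sketch of that sampling wrapper (names are illustrative, not the RocksDB API):

#include <functional>
#include <random>

struct SampledCapture {
	double sampleRate; // e.g. 0.01 captures roughly 1% of operations
	std::mt19937_64 rng{ std::random_device{}() };

	void runOperation(const std::function<void()>& resetMetrics,
	                  const std::function<void()>& storeMetrics,
	                  const std::function<void()>& op) {
		std::uniform_real_distribution<double> dist(0.0, 1.0);
		bool capture = dist(rng) < sampleRate; // decide before doing the work
		if (capture)
			resetMetrics();                    // start from clean counters
		op();                                  // the actual commit / read
		if (capture)
			storeMetrics();                    // publish into this thread's slot
	}
};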
struct CloseAction : TypedAction<Writer, CloseAction> {
@ -684,9 +979,14 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Reference<Histogram> readValueGetHistogram;
Reference<Histogram> readPrefixGetHistogram;
std::shared_ptr<ReadIteratorPool> readIterPool;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
int threadIndex;
explicit Reader(DB& db, std::shared_ptr<ReadIteratorPool> readIterPool)
: db(db), readIterPool(readIterPool),
explicit Reader(DB& db,
std::shared_ptr<ReadIteratorPool> readIterPool,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
int threadIndex)
: db(db), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), threadIndex(threadIndex),
readRangeLatencyHistogram(Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP,
ROCKSDB_READRANGE_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
@ -734,6 +1034,11 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT;
readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT;
}
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
// Enable perf context on the same thread as the db thread
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
perfContextMetrics->reset();
}
}
void init() override {}
@ -752,6 +1057,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
};
void action(ReadValueAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double readBeginTime = timer_monotonic();
if (a.getHistograms) {
readValueQueueWaitHistogram->sampleSeconds(readBeginTime - a.startTime);
@ -801,6 +1112,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readValueActionHistogram->sampleSeconds(currTime - readBeginTime);
readValueLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
struct ReadValuePrefixAction : TypedAction<Reader, ReadValuePrefixAction> {
@ -818,6 +1132,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
};
void action(ReadValuePrefixAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double readBeginTime = timer_monotonic();
if (a.getHistograms) {
readPrefixQueueWaitHistogram->sampleSeconds(readBeginTime - a.startTime);
@ -871,6 +1191,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readPrefixActionHistogram->sampleSeconds(currTime - readBeginTime);
readPrefixLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
struct ReadRangeAction : TypedAction<Reader, ReadRangeAction>, FastAllocated<ReadRangeAction> {
@ -887,6 +1210,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
};
void action(ReadRangeAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double readBeginTime = timer_monotonic();
if (a.getHistograms) {
readRangeQueueWaitHistogram->sampleSeconds(readBeginTime - a.startTime);
@ -983,10 +1312,14 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readRangeActionHistogram->sampleSeconds(currTime - readBeginTime);
readRangeLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
};
DB db = nullptr;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
std::string path;
UID id;
Reference<IThreadPool> writeThread;
@ -1015,7 +1348,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Counters counters;
explicit RocksDBKeyValueStore(const std::string& path, UID id)
: path(path), id(id), readIterPool(new ReadIteratorPool(db, path)),
: path(path), id(id), perfContextMetrics(new PerfContextMetrics()), readIterPool(new ReadIteratorPool(db, path)),
readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
@ -1038,10 +1371,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
writeThread = createGenericThreadPool();
readThreads = createGenericThreadPool();
}
writeThread->addThread(new Writer(db, id, readIterPool), "fdb-rocksdb-wr");
writeThread->addThread(
new Writer(db, id, readIterPool, perfContextMetrics, SERVER_KNOBS->ROCKSDB_READ_PARALLELISM),
"fdb-rocksdb-wr");
TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
readThreads->addThread(new Reader(db, readIterPool), "fdb-rocksdb-re");
readThreads->addThread(new Reader(db, readIterPool, perfContextMetrics, i), "fdb-rocksdb-re");
}
}

View File

@ -19,6 +19,7 @@
*/
#define SQLITE_THREADSAFE 0 // also in sqlite3.amalgamation.c!
#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include "flow/crc32c.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/CoroFlow.h"
@ -2061,8 +2062,8 @@ private:
}
} catch (Error& e) {
TraceEvent(SevError, "KVDoCloseError", self->logID)
.errorUnsuppressed(e)
.detail("Filename", self->filename)
.error(e, true)
.detail("Reason", e.code() == error_code_platform_error ? "could not delete database" : "unknown");
error = e;
}
@ -2359,7 +2360,7 @@ ACTOR Future<Void> KVFileDump(std::string filename) {
k = keyAfter(kv[kv.size() - 1].key);
}
fflush(stdout);
fprintf(stderr, "Counted: %ld\n", count);
fmt::print(stderr, "Counted: {}\n", count);
if (store->getError().isError())
wait(store->getError());
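The fprintf("%ld") call above risks a format-specifier mismatch depending on the platform's width of the count variable; fmt::print deduces the argument type, so no specifier is needed. A tiny example assuming the bundled fmt 8.0.1 header included at the top of this file:

#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include <cstdint>

int main() {
	int64_t count = 12345;
	// fmt deduces the argument type; there is no %ld / %lld mismatch to get wrong.
	fmt::print(stderr, "Counted: {}\n", count);
}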

View File

@ -515,6 +515,8 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
state double startTime = now();
Version poppedVer = poppedVersion(self, reqTag);
if (poppedVer > reqBegin || reqBegin < self->startVersion) {
@ -535,8 +537,33 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
return Void();
}
Version endVersion = self->version.get() + 1;
peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion);
state Version endVersion;
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, in which
// case we may want to wait briefly instead of immediately sending back an empty message. This behavior is
// controlled by a knob.
loop {
endVersion = self->version.get() + 1;
peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion);
// Reply to the peek request when
// - there is data to return to the caller, or
// - batching of empty peeks is disabled, or
// - the empty-peek batching interval has been reached.
if (messages.getLength() > 0 || !SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG ||
now() - startTime > SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL) {
break;
}
state Version waitUntilVersion = self->version.get() + 1;
// Currently, everything from `reqBegin` to self->version is an empty peek. Wait for more versions, or until the
// empty-peek batching interval has expired.
wait(self->version.whenAtLeast(waitUntilVersion) ||
delay(SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL - (now() - startTime)));
if (self->version.get() < waitUntilVersion) {
break; // We know that everything from `reqBegin` to self->version is empty. Skip re-executing the peek
// logic.
}
}
TLogPeekReply reply;
reply.maxKnownVersion = self->version.get();
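The new loop above delays an empty reply: if nothing is available and empty-peek batching is enabled, it waits for either a newer version or the batching interval before answering, which reduces the volume of empty peek round trips. A standalone sketch of the same idea using plain C++ primitives instead of flow (hypothetical names; not the implementation above):

#include <chrono>
#include <condition_variable>
#include <mutex>
#include <string>

struct PeekSource {
	std::mutex m;
	std::condition_variable cv;
	long version = 0;
	std::string pending; // data that has arrived but not yet been peeked

	// Reply immediately if there is data; otherwise wait up to maxWait for a newer
	// version before returning a (possibly still empty) reply.
	std::string peek(std::chrono::milliseconds maxWait) {
		std::unique_lock<std::mutex> lk(m);
		if (pending.empty()) {
			long waitUntilVersion = version + 1;
			cv.wait_for(lk, maxWait, [&] { return version >= waitUntilVersion; });
		}
		std::string reply;
		reply.swap(pending); // empty if the interval expired with no new data
		return reply;
	}

	void append(const std::string& data) {
		{
			std::lock_guard<std::mutex> lk(m);
			pending += data;
			++version;
		}
		cv.notify_all();
	}
};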
@ -600,8 +627,8 @@ ACTOR Future<Void> logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -737,7 +764,7 @@ ACTOR Future<Void> logRouter(TLogInterface interf,
}
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed) {
TraceEvent("LogRouterTerminated", interf.id()).error(e, true);
TraceEvent("LogRouterTerminated", interf.id()).errorUnsuppressed(e);
return Void();
}
throw;

View File

@ -367,7 +367,7 @@ ACTOR Future<Void> serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T
}
}
} catch (Error& e) {
DisabledTraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).error(e, true);
DisabledTraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).errorUnsuppressed(e);
if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) {
// NOTE: delay in order to avoid the endless retry loop block other tasks
self->peekReplyStream.reset();

View File

@ -558,7 +558,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
.detail("Shards", shards)
.detail("MaxRetries", maxRetries);
} catch (Error& e) {
TraceEvent(SevDebug, interval.end(), relocationIntervalId).error(e, true);
TraceEvent(SevDebug, interval.end(), relocationIntervalId).errorUnsuppressed(e);
throw;
}
@ -992,7 +992,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
TraceEvent(SevDebug, interval.end(), relocationIntervalId);
} catch (Error& e) {
TraceEvent(SevDebug, interval.end(), relocationIntervalId).error(e, true);
TraceEvent(SevDebug, interval.end(), relocationIntervalId).errorUnsuppressed(e);
throw;
}
return Void();
@ -1151,7 +1151,7 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
tr->addReadConflictRange(conflictRange);
tr->addWriteConflictRange(conflictRange);
StorageMetadataType metadata(timer_int());
StorageMetadataType metadata(StorageMetadataType::currentTime());
metadataMap.set(tr, server.id(), metadata);
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
@ -1521,7 +1521,7 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, std::vector<Storag
tr.read_conflict_ranges.push_back_deep(arena, allKeys);
KeyBackedObjectMap<UID, StorageMetadataType, decltype(IncludeVersion())> metadataMap(serverMetadataKeys.begin,
IncludeVersion());
StorageMetadataType metadata(timer_int());
StorageMetadataType metadata(StorageMetadataType::currentTime());
for (auto& s : servers) {
tr.set(arena, serverTagKeyFor(s.id()), serverTagValue(server_tag[s.id()]));

View File

@ -1161,8 +1161,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -1646,7 +1646,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
wait(error);
throw internal_error();
} catch (Error& e) {
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
for (auto& it : self.id_data) {
if (it.second->recoverySuccessful.canBeSet()) {

View File

@ -1479,8 +1479,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -1912,7 +1912,7 @@ ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Refer
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogSnapCreateError").error(e, true /*includeCancelled */);
TraceEvent("TLogSnapCreateError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -2555,7 +2555,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found) {
TraceEvent("TLogTerminated", self->dbgid).error(e, true);
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
return false;
@ -2848,7 +2848,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
}
} catch (Error& e) {
self.terminated.send(Void());
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
if (recovered.canBeSet())
recovered.send(Void());

View File

@ -1908,8 +1908,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -2357,7 +2357,7 @@ ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Refer
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogExecHelperError").error(e, true /*includeCancelled */);
TraceEvent("TLogExecHelperError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -3038,7 +3038,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found) {
TraceEvent("TLogTerminated", self->dbgid).error(e, true);
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
return false;
@ -3336,7 +3336,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
}
} catch (Error& e) {
self.terminated.send(Void());
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
if (recovered.canBeSet())
recovered.send(Void());

View File

@ -113,7 +113,7 @@ struct ProxyStats {
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
commitLatencyBands("CommitLatencyMetrics", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
commitLatencyBands("CommitLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
commitBatchingEmptyMessageRatio("CommitBatchingEmptyMessageRatio",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,

View File

@ -158,8 +158,9 @@ ACTOR Future<std::vector<WorkerInterface>> getCoordWorkers(Database cx,
if (!coordinators.present()) {
throw operation_failed();
}
std::vector<NetworkAddress> coordinatorsAddr =
ClusterConnectionString(coordinators.get().toString()).coordinators();
state ClusterConnectionString ccs(coordinators.get().toString());
wait(ccs.resolveHostnames());
std::vector<NetworkAddress> coordinatorsAddr = ccs.coordinators();
std::set<NetworkAddress> coordinatorsAddrSet;
for (const auto& addr : coordinatorsAddr) {
TraceEvent(SevDebug, "CoordinatorAddress").detail("Addr", addr);
@ -731,7 +732,7 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
}
}
} catch (Error& e) {
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).error(e, true);
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).errorUnsuppressed(e);
if (e.code() != error_code_actor_cancelled && e.code() != error_code_attribute_not_found &&
e.code() != error_code_timed_out)
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).error(e);

File diff suppressed because it is too large

fdbserver/Ratekeeper.h (new file, 207 lines)
View File

@ -0,0 +1,207 @@
/*
* Ratekeeper.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbrpc/Smoother.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/TLogInterface.h"
enum limitReason_t {
unlimited, // TODO: rename to workload?
storage_server_write_queue_size, // 1
storage_server_write_bandwidth_mvcc,
storage_server_readable_behind,
log_server_mvcc_write_bandwidth,
log_server_write_queue, // 5
storage_server_min_free_space, // a storage server's normal limits are being reduced by low free space
storage_server_min_free_space_ratio, // a storage server's normal limits are being reduced by a low free space ratio
log_server_min_free_space,
log_server_min_free_space_ratio,
storage_server_durability_lag, // 10
storage_server_list_fetch_failed,
limitReason_t_end
};
struct StorageQueueInfo {
bool valid;
UID id;
LocalityData locality;
StorageQueuingMetricsReply lastReply;
StorageQueuingMetricsReply prevReply;
Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
Smoother smoothDurableVersion, smoothLatestVersion;
Smoother smoothFreeSpace;
Smoother smoothTotalSpace;
limitReason_t limitReason;
Optional<TransactionTag> busiestReadTag, busiestWriteTag;
double busiestReadTagFractionalBusyness = 0, busiestWriteTagFractionalBusyness = 0;
double busiestReadTagRate = 0, busiestWriteTagRate = 0;
Reference<EventCacheHolder> busiestWriteTagEventHolder;
// refresh periodically
TransactionTagMap<TransactionCommitCostEstimation> tagCostEst;
uint64_t totalWriteCosts = 0;
int totalWriteOps = 0;
StorageQueueInfo(UID id, LocalityData locality)
: valid(false), id(id), locality(locality), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
smoothDurableVersion(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothLatestVersion(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT),
limitReason(limitReason_t::unlimited),
busiestWriteTagEventHolder(makeReference<EventCacheHolder>(id.toString() + "/BusiestWriteTag")) {
// FIXME: this is a tacky workaround for a potential uninitialized use in trackStorageServerQueueInfo
lastReply.instanceID = -1;
}
};
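StorageQueueInfo feeds its byte and version counters through Smoother instances parameterized by SMOOTHING_AMOUNT / SLOW_SMOOTHING_AMOUNT, so Ratekeeper reacts to trends rather than instantaneous spikes. As a rough illustration only — this is an assumption about the kind of smoothing fdbrpc's Smoother provides, not its actual implementation — a time-constant exponential smoother looks like:

#include <cmath>

class SimpleSmoother {
	double timeConstant; // larger means smoother output that reacts more slowly
	double value = 0;
	double lastTime = 0;
public:
	explicit SimpleSmoother(double timeConstant) : timeConstant(timeConstant) {}
	void setTotal(double total, double now) {
		// Move the smoothed value toward the new total, weighted by elapsed time.
		double alpha = 1.0 - std::exp(-(now - lastTime) / timeConstant);
		value += alpha * (total - value);
		lastTime = now;
	}
	double smoothed() const { return value; }
};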
struct TLogQueueInfo {
bool valid;
UID id;
TLogQueuingMetricsReply lastReply;
TLogQueuingMetricsReply prevReply;
Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
Smoother smoothFreeSpace;
Smoother smoothTotalSpace;
TLogQueueInfo(UID id)
: valid(false), id(id), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT) {
// FIXME: this is a tacky workaround for a potential uninitialized use in trackTLogQueueInfo (copied from
// StorageQueueInfo)
lastReply.instanceID = -1;
}
};
struct RatekeeperLimits {
double tpsLimit;
Int64MetricHandle tpsLimitMetric;
Int64MetricHandle reasonMetric;
int64_t storageTargetBytes;
int64_t storageSpringBytes;
int64_t logTargetBytes;
int64_t logSpringBytes;
double maxVersionDifference;
int64_t durabilityLagTargetVersions;
int64_t lastDurabilityLag;
double durabilityLagLimit;
TransactionPriority priority;
std::string context;
Reference<EventCacheHolder> rkUpdateEventCacheHolder;
RatekeeperLimits(TransactionPriority priority,
std::string context,
int64_t storageTargetBytes,
int64_t storageSpringBytes,
int64_t logTargetBytes,
int64_t logSpringBytes,
double maxVersionDifference,
int64_t durabilityLagTargetVersions)
: tpsLimit(std::numeric_limits<double>::infinity()), tpsLimitMetric(StringRef("Ratekeeper.TPSLimit" + context)),
reasonMetric(StringRef("Ratekeeper.Reason" + context)), storageTargetBytes(storageTargetBytes),
storageSpringBytes(storageSpringBytes), logTargetBytes(logTargetBytes), logSpringBytes(logSpringBytes),
maxVersionDifference(maxVersionDifference),
durabilityLagTargetVersions(
durabilityLagTargetVersions +
SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS), // The read transaction life versions are expected to not
// be durable on the storage servers
lastDurabilityLag(0), durabilityLagLimit(std::numeric_limits<double>::infinity()), priority(priority),
context(context), rkUpdateEventCacheHolder(makeReference<EventCacheHolder>("RkUpdate" + context)) {}
};
class Ratekeeper {
friend class RatekeeperImpl;
// Differentiate from GrvProxyInfo in DatabaseContext.h
struct GrvProxyInfo {
int64_t totalTransactions;
int64_t batchTransactions;
uint64_t lastThrottledTagChangeId;
double lastUpdateTime;
double lastTagPushTime;
GrvProxyInfo()
: totalTransactions(0), batchTransactions(0), lastThrottledTagChangeId(0), lastUpdateTime(0),
lastTagPushTime(0) {}
};
UID id;
Database db;
Map<UID, StorageQueueInfo> storageQueueInfo;
Map<UID, TLogQueueInfo> tlogQueueInfo;
std::map<UID, Ratekeeper::GrvProxyInfo> grvProxyInfo;
Smoother smoothReleasedTransactions, smoothBatchReleasedTransactions, smoothTotalDurableBytes;
HealthMetrics healthMetrics;
DatabaseConfiguration configuration;
PromiseStream<Future<Void>> addActor;
Int64MetricHandle actualTpsMetric;
double lastWarning;
double lastSSListFetchedTimestamp;
std::unique_ptr<class TagThrottler> tagThrottler;
RatekeeperLimits normalLimits;
RatekeeperLimits batchLimits;
Deque<double> actualTpsHistory;
Optional<Key> remoteDC;
Future<Void> expiredTagThrottleCleanup;
double lastBusiestCommitTagPick;
Ratekeeper(UID id, Database db);
Future<Void> configurationMonitor();
void updateCommitCostEstimation(UIDTransactionTagMap<TransactionCommitCostEstimation> const& costEstimation);
void updateRate(RatekeeperLimits* limits);
Future<Void> refreshStorageServerCommitCost();
Future<Void> monitorServerListChange(PromiseStream<std::pair<UID, Optional<StorageServerInterface>>> serverChanges);
Future<Void> trackEachStorageServer(FutureStream<std::pair<UID, Optional<StorageServerInterface>>> serverChanges);
// SOMEDAY: template trackStorageServerQueueInfo and trackTLogQueueInfo into one function
Future<Void> trackStorageServerQueueInfo(StorageServerInterface);
Future<Void> trackTLogQueueInfo(TLogInterface);
void tryAutoThrottleTag(TransactionTag, double rate, double busyness, TagThrottledReason);
void tryAutoThrottleTag(StorageQueueInfo&, int64_t storageQueue, int64_t storageDurabilityLag);
Future<Void> monitorThrottlingChanges();
public:
static Future<Void> run(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo> const> dbInfo);
};
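The new Ratekeeper class keeps all of its state private, grants access to the actor bodies through the friend class RatekeeperImpl declaration, and exposes a single static run() entry point. A compilable sketch of that shape with ordinary functions standing in for actors (hypothetical names):

#include <iostream>

class Service {
	friend class ServiceImpl; // actor-style bodies live in the Impl and may touch private state
	int ticks = 0;
	Service() = default;      // construction is controlled by run()
public:
	static void run();
};

class ServiceImpl {
public:
	static void mainLoop(Service& self) {
		for (int i = 0; i < 3; ++i)
			++self.ticks;     // allowed: ServiceImpl is a friend of Service
		std::cout << "ticks=" << self.ticks << "\n";
	}
};

void Service::run() {
	Service self;
	ServiceImpl::mainLoop(self);
}

int main() {
	Service::run();
}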

View File

@ -373,7 +373,7 @@ ACTOR Future<Void> resolver(ResolverInterface resolver,
}
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed) {
TraceEvent("ResolverTerminated", resolver.id()).error(e, true);
TraceEvent("ResolverTerminated", resolver.id()).errorUnsuppressed(e);
return Void();
}
throw;

View File

@ -98,8 +98,8 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
} catch (Error& e) {
bool isError = e.code() != error_code_operation_cancelled;
TraceEvent(isError ? SevError : SevWarnAlways, "FastRestoreApplierError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
.errorUnsuppressed(e)
.detail("RequestType", requestTypeStr);
actors.clear(false);
break;
}
@ -251,9 +251,9 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
retries++;
if (retries > SERVER_KNOBS->FASTRESTORE_TXN_RETRY_MAX) {
TraceEvent(SevWarnAlways, "RestoreApplierApplyClearRangeMutationsStuck", applierID)
.error(e)
.detail("BatchIndex", batchIndex)
.detail("ClearRanges", ranges.size())
.error(e);
.detail("ClearRanges", ranges.size());
}
wait(tr->onError(e));
}
@ -314,11 +314,13 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
} catch (Error& e) {
cc->fetchTxnRetries += 1;
if (retries++ > incompleteStagingKeys.size()) {
TraceEvent(SevWarnAlways, "GetAndComputeStagingKeys", applierID)
.suppressFor(1.0)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.error(e);
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevWarnAlways, "GetAndComputeStagingKeys", applierID)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex);
}
}
wait(tr->onError(e));
}

View File

@ -136,7 +136,7 @@ ACTOR Future<Void> startRestoreController(Reference<RestoreWorkerData> controlle
wait(startProcessRestoreRequests(self, cx) || error);
} catch (Error& e) {
if (e.code() != error_code_operation_cancelled) {
TraceEvent(SevError, "FastRestoreControllerStart").detail("Reason", "Unexpected unhandled error").error(e);
TraceEvent(SevError, "FastRestoreControllerStart").error(e).detail("Reason", "Unexpected unhandled error");
}
}

View File

@ -224,7 +224,7 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "FastRestoreLoaderDispatchRequests").error(e, true);
TraceEvent(SevError, "FastRestoreLoaderDispatchRequests").errorUnsuppressed(e);
throw e;
}
}
@ -301,8 +301,8 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf,
} catch (Error& e) {
bool isError = e.code() != error_code_operation_cancelled; // == error_code_broken_promise
TraceEvent(isError ? SevError : SevWarnAlways, "FastRestoreLoaderError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
.errorUnsuppressed(e)
.detail("RequestType", requestTypeStr);
actors.clear(false);
break;
}
@ -513,8 +513,8 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedPartitionedLogFileConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParsePartitionedLogFileOnLoaderUnexpectedError").error(e);
@ -659,10 +659,10 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
} catch (Error& e) { // In case ci.samples throws broken_promise due to unstable network
if (e.code() == error_code_broken_promise || e.code() == error_code_operation_cancelled) {
TraceEvent(SevWarnAlways, "FastRestoreLoaderPhaseLoadFileSendSamples")
.detail("SamplesMessages", samplesMessages)
.error(e, true);
.errorUnsuppressed(e)
.detail("SamplesMessages", samplesMessages);
} else {
TraceEvent(SevError, "FastRestoreLoaderPhaseLoadFileSendSamplesUnexpectedError").error(e, true);
TraceEvent(SevError, "FastRestoreLoaderPhaseLoadFileSendSamplesUnexpectedError").errorUnsuppressed(e);
}
}
@ -1230,8 +1230,8 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedRangeFileConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParseRangeFileOnLoaderUnexpectedError").error(e);
@ -1355,8 +1355,8 @@ ACTOR static Future<Void> parseLogFileToMutationsOnLoader(NotifiedVersion* pProc
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedLogFileConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParseLogFileToMutationsOnLoaderUnexpectedError").error(e);

View File

@ -264,7 +264,7 @@ ACTOR Future<Void> startRestoreWorker(Reference<RestoreWorkerData> self, Restore
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestoreWorkerError").detail("RequestType", requestTypeStr).error(e, true);
TraceEvent(SevWarn, "FastRestoreWorkerError").errorUnsuppressed(e).detail("RequestType", requestTypeStr);
break;
}
}

View File

@ -601,7 +601,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
? SevInfo
: SevError,
"SimulatedFDBDTerminated")
.error(e, true)
.errorUnsuppressed(e)
.detail("ZoneId", localities.zoneId());
}
@ -617,7 +617,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
onShutdown = ISimulator::InjectFaults;
} catch (Error& e) {
TraceEvent(destructed ? SevInfo : SevError, "SimulatedFDBDRebooterError")
.error(e, true)
.errorUnsuppressed(e)
.detail("ZoneId", localities.zoneId())
.detail("RandomId", randomId);
onShutdown = e;
@ -1905,8 +1905,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
TEST(useIPv6); // Use IPv6
TEST(!useIPv6); // Use IPv4
// TODO(renxuan): Use hostname 25% of the time, unless it is disabled
bool useHostname = false; // !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
// Use hostname 25% of the time, unless it is disabled
bool useHostname = !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
TEST(useHostname); // Use hostname
TEST(!useHostname); // Use IP address
NetworkAddressFromHostname fromHostname =

View File

@ -1038,7 +1038,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
if (ssLag[address] >= 60) {
messages.push_back(JsonString::makeMessage(
"storage_server_lagging",
format("Storage server lagging by %ld seconds.", (int64_t)ssLag[address]).c_str()));
format("Storage server lagging by %lld seconds.", (int64_t)ssLag[address]).c_str()));
}
// Store the message array into the status object that represents the worker process

View File

@ -1375,7 +1375,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
break;
} catch (Error& e) {
TraceEvent("SCFKBlockFail", data->thisServerID)
.error(e, true)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("FKID", interval.pairID);
if (e.code() == error_code_transaction_too_old) {
@ -1507,7 +1507,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
// TraceEvent(SevDebug, interval.end(), data->thisServerID);
} catch (Error& e) {
// TraceEvent(SevDebug, interval.end(), data->thisServerID).error(e, true).detail("Version", data->version.get());
// TraceEvent(SevDebug, interval.end(), data->thisServerID).errorUnsuppressed(e).detail("Version", data->version.get());
// TODO define the shuttingDown state of cache server
if (e.code() == error_code_actor_cancelled &&

View File

@ -137,6 +137,23 @@ TCServerInfo::TCServerInfo(StorageServerInterface ssi,
}
}
bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const {
ASSERT(serverMetricsPresent());
auto& metrics = getServerMetrics();
ASSERT(metrics.available.bytes >= 0);
ASSERT(metrics.capacity.bytes >= 0);
double availableSpaceRatio;
if (metrics.capacity.bytes == 0) {
availableSpaceRatio = 0;
} else {
availableSpaceRatio = (((double)metrics.available.bytes) / metrics.capacity.bytes);
}
return availableSpaceRatio >= minAvailableSpaceRatio;
}
Future<Void> TCServerInfo::updateServerMetrics() {
return TCServerInfoImpl::updateServerMetrics(this);
}
@ -396,8 +413,23 @@ double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const {
return minRatio;
}
bool TCTeamInfo::allServersHaveHealthyAvailableSpace() const {
bool result = true;
double minAvailableSpaceRatio =
SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO + SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER;
for (const auto& server : servers) {
if (!server->serverMetricsPresent() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) {
result = false;
break;
}
}
return result;
}
bool TCTeamInfo::hasHealthyAvailableSpace(double minRatio) const {
return getMinAvailableSpaceRatio() >= minRatio && getMinAvailableSpace() > SERVER_KNOBS->MIN_AVAILABLE_SPACE;
return getMinAvailableSpaceRatio() >= minRatio && getMinAvailableSpace() > SERVER_KNOBS->MIN_AVAILABLE_SPACE &&
allServersHaveHealthyAvailableSpace();
}
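// Informal summary of the checks above: a server is space-healthy when available/capacity is at least
// minAvailableSpaceRatio (zero capacity counts as a ratio of 0), and a team is space-healthy only when every
// member has metrics present and passes that per-server check with
// minAvailableSpaceRatio = MIN_AVAILABLE_SPACE_RATIO + MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER.
// For example, with a combined ratio of 0.10 (illustrative, not the actual knob defaults), a 1 TB server must
// report at least 100 GB available to count as healthy.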
bool TCTeamInfo::isOptimal() const {

View File

@ -93,6 +93,8 @@ public:
return (storeType == configStoreType || storeType == KeyValueStoreType::END);
}
bool hasHealthyAvailableSpace(double minAvailableSpaceRatio) const;
Future<Void> updateServerMetrics();
static Future<Void> updateServerMetrics(Reference<TCServerInfo> server);
Future<Void> serverMetricsPolling();
@ -220,4 +222,6 @@ private:
// Calculate an "average" of the metrics replies that we received. Penalize teams from which we did not receive all
// replies.
int64_t getLoadAverage() const;
bool allServersHaveHealthyAvailableSpace() const;
};

View File

@ -1740,140 +1740,168 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
return Void();
}
state Version endVersion = logData->version.get() + 1;
state bool onlySpilled = false;
state Version endVersion;
state bool onlySpilled;
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We
// may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if
// an initial attempt to read from disk results in insufficient data and the required data is no longer in
// memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the
// result?
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may
// want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob.
loop {
endVersion = logData->version.get() + 1;
onlySpilled = false;
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
}
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from
// memory. We may or may not actually send it depending on whether we get enough data from disk. SOMEDAY:
// Only do this if an initial attempt to read from disk results in insufficient data and the required data
// is no longer in memory SOMEDAY: Should we only send part of the messages we collected, to actually limit
// the size of the result?
if (logData->shouldSpillByValue(reqTag)) {
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin),
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
for (auto& kv : kvs) {
auto ver = decodeTagMessagesKey(kv.key);
messages << VERSION_HEADER << ver;
messages.serializeBytes(kv.value);
}
if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1;
onlySpilled = true;
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
messages.serializeBytes(messages2.toValue());
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
}
} else {
// FIXME: Limit to approximately DESIRED_TOTAL_BYTES somehow.
RangeResult kvrefs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessageRefsKey(logData->logId, reqTag, reqBegin),
persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
if (logData->shouldSpillByValue(reqTag)) {
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessagesKey(logData->logId, reqTag, reqBegin),
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
state std::vector<std::pair<IDiskQueue::location, IDiskQueue::location>> commitLocations;
state bool earlyEnd = false;
uint32_t mutationBytes = 0;
state uint64_t commitBytes = 0;
state Version firstVersion = std::numeric_limits<Version>::max();
for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) {
auto& kv = kvrefs[i];
VectorRef<SpilledData> spilledData;
BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion));
r >> spilledData;
for (const SpilledData& sd : spilledData) {
if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
earlyEnd = true;
for (auto& kv : kvs) {
auto ver = decodeTagMessagesKey(kv.key);
messages << VERSION_HEADER << ver;
messages.serializeBytes(kv.value);
}
if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1;
onlySpilled = true;
} else {
messages.serializeBytes(messages2.toValue());
}
} else {
// FIXME: Limit to approximately DESIRED_TOTAL_BYTES somehow.
RangeResult kvrefs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessageRefsKey(logData->logId, reqTag, reqBegin),
persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
state std::vector<std::pair<IDiskQueue::location, IDiskQueue::location>> commitLocations;
state bool earlyEnd = false;
uint32_t mutationBytes = 0;
state uint64_t commitBytes = 0;
state Version firstVersion = std::numeric_limits<Version>::max();
for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) {
auto& kv = kvrefs[i];
VectorRef<SpilledData> spilledData;
BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion));
r >> spilledData;
for (const SpilledData& sd : spilledData) {
if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
earlyEnd = true;
break;
}
if (sd.version >= reqBegin) {
firstVersion = std::min(firstVersion, sd.version);
const IDiskQueue::location end = sd.start.lo + sd.length;
commitLocations.emplace_back(sd.start, end);
// This isn't perfect, because we aren't accounting for page boundaries, but should be
// close enough.
commitBytes += sd.length;
mutationBytes += sd.mutationBytes;
}
}
if (earlyEnd)
break;
}
if (sd.version >= reqBegin) {
firstVersion = std::min(firstVersion, sd.version);
const IDiskQueue::location end = sd.start.lo + sd.length;
commitLocations.emplace_back(sd.start, end);
// This isn't perfect, because we aren't accounting for page boundaries, but should be
// close enough.
commitBytes += sd.length;
mutationBytes += sd.mutationBytes;
}
}
if (earlyEnd)
break;
}
earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1);
wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes));
state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes);
state std::vector<Future<Standalone<StringRef>>> messageReads;
messageReads.reserve(commitLocations.size());
for (const auto& pair : commitLocations) {
messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True));
}
commitLocations.clear();
wait(waitForAll(messageReads));
earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1);
wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes));
state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes);
state std::vector<Future<Standalone<StringRef>>> messageReads;
messageReads.reserve(commitLocations.size());
for (const auto& pair : commitLocations) {
messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True));
}
commitLocations.clear();
wait(waitForAll(messageReads));
state Version lastRefMessageVersion = 0;
state int index = 0;
loop {
if (index >= messageReads.size())
break;
Standalone<StringRef> queueEntryData = messageReads[index].get();
uint8_t valid;
const uint32_t length = *(uint32_t*)queueEntryData.begin();
queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4);
BinaryReader rd(queueEntryData, IncludeVersion());
state TLogQueueEntry entry;
rd >> entry >> valid;
ASSERT(valid == 0x01);
ASSERT(length + sizeof(valid) == queueEntryData.size());
state Version lastRefMessageVersion = 0;
state int index = 0;
loop {
if (index >= messageReads.size())
break;
Standalone<StringRef> queueEntryData = messageReads[index].get();
uint8_t valid;
const uint32_t length = *(uint32_t*)queueEntryData.begin();
queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4);
BinaryReader rd(queueEntryData, IncludeVersion());
state TLogQueueEntry entry;
rd >> entry >> valid;
ASSERT(valid == 0x01);
ASSERT(length + sizeof(valid) == queueEntryData.size());
messages << VERSION_HEADER << entry.version;
messages << VERSION_HEADER << entry.version;
std::vector<StringRef> rawMessages =
wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags));
for (const StringRef& msg : rawMessages) {
messages.serializeBytes(msg);
DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg, logData->logId)
.detail("DebugID", self->dbgid)
.detail("PeekTag", reqTag);
std::vector<StringRef> rawMessages =
wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags));
for (const StringRef& msg : rawMessages) {
messages.serializeBytes(msg);
DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg, logData->logId)
.detail("DebugID", self->dbgid)
.detail("PeekTag", reqTag);
}
lastRefMessageVersion = entry.version;
index++;
}
lastRefMessageVersion = entry.version;
index++;
}
messageReads.clear();
memoryReservation.release();
messageReads.clear();
memoryReservation.release();
if (earlyEnd) {
endVersion = lastRefMessageVersion + 1;
onlySpilled = true;
} else {
messages.serializeBytes(messages2.toValue());
if (earlyEnd) {
endVersion = lastRefMessageVersion + 1;
onlySpilled = true;
} else {
messages.serializeBytes(messages2.toValue());
}
}
}
} else {
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
}
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
}
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
// Reply to the peek request when:
// - there is data to return to the caller, or
// - batching of empty peeks is disabled, or
// - the empty-peek batching interval has been reached.
if (messages.getLength() > 0 || !SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG ||
(now() - blockStart > SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL)) {
break;
}
state Version waitUntilVersion = logData->version.get() + 1;
// Currently, everything from `reqBegin` to logData->version is an empty peek. Wait for more versions, or until
// the empty-peek batching interval has expired.
wait(logData->version.whenAtLeast(waitUntilVersion) ||
delay(SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL - (now() - blockStart)));
if (logData->version.get() < waitUntilVersion) {
break; // We know that everything from `reqBegin` to logData->version is empty. Skip re-executing the
// peek logic.
}
}
TLogPeekReply reply;
@ -1969,8 +1997,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -2441,7 +2469,7 @@ ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Refer
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogExecHelperError").error(e, true /*includeCancelled */);
TraceEvent("TLogExecHelperError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -3158,7 +3186,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found || e.code() == error_code_invalid_cluster_id) {
TraceEvent("TLogTerminated", self->dbgid).error(e, true);
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
return false;
@ -3509,7 +3537,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
}
} catch (Error& e) {
self.terminated.send(Void());
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
if (recovered.canBeSet())
recovered.send(Void());
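As a rough aside, the reply-or-wait decision that the new empty-peek batching loop above makes on each pass can be sketched in isolation. The knob names mirror the ones referenced above, but the constant values and the free-standing function are illustrative assumptions, not part of this change.
#include <cstddef>
// Illustrative stand-ins for the knobs referenced in tLogPeekMessages.
constexpr bool PEEK_BATCHING_EMPTY_MSG = true;           // assumed enabled for the sketch
constexpr double PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.1; // seconds, illustrative value
// Returns true when the peek reply should be sent immediately; otherwise the real actor waits for either a
// newer version or the remainder of the batching interval and then re-runs the peek logic.
bool shouldReplyNow(size_t messageBytes, double now, double blockStart) {
    return messageBytes > 0                                          // data to return to the caller
           || !PEEK_BATCHING_EMPTY_MSG                               // batching of empty peeks is disabled
           || (now - blockStart) > PEEK_BATCHING_EMPTY_MSG_INTERVAL; // batching window has elapsed
}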

View File

@ -0,0 +1,598 @@
/*
* TagThrottler.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/TagThrottler.h"
class RkTagThrottleCollection : NonCopyable {
struct RkTagData {
Smoother requestRate;
RkTagData() : requestRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {}
};
struct RkTagThrottleData {
ClientTagThrottleLimits limits;
Smoother clientRate;
// Only used by auto-throttles
double created = now();
double lastUpdated = 0;
double lastReduced = now();
bool rateSet = false;
RkTagThrottleData() : clientRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {}
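// Intuition (informal): requestRate is the observed transaction rate for this tag, and clientRate tracks the
// limit most recently advertised to clients. Scaling limits.tpsRate by clientRate / requestRate lowers the
// advertised limit whenever the observed rate overshoots the target, nudging requestRate toward
// limits.tpsRate over time; the min() keeps the result from exceeding the configured tpsRate.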
double getTargetRate(Optional<double> requestRate) {
if (limits.tpsRate == 0.0 || !requestRate.present() || requestRate.get() == 0.0 || !rateSet) {
return limits.tpsRate;
} else {
return std::min(limits.tpsRate, (limits.tpsRate / requestRate.get()) * clientRate.smoothTotal());
}
}
Optional<double> updateAndGetClientRate(Optional<double> requestRate) {
if (limits.expiration > now()) {
double targetRate = getTargetRate(requestRate);
if (targetRate == std::numeric_limits<double>::max()) {
rateSet = false;
return targetRate;
}
if (!rateSet) {
rateSet = true;
clientRate.reset(targetRate);
} else {
clientRate.setTotal(targetRate);
}
double rate = clientRate.smoothTotal();
ASSERT(rate >= 0);
return rate;
} else {
TEST(true); // Get throttle rate for expired throttle
rateSet = false;
return Optional<double>();
}
}
};
void initializeTag(TransactionTag const& tag) { tagData.try_emplace(tag); }
public:
RkTagThrottleCollection() {}
RkTagThrottleCollection(RkTagThrottleCollection&& other) {
autoThrottledTags = std::move(other.autoThrottledTags);
manualThrottledTags = std::move(other.manualThrottledTags);
tagData = std::move(other.tagData);
}
void operator=(RkTagThrottleCollection&& other) {
autoThrottledTags = std::move(other.autoThrottledTags);
manualThrottledTags = std::move(other.manualThrottledTags);
tagData = std::move(other.tagData);
}
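// Derivation sketch (informal): treat currentBusyness b as the fraction of the storage server's load
// attributable to this tag at the current request rate r, and assume the non-tag load stays fixed. The
// tag-to-other ratio is b / (1 - b) at rate r, and we want it to become b* / (1 - b*) at the new rate r',
// where b* is targetBusyness. Solving r' / r = [b* / (1 - b*)] / [b / (1 - b)] gives
//     r' = r * b* * (1 - b) / ((1 - b*) * b),
// which is requestRate * targetFraction below. A targetBusyness of 1 (or more) means "no limit", hence the
// max() sentinel returned in that branch.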
double computeTargetTpsRate(double currentBusyness, double targetBusyness, double requestRate) {
ASSERT(currentBusyness > 0);
if (targetBusyness < 1) {
double targetFraction = targetBusyness * (1 - currentBusyness) / ((1 - targetBusyness) * currentBusyness);
return requestRate * targetFraction;
} else {
return std::numeric_limits<double>::max();
}
}
// Returns the TPS rate if the throttle is updated, otherwise returns an empty optional
Optional<double> autoThrottleTag(UID id,
TransactionTag const& tag,
double fractionalBusyness,
Optional<double> tpsRate = Optional<double>(),
Optional<double> expiration = Optional<double>()) {
ASSERT(!tpsRate.present() || tpsRate.get() >= 0);
ASSERT(!expiration.present() || expiration.get() > now());
auto itr = autoThrottledTags.find(tag);
bool present = (itr != autoThrottledTags.end());
if (!present) {
if (autoThrottledTags.size() >= SERVER_KNOBS->MAX_AUTO_THROTTLED_TRANSACTION_TAGS) {
TEST(true); // Reached auto-throttle limit
return Optional<double>();
}
itr = autoThrottledTags.try_emplace(tag).first;
initializeTag(tag);
} else if (itr->second.limits.expiration <= now()) {
TEST(true); // Re-throttling expired tag that hasn't been cleaned up
present = false;
itr->second = RkTagThrottleData();
}
auto& throttle = itr->second;
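// Informal summary of the rate selection below, used when no explicit tpsRate is supplied:
// - impose no limit during the first AUTO_TAG_THROTTLE_START_AGGREGATION_TIME after the throttle is created,
// - recompute at most once per AUTO_TAG_THROTTLE_UPDATE_FREQUENCY, and
// - never raise the rate of a still-active throttle, only lower it.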
if (!tpsRate.present()) {
if (now() <= throttle.created + SERVER_KNOBS->AUTO_TAG_THROTTLE_START_AGGREGATION_TIME) {
tpsRate = std::numeric_limits<double>::max();
if (present) {
return Optional<double>();
}
} else if (now() <= throttle.lastUpdated + SERVER_KNOBS->AUTO_TAG_THROTTLE_UPDATE_FREQUENCY) {
TEST(true); // Tag auto-throttled too quickly
return Optional<double>();
} else {
tpsRate = computeTargetTpsRate(fractionalBusyness,
SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS,
tagData[tag].requestRate.smoothRate());
if (throttle.limits.expiration > now() && tpsRate.get() >= throttle.limits.tpsRate) {
TEST(true); // Tag auto-throttle rate increase attempt while active
return Optional<double>();
}
throttle.lastUpdated = now();
if (tpsRate.get() < throttle.limits.tpsRate) {
throttle.lastReduced = now();
}
}
}
if (!expiration.present()) {
expiration = now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION;
}
ASSERT(tpsRate.present() && tpsRate.get() >= 0);
throttle.limits.tpsRate = tpsRate.get();
throttle.limits.expiration = expiration.get();
Optional<double> clientRate = throttle.updateAndGetClientRate(getRequestRate(tag));
TraceEvent("RkSetAutoThrottle", id)
.detail("Tag", tag)
.detail("TargetRate", tpsRate.get())
.detail("Expiration", expiration.get() - now())
.detail("ClientRate", clientRate)
.detail("Created", now() - throttle.created)
.detail("LastUpdate", now() - throttle.lastUpdated)
.detail("LastReduced", now() - throttle.lastReduced);
if (tpsRate.get() != std::numeric_limits<double>::max()) {
return tpsRate.get();
} else {
return Optional<double>();
}
}
void manualThrottleTag(UID id,
TransactionTag const& tag,
TransactionPriority priority,
double tpsRate,
double expiration,
Optional<ClientTagThrottleLimits> const& oldLimits) {
ASSERT(tpsRate >= 0);
ASSERT(expiration > now());
auto& priorityThrottleMap = manualThrottledTags[tag];
auto result = priorityThrottleMap.try_emplace(priority);
initializeTag(tag);
ASSERT(result.second); // Updates to the map are made by copying the whole map
result.first->second.limits.tpsRate = tpsRate;
result.first->second.limits.expiration = expiration;
if (!oldLimits.present()) {
TEST(true); // Transaction tag manually throttled
TraceEvent("RatekeeperAddingManualThrottle", id)
.detail("Tag", tag)
.detail("Rate", tpsRate)
.detail("Priority", transactionPriorityToString(priority))
.detail("SecondsToExpiration", expiration - now());
} else if (oldLimits.get().tpsRate != tpsRate || oldLimits.get().expiration != expiration) {
TEST(true); // Manual transaction tag throttle updated
TraceEvent("RatekeeperUpdatingManualThrottle", id)
.detail("Tag", tag)
.detail("Rate", tpsRate)
.detail("Priority", transactionPriorityToString(priority))
.detail("SecondsToExpiration", expiration - now());
}
Optional<double> clientRate = result.first->second.updateAndGetClientRate(getRequestRate(tag));
ASSERT(clientRate.present());
}
Optional<ClientTagThrottleLimits> getManualTagThrottleLimits(TransactionTag const& tag,
TransactionPriority priority) {
auto itr = manualThrottledTags.find(tag);
if (itr != manualThrottledTags.end()) {
auto priorityItr = itr->second.find(priority);
if (priorityItr != itr->second.end()) {
return priorityItr->second.limits;
}
}
return Optional<ClientTagThrottleLimits>();
}
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates(bool autoThrottlingEnabled) {
PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientRates;
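// Informal summary of the loop below: for each tracked tag, apply any unexpired manual limits per priority
// (the running minimum is carried from higher priorities down to lower ones); then, if auto-throttling is
// enabled, apply the auto limit at DEFAULT priority, keeping whichever rate is lower, and report a zero rate
// at BATCH priority. Tags left with no live throttles are dropped from tagData.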
for (auto tagItr = tagData.begin(); tagItr != tagData.end();) {
bool tagPresent = false;
double requestRate = tagItr->second.requestRate.smoothRate();
auto manualItr = manualThrottledTags.find(tagItr->first);
if (manualItr != manualThrottledTags.end()) {
Optional<ClientTagThrottleLimits> manualClientRate;
for (auto priority = allTransactionPriorities.rbegin(); !(priority == allTransactionPriorities.rend());
++priority) {
auto priorityItr = manualItr->second.find(*priority);
if (priorityItr != manualItr->second.end()) {
Optional<double> priorityClientRate = priorityItr->second.updateAndGetClientRate(requestRate);
if (!priorityClientRate.present()) {
TEST(true); // Manual priority throttle expired
priorityItr = manualItr->second.erase(priorityItr);
} else {
if (!manualClientRate.present() ||
manualClientRate.get().tpsRate > priorityClientRate.get()) {
manualClientRate = ClientTagThrottleLimits(priorityClientRate.get(),
priorityItr->second.limits.expiration);
} else {
TEST(true); // Manual throttle overridden by higher priority
}
++priorityItr;
}
}
if (manualClientRate.present()) {
tagPresent = true;
TEST(true); // Using manual throttle
clientRates[*priority][tagItr->first] = manualClientRate.get();
}
}
if (manualItr->second.empty()) {
TEST(true); // All manual throttles expired
manualThrottledTags.erase(manualItr);
break;
}
}
auto autoItr = autoThrottledTags.find(tagItr->first);
if (autoItr != autoThrottledTags.end()) {
Optional<double> autoClientRate = autoItr->second.updateAndGetClientRate(requestRate);
if (autoClientRate.present()) {
double adjustedRate = autoClientRate.get();
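// Ramp-up (informal): starting AUTO_TAG_THROTTLE_RAMP_UP_TIME before the throttle would expire (measured
// from the last rate reduction), the effective target busyness is raised from
// AUTO_THROTTLE_TARGET_TAG_BUSYNESS toward 1 via pow(targetBusyness, 1 - rampLocation), so the allowed rate
// relaxes gradually toward "unlimited" instead of jumping when the throttle expires.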
double rampStartTime = autoItr->second.lastReduced + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION -
SERVER_KNOBS->AUTO_TAG_THROTTLE_RAMP_UP_TIME;
if (now() >= rampStartTime && adjustedRate != std::numeric_limits<double>::max()) {
TEST(true); // Tag auto-throttle ramping up
double targetBusyness = SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS;
if (targetBusyness == 0) {
targetBusyness = 0.01;
}
double rampLocation = (now() - rampStartTime) / SERVER_KNOBS->AUTO_TAG_THROTTLE_RAMP_UP_TIME;
adjustedRate =
computeTargetTpsRate(targetBusyness, pow(targetBusyness, 1 - rampLocation), adjustedRate);
}
tagPresent = true;
if (autoThrottlingEnabled) {
auto result = clientRates[TransactionPriority::DEFAULT].try_emplace(
tagItr->first, adjustedRate, autoItr->second.limits.expiration);
if (!result.second && result.first->second.tpsRate > adjustedRate) {
result.first->second =
ClientTagThrottleLimits(adjustedRate, autoItr->second.limits.expiration);
} else {
TEST(true); // Auto throttle overridden by manual throttle
}
clientRates[TransactionPriority::BATCH][tagItr->first] =
ClientTagThrottleLimits(0, autoItr->second.limits.expiration);
}
} else {
ASSERT(autoItr->second.limits.expiration <= now());
TEST(true); // Auto throttle expired
if (BUGGIFY) { // Temporarily extend the window between expiration and cleanup
tagPresent = true;
} else {
autoThrottledTags.erase(autoItr);
}
}
}
if (!tagPresent) {
TEST(true); // All tag throttles expired
tagItr = tagData.erase(tagItr);
} else {
++tagItr;
}
}
return clientRates;
}
void addRequests(TransactionTag const& tag, int requests) {
if (requests > 0) {
TEST(true); // Requests reported for throttled tag
auto tagItr = tagData.try_emplace(tag);
tagItr.first->second.requestRate.addDelta(requests);
double requestRate = tagItr.first->second.requestRate.smoothRate();
auto autoItr = autoThrottledTags.find(tag);
if (autoItr != autoThrottledTags.end()) {
autoItr->second.updateAndGetClientRate(requestRate);
}
auto manualItr = manualThrottledTags.find(tag);
if (manualItr != manualThrottledTags.end()) {
for (auto priorityItr = manualItr->second.begin(); priorityItr != manualItr->second.end();
++priorityItr) {
priorityItr->second.updateAndGetClientRate(requestRate);
}
}
}
}
Optional<double> getRequestRate(TransactionTag const& tag) {
auto itr = tagData.find(tag);
if (itr != tagData.end()) {
return itr->second.requestRate.smoothRate();
}
return Optional<double>();
}
int64_t autoThrottleCount() const { return autoThrottledTags.size(); }
int64_t manualThrottleCount() const {
int64_t count = 0;
for (auto itr = manualThrottledTags.begin(); itr != manualThrottledTags.end(); ++itr) {
count += itr->second.size();
}
return count;
}
TransactionTagMap<RkTagThrottleData> autoThrottledTags;
TransactionTagMap<std::map<TransactionPriority, RkTagThrottleData>> manualThrottledTags;
TransactionTagMap<RkTagData> tagData;
uint32_t busyReadTagCount = 0, busyWriteTagCount = 0;
};
class TagThrottlerImpl {
Database db;
UID id;
RkTagThrottleCollection throttledTags;
uint64_t throttledTagChangeId{ 0 };
bool autoThrottlingEnabled{ false };
ACTOR static Future<Void> monitorThrottlingChanges(TagThrottlerImpl* self) {
state bool committed = false;
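// Informal note: the first successful commit publishes the manual-throttle limit key and, if unset, the
// auto-throttling default; once `committed` is true, later iterations only re-read the throttle keys
// (normalizing expirations where needed) and block on the tagThrottleSignalKey watch.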
loop {
state ReadYourWritesTransaction tr(self->db);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<RangeResult> throttledTagKeys = tr.getRange(tagThrottleKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> autoThrottlingEnabled = tr.get(tagThrottleAutoEnabledKey);
if (!committed) {
BinaryWriter limitWriter(Unversioned());
limitWriter << SERVER_KNOBS->MAX_MANUAL_THROTTLED_TRANSACTION_TAGS;
tr.set(tagThrottleLimitKey, limitWriter.toValue());
}
wait(success(throttledTagKeys) && success(autoThrottlingEnabled));
if (autoThrottlingEnabled.get().present() &&
autoThrottlingEnabled.get().get() == LiteralStringRef("0")) {
TEST(true); // Auto-throttling disabled
if (self->autoThrottlingEnabled) {
TraceEvent("AutoTagThrottlingDisabled", self->id).log();
}
self->autoThrottlingEnabled = false;
} else if (autoThrottlingEnabled.get().present() &&
autoThrottlingEnabled.get().get() == LiteralStringRef("1")) {
TEST(true); // Auto-throttling enabled
if (!self->autoThrottlingEnabled) {
TraceEvent("AutoTagThrottlingEnabled", self->id).log();
}
self->autoThrottlingEnabled = true;
} else {
TEST(true); // Auto-throttling unspecified
if (autoThrottlingEnabled.get().present()) {
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", self->id)
.detail("Value", autoThrottlingEnabled.get().get());
}
self->autoThrottlingEnabled = SERVER_KNOBS->AUTO_TAG_THROTTLING_ENABLED;
if (!committed)
tr.set(tagThrottleAutoEnabledKey,
LiteralStringRef(self->autoThrottlingEnabled ? "1" : "0"));
}
RkTagThrottleCollection updatedTagThrottles;
TraceEvent("RatekeeperReadThrottledTags", self->id)
.detail("NumThrottledTags", throttledTagKeys.get().size());
for (auto entry : throttledTagKeys.get()) {
TagThrottleKey tagKey = TagThrottleKey::fromKey(entry.key);
TagThrottleValue tagValue = TagThrottleValue::fromValue(entry.value);
ASSERT(tagKey.tags.size() == 1); // Currently, only 1 tag per throttle is supported
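// Informal reading of the check below: an expiration of 0, or one further out than now() + initialDuration,
// indicates the stored value still encodes a relative duration (or was written against a different clock),
// so it is rewritten here with an absolute expiration time.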
if (tagValue.expirationTime == 0 ||
tagValue.expirationTime > now() + tagValue.initialDuration) {
TEST(true); // Converting tag throttle duration to absolute time
tagValue.expirationTime = now() + tagValue.initialDuration;
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
wr << tagValue;
state Value value = wr.toValue();
tr.set(entry.key, value);
}
if (tagValue.expirationTime > now()) {
TransactionTag tag = *tagKey.tags.begin();
Optional<ClientTagThrottleLimits> oldLimits =
self->throttledTags.getManualTagThrottleLimits(tag, tagKey.priority);
if (tagKey.throttleType == TagThrottleType::AUTO) {
updatedTagThrottles.autoThrottleTag(
self->id, tag, 0, tagValue.tpsRate, tagValue.expirationTime);
if (tagValue.reason == TagThrottledReason::BUSY_READ) {
updatedTagThrottles.busyReadTagCount++;
} else if (tagValue.reason == TagThrottledReason::BUSY_WRITE) {
updatedTagThrottles.busyWriteTagCount++;
}
} else {
updatedTagThrottles.manualThrottleTag(self->id,
tag,
tagKey.priority,
tagValue.tpsRate,
tagValue.expirationTime,
oldLimits);
}
}
}
self->throttledTags = std::move(updatedTagThrottles);
++self->throttledTagChangeId;
state Future<Void> watchFuture = tr.watch(tagThrottleSignalKey);
wait(tr.commit());
committed = true;
wait(watchFuture);
TraceEvent("RatekeeperThrottleSignaled", self->id).log();
TEST(true); // Tag throttle changes detected
break;
} catch (Error& e) {
TraceEvent("RatekeeperMonitorThrottlingChangesError", self->id).error(e);
wait(tr.onError(e));
}
}
}
}
Optional<double> autoThrottleTag(UID id, TransactionTag tag, double busyness) {
return throttledTags.autoThrottleTag(id, tag, busyness);
}
Future<Void> tryAutoThrottleTag(TransactionTag tag, double rate, double busyness, TagThrottledReason reason) {
// NOTE: before the comparison with MIN_TAG_COST, the busiest tag rate has also been compared against
// MIN_TAG_PAGES_RATE; currently MIN_TAG_PAGES_RATE > MIN_TAG_COST in our default knobs.
if (busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && rate > SERVER_KNOBS->MIN_TAG_COST) {
TEST(true); // Transaction tag auto-throttled
Optional<double> clientRate = autoThrottleTag(id, tag, busyness);
if (clientRate.present()) {
TagSet tags;
tags.addTag(tag);
Reference<DatabaseContext> dbRef = Reference<DatabaseContext>::addRef(db.getPtr());
return ThrottleApi::throttleTags(dbRef,
tags,
clientRate.get(),
SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION,
TagThrottleType::AUTO,
TransactionPriority::DEFAULT,
now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION,
reason);
}
}
return Void();
}
public:
TagThrottlerImpl(Database db, UID id) : db(db), id(id) {}
Future<Void> monitorThrottlingChanges() { return monitorThrottlingChanges(this); }
void addRequests(TransactionTag tag, int count) { throttledTags.addRequests(tag, count); }
uint64_t getThrottledTagChangeId() const { return throttledTagChangeId; }
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() {
return throttledTags.getClientRates(autoThrottlingEnabled);
}
int64_t autoThrottleCount() const { return throttledTags.autoThrottleCount(); }
uint32_t busyReadTagCount() const { return throttledTags.busyReadTagCount; }
uint32_t busyWriteTagCount() const { return throttledTags.busyWriteTagCount; }
int64_t manualThrottleCount() const { return throttledTags.manualThrottleCount(); }
bool isAutoThrottlingEnabled() const { return autoThrottlingEnabled; }
Future<Void> tryAutoThrottleTag(StorageQueueInfo& ss, int64_t storageQueue, int64_t storageDurabilityLag) {
// NOTE: we just keep it simple and don't differentiate write-saturation and read-saturation at the moment.
// In most situations this works. More indicators besides queue size and durability lag could be investigated
// in the future.
if (storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES ||
storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS) {
if (ss.busiestWriteTag.present()) {
return tryAutoThrottleTag(ss.busiestWriteTag.get(),
ss.busiestWriteTagRate,
ss.busiestWriteTagFractionalBusyness,
TagThrottledReason::BUSY_WRITE);
}
if (ss.busiestReadTag.present()) {
return tryAutoThrottleTag(ss.busiestReadTag.get(),
ss.busiestReadTagRate,
ss.busiestReadTagFractionalBusyness,
TagThrottledReason::BUSY_READ);
}
}
return Void();
}
}; // class TagThrottlerImpl
TagThrottler::TagThrottler(Database db, UID id) : impl(PImpl<TagThrottlerImpl>::create(db, id)) {}
TagThrottler::~TagThrottler() = default;
Future<Void> TagThrottler::monitorThrottlingChanges() {
return impl->monitorThrottlingChanges();
}
void TagThrottler::addRequests(TransactionTag tag, int count) {
impl->addRequests(tag, count);
}
uint64_t TagThrottler::getThrottledTagChangeId() const {
return impl->getThrottledTagChangeId();
}
PrioritizedTransactionTagMap<ClientTagThrottleLimits> TagThrottler::getClientRates() {
return impl->getClientRates();
}
int64_t TagThrottler::autoThrottleCount() const {
return impl->autoThrottleCount();
}
uint32_t TagThrottler::busyReadTagCount() const {
return impl->busyReadTagCount();
}
uint32_t TagThrottler::busyWriteTagCount() const {
return impl->busyWriteTagCount();
}
int64_t TagThrottler::manualThrottleCount() const {
return impl->manualThrottleCount();
}
bool TagThrottler::isAutoThrottlingEnabled() const {
return impl->isAutoThrottlingEnabled();
}
Future<Void> TagThrottler::tryAutoThrottleTag(StorageQueueInfo& ss,
int64_t storageQueue,
int64_t storageDurabilityLag) {
return impl->tryAutoThrottleTag(ss, storageQueue, storageDurabilityLag);
}

fdbserver/TagThrottler.h
View File

@ -0,0 +1,42 @@
/*
* TagThrottler.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/PImpl.h"
#include "fdbserver/Ratekeeper.h"
class TagThrottler {
PImpl<class TagThrottlerImpl> impl;
public:
TagThrottler(Database db, UID id);
~TagThrottler();
Future<Void> monitorThrottlingChanges();
void addRequests(TransactionTag tag, int count);
uint64_t getThrottledTagChangeId() const;
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates();
int64_t autoThrottleCount() const;
uint32_t busyReadTagCount() const;
uint32_t busyWriteTagCount() const;
int64_t manualThrottleCount() const;
bool isAutoThrottlingEnabled() const;
Future<Void> tryAutoThrottleTag(StorageQueueInfo&, int64_t storageQueue, int64_t storageDurabilityLag);
};
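A hypothetical usage sketch of the interface above, roughly how a ratekeeper-style owner might drive it; the free-standing function and its wiring are illustrative, not taken from this change. The PImpl member keeps TagThrottlerImpl out of the header, so translation units that include it see only the declarations above.
#include "fdbserver/TagThrottler.h"
// Illustrative only: construct one throttler, keep its monitor running, feed it per-tag request counts, and
// periodically hand the computed limits back to clients.
void exampleTagThrottlerUsage(Database db, UID rkId, TransactionTag tag) {
    TagThrottler throttler(db, rkId);
    // Long-running: watches the tag-throttle system keys and refreshes the in-memory throttles.
    Future<Void> monitor = throttler.monitorThrottlingChanges();
    // Per-tag transaction counts (as reported to the ratekeeper) would be forwarded like this.
    throttler.addRequests(tag, 10);
    // Per-priority, per-tag limits to hand back in rate replies.
    PrioritizedTransactionTagMap<ClientTagThrottleLimits> rates = throttler.getClientRates();
    (void)rates;
    (void)monitor;
}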

Some files were not shown because too many files have changed in this diff.