Merge remote-tracking branch 'apple/master' into task/failmon-remove-server
commit daef5f011a
@@ -347,7 +347,7 @@ bool FDBLibTLSSession::verify_peer() {
-		if (now() - lastVerifyFailureLogged > 1.0) {
 			for (std::string reason : verify_failure_reasons) {
-				lastVerifyFailureLogged = now();
-				TraceEvent("FDBLibTLSVerifyFailure", uid).detail("Reason", reason);
+				TraceEvent("FDBLibTLSVerifyFailure", uid).suppressFor(1.0).detail("Reason", reason);
 			}
-		}
 	}

@@ -199,7 +199,7 @@ class TestRunner(object):
            raise Exception('Not all testers support concurrency')

        # Test types should be intersection of all tester supported types
-       self.args.types = reduce(lambda t1, t2: filter(t1.__contains__, t2), map(lambda tester: tester.types, self.testers))
+       self.args.types = list(reduce(lambda t1, t2: filter(t1.__contains__, t2), map(lambda tester: tester.types, self.testers)))

        self.args.no_directory_snapshot_ops = self.args.no_directory_snapshot_ops or any([not tester.directory_snapshot_ops_enabled for tester in self.testers])

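Why the list() wrapper above matters: under Python 3, filter() returns a lazy iterator, so the old expression bound self.args.types to a one-shot object that is exhausted after its first traversal. A minimal sketch of the difference, with made-up tester type lists:

    from functools import reduce  # reduce is not a builtin in Python 3

    tester_types = [['scripted', 'api', 'directory'], ['api', 'directory']]

    # Lazy: a filter object; a second iteration yields nothing.
    lazy = reduce(lambda t1, t2: filter(t1.__contains__, t2), tester_types)

    # Materialized: a real list that can be scanned repeatedly and len()-ed.
    types = list(reduce(lambda t1, t2: filter(t1.__contains__, t2), tester_types))
    print(types)  # ['api', 'directory']
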
@@ -52,12 +52,12 @@ class DirectoryTest(Test):
            self.dir_list.append(child)
        self.dir_index = directory_util.DEFAULT_DIRECTORY_INDEX

-   def generate_layer(self):
+   def generate_layer(self, allow_partition=True):
        if random.random() < 0.7:
            return b''
        else:
            choice = random.randint(0, 3)
-           if choice == 0:
+           if choice == 0 and allow_partition:
                return b'partition'
            elif choice == 1:
                return b'test_layer'

@@ -184,7 +184,9 @@ class DirectoryTest(Test):
            test_util.blocking_commit(instructions)

        path = generate_path()
-       op_args = test_util.with_length(path) + (self.generate_layer(),)
+       # Partitions that use the high-contention allocator can result in non-determinism if they fail to commit,
+       # so we disallow them in comparison tests
+       op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency > 1),)
        directory_util.push_instruction_and_record_prefix(instructions, op, op_args, path, len(self.dir_list), self.random, self.prefix_log)

        if not op.endswith('_DATABASE') and args.concurrency == 1:

@@ -182,6 +182,7 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id,
 	int end = insert_end(args->rows, worker_id, thread_id, args->num_processes,
 	                     args->num_threads);
 	int xacts = 0;
+	int tracetimer = 0;

 	keystr = (char *)malloc(sizeof(char) * args->key_length + 1);
 	if (!keystr)
@@ -200,8 +201,13 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id,

 	for (i = begin; i <= end; i++) {

-		if ((thread_tps > 0) && (xacts >= thread_tps)) {
-			/* throttling is on */
+		/* sequential keys */
+		genkey(keystr, i, args->rows, args->key_length + 1);
+		/* random values */
+		randstr(valstr, args->value_length + 1);
+
+		if (((thread_tps > 0) && (xacts >= thread_tps)) /* throttle */ ||
+		    (args->txntrace) /* txn tracing */) {

 throttle:
 			clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_now);
@@ -212,17 +218,40 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id,
 				xacts = 0;
 				timer_prev.tv_sec = timer_now.tv_sec;
 				timer_prev.tv_nsec = timer_now.tv_nsec;
-			} else {
-				/* 1 second not passed, throttle */
-				usleep(1000); /* sleep for 1ms */
-				goto throttle;
-			}
-		} /* throttle */
-
-		/* sequential keys */
-		genkey(keystr, i, args->rows, args->key_length + 1);
-		/* random values */
-		randstr(valstr, args->value_length + 1);
+				/* enable transaction tracing */
+				if (args->txntrace) {
+					tracetimer++;
+					if (tracetimer == args->txntrace) {
+						fdb_error_t err;
+						tracetimer = 0;
+						fprintf(debugme, "DEBUG: txn tracing %s\n", keystr);
+						err = fdb_transaction_set_option(transaction,
+						                                 FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER,
+						                                 (uint8_t *)keystr, strlen(keystr));
+						if (err) {
+							fprintf(stderr,
+							        "ERROR: fdb_transaction_set_option(FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER): %s\n",
+							        fdb_get_error(err));
+						}
+						err = fdb_transaction_set_option(transaction,
+						                                 FDB_TR_OPTION_LOG_TRANSACTION,
+						                                 (uint8_t *)NULL, 0);
+						if (err) {
+							fprintf(stderr,
+							        "ERROR: fdb_transaction_set_option(FDB_TR_OPTION_LOG_TRANSACTION): %s\n",
+							        fdb_get_error(err));
+						}
+					}
+				}
+			} else {
+				if (thread_tps > 0) {
+					/* 1 second not passed, throttle */
+					usleep(1000); /* sleep for 1ms */
+					goto throttle;
+				}
+			}
+		} /* throttle or txntrace */

 		/* insert (SET) */
 		fdb_transaction_set(transaction, (uint8_t *)keystr, strlen(keystr),

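The throttle check in the hunk above (and again in run_workload below) is one idiom: compare a coarse monotonic clock against the start of the current one-second window, and only reset the per-window transaction counter once a full second has elapsed. Restated as a stand-alone predicate (the helper name is illustrative, not mako's):

    #include <stdbool.h>
    #include <time.h>

    /* true iff strictly more than one second separates prev and now */
    static bool one_second_elapsed(const struct timespec *prev,
                                   const struct timespec *now) {
        return (now->tv_sec > prev->tv_sec + 1) ||
               ((now->tv_sec == prev->tv_sec + 1) && (now->tv_nsec > prev->tv_nsec));
    }
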
@@ -427,8 +456,10 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,
 	int randstrlen;
 	int rangei;

+#if 0 /* this call conflicts with debug transaction */
 	/* make sure that the transaction object is clean */
 	fdb_transaction_reset(transaction);
+#endif

 	clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start);
@@ -531,6 +562,8 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,
 			stats->errors[OP_COMMIT]++;
 		}
 		if (rc == FDB_ERROR_ABORT) {
+			/* make sure to reset transaction */
+			fdb_transaction_reset(transaction);
 			return rc; /* abort */
 		}
 		goto retryTxn;
@@ -562,6 +595,8 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,
 			if (rc == FDB_ERROR_RETRY) {
 				goto retryTxn;
 			} else if (rc == FDB_ERROR_ABORT) {
+				/* make sure to reset transaction */
+				fdb_transaction_reset(transaction);
 				return rc; /* abort */
 			}
 		}
@@ -580,6 +615,8 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,
 			stats->errors[OP_COMMIT]++;
 		}
 		if (rc == FDB_ERROR_ABORT) {
+			/* make sure to reset transaction */
+			fdb_transaction_reset(transaction);
 			return rc; /* abort */
 		}
 		goto retryTxn;
@@ -612,6 +649,8 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,
 			stats->errors[OP_COMMIT]++;
 		}
 		if (rc == FDB_ERROR_ABORT) {
+			/* make sure to reset transaction */
+			fdb_transaction_reset(transaction);
 			return rc; /* abort */
 		}
 		goto retryTxn;
@@ -637,6 +676,8 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,
 			stats->errors[OP_COMMIT]++;
 		}
 		if (rc == FDB_ERROR_ABORT) {
+			/* make sure to reset transaction */
+			fdb_transaction_reset(transaction);
 			return rc; /* abort */
 		}
 		goto retryTxn;
@@ -645,24 +686,34 @@ int run_one_transaction(FDBTransaction *transaction, mako_args_t *args,

 	stats->xacts++;

+	/* make sure to reset transaction */
+	fdb_transaction_reset(transaction);
 	return 0;
 }


 int run_workload(FDBTransaction *transaction, mako_args_t *args,
                  int thread_tps, volatile double *throttle_factor,
-                 int thread_iters, volatile int *signal, mako_stats_t *stats) {
+                 int thread_iters, volatile int *signal, mako_stats_t *stats,
+                 int dotrace) {
 	int xacts = 0;
+	int64_t total_xacts = 0;
 	int rc = 0;
 	struct timespec timer_prev, timer_now;
 	char *keystr;
 	char *keystr2;
 	char *valstr;
 	int current_tps;
+	char *traceid;
+	int tracetimer = 0;

 	if (thread_tps < 0)
 		return 0;

+	if (dotrace) {
+		traceid = (char *)malloc(32);
+	}
+
 	current_tps = (int)((double)thread_tps * *throttle_factor);

 	keystr = (char *)malloc(sizeof(char) * args->key_length + 1);
@@ -685,25 +736,52 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args,
 	/* main transaction loop */
 	while (1) {

-		if ((thread_tps > 0) && (xacts >= current_tps)) {
-			/* throttling is on */
+		if (((thread_tps > 0) && (xacts >= current_tps)) /* throttle on */ ||
+		    dotrace /* transaction tracing on */) {

 			clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_now);
 			if ((timer_now.tv_sec > timer_prev.tv_sec + 1) ||
 			    ((timer_now.tv_sec == timer_prev.tv_sec + 1) &&
 			     (timer_now.tv_nsec > timer_prev.tv_nsec))) {
 				/* more than 1 second passed, no need to throttle */
 				xacts = 0;
 				timer_prev.tv_sec = timer_now.tv_sec;
 				timer_prev.tv_nsec = timer_now.tv_nsec;

 				/* update throttle rate */
-				current_tps = (int)((double)thread_tps * *throttle_factor);
+				if (thread_tps > 0) {
+					current_tps = (int)((double)thread_tps * *throttle_factor);
+				}
+
+				/* enable transaction trace */
+				if (dotrace) {
+					tracetimer++;
+					if (tracetimer == dotrace) {
+						fdb_error_t err;
+						tracetimer = 0;
+						snprintf(traceid, 32, "makotrace%019lld", total_xacts);
+						fprintf(debugme, "DEBUG: txn tracing %s\n", traceid);
+						err = fdb_transaction_set_option(transaction, FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER,
+						                                 (uint8_t *)traceid, strlen(traceid));
+						if (err) {
+							fprintf(stderr, "ERROR: FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER: %s\n", fdb_get_error(err));
+						}
+						err = fdb_transaction_set_option(transaction, FDB_TR_OPTION_LOG_TRANSACTION,
+						                                 (uint8_t *)NULL, 0);
+						if (err) {
+							fprintf(stderr, "ERROR: FDB_TR_OPTION_LOG_TRANSACTION: %s\n", fdb_get_error(err));
+						}
+					}
+				}
 			} else {
-				/* 1 second not passed, throttle */
-				usleep(1000);
-				continue;
+				if (thread_tps > 0) {
+					/* 1 second not passed, throttle */
+					usleep(1000);
+					continue;
+				}
 			}
-		}
+		} /* throttle or txntrace */

 		rc = run_one_transaction(transaction, args, stats, keystr, keystr2, valstr);
 		if (rc) {

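Both tracing hunks (populate above and run_workload here) follow the standard FDB client recipe: set a debug identifier on the transaction, then enable logging for it. Condensed into a stand-alone helper, assuming an already-initialized client (the helper name and API version are illustrative, not mako's):

    #define FDB_API_VERSION 610
    #include <foundationdb/fdb_c.h>
    #include <string.h>

    /* Tag a transaction so the client trace logs record its lifecycle.
     * `id` should be unique enough to grep for in the trace files. */
    static fdb_error_t enable_txn_trace(FDBTransaction *tr, const char *id) {
        fdb_error_t err;
        /* name the transaction... */
        err = fdb_transaction_set_option(tr, FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER,
                                         (const uint8_t *)id, (int)strlen(id));
        if (err) return err;
        /* ...then ask the client to log its events under that name */
        return fdb_transaction_set_option(tr, FDB_TR_OPTION_LOG_TRANSACTION, NULL, 0);
    }
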
@@ -721,10 +799,14 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args,
 			break;
 		}
 		xacts++;
+		total_xacts++;
 	}
 	free(keystr);
 	free(keystr2);
 	free(valstr);
+	if (dotrace) {
+		free(traceid);
+	}

 	return rc;
 }
@@ -742,6 +824,7 @@ void *worker_thread(void *thread_args) {
 	int thread_tps = 0;
 	int thread_iters = 0;
 	int op;
+	int dotrace = (worker_id == 0 && thread_id == 0 && args->txntrace) ? args->txntrace : 0;
 	volatile int *signal = &((thread_args_t *)thread_args)->process->shm->signal;
 	volatile double *throttle_factor = &((thread_args_t *)thread_args)->process->shm->throttle_factor;
 	volatile int *readycount =
@@ -801,7 +884,7 @@ void *worker_thread(void *thread_args) {
 	/* run the workload */
 	else if (args->mode == MODE_RUN) {
 		rc = run_workload(transaction, args, thread_tps, throttle_factor,
-		                  thread_iters, signal, stats);
+		                  thread_iters, signal, stats, dotrace);
 		if (rc < 0) {
 			fprintf(stderr, "ERROR: run_workload failed\n");
 		}
@@ -859,9 +942,9 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) {

 	/* enable tracing if specified */
 	if (args->trace) {
-		fprintf(debugme, "DEBUG: Enable Tracing (%s)\n", (args->tracepath[0] == '\0')
-		                                                     ? "current directory"
-		                                                     : args->tracepath);
+		fprintf(debugme, "DEBUG: Enable Tracing in %s (%s)\n",
+		        (args->traceformat == 0) ? "XML" : "JSON",
+		        (args->tracepath[0] == '\0') ? "current directory" : args->tracepath);
 		err = fdb_network_set_option(FDB_NET_OPTION_TRACE_ENABLE,
 		                             (uint8_t *)args->tracepath,
 		                             strlen(args->tracepath));
@@ -871,6 +954,16 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) {
 			        "ERROR: fdb_network_set_option(FDB_NET_OPTION_TRACE_ENABLE): %s\n",
 			        fdb_get_error(err));
 		}
+		if (args->traceformat == 1) {
+			err = fdb_network_set_option(FDB_NET_OPTION_TRACE_FORMAT,
+			                             (uint8_t *)"json", 4);
+			if (err) {
+				fprintf(
+				    stderr,
+				    "ERROR: fdb_network_set_option(FDB_NET_OPTION_TRACE_FORMAT): %s\n",
+				    fdb_get_error(err));
+			}
+		}
 	}

 	/* enable knobs if specified */
@@ -1019,6 +1112,8 @@ int init_args(mako_args_t *args) {
 	args->knobs[0] = '\0';
 	args->trace = 0;
 	args->tracepath[0] = '\0';
+	args->traceformat = 0; /* default to client's default (XML) */
+	args->txntrace = 0;
 	for (i = 0; i < MAX_OP; i++) {
 		args->txnspec.ops[i][OP_COUNT] = 0;
 	}
@@ -1148,40 +1243,42 @@ int parse_transaction(mako_args_t *args, char *optarg) {

 void usage() {
 	printf("Usage:\n");
-	printf("%-24s%s\n", "-h, --help", "Print this message");
-	printf("%-24s%s\n", " --version", "Print FDB version");
-	printf("%-24s%s\n", "-v, --verbose", "Specify verbosity");
-	printf("%-24s%s\n", "-a, --api_version=API_VERSION", "Specify API_VERSION to use");
-	printf("%-24s%s\n", "-c, --cluster=FILE", "Specify FDB cluster file");
-	printf("%-24s%s\n", "-p, --procs=PROCS",
+	printf("%-24s %s\n", "-h, --help", "Print this message");
+	printf("%-24s %s\n", " --version", "Print FDB version");
+	printf("%-24s %s\n", "-v, --verbose", "Specify verbosity");
+	printf("%-24s %s\n", "-a, --api_version=API_VERSION", "Specify API_VERSION to use");
+	printf("%-24s %s\n", "-c, --cluster=FILE", "Specify FDB cluster file");
+	printf("%-24s %s\n", "-p, --procs=PROCS",
 	       "Specify number of worker processes");
-	printf("%-24s%s\n", "-t, --threads=THREADS",
+	printf("%-24s %s\n", "-t, --threads=THREADS",
 	       "Specify number of worker threads");
-	printf("%-24s%s\n", "-r, --rows=ROWS", "Specify number of records");
-	printf("%-24s%s\n", "-s, --seconds=SECONDS",
+	printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
+	printf("%-24s %s\n", "-s, --seconds=SECONDS",
 	       "Specify the test duration in seconds\n");
-	printf("%-24s%s\n", "", "This option cannot be specified with --iteration.");
-	printf("%-24s%s\n", "-i, --iteration=ITERS",
+	printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
+	printf("%-24s %s\n", "-i, --iteration=ITERS",
 	       "Specify the number of iterations.\n");
-	printf("%-24s%s\n", "", "This option cannot be specified with --seconds.");
-	printf("%-24s%s\n", " --keylen=LENGTH", "Specify the key lengths");
-	printf("%-24s%s\n", " --vallen=LENGTH", "Specify the value lengths");
-	printf("%-24s%s\n", "-x, --transaction=SPEC", "Transaction specification");
-	printf("%-24s%s\n", " --tps|--tpsmax=TPS", "Specify the target max TPS");
-	printf("%-24s%s\n", " --tpsmin=TPS", "Specify the target min TPS");
-	printf("%-24s%s\n", " --tpsinterval=SEC", "Specify the TPS change interval (Default: 10 seconds)");
-	printf("%-24s%s\n", " --tpschange=<sin|square|pulse>", "Specify the TPS change type (Default: sin)");
-	printf("%-24s%s\n", " --sampling=RATE",
+	printf("%-24s %s\n", "", "This option cannot be specified with --seconds.");
+	printf("%-24s %s\n", " --keylen=LENGTH", "Specify the key lengths");
+	printf("%-24s %s\n", " --vallen=LENGTH", "Specify the value lengths");
+	printf("%-24s %s\n", "-x, --transaction=SPEC", "Transaction specification");
+	printf("%-24s %s\n", " --tps|--tpsmax=TPS", "Specify the target max TPS");
+	printf("%-24s %s\n", " --tpsmin=TPS", "Specify the target min TPS");
+	printf("%-24s %s\n", " --tpsinterval=SEC", "Specify the TPS change interval (Default: 10 seconds)");
+	printf("%-24s %s\n", " --tpschange=<sin|square|pulse>", "Specify the TPS change type (Default: sin)");
+	printf("%-24s %s\n", " --sampling=RATE",
 	       "Specify the sampling rate for latency stats");
-	printf("%-24s%s\n", "-m, --mode=MODE",
+	printf("%-24s %s\n", "-m, --mode=MODE",
 	       "Specify the mode (build, run, clean)");
-	printf("%-24s%s\n", "-z, --zipf",
+	printf("%-24s %s\n", "-z, --zipf",
 	       "Use zipfian distribution instead of uniform distribution");
-	printf("%-24s%s\n", " --commitget", "Commit GETs");
-	printf("%-24s%s\n", " --trace", "Enable tracing");
-	printf("%-24s%s\n", " --tracepath=PATH", "Set trace file path");
-	printf("%-24s%s\n", " --knobs=KNOBS", "Set client knobs");
-	printf("%-24s%s\n", " --flatbuffers", "Use flatbuffers");
+	printf("%-24s %s\n", " --commitget", "Commit GETs");
+	printf("%-24s %s\n", " --trace", "Enable tracing");
+	printf("%-24s %s\n", " --tracepath=PATH", "Set trace file path");
+	printf("%-24s %s\n", " --trace_format <xml|json>", "Set trace format (Default: json)");
+	printf("%-24s %s\n", " --txntrace=sec", "Specify transaction tracing interval (Default: 0)");
+	printf("%-24s %s\n", " --knobs=KNOBS", "Set client knobs");
+	printf("%-24s %s\n", " --flatbuffers", "Use flatbuffers");
 }

@@ -1214,6 +1311,8 @@ int parse_args(int argc, char *argv[], mako_args_t *args) {
 		{"mode", required_argument, NULL, 'm'},
 		{"knobs", required_argument, NULL, ARG_KNOBS},
 		{"tracepath", required_argument, NULL, ARG_TRACEPATH},
+		{"trace_format", required_argument, NULL, ARG_TRACEFORMAT},
+		{"txntrace", required_argument, NULL, ARG_TXNTRACE},
 		/* no args */
 		{"help", no_argument, NULL, 'h'},
 		{"json", no_argument, NULL, 'j'},
@@ -1324,6 +1423,19 @@ int parse_args(int argc, char *argv[], mako_args_t *args) {
 			args->trace = 1;
 			memcpy(args->tracepath, optarg, strlen(optarg) + 1);
 			break;
+		case ARG_TRACEFORMAT:
+			if (strncmp(optarg, "json", 5) == 0) {
+				args->traceformat = 1;
+			} else if (strncmp(optarg, "xml", 4) == 0) {
+				args->traceformat = 0;
+			} else {
+				fprintf(stderr, "Error: Invalid trace_format %s\n", optarg);
+				exit(0);
+			}
+			break;
+		case ARG_TXNTRACE:
+			args->txntrace = atoi(optarg);
+			break;
 		}
 	}
 	if ((args->tpsmin == -1) || (args->tpsmin > args->tpsmax)) {

@@ -67,10 +67,12 @@ enum Arguments {
 	ARG_FLATBUFFERS,
 	ARG_TRACE,
 	ARG_TRACEPATH,
+	ARG_TRACEFORMAT,
 	ARG_TPSMAX,
 	ARG_TPSMIN,
 	ARG_TPSINTERVAL,
-	ARG_TPSCHANGE
+	ARG_TPSCHANGE,
+	ARG_TXNTRACE
 };

 enum TPSChangeTypes {
@@ -117,8 +119,10 @@ typedef struct {
 	char cluster_file[PATH_MAX];
 	int trace;
 	char tracepath[PATH_MAX];
+	int traceformat; /* 0 - XML, 1 - JSON */
 	char knobs[KNOB_MAX];
 	uint8_t flatbuffers;
+	int txntrace;
 } mako_args_t;

 /* shared memory */

@@ -1,27 +1,27 @@
 ##############
-mako Benchmark
+🦈 Mako Benchmark
 ##############

-| mako (named after a small, but very fast shark) is a micro-benchmark for FoundationDB
+| Mako (named after a very fast shark) is a micro-benchmark for FoundationDB
 | which is designed to be very light and flexible
 | so that you can stress a particular part of a FoundationDB cluster without introducing unnecessary overhead.


 How to Build
 ============
-| ``mako`` gets build automatically when you build FoundationDB.
+| ``mako`` gets built automatically when you build FoundationDB.
 | To build ``mako`` manually, simply build the ``mako`` target in the FoundationDB build directory.
-| e.g. If you're using Unix Makefiles
+| e.g. If you're using Unix Makefiles, type:
 | ``make mako``


 Architecture
 ============
 - mako is a stand-alone program written in C,
-  which communicates to FoundationDB using C binding API (``libfdb_c.so``)
-- It creates one master process, and one or more worker processes (multi-process)
-- Each worker process creates one or more multiple threads (multi-thread)
-- All threads within the same process share the same network thread
+  which communicates with FoundationDB using the C API (via ``libfdb_c.so``)
+- It creates one master process, one stats emitter process, and one or more worker processes (multi-process)
+- Each worker process creates one FDB network thread, and one or more worker threads (multi-thread)
+- All worker threads within the same process share the same network thread


 Data Specification
@@ -32,18 +32,18 @@ Data Specification

 Arguments
 =========
-- | ``--mode <mode>``
+- | ``-m | --mode <mode>``
   | One of the following modes must be specified. (Required)
   | - ``clean``: Clean up existing data
   | - ``build``: Populate data
   | - ``run``: Run the benchmark

+- | ``-a | --api_version <api_version>``
+  | FDB API version to use (Default: Latest)
+
 - | ``-c | --cluster <cluster file>``
   | FDB cluster file (Required)

-- | ``-a | --api_version <api_version>``
-  | FDB API version to use (Default: Latest)
-
 - | ``-p | --procs <procs>``
   | Number of worker processes (Default: 1)
@@ -51,7 +51,7 @@ Arguments
   | Number of threads per worker process (Default: 1)

 - | ``-r | --rows <rows>``
-  | Number of rows populated (Default: 100000)
+  | Number of rows initially populated (Default: 100000)

 - | ``-s | --seconds <seconds>``
   | Test duration in seconds (Default: 30)
@@ -113,10 +113,10 @@ Arguments

 Transaction Specification
 =========================
-| A transaction may contain multiple operations of multiple types.
+| A transaction may contain multiple operations of various types.
 | You can specify multiple operations for one operation type by specifying "Count".
-| For RANGE operations, "Range" needs to be specified in addition to "Count".
-| Every transaction is committed unless it contains only GET / GET RANGE operations.
+| For RANGE operations, the "Range" needs to be specified in addition to "Count".
+| Every transaction is committed unless the transaction is read-only.

 Operation Types
 ---------------
@@ -137,21 +137,22 @@ Format
 ------
 | One operation type is defined as ``<Type><Count>`` or ``<Type><Count>:<Range>``.
 | When Count is omitted, it's equivalent to setting it to 1. (e.g. ``g`` is equivalent to ``g1``)
-| Multiple operation types can be concatenated. (e.g. ``g9u1`` = 9 GETs and 1 update)
+| Multiple operation types within the same transaction can be concatenated. (e.g. ``g9u1`` = 9 GETs and 1 update)

 Transaction Specification Examples
 ----------------------------------
-- | 100 GETs (No Commit)
+- | 100 GETs (Non-committed)
   | ``g100``

-- | 10 GET RANGE with Range of 50 (No Commit)
+- | 10 GET RANGE with Range of 50 (Non-committed)
   | ``gr10:50``

 - | 90 GETs and 10 Updates (Committed)
   | ``g90u10``

-- | 80 GETs, 10 Updates and 10 Inserts (Committed)
-  | ``g90u10i10``
+- | 70 GETs, 10 Updates and 10 Inserts (Committed)
+  | ``g70u10i10``
+  | This is 80-20.


 Execution Examples
@@ -160,12 +161,14 @@ Execution Examples

 Preparation
 -----------
 - Start the FoundationDB cluster and create a database
-- Set LD_LIBRARY_PATH pointing to a proper ``libfdb_c.so``
+- Set the ``LD_LIBRARY_PATH`` environment variable to point to a proper ``libfdb_c.so`` shared library

-Build
------
+Populate Initial Database
+-------------------------
 ``mako --cluster /etc/foundationdb/fdb.cluster --mode build --rows 1000000 --procs 4``
+Note: You may be able to speed up the data population by increasing the number of processes or threads.

 Run
 ---
+Run a mixed workload with a total of 8 threads for 60 seconds, keeping the throughput limited to 1000 TPS.
 ``mako --cluster /etc/foundationdb/fdb.cluster --mode run --rows 1000000 --procs 2 --threads 8 --transaction "g8ui" --seconds 60 --tps 1000``

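A run that also exercises the tracing options added by this commit might look like the following (flag spellings as defined in the usage() hunk above; paths are illustrative):

    mako --cluster /etc/foundationdb/fdb.cluster --mode run --rows 1000000 \
         --procs 2 --threads 8 --transaction "g8ui" --seconds 60 --tps 1000 \
         --trace --tracepath /var/log/mako --trace_format json --txntrace 5
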
@@ -16,7 +16,7 @@ set(SRCS
   fdb_flow.actor.cpp
   fdb_flow.h)

-add_flow_target(NAME fdb_flow SRCS ${SRCS} STATIC_LIBRARY)
+add_flow_target(STATIC_LIBRARY NAME fdb_flow SRCS ${SRCS})
 target_link_libraries(fdb_flow PUBLIC fdb_c)

 add_subdirectory(tester)

@@ -2,5 +2,5 @@ set(TEST_SRCS
   DirectoryTester.actor.cpp
   Tester.actor.cpp
   Tester.actor.h)
-add_flow_target(NAME fdb_flow_tester EXECUTABLE SRCS ${TEST_SRCS})
+add_flow_target(EXECUTABLE NAME fdb_flow_tester SRCS ${TEST_SRCS})
 target_link_libraries(fdb_flow_tester fdb_flow)

@@ -139,6 +139,16 @@ error. The above example may be rewritten as:
 		return []string{valueOne, valueTwo}, nil
 	})

+MustGet returns nil (which is different from the empty slice []byte{}) when the
+key doesn't exist, so non-existence can be checked as follows:
+
+	val := tr.Get(fdb.Key("foobar")).MustGet()
+	if val == nil {
+		fmt.Println("foobar does not exist.")
+	} else {
+		fmt.Println("foobar exists.")
+	}
+
 Any panic that occurs during execution of the caller-provided function will be
 recovered by the (Database).Transact method. If the error is an FDB Error, it
 will either result in a retry of the function or be returned by Transact. If the

@@ -131,15 +131,19 @@ function(strip_debug_symbols target)
   add_custom_command(OUTPUT "${out_file}"
     COMMAND ${strip_command} $<TARGET_FILE:${target}>
     COMMENT "Stripping symbols from ${target}")
-  set(out_files "${out_file}")
+  add_custom_target(strip_only_${target} DEPENDS ${out_file})
   if(is_exec AND NOT APPLE)
     add_custom_command(OUTPUT "${out_file}.debug"
-      COMMAND objcopy --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug" &&
-              objcopy --add-gnu-debuglink="${out_file}.debug" ${out_file}
+      COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
+      COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
+      DEPENDS ${out_file}
       COMMENT "Copy debug symbols to ${out_name}.debug")
-    list(APPEND out_files "${out_file}.debug")
+    add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
+  else()
+    add_custom_target(strip_${target})
   endif()
-  add_custom_target(strip_${target} DEPENDS ${out_files})
+  add_dependencies(strip_${target} strip_only_${target})
   add_dependencies(strip_${target} ${target})
   add_dependencies(strip_targets strip_${target})
 endfunction()

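For readers unfamiliar with the split-debug flow this function encodes: --only-keep-debug extracts the debug sections into a side file, and --add-gnu-debuglink stamps the stripped binary with that file's name and checksum so debuggers can locate it later. A hand-run equivalent (binary name illustrative; the actual strip flags come from ${strip_command}):

    objcopy --only-keep-debug fdbserver fdbserver.debug    # extract debug info
    strip --strip-debug fdbserver                          # strip it from the binary
    objcopy --add-gnu-debuglink=fdbserver.debug fdbserver  # record the link
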
@@ -492,6 +492,19 @@ If a process has had more than 10 TCP segments retransmitted in the last 5 secon

     10.0.4.1:4500 ( 3% cpu; 2% machine; 0.004 Gbps; 0% disk; REXMIT! 2.5 GB / 4.1 GB RAM )

+Machine-readable status
+--------------------------------
+
+The status command can provide a complete summary of statistics about the cluster and the database with the ``json`` argument. Full documentation for ``status json`` output can be found :doc:`here <mr-status>`.
+From the output of ``status json``, operators can find useful health metrics to determine whether or not their cluster is hitting performance limits.
+
+====================== ==============================================================================================================
+Ratekeeper limit       ``cluster.qos.transactions_per_second_limit`` contains the number of read versions per second that the cluster can give out. A low ratekeeper limit indicates that the cluster performance is limited in some way. The reason for a low ratekeeper limit can be found at ``cluster.qos.performance_limited_by``. ``cluster.qos.released_transactions_per_second`` describes the number of read versions given out per second, and can be used to tell how close the ratekeeper is to throttling.
+Storage queue size     ``cluster.qos.worst_queue_bytes_storage_server`` contains the maximum size in bytes of a storage queue. Each storage server has mutations that have not yet been made durable, stored in its storage queue. If this value gets too large, it indicates a storage server is falling behind. A large storage queue will cause the ratekeeper to increase throttling. However, depending on the configuration, the ratekeeper can ignore the worst storage queue from one fault domain. Thus, ratekeeper uses ``cluster.qos.limiting_queue_bytes_storage_server`` to determine the throttling level.
+Durable version lag    ``cluster.qos.worst_durability_lag_storage_server`` contains information about the worst storage server durability lag. The ``versions`` subfield contains the maximum number of versions in a storage queue. Ideally, this should be near 5 million. The ``seconds`` subfield contains the maximum number of seconds of non-durable data in a storage queue. Ideally, this should be near 5 seconds. If a storage server is overwhelmed, the durability lag could rise, causing performance issues.
+Transaction log queue  ``cluster.qos.worst_queue_bytes_log_server`` contains the maximum size in bytes of the mutations stored on a transaction log that have not yet been popped by storage servers. A large transaction log queue size can potentially cause the ratekeeper to increase throttling.
+====================== ==============================================================================================================
+
 .. _administration_fdbmonitor:

 ``fdbmonitor`` and ``fdbserver``

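A companion sketch for the table above: pulling those qos fields out of ``status json`` by shelling out to fdbcli (assumes fdbcli is on PATH and can reach the cluster):

    import json
    import subprocess

    # Ask fdbcli for machine-readable status and parse the qos subtree.
    raw = subprocess.check_output(["fdbcli", "--exec", "status json"], text=True)
    qos = json.loads(raw)["cluster"]["qos"]

    print("ratekeeper limit:", qos["transactions_per_second_limit"])
    print("released tps    :", qos["released_transactions_per_second"])
    print("worst storage q :", qos["worst_queue_bytes_storage_server"])
    print("worst tlog q    :", qos["worst_queue_bytes_log_server"])
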
@@ -24,7 +24,7 @@ Backup vs DR

 FoundationDB can back up a database to local disks, a blob store (such as Amazon S3), or to another FoundationDB database.

-Backing up one database to another is a special form of backup is called DR backup or just DR for short. DR stands for Disaster Recovery, as it can be used to keep two geographically separated databases in close synchronization to recover from a catastrophic disaster. Once a DR operation has reached 'differential' mode, the secondary database (the destination of the DR job) will always contains a *consistent* copy of the primary database (the source of the DR job) but it will be from some past point in time. If the primary database is lost and applications continue using the secondary database, the "ACI" in ACID is preserved but D (Durability) is lost for some amount of most recent changes. When DR is operating normally, the secondary database will lag behind the primary database by as little as a few seconds worth of database commits.
+Backing up one database to another is a special form of backup called DR backup, or just DR for short. DR stands for Disaster Recovery, as it can be used to keep two geographically separated databases in close synchronization to recover from a catastrophic disaster. Once a DR operation has reached 'differential' mode, the secondary database (the destination of the DR job) will always contain a *consistent* copy of the primary database (the source of the DR job), but it will be from some past point in time. If the primary database is lost and applications continue using the secondary database, the "ACI" in ACID is preserved but D (Durability) is lost for some amount of the most recent changes. When DR is operating normally, the secondary database will lag behind the primary database by as little as a few seconds' worth of database commits.

 While a cluster is being used as the destination for a DR operation it will be locked to prevent accidental use or modification.

@@ -0,0 +1,323 @@

.. _disk-snapshot-backups:

#################################
Disk Snapshot Backup and Restore
#################################

This document covers disk snapshot based backup and restoration of a FoundationDB database. This tool leverages disk level snapshots and gets a point-in-time consistent copy of the database. The disk snapshot backup can be used for test and development purposes, for compliance reasons, or to provide an additional level of protection in case of hardware or software failures.

.. _disk-snapshot-backup-introduction:

Introduction
============

FoundationDB's disk snapshot backup tool makes a consistent, point-in-time backup of a FoundationDB database without downtime by taking a crash-consistent snapshot of all the disk stores that have persistent data.

The prerequisite for this feature is crash-consistent snapshot support on the filesystem (or the disks) on which FoundationDB is running.

The disk snapshot backup tool orchestrates the snapshotting of all the disk images and ensures that they are restorable to a consistent point in time.

Restore is achieved by copying or attaching the disk snapshot images to FoundationDB compute instances. Restore behaves as if the cluster were powered down and restarted.

Backup vs Disk snapshot backup
==============================
A backup feature already exists in FoundationDB and is detailed here: :ref:`backups`. Any use of fdbbackup below refers to that feature.

Both fdbbackup and the disk snapshot backup tool provide a point-in-time consistent backup of a FoundationDB database, but they operate at different levels and differ in performance, features, and external dependencies.

fdbbackup operates at the key-value level. Backup involves copying all the key-value pairs from the source cluster, and restore involves applying all the key-value pairs to the destination database. Performance depends on the amount of data and the throughput with which the data can be read and written. This approach has no external dependency: there is no requirement for any snapshotting feature from the disk system. Additionally, it has an option for continuous backup, with the flexibility to pick a restore point.

Disk snapshot backup and restore are generally high performance because they operate at the disk level and data is not read or written through the FoundationDB stack. In environments where disk snapshot and restore are highly performant, this approach can be very fast. Frequent backups can be done as a substitute for continuous backup if the backups are performant.

Limitations
===========

* No support for continuous backup
* Feature is not supported on the Windows operating system
* Data encryption is dependent on the disk system
* Backup and restore involve tooling which is deployment and environment specific, to be developed by operators
* The ``snapshot`` command is a hidden fdbcli command in the current release and will be unhidden in a future patch release.

Disk snapshot backup steps
==========================

``snapshot``
This command line tool is used to create the snapshot. It takes a full path to a ``snapshot create binary`` and reports the status. Optionally, it can take additional arguments to be passed down to the ``snapshot create binary``. It returns a unique identifier which can be used to identify all the disk snapshots of a backup. Even in case of failure, the unique identifier is returned to identify and clear any partially created disk snapshots.

In response to the snapshot request from the user, FoundationDB will run the user-specified ``snapshot create binary`` on all processes which have persistent data; the binary should call the filesystem/disk system specific snapshot create API.

Before using the ``snapshot`` command the following setup needs to be done:

* Write a program that will snapshot the local disk store when invoked by the ``fdbserver`` with the following arguments:

  - UID - 32 byte alpha-numeric unique identifier; the same identifier will be passed to all the nodes in the cluster, and can be used to identify the set of disk snapshots associated with this backup
  - Version - version string of the FoundationDB binary
  - Path - path of the FoundationDB ``datadir`` to be snapshotted, where ``datadir`` is specified in :ref:`foundationdb-conf-fdbserver`
  - Role - ``tlog``/``storage``/``coordinator``, identifies the role of the node on which the snapshot is being invoked

* Install the ``snapshot create binary`` on the FoundationDB instance in a secure path that can be invoked by the ``fdbserver``
* Set a new config parameter ``whitelist_binpath`` in :ref:`foundationdb-conf-fdbserver`, whose value is the ``snapshot create binary`` absolute path. Running any ``snapshot`` command will validate that it is in the ``whitelist_binpath``. This is a security mechanism to stop a client from running a random/insecure command on the cluster via the ``snapshot`` command. An example configuration entry looks like::

    whitelist_binpath = "/bin/snap_create.sh"

* The ``snapshot create binary`` should capture any additional data needed to restore the cluster. Additional data can be stored as tags in cloud environments, or it can be stored in an additional file/directory in the ``datadir`` and then snapshotted. The section :ref:`disk-snapshot-backup-specification` describes the recommended specification of the list of things that can be gathered by the binary.
* The program should return a non-zero status for any failure and zero for success
* If the ``snapshot create binary`` process takes longer than 5 minutes to return a status, it will be killed and the ``snapshot`` command will fail. The 5 minute timeout is configurable and can be set with the ``SNAP_CREATE_MAX_TIMEOUT`` config parameter in :ref:`foundationdb-conf-fdbserver`. Since the default value is large enough, there should not be a need to modify this configuration.

``snapshot`` is a synchronous command; when it returns successfully, the backup is considered complete and restorable. The time it takes to finish a backup is a function of the time it takes to snapshot the disk store. For example, if a disk snapshot takes 1 second, the time to finish a backup should be less than 10 seconds; this is general guidance and in some cases it may take longer. If the command is aborted by the user, the disk snapshots should not be used for restore, because the state of the backup is undefined. If the command fails or aborts, the operator can retry by issuing another ``snapshot`` command.

Example ``snapshot`` command usage::

  fdb> snapshot /bin/snap_create.sh --param1 param1-value --param2 param2-value
  Snapshot command succeeded with UID c50263df28be44ebb596f5c2a849adbb

will invoke the ``snapshot create binary`` on the ``tlog`` role with the following arguments::

  --param1 param1-value --param2 param2-value --path /mnt/circus/data/4502 --version 6.2.6 --role tlog --uid c50263df28be44ebb596f5c2a849adbb

.. _disk-snapshot-backup-specification:

Disk snapshot backup specification
----------------------------------

Details the list of artifacts the ``snapshot create binary`` should gather to aid the restore.

================================ ======================================================== ========================================================
Field Name                       Description                                              Source of information
================================ ======================================================== ========================================================
``UID``                          unique identifier passed with all the                    ``snapshot`` CLI command output contains the UID
                                 snapshot create binary invocations associated with
                                 a backup. Disk snapshots could be tagged with this UID.
``FoundationDB Server Version``  software version of the ``fdbserver``                    command line argument to snap create binary
``CreationTime``                 current system date and time                             time obtained by calling the system time
``FoundationDB Cluster File``    cluster file which has cluster-name, magic and           read from the cluster file location mentioned in the
                                 the list of coordinators; the cluster file is detailed   command line arguments. Command line arguments of
                                 here :ref:`foundationdb-cluster-file`                    ``fdbserver`` can be accessed from /proc/$PPID/cmdline
``Config Knobs``                 command line arguments passed to ``fdbserver``           available from command line arguments of ``fdbserver``
                                                                                          or from foundationdb.conf
``IP Address + Port``            host address and port information of the ``fdbserver``  available from command line arguments of ``fdbserver``
                                 that is invoking the snapshot
``LocalityData``                 machine id, zone id or any other locality information    available from command line arguments of ``fdbserver``
``Name for the snapshot file``   recommended name for the disk snapshot                   cluster-name:ip-addr:port:UID
================================ ======================================================== ========================================================

``snapshot create binary`` will not be invoked on processes which do not have any persistent data (for example, Cluster Controller, Master, or MasterProxy). Since these processes are stateless, there is no need for a snapshot. Any specialized configuration knobs used for one of these stateless processes need to be copied and restored externally.

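As the table notes, a snapshot binary running on Linux can recover the invoking fdbserver's command line from procfs. A one-line illustrative helper (the file is NUL-separated, so the separators are translated to spaces for logging):

    tr '\0' ' ' < "/proc/$PPID/cmdline"
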
Management of disk snapshots
----------------------------

Unused disk snapshots, or disk snapshots that are part of failed backups, have to be deleted by the operator externally.

Error codes
-----------

Error codes returned by the ``snapshot`` command:

======================================= ============ ============================= =============================================================
Name                                    Code         Description                   Comments
======================================= ============ ============================= =============================================================
snap_path_not_whitelisted               2505         Snapshot create binary path   Whitelist the ``snap create binary`` path and retry the
                                                     not whitelisted               operation.
snap_not_fully_recovered_unsupported    2506         Unsupported when the cluster  Wait for the cluster to finish recovery and then retry the
                                                     is not fully recovered        operation
snap_log_anti_quorum_unsupported        2507         Unsupported when log anti     Feature is not supported when log anti quorum is configured
                                                     quorum is configured
snap_with_recovery_unsupported          2508         Cluster recovery during       Recovery happened while the snapshot operation was in
                                                     snapshot operation not        progress; retry the operation.
                                                     supported
snap_storage_failed                     2501         Failed to snapshot storage    Verify that the ``snap create binary`` is installed and
                                                     nodes                         can be executed by the user running ``fdbserver``
snap_tlog_failed                        2502         Failed to snapshot TLog       ,,
                                                     nodes
snap_coord_failed                       2503         Failed to snapshot            ,,
                                                     coordinator nodes
unknown_error                           4000         An unknown error occurred     ,,
snap_disable_tlog_pop_failed            2500         Disk Snapshot error           No operator action is needed, retry the operation
snap_enable_tlog_pop_failed             2504         Disk Snapshot error           ,,
======================================= ============ ============================= =============================================================

Disk snapshot restore steps
===========================

Restore is the process of building up the cluster from the snapshotted disk images. There is no option to specify a restore version, because there is no support for continuous backup. Here is the list of steps for the restore process:

* Identify the snapshot disk images associated with the backup to be restored, with the help of the UID or creation time
* Group disk images of a backup by IP address and/or locality information
* Bring up a new cluster similar to the source cluster with FoundationDB services stopped, and either attach the snapshot disk images or copy the snapshot disk images to the cluster in the following manner:

  * Map the old IP addresses to new IP addresses in a one-to-one fashion and use that mapping to guide the restoration of disk images

* Compute the new fdb.cluster file based on where the new ``coordinators`` disk stores are placed and push it to all the instances in the new cluster
* Start the FoundationDB service on all the instances
* NOTE: A process can have multiple roles with persistent data which share the same ``datadir``. ``snapshot create binary`` will create multiple snapshots, one per role. In such a case, the snapshot disk images need to go through additional processing before restore: if a snapshot image of a role has files that belong to other roles, they need to be deleted.

The cluster will start and get to a healthy state, indicating the completion of the restore. Applications can optionally do any additional validations and use the cluster.

Example backup and restore steps
================================

Here are the backup and restore steps on an oversimplified setup: a single node cluster, using the ``cp`` command to create snapshots and restore. This is purely for illustration; real world backup and restore scripts need to follow all the steps detailed above.

* Create a single node cluster by following the steps here: :ref:`building-cluster`

* Check the status of the cluster and write a few sample keys::

    fdb> status

    Using cluster file `/mnt/source/fdb.cluster'.

    Configuration:
      Redundancy mode        - single
      Storage engine         - ssd-2
      Coordinators           - 1

    Cluster:
      FoundationDB processes - 1
      Zones                  - 1
      Machines               - 1
      Memory availability    - 30.6 GB per process on machine with least available
      Fault Tolerance        - 0 machines
      Server time            - 12/11/19 04:02:57

    Data:
      Replication health     - Healthy
      Moving data            - 0.000 GB
      Sum of key-value sizes - 0 MB
      Disk space used        - 210 MB

    Operating space:
      Storage server         - 72.6 GB free on most full server
      Log server             - 72.6 GB free on most full server

    Workload:
      Read rate              - 9 Hz
      Write rate             - 0 Hz
      Transactions started   - 5 Hz
      Transactions committed - 0 Hz
      Conflict rate          - 0 Hz

    Backup and DR:
      Running backups        - 0
      Running DRs            - 0

    Client time: 12/11/19 04:02:57

    fdb> writemode on
    fdb> set key1 value1
    Committed (76339236)
    fdb> set key2 value2
    Committed (80235963)

* Write a ``snap create binary`` which copies the ``datadir`` to a destination directory passed by the user (note the bash shebang: the ``(( ... ))`` arithmetic test is not POSIX sh)::

    #!/bin/bash

    while (( "$#" )); do
      case "$1" in
        --uid)
          SNAPUID=$2
          shift 2
          ;;
        --path)
          DATADIR=$2
          shift 2
          ;;
        --role)
          ROLE=$2
          shift 2
          ;;
        --destdir)
          DESTDIR=$2
          shift 2
          ;;
        *)
          shift
          ;;
      esac
    done

    mkdir -p "$DESTDIR/$SNAPUID/$ROLE" || exit 1
    cp "$DATADIR/"* "$DESTDIR/$SNAPUID/$ROLE/" || exit 1

    exit 0

* Install the ``snap create binary`` as ``/bin/snap_create.sh``, add the entry for ``whitelist_binpath`` in :ref:`foundationdb-conf-fdbserver`, then stop and start the foundationdb service for the configuration change to take effect
* Issue the ``snapshot`` command as follows::

    fdb> snapshot /bin/snap_create.sh --destdir /mnt/backup
    Snapshot command succeeded with UID 69a5e0576621892f85f55b4ebfeb4312

* The ``snapshot create binary`` gets invoked once for each role, namely ``tlog``, ``storage`` and ``coordinator``, in this process with the following arguments::

    --path /mnt/source/datadir --version 6.2.6 --role storage --uid 69a5e0576621892f85f55b4ebfeb4312 --destdir /mnt/backup
    --path /mnt/source/datadir --version 6.2.6 --role tlog --uid 69a5e0576621892f85f55b4ebfeb4312 --destdir /mnt/backup
    --path /mnt/source/datadir --version 6.2.6 --role coord --uid 69a5e0576621892f85f55b4ebfeb4312 --destdir /mnt/backup

* The snapshot is successful and all the snapshot images are in the ``destdir`` specified by the user in the command line argument to the ``snapshot`` command. Here is a sample directory listing of one of the coordinator backup directories::

    $ ls /mnt/backup/69a5e0576621892f85f55b4ebfeb4312/coord/
    coordination-0.fdq                                     log2-V_3_LS_2-b9990ae9bc00672f07264ad43d9d0792.sqlite-wal  processId
    coordination-1.fdq                                     logqueue-V_3_LS_2-b9990ae9bc00672f07264ad43d9d0792-0.fdq   storage-f0e72cdfed12a233e0e58291150ca597.sqlite
    log2-V_3_LS_2-b9990ae9bc00672f07264ad43d9d0792.sqlite  logqueue-V_3_LS_2-b9990ae9bc00672f07264ad43d9d0792-1.fdq   storage-f0e72cdfed12a233e0e58291150ca597.sqlite-wal

* To restore the ``coordinator`` backup image, set up a restore ``datadir`` and copy all the ``coordinator`` related files to it::

    $ cp /mnt/backup/69a5e0576621892f85f55b4ebfeb4312/coord/coord* /mnt/restore/datadir/

* Repeat the above steps to restore the ``storage`` and ``tlog`` backup images
* Prepare the ``fdb.cluster`` file for the restore with the new ``coordinator`` IP address, for example::

    znC1NC5b:iYHJLq7z@10.2.80.40:4500 -> znC1NC5b:iYHJLq7z@10.2.80.41:4500

* ``foundationdb.conf`` can be an exact copy of the source cluster's for this example
* Once all the backup images are restored, start a new fdbserver with the ``datadir`` pointing to ``/mnt/restore/datadir`` and the new ``fdb.cluster``.
* Verify the cluster is healthy and check that the sample keys we added are there::

    fdb> status

    Using cluster file `/mnt/restore/fdb.cluster'.

    Configuration:
      Redundancy mode        - single
      Storage engine         - ssd-2
      Coordinators           - 1

    Cluster:
      FoundationDB processes - 1
      Zones                  - 1
      Machines               - 1
      Memory availability    - 30.5 GB per process on machine with least available
      Fault Tolerance        - 0 machines
      Server time            - 12/11/19 09:04:53

    Data:
      Replication health     - Healthy
      Moving data            - 0.000 GB
      Sum of key-value sizes - 0 MB
      Disk space used        - 210 MB

    Operating space:
      Storage server         - 72.5 GB free on most full server
      Log server             - 72.5 GB free on most full server

    Workload:
      Read rate              - 7 Hz
      Write rate             - 0 Hz
      Transactions started   - 3 Hz
      Transactions committed - 0 Hz
      Conflict rate          - 0 Hz

    Backup and DR:
      Running backups        - 0
      Running DRs            - 0

    Client time: 12/11/19 09:04:53

    fdb> get key1
    `key1' is `value1'
    fdb> get key2
    `key2' is `value2'

@@ -10,38 +10,38 @@ macOS

 The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.

-* `FoundationDB-6.2.11.pkg <https://www.foundationdb.org/downloads/6.2.11/macOS/installers/FoundationDB-6.2.11.pkg>`_
+* `FoundationDB-6.2.14.pkg <https://www.foundationdb.org/downloads/6.2.14/macOS/installers/FoundationDB-6.2.14.pkg>`_

 Ubuntu
 ------

 The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.

-* `foundationdb-clients-6.2.11-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.11/ubuntu/installers/foundationdb-clients_6.2.11-1_amd64.deb>`_
-* `foundationdb-server-6.2.11-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.11/ubuntu/installers/foundationdb-server_6.2.11-1_amd64.deb>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.14-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.14/ubuntu/installers/foundationdb-clients_6.2.14-1_amd64.deb>`_
+* `foundationdb-server-6.2.14-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.14/ubuntu/installers/foundationdb-server_6.2.14-1_amd64.deb>`_ (depends on the clients package)

 RHEL/CentOS EL6
 ---------------

 The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.

-* `foundationdb-clients-6.2.11-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.11/rhel6/installers/foundationdb-clients-6.2.11-1.el6.x86_64.rpm>`_
-* `foundationdb-server-6.2.11-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.11/rhel6/installers/foundationdb-server-6.2.11-1.el6.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.14-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.14/rhel6/installers/foundationdb-clients-6.2.14-1.el6.x86_64.rpm>`_
+* `foundationdb-server-6.2.14-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.14/rhel6/installers/foundationdb-server-6.2.14-1.el6.x86_64.rpm>`_ (depends on the clients package)

 RHEL/CentOS EL7
 ---------------

 The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.

-* `foundationdb-clients-6.2.11-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.11/rhel7/installers/foundationdb-clients-6.2.11-1.el7.x86_64.rpm>`_
-* `foundationdb-server-6.2.11-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.11/rhel7/installers/foundationdb-server-6.2.11-1.el7.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.14-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.14/rhel7/installers/foundationdb-clients-6.2.14-1.el7.x86_64.rpm>`_
+* `foundationdb-server-6.2.14-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.14/rhel7/installers/foundationdb-server-6.2.14-1.el7.x86_64.rpm>`_ (depends on the clients package)

 Windows
 -------

 The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.

-* `foundationdb-6.2.11-x64.msi <https://www.foundationdb.org/downloads/6.2.11/windows/installers/foundationdb-6.2.11-x64.msi>`_
+* `foundationdb-6.2.14-x64.msi <https://www.foundationdb.org/downloads/6.2.14/windows/installers/foundationdb-6.2.14-x64.msi>`_

 API Language Bindings
 =====================
@@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part

 If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:

-* `foundationdb-6.2.11.tar.gz <https://www.foundationdb.org/downloads/6.2.11/bindings/python/foundationdb-6.2.11.tar.gz>`_
+* `foundationdb-6.2.14.tar.gz <https://www.foundationdb.org/downloads/6.2.14/bindings/python/foundationdb-6.2.14.tar.gz>`_

 Ruby 1.9.3/2.0.0+
 -----------------

-* `fdb-6.2.11.gem <https://www.foundationdb.org/downloads/6.2.11/bindings/ruby/fdb-6.2.11.gem>`_
+* `fdb-6.2.14.gem <https://www.foundationdb.org/downloads/6.2.14/bindings/ruby/fdb-6.2.14.gem>`_

 Java 8+
 -------

-* `fdb-java-6.2.11.jar <https://www.foundationdb.org/downloads/6.2.11/bindings/java/fdb-java-6.2.11.jar>`_
-* `fdb-java-6.2.11-javadoc.jar <https://www.foundationdb.org/downloads/6.2.11/bindings/java/fdb-java-6.2.11-javadoc.jar>`_
+* `fdb-java-6.2.14.jar <https://www.foundationdb.org/downloads/6.2.14/bindings/java/fdb-java-6.2.14.jar>`_
+* `fdb-java-6.2.14-javadoc.jar <https://www.foundationdb.org/downloads/6.2.14/bindings/java/fdb-java-6.2.14-javadoc.jar>`_

 Go 1.11+
 --------

@ -20,6 +20,8 @@ Ready to operate an externally accessible FoundationDB cluster? You'll find what

* :doc:`backups` covers the FoundationDB backup tool, which provides an additional level of protection by supporting recovery from disasters or unintentional modification of the database.

* :doc:`disk-snapshot-backup` covers the disk snapshot based FoundationDB backup tool, an alternative backup solution.

* :doc:`platforms` describes issues on particular platforms that affect the operation of FoundationDB.

.. toctree::

@ -34,4 +36,5 @@ Ready to operate an externally accessible FoundationDB cluster? You'll find what

    mr-status
    tls
    backups
    disk-snapshot-backup
    platforms
@ -138,12 +138,14 @@ Default Peer Verification

The default peer verification is ``Check.Valid=1``.

Default Password
^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^

There is no default password. If no password is specified, it is assumed that the private key is unencrypted.

Parameters and client bindings
------------------------------

Permissions
-----------

All files used by TLS must have sufficient read permissions such that the user running the FoundationDB server or client process can access them. It may also be necessary to have similar read permissions on the parent directories of the files used in the TLS configuration.

Automatic TLS certificate refresh
---------------------------------
@ -1342,7 +1342,7 @@ enumDBType getDBType(std::string dbType)
    return enBackupType;
}

ACTOR Future<std::string> getLayerStatus(Reference<ReadYourWritesTransaction> tr, std::string name, std::string id, enumProgramExe exe, Database dest) {
ACTOR Future<std::string> getLayerStatus(Reference<ReadYourWritesTransaction> tr, std::string name, std::string id, enumProgramExe exe, Database dest, bool snapshot = false) {
    // This process will write a document that looks like this:
    // { backup : { $expires : {<subdoc>}, version: <version from approximately 30 seconds from now> }
    // so that the value under 'backup' will eventually expire to null and thus be ignored by
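The added snapshot parameter, defaulting to false, is threaded down to every read this status actor performs. A snapshot read returns data at the transaction's read version without adding a read conflict range, so assembling the status document cannot make the surrounding transaction conflict with concurrent writers. A minimal sketch of the pattern, using a hypothetical statusKey rather than anything from this change:

    // Sketch only: shows a defaulted snapshot argument reaching the underlying read.
    // statusKey is a hypothetical key used for illustration.
    ACTOR Future<Optional<Value>> readStatus(Reference<ReadYourWritesTransaction> tr, Key statusKey, bool snapshot = false) {
        // With snapshot=true the read adds no read conflict range, so a concurrent
        // write to statusKey will not abort this transaction at commit time.
        Optional<Value> v = wait(tr->get(statusKey, snapshot));
        return v;
    }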
@ -1393,28 +1393,28 @@ ACTOR Future<std::string> getLayerStatus(Reference<ReadYourWritesTransaction> tr
            totalBlobStats.create(p.first + ".$sum") = p.second;

        state FileBackupAgent fba;
        state std::vector<KeyBackedTag> backupTags = wait(getAllBackupTags(tr));
        state std::vector<KeyBackedTag> backupTags = wait(getAllBackupTags(tr, snapshot));
        state std::vector<Future<Version>> tagLastRestorableVersions;
        state std::vector<Future<EBackupState>> tagStates;
        state std::vector<Future<Reference<IBackupContainer>>> tagContainers;
        state std::vector<Future<int64_t>> tagRangeBytes;
        state std::vector<Future<int64_t>> tagLogBytes;
        state Future<Optional<Value>> fBackupPaused = tr->get(fba.taskBucket->getPauseKey());
        state Future<Optional<Value>> fBackupPaused = tr->get(fba.taskBucket->getPauseKey(), snapshot);

        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr->setOption(FDBTransactionOptions::LOCK_AWARE);
        state std::vector<KeyBackedTag>::iterator tag;
        state std::vector<UID> backupTagUids;
        for (tag = backupTags.begin(); tag != backupTags.end(); tag++) {
            UidAndAbortedFlagT uidAndAbortedFlag = wait(tag->getOrThrow(tr));
            UidAndAbortedFlagT uidAndAbortedFlag = wait(tag->getOrThrow(tr, snapshot));
            BackupConfig config(uidAndAbortedFlag.first);
            backupTagUids.push_back(config.getUid());

            tagStates.push_back(config.stateEnum().getOrThrow(tr));
            tagRangeBytes.push_back(config.rangeBytesWritten().getD(tr, false, 0));
            tagLogBytes.push_back(config.logBytesWritten().getD(tr, false, 0));
            tagContainers.push_back(config.backupContainer().getOrThrow(tr));
            tagLastRestorableVersions.push_back(fba.getLastRestorable(tr, StringRef(tag->tagName)));
            tagStates.push_back(config.stateEnum().getOrThrow(tr, snapshot));
            tagRangeBytes.push_back(config.rangeBytesWritten().getD(tr, snapshot, 0));
            tagLogBytes.push_back(config.logBytesWritten().getD(tr, snapshot, 0));
            tagContainers.push_back(config.backupContainer().getOrThrow(tr, snapshot));
            tagLastRestorableVersions.push_back(fba.getLastRestorable(tr, StringRef(tag->tagName), snapshot));
        }

        wait( waitForAll(tagLastRestorableVersions) && waitForAll(tagStates) && waitForAll(tagContainers) && waitForAll(tagRangeBytes) && waitForAll(tagLogBytes) && success(fBackupPaused));

@ -1451,21 +1451,21 @@ ACTOR Future<std::string> getLayerStatus(Reference<ReadYourWritesTransaction> tr
        state Reference<ReadYourWritesTransaction> tr2(new ReadYourWritesTransaction(dest));
        tr2->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr2->setOption(FDBTransactionOptions::LOCK_AWARE);
        state Standalone<RangeResultRef> tagNames = wait(tr2->getRange(dba.tagNames.range(), 10000));
        state Standalone<RangeResultRef> tagNames = wait(tr2->getRange(dba.tagNames.range(), 10000, snapshot));
        state std::vector<Future<Optional<Key>>> backupVersion;
        state std::vector<Future<int>> backupStatus;
        state std::vector<Future<int64_t>> tagRangeBytesDR;
        state std::vector<Future<int64_t>> tagLogBytesDR;
        state Future<Optional<Value>> fDRPaused = tr->get(dba.taskBucket->getPauseKey());
        state Future<Optional<Value>> fDRPaused = tr->get(dba.taskBucket->getPauseKey(), snapshot);

        state std::vector<UID> drTagUids;
        for(int i = 0; i < tagNames.size(); i++) {
            backupVersion.push_back(tr2->get(tagNames[i].value.withPrefix(applyMutationsBeginRange.begin)));
            backupVersion.push_back(tr2->get(tagNames[i].value.withPrefix(applyMutationsBeginRange.begin), snapshot));
            UID tagUID = BinaryReader::fromStringRef<UID>(tagNames[i].value, Unversioned());
            drTagUids.push_back(tagUID);
            backupStatus.push_back(dba.getStateValue(tr2, tagUID));
            tagRangeBytesDR.push_back(dba.getRangeBytesWritten(tr2, tagUID));
            tagLogBytesDR.push_back(dba.getLogBytesWritten(tr2, tagUID));
            backupStatus.push_back(dba.getStateValue(tr2, tagUID, snapshot));
            tagRangeBytesDR.push_back(dba.getRangeBytesWritten(tr2, tagUID, snapshot));
            tagLogBytesDR.push_back(dba.getLogBytesWritten(tr2, tagUID, snapshot));
        }

        wait(waitForAll(backupStatus) && waitForAll(backupVersion) && waitForAll(tagRangeBytesDR) && waitForAll(tagLogBytesDR) && success(fDRPaused));

@ -1618,7 +1618,7 @@ ACTOR Future<Void> statusUpdateActor(Database statusUpdateDest, std::string name
            try {
                tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
                tr->setOption(FDBTransactionOptions::LOCK_AWARE);
                state Future<std::string> futureStatusDoc = getLayerStatus(tr, name, id, exe, taskDest);
                state Future<std::string> futureStatusDoc = getLayerStatus(tr, name, id, exe, taskDest, true);
                wait(cleanupStatus(tr, rootKey, name, id));
                std::string statusdoc = wait(futureStatusDoc);
                tr->set(instanceKey, statusdoc);
@ -2587,6 +2587,27 @@ Future<T> stopNetworkAfter( Future<T> what ) {
    }
}

ACTOR Future<Void> addInterface( std::map<Key,std::pair<Value,ClientLeaderRegInterface>>* address_interface, Reference<FlowLock> connectLock, KeyValue kv) {
    wait(connectLock->take());
    state FlowLock::Releaser releaser(*connectLock);
    state ClientWorkerInterface workerInterf = BinaryReader::fromStringRef<ClientWorkerInterface>(kv.value, IncludeVersion());
    state ClientLeaderRegInterface leaderInterf(workerInterf.address());
    choose {
        when( Optional<LeaderInfo> rep = wait( brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())) ) ) {
            StringRef ip_port = kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key;
            (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf);

            if(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) {
                Key full_ip_port2 = StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString());
                StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) : full_ip_port2;
                (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf);
            }
        }
        when( wait(delay(1.0)) ) {}
    }
    return Void();
}

ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
    state LineNoise& linenoise = *plinenoise;
    state bool intrans = false;
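addInterface bounds its fan-out with a FlowLock sized by CLIENT_KNOBS->CLI_CONNECT_PARALLELISM, and the delay(1.0) branch of the choose abandons workers that never answer the leader request. A minimal sketch of the same bounded-concurrency idiom, assuming a hypothetical doWork actor:

    // Sketch only: a FlowLock limits how many doWork() calls run at once.
    // doWork is a hypothetical actor standing in for the per-item work.
    ACTOR Future<Void> doWorkThrottled(Reference<FlowLock> lock) {
        wait(lock->take());                       // blocks once all permits are taken
        state FlowLock::Releaser releaser(*lock); // returns the permit on every exit path
        wait(doWork());
        return Void();
    }

    // Usage: at most 20 items are processed concurrently.
    // Reference<FlowLock> lock(new FlowLock(20));
    // std::vector<Future<Void>> fs;
    // for(int i = 0; i < items; i++) fs.push_back(doWorkThrottled(lock));
    // wait(waitForAll(fs));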
@ -2597,7 +2618,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
    state bool writeMode = false;

    state std::string clusterConnectString;
    state std::map<Key,Value> address_interface;
    state std::map<Key,std::pair<Value,ClientLeaderRegInterface>> address_interface;

    state FdbOptions globalOptions;
    state FdbOptions activeOptions;

@ -2990,10 +3011,12 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
                getTransaction(db, tr, options, intrans);
                if (tokens.size() == 1) {
                    Standalone<RangeResultRef> kvs = wait( makeInterruptable( tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces"), LiteralStringRef("\xff\xff\xff")), 1) ) );
                    Reference<FlowLock> connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM));
                    std::vector<Future<Void>> addInterfs;
                    for( auto it : kvs ) {
                        auto ip_port = it.key.endsWith(LiteralStringRef(":tls")) ? it.key.removeSuffix(LiteralStringRef(":tls")) : it.key;
                        address_interface[ip_port] = it.value;
                        addInterfs.push_back(addInterface(&address_interface, connectLock, it));
                    }
                    wait( waitForAll(addInterfs) );
                }
                if (tokens.size() == 1 || tokencmp(tokens[1], "list")) {
                    if(address_interface.size() == 0) {

@ -3009,7 +3032,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
                    printf("\n");
                } else if (tokencmp(tokens[1], "all")) {
                    for( auto it : address_interface ) {
                        tr->set(LiteralStringRef("\xff\xff/reboot_worker"), it.second);
                        tr->set(LiteralStringRef("\xff\xff/reboot_worker"), it.second.first);
                    }
                    if (address_interface.size() == 0) {
                        printf("ERROR: no processes to kill. You must run the `kill' command before running `kill all'.\n");

@ -3027,7 +3050,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {

                    if(!is_error) {
                        for(int i = 1; i < tokens.size(); i++) {
                            tr->set(LiteralStringRef("\xff\xff/reboot_worker"), address_interface[tokens[i]]);
                            tr->set(LiteralStringRef("\xff\xff/reboot_worker"), address_interface[tokens[i]].first);
                        }
                        printf("Attempted to kill %zu processes\n", tokens.size() - 1);
                    }

@ -3302,9 +3325,12 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
                getTransaction(db, tr, options, intrans);
                if (tokens.size() == 1) {
                    Standalone<RangeResultRef> kvs = wait( makeInterruptable( tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces"), LiteralStringRef("\xff\xff\xff")), 1) ) );
                    Reference<FlowLock> connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM));
                    std::vector<Future<Void>> addInterfs;
                    for( auto it : kvs ) {
                        address_interface[it.key] = it.value;
                        addInterfs.push_back(addInterface(&address_interface, connectLock, it));
                    }
                    wait( waitForAll(addInterfs) );
                }
                if (tokens.size() == 1 || tokencmp(tokens[1], "list")) {
                    if(address_interface.size() == 0) {

@ -3320,7 +3346,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
                    printf("\n");
                } else if (tokencmp(tokens[1], "all")) {
                    for( auto it : address_interface ) {
                        tr->set(LiteralStringRef("\xff\xff/reboot_and_check_worker"), it.second);
                        tr->set(LiteralStringRef("\xff\xff/reboot_and_check_worker"), it.second.first);
                    }
                    if (address_interface.size() == 0) {
                        printf("ERROR: no processes to check. You must run the `expensive_data_check' command before running `expensive_data_check all'.\n");

@ -3338,7 +3364,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {

                    if(!is_error) {
                        for(int i = 1; i < tokens.size(); i++) {
                            tr->set(LiteralStringRef("\xff\xff/reboot_and_check_worker"), address_interface[tokens[i]]);
                            tr->set(LiteralStringRef("\xff\xff/reboot_and_check_worker"), address_interface[tokens[i]].first);
                        }
                        printf("Attempted to kill and check %zu processes\n", tokens.size() - 1);
                    }
@ -85,10 +85,10 @@
      <PreprocessorDefinitions>TLS_DISABLED;WIN32;_WIN32_WINNT=0x0502;WINVER=0x0502;BOOST_ALL_NO_LIB;NTDDI_VERSION=0x05020000;_DEBUG;_HAS_ITERATOR_DEBUGGING=0;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>..\zookeeper\win32;..\zookeeper\generated;..\zookeeper\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
      <AdditionalOptions>/bigobj @../flow/no_intellisense.opt %(AdditionalOptions)</AdditionalOptions>
      <MinimalRebuild>false</MinimalRebuild>
      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
      <AdditionalOptions> @../flow/no_intellisense.opt %(AdditionalOptions)</AdditionalOptions>
      <LanguageStandard>stdcpp17</LanguageStandard>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>

@ -110,12 +110,12 @@
      <PreprocessorDefinitions>TLS_DISABLED;WIN32;_WIN32_WINNT=0x0502;WINVER=0x0502;BOOST_ALL_NO_LIB;NTDDI_VERSION=0x05020000;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>..\zookeeper\win32;..\zookeeper\generated;..\zookeeper\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
      <AdditionalOptions>/bigobj @../flow/no_intellisense.opt %(AdditionalOptions)</AdditionalOptions>
      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
      <BufferSecurityCheck>false</BufferSecurityCheck>
      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
      <AdditionalOptions> @../flow/no_intellisense.opt %(AdditionalOptions)</AdditionalOptions>
      <LanguageStandard>stdcpp17</LanguageStandard>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@ -333,7 +333,7 @@ public:
    Future<std::string> getStatus(Database cx, bool showErrors, std::string tagName);
    Future<std::string> getStatusJSON(Database cx, std::string tagName);

    Future<Version> getLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName);
    Future<Version> getLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName, bool snapshot = false);
    void setLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName, Version version);

    // stopWhenDone will return when the backup is stopped, if enabled. Otherwise, it

@ -414,23 +414,23 @@ public:

    Future<std::string> getStatus(Database cx, int errorLimit, Key tagName);

    Future<int> getStateValue(Reference<ReadYourWritesTransaction> tr, UID logUid);
    Future<int> getStateValue(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot = false);
    Future<int> getStateValue(Database cx, UID logUid) {
        return runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr){ return getStateValue(tr, logUid); });
    }

    Future<UID> getDestUid(Reference<ReadYourWritesTransaction> tr, UID logUid);
    Future<UID> getDestUid(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot = false);
    Future<UID> getDestUid(Database cx, UID logUid) {
        return runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr){ return getDestUid(tr, logUid); });
    }

    Future<UID> getLogUid(Reference<ReadYourWritesTransaction> tr, Key tagName);
    Future<UID> getLogUid(Reference<ReadYourWritesTransaction> tr, Key tagName, bool snapshot = false);
    Future<UID> getLogUid(Database cx, Key tagName) {
        return runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr){ return getLogUid(tr, tagName); });
    }

    Future<int64_t> getRangeBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid);
    Future<int64_t> getLogBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid);
    Future<int64_t> getRangeBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot = false);
    Future<int64_t> getLogBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot = false);

    // stopWhenDone will return when the backup is stopped, if enabled. Otherwise, it
    // will return when the backup directory is restorable.

@ -543,11 +543,10 @@ class TagUidMap : public KeyBackedMap<std::string, UidAndAbortedFlagT> {
public:
    TagUidMap(const StringRef & prefix) : TagMap(LiteralStringRef("tag->uid/").withPrefix(prefix)), prefix(prefix) {}

    ACTOR static Future<std::vector<KeyBackedTag>> getAll_impl(TagUidMap* tagsMap,
                                                               Reference<ReadYourWritesTransaction> tr);
    ACTOR static Future<std::vector<KeyBackedTag>> getAll_impl(TagUidMap* tagsMap, Reference<ReadYourWritesTransaction> tr, bool snapshot);

    Future<std::vector<KeyBackedTag>> getAll(Reference<ReadYourWritesTransaction> tr) {
        return getAll_impl(this, tr);
    Future<std::vector<KeyBackedTag>> getAll(Reference<ReadYourWritesTransaction> tr, bool snapshot = false) {
        return getAll_impl(this, tr, snapshot);
    }

    Key prefix;

@ -561,12 +560,12 @@ static inline KeyBackedTag makeBackupTag(std::string tagName) {
    return KeyBackedTag(tagName, fileBackupPrefixRange.begin);
}

static inline Future<std::vector<KeyBackedTag>> getAllRestoreTags(Reference<ReadYourWritesTransaction> tr) {
    return TagUidMap(fileRestorePrefixRange.begin).getAll(tr);
static inline Future<std::vector<KeyBackedTag>> getAllRestoreTags(Reference<ReadYourWritesTransaction> tr, bool snapshot = false) {
    return TagUidMap(fileRestorePrefixRange.begin).getAll(tr, snapshot);
}

static inline Future<std::vector<KeyBackedTag>> getAllBackupTags(Reference<ReadYourWritesTransaction> tr) {
    return TagUidMap(fileBackupPrefixRange.begin).getAll(tr);
static inline Future<std::vector<KeyBackedTag>> getAllBackupTags(Reference<ReadYourWritesTransaction> tr, bool snapshot = false) {
    return TagUidMap(fileBackupPrefixRange.begin).getAll(tr, snapshot);
}

class KeyBackedConfig {
@ -2391,28 +2391,28 @@ public:
        return statusText;
    }

    ACTOR static Future<int> getStateValue(DatabaseBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, UID logUid) {
    ACTOR static Future<int> getStateValue(DatabaseBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot) {
        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr->setOption(FDBTransactionOptions::LOCK_AWARE);
        state Key statusKey = backupAgent->states.get(BinaryWriter::toValue(logUid, Unversioned())).pack(DatabaseBackupAgent::keyStateStatus);
        Optional<Value> status = wait(tr->get(statusKey));
        Optional<Value> status = wait(tr->get(statusKey, snapshot));

        return (!status.present()) ? DatabaseBackupAgent::STATE_NEVERRAN : BackupAgentBase::getState(status.get().toString());
    }

    ACTOR static Future<UID> getDestUid(DatabaseBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, UID logUid) {
    ACTOR static Future<UID> getDestUid(DatabaseBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot) {
        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr->setOption(FDBTransactionOptions::LOCK_AWARE);
        state Key destUidKey = backupAgent->config.get(BinaryWriter::toValue(logUid, Unversioned())).pack(BackupAgentBase::destUid);
        Optional<Value> destUid = wait(tr->get(destUidKey));
        Optional<Value> destUid = wait(tr->get(destUidKey, snapshot));

        return (destUid.present()) ? BinaryReader::fromStringRef<UID>(destUid.get(), Unversioned()) : UID();
    }

    ACTOR static Future<UID> getLogUid(DatabaseBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, Key tagName) {
    ACTOR static Future<UID> getLogUid(DatabaseBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, Key tagName, bool snapshot) {
        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr->setOption(FDBTransactionOptions::LOCK_AWARE);
        state Optional<Value> logUid = wait(tr->get(backupAgent->tagNames.pack(tagName)));
        state Optional<Value> logUid = wait(tr->get(backupAgent->tagNames.pack(tagName), snapshot));

        return (logUid.present()) ? BinaryReader::fromStringRef<UID>(logUid.get(), Unversioned()) : UID();
    }

@ -2442,16 +2442,16 @@ Future<std::string> DatabaseBackupAgent::getStatus(Database cx, int errorLimit,
    return DatabaseBackupAgentImpl::getStatus(this, cx, errorLimit, tagName);
}

Future<int> DatabaseBackupAgent::getStateValue(Reference<ReadYourWritesTransaction> tr, UID logUid) {
    return DatabaseBackupAgentImpl::getStateValue(this, tr, logUid);
Future<int> DatabaseBackupAgent::getStateValue(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot) {
    return DatabaseBackupAgentImpl::getStateValue(this, tr, logUid, snapshot);
}

Future<UID> DatabaseBackupAgent::getDestUid(Reference<ReadYourWritesTransaction> tr, UID logUid) {
    return DatabaseBackupAgentImpl::getDestUid(this, tr, logUid);
Future<UID> DatabaseBackupAgent::getDestUid(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot) {
    return DatabaseBackupAgentImpl::getDestUid(this, tr, logUid, snapshot);
}

Future<UID> DatabaseBackupAgent::getLogUid(Reference<ReadYourWritesTransaction> tr, Key tagName) {
    return DatabaseBackupAgentImpl::getLogUid(this, tr, tagName);
Future<UID> DatabaseBackupAgent::getLogUid(Reference<ReadYourWritesTransaction> tr, Key tagName, bool snapshot) {
    return DatabaseBackupAgentImpl::getLogUid(this, tr, tagName, snapshot);
}

Future<Void> DatabaseBackupAgent::waitUpgradeToLatestDrVersion(Database cx, Key tagName) {

@ -2466,10 +2466,10 @@ Future<int> DatabaseBackupAgent::waitSubmitted(Database cx, Key tagName) {
    return DatabaseBackupAgentImpl::waitSubmitted(this, cx, tagName);
}

Future<int64_t> DatabaseBackupAgent::getRangeBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid) {
    return DRConfig(logUid).rangeBytesWritten().getD(tr);
Future<int64_t> DatabaseBackupAgent::getRangeBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot) {
    return DRConfig(logUid).rangeBytesWritten().getD(tr, snapshot);
}

Future<int64_t> DatabaseBackupAgent::getLogBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid) {
    return DRConfig(logUid).logBytesWritten().getD(tr);
Future<int64_t> DatabaseBackupAgent::getLogBytesWritten(Reference<ReadYourWritesTransaction> tr, UID logUid, bool snapshot) {
    return DRConfig(logUid).logBytesWritten().getD(tr, snapshot);
}
@ -97,9 +97,9 @@ StringRef FileBackupAgent::restoreStateText(ERestoreState id) {
template<> Tuple Codec<ERestoreState>::pack(ERestoreState const &val) { return Tuple().append(val); }
template<> ERestoreState Codec<ERestoreState>::unpack(Tuple const &val) { return (ERestoreState)val.getInt(0); }

ACTOR Future<std::vector<KeyBackedTag>> TagUidMap::getAll_impl(TagUidMap *tagsMap, Reference<ReadYourWritesTransaction> tr) {
ACTOR Future<std::vector<KeyBackedTag>> TagUidMap::getAll_impl(TagUidMap *tagsMap, Reference<ReadYourWritesTransaction> tr, bool snapshot) {
    state Key prefix = tagsMap->prefix; // Copying it here as tagsMap lifetime is not tied to this actor
    TagMap::PairsType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6));
    TagMap::PairsType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6, snapshot));
    std::vector<KeyBackedTag> results;
    for(auto &p : tagPairs)
        results.push_back(KeyBackedTag(p.first, prefix));

@ -4202,10 +4202,10 @@ public:
        return statusText;
    }

    ACTOR static Future<Version> getLastRestorable(FileBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, Key tagName) {
    ACTOR static Future<Version> getLastRestorable(FileBackupAgent* backupAgent, Reference<ReadYourWritesTransaction> tr, Key tagName, bool snapshot) {
        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr->setOption(FDBTransactionOptions::LOCK_AWARE);
        state Optional<Value> version = wait(tr->get(backupAgent->lastRestorable.pack(tagName)));
        state Optional<Value> version = wait(tr->get(backupAgent->lastRestorable.pack(tagName), snapshot));

        return (version.present()) ? BinaryReader::fromStringRef<Version>(version.get(), Unversioned()) : 0;
    }

@ -4418,8 +4418,8 @@ Future<std::string> FileBackupAgent::getStatusJSON(Database cx, std::string tagN
    return FileBackupAgentImpl::getStatusJSON(this, cx, tagName);
}

Future<Version> FileBackupAgent::getLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName) {
    return FileBackupAgentImpl::getLastRestorable(this, tr, tagName);
Future<Version> FileBackupAgent::getLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName, bool snapshot) {
    return FileBackupAgentImpl::getLastRestorable(this, tr, tagName, snapshot);
}

void FileBackupAgent::setLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName, Version version) {
@ -45,7 +45,7 @@ ClientKnobs::ClientKnobs(bool randomize) {
    init( COORDINATOR_RECONNECTION_DELAY, 1.0 );
    init( CLIENT_EXAMPLE_AMOUNT, 20 );
    init( MAX_CLIENT_STATUS_AGE, 1.0 );
    init( MAX_CLIENT_PROXY_CONNECTIONS, 5 ); if( randomize && BUGGIFY ) MAX_CLIENT_PROXY_CONNECTIONS = 1;
    init( MAX_PROXY_CONNECTIONS, 5 ); if( randomize && BUGGIFY ) MAX_PROXY_CONNECTIONS = 1;

    // wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin

@ -76,6 +76,7 @@ ClientKnobs::ClientKnobs(bool randomize) {
    init( GET_RANGE_SHARD_LIMIT, 2 );
    init( WARM_RANGE_SHARD_LIMIT, 100 );
    init( STORAGE_METRICS_SHARD_LIMIT, 100 ); if( randomize && BUGGIFY ) STORAGE_METRICS_SHARD_LIMIT = 3;
    init( SHARD_COUNT_LIMIT, 80 ); if( randomize && BUGGIFY ) SHARD_COUNT_LIMIT = 3;
    init( STORAGE_METRICS_UNFAIR_SPLIT_LIMIT, 2.0/3.0 );
    init( STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, 15.0 );
    init( AGGREGATE_HEALTH_METRICS_MAX_STALENESS, 0.5 );

@ -197,6 +198,9 @@ ClientKnobs::ClientKnobs(bool randomize) {
    }
    init(CSI_STATUS_DELAY, 10.0 );

    init( CONSISTENCY_CHECK_RATE_LIMIT_MAX, 50e6 ); // Limit in bytes per sec
    init( CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME, 7 * 24 * 60 * 60 ); // 7 days

    //fdbcli
    init( CLI_CONNECT_PARALLELISM, 20 );
}
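The knob initializer pattern deliberately lets simulation pick pathological values: when randomize && BUGGIFY holds, the knob is overridden with an extreme setting so that rarely-exercised limit-handling paths get tested. A one-line sketch of the idiom with a hypothetical knob:

    // Sketch only: MY_EXAMPLE_LIMIT is a hypothetical knob. Production keeps the
    // tuned default; under simulation BUGGIFY occasionally forces a tiny value.
    init( MY_EXAMPLE_LIMIT, 100 ); if( randomize && BUGGIFY ) MY_EXAMPLE_LIMIT = 1;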
@ -44,7 +44,7 @@ public:
    double COORDINATOR_RECONNECTION_DELAY;
    int CLIENT_EXAMPLE_AMOUNT;
    double MAX_CLIENT_STATUS_AGE;
    int MAX_CLIENT_PROXY_CONNECTIONS;
    int MAX_PROXY_CONNECTIONS;

    // wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin
    double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)

@ -75,6 +75,7 @@ public:
    int GET_RANGE_SHARD_LIMIT;
    int WARM_RANGE_SHARD_LIMIT;
    int STORAGE_METRICS_SHARD_LIMIT;
    int SHARD_COUNT_LIMIT;
    double STORAGE_METRICS_UNFAIR_SPLIT_LIMIT;
    double STORAGE_METRICS_TOO_MANY_SHARDS_DELAY;
    double AGGREGATE_HEALTH_METRICS_MAX_STALENESS;

@ -189,6 +190,9 @@ public:

    int CONSISTENCY_CHECK_RATE_LIMIT_MAX;
    int CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME;

    //fdbcli
    int CLI_CONNECT_PARALLELISM;

    ClientKnobs(bool randomize = false);
};
@ -670,6 +670,25 @@ ACTOR Future<Void> monitorLeaderForProxies( Key clusterKey, vector<NetworkAddres
    }
}

void shrinkProxyList( ClientDBInfo& ni, std::vector<UID>& lastProxyUIDs, std::vector<MasterProxyInterface>& lastProxies ) {
    if(ni.proxies.size() > CLIENT_KNOBS->MAX_PROXY_CONNECTIONS) {
        std::vector<UID> proxyUIDs;
        for(auto& proxy : ni.proxies) {
            proxyUIDs.push_back(proxy.id());
        }
        if(proxyUIDs != lastProxyUIDs) {
            lastProxyUIDs = proxyUIDs;
            lastProxies = ni.proxies;
            deterministicRandom()->randomShuffle(lastProxies);
            lastProxies.resize(CLIENT_KNOBS->MAX_PROXY_CONNECTIONS);
            for(int i = 0; i < lastProxies.size(); i++) {
                TraceEvent("ConnectedProxy").detail("Proxy", lastProxies[i].id());
            }
        }
        ni.proxies = lastProxies;
    }
}

// Leader is the process that will be elected by coordinators as the cluster controller
ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<ClientDBInfo>> clientInfo, MonitorLeaderInfo info, Standalone<VectorRef<ClientVersionRef>> supportedVersions, Key traceLogGroup) {
    state ClusterConnectionString cs = info.intermediateConnFile->getConnectionString();

@ -730,24 +749,8 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration( Reference<ClusterCo
            connFile->notifyConnected();

            auto& ni = rep.get().mutate();
            if(ni.proxies.size() > CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS) {
                std::vector<UID> proxyUIDs;
                for(auto& proxy : ni.proxies) {
                    proxyUIDs.push_back(proxy.id());
                }
                if(proxyUIDs != lastProxyUIDs) {
                    lastProxyUIDs = proxyUIDs;
                    lastProxies = ni.proxies;
                    deterministicRandom()->randomShuffle(lastProxies);
                    lastProxies.resize(CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS);
                    for(int i = 0; i < lastProxies.size(); i++) {
                        TraceEvent("ClientConnectedProxy").detail("Proxy", lastProxies[i].id());
                    }
                }
                ni.proxies = lastProxies;
            }

            clientInfo->set( rep.get().read() );
            shrinkProxyList(ni, lastProxyUIDs, lastProxies);
            clientInfo->set( ni );
            successIdx = idx;
        } else if(idx == successIdx) {
            wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
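Factoring the trimming into shrinkProxyList makes the behavior reusable: the caller keeps at most MAX_PROXY_CONNECTIONS proxies and reshuffles only when the set of proxy UIDs actually changes, so the chosen subset stays sticky across identical updates. A minimal usage sketch, with ni standing in for a freshly received ClientDBInfo:

    // Sketch only: ni is a hypothetical ClientDBInfo just received from a coordinator.
    // The two vectors live across calls (state variables in the real actor) so the
    // selected subset is stable until proxy membership really changes.
    std::vector<UID> lastProxyUIDs;
    std::vector<MasterProxyInterface> lastProxies;
    shrinkProxyList(ni, lastProxyUIDs, lastProxies); // trims ni.proxies in place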
@ -59,6 +59,8 @@ Future<Void> monitorLeaderForProxies( Value const& key, vector<NetworkAddress> c

Future<Void> monitorProxies( Reference<AsyncVar<Reference<ClusterConnectionFile>>> const& connFile, Reference<AsyncVar<ClientDBInfo>> const& clientInfo, Standalone<VectorRef<ClientVersionRef>> const& supportedVersions, Key const& traceLogGroup );

void shrinkProxyList( ClientDBInfo& ni, std::vector<UID>& lastProxyUIDs, std::vector<MasterProxyInterface>& lastProxies );

#pragma region Implementation

Future<Void> monitorLeaderInternal( Reference<ClusterConnectionFile> const& connFile, Reference<AsyncVar<Value>> const& outSerializedLeaderInfo );
@ -1452,6 +1452,17 @@ ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version ) {
    }
}

ACTOR Future<Version> getRawVersion( Database cx ) {
    loop {
        choose {
            when ( wait( cx->onMasterProxiesChanged() ) ) {}
            when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE ), cx->taskID ) ) ) {
                return v.version;
            }
        }
    }
}

ACTOR Future<Void> readVersionBatcher(
    DatabaseContext* cx, FutureStream<std::pair<Promise<GetReadVersionReply>, Optional<UID>>> versionStream,
    uint32_t flags);
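getRawVersion bypasses read-version batching entirely: it asks a proxy for a committed version at system-immediate priority and retries if the proxy set changes mid-request. Transaction::getRawReadVersion (added below) exposes it; a minimal usage sketch, assuming an open Database db:

    // Sketch only: fetches a committed version without the normal GRV batching path.
    // db is assumed to be an open Database handle.
    ACTOR Future<Void> printRawVersion(Database db) {
        state Transaction tr(db);
        Version v = wait(tr.getRawReadVersion());
        printf("raw committed version: %lld\n", (long long)v);
        return Void();
    }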
@ -2132,6 +2143,10 @@ ACTOR Future<Void> watch( Reference<Watch> watch, Database cx, Transaction *self
    return Void();
}

Future<Version> Transaction::getRawReadVersion() {
    return ::getRawVersion(cx);
}

Future< Void > Transaction::watch( Reference<Watch> watch ) {
    return ::watch(watch, cx, this);
}

@ -3214,16 +3229,25 @@ ACTOR Future< StorageMetrics > waitStorageMetricsMultipleLocations(
    }
}

ACTOR Future< StorageMetrics > waitStorageMetrics(
ACTOR Future< StorageMetrics > extractMetrics( Future<std::pair<Optional<StorageMetrics>, int>> fMetrics ) {
    std::pair<Optional<StorageMetrics>, int> x = wait(fMetrics);
    return x.first.get();
}

ACTOR Future< std::pair<Optional<StorageMetrics>, int> > waitStorageMetrics(
    Database cx,
    KeyRange keys,
    StorageMetrics min,
    StorageMetrics max,
    StorageMetrics permittedError,
    int shardLimit )
    int shardLimit,
    int expectedShardCount )
{
    loop {
        vector< pair<KeyRange, Reference<LocationInfo>> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) );
        if(expectedShardCount >= 0 && locations.size() != expectedShardCount) {
            return std::make_pair(Optional<StorageMetrics>(), locations.size());
        }

        //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this.
        if(locations.size() < shardLimit) {

@ -3236,7 +3260,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics(
                    fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution );
                }
                StorageMetrics x = wait(fx);
                return x;
                return std::make_pair(x,-1);
            } catch (Error& e) {
                if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
                    TraceEvent(SevError, "WaitStorageMetricsError").error(e);

@ -3257,20 +3281,21 @@ ACTOR Future< StorageMetrics > waitStorageMetrics(
    }
}

Future< StorageMetrics > Transaction::waitStorageMetrics(
Future< std::pair<Optional<StorageMetrics>, int> > Transaction::waitStorageMetrics(
    KeyRange const& keys,
    StorageMetrics const& min,
    StorageMetrics const& max,
    StorageMetrics const& permittedError,
    int shardLimit )
    int shardLimit,
    int expectedShardCount )
{
    return ::waitStorageMetrics( cx, keys, min, max, permittedError, shardLimit );
    return ::waitStorageMetrics( cx, keys, min, max, permittedError, shardLimit, expectedShardCount );
}

Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, int shardLimit ) {
    StorageMetrics m;
    m.bytes = -1;
    return ::waitStorageMetrics( cx, keys, StorageMetrics(), m, StorageMetrics(), shardLimit );
    return extractMetrics( ::waitStorageMetrics( cx, keys, StorageMetrics(), m, StorageMetrics(), shardLimit, -1 ) );
}

ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated )
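Returning std::pair<Optional<StorageMetrics>, int> lets callers distinguish the two outcomes: if the observed shard count matches expectedShardCount (or the caller passed -1 to skip the check), the Optional holds the metrics and the int is -1; otherwise the Optional is empty and the int reports the shard count actually seen. A caller-side sketch, assuming a Transaction tr, a KeyRange keys, and metric bounds already in scope:

    // Sketch only: branching on the two outcomes of the new waitStorageMetrics API.
    // tr, keys, maxMetrics, and shardLimit are assumed to be in scope.
    std::pair<Optional<StorageMetrics>, int> res = wait(tr.waitStorageMetrics(
        keys, StorageMetrics(), maxMetrics, StorageMetrics(), shardLimit, /*expectedShardCount*/ 4));
    if (res.first.present()) {
        StorageMetrics m = res.first.get(); // shard count matched; metrics are valid
    } else {
        int actualShards = res.second;      // layout changed; the caller can re-plan
    }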
@ -211,6 +211,7 @@ public:

    void setVersion( Version v );
    Future<Version> getReadVersion() { return getReadVersion(0); }
    Future<Version> getRawReadVersion();

    [[nodiscard]] Future<Optional<Value>> get(const Key& key, bool snapshot = false);
    [[nodiscard]] Future<Void> watch(Reference<Watch> watch);

@ -241,7 +242,7 @@ public:

    Future< Void > warmRange( Database cx, KeyRange keys );

    Future< StorageMetrics > waitStorageMetrics( KeyRange const& keys, StorageMetrics const& min, StorageMetrics const& max, StorageMetrics const& permittedError, int shardLimit );
    Future< std::pair<Optional<StorageMetrics>, int> > waitStorageMetrics( KeyRange const& keys, StorageMetrics const& min, StorageMetrics const& max, StorageMetrics const& permittedError, int shardLimit, int expectedShardCount );
    Future< StorageMetrics > getStorageMetrics( KeyRange const& keys, int shardLimit );
    Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( KeyRange const& keys, StorageMetrics const& limit, StorageMetrics const& estimated );
@ -282,15 +282,13 @@ struct GetShardStateRequest {

struct StorageMetrics {
    constexpr static FileIdentifier file_identifier = 13622226;
    int64_t bytes; // total storage
    int64_t bytesPerKSecond; // network bandwidth (average over 10s)
    int64_t iosPerKSecond;
    int64_t bytesReadPerKSecond;
    int64_t bytes = 0; // total storage
    int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s)
    int64_t iosPerKSecond = 0;
    int64_t bytesReadPerKSecond = 0;

    static const int64_t infinity = 1LL<<60;

    StorageMetrics() : bytes(0), bytesPerKSecond(0), iosPerKSecond(0), bytesReadPerKSecond(0) {}

    bool allLessOrEqual( const StorageMetrics& rhs ) const {
        return bytes <= rhs.bytes && bytesPerKSecond <= rhs.bytesPerKSecond && iosPerKSecond <= rhs.iosPerKSecond &&
               bytesReadPerKSecond <= rhs.bytesReadPerKSecond;
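Switching to default member initializers means every constructor, including compiler-generated copy and default constructors, starts from zeroed counters, which is why the hand-written zeroing constructor can be deleted. A small standalone illustration of the idiom:

    // Sketch only: default member initializers replace a hand-written zeroing
    // constructor; `Counters c;` is now fully zeroed with no explicit Counters().
    struct Counters {
        int64_t bytes = 0;
        int64_t ops = 0;
    };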
@ -416,7 +416,7 @@ public:
        ++ctx.countAIOSubmit;

        double elapsed = timer_monotonic() - begin;
        g_network->networkMetrics.secSquaredSubmit += elapsed*elapsed/2;
        g_network->networkInfo.metrics.secSquaredSubmit += elapsed*elapsed/2;

        //TraceEvent("Launched").detail("N", rc).detail("Queued", ctx.queue.size()).detail("Elapsed", elapsed).detail("Outstanding", ctx.outstanding+rc);
        //printf("launched: %d/%d in %f us (%d outstanding; lowest prio %d)\n", rc, ctx.queue.size(), elapsed*1e6, ctx.outstanding + rc, toStart[n-1]->getTask());

@ -672,7 +672,7 @@ private:
            double t = timer_monotonic();
            double elapsed = t - ctx.ioStallBegin;
            ctx.ioStallBegin = t;
            g_network->networkMetrics.secSquaredDiskStall += elapsed*elapsed/2;
            g_network->networkInfo.metrics.secSquaredDiskStall += elapsed*elapsed/2;
        }

        ctx.outstanding -= n;
@ -73,9 +73,9 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
    // for an endpoint that is waited on changes, the waiter sees its failure status change
    auto it = addressStatus.find(address);

    //TraceEvent("NotifyFailureStatus").detail("Address", address).detail("Status", status.failed ? "Failed" : "OK").detail("Present", it == addressStatus.end());
    if (it == addressStatus.end()) {
        if (status != FailureStatus()) {
            TraceEvent("NotifyAddressHealthy").suppressFor(1.0).detail("Address", address);
            addressStatus[address]=status;
            endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
        }

@ -85,8 +85,14 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
            it->second = status;
        else
            addressStatus.erase(it);
        if(triggerEndpoint)
        if(triggerEndpoint) {
            if(status.failed) {
                TraceEvent("NotifyAddressFailed").suppressFor(1.0).detail("Address", address);
            } else {
                TraceEvent("NotifyAddressHealthyPresent").suppressFor(1.0).detail("Address", address);
            }
            endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
        }
    }
}
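The new NotifyAddress* events use suppressFor(1.0), so a flapping peer produces at most roughly one trace line per second per event type instead of one line per status transition. A one-line sketch of the rate-limited trace idiom, assuming a NetworkAddress addr:

    // Sketch only: repeats of this event type within one second are suppressed,
    // keeping hot status-change paths from flooding the trace log.
    TraceEvent("ExampleStatusChanged").suppressFor(1.0).detail("Address", addr);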
@ -1106,6 +1106,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
        return;

    Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());

    if(peer->peerReferences == -1) {
        IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
        peer->peerReferences = 1;
@ -185,6 +185,7 @@ Future< REPLY_TYPE(Request) > loadBalance(
    state Future<Void> secondDelay = Never();

    state Promise<Void> requestFinished;
    state double startTime = now();

    setReplyPriority(request, taskID);
    if (!alternatives)

@ -278,6 +279,22 @@ Future< REPLY_TYPE(Request) > loadBalance(
    state double backoff = 0;
    state bool triedAllOptions = false;
    loop {
        if(now() - startTime > (g_network->isSimulated() ? 30.0 : 600.0)) {
            TraceEvent ev(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LoadBalanceTooLong");
            ev.suppressFor(1.0);
            ev.detail("Duration", now() - startTime);
            ev.detail("NumAttempts", numAttempts);
            ev.detail("Backoff", backoff);
            ev.detail("TriedAllOptions", triedAllOptions);
            if(ev.isEnabled()) {
                ev.log();
                for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {
                    RequestStream<Request> const* thisStream = &alternatives->get( alternativeNum, channel );
                    TraceEvent(SevWarn, "LoadBalanceTooLongEndpoint").detail("Addr", thisStream->getEndpoint().getPrimaryAddress()).detail("Token", thisStream->getEndpoint().token).detail("Failed", IFailureMonitor::failureMonitor().getState( thisStream->getEndpoint() ).failed);
                }
            }
        }

        // Find an alternative, if any, that is not failed, starting with nextAlt
        state RequestStream<Request> const* stream = NULL;
        for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {

@ -304,28 +321,28 @@ Future< REPLY_TYPE(Request) > loadBalance(
        }

        if(!alternatives->alwaysFresh()) {
            if(now() - g_network->networkMetrics.newestAlternativesFailure > FLOW_KNOBS->ALTERNATIVES_FAILURE_RESET_TIME) {
                g_network->networkMetrics.oldestAlternativesFailure = now();
            if(now() - g_network->networkInfo.newestAlternativesFailure > FLOW_KNOBS->ALTERNATIVES_FAILURE_RESET_TIME) {
                g_network->networkInfo.oldestAlternativesFailure = now();
            }

            double delay = FLOW_KNOBS->ALTERNATIVES_FAILURE_MIN_DELAY;
            if(now() - g_network->networkMetrics.lastAlternativesFailureSkipDelay > FLOW_KNOBS->ALTERNATIVES_FAILURE_SKIP_DELAY) {
                g_network->networkMetrics.lastAlternativesFailureSkipDelay = now();
            if(now() - g_network->networkInfo.lastAlternativesFailureSkipDelay > FLOW_KNOBS->ALTERNATIVES_FAILURE_SKIP_DELAY) {
                g_network->networkInfo.lastAlternativesFailureSkipDelay = now();
            } else {
                double elapsed = now()-g_network->networkMetrics.oldestAlternativesFailure;
                double elapsed = now()-g_network->networkInfo.oldestAlternativesFailure;
                delay = std::max(delay, std::min(elapsed*FLOW_KNOBS->ALTERNATIVES_FAILURE_DELAY_RATIO, FLOW_KNOBS->ALTERNATIVES_FAILURE_MAX_DELAY));
                delay = std::max(delay, std::min(elapsed*FLOW_KNOBS->ALTERNATIVES_FAILURE_SLOW_DELAY_RATIO, FLOW_KNOBS->ALTERNATIVES_FAILURE_SLOW_MAX_DELAY));
            }

            // Making this SevWarn means a lot of clutter
            if(now() - g_network->networkMetrics.newestAlternativesFailure > 1 || deterministicRandom()->random01() < 0.01) {
            if(now() - g_network->networkInfo.newestAlternativesFailure > 1 || deterministicRandom()->random01() < 0.01) {
                TraceEvent("AllAlternativesFailed")
                    .detail("Interval", FLOW_KNOBS->CACHE_REFRESH_INTERVAL_WHEN_ALL_ALTERNATIVES_FAILED)
                    .detail("Alternatives", alternatives->description())
                    .detail("Delay", delay);
            }

            g_network->networkMetrics.newestAlternativesFailure = now();
            g_network->networkInfo.newestAlternativesFailure = now();

            choose {
                when ( wait( quorum( ok, 1 ) ) ) {}
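The backoff now has two terms: the original fast-growing term capped at ALTERNATIVES_FAILURE_MAX_DELAY, plus a new slow-growing term capped at ALTERNATIVES_FAILURE_SLOW_MAX_DELAY, so very long outages keep increasing the delay after the fast term saturates. A worked sketch with illustrative numbers (these knob values are assumptions, not the shipped defaults):

    // Sketch only: illustrative values, not the shipped knob defaults.
    double minDelay  = 0.05;  // ALTERNATIVES_FAILURE_MIN_DELAY
    double ratio     = 0.2;   // ALTERNATIVES_FAILURE_DELAY_RATIO
    double maxDelay  = 1.0;   // ALTERNATIVES_FAILURE_MAX_DELAY
    double slowRatio = 0.001; // ALTERNATIVES_FAILURE_SLOW_DELAY_RATIO
    double slowMax   = 30.0;  // ALTERNATIVES_FAILURE_SLOW_MAX_DELAY
    double elapsed   = 120.0; // seconds since oldestAlternativesFailure

    double delay = minDelay;
    delay = std::max(delay, std::min(elapsed * ratio,     maxDelay)); // fast term: capped at 1.0
    delay = std::max(delay, std::min(elapsed * slowRatio, slowMax));  // slow term: 0.12 here
    // With these numbers delay == 1.0; the slow term only dominates once
    // elapsed exceeds maxDelay / slowRatio = 1000 seconds.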
@ -71,11 +71,29 @@ static int recv_func(void* ctx, uint8_t* buf, int len) {
}

ACTOR static Future<Void> handshake( TLSConnection* self ) {
    state std::pair<IPAddress,uint16_t> peerIP = std::make_pair(self->conn->getPeerAddress().ip, self->is_client ? self->conn->getPeerAddress().port : static_cast<uint16_t>(0));
    if(!self->is_client) {
        auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP));
        if(iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) {
            if (now() < iter->second) {
                TraceEvent("TLSIncomingConnectionThrottlingWarning", self->getDebugID()).suppressFor(1.0).detail("PeerIP", peerIP.first.toString());
                wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT));
                throw connection_failed();
            } else {
                g_network->networkInfo.serverTLSConnectionThrottler.erase(peerIP);
            }
        }
    }

    loop {
        int r = self->session->handshake();
        if(BUGGIFY_WITH_PROB(0.001)) {
            r = ITLSSession::FAILED;
        }
        if ( r == ITLSSession::SUCCESS ) break;
        if ( r == ITLSSession::FAILED ) {
            TraceEvent("TLSConnectionHandshakeError", self->getDebugID()).suppressFor(1.0).detail("Peer", self->getPeerAddress());
            g_network->networkInfo.serverTLSConnectionThrottler[peerIP] = now() + (self->is_client ? FLOW_KNOBS->TLS_CLIENT_CONNECTION_THROTTLE_TIMEOUT : FLOW_KNOBS->TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT);
            throw connection_failed();
        }
        ASSERT( r == ITLSSession::WANT_WRITE || r == ITLSSession::WANT_READ );

@ -87,7 +105,7 @@ ACTOR static Future<Void> handshake( TLSConnection* self ) {
    return Void();
}

TLSConnection::TLSConnection( Reference<IConnection> const& conn, Reference<ITLSPolicy> const& policy, bool is_client, std::string host) : conn(conn), write_wants(0), read_wants(0), uid(conn->getDebugID()) {
TLSConnection::TLSConnection( Reference<IConnection> const& conn, Reference<ITLSPolicy> const& policy, bool is_client, std::string host) : conn(conn), write_wants(0), read_wants(0), uid(conn->getDebugID()), is_client(is_client) {
    const char * serverName = host.empty() ? NULL : host.c_str();
    session = Reference<ITLSSession>( policy->create_session(is_client, serverName, send_func, this, recv_func, this, (void*)&uid) );
    if ( !session ) {

@ -169,9 +187,25 @@ TLSNetworkConnections::TLSNetworkConnections( Reference<TLSOptions> options ) :
    g_network->setGlobal(INetwork::enumGlobal::enNetworkConnections, (flowGlobalType) this);
}

ACTOR Future<Reference<IConnection>> waitAndFailConnection() {
    wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT));
    throw connection_failed();
}

Future<Reference<IConnection>> TLSNetworkConnections::connect( NetworkAddress toAddr, std::string host) {
    if ( toAddr.isTLS() ) {
        NetworkAddress clearAddr( toAddr.ip, toAddr.port, toAddr.isPublic(), false );
        std::pair<IPAddress,uint16_t> peerIP = std::make_pair(toAddr.ip, toAddr.port);
        auto iter(g_network->networkInfo.serverTLSConnectionThrottler.find(peerIP));
        if(iter != g_network->networkInfo.serverTLSConnectionThrottler.end()) {
            if (now() < iter->second) {
                TraceEvent("TLSOutgoingConnectionThrottlingWarning").suppressFor(1.0).detail("PeerIP", toAddr);
                return waitAndFailConnection();
            } else {
                g_network->networkInfo.serverTLSConnectionThrottler.erase(peerIP);
            }
        }

        TraceEvent("TLSConnectionConnecting").suppressFor(1.0).detail("ToAddr", toAddr);
        // For FDB<->FDB connections, we don't have hostnames and can't verify IP
        // addresses against certificates, so we have our own peer verifying logic
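Both directions share one throttling structure: serverTLSConnectionThrottler maps a peer (its IP, plus the port for outgoing connections) to a wall-clock deadline before which new TLS connections involving that peer are refused, and the entry is erased the first time it is consulted after expiring. A minimal sketch of the time-keyed deny list, with the surrounding types assumed:

    // Sketch only: a self-expiring, time-keyed deny list in the style of
    // serverTLSConnectionThrottler. IPAddress and now() are assumed.
    std::map<std::pair<IPAddress, uint16_t>, double> throttler;

    bool allowConnection(const std::pair<IPAddress, uint16_t>& peer) {
        auto it = throttler.find(peer);
        if (it != throttler.end()) {
            if (now() < it->second) return false; // still inside the penalty window
            throttler.erase(it);                  // window expired; forget the peer
        }
        return true;
    }

    void penalize(const std::pair<IPAddress, uint16_t>& peer, double timeoutSeconds) {
        throttler[peer] = now() + timeoutSeconds; // e.g. after a failed handshake
    }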
@ -36,6 +36,7 @@ struct TLSConnection : IConnection, ReferenceCounted<TLSConnection> {
    int write_wants, read_wants;

    UID uid;
    bool is_client;

    virtual void addref() { ReferenceCounted<TLSConnection>::addref(); }
    virtual void delref() { ReferenceCounted<TLSConnection>::delref(); }
@ -95,6 +95,8 @@ public:
    struct DBInfo {
        Reference<AsyncVar<ClientDBInfo>> clientInfo;
        Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> serverInfo;
        CachedSerialization<ServerDBInfo> serverInfoMasterOnly;
        std::set<NetworkAddress> requiredAddresses;
        ProcessIssuesMap workersWithIssues;
        std::map<NetworkAddress, double> incompatibleConnections;
        AsyncTrigger forceMasterFailure;

@ -117,6 +119,12 @@ public:
        {
        }

        void addRequiredAddresses(const std::vector<WorkerInterface>& interfaces) {
            for(auto& it : interfaces) {
                requiredAddresses.insert(it.address());
            }
        }

        void setDistributor(const DataDistributorInterface& interf) {
            CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
            auto& newInfo = newInfoCache.mutate();

@ -753,9 +761,16 @@ public:
        for(int i = 0; i < proxies.size(); i++)
            result.proxies.push_back(proxies[i].interf);

        auto oldLogRouters = getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, id_used );
        for(int i = 0; i < oldLogRouters.size(); i++) {
            result.oldLogRouters.push_back(oldLogRouters[i].interf);
        if(req.maxOldLogRouters > 0) {
            if(tlogs.size() == 1) {
                result.oldLogRouters.push_back(tlogs[0].interf);
            } else {
                for(int i = 0; i < tlogs.size(); i++) {
                    if(tlogs[i].interf.locality.processId() != clusterControllerProcessId) {
                        result.oldLogRouters.push_back(tlogs[i].interf);
                    }
                }
            }
        }

        if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY &&

@ -837,6 +852,18 @@ public:
            result.tLogs.push_back(tlogs[i].interf);
        }

        if(req.maxOldLogRouters > 0) {
            if(tlogs.size() == 1) {
                result.oldLogRouters.push_back(tlogs[0].interf);
            } else {
                for(int i = 0; i < tlogs.size(); i++) {
                    if(tlogs[i].interf.locality.processId() != clusterControllerProcessId) {
                        result.oldLogRouters.push_back(tlogs[i].interf);
                    }
                }
            }
        }

        if(req.recruitSeedServers) {
            auto primaryStorageServers = getWorkersForSeedServers( req.configuration, req.configuration.storagePolicy );
            for(int i = 0; i < primaryStorageServers.size(); i++)

@ -871,11 +898,6 @@ public:
                result.resolvers.push_back(resolvers[i].interf);
            for(int i = 0; i < proxies.size(); i++)
                result.proxies.push_back(proxies[i].interf);

            auto oldLogRouters = getWorkersForRoleInDatacenter( dcId, ProcessClass::LogRouter, req.maxOldLogRouters, req.configuration, used );
            for(int i = 0; i < oldLogRouters.size(); i++) {
                result.oldLogRouters.push_back(oldLogRouters[i].interf);
            }
            break;
        } else {
            if(fitness < bestFitness) {

@ -1148,7 +1170,7 @@ public:
        std::transform(newTLogs.begin(), newTLogs.end(), std::back_inserter(exclusionWorkerIds), fn);
        std::transform(newSatelliteTLogs.begin(), newSatelliteTLogs.end(), std::back_inserter(exclusionWorkerIds), fn);
        RoleFitness newRemoteTLogFit(
            (db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::FULLY_RECOVERED) ?
            (db.config.usableRegions > 1 && (dbi.recoveryState == RecoveryState::ALL_LOGS_RECRUITED || dbi.recoveryState == RecoveryState::FULLY_RECOVERED)) ?
            getWorkersForTlogs(db.config, db.config.getRemoteTLogReplicationFactor(), db.config.getDesiredRemoteLogs(), db.config.getRemoteTLogPolicy(), id_used, true, remoteDC, exclusionWorkerIds)
            : remote_tlogs, ProcessClass::TLog);
        if(oldRemoteTLogFit < newRemoteTLogFit) return false;

@ -1308,13 +1330,12 @@ public:
        serversFailed("ServersFailed", clusterControllerMetrics),
        serversUnfailed("ServersUnfailed", clusterControllerMetrics)
    {
        CachedSerialization<ServerDBInfo> newInfoCache = db.serverInfo->get();
        auto& serverInfo = newInfoCache.mutate();
        auto& serverInfo = db.serverInfoMasterOnly.mutate();
        serverInfo.id = deterministicRandom()->randomUniqueID();
        serverInfo.masterLifetime.ccID = id;
        serverInfo.clusterInterface = ccInterface;
        serverInfo.myLocality = locality;
        db.serverInfo->set( newInfoCache );
        db.serverInfo->set( db.serverInfoMasterOnly );
        cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true);
    }

@ -1369,8 +1390,8 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
    db->masterRegistrationCount = 0;
    db->recoveryStalled = false;

    auto cachedInfo = CachedSerialization<ServerDBInfo>();
    auto& dbInfo = cachedInfo.mutate();
    db->serverInfoMasterOnly = CachedSerialization<ServerDBInfo>();
    auto& dbInfo = db->serverInfoMasterOnly.mutate();

    dbInfo.master = iMaster;
    dbInfo.id = deterministicRandom()->randomUniqueID();

@ -1383,7 +1404,8 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
    dbInfo.latencyBandConfig = db->serverInfo->get().read().latencyBandConfig;

    TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
    db->serverInfo->set( cachedInfo );
    db->requiredAddresses.clear();
    db->serverInfo->set( db->serverInfoMasterOnly );

    state Future<Void> spinDelay = delay(SERVER_KNOBS->MASTER_SPIN_DELAY); // Don't retry master recovery more than once per second, but don't delay the "first" recovery after more than a second of normal operation

@ -1422,12 +1444,17 @@ ACTOR Future<Void> clusterGetServerInfo(ClusterControllerData::DBInfo* db, UID k
                                        std::vector<NetworkAddress> incompatiblePeers,
                                        ReplyPromise<CachedSerialization<ServerDBInfo>> reply) {
    state Optional<UID> issueID;
    state bool useMasterOnly = false;
    setIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issues, issueID);
    for(auto it : incompatiblePeers) {
        db->incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
    }

    while (db->serverInfo->get().read().id == knownServerInfoID) {
    loop {
        useMasterOnly = db->serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS && !db->requiredAddresses.count(reply.getEndpoint().getPrimaryAddress());
        if((useMasterOnly ? db->serverInfoMasterOnly.read().id : db->serverInfo->get().read().id) != knownServerInfoID) {
            break;
        }
        choose {
            when (wait( yieldedFuture(db->serverInfo->onChange()) )) {}
            when (wait( delayJittered( 300 ) )) { break; } // The server might be long gone!

@ -1436,7 +1463,7 @@ ACTOR Future<Void> clusterGetServerInfo(ClusterControllerData::DBInfo* db, UID k

    removeIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issueID);

    reply.send( db->serverInfo->get() );
    reply.send( useMasterOnly ? db->serverInfoMasterOnly : db->serverInfo->get() );
    return Void();
}
|
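The cluster controller now keeps two ServerDBInfo snapshots: a trimmed master-only copy published early in recovery, and the full copy served once a worker's address has been registered as required. The loop above picks which snapshot a given worker may see and blocks until that snapshot's id changes. A minimal standalone sketch of the same pattern, with a plain mutex and condition variable standing in for AsyncVar and flow futures (all types and names here are illustrative, not the FDB interfaces):

    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    struct Snapshot { uint64_t id = 0; bool recovered = false; };

    struct Broker {
        std::mutex m;
        std::condition_variable changed;
        Snapshot full;        // stands in for the complete ServerDBInfo
        Snapshot masterOnly;  // trimmed copy published while recovering

        // Block until the snapshot this client is allowed to see differs
        // from what it already has, then hand that snapshot back.
        Snapshot getServerInfo(uint64_t knownId, bool addressRequired) {
            std::unique_lock<std::mutex> lk(m);
            for (;;) {
                bool useMasterOnly = !full.recovered && !addressRequired;
                const Snapshot& visible = useMasterOnly ? masterOnly : full;
                if (visible.id != knownId) return visible;
                changed.wait(lk); // publisher notifies on either snapshot changing
            }
        }
    };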
@@ -1461,7 +1488,14 @@ void checkOutstandingRecruitmentRequests( ClusterControllerData* self ) {
for( int i = 0; i < self->outstandingRecruitmentRequests.size(); i++ ) {
RecruitFromConfigurationRequest& req = self->outstandingRecruitmentRequests[i];
try {
req.reply.send( self->findWorkersForConfiguration( req ) );
RecruitFromConfigurationReply rep = self->findWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.oldLogRouters);
self->db.addRequiredAddresses(rep.proxies);
self->db.addRequiredAddresses(rep.resolvers);
self->db.addRequiredAddresses(rep.satelliteTLogs);
self->db.addRequiredAddresses(rep.tLogs);
self->db.serverInfo->trigger();
req.reply.send( rep );
swapAndPop( &self->outstandingRecruitmentRequests, i-- );
} catch (Error& e) {
if (e.code() == error_code_no_more_servers || e.code() == error_code_operation_failed) {

@@ -1478,7 +1512,11 @@ void checkOutstandingRemoteRecruitmentRequests( ClusterControllerData* self ) {
for( int i = 0; i < self->outstandingRemoteRecruitmentRequests.size(); i++ ) {
RecruitRemoteFromConfigurationRequest& req = self->outstandingRemoteRecruitmentRequests[i];
try {
req.reply.send( self->findRemoteWorkersForConfiguration( req ) );
RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.remoteTLogs);
self->db.addRequiredAddresses(rep.logRouters);
self->db.serverInfo->trigger();
req.reply.send( rep );
swapAndPop( &self->outstandingRemoteRecruitmentRequests, i-- );
} catch (Error& e) {
if (e.code() == error_code_no_more_servers || e.code() == error_code_operation_failed) {

@@ -1890,7 +1928,14 @@ ACTOR Future<Void> clusterRecruitFromConfiguration( ClusterControllerData* self,
TEST(true); //ClusterController RecruitTLogsRequest
loop {
try {
req.reply.send( self->findWorkersForConfiguration( req ) );
auto rep = self->findWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.oldLogRouters);
self->db.addRequiredAddresses(rep.proxies);
self->db.addRequiredAddresses(rep.resolvers);
self->db.addRequiredAddresses(rep.satelliteTLogs);
self->db.addRequiredAddresses(rep.tLogs);
self->db.serverInfo->trigger();
req.reply.send( rep );
return Void();
} catch (Error& e) {
if (e.code() == error_code_no_more_servers && now() - self->startTime >= SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY) {

@@ -1914,7 +1959,11 @@ ACTOR Future<Void> clusterRecruitRemoteFromConfiguration( ClusterControllerData*
TEST(true); //ClusterController RecruitTLogsRequest
loop {
try {
req.reply.send( self->findRemoteWorkersForConfiguration( req ) );
RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.remoteTLogs);
self->db.addRequiredAddresses(rep.logRouters);
self->db.serverInfo->trigger();
req.reply.send( rep );
return Void();
} catch (Error& e) {
if (e.code() == error_code_no_more_servers && self->remoteStartTime.present() && now() - self->remoteStartTime.get() >= SERVER_KNOBS->WAIT_FOR_GOOD_REMOTE_RECRUITMENT_DELAY) {
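All four recruitment paths above now share one ordering: compute the reply, register every recruited worker's address as required (so those workers begin receiving the full ServerDBInfo), re-trigger the broadcast, and only then answer the requester. A self-contained sketch of that ordering with hypothetical simplified types:

    #include <functional>
    #include <set>
    #include <string>
    #include <vector>

    struct Reply { std::vector<std::string> workerAddresses; };

    struct Db {
        std::set<std::string> requiredAddresses;
        std::function<void()> broadcast; // stands in for serverInfo->trigger()

        void addRequiredAddresses(const std::vector<std::string>& addrs) {
            requiredAddresses.insert(addrs.begin(), addrs.end());
        }
    };

    // Register the recruited addresses and re-trigger the broadcast before
    // the requester ever sees the reply, so no recruited worker can act on
    // the reply while still receiving only the trimmed snapshot.
    void answerRecruitment(Db& db, Reply rep, const std::function<void(Reply)>& send) {
        db.addRequiredAddresses(rep.workerAddresses);
        db.broadcast();
        send(std::move(rep));
    }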
@@ -63,12 +63,12 @@ struct ClusterControllerFullInterface {

void initEndpoints() {
clientInterface.initEndpoints();
recruitFromConfiguration.getEndpoint( TaskPriority::ClusterController );
recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterController );
recruitFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitStorage.getEndpoint( TaskPriority::ClusterController );
registerWorker.getEndpoint( TaskPriority::ClusterController );
registerWorker.getEndpoint( TaskPriority::ClusterControllerWorker );
getWorkers.getEndpoint( TaskPriority::ClusterController );
registerMaster.getEndpoint( TaskPriority::ClusterController );
registerMaster.getEndpoint( TaskPriority::ClusterControllerRegister );
getServerDBInfo.getEndpoint( TaskPriority::ClusterController );
}

@@ -774,59 +774,28 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {

int64_t bestLoadBytes = 0;
Optional<Reference<IDataDistributionTeam>> bestOption;
std::vector<std::pair<int, Reference<IDataDistributionTeam>>> randomTeams;
std::set< UID > sources;
std::vector<Reference<IDataDistributionTeam>> randomTeams;
const std::set<UID> completeSources(req.completeSources.begin(), req.completeSources.end());

if( !req.wantsNewServers ) {
std::vector<Reference<IDataDistributionTeam>> similarTeams;
bool foundExact = false;

for( int i = 0; i < req.sources.size(); i++ )
sources.insert( req.sources[i] );

for( int i = 0; i < req.sources.size(); i++ ) {
if( self->server_info.count( req.sources[i] ) ) {
auto& teamList = self->server_info[ req.sources[i] ]->teams;
for( int j = 0; j < teamList.size(); j++ ) {
if( teamList[j]->isHealthy() && (!req.preferLowerUtilization || teamList[j]->hasHealthyFreeSpace())) {
int sharedMembers = 0;
for( const UID& id : teamList[j]->getServerIDs() )
if( sources.count( id ) )
sharedMembers++;

if( !foundExact && sharedMembers == teamList[j]->size() ) {
foundExact = true;
bestOption = Optional<Reference<IDataDistributionTeam>>();
similarTeams.clear();
}

if( (sharedMembers == teamList[j]->size()) || (!foundExact && req.wantsTrueBest) ) {
int64_t loadBytes = SOME_SHARED * teamList[j]->getLoadBytes(true, req.inflightPenalty);
if( !bestOption.present() || ( req.preferLowerUtilization && loadBytes < bestLoadBytes ) || ( !req.preferLowerUtilization && loadBytes > bestLoadBytes ) ) {
bestLoadBytes = loadBytes;
bestOption = teamList[j];
}
}
else if( !req.wantsTrueBest && !foundExact )
similarTeams.push_back( teamList[j] );
for( int i = 0; i < req.completeSources.size(); i++ ) {
if( !self->server_info.count( req.completeSources[i] ) ) {
continue;
}
auto& teamList = self->server_info[ req.completeSources[i] ]->teams;
for( int j = 0; j < teamList.size(); j++ ) {
bool found = true;
auto serverIDs = teamList[j]->getServerIDs();
for( int k = 0; k < teamList[j]->size(); k++ ) {
if( !completeSources.count( serverIDs[k] ) ) {
found = false;
break;
}
}
}
}

if( foundExact || (req.wantsTrueBest && bestOption.present() ) ) {
ASSERT( bestOption.present() );
// Check the team size: be sure team size is correct
ASSERT(bestOption.get()->size() == self->configuration.storageTeamSize);
req.reply.send( bestOption );
return Void();
}

if( !req.wantsTrueBest ) {
while( similarTeams.size() && randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT ) {
int randomTeam = deterministicRandom()->randomInt( 0, similarTeams.size() );
randomTeams.push_back( std::make_pair( SOME_SHARED, similarTeams[randomTeam] ) );
swapAndPop( &similarTeams, randomTeam );
if(found && teamList[j]->isHealthy()) {
req.reply.send( teamList[j] );
return Void();
}
}
}
}

@@ -835,7 +804,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
ASSERT( !bestOption.present() );
for( int i = 0; i < self->teams.size(); i++ ) {
if( self->teams[i]->isHealthy() && (!req.preferLowerUtilization || self->teams[i]->hasHealthyFreeSpace()) ) {
int64_t loadBytes = NONE_SHARED * self->teams[i]->getLoadBytes(true, req.inflightPenalty);
int64_t loadBytes = self->teams[i]->getLoadBytes(true, req.inflightPenalty);
if( !bestOption.present() || ( req.preferLowerUtilization && loadBytes < bestLoadBytes ) || ( !req.preferLowerUtilization && loadBytes > bestLoadBytes ) ) {
bestLoadBytes = loadBytes;
bestOption = self->teams[i];

@@ -850,12 +819,15 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);

bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
for(int i=0; ok && i<randomTeams.size(); i++)
if (randomTeams[i].second->getServerIDs() == dest->getServerIDs())
for(int i=0; ok && i<randomTeams.size(); i++) {
if (randomTeams[i]->getServerIDs() == dest->getServerIDs()) {
ok = false;
break;
}
}

if (ok)
randomTeams.push_back( std::make_pair( NONE_SHARED, dest ) );
randomTeams.push_back( dest );
else
nTries++;
}

@@ -866,10 +838,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}

for( int i = 0; i < randomTeams.size(); i++ ) {
int64_t loadBytes = randomTeams[i].first * randomTeams[i].second->getLoadBytes(true, req.inflightPenalty);
int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty);
if( !bestOption.present() || ( req.preferLowerUtilization && loadBytes < bestLoadBytes ) || ( !req.preferLowerUtilization && loadBytes > bestLoadBytes ) ) {
bestLoadBytes = loadBytes;
bestOption = randomTeams[i].second;
bestOption = randomTeams[i];
}
}
}

@@ -878,30 +850,24 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// We will get stuck at this! This only happens when a DC fails. No need to consider it right now.
if(!bestOption.present() && self->zeroHealthyTeams->get()) {
//Attempt to find the unhealthy source server team and return it
std::set<UID> completeSources;
for( int i = 0; i < req.completeSources.size(); i++ ) {
completeSources.insert( req.completeSources[i] );
}

int bestSize = 0;
for( int i = 0; i < req.completeSources.size(); i++ ) {
if( self->server_info.count( req.completeSources[i] ) ) {
auto& teamList = self->server_info[ req.completeSources[i] ]->teams;
for( int j = 0; j < teamList.size(); j++ ) {
bool found = true;
auto serverIDs = teamList[j]->getServerIDs();
for( int k = 0; k < teamList[j]->size(); k++ ) {
if( !completeSources.count( serverIDs[k] ) ) {
found = false;
break;
}
}
if(found && teamList[j]->size() > bestSize) {
bestOption = teamList[j];
bestSize = teamList[j]->size();
if( !self->server_info.count( req.completeSources[i] ) ) {
continue;
}
auto& teamList = self->server_info[ req.completeSources[i] ]->teams;
for( int j = 0; j < teamList.size(); j++ ) {
bool found = true;
auto serverIDs = teamList[j]->getServerIDs();
for( int k = 0; k < teamList[j]->size(); k++ ) {
if( !completeSources.count( serverIDs[k] ) ) {
found = false;
break;
}
}
break;
if(found) {
req.reply.send( teamList[j] );
return Void();
}
}
}
}
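With the SOME_SHARED/NONE_SHARED weighting removed (the enum itself is deleted in the next hunk), team selection reduces to a plain scan for the least- or most-loaded healthy candidate. A standalone sketch of that selection rule, with an illustrative Team type in place of IDataDistributionTeam:

    #include <cstdint>
    #include <optional>
    #include <vector>

    struct Team { int64_t loadBytes; };

    // Keep the lowest-load team when the caller wants headroom, or the
    // highest-load team when it wants to move data off busy teams.
    std::optional<Team> pickBest(const std::vector<Team>& teams, bool preferLowerUtilization) {
        std::optional<Team> best;
        for (const Team& t : teams) {
            if (!best ||
                ( preferLowerUtilization && t.loadBytes < best->loadBytes) ||
                (!preferLowerUtilization && t.loadBytes > best->loadBytes)) {
                best = t;
            }
        }
        return best;
    }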
@@ -38,11 +38,6 @@ struct RelocateShard {
RelocateShard( KeyRange const& keys, int priority ) : keys(keys), priority(priority) {}
};

enum {
SOME_SHARED = 2,
NONE_SHARED = 3
};

struct IDataDistributionTeam {
virtual vector<StorageServerInterface> getLastKnownServerInterfaces() = 0;
virtual int size() = 0;

@@ -81,7 +76,6 @@ struct GetTeamRequest {
bool wantsTrueBest;
bool preferLowerUtilization;
double inflightPenalty;
std::vector<UID> sources;
std::vector<UID> completeSources;
Promise< Optional< Reference<IDataDistributionTeam> > > reply;

@@ -93,10 +87,6 @@ struct GetTeamRequest {

ss << "WantsNewServers:" << wantsNewServers << " WantsTrueBest:" << wantsTrueBest
<< " PreferLowerUtilization:" << preferLowerUtilization << " inflightPenalty:" << inflightPenalty << ";";
ss << "Sources:";
for (auto& s : sources) {
ss << s.toString() << ",";
}
ss << "CompleteSources:";
for (auto& cs : completeSources) {
ss << cs.toString() << ",";

@@ -54,14 +54,7 @@ struct RelocateData {
rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD ||
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT ||
mergeWantsNewServers(rs.keys, rs.priority)), interval("QueuedRelocation") {}

static bool mergeWantsNewServers(KeyRangeRef keys, int priority) {
return priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD &&
(SERVER_KNOBS->MERGE_ONTO_NEW_TEAM == 2 ||
(SERVER_KNOBS->MERGE_ONTO_NEW_TEAM == 1 && keys.begin.startsWith(LiteralStringRef("\xff"))));
}
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT), interval("QueuedRelocation") {}

static bool isHealthPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||

@@ -946,7 +939,6 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;

auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty);
req.sources = rd.src;
req.completeSources = rd.completeSources;
Optional<Reference<IDataDistributionTeam>> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
// If a DC has no healthy team, we stop checking the other DCs until

@@ -1450,7 +1442,7 @@ ACTOR Future<Void> dataDistributionQueue(
.detail( "BytesWritten", self.bytesWritten )
.detail( "PriorityRecoverMove", self.priority_relocations[SERVER_KNOBS->PRIORITY_RECOVER_MOVE] )
.detail( "PriorityRebalanceUnderutilizedTeam", self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] )
.detail( "PriorityRebalannceOverutilizedTeam", self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] )
.detail( "PriorityRebalanceOverutilizedTeam", self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] )
.detail( "PriorityTeamHealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_HEALTHY] )
.detail( "PriorityTeamContainsUndesiredServer", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER] )
.detail( "PriorityTeamRedundant", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT] )

@@ -35,6 +35,18 @@ enum BandwidthStatus {

enum ReadBandwidthStatus { ReadBandwidthStatusNormal, ReadBandwidthStatusHigh };

struct ShardMetrics {
StorageMetrics metrics;
double lastLowBandwidthStartTime;
int shardCount;

bool operator == ( ShardMetrics const& rhs ) const {
return metrics == rhs.metrics && lastLowBandwidthStartTime == rhs.lastLowBandwidthStartTime && shardCount == rhs.shardCount;
}

ShardMetrics(StorageMetrics const& metrics, double lastLowBandwidthStartTime, int shardCount) : metrics(metrics), lastLowBandwidthStartTime(lastLowBandwidthStartTime), shardCount(shardCount) {}
};

BandwidthStatus getBandwidthStatus( StorageMetrics const& metrics ) {
if( metrics.bytesPerKSecond > SERVER_KNOBS->SHARD_MAX_BYTES_PER_KSEC )
return BandwidthStatusHigh;

@@ -69,7 +81,7 @@ ACTOR Future<Void> updateMaxShardSize( Reference<AsyncVar<int64_t>> dbSizeEstima
struct ShardTrackedData {
Future<Void> trackShard;
Future<Void> trackBytes;
Reference<AsyncVar<Optional<StorageMetrics>>> stats;
Reference<AsyncVar<Optional<ShardMetrics>>> stats;
};

struct DataDistributionTracker {

@@ -106,7 +118,7 @@ struct DataDistributionTracker {
void restartShardTrackers(
DataDistributionTracker* self,
KeyRangeRef keys,
Optional<StorageMetrics> startingSize = Optional<StorageMetrics>());
Optional<ShardMetrics> startingSize = Optional<ShardMetrics>());

// Gets the permitted size and IO bounds for a shard. A shard that starts at allKeys.begin
// (i.e. '') will have a permitted size of 0, since the database can contain no data.

@@ -151,8 +163,13 @@ int64_t getMaxShardSize( double dbSizeEstimate ) {
ACTOR Future<Void> trackShardBytes(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<StorageMetrics>>> shardMetrics)
Reference<AsyncVar<Optional<ShardMetrics>>> shardMetrics)
{
state BandwidthStatus bandwidthStatus = shardMetrics->get().present() ? getBandwidthStatus( shardMetrics->get().get().metrics ) : BandwidthStatusNormal;
state double lastLowBandwidthStartTime = shardMetrics->get().present() ? shardMetrics->get().get().lastLowBandwidthStartTime : now();
state int shardCount = shardMetrics->get().present() ? shardMetrics->get().get().shardCount : 1;
state ReadBandwidthStatus readBandwidthStatus = shardMetrics->get().present() ? getReadBandwidthStatus(shardMetrics->get().get().metrics) : ReadBandwidthStatusNormal;

wait( delay( 0, TaskPriority::DataDistribution ) );

/*TraceEvent("TrackShardBytesStarting")

@@ -162,15 +179,12 @@ ACTOR Future<Void> trackShardBytes(
.detail("StartingMetrics", shardMetrics->get().present() ? shardMetrics->get().get().metrics.bytes : 0)
.detail("StartingMerges", shardMetrics->get().present() ? shardMetrics->get().get().merges : 0);*/

state ReadBandwidthStatus readBandwidthStatus;
try {
loop {
ShardSizeBounds bounds;
if (shardMetrics->get().present()) {
auto bytes = shardMetrics->get().get().bytes;
auto bandwidthStatus = getBandwidthStatus(shardMetrics->get().get());
auto newReadBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get());

state ShardSizeBounds bounds;
if( shardMetrics->get().present() ) {
auto bytes = shardMetrics->get().get().metrics.bytes;
auto newReadBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
bounds.max.bytes = std::max( int64_t(bytes * 1.1), (int64_t)SERVER_KNOBS->MIN_SHARD_BYTES );
bounds.min.bytes = std::min( int64_t(bytes * 0.9), std::max(int64_t(bytes - (SERVER_KNOBS->MIN_SHARD_BYTES * 0.1)), (int64_t)0) );
bounds.permittedError.bytes = bytes * 0.1;

@@ -227,30 +241,47 @@ ACTOR Future<Void> trackShardBytes(
bounds.min.iosPerKSecond = 0;
bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity;

Transaction tr(self->cx);
StorageMetrics metrics = wait( tr.waitStorageMetrics( keys, bounds.min, bounds.max, bounds.permittedError, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT ) );
loop {
Transaction tr(self->cx);
std::pair<Optional<StorageMetrics>, int> metrics = wait( tr.waitStorageMetrics( keys, bounds.min, bounds.max, bounds.permittedError, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, shardCount ) );
if(metrics.first.present()) {
BandwidthStatus newBandwidthStatus = getBandwidthStatus( metrics.first.get() );
if(newBandwidthStatus == BandwidthStatusLow && bandwidthStatus != BandwidthStatusLow) {
lastLowBandwidthStartTime = now();
}
bandwidthStatus = newBandwidthStatus;

/*TraceEvent("ShardSizeUpdate")
.detail("Keys", keys)
.detail("UpdatedSize", metrics.metrics.bytes)
.detail("Bandwidth", metrics.metrics.bytesPerKSecond)
.detail("BandwidthStatus", getBandwidthStatus(metrics))
.detail("BytesLower", bounds.min.bytes)
.detail("BytesUpper", bounds.max.bytes)
.detail("BandwidthLower", bounds.min.bytesPerKSecond)
.detail("BandwidthUpper", bounds.max.bytesPerKSecond)
.detail("ShardSizePresent", shardSize->get().present())
.detail("OldShardSize", shardSize->get().present() ? shardSize->get().get().metrics.bytes : 0)
.detail("TrackerID", trackerID);*/
/*TraceEvent("ShardSizeUpdate")
.detail("Keys", keys)
.detail("UpdatedSize", metrics.metrics.bytes)
.detail("Bandwidth", metrics.metrics.bytesPerKSecond)
.detail("BandwithStatus", getBandwidthStatus(metrics))
.detail("BytesLower", bounds.min.bytes)
.detail("BytesUpper", bounds.max.bytes)
.detail("BandwidthLower", bounds.min.bytesPerKSecond)
.detail("BandwidthUpper", bounds.max.bytesPerKSecond)
.detail("ShardSizePresent", shardSize->get().present())
.detail("OldShardSize", shardSize->get().present() ? shardSize->get().get().metrics.bytes : 0)
.detail("TrackerID", trackerID);*/

if( shardMetrics->get().present() ) {
self->dbSizeEstimate->set( self->dbSizeEstimate->get() + metrics.bytes - shardMetrics->get().get().bytes );
if(keys.begin >= systemKeys.begin) {
self->systemSizeEstimate += metrics.bytes - shardMetrics->get().get().bytes;
if( shardMetrics->get().present() ) {
self->dbSizeEstimate->set( self->dbSizeEstimate->get() + metrics.first.get().bytes - shardMetrics->get().get().metrics.bytes );
if(keys.begin >= systemKeys.begin) {
self->systemSizeEstimate += metrics.first.get().bytes - shardMetrics->get().get().metrics.bytes;
}
}

shardMetrics->set( ShardMetrics(metrics.first.get(), lastLowBandwidthStartTime, shardCount) );
break;
} else {
shardCount = metrics.second;
if(shardMetrics->get().present()) {
auto newShardMetrics = shardMetrics->get().get();
newShardMetrics.shardCount = shardCount;
shardMetrics->set( newShardMetrics );
}
}
}

shardMetrics->set( metrics );
}
} catch( Error &e ) {
if (e.code() != error_code_actor_cancelled)
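The tracker now tolerates waitStorageMetrics returning no metrics together with a corrected shard count: it records the count and retries instead of failing. A self-contained sketch of that retry shape; fetch() is an illustrative stand-in for the real call, not an FDB API:

    #include <optional>
    #include <utility>

    struct Metrics { long bytes = 0; };

    // Stand-in for Transaction::waitStorageMetrics: returns metrics once the
    // caller passes an up-to-date shard count (here, anything >= 2).
    std::pair<std::optional<Metrics>, int> fetch(int shardCount) {
        if (shardCount >= 2) return { Metrics{1024}, shardCount };
        return { std::nullopt, 2 }; // no data yet, but a corrected shard count
    }

    Metrics pollUntilPresent(int shardCount) {
        for (;;) {
            auto [metrics, newCount] = fetch(shardCount);
            if (metrics) return *metrics; // success: publish and stop
            shardCount = newCount;        // retry with the corrected shard count
        }
    }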
@@ -290,10 +321,10 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> getSplitKeys( DataDistributionTracke
}
}

ACTOR Future<int64_t> getFirstSize( Reference<AsyncVar<Optional<StorageMetrics>>> stats ) {
ACTOR Future<int64_t> getFirstSize( Reference<AsyncVar<Optional<ShardMetrics>>> stats ) {
loop {
if(stats->get().present())
return stats->get().get().bytes;
return stats->get().get().metrics.bytes;
wait( stats->onChange() );
}
}

@@ -333,8 +364,12 @@ ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRange keys, in
return Void();
}

struct HasBeenTrueFor : NonCopyable {
explicit HasBeenTrueFor( bool value ) : trigger( value ? Void() : Future<Void>() ) {}
struct HasBeenTrueFor : ReferenceCounted<HasBeenTrueFor> {
explicit HasBeenTrueFor( Optional<ShardMetrics> value ) {
if(value.present()) {
trigger = delayJittered(std::max(0.0, SERVER_KNOBS->DD_MERGE_COALESCE_DELAY + value.get().lastLowBandwidthStartTime - now()), decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture();
}
}

Future<Void> set() {
if( !trigger.isValid() ) {
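The rewritten constructor arms the merge-coalescing timer relative to when the shard's bandwidth last went low, so a shard that has already been quiet for a while becomes mergeable sooner, while one that just went quiet still waits the full delay. The effective arming delay, as a one-function sketch (now and the knob value are plain parameters here rather than the real globals):

    #include <algorithm>

    double mergeDelayFrom(double lastLowBandwidthStartTime, double now,
                          double coalesceDelay /* DD_MERGE_COALESCE_DELAY */) {
        // Clamped at zero: a shard that has been low-bandwidth for longer
        // than the coalesce delay may merge immediately.
        return std::max(0.0, coalesceDelay + lastLowBandwidthStartTime - now);
    }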
@@ -364,11 +399,11 @@ private:
ACTOR Future<Void> shardSplitter(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize,
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize,
ShardSizeBounds shardBounds )
{
state StorageMetrics metrics = shardSize->get().get();
state BandwidthStatus bandwidthStatus = getBandwidthStatus( shardSize->get().get() );
state StorageMetrics metrics = shardSize->get().get().metrics;
state BandwidthStatus bandwidthStatus = getBandwidthStatus( metrics );

//Split
TEST(true); // shard to be split

@@ -418,17 +453,28 @@ ACTOR Future<Void> shardSplitter(
self->output.send( RelocateShard( r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD) );
}

self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().bytes ) );
self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().metrics.bytes ) );
} else {
wait( delay(1.0, TaskPriority::DataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers
}
return Void();
}

ACTOR Future<Void> brokenPromiseToReady( Future<Void> f ) {
try {
wait(f);
} catch( Error &e ) {
if(e.code() != error_code_broken_promise) {
throw;
}
}
return Void();
}
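brokenPromiseToReady lets shardMerger wait on a neighboring tracker's onChange() future even though that tracker can be torn down mid-merge; a broken promise then reads as "ready" rather than an error. The same shape expressed with ordinary C++ exceptions, as a standalone sketch:

    #include <functional>
    #include <stdexcept>

    struct BrokenPromise : std::runtime_error {
        BrokenPromise() : std::runtime_error("broken_promise") {}
    };

    // Run f, treating a vanished counterpart as "ready" instead of an error;
    // every other exception still propagates.
    void runIgnoringBrokenPromise(const std::function<void()>& f) {
        try {
            f();
        } catch (const BrokenPromise&) {
            // counterpart was destroyed: fall through as if f completed
        }
    }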
Future<Void> shardMerger(
DataDistributionTracker* self,
KeyRange const& keys,
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize )
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize )
{
int64_t maxShardSize = self->maxShardSize->get().get();

@@ -442,11 +488,17 @@ Future<Void> shardMerger(
int shardsMerged = 1;
bool forwardComplete = false;
KeyRangeRef merged;
StorageMetrics endingStats = shardSize->get().get();
int64_t systemBytes = keys.begin >= systemKeys.begin ? shardSize->get().get().bytes : 0;
StorageMetrics endingStats = shardSize->get().get().metrics;
int shardCount = shardSize->get().get().shardCount;
double lastLowBandwidthStartTime = shardSize->get().get().lastLowBandwidthStartTime;
if(FLOW_KNOBS->DELAY_JITTER_OFFSET*SERVER_KNOBS->DD_MERGE_COALESCE_DELAY > SERVER_KNOBS->DD_LOW_BANDWIDTH_DELAY && now() - lastLowBandwidthStartTime < SERVER_KNOBS->DD_LOW_BANDWIDTH_DELAY) {
TraceEvent( g_network->isSimulated() ? SevError : SevWarnAlways, "ShardMergeTooSoon", self->distributorId).detail("Keys", keys).detail("LastLowBandwidthStartTime", lastLowBandwidthStartTime);
}

int64_t systemBytes = keys.begin >= systemKeys.begin ? shardSize->get().get().metrics.bytes : 0;

loop {
Optional<StorageMetrics> newMetrics;
Optional<ShardMetrics> newMetrics;
if( !forwardComplete ) {
if( nextIter->range().end == allKeys.end ) {
forwardComplete = true;

@@ -456,7 +508,7 @@ Future<Void> shardMerger(
newMetrics = nextIter->value().stats->get();

// If going forward, give up when the next shard's stats are not yet present.
if( !newMetrics.present() ) {
if( !newMetrics.present() || shardCount + newMetrics.get().shardCount >= CLIENT_KNOBS->SHARD_COUNT_LIMIT ) {
--nextIter;
forwardComplete = true;
continue;

@@ -468,10 +520,10 @@ Future<Void> shardMerger(
// If going backward, stop when the stats are not present or if the shard is already over the merge
// bounds. If this check triggers right away (if we have not merged anything) then return a trigger
// on the previous shard changing "size".
if( !newMetrics.present() ) {
if( !newMetrics.present() || shardCount + newMetrics.get().shardCount >= CLIENT_KNOBS->SHARD_COUNT_LIMIT ) {
if( shardsMerged == 1 ) {
TEST( true ); // shardMerger cannot merge anything
return prevIter->value().stats->onChange();
return brokenPromiseToReady( prevIter->value().stats->onChange() );
}

++prevIter;

@@ -480,15 +532,18 @@ Future<Void> shardMerger(
}

merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
endingStats += newMetrics.get();
endingStats += newMetrics.get().metrics;
shardCount += newMetrics.get().shardCount;
lastLowBandwidthStartTime = newMetrics.get().lastLowBandwidthStartTime;
if((forwardComplete ? prevIter->range().begin : nextIter->range().begin) >= systemKeys.begin) {
systemBytes += newMetrics.get().bytes;
systemBytes += newMetrics.get().metrics.bytes;
}
shardsMerged++;

auto shardBounds = getShardSizeBounds( merged, maxShardSize );
if( endingStats.bytes >= shardBounds.min.bytes ||
getBandwidthStatus( endingStats ) != BandwidthStatusLow ||
now() - lastLowBandwidthStartTime < SERVER_KNOBS->DD_LOW_BANDWIDTH_DELAY ||
shardsMerged >= SERVER_KNOBS->DD_MERGE_LIMIT ) {
// The merged range is larger than the min bounds so we cannot continue merging in this direction.
// This means that:

@@ -501,9 +556,10 @@ Future<Void> shardMerger(
break;

// If going forward, remove most recently added range
endingStats -= newMetrics.get();
endingStats -= newMetrics.get().metrics;
shardCount -= newMetrics.get().shardCount;
if(nextIter->range().begin >= systemKeys.begin) {
systemBytes -= newMetrics.get().bytes;
systemBytes -= newMetrics.get().metrics.bytes;
}
shardsMerged--;
--nextIter;

@@ -519,12 +575,14 @@ Future<Void> shardMerger(
.detail("OldKeys", keys)
.detail("NewKeys", mergeRange)
.detail("EndingSize", endingStats.bytes)
.detail("BatchedMerges", shardsMerged);
.detail("BatchedMerges", shardsMerged)
.detail("LastLowBandwidthStartTime", lastLowBandwidthStartTime)
.detail("ShardCount", shardCount);

if(mergeRange.begin < systemKeys.begin) {
self->systemSizeEstimate -= systemBytes;
}
restartShardTrackers( self, mergeRange, endingStats );
restartShardTrackers( self, mergeRange, ShardMetrics(endingStats, lastLowBandwidthStartTime, shardCount) );
self->shardsAffectedByTeamFailure->defineShard( mergeRange );
self->output.send( RelocateShard( mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD ) );

@@ -535,8 +593,8 @@ Future<Void> shardMerger(
ACTOR Future<Void> shardEvaluator(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize,
HasBeenTrueFor *wantsToMerge)
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize,
Reference<HasBeenTrueFor> wantsToMerge)
{
Future<Void> onChange = shardSize->onChange() || yieldedFuture(self->maxShardSize->onChange());

@@ -544,7 +602,7 @@ ACTOR Future<Void> shardEvaluator(
// getShardSizeBounds() will allways have shardBounds.min.bytes == 0 for shards that start at allKeys.begin,
// so will will never attempt to merge that shard with the one previous.
ShardSizeBounds shardBounds = getShardSizeBounds(keys, self->maxShardSize->get().get());
StorageMetrics const& stats = shardSize->get().get();
StorageMetrics const& stats = shardSize->get().get().metrics;
auto bandwidthStatus = getBandwidthStatus( stats );

bool shouldSplit = stats.bytes > shardBounds.max.bytes ||

@@ -592,11 +650,8 @@ ACTOR Future<Void> shardTracker(
ACTOR Future<Void> shardTracker(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<StorageMetrics>>> shardSize)
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize)
{
// Survives multiple calls to shardEvaluator and keeps merges from happening too quickly.
state HasBeenTrueFor wantsToMerge( shardSize->get().present() );

wait( yieldedFuture(self->readyToStart.getFuture()) );

if( !shardSize->get().present() )

@@ -608,6 +663,9 @@ ACTOR Future<Void> shardTracker(
// Since maxShardSize will become present for all shards at once, avoid slow tasks with a short delay
wait( delay( 0, TaskPriority::DataDistribution ) );

// Survives multiple calls to shardEvaluator and keeps merges from happening too quickly.
state Reference<HasBeenTrueFor> wantsToMerge( new HasBeenTrueFor( shardSize->get() ) );

/*TraceEvent("ShardTracker", self->distributorId)
.detail("Begin", keys.begin)
.detail("End", keys.end)

@@ -619,7 +677,7 @@ ACTOR Future<Void> shardTracker(
try {
loop {
// Use the current known size to check for (and start) splits and merges.
wait( shardEvaluator( self, keys, shardSize, &wantsToMerge ) );
wait( shardEvaluator( self, keys, shardSize, wantsToMerge ) );

// We could have a lot of actors being released from the previous wait at the same time. Immediately calling
// delay(0) mitigates the resulting SlowTask

@@ -632,7 +690,7 @@ ACTOR Future<Void> shardTracker(
}
}

void restartShardTrackers( DataDistributionTracker* self, KeyRangeRef keys, Optional<StorageMetrics> startingSize ) {
void restartShardTrackers( DataDistributionTracker* self, KeyRangeRef keys, Optional<ShardMetrics> startingSize ) {
auto ranges = self->shards.getAffectedRangesAfterInsertion( keys, ShardTrackedData() );
for(int i=0; i<ranges.size(); i++) {
if( !ranges[i].value.trackShard.isValid() && ranges[i].begin != keys.begin ) {

@@ -642,7 +700,7 @@ void restartShardTrackers( DataDistributionTracker* self, KeyRangeRef keys, Opti
continue;
}

Reference<AsyncVar<Optional<StorageMetrics>>> shardSize( new AsyncVar<Optional<StorageMetrics>>() );
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize( new AsyncVar<Optional<ShardMetrics>>() );

// For the case where the new tracker will take over at the boundaries of current shard(s)
// we can use the old size if it is available. This will be the case when merging shards.

@@ -697,7 +755,7 @@ ACTOR Future<Void> fetchShardMetrics_impl( DataDistributionTracker* self, GetMet
onChange = stats->onChange();
break;
}
returnMetrics += t.value().stats->get().get();
returnMetrics += t.value().stats->get().get().metrics;
}

if( !onChange.isValid() ) {

@@ -26,7 +26,7 @@ ServerKnobs const* SERVER_KNOBS = new ServerKnobs();

#define init( knob, value ) initKnob( knob, value, #knob )

ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimulated) {
// clang-format off
// Versions
init( VERSIONS_PER_SECOND, 1e6 );

@@ -106,7 +106,6 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( INFLIGHT_PENALTY_HEALTHY, 1.0 );
init( INFLIGHT_PENALTY_UNHEALTHY, 10.0 );
init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 );
init( MERGE_ONTO_NEW_TEAM, 1 ); if( randomize && BUGGIFY ) MERGE_ONTO_NEW_TEAM = deterministicRandom()->coinflip() ? 0 : 2;

init( PRIORITY_RECOVER_MOVE, 110 );
init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 );

@@ -120,7 +119,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( PRIORITY_TEAM_1_LEFT, 800 );
init( PRIORITY_TEAM_FAILED, 805 );
init( PRIORITY_TEAM_0_LEFT, 809 );
init( PRIORITY_SPLIT_SHARD, 900 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350;
init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350;

// Data distribution
init( RETRY_RELOCATESHARD_DELAY, 0.1 );

@@ -190,7 +189,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 );
init( DD_ENABLED_CHECK_DELAY, 1.0 );
init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger than 2*MAX_BUGGIFIED_DELAY
init( DD_MERGE_COALESCE_DELAY, 120.0 ); if( randomize && BUGGIFY ) DD_MERGE_COALESCE_DELAY = 0.001;
init( DD_LOW_BANDWIDTH_DELAY, isSimulated ? 90.0 : 240.0 ); if( randomize && BUGGIFY ) DD_LOW_BANDWIDTH_DELAY = 0; //Because of delayJitter, this should be less than 0.9 * DD_MERGE_COALESCE_DELAY
init( DD_MERGE_COALESCE_DELAY, isSimulated ? 120.0 : 300.0 ); if( randomize && BUGGIFY ) DD_MERGE_COALESCE_DELAY = 0.001;
init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0;
init( STORAGE_METRICS_RANDOM_DELAY, 0.2 );
init( FREE_SPACE_RATIO_CUTOFF, 0.1 );
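The comment on DD_LOW_BANDWIDTH_DELAY requires it to stay below 0.9 * DD_MERGE_COALESCE_DELAY, and both new defaults satisfy that: 240 < 0.9 * 300 = 270 in production, and 90 < 0.9 * 120 = 108 in simulation. A hypothetical compile-time restatement of the constraint using constants that mirror those defaults (the real knobs are runtime values, so this is illustration only):

    // Constants mirror the defaults initialized above; not the actual knobs.
    constexpr double kSimLowBandwidthDelay = 90.0, kSimMergeCoalesceDelay = 120.0;
    constexpr double kProdLowBandwidthDelay = 240.0, kProdMergeCoalesceDelay = 300.0;
    static_assert(kSimLowBandwidthDelay < 0.9 * kSimMergeCoalesceDelay,
                  "simulated DD_LOW_BANDWIDTH_DELAY must leave delayJitter headroom");
    static_assert(kProdLowBandwidthDelay < 0.9 * kProdMergeCoalesceDelay,
                  "production DD_LOW_BANDWIDTH_DELAY must leave delayJitter headroom");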
@@ -329,7 +329,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( ENFORCED_MIN_RECOVERY_DURATION, 0.085 ); if( shortRecoveryDuration ) ENFORCED_MIN_RECOVERY_DURATION = 0.01;
init( REQUIRED_MIN_RECOVERY_DURATION, 0.080 ); if( shortRecoveryDuration ) REQUIRED_MIN_RECOVERY_DURATION = 0.01;
init( ALWAYS_CAUSAL_READ_RISKY, false );
init( MAX_COMMIT_UPDATES, 100000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1;
init( MAX_COMMIT_UPDATES, 2000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1;

// Master Server
// masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution)

@@ -352,7 +352,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( SAMPLE_EXPIRATION_TIME, 1.0 );
init( SAMPLE_POLL_TIME, 0.1 );
init( RESOLVER_STATE_MEMORY_LIMIT, 1e6 );
init( LAST_LIMITED_RATIO, 0.6 );
init( LAST_LIMITED_RATIO, 2.0 );

//Cluster Controller
init( CLUSTER_CONTROLLER_LOGGING_DELAY, 5.0 );

@@ -414,6 +414,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( SLOW_SMOOTHING_AMOUNT, 10.0 ); if( slowRatekeeper ) SLOW_SMOOTHING_AMOUNT = 50.0;
init( METRIC_UPDATE_RATE, .1 ); if( slowRatekeeper ) METRIC_UPDATE_RATE = 0.5;
init( DETAILED_METRIC_UPDATE_RATE, 5.0 );
init (RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0;

bool smallStorageTarget = randomize && BUGGIFY;
init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3;

@@ -487,6 +488,10 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( BYTE_SAMPLE_LOAD_DELAY, 0.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_DELAY = 0.1;
init( BYTE_SAMPLE_START_DELAY, 1.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_START_DELAY = 0.0;
init( UPDATE_STORAGE_PROCESS_STATS_INTERVAL, 5.0 );
init( BEHIND_CHECK_DELAY, 2.0 );
init( BEHIND_CHECK_COUNT, 2 );
init( BEHIND_CHECK_VERSIONS, 5 * VERSIONS_PER_SECOND );
init( WAIT_METRICS_WRONG_SHARD_CHANCE, 0.1 );

//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;

@@ -106,8 +106,7 @@ public:
double INFLIGHT_PENALTY_REDUNDANT;
double INFLIGHT_PENALTY_UNHEALTHY;
double INFLIGHT_PENALTY_ONE_LEFT;
int MERGE_ONTO_NEW_TEAM; // Merges will request new servers. 0 for off, 1 for \xff only, 2 for all shards.

// Higher priorities are executed first
// Priority/100 is the "priority group"/"superpriority". Priority inversion
// is possible within but not between priority groups; fewer priority groups

@@ -151,6 +150,7 @@ public:
double DATA_DISTRIBUTION_LOGGING_INTERVAL;
double DD_ENABLED_CHECK_DELAY;
double DD_STALL_CHECK_DELAY;
double DD_LOW_BANDWIDTH_DELAY;
double DD_MERGE_COALESCE_DELAY;
double STORAGE_METRICS_POLLING_DELAY;
double STORAGE_METRICS_RANDOM_DELAY;

@@ -352,6 +352,7 @@ public:
double METRIC_UPDATE_RATE;
double DETAILED_METRIC_UPDATE_RATE;
double LAST_LIMITED_RATIO;
double RATEKEEPER_DEFAULT_LIMIT;

int64_t TARGET_BYTES_PER_STORAGE_SERVER;
int64_t SPRING_BYTES_STORAGE_SERVER;

@@ -427,6 +428,10 @@ public:
double BYTE_SAMPLE_LOAD_DELAY;
double BYTE_SAMPLE_START_DELAY;
double UPDATE_STORAGE_PROCESS_STATS_INTERVAL;
double BEHIND_CHECK_DELAY;
int BEHIND_CHECK_COUNT;
int64_t BEHIND_CHECK_VERSIONS;
double WAIT_METRICS_WRONG_SHARD_CHANCE;

//Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;

@@ -473,7 +478,7 @@ public:
int64_t FASTRESTORE_HEARTBEAT_INTERVAL;
double FASTRESTORE_SAMPLING_PERCENT;

ServerKnobs(bool randomize = false, ClientKnobs* clientKnobs = NULL);
ServerKnobs(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false);
};

extern ServerKnobs const* SERVER_KNOBS;

@@ -217,6 +217,19 @@ struct LogSystemConfig {
return format("type: %d oldGenerations: %d tags: %d %s", logSystemType, oldTLogs.size(), logRouterTags, describe(tLogs).c_str());
}

Optional<Key> getRemoteDcId() const {
for( int i = 0; i < tLogs.size(); i++ ) {
if(!tLogs[i].isLocal) {
for( int j = 0; j < tLogs[i].tLogs.size(); j++ ) {
if( tLogs[i].tLogs[j].present() ) {
return tLogs[i].tLogs[j].interf().locality.dcId();
}
}
}
}
return Optional<Key>();
}

std::vector<TLogInterface> allLocalLogs(bool includeSatellite = true) const {
std::vector<TLogInterface> results;
for( int i = 0; i < tLogs.size(); i++ ) {

@@ -219,8 +219,6 @@ struct ProxyCommitData {
NotifiedVersion latestLocalCommitBatchResolving;
NotifiedVersion latestLocalCommitBatchLogging;

PromiseStream<Void> commitBatchStartNotifications;
PromiseStream<Future<GetCommitVersionReply>> commitBatchVersions; // 1:1 with commitBatchStartNotifications
RequestStream<GetReadVersionRequest> getConsistentReadVersion;
RequestStream<CommitTransactionRequest> commit;
Database cx;

@@ -432,7 +430,6 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
}

if(!batch.size()) {
commitData->commitBatchStartNotifications.send(Void());
if(now() - lastBatch > commitData->commitBatchInterval) {
timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskPriority::ProxyCommitBatcher);
}

@@ -444,7 +441,6 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
if((batchBytes + bytes > CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT || req.firstInBatch()) && batch.size()) {
out.send({ batch, batchBytes });
lastBatch = now();
commitData->commitBatchStartNotifications.send(Void());
timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher);
batch = std::vector<CommitTransactionRequest>();
batchBytes = 0;

@@ -508,7 +504,7 @@ ACTOR Future<Void> addBackupMutations(ProxyCommitData* self, std::map<Key, Mutat
while(blobIter) {
if(yieldBytes > SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
yieldBytes = 0;
wait(yield());
wait(yield(TaskPriority::ProxyCommitYield2));
}
valueWriter.serializeBytes(blobIter->data);
yieldBytes += blobIter->data.size();

@@ -602,21 +598,16 @@ ACTOR Future<Void> commitBatch(
if (debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.Before");

if (trs.empty()) {
// We are sending an empty batch, so we have to trigger the version fetcher
self->commitBatchStartNotifications.send(Void());
}

/////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield)
TEST(self->latestLocalCommitBatchResolving.get() < localBatchNumber-1); // Queuing pre-resolution commit processing
wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1));
wait(yield());
wait(yield(TaskPriority::ProxyCommitYield1));

if (debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GettingCommitVersion");

Future<GetCommitVersionReply> fVersionReply = waitNext(self->commitBatchVersions.getFuture());
GetCommitVersionReply versionReply = wait(fVersionReply);
GetCommitVersionRequest req(self->commitVersionRequestNumber++, self->mostRecentProcessedRequestNumber, self->dbgid);
GetCommitVersionReply versionReply = wait( brokenPromiseToNever(self->master.getCommitVersion.getReply(req, TaskPriority::ProxyMasterVersionReply)) );
self->mostRecentProcessedRequestNumber = versionReply.requestNum;

self->stats.txnCommitVersionAssigned += trs.size();

@@ -674,7 +665,7 @@ ACTOR Future<Void> commitBatch(
////// Phase 3: Post-resolution processing (CPU bound except for very rare situations; ordered; currently atomic but doesn't need to be)
TEST(self->latestLocalCommitBatchLogging.get() < localBatchNumber-1); // Queuing post-resolution commit processing
wait(self->latestLocalCommitBatchLogging.whenAtLeast(localBatchNumber-1));
wait(yield());
wait(yield(TaskPriority::ProxyCommitYield2));

self->stats.txnCommitResolved += trs.size();

@@ -832,7 +823,7 @@ ACTOR Future<Void> commitBatch(
for (; mutationNum < pMutations->size(); mutationNum++) {
if(yieldBytes > SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
yieldBytes = 0;
wait(yield());
wait(yield(TaskPriority::ProxyCommitYield2));
}

auto& m = (*pMutations)[mutationNum];

@@ -1014,7 +1005,7 @@ ACTOR Future<Void> commitBatch(
}
self->lastCommitLatency = now()-commitStartTime;
self->lastCommitTime = std::max(self->lastCommitTime.get(), commitStartTime);
wait(yield());
wait(yield(TaskPriority::ProxyCommitYield3));

if( self->popRemoteTxs && msg.popTo > ( self->txsPopVersions.size() ? self->txsPopVersions.back().second : self->lastTxsPop ) ) {
if(self->txsPopVersions.size() >= SERVER_KNOBS->MAX_TXS_POP_VERSION_HISTORY) {

@@ -1162,14 +1153,6 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(ProxyCommitData* commi
return rep;
}

ACTOR Future<Void> fetchVersions(ProxyCommitData *commitData) {
loop {
waitNext(commitData->commitBatchStartNotifications.getFuture());
GetCommitVersionRequest req(commitData->commitVersionRequestNumber++, commitData->mostRecentProcessedRequestNumber, commitData->dbgid);
commitData->commitBatchVersions.send(brokenPromiseToNever(commitData->master.getCommitVersion.getReply(req)));
}
}

struct TransactionRateInfo {
double rate;
double limit;

@@ -1661,7 +1644,6 @@ ACTOR Future<Void> masterProxyServerCore(
state GetHealthMetricsReply healthMetricsReply;
state GetHealthMetricsReply detailedHealthMetricsReply;

addActor.send( fetchVersions(&commitData) );
addActor.send( waitFailureServer(proxy.waitFailure.getFuture()) );

//TraceEvent("ProxyInit1", proxy.id());

@@ -2301,9 +2301,7 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
}
wait(logData->committingQueue.getFuture() || logData->removed );
} catch( Error &e ) {
if(e.code() != error_code_actor_cancelled) {
req.reply.sendError(e);
}
req.reply.sendError(recruitment_failed());

if( e.code() != error_code_worker_removed ) {
throw;

@@ -182,6 +182,7 @@ struct RatekeeperData {
RatekeeperLimits batchLimits;

Deque<double> actualTpsHistory;
Optional<Key> remoteDC;

RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")),

@@ -384,7 +385,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
// Look at each storage server's write queue and local rate, compute and store the desired rate ratio
for(auto i = self->storageQueueInfo.begin(); i != self->storageQueueInfo.end(); ++i) {
auto& ss = i->value;
if (!ss.valid) continue;
if (!ss.valid || (self->remoteDC.present() && ss.locality.dcId() == self->remoteDC)) continue;
++sscount;

limitReason_t ssLimitReason = limitReason_t::unlimited;

@@ -537,7 +538,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
Version minLimitingSSVer = std::numeric_limits<Version>::max();
for (const auto& it : self->storageQueueInfo) {
auto& ss = it.value;
if (!ss.valid) continue;
if (!ss.valid || (self->remoteDC.present() && ss.locality.dcId() == self->remoteDC)) continue;

minSSVer = std::min(minSSVer, ss.lastReply.version);

@@ -657,6 +658,9 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
reasonID = UID();
TraceEvent(SevWarnAlways, "RkSSListFetchTimeout").suppressFor(1.0);
}
else if(limits->tpsLimit == std::numeric_limits<double>::infinity()) {
limits->tpsLimit = SERVER_KNOBS->RATEKEEPER_DEFAULT_LIMIT;
}

limits->tpsLimitMetric = std::min(limits->tpsLimit, 1e6);
limits->reasonMetric = limitReason;

@@ -738,6 +742,8 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
for( int i = 0; i < tlogInterfs.size(); i++ )
tlogTrackers.push_back( splitError( trackTLogQueueInfo(&self, tlogInterfs[i]), err ) );

self.remoteDC = dbInfo->get().logSystemConfig.getRemoteDcId();

try {
state bool lastLimited = false;
loop choose {

@@ -794,6 +800,7 @@ ACTOR Future<Void> ratekeeper(RatekeeperInterface rkInterf, Reference<AsyncVar<S
for( int i = 0; i < tlogInterfs.size(); i++ )
tlogTrackers.push_back( splitError( trackTLogQueueInfo(&self, tlogInterfs[i]), err ) );
}
self.remoteDC = dbInfo->get().logSystemConfig.getRemoteDcId();
}
when ( wait(collection) ) {
ASSERT(false);

@@ -2756,9 +2756,7 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
}
wait(logData->committingQueue.getFuture() || logData->removed );
} catch( Error &e ) {
if(e.code() != error_code_actor_cancelled) {
req.reply.sendError(e);
}
req.reply.sendError(recruitment_failed());

if( e.code() != error_code_worker_removed ) {
throw;

@@ -436,7 +436,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
vector<Future<Void>> tLogCommitResults;
for(int loc=0; loc< it->logServers.size(); loc++) {
Standalone<StringRef> msg = data.getMessages(location);
allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, debugID ), TaskPriority::TLogCommitReply ) );
allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, debugID ), TaskPriority::ProxyTLogCommitReply ) );
Future<Void> commitSuccess = success(allReplies.back());
addActor.get().send(commitSuccess);
tLogCommitResults.push_back(commitSuccess);

@@ -1142,10 +1142,8 @@ private:
}
case OPT_TRACECLOCK: {
const char* a = args.OptionArg();
if (!strcmp(a, "realtime"))
g_trace_clock = TRACE_CLOCK_REALTIME;
else if (!strcmp(a, "now"))
g_trace_clock = TRACE_CLOCK_NOW;
if (!strcmp(a, "realtime")) g_trace_clock.store(TRACE_CLOCK_REALTIME);
else if (!strcmp(a, "now")) g_trace_clock.store(TRACE_CLOCK_NOW);
else {
fprintf(stderr, "ERROR: Unknown clock source `%s'\n", a);
printHelpTeaser(argv[0]);

@@ -1537,7 +1535,7 @@ int main(int argc, char* argv[]) {
delete CLIENT_KNOBS;
FlowKnobs* flowKnobs = new FlowKnobs(true, role == Simulation);
ClientKnobs* clientKnobs = new ClientKnobs(true);
ServerKnobs* serverKnobs = new ServerKnobs(true, clientKnobs);
ServerKnobs* serverKnobs = new ServerKnobs(true, clientKnobs, role == Simulation);
FLOW_KNOBS = flowKnobs;
SERVER_KNOBS = serverKnobs;
CLIENT_KNOBS = clientKnobs;

@@ -434,6 +434,7 @@ public:
bool shuttingDown;

bool behind;
bool versionBehind;

bool debug_inApplyUpdate;
double debug_lastValidateTime;

@@ -530,7 +531,7 @@ public:
shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0),
logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()),
readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")),
behind(false), byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false),
behind(false), versionBehind(false), byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false),
lastUpdate(now()), poppedAllAfter(std::numeric_limits<Version>::max()), cpuUsage(0.0), diskUsage(0.0)
{
version.initMetric(LiteralStringRef("StorageServer.Version"), counters.cc.id);

@@ -765,7 +766,7 @@ ACTOR Future<Version> waitForVersion( StorageServer* data, Version version ) {
else if (version <= data->version.get())
return version;

if(data->behind && version > data->version.get()) {
if((data->behind || data->versionBehind) && version > data->version.get()) {
throw process_behind();
}

@@ -3419,9 +3420,18 @@ ACTOR Future<Void> waitMetrics( StorageServerMetrics* self, WaitMetricsRequest r
break;
}

if ( timedout || !req.min.allLessOrEqual( metrics ) || !metrics.allLessOrEqual( req.max ) ) {
TEST( !timedout ); // ShardWaitMetrics return case 2 (delayed)
TEST( timedout ); // ShardWaitMetrics return on timeout
if( timedout ) {
TEST( true ); // ShardWaitMetrics return on timeout
if(deterministicRandom()->random01() < SERVER_KNOBS->WAIT_METRICS_WRONG_SHARD_CHANCE) {
req.reply.sendError( wrong_shard_server() );
} else {
req.reply.send( metrics );
}
break;
}

if ( !req.min.allLessOrEqual( metrics ) || !metrics.allLessOrEqual( req.max ) ) {
TEST( true ); // ShardWaitMetrics return case 2 (delayed)
req.reply.send( metrics );
break;
}
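On timeout, the storage server now tells a random fraction of waiters wrong_shard_server (per WAIT_METRICS_WRONG_SHARD_CHANCE, default 0.1), forcing them to re-resolve shard placement instead of waiting indefinitely on a possibly stale assignment. A standalone model of that branch:

    #include <random>

    enum class WaitOutcome { SendMetrics, SendWrongShardServer };

    // Decide how to answer one timed-out waitMetrics request.
    WaitOutcome onWaitMetricsTimeout(std::mt19937& rng,
                                     double wrongShardChance /* the knob, e.g. 0.1 */) {
        std::uniform_real_distribution<double> u(0.0, 1.0);
        return u(rng) < wrongShardChance ? WaitOutcome::SendWrongShardServer
                                         : WaitOutcome::SendMetrics;
    }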
@@ -3510,6 +3520,28 @@ ACTOR Future<Void> logLongByteSampleRecovery(Future<Void> recovery) {
return Void();
}

ACTOR Future<Void> checkBehind( StorageServer* self ) {
state int behindCount = 0;
loop {
wait( delay(SERVER_KNOBS->BEHIND_CHECK_DELAY) );
state Transaction tr(self->cx);
loop {
try {
Version readVersion = wait( tr.getRawReadVersion() );
if( readVersion > self->version.get() + SERVER_KNOBS->BEHIND_CHECK_VERSIONS ) {
behindCount++;
} else {
behindCount = 0;
}
self->versionBehind = behindCount >= SERVER_KNOBS->BEHIND_CHECK_COUNT;
break;
} catch( Error &e ) {
wait(tr.onError(e));
}
}
}
}
ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterface ssi )
|
||||
{
|
||||
state Future<Void> doUpdate = Void();
|
||||
|
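The new checkBehind actor only flips versionBehind after BEHIND_CHECK_COUNT consecutive over-threshold polls, which debounces transient lag spikes. A self-contained sketch of that consecutive-failure debounce (the threshold constants here are illustrative, not the real knob defaults):

    #include <cstdio>

    // Debounce: report "behind" only after N consecutive bad samples; any good
    // sample resets the counter. kCheckCount/kVersionLag are made-up stand-ins
    // for BEHIND_CHECK_COUNT and BEHIND_CHECK_VERSIONS.
    struct BehindChecker {
        int behindCount = 0;
        static constexpr int kCheckCount = 2;
        static constexpr long kVersionLag = 1000000;

        bool sample(long readVersion, long localVersion) {
            if (readVersion > localVersion + kVersionLag) behindCount++;
            else behindCount = 0;
            return behindCount >= kCheckCount;  // becomes versionBehind
        }
    };

    int main() {
        BehindChecker c;
        long local = 0, global = 0;
        for (int i = 0; i < 5; i++) {
            global += 2000000;  // the cluster keeps committing while we stall
            printf("poll %d: behind=%d\n", i, c.sample(global, local));
        }
    }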
@@ -3526,6 +3558,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
 	actors.add(self->otherError.getFuture());
 	actors.add(metricsCore(self, ssi));
 	actors.add(logLongByteSampleRecovery(self->byteSampleRecovery));
+	actors.add(checkBehind(self));

 	self->coreStarted.send( Void() );
@@ -68,15 +68,23 @@ extern IKeyValueStore* keyValueStoreCompressTestData(IKeyValueStore* store);


 ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<ServerDBInfo>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
+	state std::vector<UID> lastProxyUIDs;
+	state std::vector<MasterProxyInterface> lastProxies;
 	loop {
-		info->set( db->get().client );
+		ClientDBInfo ni = db->get().client;
+		shrinkProxyList(ni, lastProxyUIDs, lastProxies);
+		info->set( ni );
 		wait( db->onChange() );
 	}
 }

 ACTOR static Future<Void> extractClientInfo( Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> db, Reference<AsyncVar<ClientDBInfo>> info ) {
+	state std::vector<UID> lastProxyUIDs;
+	state std::vector<MasterProxyInterface> lastProxies;
 	loop {
-		info->set( db->get().read().client );
+		ClientDBInfo ni = db->get().read().client;
+		shrinkProxyList(ni, lastProxyUIDs, lastProxies);
+		info->set( ni );
 		wait( db->onChange() );
 	}
 }
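Both overloads now route the proxy list through shrinkProxyList with state that persists across loop iterations, so each consumer keeps seeing a stable subset of proxies instead of reshuffling on every ServerDBInfo change. shrinkProxyList itself is defined elsewhere in the codebase; the generic sketch below only illustrates the stable-subset idea and is not its real implementation:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Illustrative only: as long as the incoming membership is unchanged, keep
    // returning the previously chosen subset, so callers see a stable list.
    std::vector<std::string> shrinkList(const std::vector<std::string>& incoming,
                                        std::vector<std::string>& lastIds,
                                        std::vector<std::string>& lastChosen,
                                        size_t limit) {
        if (incoming == lastIds && !lastChosen.empty())
            return lastChosen;  // membership unchanged: reuse the earlier choice
        lastIds = incoming;
        lastChosen.assign(incoming.begin(),
                          incoming.begin() + std::min(limit, incoming.size()));
        return lastChosen;
    }

    int main() {
        std::vector<std::string> ids = {"p1", "p2", "p3", "p4"}, lastIds, lastChosen;
        auto a = shrinkList(ids, lastIds, lastChosen, 2);
        auto b = shrinkList(ids, lastIds, lastChosen, 2);  // same membership -> same subset
        printf("stable: %d\n", (int)(a == b));
    }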
@@ -75,6 +75,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
 	init( PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT, 3600.0 );

 	init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 );
+	init( TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT, 9.0 );
+	init( TLS_CLIENT_CONNECTION_THROTTLE_TIMEOUT, 11.0 );

 	init( NETWORK_TEST_REPLY_SIZE, 600e3 );
@@ -91,9 +91,11 @@ public:
 	int USE_OBJECT_SERIALIZER;

 	int TLS_CERT_REFRESH_DELAY_SECONDS;
+	double TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT;
+	double TLS_CLIENT_CONNECTION_THROTTLE_TIMEOUT;

 	int NETWORK_TEST_REPLY_SIZE;

 	//AsyncFileCached
 	int64_t PAGE_CACHE_4K;
 	int64_t PAGE_CACHE_64K;
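These two knobs pair with the serverTLSConnectionThrottler map added to NetworkInfo later in this diff. How they are consumed is not shown in these hunks, so the sketch below is an assumed usage: an expiry map keyed by peer address that rejects reconnects until the penalty window has passed.

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>

    // Assumed usage of an expiry-map throttle (connect/accept plumbing omitted).
    using Peer = std::pair<std::string, uint16_t>;
    std::map<Peer, double> throttler;  // peer -> time at which the throttle expires

    bool allowConnection(const Peer& p, double now) {
        auto it = throttler.find(p);
        if (it != throttler.end()) {
            if (now < it->second) return false;  // still inside the penalty window
            throttler.erase(it);                 // window expired; forget the peer
        }
        return true;
    }

    void onHandshakeFailure(const Peer& p, double now, double timeout) {
        throttler[p] = now + timeout;  // e.g. TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT
    }

    int main() {
        Peer p{"10.0.0.5", 4500};
        onHandshakeFailure(p, 0.0, 9.0);
        printf("t=1:  allow=%d\n", (int)allowConnection(p, 1.0));   // 0: throttled
        printf("t=10: allow=%d\n", (int)allowConnection(p, 10.0));  // 1: window passed
    }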
@@ -167,7 +167,6 @@ public:

 	uint64_t numYields;

-	double lastPriorityTrackTime;
 	TaskPriority lastMinTaskID;

 	std::priority_queue<OrderedTask, std::vector<OrderedTask>> ready;
@@ -521,7 +520,7 @@ Net2::Net2(bool useThreadPool, bool useMetrics)
 	int priBins[] = { 1, 2050, 3050, 4050, 4950, 5050, 7050, 8050, 10050 };
 	static_assert( sizeof(priBins) == sizeof(int)*NetworkMetrics::PRIORITY_BINS, "Fix priority bins");
 	for(int i=0; i<NetworkMetrics::PRIORITY_BINS; i++)
-		networkMetrics.priorityBins[i] = static_cast<TaskPriority>(priBins[i]);
+		networkInfo.metrics.priorityBins[i] = static_cast<TaskPriority>(priBins[i]);
 	updateNow();

 }
@@ -737,22 +736,21 @@ void Net2::run() {
 void Net2::trackMinPriority( TaskPriority minTaskID, double now ) {
 	if (minTaskID != lastMinTaskID) {
 		for(int c=0; c<NetworkMetrics::PRIORITY_BINS; c++) {
-			TaskPriority pri = networkMetrics.priorityBins[c];
+			TaskPriority pri = networkInfo.metrics.priorityBins[c];
 			if (pri > minTaskID && pri <= lastMinTaskID) { // busy -> idle
-				double busyFor = lastPriorityTrackTime - networkMetrics.priorityTimer[c];
-				networkMetrics.priorityBlocked[c] = false;
-				networkMetrics.priorityBlockedDuration[c] += busyFor;
-				networkMetrics.secSquaredPriorityBlocked[c] += busyFor * busyFor;
+				networkInfo.metrics.priorityBlocked[c] = false;
+				networkInfo.metrics.priorityBlockedDuration[c] += now - networkInfo.metrics.windowedPriorityTimer[c];
+				networkInfo.metrics.priorityMaxBlockedDuration[c] = std::max(networkInfo.metrics.priorityMaxBlockedDuration[c], now - networkInfo.metrics.priorityTimer[c]);
 			}
 			if (pri <= minTaskID && pri > lastMinTaskID) { // idle -> busy
-				networkMetrics.priorityBlocked[c] = true;
-				networkMetrics.priorityTimer[c] = now;
+				networkInfo.metrics.priorityBlocked[c] = true;
+				networkInfo.metrics.priorityTimer[c] = now;
+				networkInfo.metrics.windowedPriorityTimer[c] = now;
 			}
 		}
 	}

 	lastMinTaskID = minTaskID;
-	lastPriorityTrackTime = now;
 }

 void Net2::processThreadReady() {
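The rewritten trackMinPriority drops the squared-duration accumulator in favor of two timers per bin: priorityTimer (set when the bin first becomes busy) feeds a running maximum stretch, while windowedPriorityTimer (reset by the system monitor each reporting interval) feeds per-interval busy time. A compact sketch of that two-timer bookkeeping for a single bin (names mirror the diff; the report interval is arbitrary):

    #include <algorithm>
    #include <cstdio>

    // Two-timer pattern: total busy seconds per reporting window, plus the
    // longest single blocked stretch seen since the last report.
    struct BinTracker {
        bool blocked = false;
        double priorityTimer = 0;          // when the current blocked stretch began
        double windowedPriorityTimer = 0;  // restarted at every report
        double blockedDuration = 0;        // busy seconds accumulated this window
        double maxBlockedDuration = 0;     // longest stretch, reset after reporting

        void becomeBusy(double now) { blocked = true; priorityTimer = windowedPriorityTimer = now; }
        void becomeIdle(double now) {
            blocked = false;
            blockedDuration += now - windowedPriorityTimer;
            maxBlockedDuration = std::max(maxBlockedDuration, now - priorityTimer);
        }
        void report(double now) {  // what the system monitor does each interval
            if (blocked) {
                blockedDuration += now - windowedPriorityTimer;
                maxBlockedDuration = std::max(maxBlockedDuration, now - priorityTimer);
                windowedPriorityTimer = now;
            }
            printf("busy=%.1fs maxStretch=%.1fs\n", blockedDuration, maxBlockedDuration);
            blockedDuration = 0;
            maxBlockedDuration = 0;
        }
    };

    int main() {
        BinTracker t;
        t.becomeBusy(0.0);
        t.report(3.0);      // still busy at report time: busy=3.0, maxStretch=3.0
        t.becomeIdle(5.0);  // stretch totals 5s, but only 2s fall in this window
        t.report(6.0);      // busy=2.0, maxStretch=5.0
    }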
@@ -772,7 +770,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, T
 	int64_t elapsed = tscEnd-tscBegin;
 	if (elapsed > FLOW_KNOBS->TSC_YIELD_TIME && tscBegin > 0) {
 		int i = std::min<double>(NetworkMetrics::SLOW_EVENT_BINS-1, log( elapsed/1e6 ) / log(2.));
-		++networkMetrics.countSlowEvents[i];
+		++networkInfo.metrics.countSlowEvents[i];
 		int64_t warnThreshold = g_network->isSimulated() ? 10e9 : 500e6;

 		//printf("SlowTask: %d, %d yields\n", (int)(elapsed/1e6), numYields);
@@ -133,12 +133,12 @@ std::string removeWhitespace(const std::string &t)
 	if (found != std::string::npos)
 		str.erase(found + 1);
 	else
-		str.clear(); // str is all whitespace
+		str.clear(); // str is all whitespace
 	found = str.find_first_not_of(ws);
 	if (found != std::string::npos)
 		str.erase(0, found);
 	else
-		str.clear(); // str is all whitespace
+		str.clear(); // str is all whitespace

 	return str;
 }
@@ -1786,7 +1786,7 @@ bool deleteFile( std::string const& filename ) {
 #endif
 	Error e = systemErrorCodeToError();
 	TraceEvent(SevError, "DeleteFile").detail("Filename", filename).GetLastError().error(e);
-	throw errno;
+	throw e;
 }

 static void createdDirectory() { INJECT_FAULT( platform_error, "createDirectory" ); }
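The throw errno fix above matters because in C++ a thrown int is never caught by catch (Error&); the raw errno value would sail past Flow's error handling entirely. A tiny demonstration of the difference (the Error type here is a hypothetical stand-in for Flow's):

    #include <cstdio>

    struct Error { int code; };  // stand-in for Flow's Error

    void bad()  { throw 2; }         // like `throw errno;` -- throws a plain int
    void good() { throw Error{2}; }  // like `throw e;`     -- throws a typed error

    int main() {
        try { bad(); }
        catch (Error&) { printf("caught Error\n"); }
        catch (...)    { printf("int escaped the Error handler\n"); }

        try { good(); }
        catch (Error& e) { printf("caught Error code=%d\n", e.code); }
    }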
@@ -2805,6 +2805,8 @@ extern volatile bool net2backtraces_overflow;
 extern volatile int64_t net2backtraces_count;
 extern std::atomic<int64_t> net2liveness;
 extern void initProfiling();
+
+std::atomic<double> checkThreadTime;
 #endif

 volatile thread_local bool profileThread = false;
@@ -2852,7 +2854,9 @@ void profileHandler(int sig) {
 	// We are casting away the volatile-ness of the backtrace array, but we believe that should be reasonably safe in the signal handler
 	ProfilingSample* ps = const_cast<ProfilingSample*>((volatile ProfilingSample*)(net2backtraces + net2backtraces_offset));

-	ps->timestamp = timer();
+	// We can only read the check thread time in a signal handler if the atomic is lock free.
+	// We can't get the time from a timer() call because it's not signal safe.
+	ps->timestamp = checkThreadTime.is_lock_free() ? checkThreadTime.load() : 0;

 	// SOMEDAY: should we limit the maximum number of frames from backtrace beyond just available space?
 	size_t size = backtrace(ps->frames, net2backtraces_max - net2backtraces_offset - 2);
@@ -2899,6 +2903,7 @@ void* checkThread(void *arg) {
 		}

 		lastSignal = t;
+		checkThreadTime.store(lastSignal);
 		pthread_kill(mainThread, SIGPROF);
 	}
 }
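Taken together, these two hunks move timestamping out of the signal handler: the watchdog thread stores the time into a lock-free std::atomic<double> before raising SIGPROF, and the handler only performs an atomic load, which is async-signal-safe when is_lock_free() holds. A minimal sketch of the pattern (signal plumbing omitted; the names are stand-ins):

    #include <atomic>
    #include <cstdio>

    // A regular thread publishes the timestamp; the signal handler may only do
    // an atomic, lock-free load (timer()/clock calls are not signal safe).
    std::atomic<double> checkThreadTime{0.0};

    void watchdogTick(double now) {
        checkThreadTime.store(now);  // on a normal thread, before pthread_kill
    }

    double signalSafeTimestamp() {
        // In a real handler, fall back to 0 if the atomic is not lock free.
        return checkThreadTime.is_lock_free() ? checkThreadTime.load() : 0.0;
    }

    int main() {
        watchdogTick(1234.5);
        printf("sampled ts: %.1f\n", signalSafeTimestamp());
    }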
@@ -95,8 +95,8 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
 			.detail("MachineID", machineState.machineId)
 			.detail("AIOSubmitCount", netData.countAIOSubmit - statState->networkState.countAIOSubmit)
 			.detail("AIOCollectCount", netData.countAIOCollect - statState->networkState.countAIOCollect)
-			.detail("AIOSubmitLag", (g_network->networkMetrics.secSquaredSubmit - statState->networkMetricsState.secSquaredSubmit) / currentStats.elapsed)
-			.detail("AIODiskStall", (g_network->networkMetrics.secSquaredDiskStall - statState->networkMetricsState.secSquaredDiskStall) / currentStats.elapsed)
+			.detail("AIOSubmitLag", (g_network->networkInfo.metrics.secSquaredSubmit - statState->networkMetricsState.secSquaredSubmit) / currentStats.elapsed)
+			.detail("AIODiskStall", (g_network->networkInfo.metrics.secSquaredDiskStall - statState->networkMetricsState.secSquaredDiskStall) / currentStats.elapsed)
 			.detail("CurrentConnections", netData.countConnEstablished - netData.countConnClosedWithError - netData.countConnClosedWithoutError)
 			.detail("ConnectionsEstablished", (double) (netData.countConnEstablished - statState->networkState.countConnEstablished) / currentStats.elapsed)
 			.detail("ConnectionsClosed", ((netData.countConnClosedWithError - statState->networkState.countConnClosedWithError) + (netData.countConnClosedWithoutError - statState->networkState.countConnClosedWithoutError)) / currentStats.elapsed)
@@ -142,23 +142,22 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
 			.detail("ReactTime", netData.countReactTime - statState->networkState.countReactTime);

 		for (int i = 0; i<NetworkMetrics::SLOW_EVENT_BINS; i++) {
-			if (int c = g_network->networkMetrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) {
+			if (int c = g_network->networkInfo.metrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) {
 				n.detail(format("SlowTask%dM", 1 << i).c_str(), c);
 			}
 		}

-		for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != TaskPriority::Zero; i++) {
-			if(g_network->networkMetrics.priorityBlocked[i]) {
-				double lastSegment = std::min(currentStats.elapsed, now() - g_network->networkMetrics.priorityTimer[i]);
-				g_network->networkMetrics.priorityBlockedDuration[i] += lastSegment;
-				g_network->networkMetrics.secSquaredPriorityBlocked[i] += lastSegment * lastSegment;
-				g_network->networkMetrics.priorityTimer[i] = now();
+		for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkInfo.metrics.priorityBins[i] != TaskPriority::Zero; i++) {
+			if(g_network->networkInfo.metrics.priorityBlocked[i]) {
+				g_network->networkInfo.metrics.priorityBlockedDuration[i] += now() - g_network->networkInfo.metrics.windowedPriorityTimer[i];
+				g_network->networkInfo.metrics.priorityMaxBlockedDuration[i] = std::max(g_network->networkInfo.metrics.priorityMaxBlockedDuration[i], now() - g_network->networkInfo.metrics.priorityTimer[i]);
+				g_network->networkInfo.metrics.windowedPriorityTimer[i] = now();
 			}

-			double blocked = g_network->networkMetrics.priorityBlockedDuration[i] - statState->networkMetricsState.priorityBlockedDuration[i];
-			double s2Blocked = g_network->networkMetrics.secSquaredPriorityBlocked[i] - statState->networkMetricsState.secSquaredPriorityBlocked[i];
-			n.detail(format("PriorityBusy%d", g_network->networkMetrics.priorityBins[i]).c_str(), blocked);
-			n.detail(format("SumOfSquaredPriorityBusy%d", g_network->networkMetrics.priorityBins[i]).c_str(), s2Blocked);
+			n.detail(format("PriorityBusy%d", g_network->networkInfo.metrics.priorityBins[i]).c_str(), std::min(currentStats.elapsed, g_network->networkInfo.metrics.priorityBlockedDuration[i] - statState->networkMetricsState.priorityBlockedDuration[i]));
+			n.detail(format("PriorityMaxBusy%d", g_network->networkInfo.metrics.priorityBins[i]).c_str(), g_network->networkInfo.metrics.priorityMaxBlockedDuration[i]);
+
+			g_network->networkInfo.metrics.priorityMaxBlockedDuration[i] = 0;
 		}

 		n.trackLatest("NetworkMetrics");
@@ -288,7 +287,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
 		}
 	}
 #endif
-	statState->networkMetricsState = g_network->networkMetrics;
+	statState->networkMetricsState = g_network->networkInfo.metrics;
 	statState->networkState = netData;
 	return currentStats;
 }
116 flow/Trace.cpp
@@ -113,7 +113,7 @@ struct SuppressionMap {
 };

 TraceBatch g_traceBatch;
-thread_local trace_clock_t g_trace_clock = TRACE_CLOCK_REALTIME;
+std::atomic<trace_clock_t> g_trace_clock{ TRACE_CLOCK_NOW };

 LatestEventCache latestEventCache;
 SuppressionMap suppressedEvents;
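Switching g_trace_clock from thread_local to std::atomic changes the semantics, not just the syntax: previously every thread had its own copy, so a clock source chosen on the main thread (e.g. via OPT_TRACECLOCK above) never reached other threads, whereas one atomic is a single shared value any thread can read or set safely. A small sketch contrasting the two (types simplified):

    #include <atomic>
    #include <cstdio>
    #include <thread>

    enum trace_clock_t { TRACE_CLOCK_NOW, TRACE_CLOCK_REALTIME };

    thread_local trace_clock_t tlsClock = TRACE_CLOCK_NOW;      // old: per-thread copy
    std::atomic<trace_clock_t> sharedClock{ TRACE_CLOCK_NOW };  // new: one shared value

    int main() {
        tlsClock = TRACE_CLOCK_REALTIME;          // changes only this thread's copy
        sharedClock.store(TRACE_CLOCK_REALTIME);  // visible to every thread

        std::thread t([] {
            printf("thread_local seen by worker: %d (still NOW)\n", (int)tlsClock);
            printf("atomic seen by worker:       %d (REALTIME)\n", (int)sharedClock.load());
        });
        t.join();
    }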
@@ -422,7 +422,7 @@ public:
 			TraceEventFields rolledFields;
 			for(auto itr = events[idx].begin(); itr != events[idx].end(); ++itr) {
 				if(itr->first == "Time") {
-					rolledFields.addField("Time", format("%.6f", (g_trace_clock == TRACE_CLOCK_NOW) ? now() : timer()));
+					rolledFields.addField("Time", format("%.6f", TraceEvent::getCurrentTime()));
 					rolledFields.addField("OriginalTime", itr->second);
 				}
 				else if(itr->first == "TrackLatestType") {
@@ -653,13 +653,13 @@ void removeTraceRole(std::string role) {
 	g_traceLog.removeRole(role);
 }

-TraceEvent::TraceEvent( const char* type, UID id ) : id(id), type(type), severity(SevInfo), initialized(false), enabled(true) {
+TraceEvent::TraceEvent( const char* type, UID id ) : id(id), type(type), severity(SevInfo), initialized(false), enabled(true), logged(false) {
 	g_trace_depth++;
 	setMaxFieldLength(0);
 	setMaxEventLength(0);
 }
 TraceEvent::TraceEvent( Severity severity, const char* type, UID id )
-	: id(id), type(type), severity(severity), initialized(false),
+	: id(id), type(type), severity(severity), initialized(false), logged(false),
 	  enabled(g_network == nullptr || FLOW_KNOBS->MIN_TRACE_SEVERITY <= severity) {
 	g_trace_depth++;
 	setMaxFieldLength(0);
@@ -668,7 +668,7 @@ TraceEvent::TraceEvent( Severity severity, const char* type, UID id )
 TraceEvent::TraceEvent( TraceInterval& interval, UID id )
 	: id(id), type(interval.type),
 	  severity(interval.severity),
-	  initialized(false),
+	  initialized(false), logged(false),
 	  enabled(g_network == nullptr || FLOW_KNOBS->MIN_TRACE_SEVERITY <= interval.severity) {

 	g_trace_depth++;
@@ -680,7 +680,7 @@ TraceEvent::TraceEvent( TraceInterval& interval, UID id )
 TraceEvent::TraceEvent( Severity severity, TraceInterval& interval, UID id )
 	: id(id), type(interval.type),
 	  severity(severity),
-	  initialized(false),
+	  initialized(false), logged(false),
 	  enabled(g_network == nullptr || FLOW_KNOBS->MIN_TRACE_SEVERITY <= severity) {

 	g_trace_depth++;
@@ -701,6 +701,7 @@ bool TraceEvent::init( TraceInterval& interval ) {
 }

 bool TraceEvent::init() {
+	ASSERT(!logged);
 	if(initialized) {
 		return enabled;
 	}
@@ -723,26 +724,12 @@ bool TraceEvent::init() {
 	if(enabled) {
 		tmpEventMetric = new DynamicEventMetric(MetricNameRef());

-		double time;
-		if(g_trace_clock == TRACE_CLOCK_NOW) {
-			if(!g_network) {
-				static double preNetworkTime = timer_monotonic();
-				time = preNetworkTime;
-			}
-			else {
-				time = now();
-			}
-		}
-		else {
-			time = timer();
-		}
-
 		if(err.isValid() && err.isInjectedFault() && severity == SevError) {
 			severity = SevWarnAlways;
 		}

 		detail("Severity", int(severity));
-		detailf("Time", "%.6f", time);
+		detailf("Time", "%.6f", getCurrentTime());
 		detail("Type", type);
 		if(g_network && g_network->isSimulated()) {
 			NetworkAddress local = g_network->getLocalAddress();
@@ -765,6 +752,7 @@ bool TraceEvent::init() {
 }

 TraceEvent& TraceEvent::errorImpl(class Error const& error, bool includeCancelled) {
+	ASSERT(!logged);
 	if (error.code() != error_code_actor_cancelled || includeCancelled) {
 		err = error;
 		if (initialized) {
@@ -847,12 +835,14 @@ TraceEvent& TraceEvent::detailfNoMetric( std::string&& key, const char* valueFor
 }

 TraceEvent& TraceEvent::trackLatest( const char *trackingKey ){
+	ASSERT(!logged);
 	this->trackingKey = trackingKey;
 	ASSERT( this->trackingKey.size() != 0 && this->trackingKey[0] != '/' && this->trackingKey[0] != '\\');
 	return *this;
 }

 TraceEvent& TraceEvent::sample( double sampleRate, bool logSampleRate ) {
+	ASSERT(!logged);
 	if(enabled) {
 		if(initialized) {
 			TraceEvent(g_network && g_network->isSimulated() ? SevError : SevWarnAlways, std::string(TRACE_EVENT_INVALID_SUPPRESSION).append(type).c_str()).suppressFor(5);
@@ -870,6 +860,7 @@ TraceEvent& TraceEvent::sample( double sampleRate, bool logSampleRate ) {
 }

 TraceEvent& TraceEvent::suppressFor( double duration, bool logSuppressedEventCount ) {
+	ASSERT(!logged);
 	if(enabled) {
 		if(initialized) {
 			TraceEvent(g_network && g_network->isSimulated() ? SevError : SevWarnAlways, std::string(TRACE_EVENT_INVALID_SUPPRESSION).append(type).c_str()).suppressFor(5);
@@ -896,6 +887,7 @@ TraceEvent& TraceEvent::suppressFor( double duration, bool logSuppressedEventCou
 }

 TraceEvent& TraceEvent::setMaxFieldLength(int maxFieldLength) {
+	ASSERT(!logged);
 	if(maxFieldLength == 0) {
 		this->maxFieldLength = FLOW_KNOBS ? FLOW_KNOBS->MAX_TRACE_FIELD_LENGTH : 495;
 	}
@@ -907,6 +899,7 @@ TraceEvent& TraceEvent::setMaxFieldLength(int maxFieldLength) {
 }

 TraceEvent& TraceEvent::setMaxEventLength(int maxEventLength) {
+	ASSERT(!logged);
 	if(maxEventLength == 0) {
 		this->maxEventLength = FLOW_KNOBS ? FLOW_KNOBS->MAX_TRACE_EVENT_LENGTH : 4000;
 	}
@@ -934,44 +927,52 @@ unsigned long TraceEvent::CountEventsLoggedAt(Severity sev) {
 }

 TraceEvent& TraceEvent::backtrace(const std::string& prefix) {
+	ASSERT(!logged);
 	if (this->severity == SevError || !enabled) return *this; // We'll backtrace this later in ~TraceEvent
 	return detail(prefix + "Backtrace", platform::get_backtrace());
 }

-TraceEvent::~TraceEvent() {
-	init();
-	try {
-		if (enabled) {
-			if (this->severity == SevError) {
-				severity = SevInfo;
-				backtrace();
-				severity = SevError;
-			}
+void TraceEvent::log() {
+	if(!logged) {
+		init();
+		try {
+			if (enabled) {
+				if (this->severity == SevError) {
+					severity = SevInfo;
+					backtrace();
+					severity = SevError;
+				}

-			if(isNetworkThread()) {
-				TraceEvent::eventCounts[severity/10]++;
-			}
+				if(isNetworkThread()) {
+					TraceEvent::eventCounts[severity/10]++;
+				}

-			g_traceLog.writeEvent( fields, trackingKey, severity > SevWarnAlways );
+				g_traceLog.writeEvent( fields, trackingKey, severity > SevWarnAlways );

-			if (g_traceLog.isOpen()) {
-				// Log Metrics
-				if(g_traceLog.logTraceEventMetrics && isNetworkThread()) {
-					// Get the persistent Event Metric representing this trace event and push the fields (details) accumulated in *this to it and then log() it.
-					// Note that if the event metric is disabled it won't actually be logged BUT any new fields added to it will be registered.
-					// If the event IS logged, a timestamp will be returned, if not then 0. Either way, pass it through to be used if possible
-					// in the Sev* event metrics.
-					uint64_t event_ts = DynamicEventMetric::getOrCreateInstance(format("TraceEvent.%s", type), StringRef(), true)->setFieldsAndLogFrom(tmpEventMetric);
-					g_traceLog.log(severity, type, id, event_ts);
+				if (g_traceLog.isOpen()) {
+					// Log Metrics
+					if(g_traceLog.logTraceEventMetrics && isNetworkThread()) {
+						// Get the persistent Event Metric representing this trace event and push the fields (details) accumulated in *this to it and then log() it.
+						// Note that if the event metric is disabled it won't actually be logged BUT any new fields added to it will be registered.
+						// If the event IS logged, a timestamp will be returned, if not then 0. Either way, pass it through to be used if possible
+						// in the Sev* event metrics.
+						uint64_t event_ts = DynamicEventMetric::getOrCreateInstance(format("TraceEvent.%s", type), StringRef(), true)->setFieldsAndLogFrom(tmpEventMetric);
+						g_traceLog.log(severity, type, id, event_ts);
+					}
 				}
 			}
+		} catch( Error &e ) {
+			TraceEvent(SevError, "TraceEventLoggingError").error(e,true);
 		}
-	} catch( Error &e ) {
-		TraceEvent(SevError, "TraceEventDestructorError").error(e,true);
+		delete tmpEventMetric;
+		g_trace_depth--;
+		logged = true;
 	}
-	delete tmpEventMetric;
-	g_trace_depth--;
+}
+
+TraceEvent::~TraceEvent() {
+	log();
 }

 thread_local bool TraceEvent::networkThread = false;
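With the destructor reduced to a call into log() guarded by the logged flag, an event can now be flushed explicitly before end of scope, and the later destructor run becomes a no-op; the ASSERT(!logged) calls added throughout make post-log mutation a hard error. A hypothetical usage sketch of that idempotent-log pattern (a simplified stand-in class, not the real TraceEvent):

    #include <cstdio>
    #include <string>

    // Sketch of destructor-delegates-to-log(): log() is idempotent via a
    // `logged` flag, so explicit flushing and scope exit can coexist.
    class Event {
        std::string type;
        bool logged = false;
    public:
        explicit Event(std::string t) : type(std::move(t)) {}
        Event& detail(const char* k, int v) {
            // the real code ASSERTs !logged here: details after log() are a bug
            printf("  %s=%d\n", k, v);
            return *this;
        }
        void log() {
            if (!logged) {  // a second call (e.g. from ~Event) is a no-op
                printf("flush %s\n", type.c_str());
                logged = true;
            }
        }
        ~Event() { log(); }
    };

    int main() {
        Event e("ExampleEvent");
        e.detail("Value", 42);
        e.log();  // flushed here, while surrounding state is still alive
    }             // destructor runs log() again; the flag makes it a no-op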
@@ -979,13 +980,26 @@ thread_local bool TraceEvent::networkThread = false;
 void TraceEvent::setNetworkThread() {
 	traceEventThrottlerCache = new TransientThresholdMetricSample<Standalone<StringRef>>(FLOW_KNOBS->TRACE_EVENT_METRIC_UNITS_PER_SAMPLE, FLOW_KNOBS->TRACE_EVENT_THROTTLER_MSG_LIMIT);
 	networkThread = true;
-	g_trace_clock = TRACE_CLOCK_NOW;
 }

 bool TraceEvent::isNetworkThread() {
 	return networkThread;
 }

+double TraceEvent::getCurrentTime() {
+	if(g_trace_clock.load() == TRACE_CLOCK_NOW) {
+		if(!isNetworkThread() || !g_network) {
+			return timer_monotonic();
+		}
+		else {
+			return now();
+		}
+	}
+	else {
+		return timer();
+	}
+}
+
 TraceInterval& TraceInterval::begin() {
 	pairID = nondeterministicRandom()->randomUniqueID();
 	count = 0;
@@ -993,20 +1007,20 @@ TraceInterval& TraceInterval::begin() {
 }

 void TraceBatch::addEvent( const char *name, uint64_t id, const char *location ) {
-	eventBatch.push_back( EventInfo(g_trace_clock == TRACE_CLOCK_NOW ? now() : timer(), name, id, location));
+	eventBatch.push_back( EventInfo(TraceEvent::getCurrentTime(), name, id, location));
 	if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
 		dump();
 }

 void TraceBatch::addAttach( const char *name, uint64_t id, uint64_t to ) {
-	attachBatch.push_back( AttachInfo(g_trace_clock == TRACE_CLOCK_NOW ? now() : timer(), name, id, to));
+	attachBatch.push_back( AttachInfo(TraceEvent::getCurrentTime(), name, id, to));
 	if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
 		dump();
 }

 void TraceBatch::addBuggify( int activated, int line, std::string file ) {
 	if( g_network ) {
-		buggifyBatch.push_back( BuggifyInfo(g_trace_clock == TRACE_CLOCK_NOW ? now() : timer(), activated, line, file));
+		buggifyBatch.push_back( BuggifyInfo(TraceEvent::getCurrentTime(), activated, line, file));
 		if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
 			dump();
 	} else {
12 flow/Trace.h
@@ -22,6 +22,7 @@
 #define FLOW_TRACE_H
 #pragma once

+#include <atomic>
 #include <stdarg.h>
 #include <stdint.h>
 #include <string>
@@ -381,6 +382,8 @@ struct TraceEvent {
 	static void setNetworkThread();
 	static bool isNetworkThread();

+	static double getCurrentTime();
+
 	//Must be called directly after constructing the trace event
 	TraceEvent& error(const class Error& e, bool includeCancelled=false) {
 		if (enabled) {
@@ -461,6 +464,12 @@ public:

 	TraceEvent& GetLastError();

+	bool isEnabled() const {
+		return enabled;
+	}
+
+	void log();
+
 	~TraceEvent(); // Actually logs the event

 	// Return the number of invocations of TraceEvent() at the specified logging level.
@@ -471,6 +480,7 @@ public:
 private:
 	bool initialized;
 	bool enabled;
+	bool logged;
 	std::string trackingKey;
 	TraceEventFields fields;
 	Severity severity;
@@ -565,7 +575,7 @@ void addTraceRole(std::string role);
 void removeTraceRole(std::string role);

 enum trace_clock_t { TRACE_CLOCK_NOW, TRACE_CLOCK_REALTIME };
-extern thread_local trace_clock_t g_trace_clock;
+extern std::atomic<trace_clock_t> g_trace_clock;
 extern TraceBatch g_traceBatch;

 #define DUMPTOKEN(name) \
@@ -45,6 +45,9 @@ enum class TaskPriority {
 	FailureMonitor = 8700,
 	ResolutionMetrics = 8700,
+	Worker = 8660,
+	ClusterControllerWorker = 8656,
+	ClusterControllerRecruit = 8654,
+	ClusterControllerRegister = 8652,
 	ClusterController = 8650,
 	MasterTLogRejoin = 8646,
 	ProxyStorageRejoin = 8645,
@@ -56,9 +59,14 @@ enum class TaskPriority {
 	TLogCommitReply = 8580,
 	TLogCommit = 8570,
-	ProxyResolverReply = 8560,
-	ProxyCommitBatcher = 8550,
-	ProxyCommit = 8540,
+	ProxyGetRawCommittedVersion = 8565,
+	ProxyCommitYield3 = 8562,
+	ProxyTLogCommitReply = 8560,
+	ProxyCommitYield2 = 8557,
+	ProxyResolverReply = 8555,
+	ProxyMasterVersionReply = 8550,
+	ProxyCommitYield1 = 8547,
+	ProxyCommit = 8545,
+	ProxyCommitBatcher = 8540,
 	TLogConfirmRunningReply = 8530,
 	TLogConfirmRunning = 8520,
 	ProxyGRVTimer = 8510,
@@ -297,24 +305,31 @@ template <class T> class Promise;

 struct NetworkMetrics {
 	enum { SLOW_EVENT_BINS = 16 };
-	uint64_t countSlowEvents[SLOW_EVENT_BINS];
+	uint64_t countSlowEvents[SLOW_EVENT_BINS] = {};

 	enum { PRIORITY_BINS = 9 };
-	TaskPriority priorityBins[ PRIORITY_BINS ];
-	bool priorityBlocked[PRIORITY_BINS];
-	double priorityBlockedDuration[PRIORITY_BINS];
-	double secSquaredPriorityBlocked[PRIORITY_BINS];
-	double priorityTimer[PRIORITY_BINS];
+	TaskPriority priorityBins[PRIORITY_BINS] = {};
+	bool priorityBlocked[PRIORITY_BINS] = {};
+	double priorityBlockedDuration[PRIORITY_BINS] = {};
+	double priorityMaxBlockedDuration[PRIORITY_BINS] = {};
+	double priorityTimer[PRIORITY_BINS] = {};
+	double windowedPriorityTimer[PRIORITY_BINS] = {};

-	double oldestAlternativesFailure;
-	double newestAlternativesFailure;
-	double lastAlternativesFailureSkipDelay;
-	double lastSync;
+	double secSquaredSubmit = 0;
+	double secSquaredDiskStall = 0;

-	double secSquaredSubmit;
-	double secSquaredDiskStall;
+	NetworkMetrics() {}
+};

-	NetworkMetrics() { memset(this, 0, sizeof(*this)); }
+struct NetworkInfo {
+	NetworkMetrics metrics;
+	double oldestAlternativesFailure = 0;
+	double newestAlternativesFailure = 0;
+	double lastAlternativesFailureSkipDelay = 0;
+
+	std::map<std::pair<IPAddress, uint16_t>, double> serverTLSConnectionThrottler;
+
+	NetworkInfo() {}
 };

 class IEventFD : public ReferenceCounted<IEventFD> {
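The constructor change here is load-bearing: NetworkMetrics() { memset(this, 0, sizeof(*this)); } was only legal while the struct stayed trivial, and the new NetworkInfo carries a std::map, which memset would corrupt. Default member initializers (= {} and = 0) zero the trivial members while leaving class-type members properly constructed. A small illustration of the idiom (types simplified):

    #include <cstdio>
    #include <map>
    #include <string>

    // Zero-initialization via default member initializers; no memset needed,
    // so class-type members like std::map stay correctly constructed.
    struct Metrics {
        double busy[4] = {};  // zeroed
        long slowEvents = 0;  // zeroed
    };

    struct Info {
        Metrics metrics;                          // value-initialized per above
        std::map<std::string, double> throttler;  // memset(this,...) would corrupt this
        double lastFailure = 0;
    };

    int main() {
        Info info;
        printf("busy[0]=%.1f slow=%ld mapSize=%zu\n",
               info.metrics.busy[0], info.metrics.slowEvents, info.throttler.size());
    }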
@@ -465,7 +480,7 @@ public:
 		return (netAddressesFuncPtr) ? reinterpret_cast<NetworkAddressesFuncPtr>(netAddressesFuncPtr)() : NetworkAddressList();
 	}

-	NetworkMetrics networkMetrics;
+	NetworkInfo networkInfo;
 protected:
 	INetwork() {}
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3 -B
+#!/usr/bin/env python3
 #
 # make_public.py
 #
@@ -32,7 +32,7 @@

 <Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
     <Product Name='$(var.Title)'
-             Id='{51E254F0-440E-4746-B7B3-83051EB87E6B}'
+             Id='{FD48B02E-BC76-48C8-9709-FF3538200932}'
              UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
              Version='$(var.Version)'
              Manufacturer='$(var.Manufacturer)'