Merge branch 'master' into mengxu/storage-engine-switch-PR-v2

commit bd80a67d46
@@ -18,7 +18,7 @@
 # limitations under the License.
 cmake_minimum_required(VERSION 3.13)
 project(foundationdb
-  VERSION 6.1.0
+  VERSION 7.0.0
   DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions."
   HOMEPAGE_URL "http://www.foundationdb.org/"
   LANGUAGES C CXX ASM)

@@ -1,21 +1,44 @@
 #!/bin/bash

 set -e
 OPTIONS=''

+# Get compiler version and major version
+COMPILER_VER=$("${CC}" -dumpversion)
+COMPILER_MAJVER="${COMPILER_VER%%\.*}"
+
+# Add linker, if specified and valid
+# The linker to use for building:
+# can be LD (system default, default choice), GOLD, LLD, or BFD
+if [ -n "${USE_LD}" ] && \
+   (([[ "${CC}" == *"gcc"* ]] && [ "${COMPILER_MAJVER}" -ge 9 ]) || \
+    ([[ "${CXX}" == *"clang++"* ]] && [ "${COMPILER_MAJVER}" -ge 4 ]) )
+then
+    if [ "${PLATFORM}" == "linux" ]; then
+        if [ "${USE_LD}" == "BFD" ]; then
+            OPTIONS+='-fuse-ld=bfd -Wl,--disable-new-dtags'
+        elif [ "${USE_LD}" == "GOLD" ]; then
+            OPTIONS+='-fuse-ld=gold -Wl,--disable-new-dtags'
+        elif [ "${USE_LD}" == "LLD" ]; then
+            OPTIONS+='-fuse-ld=lld -Wl,--disable-new-dtags'
+        elif [ "${USE_LD}" != "DEFAULT" ] && [ "${USE_LD}" != "LD" ]; then
+            echo 'USE_LD must be set to DEFAULT, LD, BFD, GOLD, or LLD!'
+            exit 1
+        fi
+    fi
+fi
+
 case $1 in
     Application | DynamicLibrary)
         echo "Linking $3"

         if [ "$1" = "DynamicLibrary" ]; then
-            OPTIONS="-shared"
+            OPTIONS+=" -shared"
             if [ "$PLATFORM" = "linux" ]; then
-                OPTIONS="$OPTIONS -Wl,-z,noexecstack -Wl,-soname,$( basename $3 )"
+                OPTIONS+=" -Wl,-z,noexecstack -Wl,-soname,$( basename $3 )"
+            elif [ "$PLATFORM" = "osx" ]; then
+                OPTIONS+=" -Wl,-dylib_install_name -Wl,$( basename $3 )"
             fi
-            if [ "$PLATFORM" = "osx" ]; then
-                OPTIONS="$OPTIONS -Wl,-dylib_install_name -Wl,$( basename $3 )"
-            fi
         else
             OPTIONS=
         fi

         OPTIONS=$( eval echo "$OPTIONS $LDFLAGS \$$2_OBJECTS \$$2_LIBS \$$2_STATIC_LIBS_REAL \$$2_LDFLAGS -o $3" )

@@ -5,7 +5,7 @@ set(ALLOC_INSTRUMENTATION OFF CACHE BOOL "Instrument alloc")
 set(WITH_UNDODB OFF CACHE BOOL "Use rr or undodb")
 set(USE_ASAN OFF CACHE BOOL "Compile with address sanitizer")
 set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release")
-set(USE_LD "LD" CACHE STRING "The linker to use for building: can be LD (system default, default choice), GOLD, or LLD")
+set(USE_LD "DEFAULT" CACHE STRING "The linker to use for building: can be LD (system default, default choice), BFD, GOLD, or LLD")
 set(USE_LIBCXX OFF CACHE BOOL "Use libc++")
 set(USE_CCACHE OFF CACHE BOOL "Use ccache for compilation if available")
 set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info")
@@ -90,9 +90,23 @@ else()
   set(GCC YES)
 endif()

+# Use the linker environmental variable, if specified and valid
+if ((USE_LD STREQUAL "DEFAULT") AND (NOT "$ENV{USE_LD}" STREQUAL ""))
+  string(TOUPPER "$ENV{USE_LD}" USE_LDENV)
+  if (("${USE_LDENV}" STREQUAL "LD") OR ("${USE_LDENV}" STREQUAL "GOLD") OR ("${USE_LDENV}" STREQUAL "LLD") OR ("${USE_LDENV}" STREQUAL "BFD") OR ("${USE_LDENV}" STREQUAL "DEFAULT"))
+    set(USE_LD "${USE_LDENV}")
+  else()
+    message (FATAL_ERROR "USE_LD must be set to DEFAULT, LD, BFD, GOLD, or LLD!")
+  endif()
+endif()
+
 # check linker flags.
-if ((NOT (USE_LD STREQUAL "LD")) AND (NOT (USE_LD STREQUAL "GOLD")) AND (NOT (USE_LD STREQUAL "LLD")))
-  message (FATAL_ERROR "USE_LD must be set to LD, GOLD, or LLD!")
+if (USE_LD STREQUAL "DEFAULT")
+  set(USE_LD "LD")
+else()
+  if ((NOT (USE_LD STREQUAL "LD")) AND (NOT (USE_LD STREQUAL "GOLD")) AND (NOT (USE_LD STREQUAL "LLD")) AND (NOT (USE_LD STREQUAL "BFD")))
+    message (FATAL_ERROR "USE_LD must be set to DEFAULT, LD, BFD, GOLD, or LLD!")
+  endif()
 endif()

 # if USE_LD=LD, then we don't do anything, defaulting to whatever system
@@ -100,6 +114,11 @@ else()
 # implies the default xcode linker, and other distros may choose others by
 # default).

+if(USE_LD STREQUAL "BFD")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=bfd -Wl,--disable-new-dtags")
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=bfd -Wl,--disable-new-dtags")
+endif()
+
 if(USE_LD STREQUAL "GOLD")
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold -Wl,--disable-new-dtags")
   set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold -Wl,--disable-new-dtags")

@@ -199,14 +199,12 @@ The ``foundationdb.conf`` file contains several sections, detailed below. Note t
     ## foundationdb.conf
     ##
     ## Configuration file for FoundationDB server processes
     ## Full documentation is available in the FoundationDB Administration document.

     [fdbmonitor]
-    restart_delay = 60
     user = foundationdb
     group = foundationdb

-Contains basic configuration parameters of the ``fdbmonitor`` process. ``restart_delay`` specifies the number of seconds that ``fdbmonitor`` waits before restarting a failed process. ``user`` and ``group`` are used on Linux systems to control the privilege level of child processes.
+Contains basic configuration parameters of the ``fdbmonitor`` process. ``user`` and ``group`` are used on Linux systems to control the privilege level of child processes.

 ``[general]`` section
 -----------------------
@@ -215,8 +213,41 @@ Contains basic configuration parameters of the ``fdbmonitor`` process. ``restart

     [general]
     cluster_file = /etc/foundationdb/fdb.cluster
+    restart_delay = 60
+    ## restart_backoff and restart_delay_reset_interval default to the value that is used for restart_delay
+    # initial_restart_delay = 0
+    # restart_backoff = 60.0
+    # restart_delay_reset_interval = 60
+    # delete_envvars =
+    # kill_on_configuration_change = true
+    # disable_lifecycle_logging = false

-Contains settings applicable to all processes (e.g. fdbserver, backup_agent). The main setting of interest is ``cluster_file``, which specifies the location of the cluster file. This file and the directory that contains it must be writable by all processes (i.e. by the user or group set in the [fdbmonitor] section).
+Contains settings applicable to all processes (e.g. fdbserver, backup_agent).
+
+* ``cluster_file``: Specifies the location of the cluster file. This file and the directory that contains it must be writable by all processes (i.e. by the user or group set in the ``[fdbmonitor]`` section).
+* ``delete_envvars``: A space separated list of environment variables to remove from the environments of child processes. This can be used if the ``fdbmonitor`` process needs to be run with environment variables that are undesired in its children.
+* ``kill_on_configuration_change``: If ``true``, affected processes will be restarted whenever the configuration file changes. Defaults to ``true``.
+* ``disable_lifecycle_logging``: If ``true``, ``fdbmonitor`` will not write log events when processes start or terminate. Defaults to ``false``.
+
+The ``[general]`` section also contains some parameters to control how processes are restarted when they die. ``fdbmonitor`` uses backoff logic to prevent a process that dies repeatedly from cycling too quickly, and it also introduces up to +/-10% random jitter into the delay to avoid multiple processes all restarting simultaneously. ``fdbmonitor`` tracks separate backoff state for each process, so the restarting of one process will have no effect on the backoff behavior of another.
+
+* ``restart_delay``: The maximum number of seconds (subject to jitter) that fdbmonitor will delay before restarting a failed process.
+* ``initial_restart_delay``: The number of seconds ``fdbmonitor`` waits to restart a process the first time it dies. Defaults to 0 (i.e. the process gets restarted immediately).
+* ``restart_backoff``: Controls how quickly ``fdbmonitor`` backs off when a process dies repeatedly. The previous delay (or 1, if the previous delay is 0) is multiplied by ``restart_backoff`` to get the next delay, maxing out at the value of ``restart_delay``. Defaults to the value of ``restart_delay``, meaning that the second and subsequent failures will all delay ``restart_delay`` between restarts.
+* ``restart_delay_reset_interval``: The number of seconds a process must be running before resetting the backoff back to the value of ``initial_restart_delay``. Defaults to the value of ``restart_delay``.
+
+As an example, let's say the following parameters have been set:
+
+.. code-block:: ini
+
+    restart_delay = 60
+    initial_restart_delay = 0
+    restart_backoff = 2.0
+    restart_delay_reset_interval = 180
+
+The progression of delays for a process that fails repeatedly would be ``0, 2, 4, 8, 16, 32, 60, 60, ...``, each subject to a 10% random jitter. After the process stays alive for 180 seconds, the backoff would reset and the next failure would restart the process immediately.
+
+Using the default parameters, a process will restart immediately if it fails and then delay ``restart_delay`` seconds if it fails again within ``restart_delay`` seconds.
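A quick way to sanity-check the progression above is to replay the rule directly. The following is a minimal standalone C++ sketch of the documented backoff rule (an illustration only, not fdbmonitor's actual implementation):

.. code-block:: cpp

    #include <algorithm>
    #include <cstdio>

    // Hypothetical re-implementation of the documented backoff rule.
    double nextDelay(double prev, double restartBackoff, double restartDelay) {
        // "The previous delay (or 1, if the previous delay is 0) is multiplied
        // by restart_backoff", maxing out at the value of restart_delay.
        return std::min((prev == 0 ? 1.0 : prev) * restartBackoff, restartDelay);
    }

    int main() {
        double delay = 0; // initial_restart_delay = 0
        for (int failure = 1; failure <= 8; ++failure) {
            printf("failure %d: delay %gs\n", failure, delay);
            delay = nextDelay(delay, 2.0, 60.0); // restart_backoff = 2.0, restart_delay = 60
        }
        // Prints 0, 2, 4, 8, 16, 32, 60, 60 -- the progression quoted above.
        // The real fdbmonitor additionally applies up to +/-10% random jitter.
    }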
.. _foundationdb-conf-fdbserver:

@@ -10,38 +10,38 @@ macOS

 The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.

-* `FoundationDB-6.2.2.pkg <https://www.foundationdb.org/downloads/6.2.2/macOS/installers/FoundationDB-6.2.2.pkg>`_
+* `FoundationDB-6.2.3.pkg <https://www.foundationdb.org/downloads/6.2.3/macOS/installers/FoundationDB-6.2.3.pkg>`_

 Ubuntu
 ------

 The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.

-* `foundationdb-clients-6.2.2-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.2/ubuntu/installers/foundationdb-clients_6.2.2-1_amd64.deb>`_
-* `foundationdb-server-6.2.2-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.2/ubuntu/installers/foundationdb-server_6.2.2-1_amd64.deb>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.3-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.3/ubuntu/installers/foundationdb-clients_6.2.3-1_amd64.deb>`_
+* `foundationdb-server-6.2.3-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.3/ubuntu/installers/foundationdb-server_6.2.3-1_amd64.deb>`_ (depends on the clients package)

 RHEL/CentOS EL6
 ---------------

 The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.

-* `foundationdb-clients-6.2.2-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.2/rhel6/installers/foundationdb-clients-6.2.2-1.el6.x86_64.rpm>`_
-* `foundationdb-server-6.2.2-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.2/rhel6/installers/foundationdb-server-6.2.2-1.el6.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.3-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.3/rhel6/installers/foundationdb-clients-6.2.3-1.el6.x86_64.rpm>`_
+* `foundationdb-server-6.2.3-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.3/rhel6/installers/foundationdb-server-6.2.3-1.el6.x86_64.rpm>`_ (depends on the clients package)

 RHEL/CentOS EL7
 ---------------

 The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.

-* `foundationdb-clients-6.2.2-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.2/rhel7/installers/foundationdb-clients-6.2.2-1.el7.x86_64.rpm>`_
-* `foundationdb-server-6.2.2-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.2/rhel7/installers/foundationdb-server-6.2.2-1.el7.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.3-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.3/rhel7/installers/foundationdb-clients-6.2.3-1.el7.x86_64.rpm>`_
+* `foundationdb-server-6.2.3-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.3/rhel7/installers/foundationdb-server-6.2.3-1.el7.x86_64.rpm>`_ (depends on the clients package)

 Windows
 -------

 The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.

-* `foundationdb-6.2.2-x64.msi <https://www.foundationdb.org/downloads/6.2.2/windows/installers/foundationdb-6.2.2-x64.msi>`_
+* `foundationdb-6.2.3-x64.msi <https://www.foundationdb.org/downloads/6.2.3/windows/installers/foundationdb-6.2.3-x64.msi>`_

 API Language Bindings
 =====================
@@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part

 If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:

-* `foundationdb-6.2.2.tar.gz <https://www.foundationdb.org/downloads/6.2.2/bindings/python/foundationdb-6.2.2.tar.gz>`_
+* `foundationdb-6.2.3.tar.gz <https://www.foundationdb.org/downloads/6.2.3/bindings/python/foundationdb-6.2.3.tar.gz>`_

 Ruby 1.9.3/2.0.0+
 -----------------

-* `fdb-6.2.2.gem <https://www.foundationdb.org/downloads/6.2.2/bindings/ruby/fdb-6.2.2.gem>`_
+* `fdb-6.2.3.gem <https://www.foundationdb.org/downloads/6.2.3/bindings/ruby/fdb-6.2.3.gem>`_

 Java 8+
 -------

-* `fdb-java-6.2.2.jar <https://www.foundationdb.org/downloads/6.2.2/bindings/java/fdb-java-6.2.2.jar>`_
-* `fdb-java-6.2.2-javadoc.jar <https://www.foundationdb.org/downloads/6.2.2/bindings/java/fdb-java-6.2.2-javadoc.jar>`_
+* `fdb-java-6.2.3.jar <https://www.foundationdb.org/downloads/6.2.3/bindings/java/fdb-java-6.2.3.jar>`_
+* `fdb-java-6.2.3-javadoc.jar <https://www.foundationdb.org/downloads/6.2.3/bindings/java/fdb-java-6.2.3-javadoc.jar>`_

 Go 1.11+
 --------

@@ -66,7 +66,8 @@
       "cluster_controller",
       "data_distributor",
       "ratekeeper",
-      "router"
+      "router",
+      "coordinator"
     ]
   },
   "data_version":12341234,
@@ -264,7 +265,23 @@
     "limiting_queue_bytes_storage_server":0,
     "worst_queue_bytes_storage_server":0,
     "limiting_version_lag_storage_server":0,
-    "worst_version_lag_storage_server":0
+    "worst_version_lag_storage_server":0,
+    "limiting_data_lag_storage_server":{
+      "versions":0,
+      "seconds":0.0
+    },
+    "worst_data_lag_storage_server":{
+      "versions":0,
+      "seconds":0.0
+    },
+    "limiting_durability_lag_storage_server":{
+      "versions":0,
+      "seconds":0.0
+    },
+    "worst_durability_lag_storage_server":{
+      "versions":0,
+      "seconds":0.0
+    }
   },
   "incompatible_connections":[
   ],
@@ -460,6 +477,9 @@
   "full_replication":true,
   "maintenance_zone":"0ccb4e0fdbdb5583010f6b77d9d10ece",
   "maintenance_seconds_remaining":1.0,
+  "data_distribution_disabled_for_ss_failures":true,
+  "data_distribution_disabled_for_rebalance":true,
+  "data_distribution_disabled":true,
   "configuration":{
     "log_anti_quorum":0,
     "log_replicas":2,

@@ -2,7 +2,7 @@
 Release Notes
 #############

-6.2.2
+6.2.3
 =====

 Performance
@@ -25,6 +25,7 @@ Performance
 * Made the storage cache eviction policy configurable, and added an LRU policy. `(PR #1506) <https://github.com/apple/foundationdb/pull/1506>`_.
 * Improved the speed of recoveries on large clusters at ``log_version >= 4``. `(PR #1729) <https://github.com/apple/foundationdb/pull/1729>`_.
 * Log routers will prefer to peek from satellites at ``log_version >= 4``. `(PR #1795) <https://github.com/apple/foundationdb/pull/1795>`_.
+* In clusters using a region configuration, clients will read from the remote region if all of the servers in the primary region are overloaded. [6.2.3] `(PR #2019) <https://github.com/apple/foundationdb/pull/2019>`_.

 Fixes
 -----
@@ -40,6 +41,10 @@ Fixes
 * In very rare scenarios, master recovery would restart because system metadata was loaded incorrectly. `(PR #1919) <https://github.com/apple/foundationdb/pull/1919>`_.
 * Ratekeeper will aggressively throttle when unable to fetch the list of storage servers for a considerable period of time. `(PR #1858) <https://github.com/apple/foundationdb/pull/1858>`_.
 * Proxies could become overloaded when all storage servers on a team fail. [6.2.1] `(PR #1976) <https://github.com/apple/foundationdb/pull/1976>`_.
+* Proxies could start too few transactions if they didn't receive get read version requests frequently enough. [6.2.3] `(PR #1999) <https://github.com/apple/foundationdb/pull/1999>`_.
+* The ``fileconfigure`` command in ``fdbcli`` could fail with an unknown error if the file did not contain a valid JSON object. `(PR #2017) <https://github.com/apple/foundationdb/pull/2017>`_.
+* Configuring regions would fail with an internal error if the cluster contained storage servers that didn't set a datacenter ID. `(PR #2017) <https://github.com/apple/foundationdb/pull/2017>`_.
+* Clients no longer prefer reading from servers with the same zone ID, because it could create hot shards. [6.2.3] `(PR #2019) <https://github.com/apple/foundationdb/pull/2019>`_.

 Status
 ------
@@ -54,6 +59,9 @@ Status
 * ``connected_clients`` is now only a sample of the connected clients, rather than a complete list. `(PR #1902) <https://github.com/apple/foundationdb/pull/1902>`_.
 * Added ``max_protocol_clients`` to the ``supported_versions`` section, which provides a sample of connected clients which cannot connect to any higher protocol version. `(PR #1902) <https://github.com/apple/foundationdb/pull/1902>`_.
 * Clients which connect without specifying their supported versions are tracked as an ``Unknown`` version in the ``supported_versions`` section. [6.2.2] `(PR #1990) <https://github.com/apple/foundationdb/pull/1990>`_.
+* Add ``coordinator`` to the list of roles that can be reported for a process. [6.2.3] `(PR #2006) <https://github.com/apple/foundationdb/pull/2006>`_.
+* Added ``worst_durability_lag_storage_server`` and ``limiting_durability_lag_storage_server`` to the ``cluster.qos`` section, each with subfields ``versions`` and ``seconds``. These report the durability lag values being used by ratekeeper to potentially limit the transaction rate. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_.
+* Added ``worst_data_lag_storage_server`` and ``limiting_data_lag_storage_server`` to the ``cluster.qos`` section, each with subfields ``versions`` and ``seconds``. These are meant to replace ``worst_version_lag_storage_server`` and ``limiting_version_lag_storage_server``, which are now deprecated. [6.2.3] `(PR #2003) <https://github.com/apple/foundationdb/pull/2003>`_.

 Bindings
 --------
@@ -92,6 +100,10 @@ Fixes only impacting 6.2.0+
 * Clients could crash when closing connections with incompatible servers. [6.2.1] `(PR #1976) <https://github.com/apple/foundationdb/pull/1976>`_.
 * Do not close idle network connections with incompatible servers. [6.2.1] `(PR #1976) <https://github.com/apple/foundationdb/pull/1976>`_.
 * In status, ``max_protocol_clients`` were incorrectly added to the ``connected_clients`` list. [6.2.2] `(PR #1990) <https://github.com/apple/foundationdb/pull/1990>`_.
+* Ratekeeper ignores the (default 5 second) MVCC window when controlling on durability lag. [6.2.3] `(PR #2012) <https://github.com/apple/foundationdb/pull/2012>`_.
+* The macOS client was not compatible with a Linux server. [6.2.3] `(PR #2045) <https://github.com/apple/foundationdb/pull/2045>`_.
+* Incompatible clients would continually reconnect with coordinators. [6.2.3] `(PR #2048) <https://github.com/apple/foundationdb/pull/2048>`_.
+* Connections were being closed as idle when there were still unreliable requests waiting for a response. [6.2.3] `(PR #2048) <https://github.com/apple/foundationdb/pull/2048>`_.

 Earlier release notes
 ---------------------

@@ -586,7 +586,6 @@ CSimpleOpt::SOption g_rgDBAgentOptions[] = {
 #ifdef _WIN32
 	{ OPT_PARENTPID, "--parentpid", SO_REQ_SEP },
 #endif
-	{ OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP },
 	{ OPT_SOURCE_CLUSTER, "-s", SO_REQ_SEP },
 	{ OPT_SOURCE_CLUSTER, "--source", SO_REQ_SEP },
 	{ OPT_DEST_CLUSTER, "-d", SO_REQ_SEP },
@@ -826,6 +825,9 @@ static void printAgentUsage(bool devhelp) {
 	       " --logdir PATH Specifes the output directory for trace files. If\n"
 	       " unspecified, defaults to the current directory. Has\n"
 	       " no effect unless --log is specified.\n");
+	printf(" --loggroup LOG_GROUP\n"
+	       " Sets the LogGroup field with the specified value for all\n"
+	       " events in the trace output (defaults to `default').\n");
 	printf(" --trace_format FORMAT\n"
 	       " Select the format of the trace files. xml (the default) and json are supported.\n"
 	       " Has no effect unless --log is specified.\n");
@@ -912,6 +914,9 @@ static void printBackupUsage(bool devhelp) {
 	       " --logdir PATH Specifes the output directory for trace files. If\n"
 	       " unspecified, defaults to the current directory. Has\n"
 	       " no effect unless --log is specified.\n");
+	printf(" --loggroup LOG_GROUP\n"
+	       " Sets the LogGroup field with the specified value for all\n"
+	       " events in the trace output (defaults to `default').\n");
 	printf(" --trace_format FORMAT\n"
 	       " Select the format of the trace files. xml (the default) and json are supported.\n"
 	       " Has no effect unless --log is specified.\n");
@@ -970,6 +975,9 @@ static void printRestoreUsage(bool devhelp ) {
 	       " --logdir PATH Specifes the output directory for trace files. If\n"
 	       " unspecified, defaults to the current directory. Has\n"
 	       " no effect unless --log is specified.\n");
+	printf(" --loggroup LOG_GROUP\n"
+	       " Sets the LogGroup field with the specified value for all\n"
+	       " events in the trace output (defaults to `default').\n");
 	printf(" --trace_format FORMAT\n"
 	       " Select the format of the trace files. xml (the default) and json are supported.\n"
 	       " Has no effect unless --log is specified.\n");
@@ -1015,6 +1023,9 @@ static void printDBAgentUsage(bool devhelp) {
 	       " --logdir PATH Specifes the output directory for trace files. If\n"
 	       " unspecified, defaults to the current directory. Has\n"
 	       " no effect unless --log is specified.\n");
+	printf(" --loggroup LOG_GROUP\n"
+	       " Sets the LogGroup field with the specified value for all\n"
+	       " events in the trace output (defaults to `default').\n");
 	printf(" --trace_format FORMAT\n"
 	       " Select the format of the trace files. xml (the default) and json are supported.\n"
 	       " Has no effect unless --log is specified.\n");
@@ -1062,6 +1073,9 @@ static void printDBBackupUsage(bool devhelp) {
 	       " --logdir PATH Specifes the output directory for trace files. If\n"
 	       " unspecified, defaults to the current directory. Has\n"
 	       " no effect unless --log is specified.\n");
+	printf(" --loggroup LOG_GROUP\n"
+	       " Sets the LogGroup field with the specified value for all\n"
+	       " events in the trace output (defaults to `default').\n");
 	printf(" --trace_format FORMAT\n"
 	       " Select the format of the trace files. xml (the default) and json are supported.\n"
 	       " Has no effect unless --log is specified.\n");

@@ -498,6 +498,10 @@ void initHelp() {
 	helpMap["quit"] = CommandHelp();
 	helpMap["waitconnected"] = CommandHelp();
 	helpMap["waitopen"] = CommandHelp();
+	helpMap["sleep"] = CommandHelp(
+		"sleep <SECONDS>",
+		"sleep for a period of time",
+		"");
 	helpMap["get"] = CommandHelp(
 		"get <KEY>",
 		"fetch the value for a given key",
@@ -1493,6 +1497,17 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,
 			outputString += "\n\nWARNING: A single process is both a transaction log and a storage server.\n For best performance use dedicated disks for the transaction logs by setting process classes.";
 		}

+		if (statusObjCluster.has("data_distribution_disabled")) {
+			outputString += "\n\nWARNING: Data distribution is off.";
+		} else {
+			if (statusObjCluster.has("data_distribution_disabled_for_ss_failures")) {
+				outputString += "\n\nWARNING: Data distribution is currently turned on but disabled for all storage server failures.";
+			}
+			if (statusObjCluster.has("data_distribution_disabled_for_rebalance")) {
+				outputString += "\n\nWARNING: Data distribution is currently turned on but shard size balancing is currently disabled.";
+			}
+		}
+
 		printf("%s\n", outputString.c_str());
 	}

@@ -1777,6 +1792,10 @@ ACTOR Future<bool> fileConfigure(Database db, std::string filePath, bool isNewDa
 		printf("ERROR: Invalid JSON\n");
 		return true;
 	}
+	if(config.type() != json_spirit::obj_type) {
+		printf("ERROR: Configuration file must contain a JSON object\n");
+		return true;
+	}
 	StatusObject configJSON = config.get_obj();

 	json_spirit::mValue schema;
@@ -2195,7 +2214,14 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc
 	}
 }

-ACTOR Future<bool> createSnapshot(Database db, StringRef snapCmd) {
+ACTOR Future<bool> createSnapshot(Database db, std::vector<StringRef> tokens ) {
+	state Standalone<StringRef> snapCmd;
+	for ( int i = 1; i < tokens.size(); i++) {
+		snapCmd = snapCmd.withSuffix(tokens[i]);
+		if (i != tokens.size() - 1) {
+			snapCmd = snapCmd.withSuffix(LiteralStringRef(" "));
+		}
+	}
 	try {
 		UID snapUID = wait(makeInterruptable(mgmtSnapCreate(db, snapCmd)));
 		printf("Snapshot command succeeded with UID %s\n", snapUID.toString().c_str());
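For readers unfamiliar with `Standalone<StringRef>::withSuffix`, the new `createSnapshot` body is an ordinary join-with-spaces over `tokens[1..n)`. A plain `std::string` sketch of the same logic (illustration only; the real code keeps the FDB arena types, and the command path below is hypothetical):

    #include <iostream>
    #include <string>
    #include <vector>

    // Plain std::string equivalent of the withSuffix loop above (illustration only).
    std::string joinTokens(const std::vector<std::string>& tokens) {
        std::string snapCmd;
        for (size_t i = 1; i < tokens.size(); ++i) {
            snapCmd += tokens[i];
            if (i != tokens.size() - 1) snapCmd += " ";
        }
        return snapCmd;
    }

    int main() {
        // fdbcli delivers one token per word; token 0 is the command name itself.
        // "/bin/snap_create.sh" is a hypothetical snapshot script path.
        std::cout << joinTokens({"snapshot", "/bin/snap_create.sh", "--param"}) << "\n";
        // -> "/bin/snap_create.sh --param"
    }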
@@ -2589,8 +2615,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 	if (!opt.exec.present()) {
 		if(opt.initialStatusCheck) {
 			Future<Void> checkStatusF = checkStatus(Void(), db->getConnectionFile());
-			Future<Void> checkDDStatusF = checkDataDistributionStatus(db, true);
-			wait(makeInterruptable(success(checkStatusF) && success(checkDDStatusF)));
+			wait(makeInterruptable(success(checkStatusF)));
 		}
 		else {
 			printf("\n");
@@ -2736,6 +2761,23 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 				continue;
 			}

+			if( tokencmp(tokens[0], "sleep")) {
+				if(tokens.size() != 2) {
+					printUsage(tokens[0]);
+					is_error = true;
+				} else {
+					double v;
+					int n=0;
+					if (sscanf(tokens[1].toString().c_str(), "%lf%n", &v, &n) != 1 || n != tokens[1].size()) {
+						printUsage(tokens[0]);
+						is_error = true;
+					} else {
+						wait(delay(v));
+					}
+				}
+				continue;
+			}
+
 			if (tokencmp(tokens[0], "status")) {
 				// Warn at 7 seconds since status will spend as long as 5 seconds trying to read/write from the database
 				warn = timeWarning( 7.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n" );
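The `%lf%n` idiom in the new `sleep` handler is worth calling out: `sscanf` alone would happily accept input like `1.5junk`. `%n` records how many characters were consumed, so comparing it against the token length rejects trailing garbage. A self-contained demonstration of the same check:

    #include <cstdio>
    #include <cstring>

    // Returns true only if the whole string is a valid double, mirroring the
    // sscanf("%lf%n") validation in the sleep handler above.
    bool parseSeconds(const char* s, double* out) {
        int n = 0;
        return sscanf(s, "%lf%n", out, &n) == 1 && n == (int)strlen(s);
    }

    int main() {
        double v;
        printf("%d\n", parseSeconds("1.5", &v));  // 1: valid
        printf("%d\n", parseSeconds("1.5x", &v)); // 0: trailing junk rejected by the %n check
        printf("%d\n", parseSeconds("abc", &v));  // 0: no conversion at all
    }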
@@ -2811,11 +2853,11 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 			}

 			if (tokencmp(tokens[0], "snapshot")) {
-				if (tokens.size() != 2) {
+				if (tokens.size() < 2) {
 					printUsage(tokens[0]);
 					is_error = true;
 				} else {
-					bool err = wait(createSnapshot(db, tokens[1]));
+					bool err = wait(createSnapshot(db, tokens));
 					if (err) is_error = true;
 				}
 				continue;
@@ -3426,13 +3468,11 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {

 			if (tokencmp(tokens[0], "datadistribution")) {
 				if (tokens.size() != 2 && tokens.size() != 3) {
-					printf("Usage: datadistribution <status|on|off|disable <ssfailure|rebalance>|enable "
+					printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
 					       "<ssfailure|rebalance>>\n");
 					is_error = true;
 				} else {
-					if (tokencmp(tokens[1], "status")) {
-						wait(makeInterruptable(checkDataDistributionStatus(db)));
-					} else if (tokencmp(tokens[1], "on")) {
+					if (tokencmp(tokens[1], "on")) {
 						wait(success(setDDMode(db, 1)));
 						printf("Data distribution is turned on.\n");
 					} else if (tokencmp(tokens[1], "off")) {
@@ -3446,7 +3486,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 						wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, true)));
 						printf("Data distribution is disabled for rebalance.\n");
 					} else {
-						printf("Usage: datadistribution <status|on|off|disable <ssfailure|rebalance>|enable "
+						printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
 						       "<ssfailure|rebalance>>\n");
 						is_error = true;
 					}
@@ -3458,12 +3498,12 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 						wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, false)));
 						printf("Data distribution is enabled for rebalance.\n");
 					} else {
-						printf("Usage: datadistribution <status|on|off|disable <ssfailure|rebalance>|enable "
+						printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
 						       "<ssfailure|rebalance>>\n");
 						is_error = true;
 					}
 				} else {
-					printf("Usage: datadistribution <status|on|off|disable <ssfailure|rebalance>|enable "
+					printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
 					       "<ssfailure|rebalance>>\n");
 					is_error = true;
 				}

@@ -172,7 +172,6 @@ public:
 	Counter transactionsMaybeCommitted;
 	Counter transactionsResourceConstrained;
 	Counter transactionsProcessBehind;
-	Counter transactionWaitsForFullRecovery;

 	ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit;

@@ -423,11 +423,11 @@ ACTOR Future<ConfigurationResult::Type> changeConfig( Database cx, std::map<std:
 			for(auto& it : newConfig.regions) {
 				newDcIds.insert(it.dcId);
 			}
-			std::set<Key> missingDcIds;
+			std::set<Optional<Key>> missingDcIds;
 			for(auto& s : serverList) {
 				auto ssi = decodeServerListValue( s.value );
 				if ( !ssi.locality.dcId().present() || !newDcIds.count(ssi.locality.dcId().get()) ) {
-					missingDcIds.insert(ssi.locality.dcId().get());
+					missingDcIds.insert(ssi.locality.dcId());
 				}
 			}
 			if(missingDcIds.size() > (oldReplicationUsesDcId ? 1 : 0)) {
@@ -1340,61 +1340,17 @@ ACTOR Future<vector<AddressExclusion>> getExcludedServers( Database cx ) {
 	}
 }

-ACTOR Future<Void> checkDataDistributionStatus(Database cx, bool printWarningOnly) {
-	state Transaction tr(cx);
-	loop {
-		try {
-			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
-			state Future<Optional<Value>> overallSwitchF = tr.get(dataDistributionModeKey);
-			state Future<Optional<Value>> healthyZoneValueF = tr.get(healthyZoneKey);
-			state Future<Optional<Value>> rebalanceDDIgnoreValueF = tr.get(rebalanceDDIgnoreKey);
-			wait(success(overallSwitchF) && success(healthyZoneValueF) && success(rebalanceDDIgnoreValueF));
-			if (overallSwitchF.get().present()) {
-				BinaryReader rd(overallSwitchF.get().get(), Unversioned());
-				int currentMode;
-				rd >> currentMode;
-				if (currentMode == 0) {
-					printf("WARNING: Data distribution is off.\n");
-					return Void();
-				}
-			}
-			if (!printWarningOnly) {
-				printf("Data distribution is on.\n");
-			}
-			if (healthyZoneValueF.get().present()) {
-				auto healthyZoneKV = decodeHealthyZoneValue(healthyZoneValueF.get().get());
-				if (healthyZoneKV.first == ignoreSSFailuresZoneString) {
-					printf("WARNING: Data distribution is currently turned on but disabled for all storage server "
-					       "failures.\n");
-				} else {
-					printf("WARNING: Data distribution is currently turned on but zone %s is under maintenance and "
-					       "will continue for %" PRId64 " seconds.\n",
-					       healthyZoneKV.first.toString().c_str(),
-					       (healthyZoneKV.second - tr.getReadVersion().get()) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND);
-				}
-			}
-			if (rebalanceDDIgnoreValueF.get().present()) {
-				printf("WARNING: Data distribution is currently turned on but shard size balancing is currently "
-				       "disabled.\n");
-			}
-			return Void();
-		} catch (Error& e) {
-			wait(tr.onError(e));
-		}
-	}
-}
-
 ACTOR Future<Void> printHealthyZone( Database cx ) {
 	state Transaction tr(cx);
 	loop {
 		try {
 			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
 			Optional<Value> val = wait( tr.get(healthyZoneKey) );
-			if(!val.present() || decodeHealthyZoneValue(val.get()).second <= tr.getReadVersion().get()) {
-				printf("No ongoing maintenance.\n");
-			} else if (val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) {
+			if (val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) {
 				printf("Data distribution has been disabled for all storage server failures in this cluster and thus "
 				       "maintenance mode is not active.\n");
+			} else if(!val.present() || decodeHealthyZoneValue(val.get()).second <= tr.getReadVersion().get()) {
+				printf("No ongoing maintenance.\n");
 			} else {
 				auto healthyZone = decodeHealthyZoneValue(val.get());
 				printf("Maintenance for zone %s will continue for %" PRId64 " seconds.\n", healthyZone.first.toString().c_str(), (healthyZone.second-tr.getReadVersion().get())/CLIENT_KNOBS->CORE_VERSIONSPERSECOND);
@@ -1577,7 +1533,7 @@ ACTOR Future<std::set<NetworkAddress>> checkForExcludingServers(Database cx, vec
 	return inProgressExclusion;
 }

-ACTOR Future<UID> mgmtSnapCreate(Database cx, StringRef snapCmd) {
+ACTOR Future<UID> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd) {
 	state UID snapUID = deterministicRandom()->randomUniqueID();
 	try {
 		wait(snapCreate(cx, snapCmd, snapUID));

@@ -181,7 +181,6 @@ ACTOR Future<int> setDDMode( Database cx, int mode );

 ACTOR Future<Void> forceRecovery( Reference<ClusterConnectionFile> clusterFile, Standalone<StringRef> dcId );

-ACTOR Future<Void> checkDataDistributionStatus(Database cx, bool printWarningOnly = false);
 ACTOR Future<Void> printHealthyZone( Database cx );
 ACTOR Future<Void> setDDIgnoreRebalanceSwitch(Database cx, bool ignoreRebalance);
 ACTOR Future<bool> clearHealthyZone(Database cx, bool printWarning = false, bool clearSSFailureZoneString = false);
@@ -197,7 +196,7 @@ bool schemaMatch( json_spirit::mValue const& schema, json_spirit::mValue const&

 // execute payload in 'snapCmd' on all the coordinators, TLogs and
 // storage nodes
-ACTOR Future<UID> mgmtSnapCreate(Database cx, StringRef snapCmd);
+ACTOR Future<UID> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd);

 #include "flow/unactorcompiler.h"
 #endif

@@ -519,7 +519,7 @@ DatabaseContext::DatabaseContext(
 	transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
 	transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
 	transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
-	transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0),
+	transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0),
 	latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
 	healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal)
 {
@@ -548,7 +548,7 @@ DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("T
 	transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc),
 	transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
 	transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc),
-	transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
+	transactionsProcessBehind("ProcessBehind", cc), latencies(1000), readLatencies(1000), commitLatencies(1000),
 	GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000),
 	internal(false) {}

@@ -1474,9 +1474,9 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>
 			g_traceBatch.addAttach("WatchValueAttachID", info.debugID.get().first(), watchValueID.get().first());
 			g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.Before"); //.detail("TaskID", g_network->getCurrentTask());
 		}
-		state Version resp;
+		state WatchValueReply resp;
 		choose {
-			when(Version r = wait(loadBalance(ssi.second, &StorageServerInterface::watchValue,
+			when(WatchValueReply r = wait(loadBalance(ssi.second, &StorageServerInterface::watchValue,
 			                                  WatchValueRequest(key, value, ver, watchValueID),
 			                                  TaskPriority::DefaultPromiseEndpoint))) {
 				resp = r;
@@ -1489,12 +1489,13 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>

 		//FIXME: wait for known committed version on the storage server before replying,
 		//cannot do this until the storage server is notified on knownCommittedVersion changes from tlog (faster than the current update loop)
-		Version v = wait( waitForCommittedVersion( cx, resp ) );
+		Version v = wait(waitForCommittedVersion(cx, resp.version));

-		//TraceEvent("WatcherCommitted").detail("CommittedVersion", v).detail("WatchVersion", resp).detail("Key", key ).detail("Value", value);
+		//TraceEvent("WatcherCommitted").detail("CommittedVersion", v).detail("WatchVersion", resp.version).detail("Key", key ).detail("Value", value);

-		if( v - resp < 50000000 ) // False if there is a master failure between getting the response and getting the committed version, Dependent on SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT
-			return Void();
+		// False if there is a master failure between getting the response and getting the committed version,
+		// Dependent on SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT
+		if (v - resp.version < 50000000) return Void();
 		ver = v;
 	} catch (Error& e) {
 		if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
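A side note on the `50000000` constant that survives this hunk: assuming the usual FoundationDB rate of one million versions per second (the `CORE_VERSIONSPERSECOND` knob, which also appears in the ManagementAPI hunks above), the window works out to roughly 50 seconds of version lag. A compile-time sanity check of that arithmetic:

    // Back-of-envelope for the 50000000 constant above. The versions-per-second
    // rate is an assumption taken from CLIENT_KNOBS->CORE_VERSIONSPERSECOND,
    // not stated in this hunk itself.
    constexpr long long kVersionsPerSecond = 1000000;
    constexpr long long kWatchWindowVersions = 50000000;
    static_assert(kWatchWindowVersions / kVersionsPerSecond == 50,
                  "watch responses within ~50s of the committed version are considered current");

    int main() {} // compile-time check only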
@@ -2705,10 +2706,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 		if (e.code() != error_code_transaction_too_old
 			&& e.code() != error_code_not_committed
 			&& e.code() != error_code_database_locked
-			&& e.code() != error_code_proxy_memory_limit_exceeded
-			&& e.code() != error_code_transaction_not_permitted
-			&& e.code() != error_code_cluster_not_fully_recovered
-			&& e.code() != error_code_txn_exec_log_anti_quorum)
+			&& e.code() != error_code_proxy_memory_limit_exceeded)
 			TraceEvent(SevError, "TryCommitError").error(e);
 		if (trLogInfo)
 			trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req));
@@ -3115,8 +3113,7 @@ Future<Void> Transaction::onError( Error const& e ) {
 		e.code() == error_code_commit_unknown_result ||
 		e.code() == error_code_database_locked ||
 		e.code() == error_code_proxy_memory_limit_exceeded ||
-		e.code() == error_code_process_behind ||
-		e.code() == error_code_cluster_not_fully_recovered)
+		e.code() == error_code_process_behind)
 	{
 		if(e.code() == error_code_not_committed)
 			++cx->transactionsNotCommitted;
@@ -3126,9 +3123,6 @@ Future<Void> Transaction::onError( Error const& e ) {
 			++cx->transactionsResourceConstrained;
 		if (e.code() == error_code_process_behind)
 			++cx->transactionsProcessBehind;
-		if (e.code() == error_code_cluster_not_fully_recovered) {
-			++cx->transactionWaitsForFullRecovery;
-		}

 		double backoff = getBackoff(e.code());
 		reset();
@@ -3348,54 +3342,22 @@ void enableClientInfoLogging() {
 	TraceEvent(SevInfo, "ClientInfoLoggingEnabled");
 }

-ACTOR Future<Void> snapshotDatabase(Reference<DatabaseContext> cx, StringRef snapPayload, UID snapUID, Optional<UID> debugID) {
-	TraceEvent("SnapshotDatabaseEnter")
-	    .detail("SnapPayload", snapPayload)
-	    .detail("SnapUID", snapUID);
-	try {
-		if (debugID.present()) {
-			g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.snapshotDatabase.Before");
-		}
-
-		choose {
-			when(wait(cx->onMasterProxiesChanged())) { throw operation_failed(); }
-			when(wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::proxySnapReq, ProxySnapRequest(snapPayload, snapUID, debugID), cx->taskID, true /*atmostOnce*/ ))) {
-				if (debugID.present())
-					g_traceBatch.addEvent("TransactionDebug", debugID.get().first(),
-					                      "NativeAPI.SnapshotDatabase.After");
-			}
-		}
-	} catch (Error& e) {
-		TraceEvent("SnapshotDatabaseError")
-		    .error(e)
-		    .detail("SnapPayload", snapPayload)
-		    .detail("SnapUID", snapUID);
-		throw;
-	}
-	return Void();
-}
-
-ACTOR Future<Void> snapCreate(Database cx, StringRef snapCmd, UID snapUID) {
-	// remember the client ID before the snap operation
-	state UID preSnapClientUID = cx->clientInfo->get().id;
-
+ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID) {
 	TraceEvent("SnapCreateEnter")
 	    .detail("SnapCmd", snapCmd.toString())
-	    .detail("UID", snapUID)
-	    .detail("PreSnapClientUID", preSnapClientUID);
-
-	StringRef snapCmdArgs = snapCmd;
-	StringRef snapCmdPart = snapCmdArgs.eat(":");
-	Standalone<StringRef> snapUIDRef(snapUID.toString());
-	Standalone<StringRef> snapPayloadRef = snapCmdPart
-		.withSuffix(LiteralStringRef(":uid="))
-		.withSuffix(snapUIDRef)
-		.withSuffix(LiteralStringRef(","))
-		.withSuffix(snapCmdArgs);
-
+	    .detail("UID", snapUID);
 	try {
-		Future<Void> exec = snapshotDatabase(Reference<DatabaseContext>::addRef(cx.getPtr()), snapPayloadRef, snapUID, snapUID);
-		wait(exec);
+		loop {
+			choose {
+				when(wait(cx->onMasterProxiesChanged())) {}
+				when(wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::proxySnapReq, ProxySnapRequest(snapCmd, snapUID, snapUID), cx->taskID, true /*atmostOnce*/ ))) {
+					TraceEvent("SnapCreateExit")
+					    .detail("SnapCmd", snapCmd.toString())
+					    .detail("UID", snapUID);
+					return Void();
+				}
+			}
+		}
 	} catch (Error& e) {
 		TraceEvent("SnapCreateError")
 		    .detail("SnapCmd", snapCmd.toString())
@@ -3403,19 +3365,4 @@ ACTOR Future<Void> snapCreate(Database cx, StringRef snapCmd, UID snapUID) {
 		    .error(e);
 		throw;
 	}
-
-	UID postSnapClientUID = cx->clientInfo->get().id;
-	if (preSnapClientUID != postSnapClientUID) {
-		// if the client IDs changed then we fail the snapshot
-		TraceEvent("SnapCreateUIDMismatch")
-		    .detail("SnapPreSnapClientUID", preSnapClientUID)
-		    .detail("SnapPostSnapClientUID", postSnapClientUID);
-		throw coordinators_changed();
-	}
-
-	TraceEvent("SnapCreateExit")
-	    .detail("SnapCmd", snapCmd.toString())
-	    .detail("UID", snapUID)
-	    .detail("PreSnapClientUID", preSnapClientUID);
-	return Void();
 }

@@ -316,7 +316,7 @@ int64_t extractIntOption( Optional<StringRef> value, int64_t minValue = std::num

 // Takes a snapshot of the cluster, specifically the following persistent
 // states: coordinator, TLog and storage state
-ACTOR Future<Void> snapCreate(Database cx, StringRef snapCmd, UID snapUID);
+ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);

 #include "flow/unactorcompiler.h"
 #endif

@@ -86,7 +86,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
           "cluster_controller",
           "data_distributor",
           "ratekeeper",
-          "router"
+          "router",
+          "coordinator"
         ]
       },
       "data_version":12341234,
@@ -286,7 +287,23 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
       "limiting_queue_bytes_storage_server":0,
       "worst_queue_bytes_storage_server":0,
       "limiting_version_lag_storage_server":0,
-      "worst_version_lag_storage_server":0
+      "worst_version_lag_storage_server":0,
+      "limiting_data_lag_storage_server":{
+        "versions":0,
+        "seconds":0.0
+      },
+      "worst_data_lag_storage_server":{
+        "versions":0,
+        "seconds":0.0
+      },
+      "limiting_durability_lag_storage_server":{
+        "versions":0,
+        "seconds":0.0
+      },
+      "worst_durability_lag_storage_server":{
+        "versions":0,
+        "seconds":0.0
+      }
     },
     "incompatible_connections":[

@@ -484,6 +501,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
       "full_replication":true,
      "maintenance_zone":"0ccb4e0fdbdb5583010f6b77d9d10ece",
       "maintenance_seconds_remaining":1.0,
+      "data_distribution_disabled_for_ss_failures":true,
+      "data_distribution_disabled_for_rebalance":true,
+      "data_distribution_disabled":true,
       "configuration":{
         "log_anti_quorum":0,
         "log_replicas":2,

@@ -30,6 +30,20 @@
 #include "flow/Stats.h"
 #include "fdbrpc/TimedRequest.h"

+// Dead code, removed in the next protocol version
+struct VersionReply {
+	constexpr static FileIdentifier file_identifier = 3;
+
+	Version version;
+	VersionReply() = default;
+	explicit VersionReply(Version version) : version(version) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, version);
+	}
+};
+
 struct StorageServerInterface {
 	constexpr static FileIdentifier file_identifier = 15302073;
 	enum { BUSY_ALLOWED = 0, BUSY_FORCE = 1, BUSY_LOCAL = 2 };
@@ -40,7 +54,7 @@ struct StorageServerInterface {
 	LocalityData locality;
 	UID uniqueID;

-	RequestStream<ReplyPromise<Version>> getVersion;
+	RequestStream<ReplyPromise<VersionReply>> getVersion;
 	RequestStream<struct GetValueRequest> getValue;
 	RequestStream<struct GetKeyRequest> getKey;

@@ -140,13 +154,26 @@ struct GetValueRequest : TimedRequest {
 	}
 };

+struct WatchValueReply {
+	constexpr static FileIdentifier file_identifier = 3;
+
+	Version version;
+	WatchValueReply() = default;
+	explicit WatchValueReply(Version version) : version(version) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, version);
+	}
+};
+
 struct WatchValueRequest {
 	constexpr static FileIdentifier file_identifier = 14747733;
 	Key key;
 	Optional<Value> value;
 	Version version;
 	Optional<UID> debugID;
-	ReplyPromise< Version > reply;
+	ReplyPromise<WatchValueReply> reply;

 	WatchValueRequest(){}
 	WatchValueRequest(const Key& key, Optional<Value> value, Version ver, Optional<UID> debugID) : key(key), value(value), version(ver), debugID(debugID) {}
@@ -219,6 +246,20 @@ struct GetKeyRequest : TimedRequest {
 	}
 };

+struct GetShardStateReply {
+	constexpr static FileIdentifier file_identifier = 0;
+
+	Version first;
+	Version second;
+	GetShardStateReply() = default;
+	GetShardStateReply(Version first, Version second) : first(first), second(second) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, first, second);
+	}
+};
+
 struct GetShardStateRequest {
 	constexpr static FileIdentifier file_identifier = 15860168;
 	enum waitMode {
@@ -229,7 +270,7 @@ struct GetShardStateRequest {

 	KeyRange keys;
 	int32_t mode;
-	ReplyPromise< std::pair<Version,Version> > reply;
+	ReplyPromise<GetShardStateReply> reply;
 	GetShardStateRequest() {}
 	GetShardStateRequest( KeyRange const& keys, waitMode mode ) : keys(keys), mode(mode) {}

@@ -268,6 +268,20 @@ TEST_CASE("/flow/flow/cancel2")
 	return Void();
 }

+namespace {
+// Simple message for flatbuffers unittests
+struct Int {
+	constexpr static FileIdentifier file_identifier = 12345;
+	uint32_t value;
+	Int() = default;
+	Int(uint32_t value) : value(value) {}
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, value);
+	}
+};
+} // namespace
+
 TEST_CASE("/flow/flow/nonserializable futures")
 {
 	// Types no longer need to be statically serializable to make futures, promises, actors
@@ -283,20 +297,20 @@ TEST_CASE("/flow/flow/nonserializable futures")

 	// ReplyPromise can be used like a normal promise
 	{
-		ReplyPromise<int> rpInt;
-		Future<int> f = rpInt.getFuture();
+		ReplyPromise<Int> rpInt;
+		Future<Int> f = rpInt.getFuture();
 		ASSERT(!f.isReady());
 		rpInt.send(123);
-		ASSERT(f.get() == 123);
+		ASSERT(f.get().value == 123);
 	}

 	{
-		RequestStream<int> rsInt;
-		FutureStream<int> f = rsInt.getFuture();
+		RequestStream<Int> rsInt;
+		FutureStream<Int> f = rsInt.getFuture();
 		rsInt.send(1);
 		rsInt.send(2);
-		ASSERT(f.pop() == 1);
-		ASSERT(f.pop() == 2);
+		ASSERT(f.pop().value == 1);
+		ASSERT(f.pop().value == 2);
 	}

 	return Void();
@@ -306,14 +320,14 @@ TEST_CASE("/flow/flow/networked futures")
 {
 	// RequestStream can be serialized
 	{
-		RequestStream<int> locInt;
+		RequestStream<Int> locInt;
 		BinaryWriter wr(IncludeVersion());
 		wr << locInt;

 		ASSERT(locInt.getEndpoint().isValid() && locInt.getEndpoint().isLocal() && locInt.getEndpoint().getPrimaryAddress() == FlowTransport::transport().getLocalAddress());

 		BinaryReader rd(wr.toValue(), IncludeVersion());
-		RequestStream<int> remoteInt;
+		RequestStream<Int> remoteInt;
 		rd >> remoteInt;

 		ASSERT(remoteInt.getEndpoint() == locInt.getEndpoint());
@@ -323,14 +337,14 @@ TEST_CASE("/flow/flow/networked futures")
 	// ReplyPromise can be serialized
 	// TODO: This needs to fiddle with g_currentDeliveryPeerAddress
 	if (0) {
-		ReplyPromise<int> locInt;
+		ReplyPromise<Int> locInt;
 		BinaryWriter wr(IncludeVersion());
 		wr << locInt;

 		ASSERT(locInt.getEndpoint().isValid() && locInt.getEndpoint().isLocal());

 		BinaryReader rd(wr.toValue(), IncludeVersion());
-		ReplyPromise<int> remoteInt;
+		ReplyPromise<Int> remoteInt;
 		rd >> remoteInt;

 		ASSERT(remoteInt.getEndpoint() == locInt.getEndpoint());

@@ -288,117 +288,16 @@ struct ConnectPacket {
 ACTOR static Future<Void> connectionReader(TransportData* transport, Reference<IConnection> conn, Reference<struct Peer> peer,
                                            Promise<Reference<struct Peer>> onConnected);

-static PacketID sendPacket( TransportData* self, ISerializeSource const& what, const Endpoint& destination, bool reliable, bool openConnection );
+static void sendLocal( TransportData* self, ISerializeSource const& what, const Endpoint& destination );
+static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, ISerializeSource const& what, const Endpoint& destination, bool reliable );

-struct Peer : public ReferenceCounted<Peer> {
-	TransportData* transport;
-	NetworkAddress destination;
-	UnsentPacketQueue unsent;
-	ReliablePacketList reliable;
-	AsyncTrigger dataToSend; // Triggered when unsent.empty() becomes false
-	Future<Void> connect;
-	AsyncTrigger resetPing;
-	bool compatible;
-	bool outgoingConnectionIdle; // We don't actually have a connection open and aren't trying to open one because we don't have anything to send
-	double lastConnectTime;
-	double reconnectionDelay;
-	int peerReferences;
-	bool incompatibleProtocolVersionNewer;
-	int64_t bytesReceived;
-	double lastDataPacketSentTime;
-
-	explicit Peer(TransportData* transport, NetworkAddress const& destination)
-	  : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0),
-	    reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true),
-	    incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {}
-
-	void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) {
-		unsent.setWriteBuffer(pb);
-		if (rp) reliable.insert(rp);
-		if (firstUnsent) dataToSend.trigger();
-	}
-
-	void prependConnectPacket() {
-		// Send the ConnectPacket expected at the beginning of a new connection
-		ConnectPacket pkt;
-		if(transport->localAddresses.address.isTLS() == destination.isTLS()) {
-			pkt.canonicalRemotePort = transport->localAddresses.address.port;
-			pkt.setCanonicalRemoteIp(transport->localAddresses.address.ip);
-		} else if(transport->localAddresses.secondaryAddress.present()) {
-			pkt.canonicalRemotePort = transport->localAddresses.secondaryAddress.get().port;
-			pkt.setCanonicalRemoteIp(transport->localAddresses.secondaryAddress.get().ip);
-		} else {
-			// a "mixed" TLS/non-TLS connection is like a client/server connection - there's no way to reverse it
-			pkt.canonicalRemotePort = 0;
-			pkt.setCanonicalRemoteIp(IPAddress(0));
-		}
-
-		pkt.connectPacketLength = sizeof(pkt) - sizeof(pkt.connectPacketLength);
-		pkt.protocolVersion = currentProtocolVersion;
-		if (FLOW_KNOBS->USE_OBJECT_SERIALIZER) {
-			pkt.protocolVersion.addObjectSerializerFlag();
-		}
-		pkt.connectionId = transport->transportId;
-
-		PacketBuffer* pb_first = PacketBuffer::create();
-		PacketWriter wr( pb_first, nullptr, Unversioned() );
-		pkt.serialize(wr);
-		unsent.prependWriteBuffer(pb_first, wr.finish());
-	}
-
-	void discardUnreliablePackets() {
-		// Throw away the current unsent list, dropping the reference count on each PacketBuffer that accounts for presence in the unsent list
-		unsent.discardAll();
-
-		// If there are reliable packets, compact reliable packets into a new unsent range
-		if(!reliable.empty()) {
-			PacketBuffer* pb = unsent.getWriteBuffer();
-			pb = reliable.compact(pb, nullptr);
-			unsent.setWriteBuffer(pb);
-		}
-	}
-
-	void onIncomingConnection( Reference<Peer> self, Reference<IConnection> conn, Future<Void> reader ) {
-		// In case two processes are trying to connect to each other simultaneously, the process with the larger canonical NetworkAddress
-		// gets to keep its outgoing connection.
-		if ( !destination.isPublic() && !outgoingConnectionIdle ) throw address_in_use();
-		NetworkAddress compatibleAddr = transport->localAddresses.address;
-		if(transport->localAddresses.secondaryAddress.present() && transport->localAddresses.secondaryAddress.get().isTLS() == destination.isTLS()) {
-			compatibleAddr = transport->localAddresses.secondaryAddress.get();
-		}
-
-		if ( !destination.isPublic() || outgoingConnectionIdle || destination > compatibleAddr ) {
-			// Keep the new connection
-			TraceEvent("IncomingConnection", conn->getDebugID())
-			    .suppressFor(1.0)
-			    .detail("FromAddr", conn->getPeerAddress())
-			    .detail("CanonicalAddr", destination)
-			    .detail("IsPublic", destination.isPublic());
-
-			connect.cancel();
-			prependConnectPacket();
-			connect = connectionKeeper( self, conn, reader );
-		} else {
-			TraceEvent("RedundantConnection", conn->getDebugID())
-			    .suppressFor(1.0)
-			    .detail("FromAddr", conn->getPeerAddress().toString())
-			    .detail("CanonicalAddr", destination)
-			    .detail("LocalAddr", compatibleAddr);
-
-			// Keep our prior connection
-			reader.cancel();
-			conn->close();
-
-			// Send an (ignored) packet to make sure that, if our outgoing connection died before the peer made this connection attempt,
-			// we eventually find out that our connection is dead, close it, and then respond to the next connection reattempt from peer.
-		}
-	}
|
||||
ACTOR static Future<Void> connectionMonitor( Reference<Peer> peer ) {
|
||||
ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) {
|
||||
state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET);
|
||||
loop {
|
||||
if (!FlowTransport::transport().isClient() && !peer->destination.isPublic()) {
|
||||
if (!FlowTransport::transport().isClient() && !peer->destination.isPublic() && peer->compatible) {
|
||||
// Don't send ping messages to clients unless necessary. Instead monitor incoming client pings.
|
||||
// We ignore this block for incompatible clients because pings from server would trigger the
|
||||
// peer->resetPing and prevent 'connection_failed' due to ping timeout.
|
||||
state double lastRefreshed = now();
|
||||
state int64_t lastBytesReceived = peer->bytesReceived;
|
||||
loop {
|
||||
|
@@ -419,7 +318,7 @@ struct Peer : public ReferenceCounted<Peer> {
//because then it would not call the destructor of connectionReader when connectionReader is cancelled.
wait(delay(0));

if (peer->reliable.empty() && peer->unsent.empty()) {
if (peer->reliable.empty() && peer->unsent.empty() && peer->outstandingReplies==0) {
if (peer->peerReferences == 0 &&
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) {
// TODO: What about when peerReference == -1?

@@ -436,7 +335,7 @@ struct Peer : public ReferenceCounted<Peer> {

// TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding
state ReplyPromise<Void> reply;
FlowTransport::transport().sendUnreliable( SerializeSource<ReplyPromise<Void>>(reply), remotePingEndpoint );
FlowTransport::transport().sendUnreliable( SerializeSource<ReplyPromise<Void>>(reply), remotePingEndpoint, true );
state int64_t startingBytes = peer->bytesReceived;
state int timeouts = 0;
loop {

@@ -466,7 +365,7 @@ struct Peer : public ReferenceCounted<Peer> {
}
}

ACTOR static Future<Void> connectionWriter( Reference<Peer> self, Reference<IConnection> conn ) {
ACTOR Future<Void> connectionWriter( Reference<Peer> self, Reference<IConnection> conn ) {
state double lastWriteTime = now();
loop {
//wait( delay(0, TaskPriority::WriteSocket) );

@@ -496,7 +395,7 @@ struct Peer : public ReferenceCounted<Peer> {
}
}

ACTOR static Future<Void> connectionKeeper( Reference<Peer> self,
ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
Reference<IConnection> conn = Reference<IConnection>(),
Future<Void> reader = Void()) {
TraceEvent(SevDebug, "ConnectionKeeper", conn ? conn->getDebugID() : UID())

@@ -625,7 +524,7 @@ struct Peer : public ReferenceCounted<Peer> {
if (e.code() == error_code_actor_cancelled) throw;
// Try to recover, even from serious errors, by retrying

if(self->peerReferences <= 0 && self->reliable.empty() && self->unsent.empty()) {
if(self->peerReferences <= 0 && self->reliable.empty() && self->unsent.empty() && self->outstandingReplies==0) {
TraceEvent("PeerDestroy").error(e).suppressFor(1.0).detail("PeerAddr", self->destination);
self->connect.cancel();
self->transport->peers.erase(self->destination);

@@ -634,7 +533,88 @@ struct Peer : public ReferenceCounted<Peer> {
}
}
}
};

void Peer::send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) {
unsent.setWriteBuffer(pb);
if (rp) reliable.insert(rp);
if (firstUnsent) dataToSend.trigger();
}

void Peer::prependConnectPacket() {
// Send the ConnectPacket expected at the beginning of a new connection
ConnectPacket pkt;
if(transport->localAddresses.address.isTLS() == destination.isTLS()) {
pkt.canonicalRemotePort = transport->localAddresses.address.port;
pkt.setCanonicalRemoteIp(transport->localAddresses.address.ip);
} else if(transport->localAddresses.secondaryAddress.present()) {
pkt.canonicalRemotePort = transport->localAddresses.secondaryAddress.get().port;
pkt.setCanonicalRemoteIp(transport->localAddresses.secondaryAddress.get().ip);
} else {
// a "mixed" TLS/non-TLS connection is like a client/server connection - there's no way to reverse it
pkt.canonicalRemotePort = 0;
pkt.setCanonicalRemoteIp(IPAddress(0));
}

pkt.connectPacketLength = sizeof(pkt) - sizeof(pkt.connectPacketLength);
pkt.protocolVersion = currentProtocolVersion;
if (FLOW_KNOBS->USE_OBJECT_SERIALIZER) {
pkt.protocolVersion.addObjectSerializerFlag();
}
pkt.connectionId = transport->transportId;

PacketBuffer* pb_first = PacketBuffer::create();
PacketWriter wr( pb_first, nullptr, Unversioned() );
pkt.serialize(wr);
unsent.prependWriteBuffer(pb_first, wr.finish());
}

void Peer::discardUnreliablePackets() {
// Throw away the current unsent list, dropping the reference count on each PacketBuffer that accounts for presence in the unsent list
unsent.discardAll();

// If there are reliable packets, compact reliable packets into a new unsent range
if(!reliable.empty()) {
PacketBuffer* pb = unsent.getWriteBuffer();
pb = reliable.compact(pb, nullptr);
unsent.setWriteBuffer(pb);
}
}

void Peer::onIncomingConnection( Reference<Peer> self, Reference<IConnection> conn, Future<Void> reader ) {
// In case two processes are trying to connect to each other simultaneously, the process with the larger canonical NetworkAddress
// gets to keep its outgoing connection.
if ( !destination.isPublic() && !outgoingConnectionIdle ) throw address_in_use();
NetworkAddress compatibleAddr = transport->localAddresses.address;
if(transport->localAddresses.secondaryAddress.present() && transport->localAddresses.secondaryAddress.get().isTLS() == destination.isTLS()) {
compatibleAddr = transport->localAddresses.secondaryAddress.get();
}

if ( !destination.isPublic() || outgoingConnectionIdle || destination > compatibleAddr ) {
// Keep the new connection
TraceEvent("IncomingConnection", conn->getDebugID())
.suppressFor(1.0)
.detail("FromAddr", conn->getPeerAddress())
.detail("CanonicalAddr", destination)
.detail("IsPublic", destination.isPublic());

connect.cancel();
prependConnectPacket();
connect = connectionKeeper( self, conn, reader );
} else {
TraceEvent("RedundantConnection", conn->getDebugID())
.suppressFor(1.0)
.detail("FromAddr", conn->getPeerAddress().toString())
.detail("CanonicalAddr", destination)
.detail("LocalAddr", compatibleAddr);

// Keep our prior connection
reader.cancel();
conn->close();

// Send an (ignored) packet to make sure that, if our outgoing connection died before the peer made this connection attempt,
// we eventually find out that our connection is dead, close it, and then respond to the next connection reattempt from peer.
}
}

TransportData::~TransportData() {
for(auto &p : peers) {

@@ -671,9 +651,12 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader
} else if (destination.token.first() & TOKEN_STREAM_FLAG) {
// We don't have the (stream) endpoint 'token', notify the remote machine
if (destination.token.first() != -1) {
sendPacket(self,
SerializeSource<Endpoint>(Endpoint(self->localAddresses, destination.token)),
Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND), false, true);
if (self->isLocalAddress(destination.getPrimaryAddress())) {
sendLocal(self, SerializeSource<Endpoint>(Endpoint(self->localAddresses, destination.token)), Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND));
} else {
Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress());
sendPacket(self, peer, SerializeSource<Endpoint>(Endpoint(self->localAddresses, destination.token)), Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND), false);
}
}
}

@@ -1013,7 +996,7 @@ Reference<Peer> TransportData::getPeer( NetworkAddress const& address, bool open
return Reference<Peer>();
}
Reference<Peer> newPeer = Reference<Peer>( new Peer(this, address) );
newPeer->connect = Peer::connectionKeeper(newPeer);
newPeer->connect = connectionKeeper(newPeer);
peers[address] = newPeer;
return newPeer;
}

@@ -1113,7 +1096,7 @@ void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream)
.detail("Address", endpoint.getPrimaryAddress())
.detail("Token", endpoint.token);
}
if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty()) {
if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty() && peer->outstandingReplies==0) {
peer->resetPing.trigger();
}
}

@@ -1143,8 +1126,7 @@ void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageRece
ASSERT( endpoint.token == otoken );
}

static PacketID sendPacket( TransportData* self, ISerializeSource const& what, const Endpoint& destination, bool reliable, bool openConnection ) {
if (self->isLocalAddress(destination.getPrimaryAddress())) {
static void sendLocal( TransportData* self, ISerializeSource const& what, const Endpoint& destination ) {
TEST(true); // "Loopback" delivery
// SOMEDAY: Would it be better to avoid (de)serialization by doing this check in flow?

@@ -1164,18 +1146,16 @@ static PacketID sendPacket( TransportData* self, ISerializeSource const& what, c

ASSERT(copy.size() > 0);
deliver(self, destination, ArenaReader(copy.arena(), copy, AssumeVersion(currentProtocolVersion)), false);
}

return (PacketID)nullptr;
} else {
static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, ISerializeSource const& what, const Endpoint& destination, bool reliable ) {
const bool checksumEnabled = !destination.getPrimaryAddress().isTLS();
++self->countPacketsGenerated;

Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress(), openConnection);

// If there isn't an open connection, a public address, or the peer isn't compatible, we can't send
if (!peer || (peer->outgoingConnectionIdle && !destination.getPrimaryAddress().isPublic()) || (peer->incompatibleProtocolVersionNewer && destination.token != WLTOKEN_PING_PACKET)) {
TEST(true); // Can't send to private address without a compatible open connection
return (PacketID)nullptr;
return nullptr;
}

bool firstUnsent = peer->unsent.empty();

@@ -1258,22 +1238,31 @@ static PacketID sendPacket( TransportData* self, ISerializeSource const& what, c
if (destination.token != WLTOKEN_PING_PACKET) {
peer->lastDataPacketSentTime = now();
}
return (PacketID)rp;
}
return rp;
}

PacketID FlowTransport::sendReliable( ISerializeSource const& what, const Endpoint& destination ) {
return sendPacket( self, what, destination, true, true );
ReliablePacket* FlowTransport::sendReliable( ISerializeSource const& what, const Endpoint& destination ) {
if (self->isLocalAddress(destination.getPrimaryAddress())) {
sendLocal( self, what, destination );
return nullptr;
}
Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress());
return sendPacket( self, peer, what, destination, true );
}

void FlowTransport::cancelReliable( PacketID pid ) {
ReliablePacket* p = (ReliablePacket*)pid;
void FlowTransport::cancelReliable( ReliablePacket* p ) {
if (p) p->remove();
// SOMEDAY: Call reliable.compact() if a lot of memory is wasted in PacketBuffers by formerly reliable packets mixed with a few reliable ones. Don't forget to delref the new PacketBuffers since they are unsent.
}

void FlowTransport::sendUnreliable( ISerializeSource const& what, const Endpoint& destination, bool openConnection ) {
sendPacket( self, what, destination, false, openConnection );
Reference<Peer> FlowTransport::sendUnreliable( ISerializeSource const& what, const Endpoint& destination, bool openConnection ) {
if (self->isLocalAddress(destination.getPrimaryAddress())) {
sendLocal( self, what, destination );
return Reference<Peer>();
}
Reference<Peer> peer = self->getPeer(destination.getPrimaryAddress(), openConnection);
sendPacket( self, peer, what, destination, false );
return peer;
}

int FlowTransport::getEndpointCount() {

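For context on the hunk above: sendReliable now hands back the ReliablePacket* directly (previously hidden behind the opaque PacketID typedef), and cancelReliable simply unlinks the packet so it will never be retried. A minimal standalone sketch of that ownership flow; the list and type names below are demo stand-ins, not flow's real UnsentPacketQueue/ReliablePacketList classes:

    #include <cassert>
    #include <list>
    #include <string>

    struct ReliablePacketModel {
        std::string payload;
        std::list<ReliablePacketModel*>* owner = nullptr;
        std::list<ReliablePacketModel*>::iterator self;
        void remove() {                  // analogous to ReliablePacket::remove()
            if (owner) { owner->erase(self); owner = nullptr; }
        }
    };

    struct PeerModel {
        std::list<ReliablePacketModel*> reliable;  // packets to resend on reconnect
        ReliablePacketModel* sendReliable(ReliablePacketModel* p) {
            reliable.push_back(p);
            p->owner = &reliable;
            p->self = std::prev(reliable.end());
            return p;                    // caller keeps the handle for cancelReliable
        }
    };

    int main() {
        PeerModel peer;
        ReliablePacketModel pkt{"hello"};
        ReliablePacketModel* handle = peer.sendReliable(&pkt);
        assert(peer.reliable.size() == 1);
        handle->remove();                // cancelReliable(handle)
        assert(peer.reliable.empty());   // packet will no longer be retried
        return 0;
    }
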
@@ -26,6 +26,7 @@
#include "flow/genericactors.actor.h"
#include "flow/network.h"
#include "flow/FileIdentifier.h"
#include "flow/Net2Packet.h"

#pragma pack(push, 4)
class Endpoint {

@@ -103,7 +104,39 @@ public:
virtual bool isStream() const { return false; }
};

typedef struct NetworkPacket* PacketID;
struct TransportData;

struct Peer : public ReferenceCounted<Peer> {
TransportData* transport;
NetworkAddress destination;
UnsentPacketQueue unsent;
ReliablePacketList reliable;
AsyncTrigger dataToSend; // Triggered when unsent.empty() becomes false
Future<Void> connect;
AsyncTrigger resetPing;
bool compatible;
bool outgoingConnectionIdle; // We don't actually have a connection open and aren't trying to open one because we don't have anything to send
double lastConnectTime;
double reconnectionDelay;
int peerReferences;
bool incompatibleProtocolVersionNewer;
int64_t bytesReceived;
double lastDataPacketSentTime;
int outstandingReplies;

explicit Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {}

void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent);

void prependConnectPacket();

void discardUnreliablePackets();

void onIncomingConnection( Reference<Peer> self, Reference<IConnection> conn, Future<Void> reader );
};

class FlowTransport {
public:

@@ -148,19 +181,19 @@ public:
// Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver
// Implementations may have limitations on when this function is called and what endpoint.token may be!

PacketID sendReliable( ISerializeSource const& what, const Endpoint& destination );
ReliablePacket* sendReliable( ISerializeSource const& what, const Endpoint& destination );
// sendReliable will keep trying to deliver the data to the destination until cancelReliable is
// called. It will retry sending if the connection is closed or the failure manager reports
// the destination become available (edge triggered).

void cancelReliable( PacketID );
// Makes PacketID "unreliable" (either the data or a connection close event will be delivered
void cancelReliable( ReliablePacket* );
// Makes Packet "unreliable" (either the data or a connection close event will be delivered
// eventually). It can still be used safely to send a reply to a "reliable" request.

Reference<AsyncVar<bool>> getDegraded();
// This async var will be set to true when the process cannot connect to a public network address that the failure monitor thinks is healthy.

void sendUnreliable( ISerializeSource const& what, const Endpoint& destination, bool openConnection = true );// { cancelReliable(sendReliable(what,destination)); }
Reference<Peer> sendUnreliable( ISerializeSource const& what, const Endpoint& destination, bool openConnection );// { cancelReliable(sendReliable(what,destination)); }

int getEndpointCount();
// for tracing only

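The new outstandingReplies member above feeds every "may we tear this peer down" check in the transport. A tiny self-contained illustration of the predicate; the field names mirror the diff but the struct is hypothetical:

    #include <iostream>

    struct PeerState {
        int peerReferences = 0;
        bool reliableEmpty = true;
        bool unsentEmpty = true;
        int outstandingReplies = 0;   // new field added by this commit
    };

    bool canDestroyPeer(const PeerState& p) {
        return p.peerReferences <= 0 && p.reliableEmpty && p.unsentEmpty &&
               p.outstandingReplies == 0;
    }

    int main() {
        PeerState p;
        p.outstandingReplies = 1;                 // a reply is still pending
        std::cout << canDestroyPeer(p) << "\n";   // 0: keep the connection alive
        p.outstandingReplies = 0;
        std::cout << canDestroyPeer(p) << "\n";   // 1: safe to tear down
    }
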
@@ -202,8 +202,10 @@ Future< REPLY_TYPE(Request) > loadBalance(
double nextMetric = 1e9;
double bestTime = 1e9;
double nextTime = 1e9;
int badServers = 0;

for(int i=0; i<alternatives->size(); i++) {
if(bestMetric < 1e8 && i == alternatives->countBest()) {
if(badServers < std::min(i, FLOW_KNOBS->LOAD_BALANCE_MAX_BAD_OPTIONS + 1) && i == alternatives->countBest()) {
break;
}

@@ -213,6 +215,9 @@ Future< REPLY_TYPE(Request) > loadBalance(
if(now() > qd.failedUntil) {
double thisMetric = qd.smoothOutstanding.smoothTotal();
double thisTime = qd.latency;
if(FLOW_KNOBS->LOAD_BALANCE_PENALTY_IS_BAD && qd.penalty > 1.001) {
++badServers;
}

if(thisMetric < bestMetric) {
if(i != bestAlt) {

@@ -228,7 +233,11 @@ Future< REPLY_TYPE(Request) > loadBalance(
nextMetric = thisMetric;
nextTime = thisTime;
}
} else {
++badServers;
}
} else {
++badServers;
}
}
if( nextMetric > 1e8 ) {

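The badServers counter above changes when loadBalance stops scanning: the loop now falls through past countBest() to the backup alternatives only when enough of the options seen so far were bad (capped by LOAD_BALANCE_MAX_BAD_OPTIONS). A runnable sketch of just that loop-exit rule; the knob value and data below are made up for the demo:

    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main() {
        const int LOAD_BALANCE_MAX_BAD_OPTIONS = 1;   // assumed knob value
        const int countBest = 2;                      // first two entries are "best"
        std::vector<bool> isBad = {true, true, false, false};

        int badServers = 0;
        int examined = 0;
        for (int i = 0; i < (int)isBad.size(); i++) {
            // Stop at the end of the best set unless the bad-server count has
            // reached min(i, knob + 1); then keep scanning into the backups.
            if (badServers < std::min(i, LOAD_BALANCE_MAX_BAD_OPTIONS + 1) &&
                i == countBest)
                break;
            if (isBad[i]) ++badServers;
            ++examined;
        }
        std::cout << "examined " << examined << " alternatives\n";  // 4 here;
        // with two healthy best servers it would stop at 2.
    }
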
@@ -265,7 +265,7 @@ public:

void send(const T& value) const {
if (queue->isRemoteEndpoint()) {
FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint());
FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint(), true);
}
else
queue->send(value);

@@ -317,9 +317,9 @@ public:
if (disc.isReady()) {
return ErrorOr<REPLY_TYPE(X)>(request_maybe_delivered());
}
FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint(taskID));
Reference<Peer> peer = FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint(taskID), true);
auto& p = getReplyPromise(value);
return waitValueOrSignal(p.getFuture(), disc, getEndpoint(taskID), p);
return waitValueOrSignal(p.getFuture(), disc, getEndpoint(taskID), p, peer);
}
send(value);
auto& p = getReplyPromise(value);

@@ -333,9 +333,9 @@ public:
if (disc.isReady()) {
return ErrorOr<REPLY_TYPE(X)>(request_maybe_delivered());
}
FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint());
Reference<Peer> peer = FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint(), true);
auto& p = getReplyPromise(value);
return waitValueOrSignal(p.getFuture(), disc, getEndpoint(), p);
return waitValueOrSignal(p.getFuture(), disc, getEndpoint(), p, peer);
}
else {
send(value);

@@ -152,9 +152,24 @@ ACTOR template <class T> Future<Void> incrementalBroadcast( Future<T> input, std
// Needed for the call to endpointNotFound()
#include "fdbrpc/FailureMonitor.h"

struct PeerHolder {
Reference<Peer> peer;
explicit PeerHolder(Reference<Peer> peer) : peer(peer) {
if(peer) {
peer->outstandingReplies++;
}
}
~PeerHolder() {
if(peer) {
peer->outstandingReplies--;
}
}
};

// Implements tryGetReply, getReplyUnlessFailedFor
ACTOR template <class X>
Future<ErrorOr<X>> waitValueOrSignal( Future<X> value, Future<Void> signal, Endpoint endpoint, ReplyPromise<X> holdme = ReplyPromise<X>() ) {
Future<ErrorOr<X>> waitValueOrSignal( Future<X> value, Future<Void> signal, Endpoint endpoint, ReplyPromise<X> holdme = ReplyPromise<X>(), Reference<Peer> peer = Reference<Peer>() ) {
state PeerHolder holder = PeerHolder(peer);
loop {
try {
choose {

@@ -185,7 +200,7 @@ Future<ErrorOr<X>> waitValueOrSignal( Future<X> value, Future<Void> signal, Endp
}

ACTOR template <class T>
Future<T> sendCanceler( ReplyPromise<T> reply, PacketID send, Endpoint endpoint ) {
Future<T> sendCanceler( ReplyPromise<T> reply, ReliablePacket* send, Endpoint endpoint ) {
try {
T t = wait( reply.getFuture() );
FlowTransport::transport().cancelReliable(send);

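PeerHolder above is a plain RAII counter: construction increments the peer's outstandingReplies and destruction decrements it, so every exit path of waitValueOrSignal, including cancellation, balances the count. A standalone C++ analogue of the same idea, with demo types in place of flow's Reference/Peer:

    #include <iostream>
    #include <memory>

    struct DemoPeer { int outstandingReplies = 0; };

    struct DemoPeerHolder {
        std::shared_ptr<DemoPeer> peer;
        explicit DemoPeerHolder(std::shared_ptr<DemoPeer> p) : peer(std::move(p)) {
            if (peer) peer->outstandingReplies++;
        }
        ~DemoPeerHolder() {
            if (peer) peer->outstandingReplies--;
        }
    };

    int main() {
        auto peer = std::make_shared<DemoPeer>();
        {
            DemoPeerHolder holder(peer);  // reply now counted as outstanding
            std::cout << peer->outstandingReplies << "\n";  // 1
        }  // scope exit plays the role of actor completion or cancellation
        std::cout << peer->outstandingReplies << "\n";      // 0
    }
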
@@ -35,7 +35,7 @@ void networkSender(Future<T> input, Endpoint endpoint) {
try {
T value = wait(input);
if (FLOW_KNOBS->USE_OBJECT_SERIALIZER) {
FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<EnsureTable<T>>>(value), endpoint);
FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<EnsureTable<T>>>(value), endpoint, false);
} else {
FlowTransport::transport().sendUnreliable(SerializeBoolAnd<T>(true, value), endpoint, false);
}

@@ -43,7 +43,7 @@ void networkSender(Future<T> input, Endpoint endpoint) {
// if (err.code() == error_code_broken_promise) return;
ASSERT(err.code() != error_code_actor_cancelled);
if (FLOW_KNOBS->USE_OBJECT_SERIALIZER) {
FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<EnsureTable<T>>>(err), endpoint);
FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<EnsureTable<T>>>(err), endpoint, false);
} else {
FlowTransport::transport().sendUnreliable(SerializeBoolAnd<Error>(false, err), endpoint, false);
}

@@ -369,7 +369,13 @@ private:
g_simulator.lastConnectionFailure = now();
double a = deterministicRandom()->random01(), b = deterministicRandom()->random01();
TEST(true); // Simulated connection failure
TraceEvent("ConnectionFailure", dbgid).detail("MyAddr", process->address).detail("PeerAddr", peerProcess->address).detail("SendClosed", a > .33).detail("RecvClosed", a < .66).detail("Explicit", b < .3);
TraceEvent("ConnectionFailure", dbgid)
.detail("MyAddr", process->address)
.detail("PeerAddr", peerProcess->address)
.detail("PeerIsValid", peer.isValid())
.detail("SendClosed", a > .33)
.detail("RecvClosed", a < .66)
.detail("Explicit", b < .3);
if (a < .66 && peer) peer->closeInternal();
if (a > .33) closeInternal();
// At the moment, we occasionally notice the connection failed immediately. In principle, this could happen but only after a delay.

@@ -381,7 +387,8 @@ private:
ACTOR static Future<Void> trackLeakedConnection( Sim2Conn* self ) {
wait( g_simulator.onProcess( self->process ) );
if (self->process->address.isPublic()) {
wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) );
wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 +
FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME * 2.1 + FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT));
} else {
wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) );
}

@@ -302,7 +302,8 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
//TODO: use notify to only send a heartbeat once per interval
availableLeaders.erase( LeaderInfo(req.prevChangeID) );
availableLeaders.insert( req.myInfo );
req.reply.send( currentNominee.present() && currentNominee.get().equalInternalId(req.myInfo) );
req.reply.send(
LeaderHeartbeatReply{ currentNominee.present() && currentNominee.get().equalInternalId(req.myInfo) });
}
when (ForwardRequest req = waitNext( interf.forward.getFuture() ) ) {
LeaderInfo newInfo;

@@ -499,7 +500,7 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore
when ( LeaderHeartbeatRequest req = waitNext( interf.leaderHeartbeat.getFuture() ) ) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if( forward.present() )
req.reply.send( false );
req.reply.send(LeaderHeartbeatReply{ false });
else
regs.getInterface(req.key, id).leaderHeartbeat.send(req);
}

@@ -136,12 +136,29 @@ struct CandidacyRequest {
}
};

struct LeaderHeartbeatReply {
constexpr static FileIdentifier file_identifier = 11;

bool value = false;
LeaderHeartbeatReply() = default;
explicit LeaderHeartbeatReply(bool value) : value(value) {}

template <class Ar>
void serialize(Ar& ar) {
serializer(ar, value);
}
};

inline bool operator==(const LeaderHeartbeatReply& lhs, const LeaderHeartbeatReply& rhs) {
return lhs.value == rhs.value;
}

struct LeaderHeartbeatRequest {
constexpr static FileIdentifier file_identifier = 9495992;
Key key;
LeaderInfo myInfo;
UID prevChangeID;
ReplyPromise<bool> reply;
ReplyPromise<LeaderHeartbeatReply> reply;

LeaderHeartbeatRequest() {}
explicit LeaderHeartbeatRequest( Key key, LeaderInfo const& myInfo, UID prevChangeID ) : key(key), myInfo(myInfo), prevChangeID(prevChangeID) {}

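Wrapping the bare bool in LeaderHeartbeatReply gives the reply type its own FileIdentifier, which the object serializer uses for per-type dispatch; a primitive cannot carry one. A self-contained mock of the pattern — the archive and serializer() below are stubs for illustration, not flow's real implementation:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    using FileIdentifier = uint32_t;

    struct MockArchive {
        std::vector<uint8_t> bytes;
        void putByte(uint8_t b) { bytes.push_back(b); }
    };

    template <class Ar, class T>
    void serializer(Ar& ar, const T& field) {  // stand-in for flow's serializer()
        ar.putByte(static_cast<uint8_t>(field));
    }

    struct DemoHeartbeatReply {
        constexpr static FileIdentifier file_identifier = 11;  // value from the diff
        bool value = false;
        template <class Ar>
        void serialize(Ar& ar) { serializer(ar, value); }
    };

    int main() {
        DemoHeartbeatReply reply;
        reply.value = true;
        MockArchive ar;
        ar.putByte(DemoHeartbeatReply::file_identifier & 0xff);  // type tag first
        reply.serialize(ar);
        std::cout << "encoded " << ar.bytes.size() << " bytes\n";  // 2
    }
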
@@ -3008,7 +3008,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
}
} catch(Error& e) {
if(logTeamEvents) {
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority());
}
self->priority_teams[team->getPriority()]--;
if (team->isHealthy()) {

@@ -4347,7 +4347,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> disablePops;
for (const auto & tlog : tlogs) {
disablePops.push_back(
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), operation_failed())
transformErrors(throwErrorOr(tlog.disablePopRequest.tryGetReply(TLogDisablePopRequest(snapReq.snapUID))), snap_disable_tlog_pop_failed())
);
}
wait(waitForAll(disablePops));

@@ -4356,14 +4356,14 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap local storage nodes
std::vector<WorkerInterface> storageWorkers = wait(getStorageWorkers(cx, db, true /* localOnly */));
std::vector<WorkerInterface> storageWorkers = wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
TraceEvent("SnapDataDistributor_GotStorageWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> storageSnapReqs;
for (const auto & worker : storageWorkers) {
storageSnapReqs.push_back(
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), operation_failed())
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("storage")))), snap_storage_failed())
);
}
wait(waitForAll(storageSnapReqs));

@@ -4375,7 +4375,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> tLogSnapReqs;
for (const auto & tlog : tlogs) {
tLogSnapReqs.push_back(
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), operation_failed())
transformErrors(throwErrorOr(tlog.snapRequest.tryGetReply(TLogSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("tlog")))), snap_tlog_failed())
);
}
wait(waitForAll(tLogSnapReqs));

@@ -4387,7 +4387,7 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> enablePops;
for (const auto & tlog : tlogs) {
enablePops.push_back(
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), operation_failed())
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
);
}
wait(waitForAll(enablePops));

@@ -4403,18 +4403,36 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
std::vector<Future<Void>> coordSnapReqs;
for (const auto & worker : coordWorkers) {
coordSnapReqs.push_back(
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), operation_failed())
transformErrors(throwErrorOr(worker.workerSnapReq.tryGetReply(WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, LiteralStringRef("coord")))), snap_coord_failed())
);
}
wait(waitForAll(coordSnapReqs));
TraceEvent("SnapDataDistributor_AfterSnapCoords")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
} catch (Error& e) {
} catch (Error& err) {
state Error e = err;
TraceEvent("SnapDataDistributor_SnapReqExit")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */);
if (e.code() == error_code_snap_storage_failed
|| e.code() == error_code_snap_tlog_failed
|| e.code() == error_code_operation_cancelled) {
// enable tlog pop on local tlog nodes
std::vector<TLogInterface> tlogs = db->get().logSystemConfig.allLocalLogs(false);
try {
std::vector<Future<Void>> enablePops;
for (const auto & tlog : tlogs) {
enablePops.push_back(
transformErrors(throwErrorOr(tlog.enablePopRequest.tryGetReply(TLogEnablePopRequest(snapReq.snapUID))), snap_enable_tlog_pop_failed())
);
}
wait(waitForAll(enablePops));
} catch (Error& error) {
TraceEvent(SevDebug, "IgnoreEnableTLogPopFailure");
}
}
throw e;
}
return Void();

@@ -4435,7 +4453,7 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq, Reference<AsyncV
TraceEvent("SnapDDCreateDBInfoChanged")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
snapReq.reply.sendError(operation_failed());
snapReq.reply.sendError(snap_with_recovery_unsupported());
}
when (wait(ddSnapCreateCore(snapReq, db))) {
TraceEvent("SnapDDCreateSuccess")

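The snapshot path above replaces the generic operation_failed() with per-phase errors via transformErrors, so the catch block can react differently to a storage failure than to a tlog failure (here, re-enabling tlog pops only for the phases that may have left them disabled). A minimal standalone rendering of the wrap-and-rethrow idea; the exception types are stand-ins for fdb error codes:

    #include <iostream>
    #include <stdexcept>

    struct snap_tlog_failed : std::runtime_error {
        snap_tlog_failed() : std::runtime_error("snap_tlog_failed") {}
    };

    template <class F>
    void transformErrorsDemo(F&& phase) {
        try {
            phase();
        } catch (...) {
            throw snap_tlog_failed();  // collapse any failure to the phase error
        }
    }

    int main() {
        try {
            transformErrorsDemo([] { throw std::runtime_error("socket reset"); });
        } catch (const snap_tlog_failed& e) {
            std::cout << "phase failed: " << e.what() << "\n";
        }
    }
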
@@ -888,6 +888,8 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
state bool allHealthy = true;
state bool anyWithSource = false;
state std::vector<std::pair<Reference<IDataDistributionTeam>,bool>> bestTeams;
state double startTime = now();
state std::vector<UID> destIds;

try {
if(now() - self->lastInterval < 1.0) {

@@ -964,7 +966,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) );
}

state std::vector<UID> destIds;
destIds.clear();
state std::vector<UID> healthyIds;
state std::vector<UID> extraIds;
state std::vector<ShardsAffectedByTeamFailure::Team> destinationTeams;

@@ -1075,7 +1077,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd

// onFinished.send( rs );
if( !error.code() ) {
TraceEvent(relocateShardInterval.end(), distributorId).detail("Result","Success");
TraceEvent(relocateShardInterval.end(), distributorId).detail("Duration", now() - startTime).detail("Result","Success");
if(now() - startTime > 600) {
TraceEvent(SevWarnAlways, "RelocateShardTooLong").detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
}
if(rd.keys.begin == keyServersPrefix) {
TraceEvent("MovedKeyServerKeys").detail("Dest", describe(destIds)).trackLatest("MovedKeyServers");
}

@@ -1099,7 +1104,10 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
}
}
} catch (Error& e) {
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true);
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime);
if(now() - startTime > 600) {
TraceEvent(SevWarnAlways, "RelocateShardTooLong").error(e, true).detail("Duration", now() - startTime).detail("Dest", describe(destIds)).detail("Src", describe(rd.src));
}
if( !signalledTransferComplete )
dataTransferComplete.send( rd );

@@ -21,7 +21,6 @@ ExecCmdValueString::ExecCmdValueString(StringRef pCmdValueString) {
void ExecCmdValueString::setCmdValueString(StringRef pCmdValueString) {
// reset everything
binaryPath = StringRef();
keyValueMap.clear();

// set the new cmdValueString
cmdValueString = pCmdValueString;

@@ -42,18 +41,10 @@ VectorRef<StringRef> ExecCmdValueString::getBinaryArgs() {
return binaryArgs;
}

StringRef ExecCmdValueString::getBinaryArgValue(StringRef key) {
StringRef res;
if (keyValueMap.find(key) != keyValueMap.end()) {
res = keyValueMap[key];
}
return res;
}

void ExecCmdValueString::parseCmdValue() {
StringRef param = this->cmdValueString;
// get the binary path
this->binaryPath = param.eat(LiteralStringRef(":"));
this->binaryPath = param.eat(LiteralStringRef(" "));

// no arguments provided
if (param == StringRef()) {

@@ -62,11 +53,8 @@ void ExecCmdValueString::parseCmdValue() {

// extract the arguments
while (param != StringRef()) {
StringRef token = param.eat(LiteralStringRef(","));
StringRef token = param.eat(LiteralStringRef(" "));
this->binaryArgs.push_back(this->binaryArgs.arena(), token);

StringRef key = token.eat(LiteralStringRef("="));
keyValueMap.insert(std::make_pair(key, token));
}
return;
}

@@ -153,29 +141,30 @@ ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> par
}
#endif

ACTOR Future<int> execHelper(ExecCmdValueString* execArg, std::string folder, std::string role) {
state StringRef uidStr = execArg->getBinaryArgValue(LiteralStringRef("uid"));
ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::string folder, std::string role) {
state Standalone<StringRef> uidStr = snapUID.toString();
state int err = 0;
state Future<int> cmdErr;
state double maxWaitTime = SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT;
if (!g_network->isSimulated()) {
// get bin path
auto snapBin = execArg->getBinaryPath();
auto dataFolder = "path=" + folder;
std::vector<std::string> paramList;
paramList.push_back(snapBin.toString());
// get user passed arguments
auto listArgs = execArg->getBinaryArgs();
for (auto elem : listArgs) {
paramList.push_back(elem.toString());
}
// get additional arguments
paramList.push_back(dataFolder);
paramList.push_back("--path");
paramList.push_back(folder);
const char* version = FDB_VT_VERSION;
std::string versionString = "version=";
versionString += version;
paramList.push_back(versionString);
paramList.push_back("--version");
paramList.push_back(version);
paramList.push_back("--role");
paramList.push_back(role);
paramList.push_back("--uid");
paramList.push_back(uidStr.toString());
cmdErr = spawnProcess(snapBin.toString(), paramList, maxWaitTime, false /*isSync*/, 0);
wait(success(cmdErr));
err = cmdErr.get();

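After this change execHelper builds a conventional flag-style argv (--path/--version/--role/--uid) instead of the old key=value tokens parsed out of the command string. A runnable sketch of the resulting parameter list; the binary path and values below are placeholders, not anything fixed by the commit:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        std::string snapBin = "/usr/local/bin/snap_create.sh";  // hypothetical path
        std::string folder = "/var/fdb/data/4500";              // data folder
        std::string version = "6.2.0";                          // FDB_VT_VERSION stand-in
        std::string role = "storage";
        std::string uid = "0123456789abcdef0123456789abcdef";   // snapUID.toString()

        std::vector<std::string> paramList;
        paramList.push_back(snapBin);
        paramList.push_back("--path");    paramList.push_back(folder);
        paramList.push_back("--version"); paramList.push_back(version);
        paramList.push_back("--role");    paramList.push_back(role);
        paramList.push_back("--uid");     paramList.push_back(uid);

        for (const auto& arg : paramList) std::cout << arg << ' ';
        std::cout << '\n';
    }
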
@@ -27,7 +27,6 @@ public: // ctor & dtor
public: // interfaces
StringRef getBinaryPath();
VectorRef<StringRef> getBinaryArgs();
StringRef getBinaryArgValue(StringRef key);
void setCmdValueString(StringRef cmdValueString);
StringRef getCmdValueString(void);

@@ -41,7 +40,6 @@ private: // data
Standalone<StringRef> cmdValueString;
Standalone<VectorRef<StringRef>> binaryArgs;
StringRef binaryPath;
std::map<StringRef, StringRef> keyValueMap;
};

// FIXME: move this function to a common location

@@ -52,7 +50,7 @@ private: // data
ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> paramList, double maxWaitTime, bool isSync, double maxSimDelayTime);

// helper to run all the work related to running the exec command
ACTOR Future<int> execHelper(ExecCmdValueString* execArg, std::string folder, std::string role);
ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::string folder, std::string role);

// returns true if the execUID op is in progress
bool isExecOpInProgress(UID execUID);

@@ -177,7 +177,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER );
init( DD_SHARD_SIZE_GRANULARITY, 5000000 );
init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0;
init( DD_MOVE_KEYS_PARALLELISM, 20 ); if( randomize && BUGGIFY ) DD_MOVE_KEYS_PARALLELISM = 1;
init( DD_MOVE_KEYS_PARALLELISM, 15 ); if( randomize && BUGGIFY ) DD_MOVE_KEYS_PARALLELISM = 1;
init( DD_MERGE_LIMIT, 2000 ); if( randomize && BUGGIFY ) DD_MERGE_LIMIT = 2;
init( DD_SHARD_METRICS_TIMEOUT, 60.0 ); if( randomize && BUGGIFY ) DD_SHARD_METRICS_TIMEOUT = 0.1;
init( DD_LOCATION_CACHE_SIZE, 2000000 ); if( randomize && BUGGIFY ) DD_LOCATION_CACHE_SIZE = 3;

@@ -352,8 +352,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( RATEKEEPER_FAILURE_TIME, 1.0 );
init( REPLACE_INTERFACE_DELAY, 60.0 );
init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 );
init( COORDINATOR_REGISTER_INTERVAL, 30.0 );
init( CLIENT_REGISTER_INTERVAL, 300.0 );
init( COORDINATOR_REGISTER_INTERVAL, 5.0 );
init( CLIENT_REGISTER_INTERVAL, 600.0 );

init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;
init( EXPECTED_MASTER_FITNESS, ProcessClass::UnsetFit );

@@ -420,8 +420,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {

init( MAX_TPS_HISTORY_SAMPLES, 600 );
init( NEEDED_TPS_HISTORY_SAMPLES, 200 );
init( TARGET_DURABILITY_LAG_VERSIONS, 200e6 );
init( TARGET_DURABILITY_LAG_VERSIONS_BATCH, 100e6 );
init( TARGET_DURABILITY_LAG_VERSIONS, 350e6 ); // Should be larger than STORAGE_DURABILITY_LAG_SOFT_MAX
init( TARGET_DURABILITY_LAG_VERSIONS_BATCH, 250e6 ); // Should be larger than STORAGE_DURABILITY_LAG_SOFT_MAX
init( DURABILITY_LAG_UNLIMITED_THRESHOLD, 50e6 );
init( INITIAL_DURABILITY_LAG_MULTIPLIER, 1.02 );
init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 );

|
|||
state vector<Future<Void>> true_heartbeats;
|
||||
state vector<Future<Void>> false_heartbeats;
|
||||
for(int i=0; i<coordinators.leaderElectionServers.size(); i++) {
|
||||
Future<bool> hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskPriority::CoordinationReply );
|
||||
true_heartbeats.push_back( onEqual(hb, true) );
|
||||
false_heartbeats.push_back( onEqual(hb, false) );
|
||||
Future<LeaderHeartbeatReply> hb = retryBrokenPromise(
|
||||
coordinators.leaderElectionServers[i].leaderHeartbeat,
|
||||
LeaderHeartbeatRequest(coordinators.clusterKey, myInfo, prevChangeID), TaskPriority::CoordinationReply);
|
||||
true_heartbeats.push_back(onEqual(hb, LeaderHeartbeatReply{ true }));
|
||||
false_heartbeats.push_back(onEqual(hb, LeaderHeartbeatReply{ false }));
|
||||
}
|
||||
|
||||
state Future<Void> rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskPriority::CoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side?
|
||||
|
|
|
@ -55,10 +55,25 @@ struct MasterInterface {
|
|||
}
|
||||
};
|
||||
|
||||
struct TLogRejoinReply {
|
||||
constexpr static FileIdentifier file_identifier = 11;
|
||||
|
||||
// false means someone else registered, so we should re-register. true means this master is recovered, so don't
|
||||
// send again to the same master.
|
||||
bool masterIsRecovered;
|
||||
TLogRejoinReply() = default;
|
||||
explicit TLogRejoinReply(bool masterIsRecovered) : masterIsRecovered(masterIsRecovered) {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, masterIsRecovered);
|
||||
}
|
||||
};
|
||||
|
||||
struct TLogRejoinRequest {
|
||||
constexpr static FileIdentifier file_identifier = 15692200;
|
||||
TLogInterface myInterface;
|
||||
ReplyPromise<bool> reply; // false means someone else registered, so we should re-register. true means this master is recovered, so don't send again to the same master.
|
||||
ReplyPromise<TLogRejoinReply> reply;
|
||||
|
||||
TLogRejoinRequest() { }
|
||||
explicit TLogRejoinRequest(const TLogInterface &interf) : myInterface(interf) { }
|
||||
|
|
|
@ -128,7 +128,7 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
|
|||
when ( wait( leaseTimeout ) ) {
|
||||
*outTransactionRate = 0;
|
||||
*outBatchTransactionRate = 0;
|
||||
//TraceEvent("MasterProxyRate", myID).detail("Rate", 0).detail("BatchRate", 0).detail("Lease", "Expired");
|
||||
//TraceEvent("MasterProxyRate", myID).detail("Rate", 0.0).detail("BatchRate", 0.0).detail("Lease", "Expired");
|
||||
leaseTimeout = Never();
|
||||
}
|
||||
}
|
||||
|
@ -156,10 +156,7 @@ ACTOR Future<Void> queueTransactionStartRequests(
|
|||
stats->txnBatchPriorityStartIn += req.transactionCount;
|
||||
|
||||
if (transactionQueue->empty()) {
|
||||
if (now() - *lastGRVTime > *GRVBatchTime)
|
||||
*lastGRVTime = now() - *GRVBatchTime;
|
||||
|
||||
forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskPriority::ProxyGRVTimer));
|
||||
forwardPromise(GRVTimer, delayJittered(std::max(0.0, *GRVBatchTime - (now() - *lastGRVTime)), TaskPriority::ProxyGRVTimer));
|
||||
}
|
||||
|
||||
transactionQueue->push(std::make_pair(req, counter--));
|
||||
|
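The GRV timer fix above guards against handing delayJittered a negative duration when more than a full batch interval has already elapsed; clamping at zero fires the timer immediately instead. The arithmetic in isolation, with illustrative numbers:

    #include <algorithm>
    #include <iostream>

    int main() {
        double GRVBatchTime = 0.005;   // seconds, illustrative
        double sinceLastGRV = 0.008;   // more than a full batch interval ago
        double oldDelay = GRVBatchTime - sinceLastGRV;                 // -0.003
        double newDelay = std::max(0.0, GRVBatchTime - sinceLastGRV);  //  0.0
        std::cout << oldDelay << " -> " << newDelay << "\n";
    }
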
@@ -1108,7 +1105,9 @@ struct TransactionRateInfo {
TransactionRateInfo(double rate) : rate(rate), limit(0) {}

void reset(double elapsed) {
limit = std::min(0.0,limit) + std::min(rate * elapsed, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
limit = std::min(0.0, limit) + rate * elapsed; // Adjust the limit based on the full elapsed interval in order to properly erase a deficit
limit = std::min(limit, rate * SERVER_KNOBS->START_TRANSACTION_BATCH_INTERVAL_MAX); // Don't allow the rate to exceed what would be allowed in the maximum batch interval
limit = std::min(limit, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}

bool canStart(int64_t numAlreadyStarted) {

@@ -1170,7 +1169,7 @@ ACTOR static Future<Void> transactionStarter(
waitNext(GRVTimer.getFuture());
// Select zero or more transactions to start
double t = now();
double elapsed = std::min<double>(now() - lastGRVTime, SERVER_KNOBS->START_TRANSACTION_BATCH_INTERVAL_MAX);
double elapsed = now() - lastGRVTime;
lastGRVTime = t;

if(elapsed == 0) elapsed = 1e-15; // resolve a possible indeterminant multiplication with infinite transaction rate

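TransactionRateInfo::reset in the hunk above now updates the start limit in three explicit steps: carry over any deficit, credit rate * elapsed for the full interval, then cap by one maximum-length batch interval and by the absolute knob. A worked standalone example; the two knob values are placeholders:

    #include <algorithm>
    #include <iostream>

    int main() {
        const double START_TRANSACTION_BATCH_INTERVAL_MAX = 0.5;          // assumed
        const double START_TRANSACTION_MAX_TRANSACTIONS_TO_START = 10000; // assumed
        double rate = 1000.0;   // transactions per second from ratekeeper
        double limit = -50.0;   // overshot by 50 transactions last interval
        double elapsed = 0.2;   // seconds since the last GRV batch

        limit = std::min(0.0, limit) + rate * elapsed;  // erase the deficit: -50 + 200
        limit = std::min(limit, rate * START_TRANSACTION_BATCH_INTERVAL_MAX);
        limit = std::min(limit, START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
        std::cout << "limit = " << limit << "\n";  // 150
    }
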
@@ -1466,7 +1465,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_WhiteListCheckFailed")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw transaction_not_permitted();
throw snap_path_not_whitelisted();
}
// db fully recovered check
if (commitData->db->get().recoveryState != RecoveryState::FULLY_RECOVERED) {

@@ -1478,7 +1477,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_ClusterNotFullyRecovered")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw cluster_not_fully_recovered();
throw snap_not_fully_recovered_unsupported();
}

auto result =

@@ -1493,7 +1492,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
TraceEvent("SnapMasterProxy_LogAnitQuorumNotSupported")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw txn_exec_log_anti_quorum();
throw snap_log_anti_quorum_unsupported();
}

// send a snap request to DD

@@ -139,8 +139,8 @@ Future<Void> checkMoveKeysLockReadOnly( Transaction* tr, MoveKeysLock lock ) {
return checkMoveKeysLock(tr, lock, false);
}

ACTOR Future<Optional<UID>> checkReadWrite( Future< ErrorOr<std::pair<Version,Version>> > fReply, UID uid, Version version ) {
ErrorOr<std::pair<Version,Version>> reply = wait( fReply );
ACTOR Future<Optional<UID>> checkReadWrite(Future<ErrorOr<GetShardStateReply>> fReply, UID uid, Version version) {
ErrorOr<GetShardStateReply> reply = wait(fReply);
if (!reply.present() || reply.get().first < version)
return Optional<UID>();
return Optional<UID>(uid);

@@ -258,6 +258,14 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
return result;
}

ACTOR Future<Void> logWarningAfter( const char * context, double duration, vector<UID> servers) {
state double startTime = now();
loop {
wait(delay(duration));
TraceEvent(SevWarnAlways, context).detail("Duration", now() - startTime).detail("Servers", describe(servers));
}
}

// keyServer: map from keys to destination servers
// serverKeys: two-dimension map: [servers][keys], value is the servers' state of having the keys: active(not-have), complete(already has), ""().
// Set keyServers[keys].dest = servers

@@ -265,6 +273,7 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
// Set serverKeys[dest][keys] = "" for the dest servers of each existing shard in keys (unless that destination is a member of servers OR if the source list is sufficiently degraded)
ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> servers, MoveKeysLock lock, FlowLock *startMoveKeysLock, UID relocationIntervalId ) {
state TraceInterval interval("RelocateShard_StartMoveKeys");
state Future<Void> warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers);
//state TraceInterval waitInterval("");

wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) );

@@ -436,7 +445,8 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
ACTOR Future<Void> waitForShardReady( StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode ) {
loop {
try {
std::pair<Version,Version> rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskPriority::MoveKeys ) );
GetShardStateReply rep =
wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys));
if (rep.first >= minVersion) {
return Void();
}

@@ -502,6 +512,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
{
state TraceInterval interval("RelocateShard_FinishMoveKeys");
state TraceInterval waitInterval("");
state Future<Void> warningLogger = logWarningAfter("FinishMoveKeysTooLong", 600, destinationTeam);
state Key begin = keys.begin;
state Key endKey;
state int retries = 0;

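logWarningAfter above is a fire-and-forget warner: it complains repeatedly until the Future holding it is dropped, which in flow happens automatically when the owning actor returns or is cancelled. A plain-threads analogue of the same lifecycle, with demo-scale timings instead of the 600-second threshold:

    #include <atomic>
    #include <chrono>
    #include <iostream>
    #include <thread>

    struct Watchdog {
        std::atomic<bool> stop{false};
        std::thread t;
        Watchdog(const char* context, std::chrono::milliseconds duration)
            : t([this, context, duration] {
                  auto start = std::chrono::steady_clock::now();
                  while (!stop) {
                      std::this_thread::sleep_for(duration);
                      if (stop) break;
                      auto secs = std::chrono::duration<double>(
                                      std::chrono::steady_clock::now() - start).count();
                      std::cerr << context << " still running after " << secs << "s\n";
                  }
              }) {}
        ~Watchdog() { stop = true; t.join(); }  // owner finished: silence the dog
    };

    int main() {
        Watchdog dog("StartMoveKeysTooLong", std::chrono::milliseconds(100));
        std::this_thread::sleep_for(std::chrono::milliseconds(250));  // "the work"
        return 0;  // Watchdog destructor cancels further warnings
    }
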
@@ -1119,9 +1119,9 @@ namespace oldTLog_4_6 {
req.myInterface = tli;
TraceEvent("TLogRejoining", self->dbgid).detail("Master", self->dbInfo->get().master.id());
choose {
when ( bool success = wait( brokenPromiseToNever( self->dbInfo->get().master.tlogRejoin.getReply( req ) ) ) ) {
if (success)
lastMasterID = self->dbInfo->get().master.id();
when(TLogRejoinReply rep =
wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) {
if (rep.masterIsRecovered) lastMasterID = self->dbInfo->get().master.id();
}
when ( wait( self->dbInfo->onChange() ) ) { }
}

@@ -1477,9 +1477,9 @@ ACTOR Future<Void> rejoinMasters( TLogData* self, TLogInterface tli, DBRecoveryC
TLogRejoinRequest req(tli);
TraceEvent("TLogRejoining", self->dbgid).detail("Master", self->dbInfo->get().master.id());
choose {
when ( bool success = wait( brokenPromiseToNever( self->dbInfo->get().master.tlogRejoin.getReply( req ) ) ) ) {
if (success)
lastMasterID = self->dbInfo->get().master.id();
when(TLogRejoinReply rep =
wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) {
if (rep.masterIsRecovered) lastMasterID = self->dbInfo->get().master.id();
}
when ( wait( self->dbInfo->onChange() ) ) { }
}

@@ -1556,8 +1556,7 @@ tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Reference<LogData> logDa
}
ExecCmdValueString snapArg(snapReq.snapPayload);
try {
Standalone<StringRef> role = LiteralStringRef("role=").withSuffix(snapReq.role);
int err = wait(execHelper(&snapArg, self->dataFolder, role.toString()));
int err = wait(execHelper(&snapArg, snapReq.snapUID, self->dataFolder, snapReq.role.toString()));

std::string uidStr = snapReq.snapUID.toString();
TraceEvent("ExecTraceTLog")

@@ -93,7 +93,6 @@ struct StorageQueueInfo {
Smoother verySmoothDurableVersion, smoothLatestVersion;
Smoother smoothFreeSpace;
Smoother smoothTotalSpace;
double localRateLimit;
limitReason_t limitReason;
StorageQueueInfo(UID id, LocalityData locality) : valid(false), id(id), locality(locality), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),

@@ -147,7 +146,7 @@ struct RatekeeperLimits {
logTargetBytes(logTargetBytes),
logSpringBytes(logSpringBytes),
maxVersionDifference(maxVersionDifference),
durabilityLagTargetVersions(durabilityLagTargetVersions),
durabilityLagTargetVersions(durabilityLagTargetVersions + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS), // The read transaction life versions are expected to not be durable on the storage servers
durabilityLagLimit(std::numeric_limits<double>::infinity()),
lastDurabilityLag(0),
context(context)

@@ -203,7 +202,6 @@ ACTOR Future<Void> trackStorageServerQueueInfo( RatekeeperData* self, StorageSer
myQueueInfo->value.valid = true;
myQueueInfo->value.prevReply = myQueueInfo->value.lastReply;
myQueueInfo->value.lastReply = reply.get();
myQueueInfo->value.localRateLimit = reply.get().localRateLimit;
if (myQueueInfo->value.prevReply.instanceID != reply.get().instanceID) {
myQueueInfo->value.smoothDurableBytes.reset(reply.get().bytesDurable);
myQueueInfo->value.verySmoothDurableBytes.reset(reply.get().bytesDurable);

@@ -376,8 +374,6 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
int64_t worstStorageQueueStorageServer = 0;
int64_t limitingStorageQueueStorageServer = 0;
int64_t worstDurabilityLag = 0;
double worstStorageLocalLimit = 0;
double limitingStorageLocalLimit = 0;

std::multimap<double, StorageQueueInfo*> storageTpsLimitReverseIndex;
std::multimap<int64_t, StorageQueueInfo*> storageDurabilityLagReverseIndex;

@@ -408,7 +404,6 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {

int64_t storageQueue = ss.lastReply.bytesInput - ss.smoothDurableBytes.smoothTotal();
worstStorageQueueStorageServer = std::max(worstStorageQueueStorageServer, storageQueue);
worstStorageLocalLimit = std::min(worstStorageLocalLimit, ss.localRateLimit);

int64_t storageDurabilityLag = ss.smoothLatestVersion.smoothTotal() - ss.verySmoothDurableVersion.smoothTotal();
worstDurabilityLag = std::max(worstDurabilityLag, storageDurabilityLag);

@@ -485,7 +480,6 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
}

limitingStorageQueueStorageServer = ss->second->lastReply.bytesInput - ss->second->smoothDurableBytes.smoothTotal();
limitingStorageLocalLimit = ss->second->lastReply.localRateLimit;
limits->tpsLimit = ss->first;
reasonID = storageTpsLimitReverseIndex.begin()->second->id; // Although we aren't controlling based on the worst SS, we still report it as the limiting process
limitReason = ssReasons[reasonID];

@@ -679,14 +673,12 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
.detail("WorstFreeSpaceTLog", worstFreeSpaceTLog)
.detail("WorstStorageServerQueue", worstStorageQueueStorageServer)
.detail("LimitingStorageServerQueue", limitingStorageQueueStorageServer)
.detail("WorstStorageLocalLimit", worstStorageLocalLimit)
.detail("LimitingStorageLocalLimit", limitingStorageLocalLimit)
.detail("WorstTLogQueue", worstStorageQueueTLog)
.detail("TotalDiskUsageBytes", totalDiskUsageBytes)
.detail("WorstStorageServerVersionLag", worstVersionLag)
.detail("LimitingStorageServerVersionLag", limitingVersionLag)
.detail("WorstDurabilityLag", worstDurabilityLag)
.detail("LimitingDurabilityLag", limitingDurabilityLag)
.detail("WorstStorageServerDurabilityLag", worstDurabilityLag)
.detail("LimitingStorageServerDurabilityLag", limitingDurabilityLag)
.trackLatest(name.c_str());
}
}

@@ -103,9 +103,22 @@ struct ResolveTransactionBatchRequest
 	}
 };

+struct ResolutionMetricsReply {
+	constexpr static FileIdentifier file_identifier = 3;
+
+	int64_t value;
+	ResolutionMetricsReply() = default;
+	explicit ResolutionMetricsReply(int64_t value) : value(value) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, value);
+	}
+};
+
 struct ResolutionMetricsRequest {
 	constexpr static FileIdentifier file_identifier = 11663527;
-	ReplyPromise<int64_t> reply;
+	ReplyPromise<ResolutionMetricsReply> reply;

 	template <class Archive>
 	void serialize(Archive& ar) {
@@ -388,6 +388,13 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector<Work
 	return machineMap;
 }

+JsonBuilderObject getLagObject(int64_t versions) {
+	JsonBuilderObject lag;
+	lag["versions"] = versions;
+	lag["seconds"] = versions / (double)SERVER_KNOBS->VERSIONS_PER_SECOND;
+	return lag;
+}
+
 struct MachineMemoryInfo {
 	double memoryUsage;
 	double numProcesses;
@@ -474,17 +481,8 @@ struct RolesInfo
 				obj["read_latency_bands"] = addLatencyBandInfo(readLatencyMetrics);
 			}

-			JsonBuilderObject dataLag;
-			dataLag["versions"] = versionLag;
-			dataLagSeconds = versionLag / (double)SERVER_KNOBS->VERSIONS_PER_SECOND;
-			dataLag["seconds"] = dataLagSeconds;
-
-			JsonBuilderObject durabilityLag;
-			durabilityLag["versions"] = version - durableVersion;
-			durabilityLag["seconds"] = (version - durableVersion) / (double)SERVER_KNOBS->VERSIONS_PER_SECOND;
-
-			obj["data_lag"] = dataLag;
-			obj["durability_lag"] = durabilityLag;
+			obj["data_lag"] = getLagObject(versionLag);
+			obj["durability_lag"] = getLagObject(version - durableVersion);

 		} catch (Error& e) {
 			if(e.code() != error_code_attribute_not_found)
@@ -551,6 +549,11 @@ struct RolesInfo
 	JsonBuilderObject& addRole(std::string const& role, InterfaceType& iface) {
 		return addRole(iface.address(), role, iface.id());
 	}
+	JsonBuilderObject& addCoordinatorRole(NetworkAddress addr) {
+		JsonBuilderObject obj;
+		obj["role"] = "coordinator";
+		return roles.insert(std::make_pair(addr, obj))->second;
+	}
 	JsonBuilderArray getStatusForAddress( NetworkAddress a ) {
 		JsonBuilderArray v;
 		auto it = roles.lower_bound(a);
@@ -568,7 +571,8 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
     WorkerEvents programStarts, std::map<std::string, std::vector<JsonBuilderObject>> processIssues,
     vector<std::pair<StorageServerInterface, EventMap>> storageServers,
     vector<std::pair<TLogInterface, EventMap>> tLogs, vector<std::pair<MasterProxyInterface, EventMap>> proxies,
-    Database cx, Optional<DatabaseConfiguration> configuration, Optional<Key> healthyZone, std::set<std::string>* incomplete_reasons) {
+    ServerCoordinators coordinators, Database cx, Optional<DatabaseConfiguration> configuration,
+    Optional<Key> healthyZone, std::set<std::string>* incomplete_reasons) {

 	state JsonBuilderObject processMap;

@@ -649,6 +653,10 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
 		}
 	}

+	for(auto& coordinator : coordinators.ccf->getConnectionString().coordinators()) {
+		roles.addCoordinatorRole(coordinator);
+	}
+
 	state std::vector<std::pair<MasterProxyInterface, EventMap>>::iterator proxy;
 	for(proxy = proxies.begin(); proxy != proxies.end(); ++proxy) {
 		roles.addRole( "proxy", proxy->first, proxy->second );
@@ -1150,8 +1158,9 @@ struct LoadConfigurationResult
 	Optional<Key> healthyZone;
 	double healthyZoneSeconds;
 	bool rebalanceDDIgnored;
+	bool dataDistributionDisabled;

-	LoadConfigurationResult() : fullReplication(true), healthyZoneSeconds(0), rebalanceDDIgnored(false) {}
+	LoadConfigurationResult() : fullReplication(true), healthyZoneSeconds(0), rebalanceDDIgnored(false), dataDistributionDisabled(false) {}
 };

 ACTOR static Future<std::pair<Optional<DatabaseConfiguration>,Optional<LoadConfigurationResult>>> loadConfiguration(Database cx, JsonBuilderArray *messages, std::set<std::string> *status_incomplete_reasons){
@@ -1193,9 +1202,10 @@ ACTOR static Future<std::pair<Optional<DatabaseConfiguration>,Optional<LoadConfi
 			}
 			state Future<Optional<Value>> healthyZoneValue = tr.get(healthyZoneKey);
 			state Future<Optional<Value>> rebalanceDDIgnored = tr.get(rebalanceDDIgnoreKey);
+			state Future<Optional<Value>> ddModeKey = tr.get(dataDistributionModeKey);

 			choose {
-				when(wait(waitForAll(replicasFutures) && success(healthyZoneValue) && success(rebalanceDDIgnored))) {
+				when(wait(waitForAll(replicasFutures) && success(healthyZoneValue) && success(rebalanceDDIgnored) && success(ddModeKey))) {
 					int unreplicated = 0;
 					for(int i = 0; i < result.get().regions.size(); i++) {
 						if( !replicasFutures[i].get().present() || decodeDatacenterReplicasValue(replicasFutures[i].get().get()) < result.get().storageTeamSize ) {
@@ -1206,12 +1216,23 @@ ACTOR static Future<std::pair<Optional<DatabaseConfiguration>,Optional<LoadConfi
 					res.fullReplication = (!unreplicated || (result.get().usableRegions == 1 && unreplicated < result.get().regions.size()));
 					if(healthyZoneValue.get().present()) {
 						auto healthyZone = decodeHealthyZoneValue(healthyZoneValue.get().get());
-						if(healthyZone.second > tr.getReadVersion().get()) {
+						if(healthyZone.first == ignoreSSFailuresZoneString) {
+							res.healthyZone = healthyZone.first;
+						}
+						else if(healthyZone.second > tr.getReadVersion().get()) {
 							res.healthyZone = healthyZone.first;
 							res.healthyZoneSeconds = (healthyZone.second-tr.getReadVersion().get())/CLIENT_KNOBS->CORE_VERSIONSPERSECOND;
 						}
 					}
 					res.rebalanceDDIgnored = rebalanceDDIgnored.get().present();
+					if (ddModeKey.get().present()) {
+						BinaryReader rd(ddModeKey.get().get(), Unversioned());
+						int currentMode;
+						rd >> currentMode;
+						if (currentMode == 0) {
+							res.dataDistributionDisabled = true;
+						}
+					}
 					loadResult = res;
 				}
 				when(wait(getConfTimeout)) {
@@ -1611,8 +1632,15 @@ ACTOR static Future<JsonBuilderObject> workloadStatusFetcher(Reference<AsyncVar<
 			(*data_overlay)["least_operating_space_bytes_storage_server"] = std::max(worstFreeSpaceStorageServer, (int64_t)0);
 			(*qos).setKeyRawNumber("worst_queue_bytes_storage_server", ratekeeper.getValue("WorstStorageServerQueue"));
 			(*qos).setKeyRawNumber("limiting_queue_bytes_storage_server", ratekeeper.getValue("LimitingStorageServerQueue"));
+
+			// TODO: These can be removed in the next release after 6.2
 			(*qos).setKeyRawNumber("worst_version_lag_storage_server", ratekeeper.getValue("WorstStorageServerVersionLag"));
 			(*qos).setKeyRawNumber("limiting_version_lag_storage_server", ratekeeper.getValue("LimitingStorageServerVersionLag"));
+
+			(*qos)["worst_data_lag_storage_server"] = getLagObject(ratekeeper.getInt64("WorstStorageServerVersionLag"));
+			(*qos)["limiting_data_lag_storage_server"] = getLagObject(ratekeeper.getInt64("LimitingStorageServerVersionLag"));
+			(*qos)["worst_durability_lag_storage_server"] = getLagObject(ratekeeper.getInt64("WorstStorageServerDurabilityLag"));
+			(*qos)["limiting_durability_lag_storage_server"] = getLagObject(ratekeeper.getInt64("LimitingStorageServerDurabilityLag"));
 		}

 		if(tlogCount > 0) {
@@ -2173,6 +2201,9 @@ ACTOR Future<StatusReply> clusterGetStatus(
 			if (loadResult.get().rebalanceDDIgnored) {
 				statusObj["data_distribution_disabled_for_rebalance"] = true;
 			}
+			if (loadResult.get().dataDistributionDisabled) {
+				statusObj["data_distribution_disabled"] = true;
+			}
 		}

 		statusObj["machines"] = machineStatusFetcher(mMetrics, workers, configuration, &status_incomplete_reasons);
@@ -2207,7 +2238,6 @@ ACTOR Future<StatusReply> clusterGetStatus(
 		futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons));
 		futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons));
 		futures2.push_back(clusterSummaryStatisticsFetcher(pMetrics, storageServerFuture, tLogFuture, &status_incomplete_reasons));
-
 		state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));

 		int oldLogFaultTolerance = 100;
@@ -2293,8 +2323,9 @@ ACTOR Future<StatusReply> clusterGetStatus(

 			JsonBuilderObject processStatus = wait(processStatusFetcher(db, workers, pMetrics, mMetrics, networkMetrics,
 			                                                            latestError, traceFileOpenErrors, programStarts,
-			                                                            processIssues, storageServers, tLogs, proxies, cx,
-			                                                            configuration, loadResult.present() ? loadResult.get().healthyZone : Optional<Key>(),
+			                                                            processIssues, storageServers, tLogs, proxies,
+			                                                            coordinators, cx, configuration,
+			                                                            loadResult.present() ? loadResult.get().healthyZone : Optional<Key>(),
 			                                                            &status_incomplete_reasons));
 			statusObj["processes"] = processStatus;
 			statusObj["clients"] = clientStatusFetcher(clientStatus);
@@ -2304,11 +2335,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
 				incompatibleConnectionsArray.push_back(it.toString());
 			}
 			statusObj["incompatible_connections"] = incompatibleConnectionsArray;
-
-			StatusObject datacenterLag;
-			datacenterLag["versions"] = datacenterVersionDifference;
-			datacenterLag["seconds"] = datacenterVersionDifference / (double)SERVER_KNOBS->VERSIONS_PER_SECOND;
-			statusObj["datacenter_lag"] = datacenterLag;
+			statusObj["datacenter_lag"] = getLagObject(datacenterVersionDifference);

 			int totalDegraded = 0;
 			for(auto& it : workers) {
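All of the versions/seconds lag pairs in status (`data_lag`, `durability_lag`, the new qos fields, and `datacenter_lag`) now go through the single `getLagObject` helper above. A standalone sketch of the conversion it performs, outside the JsonBuilder machinery; the 1e6 versions-per-second figure is the conventional knob value and is an assumption here:

    #include <cstdint>
    #include <cstdio>

    // Sketch of getLagObject's versions-to-seconds conversion.
    // Assumes VERSIONS_PER_SECOND == 1e6 (the actual value is knob-dependent).
    int main() {
        const double VERSIONS_PER_SECOND = 1e6;
        int64_t versionLag = 1500000;
        printf("{\"versions\": %lld, \"seconds\": %g}\n", (long long)versionLag,
               versionLag / VERSIONS_PER_SECOND);
        // Prints: {"versions": 1500000, "seconds": 1.5}
        return 0;
    }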
@@ -216,6 +216,19 @@ struct TagMessagesRef
 	}
 };

+struct TLogCommitReply {
+	constexpr static FileIdentifier file_identifier = 3;
+
+	Version version;
+	TLogCommitReply() = default;
+	explicit TLogCommitReply(Version version) : version(version) {}
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, version);
+	}
+};
+
 struct TLogCommitRequest {
 	constexpr static FileIdentifier file_identifier = 4022206;
 	Arena arena;
@@ -223,7 +236,7 @@ struct TLogCommitRequest

 	StringRef messages;// Each message prefixed by a 4-byte length

-	ReplyPromise<Version> reply;
+	ReplyPromise<TLogCommitReply> reply;
 	Optional<UID> debugID;

 	TLogCommitRequest() {}
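`TLogCommitReply` is one instance of a pattern repeated across this merge (`ResolutionMetricsReply`, `TLogRejoinReply`, `CheckReply`, `WatchValueReply`, `GetShardStateReply`, `VersionReply`): a bare primitive behind a `ReplyPromise` gets wrapped in a single-field struct so the reply carries its own `file_identifier` and can serve as a serialization root. A minimal sketch of the shape; the struct name and identifier value here are illustrative, and `serializer()` is supplied by flow's serialization headers (the template body is only instantiated by that machinery):

    #include <cstdint>

    using FileIdentifier = uint32_t;
    using Version = int64_t;

    // One-field reply wrapper: gives a primitive payload a file_identifier so
    // it can be (de)serialized as a message root.
    struct CommitReplySketch {
        constexpr static FileIdentifier file_identifier = 3; // illustrative value
        Version version = 0;

        CommitReplySketch() = default;
        explicit CommitReplySketch(Version version) : version(version) {}

        template <class Ar>
        void serialize(Ar& ar) {
            serializer(ar, version); // resolved by flow at instantiation time
        }
    };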
@@ -1861,9 +1861,9 @@ ACTOR Future<Void> rejoinMasters( TLogData* self, TLogInterface tli, DBRecoveryC
 			TLogRejoinRequest req(tli);
 			TraceEvent("TLogRejoining", self->dbgid).detail("Master", self->dbInfo->get().master.id());
 			choose {
-				when ( bool success = wait( brokenPromiseToNever( self->dbInfo->get().master.tlogRejoin.getReply( req ) ) ) ) {
-					if (success)
-						lastMasterID = self->dbInfo->get().master.id();
+				when(TLogRejoinReply rep =
+				         wait(brokenPromiseToNever(self->dbInfo->get().master.tlogRejoin.getReply(req)))) {
+					if (rep.masterIsRecovered) lastMasterID = self->dbInfo->get().master.id();
 				}
 				when ( wait( self->dbInfo->onChange() ) ) { }
 			}
@@ -1941,8 +1941,7 @@ tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Reference<LogData> logDa
 	}
 	ExecCmdValueString snapArg(snapReq.snapPayload);
 	try {
-		Standalone<StringRef> role = LiteralStringRef("role=").withSuffix(snapReq.role);
-		int err = wait(execHelper(&snapArg, self->dataFolder, role.toString()));
+		int err = wait(execHelper(&snapArg, snapReq.snapUID, self->dataFolder, snapReq.role.toString()));

 		std::string uidStr = snapReq.snapUID.toString();
 		TraceEvent("ExecTraceTLog")
@@ -30,12 +30,12 @@
 #include "fdbserver/RecoveryState.h"
 #include "flow/actorcompiler.h" // This must be the last #include.

-ACTOR Future<Version> minVersionWhenReady( Future<Void> f, std::vector<Future<Version>> replies) {
+ACTOR Future<Version> minVersionWhenReady(Future<Void> f, std::vector<Future<TLogCommitReply>> replies) {
 	wait(f);
 	Version minVersion = std::numeric_limits<Version>::max();
 	for(auto& reply : replies) {
 		if(reply.isReady() && !reply.isError()) {
-			minVersion = std::min(minVersion, reply.get());
+			minVersion = std::min(minVersion, reply.get().version);
 		}
 	}
 	return minVersion;
@@ -429,7 +429,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 	virtual Future<Version> push( Version prevVersion, Version version, Version knownCommittedVersion, Version minKnownCommittedVersion, LogPushData& data, Optional<UID> debugID ) {
 		// FIXME: Randomize request order as in LegacyLogSystem?
 		vector<Future<Void>> quorumResults;
-		vector<Future<Version>> allReplies;
+		vector<Future<TLogCommitReply>> allReplies;
 		int location = 0;
 		for(auto& it : tLogs) {
 			if(it->isLocal && it->logServers.size()) {
@@ -2271,7 +2271,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 	}

 	ACTOR static Future<Void> trackRejoins( UID dbgid, std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers, FutureStream< struct TLogRejoinRequest > rejoinRequests ) {
-		state std::map<UID,ReplyPromise<bool>> lastReply;
+		state std::map<UID, ReplyPromise<TLogRejoinReply>> lastReply;

 		try {
 			loop {
@@ -2287,7 +2287,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 					TraceEvent("TLogJoinedMe", dbgid).detail("TLog", req.myInterface.id()).detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString());
 					if( !logServers[pos]->get().present() || req.myInterface.commit.getEndpoint() != logServers[pos]->get().interf().commit.getEndpoint())
 						logServers[pos]->setUnconditional( OptionalInterface<TLogInterface>(req.myInterface) );
-					lastReply[req.myInterface.id()].send(false);
+					lastReply[req.myInterface.id()].send(TLogRejoinReply{ false });
 					lastReply[req.myInterface.id()] = req.reply;
 				}
 				else {
@@ -2296,8 +2296,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 				}
 			}
 		} catch (...) {
-			for( auto it = lastReply.begin(); it != lastReply.end(); ++it)
-				it->second.send(true);
+			for (auto it = lastReply.begin(); it != lastReply.end(); ++it) it->second.send(TLogRejoinReply{ true });
 			throw;
 		}
 	}
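`minVersionWhenReady` now digs the version out of each `TLogCommitReply`. Its reduction is easy to state outside of flow: once the quorum future fires, take the minimum version among the replies that have already completed successfully and ignore the rest. A standalone sketch under that reading, with futures modeled as optionals:

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <optional>
    #include <vector>

    using Version = int64_t;

    // Models minVersionWhenReady's reduction: std::nullopt stands in for a
    // reply that is not ready or failed (the real actor checks
    // isReady() && !isError() on each Future<TLogCommitReply>).
    Version minReadyVersion(const std::vector<std::optional<Version>>& replies) {
        Version minVersion = std::numeric_limits<Version>::max();
        for (const auto& reply : replies) {
            if (reply) {
                minVersion = std::min(minVersion, *reply);
            }
        }
        return minVersion;
    }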
@@ -31,12 +31,22 @@
 #include "fdbrpc/PerfMetric.h"
 #include "fdbclient/NativeAPI.actor.h"
 #include "flow/actorcompiler.h" // has to be last include
+struct CheckReply {
+	constexpr static FileIdentifier file_identifier = 11;
+
+	bool value = false;
+
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, value);
+	}
+};

 struct WorkloadInterface {
 	constexpr static FileIdentifier file_identifier = 4454551;
 	RequestStream<ReplyPromise<Void>> setup;
 	RequestStream<ReplyPromise<Void>> start;
-	RequestStream<ReplyPromise<bool>> check;
+	RequestStream<ReplyPromise<CheckReply>> check;
 	RequestStream<ReplyPromise< std::vector<PerfMetric> > > metrics;
 	RequestStream<ReplyPromise<Void>> stop;

@@ -70,7 +80,7 @@ struct WorkloadRequest

 	VectorRef< VectorRef<KeyValueRef> > options;

-	int clientId; // the "id" of the client recieving the request (0 indexed)
+	int clientId; // the "id" of the client receiving the request (0 indexed)
 	int clientCount; // the total number of test clients participating in the workload
 	ReplyPromise< struct WorkloadInterface > reply;

@@ -574,6 +574,9 @@ static void printUsage( const char *name, bool devhelp ) {
 	       "                 Delete the oldest log file when the total size of all log\n"
 	       "                 files exceeds SIZE bytes. If set to 0, old log files will not\n"
 	       "                 be deleted. The default value is 100MiB.\n");
+	printf("  --loggroup LOG_GROUP\n"
+	       "                 Sets the LogGroup field with the specified value for all\n"
+	       "                 events in the trace output (defaults to `default').\n");
 	printf("  --trace_format FORMAT\n"
 	       "                 Select the format of the log files. xml (the default) and json\n"
 	       "                 are supported.\n");
@@ -1018,7 +1018,7 @@ ACTOR Future<Void> resolutionBalancing(Reference<MasterData> self) {
 		wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics));
 		while(self->resolverChanges.get().size())
 			wait(self->resolverChanges.onChange());
-		state std::vector<Future<int64_t>> futures;
+		state std::vector<Future<ResolutionMetricsReply>> futures;
 		for (auto& p : self->resolvers)
 			futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskPriority::ResolutionMetrics)));
 		wait( waitForAll(futures) );
@@ -1026,8 +1026,8 @@ ACTOR Future<Void> resolutionBalancing(Reference<MasterData> self) {

 		int64_t total = 0;
 		for (int i = 0; i < futures.size(); i++) {
-			total += futures[i].get();
-			metrics.insert(std::make_pair(futures[i].get(), i), NoMetric());
+			total += futures[i].get().value;
+			metrics.insert(std::make_pair(futures[i].get().value, i), NoMetric());
 			//TraceEvent("ResolverMetric").detail("I", i).detail("Metric", futures[i].get());
 		}
 		if( metrics.lastItem()->first - metrics.begin()->first > SERVER_KNOBS->MIN_BALANCE_DIFFERENCE ) {
@@ -934,7 +934,7 @@ ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req )
 				g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());

 			if( reply.value != req.value ) {
-				req.reply.send( latest );
+				req.reply.send(WatchValueReply{ latest });
 				return Void();
 			}

@@ -1012,7 +1012,7 @@ ACTOR Future<Void> getShardState_impl( StorageServer* data, GetShardStateRequest
 	}

 	if( !onChange.size() ) {
-		req.reply.send(std::make_pair(data->version.get(), data->durableVersion.get()));
+		req.reply.send(GetShardStateReply{ data->version.get(), data->durableVersion.get() });
 		return Void();
 	}

@@ -1936,9 +1936,18 @@ void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const&
 		ASSERT(false);  // Unknown mutation type in splitMutations
 }

+ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
+	state double startTime = now();
+	loop {
+		wait(delay(600));
+		TraceEvent(SevWarnAlways, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());
+	}
+}
+
 ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
 	state TraceInterval interval("FetchKeys");
 	state KeyRange keys = shard->keys;
+	state Future<Void> warningLogger = logFetchKeysWarning(shard);
 	state double startt = now();
 	state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;

@@ -3313,7 +3322,7 @@ ACTOR Future<Void> waitMetrics( StorageServerMetrics* self, WaitMetricsRequest r
 			when( StorageMetrics c = waitNext( change.getFuture() ) ) {
 				metrics += c;

-				// SOMEDAY: validation! The changes here are possibly partial changes (we recieve multiple messages per
+				// SOMEDAY: validation! The changes here are possibly partial changes (we receive multiple messages per
 				// update to our requested range). This means that the validation would have to occur after all
 				// the messages for one clear or set have been dispatched.

@@ -3501,7 +3510,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
 			when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) {
 				// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
 				if( req.debugID.present() )
-					g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.recieved"); //.detail("TaskID", g_network->getCurrentTask());
+					g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.received"); //.detail("TaskID", g_network->getCurrentTask());

 				if (SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(req.key))
 					req.reply.send(GetValueReply());
@@ -3524,7 +3533,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
 			when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) {
 				if (req.mode == GetShardStateRequest::NO_WAIT ) {
 					if( self->isReadable( req.keys ) )
-						req.reply.send(std::make_pair(self->version.get(),self->durableVersion.get()));
+						req.reply.send(GetShardStateReply{ self->version.get(), self->durableVersion.get() });
 					else
 						req.reply.sendError(wrong_shard_server());
 				} else {
@@ -3534,7 +3543,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
 			when (StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
 				getQueuingMetrics(self, req);
 			}
-			when( ReplyPromise<Version> reply = waitNext(ssi.getVersion.getFuture()) ) {
+			when(ReplyPromise<VersionReply> reply = waitNext(ssi.getVersion.getFuture())) {
 				reply.send( self->version.get() );
 			}
 			when( ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture()) ) {
@@ -404,10 +404,10 @@ ACTOR Future<Void> runWorkloadAsync( Database cx, WorkloadInterface workIface, T
 	state unique_ptr<TestWorkload> delw(workload);
 	state Optional<ErrorOr<Void>> setupResult;
 	state Optional<ErrorOr<Void>> startResult;
-	state Optional<ErrorOr<bool>> checkResult;
+	state Optional<ErrorOr<CheckReply>> checkResult;
 	state ReplyPromise<Void> setupReq;
 	state ReplyPromise<Void> startReq;
-	state ReplyPromise<bool> checkReq;
+	state ReplyPromise<CheckReply> checkReq;

 	TraceEvent("TestBeginAsync", workIface.id()).detail("Workload", workload->description()).detail("DatabasePingDelay", databasePingDelay);

@@ -452,12 +452,12 @@ ACTOR Future<Void> runWorkloadAsync( Database cx, WorkloadInterface workIface, T
 				}
 				sendResult( startReq, startResult );
 			}
-			when( ReplyPromise<bool> req = waitNext( workIface.check.getFuture() ) ) {
+			when(ReplyPromise<CheckReply> req = waitNext(workIface.check.getFuture())) {
 				checkReq = req;
 				if (!checkResult.present()) {
 					try {
 						bool check = wait( timeoutError( workload->check(cx), workload->getCheckTimeout() ) );
-						checkResult = (!startResult.present() || !startResult.get().isError()) && check;
+						checkResult = CheckReply{ (!startResult.present() || !startResult.get().isError()) && check };
 					} catch (Error& e) {
 						checkResult = operation_failed(); // was: checkResult = false;
 						if( e.code() == error_code_please_reboot || e.code() == error_code_please_reboot_delete) throw;
@@ -693,16 +693,16 @@ ACTOR Future<DistributedTestResults> runWorkload( Database cx, std::vector< Test
 			wait( delay(3.0) );
 		}

-		state std::vector< Future<ErrorOr<bool>> > checks;
+		state std::vector<Future<ErrorOr<CheckReply>>> checks;
 		TraceEvent("CheckingResults");
 		printf("checking test (%s)...\n", printable(spec.title).c_str());
 		for(int i= 0; i < workloads.size(); i++)
-			checks.push_back( workloads[i].check.template getReplyUnlessFailedFor<bool>(waitForFailureTime, 0) );
+			checks.push_back(workloads[i].check.template getReplyUnlessFailedFor<CheckReply>(waitForFailureTime, 0));
 		wait( waitForAll( checks ) );
 		throwIfError(checks, "CheckFailedForWorkload" + printable(spec.title));

 		for(int i = 0; i < checks.size(); i++) {
-			if(checks[i].get().get())
+			if (checks[i].get().get().value)
 				success++;
 			else
 				failure++;
@@ -658,8 +658,7 @@ void endRole(const Role &role, UID id, std::string reason, bool ok, Error e) {
 ACTOR Future<Void> workerSnapCreate(WorkerSnapRequest snapReq, StringRef snapFolder) {
 	state ExecCmdValueString snapArg(snapReq.snapPayload);
 	try {
-		Standalone<StringRef> role = LiteralStringRef("role=").withSuffix(snapReq.role);
-		int err = wait(execHelper(&snapArg, snapFolder.toString(), role.toString()));
+		int err = wait(execHelper(&snapArg, snapReq.snapUID, snapFolder.toString(), snapReq.role.toString()));
 		std::string uidStr = snapReq.snapUID.toString();
 		TraceEvent("ExecTraceWorker")
 			.detail("Uid", uidStr)
@@ -819,10 +818,11 @@ ACTOR Future<Void> workerServer(
 		DUMPTOKEN(recruited.traceBatchDumpRequest);
 	}

+	state std::vector<Future<Void>> recoveries;
+
 	try {
 		std::vector<DiskStore> stores = getDiskStores( folder );
 		bool validateDataFiles = deleteFile(joinPath(folder, validationFilename));
-		std::vector<Future<Void>> recoveries;
 		for( int f = 0; f < stores.size(); f++ ) {
 			DiskStore s = stores[f];
 			// FIXME: Error handling
@@ -1239,6 +1239,8 @@ ACTOR Future<Void> workerServer(
 			when( wait( handleErrors ) ) {}
 		}
 	} catch (Error& err) {
+		// Make sure actors are cancelled before "recovery" promises are destructed.
+		for (auto f : recoveries) f.cancel();
 		state Error e = err;
 		bool ok = e.code() == error_code_please_reboot || e.code() == error_code_actor_cancelled || e.code() == error_code_please_reboot_delete;

@@ -1389,8 +1391,10 @@ ACTOR Future<Void> fdbd(
 	int64_t memoryProfileThreshold,
 	std::string whitelistBinPaths)
 {
-	try {
+	state vector<Future<Void>> actors;
+	state Promise<Void> recoveredDiskFiles;

+	try {
 		ServerCoordinators coordinators( connFile );
 		if (g_network->isSimulated()) {
 			whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
 		}
@@ -1398,10 +1402,9 @@ ACTOR Future<Void> fdbd(
 		TraceEvent("StartingFDBD").detail("ZoneID", localities.zoneId()).detail("MachineId", localities.machineId()).detail("DiskPath", dataFolder).detail("CoordPath", coordFolder).detail("WhiteListBinPath", whitelistBinPaths);

 		// SOMEDAY: start the services on the machine in a staggered fashion in simulation?
-		state vector<Future<Void>> v;
 		// Endpoints should be registered first before any process trying to connect to it. So coordinationServer actor should be the first one executed before any other.
 		if ( coordFolder.size() )
-			v.push_back( fileNotFoundToNever( coordinationServer( coordFolder ) ) ); //SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up their files
+			actors.push_back( fileNotFoundToNever( coordinationServer( coordFolder ) ) ); //SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up their files

 		state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder));
 		localities.set(LocalityData::keyProcessId, processIDUid.toString());
@@ -1411,19 +1414,21 @@ ACTOR Future<Void> fdbd(
 		Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> cc(new AsyncVar<Optional<ClusterControllerFullInterface>>);
 		Reference<AsyncVar<Optional<ClusterInterface>>> ci(new AsyncVar<Optional<ClusterInterface>>);
 		Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo(new AsyncVar<ClusterControllerPriorityInfo>(getCCPriorityInfo(fitnessFilePath, processClass)));
-		Promise<Void> recoveredDiskFiles;

-		v.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo), "MonitorAndWriteCCPriorityInfo"));
-		v.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc , asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities ), "ClusterController") );
-		v.push_back( reportErrors(extractClusterInterface( cc, ci ), "ExtractClusterInterface") );
-		v.push_back( reportErrors(failureMonitorClient( ci, true ), "FailureMonitorClient") );
-		v.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, asyncPriorityInfo, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix, recoveredDiskFiles, memoryProfileThreshold, coordFolder, whitelistBinPaths), "WorkerServer", UID(), &normalWorkerErrors()) );
+		actors.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo), "MonitorAndWriteCCPriorityInfo"));
+		actors.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc , asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities ), "ClusterController") );
+		actors.push_back( reportErrors(extractClusterInterface( cc, ci ), "ExtractClusterInterface") );
+		actors.push_back( reportErrors(failureMonitorClient( ci, true ), "FailureMonitorClient") );
+		actors.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, asyncPriorityInfo, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix, recoveredDiskFiles, memoryProfileThreshold, coordFolder, whitelistBinPaths), "WorkerServer", UID(), &normalWorkerErrors()) );
 		state Future<Void> firstConnect = reportErrors( printOnFirstConnected(ci), "ClusterFirstConnectedError" );

-		wait( quorum(v,1) );
+		wait( quorum(actors,1) );
 		ASSERT(false); // None of these actors should terminate normally
 		throw internal_error();
 	} catch (Error& e) {
+		// Make sure actors are cancelled before recoveredDiskFiles is destructed.
+		// Otherwise, these actors may get a broken promise error.
+		for (auto f : actors) f.cancel();
 		Error err = checkIOTimeout(e);
 		throw err;
 	}
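Both catch blocks added here (`workerServer` and `fdbd`) cancel their child actors before locals such as `recoveredDiskFiles` are destructed, so waiting children observe cancellation rather than `broken_promise`. A sketch of the hazard in flow's actor syntax; this is illustrative only, not the actual fdbd code, and it requires the actor compiler (plus a hypothetical `childSketch` helper) to build:

    // Illustrative only (flow ACTOR syntax; childSketch is a hypothetical child).
    ACTOR Future<Void> parentSketch() {
        state Promise<Void> recovered;     // destroyed when this frame unwinds
        state vector<Future<Void>> actors; // children hold recovered.getFuture()
        try {
            actors.push_back(childSketch(recovered.getFuture()));
            wait(quorum(actors, 1));
            throw internal_error();
        } catch (Error& e) {
            // Cancel first: otherwise `recovered` dies while children still
            // wait on it, and they see broken_promise instead of cancellation.
            for (auto f : actors) f.cancel();
            throw;
        }
    }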
@@ -199,7 +199,8 @@ struct MachineAttritionWorkload : TestWorkload
 			// }
 			} else if (BUGGIFY_WITH_PROB(0.005)) {
 				TEST(true); // Disable DD for all storage server failures
-				self->ignoreSSFailures = ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5);
+				self->ignoreSSFailures =
+				    uncancellable(ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5));
 			}

 			TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
@@ -211,7 +211,7 @@ public: // workload functions
 				wait(status);
 				break;
 			} catch (Error& e) {
-				if (e.code() == error_code_txn_exec_log_anti_quorum) {
+				if (e.code() == error_code_snap_log_anti_quorum_unsupported) {
 					snapFailed = true;
 					break;
 				}
@@ -298,12 +298,12 @@ public: // workload functions
 				wait(status);
 				break;
 			} catch (Error& e) {
-				if (e.code() == error_code_cluster_not_fully_recovered ||
-				    e.code() == error_code_txn_exec_log_anti_quorum) {
+				if (e.code() == error_code_snap_not_fully_recovered_unsupported ||
+				    e.code() == error_code_snap_log_anti_quorum_unsupported) {
 					snapFailed = true;
 					break;
 				}
-				if (e.code() == error_code_transaction_not_permitted) {
+				if (e.code() == error_code_snap_path_not_whitelisted) {
 					testedFailure = true;
 					break;
 				}
@@ -37,7 +37,8 @@ class ASIOReactor
 public:
 	explicit ASIOReactor(Net2*);

-	void sleepAndReact(double timeout);
+	void sleep(double timeout);
+	void react();

 	void wake();

@@ -72,68 +72,3 @@ template <class T, uint32_t B>
 struct ComposedIdentifierExternal<T, B, true> {
 	static constexpr FileIdentifier value = ComposedIdentifier<T, B>::file_identifier;
 };
-
-template <>
-struct FileIdentifierFor<int> {
-	constexpr static FileIdentifier value = 1;
-};
-
-template <>
-struct FileIdentifierFor<unsigned> {
-	constexpr static FileIdentifier value = 2;
-};
-
-template <>
-struct FileIdentifierFor<long> {
-	constexpr static FileIdentifier value = 3;
-};
-
-template <>
-struct FileIdentifierFor<unsigned long> {
-	constexpr static FileIdentifier value = 4;
-};
-
-template <>
-struct FileIdentifierFor<long long> {
-	constexpr static FileIdentifier value = 5;
-};
-
-template <>
-struct FileIdentifierFor<unsigned long long> {
-	constexpr static FileIdentifier value = 6;
-};
-
-template <>
-struct FileIdentifierFor<short> {
-	constexpr static FileIdentifier value = 7;
-};
-
-template <>
-struct FileIdentifierFor<unsigned short> {
-	constexpr static FileIdentifier value = 8;
-};
-
-template <>
-struct FileIdentifierFor<signed char> {
-	constexpr static FileIdentifier value = 9;
-};
-
-template <>
-struct FileIdentifierFor<unsigned char> {
-	constexpr static FileIdentifier value = 10;
-};
-
-template <>
-struct FileIdentifierFor<bool> {
-	constexpr static FileIdentifier value = 11;
-};
-
-template <>
-struct FileIdentifierFor<float> {
-	constexpr static FileIdentifier value = 7266212;
-};
-
-template <>
-struct FileIdentifierFor<double> {
-	constexpr static FileIdentifier value = 9348150;
-};
@@ -150,7 +150,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
 	init( METRIC_LIMIT_RESPONSE_FACTOR, 10 ); // The additional queue size at which to disable logging of another level (higher == less restrictive)

 	//Load Balancing
-	init( LOAD_BALANCE_ZONE_ID_LOCALITY_ENABLED, 1 );
+	init( LOAD_BALANCE_ZONE_ID_LOCALITY_ENABLED, 0 );
 	init( LOAD_BALANCE_DC_ID_LOCALITY_ENABLED, 1 );
 	init( LOAD_BALANCE_MAX_BACKOFF, 5.0 );
 	init( LOAD_BALANCE_START_BACKOFF, 0.01 );
@@ -172,6 +172,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
 	init( FUTURE_VERSION_INITIAL_BACKOFF, 1.0 );
 	init( FUTURE_VERSION_MAX_BACKOFF, 8.0 );
 	init( FUTURE_VERSION_BACKOFF_GROWTH, 2.0 );
+	init( LOAD_BALANCE_MAX_BAD_OPTIONS, 1 ); //should be the same as MAX_MACHINES_FALLING_BEHIND
+	init( LOAD_BALANCE_PENALTY_IS_BAD, true );
 }

 static std::string toLower( std::string const& name ) {
|
|||
double FUTURE_VERSION_INITIAL_BACKOFF;
|
||||
double FUTURE_VERSION_MAX_BACKOFF;
|
||||
double FUTURE_VERSION_BACKOFF_GROWTH;
|
||||
int LOAD_BALANCE_MAX_BAD_OPTIONS;
|
||||
bool LOAD_BALANCE_PENALTY_IS_BAD;
|
||||
|
||||
FlowKnobs(bool randomize = false, bool isSimulated = false);
|
||||
};
|
||||
|
|
|
@@ -209,6 +209,8 @@ public:
 	Int64MetricHandle countASIOEvents;
 	Int64MetricHandle countSlowTaskSignals;
 	Int64MetricHandle priorityMetric;
+	DoubleMetricHandle countLaunchTime;
+	DoubleMetricHandle countReactTime;
 	BoolMetricHandle awakeMetric;

 	EventMetricHandle<SlowTask> slowTaskMetric;
@@ -545,6 +547,8 @@ void Net2::initMetrics() {
 	priorityMetric.init(LiteralStringRef("Net2.Priority"));
 	awakeMetric.init(LiteralStringRef("Net2.Awake"));
 	slowTaskMetric.init(LiteralStringRef("Net2.SlowTask"));
+	countLaunchTime.init(LiteralStringRef("Net2.CountLaunchTime"));
+	countReactTime.init(LiteralStringRef("Net2.CountReactTime"));
 }

 void Net2::run() {
@@ -580,7 +584,9 @@ void Net2::run() {
 			taskBegin = nnow;
 			trackMinPriority(TaskPriority::RunCycleFunction, taskBegin);
 			runFunc();
-			checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskPriority::RunCycleFunction);
+			double taskEnd = timer_monotonic();
+			countLaunchTime += taskEnd - taskBegin;
+			checkForSlowTask(tsc_begin, __rdtsc(), taskEnd - taskBegin, TaskPriority::RunCycleFunction);
 		}

 		double sleepTime = 0;
@@ -596,18 +602,26 @@ void Net2::run() {
 			if (!timers.empty()) {
 				sleepTime = timers.top().at - sleepStart; // + 500e-6?
 			}
+			if (sleepTime > 0) {
+				trackMinPriority(TaskPriority::Zero, sleepStart);
+				awakeMetric = false;
+				priorityMetric = 0;
+				reactor.sleep(sleepTime);
+				awakeMetric = true;
+			}
 		}

-		awakeMetric = false;
-		if( sleepTime > 0 )
-			priorityMetric = 0;
-		reactor.sleepAndReact(sleepTime);
-		awakeMetric = true;
+		tsc_begin = __rdtsc();
+		taskBegin = timer_monotonic();
+		trackMinPriority(TaskPriority::ASIOReactor, taskBegin);
+		reactor.react();

 		updateNow();
 		double now = this->currentTime;

+		countReactTime += now - taskBegin;
 		checkForSlowTask(tsc_begin, __rdtsc(), now - taskBegin, TaskPriority::ASIOReactor);

 		if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE)
 			TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow);
@@ -988,7 +1002,7 @@ ASIOReactor::ASIOReactor(Net2* net)
 #endif
 }

-void ASIOReactor::sleepAndReact(double sleepTime) {
+void ASIOReactor::sleep(double sleepTime) {
 	if (sleepTime > FLOW_KNOBS->BUSY_WAIT_THRESHOLD) {
 		if (FLOW_KNOBS->REACTOR_FLAGS & 4) {
 #ifdef __linux
@@ -1015,6 +1029,9 @@ void ASIOReactor::sleepAndReact(double sleepTime) {
 		if (!(FLOW_KNOBS->REACTOR_FLAGS & 8))
 			threadYield();
 	}
+}
+
+void ASIOReactor::react() {
 	while (ios.poll_one()) ++network->countASIOEvents; // Make this a task?
 }

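Splitting `sleepAndReact` into `sleep` and `react` is what makes the new `Net2.CountLaunchTime` / `Net2.CountReactTime` metrics possible: each phase of the run loop is now timed on its own, and `customSystemMonitor` reports the deltas. A standalone sketch of that two-phase accounting, assuming only a monotonic clock:

    #include <chrono>

    // Two-phase loop accounting in miniature: time task launch and reactor
    // polling separately, mirroring countLaunchTime/countReactTime in Net2::run().
    static double monotonicSeconds() {
        using namespace std::chrono;
        return duration<double>(steady_clock::now().time_since_epoch()).count();
    }

    struct LoopMetrics {
        double launchTime = 0; // accumulated time running queued tasks
        double reactTime = 0;  // accumulated time polling for I/O events
    };

    template <class Launch, class React>
    void runLoopOnce(LoopMetrics& m, Launch launch, React react) {
        double t0 = monotonicSeconds();
        launch(); // run ready tasks
        double t1 = monotonicSeconds();
        m.launchTime += t1 - t0;

        react(); // poll I/O events
        double t2 = monotonicSeconds();
        m.reactTime += t2 - t1;
    }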
@@ -137,7 +137,9 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
 			.detail("WriteProbes", netData.countWriteProbes - statState->networkState.countWriteProbes)
 			.detail("PacketsRead", netData.countPacketsReceived - statState->networkState.countPacketsReceived)
 			.detail("PacketsGenerated", netData.countPacketsGenerated - statState->networkState.countPacketsGenerated)
-			.detail("WouldBlock", netData.countWouldBlock - statState->networkState.countWouldBlock);
+			.detail("WouldBlock", netData.countWouldBlock - statState->networkState.countWouldBlock)
+			.detail("LaunchTime", netData.countLaunchTime - statState->networkState.countLaunchTime)
+			.detail("ReactTime", netData.countReactTime - statState->networkState.countReactTime);

 		for (int i = 0; i<NetworkMetrics::SLOW_EVENT_BINS; i++) {
 			if (int c = g_network->networkMetrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) {
@@ -80,53 +80,49 @@ struct NetworkData
 	int64_t countConnEstablished;
 	int64_t countConnClosedWithError;
 	int64_t countConnClosedWithoutError;
+	double countLaunchTime;
+	double countReactTime;

 	void init() {
-		auto getValue = [] (StringRef name) -> int64_t {
-			Reference<Int64Metric> r = Int64Metric::getOrCreateInstance(name);
-			int64_t v = 0;
-			if(r)
-				v = r->getValue();
-			return v;
-		};
-
-		bytesSent = getValue(LiteralStringRef("Net2.BytesSent"));
-		countPacketsReceived = getValue(LiteralStringRef("Net2.CountPacketsReceived"));
-		countPacketsGenerated = getValue(LiteralStringRef("Net2.CountPacketsGenerated"));
-		bytesReceived = getValue(LiteralStringRef("Net2.BytesReceived"));
-		countWriteProbes = getValue(LiteralStringRef("Net2.CountWriteProbes"));
-		countReadProbes = getValue(LiteralStringRef("Net2.CountReadProbes"));
-		countReads = getValue(LiteralStringRef("Net2.CountReads"));
-		countWouldBlock = getValue(LiteralStringRef("Net2.CountWouldBlock"));
-		countWrites = getValue(LiteralStringRef("Net2.CountWrites"));
-		countRunLoop = getValue(LiteralStringRef("Net2.CountRunLoop"));
-		countCantSleep = getValue(LiteralStringRef("Net2.CountCantSleep"));
-		countWontSleep = getValue(LiteralStringRef("Net2.CountWontSleep"));
-		countTimers = getValue(LiteralStringRef("Net2.CountTimers"));
-		countTasks = getValue(LiteralStringRef("Net2.CountTasks"));
-		countYields = getValue(LiteralStringRef("Net2.CountYields"));
-		countYieldBigStack = getValue(LiteralStringRef("Net2.CountYieldBigStack"));
-		countYieldCalls = getValue(LiteralStringRef("Net2.CountYieldCalls"));
-		countASIOEvents = getValue(LiteralStringRef("Net2.CountASIOEvents"));
-		countYieldCallsTrue = getValue(LiteralStringRef("Net2.CountYieldCallsTrue"));
-		countSlowTaskSignals = getValue(LiteralStringRef("Net2.CountSlowTaskSignals"));
-		countConnEstablished = getValue(LiteralStringRef("Net2.CountConnEstablished"));
-		countConnClosedWithError = getValue(LiteralStringRef("Net2.CountConnClosedWithError"));
-		countConnClosedWithoutError = getValue(LiteralStringRef("Net2.CountConnClosedWithoutError"));
-		countFileLogicalWrites = getValue(LiteralStringRef("AsyncFile.CountLogicalWrites"));
-		countFileLogicalReads = getValue(LiteralStringRef("AsyncFile.CountLogicalReads"));
-		countAIOSubmit = getValue(LiteralStringRef("AsyncFile.CountAIOSubmit"));
-		countAIOCollect = getValue(LiteralStringRef("AsyncFile.CountAIOCollect"));
-		countFileCacheWrites = getValue(LiteralStringRef("AsyncFile.CountCacheWrites"));
-		countFileCacheReads = getValue(LiteralStringRef("AsyncFile.CountCacheReads"));
-		countFileCacheWritesBlocked = getValue(LiteralStringRef("AsyncFile.CountCacheWritesBlocked"));
-		countFileCacheReadsBlocked = getValue(LiteralStringRef("AsyncFile.CountCacheReadsBlocked"));
-		countFileCachePageReadsMerged = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsMerged"));
-		countFileCacheFinds = getValue(LiteralStringRef("AsyncFile.CountCacheFinds"));
-		countFileCacheReadBytes = getValue(LiteralStringRef("AsyncFile.CountCacheReadBytes"));
-		countFilePageCacheHits = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsHit"));
-		countFilePageCacheMisses = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsMissed"));
-		countFilePageCacheEvictions = getValue(LiteralStringRef("EvictablePageCache.CacheEvictions"));
+		bytesSent = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.BytesSent"));
+		countPacketsReceived = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountPacketsReceived"));
+		countPacketsGenerated = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountPacketsGenerated"));
+		bytesReceived = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.BytesReceived"));
+		countWriteProbes = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountWriteProbes"));
+		countReadProbes = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountReadProbes"));
+		countReads = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountReads"));
+		countWouldBlock = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountWouldBlock"));
+		countWrites = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountWrites"));
+		countRunLoop = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountRunLoop"));
+		countCantSleep = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountCantSleep"));
+		countWontSleep = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountWontSleep"));
+		countTimers = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountTimers"));
+		countTasks = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountTasks"));
+		countYields = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYields"));
+		countYieldBigStack = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldBigStack"));
+		countYieldCalls = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldCalls"));
+		countASIOEvents = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountASIOEvents"));
+		countYieldCallsTrue = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldCallsTrue"));
+		countSlowTaskSignals = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountSlowTaskSignals"));
+		countConnEstablished = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnEstablished"));
+		countConnClosedWithError = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnClosedWithError"));
+		countConnClosedWithoutError = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnClosedWithoutError"));
+		countLaunchTime = DoubleMetric::getValueOrDefault(LiteralStringRef("Net2.CountLaunchTime"));
+		countReactTime = DoubleMetric::getValueOrDefault(LiteralStringRef("Net2.CountReactTime"));
+		countFileLogicalWrites = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountLogicalWrites"));
+		countFileLogicalReads = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountLogicalReads"));
+		countAIOSubmit = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountAIOSubmit"));
+		countAIOCollect = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountAIOCollect"));
+		countFileCacheWrites = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCacheWrites"));
+		countFileCacheReads = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCacheReads"));
+		countFileCacheWritesBlocked = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCacheWritesBlocked"));
+		countFileCacheReadsBlocked = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCacheReadsBlocked"));
+		countFileCachePageReadsMerged = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCachePageReadsMerged"));
+		countFileCacheFinds = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCacheFinds"));
+		countFileCacheReadBytes = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCacheReadBytes"));
+		countFilePageCacheHits = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCachePageReadsHit"));
+		countFilePageCacheMisses = Int64Metric::getValueOrDefault(LiteralStringRef("AsyncFile.CountCachePageReadsMissed"));
+		countFilePageCacheEvictions = Int64Metric::getValueOrDefault(LiteralStringRef("EvictablePageCache.CacheEvictions"));
 	}
 };

@@ -269,6 +269,14 @@ struct MetricUtil
 		return m;
 	}

+	static ValueType getValueOrDefault(StringRef const& name, StringRef const& id = StringRef(), ValueType defaultValue = ValueType()) {
+		Reference<T> r = getOrCreateInstance(name, id);
+		if(r) {
+			return r->getValue();
+		}
+		return defaultValue;
+	}
+
 	// Lookup the T metric by name and return its value (or nullptr if it doesn't exist)
 	static T * lookupMetric(MetricNameRef const &name) {
 		auto it = T::metricMap().find(name);
@@ -1319,6 +1327,7 @@ public:
 };

 typedef ContinuousMetric<int64_t> Int64Metric;
+typedef ContinuousMetric<double> DoubleMetric;
 typedef Int64Metric VersionMetric;
 typedef ContinuousMetric<bool> BoolMetric;
 typedef ContinuousMetric<Standalone<StringRef>> StringMetric;
@@ -1406,6 +1415,7 @@ typedef MetricHandle<Int64Metric> Int64MetricHandle;
 typedef MetricHandle<VersionMetric> VersionMetricHandle;
 typedef MetricHandle<BoolMetric> BoolMetricHandle;
 typedef MetricHandle<StringMetric> StringMetricHandle;
+typedef MetricHandle<DoubleMetric> DoubleMetricHandle;

 template <typename E>
 using EventMetricHandle = MetricHandle<EventMetric<E>>;
@@ -23,6 +23,7 @@

 const StringRef BaseEventMetric::metricType = LiteralStringRef("Event");
 template<> const StringRef Int64Metric::metricType = LiteralStringRef("Int64");
+template<> const StringRef DoubleMetric::metricType = LiteralStringRef("Double");
 template<> const StringRef BoolMetric::metricType = LiteralStringRef("Bool");
 template<> const StringRef StringMetric::metricType = LiteralStringRef("String");

@@ -65,9 +65,6 @@ ERROR( lookup_failed, 1041, "DNS lookup failed" )
 ERROR( proxy_memory_limit_exceeded, 1042, "Proxy commit memory limit exceeded" )
 ERROR( shutdown_in_progress, 1043, "Operation no longer supported due to shutdown" )
 ERROR( serialization_failed, 1044, "Failed to deserialize an object" )
-ERROR( transaction_not_permitted, 1045, "Operation not permitted")
-ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered")
-ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured")
 ERROR( connection_unreferenced, 1048, "No peer references for connection" )
 ERROR( connection_idle, 1049, "Connection closed after idle timeout" )
 ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" )
@@ -206,6 +203,17 @@ ERROR( key_not_found, 2400, "Expected key is missing")
 ERROR( json_malformed, 2401, "JSON string was malformed")
 ERROR( json_eof_expected, 2402, "JSON string did not terminate where expected")

+// 2500 - disk snapshot based backup errors
+ERROR( snap_disable_tlog_pop_failed, 2500, "Disk Snapshot error")
+ERROR( snap_storage_failed, 2501, "Failed to snapshot storage nodes")
+ERROR( snap_tlog_failed, 2502, "Failed to snapshot TLog nodes")
+ERROR( snap_coord_failed, 2503, "Failed to snapshot coordinator nodes")
+ERROR( snap_enable_tlog_pop_failed, 2504, "Disk Snapshot error")
+ERROR( snap_path_not_whitelisted, 2505, "Snapshot create binary path not whitelisted")
+ERROR( snap_not_fully_recovered_unsupported, 2506, "Unsupported when the cluster is not fully recovered")
+ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum is configured")
+ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
+
 // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
 ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
 ERROR( internal_error, 4100, "An internal error occurred" )
@@ -1140,12 +1140,19 @@ inline FileIdentifier read_file_identifier(const uint8_t* in) {
 	return result;
 }

+namespace detail {
+template <class T>
+struct YesFileIdentifier {
+	constexpr static FileIdentifier file_identifier = FileIdentifierFor<T>::value;
+};
+struct NoFileIdentifier {};
+}; // namespace detail
+
 // members of unions must be tables in flatbuffers, so you can use this to
 // introduce the indirection only when necessary.
 template <class T>
-struct EnsureTable {
-	static_assert(HasFileIdentifier<T>::value);
-	constexpr static FileIdentifier file_identifier = FileIdentifierFor<T>::value;
+struct EnsureTable
+    : std::conditional_t<HasFileIdentifier<T>::value, detail::YesFileIdentifier<T>, detail::NoFileIdentifier> {
 	EnsureTable() = default;
 	EnsureTable(const T& t) : t(t) {}
 	template <class Archive>
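The old `EnsureTable` refused to compile for a `T` without a file identifier (`static_assert(HasFileIdentifier<T>::value)`); the new one simply omits the `file_identifier` member in that case by choosing its base class with `std::conditional_t`. A standalone sketch of the same conditional-inheritance trick, with a simplified trait standing in for flow's `HasFileIdentifier`/`FileIdentifierFor`:

    #include <cstdint>
    #include <type_traits>

    using FileIdentifier = uint32_t;

    // Simplified stand-in for flow's HasFileIdentifier trait.
    template <class T, class = void>
    struct HasId : std::false_type {};
    template <class T>
    struct HasId<T, std::void_t<decltype(T::file_identifier)>> : std::true_type {};

    template <class T>
    struct YesId {
        constexpr static FileIdentifier file_identifier = T::file_identifier;
    };
    struct NoId {};

    // Inherits the static member only when T provides one, so wrapping an
    // identifier-less type is legal instead of a hard error.
    template <class T>
    struct Wrapper : std::conditional_t<HasId<T>::value, YesId<T>, NoId> {
        T t;
    };

    struct WithId { constexpr static FileIdentifier file_identifier = 42; };
    struct WithoutId {};

    static_assert(Wrapper<WithId>::file_identifier == 42);
    static_assert(!HasId<Wrapper<WithoutId>>::value);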
@@ -249,10 +249,24 @@ void enableBuggify(bool enabled, BuggifyType type) {
 	buggifyActivated[int(type)] = enabled;
 }

+namespace {
+// Simple message for flatbuffers unittests
+struct Int {
+	constexpr static FileIdentifier file_identifier = 12345;
+	uint32_t value;
+	Int() = default;
+	Int(uint32_t value) : value(value) {}
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, value);
+	}
+};
+} // namespace
+
 TEST_CASE("/flow/FlatBuffers/ErrorOr") {
 	{
-		ErrorOr<int> in(worker_removed());
-		ErrorOr<int> out;
+		ErrorOr<Int> in(worker_removed());
+		ErrorOr<Int> out;
 		ObjectWriter writer(Unversioned());
 		writer.serialize(in);
 		Standalone<StringRef> copy = writer.toStringRef();
@@ -262,23 +276,23 @@ TEST_CASE("/flow/FlatBuffers/ErrorOr") {
 		ASSERT(out.getError().code() == in.getError().code());
 	}
 	{
-		ErrorOr<uint32_t> in(deterministicRandom()->randomUInt32());
-		ErrorOr<uint32_t> out;
+		ErrorOr<Int> in(deterministicRandom()->randomUInt32());
+		ErrorOr<Int> out;
 		ObjectWriter writer(Unversioned());
 		writer.serialize(in);
 		Standalone<StringRef> copy = writer.toStringRef();
 		ArenaObjectReader reader(copy.arena(), copy, Unversioned());
 		reader.deserialize(out);
 		ASSERT(!out.isError());
-		ASSERT(out.get() == in.get());
+		ASSERT(out.get().value == in.get().value);
 	}
 	return Void();
 }

 TEST_CASE("/flow/FlatBuffers/Optional") {
 	{
-		Optional<int> in;
-		Optional<int> out;
+		Optional<Int> in;
+		Optional<Int> out;
 		ObjectWriter writer(Unversioned());
 		writer.serialize(in);
 		Standalone<StringRef> copy = writer.toStringRef();
@@ -287,15 +301,15 @@ TEST_CASE("/flow/FlatBuffers/Optional") {
 		ASSERT(!out.present());
 	}
 	{
-		Optional<uint32_t> in(deterministicRandom()->randomUInt32());
-		Optional<uint32_t> out;
+		Optional<Int> in(deterministicRandom()->randomUInt32());
+		Optional<Int> out;
 		ObjectWriter writer(Unversioned());
 		writer.serialize(in);
 		Standalone<StringRef> copy = writer.toStringRef();
 		ArenaObjectReader reader(copy.arena(), copy, Unversioned());
 		reader.deserialize(out);
 		ASSERT(out.present());
-		ASSERT(out.get() == in.get());
+		ASSERT(out.get().value == in.get().value);
 	}
 	return Void();
 }
@@ -32,6 +32,7 @@

 enum class TaskPriority {
 	Max = 1000000,
+	ASIOReactor = 20001,
 	RunCycleFunction = 20000,
 	FlushTrace = 10500,
 	WriteSocket = 10000,
@@ -31,9 +31,15 @@ if [ "$1" = configure ]; then
         fi
     fi

-    # It would be better to use 'systemctl start foundationdb.service'.
-    # Since it does not work on Ubuntu 14.04, use this workaround as of now.
+    # Start the service with systemd if it is available.
+    if pidof systemd > /dev/null; then
+        # Use deb-systemd-invoke if available to respect policy-rc.d.
+        systemctl=$(command -v deb-systemd-invoke || command -v systemctl)
+        systemctl --system daemon-reload > /dev/null || true
+        systemctl start foundationdb.service
+    else
         /etc/init.d/foundationdb start
+    fi

     if [ "$2" = "" ]; then
         update-rc.d foundationdb defaults
@@ -32,7 +32,7 @@

 <Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
     <Product Name='$(var.Title)'
-             Id='{E2FB8839-9C35-4E40-AFB1-7409961781F7}'
+             Id='{7AD1AE5E-FD5B-42F3-A638-A81A963B1CE4}'
              UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
              Version='$(var.Version)'
              Manufacturer='$(var.Manufacturer)'