Merge branch 'release-6.2' into features/toml11-docker

commit 5b57f03eac
@@ -89,3 +89,4 @@ flow/coveragetool/obj
 temp/
 /compile_commands.json
 /.ccls-cache
+.clangd/
@@ -18,7 +18,7 @@
 # limitations under the License.
 cmake_minimum_required(VERSION 3.12)
 project(foundationdb
-  VERSION 6.2.28
+  VERSION 6.2.29
   DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions."
   HOMEPAGE_URL "http://www.foundationdb.org/"
   LANGUAGES C CXX ASM)
@@ -1,17 +1,27 @@
 FROM centos:6
 
+# Clean yum cache, disable default Base repo and enable Vault
+RUN yum clean all &&\
+    sed -i -e 's/gpgcheck=1/enabled=0/g' /etc/yum.repos.d/CentOS-Base.repo &&\
+    sed -i -e 's/enabled=0/enabled=1/g' /etc/yum.repos.d/CentOS-Vault.repo &&\
+    sed -i -n '/6.1/q;p' /etc/yum.repos.d/CentOS-Vault.repo &&\
+    sed -i -e "s/6\.0/$(cut -d\ -f3 /etc/redhat-release)/g" /etc/yum.repos.d/CentOS-Vault.repo &&\
+    yum install -y yum-utils &&\
+    yum-config-manager --enable rhel-server-rhscl-7-rpms &&\
+    yum -y install centos-release-scl-rh epel-release \
+        http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\
+    sed -i -e 's/#baseurl=/baseurl=/g' -e 's/mirror.centos.org/vault.centos.org/g' \
+        -e 's/mirrorlist=/#mirrorlist=/g' /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo &&\
+    yum clean all
+
 # Install dependencies for developer tools, bindings,\
 # documentation, actorcompiler, and packaging tools\
-RUN yum install -y yum-utils &&\
-    yum-config-manager --enable rhel-server-rhscl-7-rpms &&\
-    yum -y install centos-release-scl epel-release \
-        http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\
-    yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \
-        devtoolset-8-gcc-8.3.1 devtoolset-8-gcc-c++-8.3.1 \
-        devtoolset-8-libubsan-devel devtoolset-8-libasan-devel devtoolset-8-valgrind-devel \
-        rh-python36-python-devel rh-ruby24 golang python27 rpm-build \
-        mono-core debbuild python-pip dos2unix valgrind-devel ccache \
-        distcc wget git lz4 lz4-devel lz4-static &&\
+RUN yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \
+        devtoolset-8-gcc-8.3.1 devtoolset-8-gcc-c++-8.3.1 \
+        devtoolset-8-libubsan-devel devtoolset-8-libasan-devel devtoolset-8-valgrind-devel \
+        rh-python36-python-devel rh-ruby24 golang python27 rpm-build \
+        mono-core debbuild python-pip dos2unix valgrind-devel ccache \
+        distcc wget libxslt git lz4 lz4-devel lz4-static &&\
     pip install boto3==1.1.1
 
 USER root
@@ -41,6 +51,8 @@ RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.13.4/cmake-3.1
 
 # install Ninja
 RUN cd /tmp && curl -L https://github.com/ninja-build/ninja/archive/v1.9.0.zip -o ninja.zip &&\
+    echo "8e2e654a418373f10c22e4cc9bdbe9baeca8527ace8d572e0b421e9d9b85b7ef ninja.zip" > /tmp/ninja-sha.txt &&\
+    sha256sum -c /tmp/ninja-sha.txt &&\
     unzip ninja.zip && cd ninja-1.9.0 && scl enable devtoolset-8 -- ./configure.py --bootstrap && cp ninja /usr/bin &&\
    cd .. && rm -rf ninja-1.9.0 ninja.zip
 
@@ -64,13 +76,48 @@ RUN cd /opt/ && curl -L https://github.com/facebook/rocksdb/archive/v6.10.1.tar.
     echo "d573d2f15cdda883714f7e0bc87b814a8d4a53a82edde558f08f940e905541ee rocksdb.tar.gz" > rocksdb-sha.txt &&\
     sha256sum -c rocksdb-sha.txt && tar xf rocksdb.tar.gz && rm -rf rocksdb.tar.gz rocksdb-sha.txt
 
+RUN cd /opt/ && curl -L https://github.com/manticoresoftware/manticoresearch/raw/master/misc/junit/ctest2junit.xsl -o ctest2junit.xsl
+
+# Setting this environment variable switches from OpenSSL to BoringSSL
+#ENV OPENSSL_ROOT_DIR=/opt/boringssl
+
+# install BoringSSL: TODO: They don't seem to have releases(?) I picked today's master SHA.
+RUN cd /opt &&\
+    git clone https://boringssl.googlesource.com/boringssl &&\
+    cd boringssl &&\
+    git checkout e796cc65025982ed1fb9ef41b3f74e8115092816 &&\
+    mkdir build
+
+# ninja doesn't respect CXXFLAGS, and the boringssl CMakeLists doesn't expose an option to define __STDC_FORMAT_MACROS
+# also, enable -fPIC.
+# this is moderately uglier than creating a patchfile, but easier to maintain.
+RUN cd /opt/boringssl &&\
+    for f in crypto/fipsmodule/rand/fork_detect_test.cc \
+             include/openssl/bn.h \
+             ssl/test/bssl_shim.cc ; do \
+        perl -p -i -e 's/#include <inttypes.h>/#define __STDC_FORMAT_MACROS 1\n#include <inttypes.h>/g;' $f ; \
+    done &&\
+    perl -p -i -e 's/-Werror/-Werror -fPIC/' CMakeLists.txt &&\
+    git diff
+
+RUN cd /opt/boringssl/build &&\
+    scl enable devtoolset-8 rh-python36 rh-ruby24 -- cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. &&\
+    scl enable devtoolset-8 rh-python36 rh-ruby24 -- ninja &&\
+    ./ssl/ssl_test &&\
+    mkdir -p ../lib && cp crypto/libcrypto.a ssl/libssl.a ../lib
+
 # Localize time zone
 ARG TIMEZONEINFO=America/Los_Angeles
 RUN rm -f /etc/localtime && ln -s /usr/share/zoneinfo/${TIMEZONEINFO} /etc/localtime
 
-LABEL version=0.1.20
-ENV DOCKER_IMAGEVER=0.1.20
+LABEL version=0.1.22
+ENV DOCKER_IMAGEVER=0.1.22
 ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0
 ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
 ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/g++
 
 ENV CCACHE_NOHASHDIR=true
 ENV CCACHE_UMASK=0000
 ENV CCACHE_SLOPPINESS="file_macro,time_macros,include_file_mtime,include_file_ctime,file_stat_matches"
 
 CMD scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash
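A note on the ``__STDC_FORMAT_MACROS`` patching in the build-image hunk above: on the old glibc headers shipped with centos:6, the ``PRIu64``-family macros in ``<inttypes.h>`` are only visible to C++ when that macro is defined before the header is included, which is exactly what the perl one-liner injects into the three BoringSSL sources. A minimal standalone illustration of the failure mode (my own example, not part of the image)::

    // Without the define, PRIu64 is undeclared in C++ on pre-C++11 glibc headers.
    #define __STDC_FORMAT_MACROS 1
    #include <inttypes.h>
    #include <cstdio>

    int main() {
        uint64_t x = 42;
        std::printf("%" PRIu64 "\n", x);
        return 0;
    }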
@@ -1,4 +1,5 @@
-FROM foundationdb/foundationdb-build:0.1.19
+ARG IMAGE_TAG=0.1.21
+FROM foundationdb/foundationdb-build:${IMAGE_TAG}
 
 USER root
 
@@ -50,8 +51,8 @@ RUN cp -iv /usr/local/bin/clang++ /usr/local/bin/clang++.deref &&\
     ldconfig &&\
     rm -rf /mnt/artifacts
 
-LABEL version=0.11.11
-ENV DOCKER_IMAGEVER=0.11.11
+LABEL version=0.11.13
+ENV DOCKER_IMAGEVER=0.11.13
 
 ENV CLANGCC=/usr/local/bin/clang.de8a65ef
 ENV CLANGCXX=/usr/local/bin/clang++.de8a65ef

@@ -63,8 +64,5 @@ ENV CC=/usr/local/bin/clang.de8a65ef
 ENV CXX=/usr/local/bin/clang++.de8a65ef
 ENV USE_LD=LLD
 ENV USE_LIBCXX=1
-ENV CCACHE_NOHASHDIR=true
-ENV CCACHE_UMASK=0000
-ENV CCACHE_SLOPPINESS="file_macro,time_macros,include_file_mtime,include_file_ctime,file_stat_matches"
 
 CMD scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash
@@ -2,7 +2,7 @@ version: "3"
 
 services:
   common: &common
-    image: foundationdb/foundationdb-build:0.1.20
+    image: foundationdb/foundationdb-build:0.1.22
 
   build-setup: &build-setup
     <<: *common
@@ -233,7 +233,6 @@ else()
       -Wno-unused-function
       -Wno-unused-local-typedef
       -Wno-unused-parameter
-      -Wno-unused-value
     )
   if (USE_CCACHE)
     add_compile_options(
@@ -229,7 +229,7 @@ Furthermore, this version can only be called with a ``Database``, making it impo
 Note that by default, the operation will be retried an infinite number of times and the transaction will never time out. It is therefore recommended that the client choose a default transaction retry limit or timeout value that is suitable for their application. This can be set either at the transaction level using the ``SetRetryLimit`` or ``SetTimeout`` transaction options or at the database level with the ``SetTransactionRetryLimit`` or ``SetTransactionTimeout`` database options. For example, one can set a one minute timeout on each transaction and a default retry limit of 100 by calling::
 
     db.Options().SetTransactionTimeout(60000) // 60,000 ms = 1 minute
-    db.Options().SetRetryLimit(100)
+    db.Options().SetTransactionRetryLimit(100)
 
 Making some sample classes
 --------------------------
@@ -157,7 +157,7 @@ If instead you pass a :class:`Transaction` for the :class:`TransactionContext` p
 Note that by default, the operation will be retried an infinite number of times and the transaction will never time out. It is therefore recommended that the client choose a default transaction retry limit or timeout value that is suitable for their application. This can be set either at the transaction level using the ``setRetryLimit`` or ``setTimeout`` transaction options or at the database level with the ``setTransactionRetryLimit`` or ``setTransactionTimeout`` database options. For example, one can set a one minute timeout on each transaction and a default retry limit of 100 by calling::
 
     db.options().setTransactionTimeout(60000); // 60,000 ms = 1 minute
-    db.options().setRetryLimit(100);
+    db.options().setTransactionRetryLimit(100);
 
 Making some sample classes
 --------------------------
@@ -444,7 +444,7 @@ Here's the code for the scheduling tutorial:
         fdb = FDB.selectAPIVersion(620);
         db = fdb.open();
         db.options().setTransactionTimeout(60000); // 60,000 ms = 1 minute
-        db.options().setRetryLimit(100);
+        db.options().setTransactionRetryLimit(100);
     }
 
     // Generate 1,620 classes like '9:00 chem for dummies'
 
@@ -126,7 +126,7 @@ If instead you pass a :class:`Transaction` for the ``db_or_tr`` parameter, the t
 Note that by default, the operation will be retried an infinite number of times and the transaction will never time out. It is therefore recommended that the client choose a default transaction retry limit or timeout value that is suitable for their application. This can be set either at the transaction level using the ``set_retry_limit`` or ``set_timeout`` transaction options or at the database level with the ``set_transaction_retry_limit`` or ``set_transaction_timeout`` database options. For example, one can set a one minute timeout on each transaction and a default retry limit of 100 by calling::
 
     @db.options.set_transaction_timeout(60000) # 60,000 ms = 1 minute
-    @db.options.set_retry_limit(100)
+    @db.options.set_transaction_retry_limit(100)
 
 Making some sample classes
 --------------------------
@@ -10,38 +10,38 @@ macOS
 
 The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
 
-* `FoundationDB-6.2.27.pkg <https://www.foundationdb.org/downloads/6.2.27/macOS/installers/FoundationDB-6.2.27.pkg>`_
+* `FoundationDB-6.2.28.pkg <https://www.foundationdb.org/downloads/6.2.28/macOS/installers/FoundationDB-6.2.28.pkg>`_
 
 Ubuntu
 ------
 
 The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
 
-* `foundationdb-clients-6.2.27-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.27/ubuntu/installers/foundationdb-clients_6.2.27-1_amd64.deb>`_
-* `foundationdb-server-6.2.27-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.27/ubuntu/installers/foundationdb-server_6.2.27-1_amd64.deb>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.28-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.28/ubuntu/installers/foundationdb-clients_6.2.28-1_amd64.deb>`_
+* `foundationdb-server-6.2.28-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.28/ubuntu/installers/foundationdb-server_6.2.28-1_amd64.deb>`_ (depends on the clients package)
 
 RHEL/CentOS EL6
 ---------------
 
 The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
 
-* `foundationdb-clients-6.2.27-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.27/rhel6/installers/foundationdb-clients-6.2.27-1.el6.x86_64.rpm>`_
-* `foundationdb-server-6.2.27-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.27/rhel6/installers/foundationdb-server-6.2.27-1.el6.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.28-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.28/rhel6/installers/foundationdb-clients-6.2.28-1.el6.x86_64.rpm>`_
+* `foundationdb-server-6.2.28-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.28/rhel6/installers/foundationdb-server-6.2.28-1.el6.x86_64.rpm>`_ (depends on the clients package)
 
 RHEL/CentOS EL7
 ---------------
 
 The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
 
-* `foundationdb-clients-6.2.27-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.27/rhel7/installers/foundationdb-clients-6.2.27-1.el7.x86_64.rpm>`_
-* `foundationdb-server-6.2.27-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.27/rhel7/installers/foundationdb-server-6.2.27-1.el7.x86_64.rpm>`_ (depends on the clients package)
+* `foundationdb-clients-6.2.28-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.28/rhel7/installers/foundationdb-clients-6.2.28-1.el7.x86_64.rpm>`_
+* `foundationdb-server-6.2.28-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.28/rhel7/installers/foundationdb-server-6.2.28-1.el7.x86_64.rpm>`_ (depends on the clients package)
 
 Windows
 -------
 
 The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
 
-* `foundationdb-6.2.27-x64.msi <https://www.foundationdb.org/downloads/6.2.27/windows/installers/foundationdb-6.2.27-x64.msi>`_
+* `foundationdb-6.2.28-x64.msi <https://www.foundationdb.org/downloads/6.2.28/windows/installers/foundationdb-6.2.28-x64.msi>`_
 
 API Language Bindings
 =====================
 
@@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
 
 If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:
 
-* `foundationdb-6.2.27.tar.gz <https://www.foundationdb.org/downloads/6.2.27/bindings/python/foundationdb-6.2.27.tar.gz>`_
+* `foundationdb-6.2.28.tar.gz <https://www.foundationdb.org/downloads/6.2.28/bindings/python/foundationdb-6.2.28.tar.gz>`_
 
 Ruby 1.9.3/2.0.0+
 -----------------
 
-* `fdb-6.2.27.gem <https://www.foundationdb.org/downloads/6.2.27/bindings/ruby/fdb-6.2.27.gem>`_
+* `fdb-6.2.28.gem <https://www.foundationdb.org/downloads/6.2.28/bindings/ruby/fdb-6.2.28.gem>`_
 
 Java 8+
 -------
 
-* `fdb-java-6.2.27.jar <https://www.foundationdb.org/downloads/6.2.27/bindings/java/fdb-java-6.2.27.jar>`_
-* `fdb-java-6.2.27-javadoc.jar <https://www.foundationdb.org/downloads/6.2.27/bindings/java/fdb-java-6.2.27-javadoc.jar>`_
+* `fdb-java-6.2.28.jar <https://www.foundationdb.org/downloads/6.2.28/bindings/java/fdb-java-6.2.28.jar>`_
+* `fdb-java-6.2.28-javadoc.jar <https://www.foundationdb.org/downloads/6.2.28/bindings/java/fdb-java-6.2.28-javadoc.jar>`_
 
 Go 1.11+
 --------
 
@@ -4,12 +4,17 @@
 Release Notes
 #############
 
+6.2.29
+======
+* Fix invalid memory access on the data distributor when snapshotting large clusters. `(PR #4076) <https://github.com/apple/foundationdb/pull/4076>`_
+* Add a human-readable DateTime to trace events. `(PR #4087) <https://github.com/apple/foundationdb/pull/4087>`_
+* The proxy now rejects a transaction batch that exceeds the MVCC window. `(PR #4113) <https://github.com/apple/foundationdb/pull/4113>`_
+
 6.2.28
 ======
 * Log detailed team collection information when the median available space ratio of all teams is too low. `(PR #3912) <https://github.com/apple/foundationdb/pull/3912>`_
 * Bug fix: the blob client did not support authentication key sizes over 64 bytes. `(PR #3964) <https://github.com/apple/foundationdb/pull/3964>`_
 
 6.2.27
 ======
 * For clusters with a large number of shards, avoid slow tasks in the data distributor by adding yields to the shard map destruction. `(PR #3834) <https://github.com/apple/foundationdb/pull/3834>`_
 
@@ -37,18 +37,23 @@ typedef StringRef ValueRef;
 typedef int64_t Generation;
 
 enum {
-    tagLocalitySpecial = -1,
+    tagLocalitySpecial = -1, // tag with this locality means it is invalidTag (id=0), txsTag (id=1), or cacheTag (id=2)
     tagLocalityLogRouter = -2,
-    tagLocalityRemoteLog = -3,
+    tagLocalityRemoteLog = -3, // tag created by log router for remote tLogs
     tagLocalityUpgraded = -4,
     tagLocalitySatellite = -5,
-    tagLocalityLogRouterMapped = -6,
+    tagLocalityLogRouterMapped = -6, // The pseudo tag used by log routers to pop the real LogRouter tag (i.e., -2)
     tagLocalityTxs = -7,
     tagLocalityInvalid = -99
-}; //The TLog and LogRouter require these number to be as compact as possible
+}; // The TLog and LogRouter require these numbers to be as compact as possible
 
 #pragma pack(push, 1)
 struct Tag {
+    // if locality > 0,
+    //    locality decides which DC id the tLog is in;
+    //    id decides which SS owns the tag; id <-> SS mapping is in the system keyspace: serverTagKeys.
+    // if locality < 0, locality decides the type of tLog set: satellite, LR, or remote tLog, etc.
+    //    id decides which tLog in the tLog type will be used.
     int8_t locality;
     uint16_t id;
 
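The comments added to the ``tagLocality`` enum and ``Tag`` struct above pin down the convention: a non-negative ``locality`` names a DC and ``id`` picks the storage server's tag within it, while a negative ``locality`` selects a tLog class (log router, remote log, satellite, ...). A toy illustration of that convention (stand-in struct with illustrative values, not the fdbclient type)::

    #include <cstdint>
    #include <cstdio>

    struct Tag {          // stand-in for fdbclient's packed Tag
        int8_t locality;  // >= 0: DC id; < 0: tLog class (e.g. -2 = log router)
        uint16_t id;      // which SS tag / which tLog within the class
    };

    int main() {
        Tag storageTag{1, 7};    // a storage server tag (locality picks the DC)
        Tag logRouterTag{-2, 3}; // tagLocalityLogRouter, router #3
        std::printf("%d:%d %d:%d\n", storageTag.locality, storageTag.id,
                    logRouterTag.locality, logRouterTag.id);
        return 0;
    }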
@@ -138,6 +143,10 @@ static std::string describe( Reference<T> const& item ) {
     return item->toString();
 }
 
+static std::string describe(UID const& item) {
+    return item.shortString();
+}
+
 template <class T>
 static std::string describe( T const& item ) {
     return item.toString();
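The new ``describe(UID const&)`` overload is selected ahead of the generic template for exact matches, so UIDs render through ``shortString()`` while other types keep using ``toString()``. A self-contained sketch of that dispatch (toy ``UID``, not FoundationDB's)::

    #include <iostream>
    #include <string>

    struct UID { // toy stand-in
        std::string shortString() const { return "79fa2c1d"; }
        std::string toString() const { return "79fa2c1d52aa90b1"; }
    };

    static std::string describe(UID const& item) { return item.shortString(); }

    template <class T>
    static std::string describe(T const& item) { return item.toString(); }

    int main() {
        UID id;
        std::cout << describe(id) << "\n"; // the non-template overload wins: "79fa2c1d"
        return 0;
    }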
@@ -1230,6 +1230,12 @@ ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLoca
     }
 }
 
+// Get the SS locations for each shard in the 'keys' key-range;
+// Returned vector size is the number of shards in the input keys key-range.
+// Returned vector element is <ShardRange, storage server location info> pairs, where
+// ShardRange is the whole shard key-range, not a part of the given key range.
+// Example: If the function is queried with key range (b, d), the returned list of pairs could be something like:
+// [([a, b1), locationInfo), ([b1, c), locationInfo), ([c, d1), locationInfo)].
 template <class F>
 Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLocations( Database const& cx, KeyRange const& keys, int limit, bool reverse, F StorageServerInterface::*member, TransactionInfo const& info ) {
     ASSERT (!keys.empty());
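The key point in the comment block above is that shard boundaries are returned whole: the first returned range may start before the queried range and the last may end after it. A toy check of that property (plain std types, not the FDB client API)::

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    using ToyRange = std::pair<std::string, std::string>; // [begin, end)

    int main() {
        // Shard map fragment [a,b1) [b1,c) [c,d1): a query for [b,d)
        // overlaps all three shards and receives each one whole.
        std::vector<ToyRange> returned = { {"a", "b1"}, {"b1", "c"}, {"c", "d1"} };
        assert(returned.front().first < std::string("b"));  // extends left of the query
        assert(returned.back().second > std::string("d"));  // extends right of the query
        return 0;
    }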
@@ -1896,7 +1902,6 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
     }
 
-    ++cx->transactionPhysicalReads;
     ++cx->transactionGetRangeRequests;
     state GetKeyValuesReply rep;
     try {
         if (CLIENT_BUGGIFY) {
@@ -220,18 +220,36 @@ ACTOR Future<Void> pingLatencyLogger(TransportData* self) {
         if(!peer) {
             TraceEvent(SevWarnAlways, "MissingNetworkAddress").suppressFor(10.0).detail("PeerAddr", lastAddress);
         }
+        if (peer->lastLoggedTime <= 0.0) {
+            peer->lastLoggedTime = peer->lastConnectTime;
+        }
+
         if(peer && peer->pingLatencies.getPopulationSize() >= 10) {
             TraceEvent("PingLatency")
-                .detail("PeerAddr", lastAddress)
-                .detail("MinLatency", peer->pingLatencies.min())
-                .detail("MaxLatency", peer->pingLatencies.max())
-                .detail("MeanLatency", peer->pingLatencies.mean())
-                .detail("MedianLatency", peer->pingLatencies.median())
-                .detail("P90Latency", peer->pingLatencies.percentile(0.90))
-                .detail("Count", peer->pingLatencies.getPopulationSize())
-                .detail("BytesReceived", peer->bytesReceived - peer->lastLoggedBytesReceived)
-                .detail("BytesSent", peer->bytesSent - peer->lastLoggedBytesSent);
+                .detail("Elapsed", now() - peer->lastLoggedTime)
+                .detail("PeerAddr", lastAddress)
+                .detail("MinLatency", peer->pingLatencies.min())
+                .detail("MaxLatency", peer->pingLatencies.max())
+                .detail("MeanLatency", peer->pingLatencies.mean())
+                .detail("MedianLatency", peer->pingLatencies.median())
+                .detail("P90Latency", peer->pingLatencies.percentile(0.90))
+                .detail("Count", peer->pingLatencies.getPopulationSize())
+                .detail("BytesReceived", peer->bytesReceived - peer->lastLoggedBytesReceived)
+                .detail("BytesSent", peer->bytesSent - peer->lastLoggedBytesSent)
+                .detail("ConnectOutgoingCount", peer->connectOutgoingCount)
+                .detail("ConnectIncomingCount", peer->connectIncomingCount)
+                .detail("ConnectFailedCount", peer->connectFailedCount)
+                .detail("ConnectMinLatency", peer->connectLatencies.min())
+                .detail("ConnectMaxLatency", peer->connectLatencies.max())
+                .detail("ConnectMeanLatency", peer->connectLatencies.mean())
+                .detail("ConnectMedianLatency", peer->connectLatencies.median())
+                .detail("ConnectP90Latency", peer->connectLatencies.percentile(0.90));
+            peer->lastLoggedTime = now();
+            peer->connectOutgoingCount = 0;
+            peer->connectIncomingCount = 0;
+            peer->connectFailedCount = 0;
             peer->pingLatencies.clear();
+            peer->connectLatencies.clear();
             peer->lastLoggedBytesReceived = peer->bytesReceived;
             peer->lastLoggedBytesSent = peer->bytesSent;
             wait(delay(FLOW_KNOBS->PING_LOGGING_INTERVAL));
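The enlarged ``PingLatency`` event follows a sample-and-reset pattern: ping and connect latencies accumulate in ``ContinuousSample`` objects, a summary is traced once at least ten samples exist, and the counters and samples are cleared for the next interval. A rough standalone analogue of the pattern (a sorted ``std::vector``, not Flow's ``ContinuousSample``)::

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct LatencySample {
        std::vector<double> samples;
        void addSample(double s) { samples.push_back(s); }
        size_t getPopulationSize() const { return samples.size(); }
        double percentile(double p) { // crude nearest-rank percentile
            std::sort(samples.begin(), samples.end());
            size_t idx = std::min(samples.size() - 1, (size_t)(p * samples.size()));
            return samples[idx];
        }
        void clear() { samples.clear(); }
    };

    int main() {
        LatencySample ping;
        for (int i = 1; i <= 10; i++) ping.addSample(0.001 * i);
        if (ping.getPopulationSize() >= 10) { // same threshold as the logger above
            std::printf("P90Latency=%f Count=%zu\n", ping.percentile(0.90),
                        ping.getPopulationSize());
            ping.clear(); // start a fresh interval, as the logger does after tracing
        }
        return 0;
    }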
@@ -476,7 +494,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
                     std::max(0.0, self->lastConnectTime + self->reconnectionDelay -
                     now()))); // Don't connect() to the same peer more than once per 2 sec
                 self->lastConnectTime = now();
-
+                ++self->connectOutgoingCount;
                 TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID()).suppressFor(1.0).detail("PeerAddr", self->destination);
 
                 try {
@@ -484,6 +502,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
                     when( Reference<IConnection> _conn = wait( INetworkConnections::net()->connect(self->destination) ) ) {
                         conn = _conn;
                         wait(conn->connectHandshake());
+                        self->connectLatencies.addSample(now() - self->lastConnectTime);
                         if (FlowTransport::isClient()) {
                             IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false));
                         }
@@ -505,6 +524,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
                     }
                 }
             } catch( Error &e ) {
+                ++self->connectFailedCount;
                 if(e.code() != error_code_connection_failed) {
                     throw;
                 }
@@ -648,6 +668,7 @@ void Peer::discardUnreliablePackets() {
 void Peer::onIncomingConnection( Reference<Peer> self, Reference<IConnection> conn, Future<Void> reader ) {
     // In case two processes are trying to connect to each other simultaneously, the process with the larger canonical NetworkAddress
     // gets to keep its outgoing connection.
+    ++self->connectIncomingCount;
     if ( !destination.isPublic() && !outgoingConnectionIdle ) throw address_in_use();
     NetworkAddress compatibleAddr = transport->localAddresses.address;
     if(transport->localAddresses.secondaryAddress.present() && transport->localAddresses.secondaryAddress.get().isTLS() == destination.isTLS()) {
@@ -127,15 +127,23 @@ struct Peer : public ReferenceCounted<Peer> {
     double lastDataPacketSentTime;
     int outstandingReplies;
     ContinuousSample<double> pingLatencies;
+    double lastLoggedTime;
     int64_t lastLoggedBytesReceived;
     int64_t lastLoggedBytesSent;
 
+    // Cleared every time stats are logged for this peer.
+    int connectOutgoingCount;
+    int connectIncomingCount;
+    int connectFailedCount;
+    ContinuousSample<double> connectLatencies;
+
     explicit Peer(TransportData* transport, NetworkAddress const& destination)
       : transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0),
         reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
         incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()),
         pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), lastLoggedBytesReceived(0),
-        bytesSent(0), lastLoggedBytesSent(0) {}
+        bytesSent(0), lastLoggedBytesSent(0), lastLoggedTime(0.0), connectOutgoingCount(0), connectIncomingCount(0),
+        connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1) {}
 
     void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent);
 
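One C++ detail to keep in mind when extending an initializer list like ``Peer``'s: members are initialized in declaration order, not in the order they appear in the list, so writing ``lastLoggedTime(0.0)`` after ``lastLoggedBytesSent(0)`` is harmless here (all initializers are constants) but would matter if one member's initializer read another. A minimal illustration, unrelated to FDB types::

    #include <cassert>

    struct S {
        int a;
        int b;
        // b's initializer is written first, but a is initialized first
        // (declaration order); compilers flag such lists with -Wreorder.
        S() : b(a + 1), a(1) {}
    };

    int main() {
        S s;
        assert(s.a == 1 && s.b == 2); // would NOT hold if b were declared before a
        return 0;
    }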
@@ -171,14 +171,11 @@ void addLaggingRequest(Future<Optional<Reply>> reply, Promise<Void> requestFinis
 // failMon's information for load balancing and avoiding failed servers
 // If ALL the servers are failed and the list of servers is not fresh, throws an exception to let the caller refresh the list of servers
 ACTOR template <class Interface, class Request, class Multi>
-Future< REPLY_TYPE(Request) > loadBalance(
-    Reference<MultiInterface<Multi>> alternatives,
-    RequestStream<Request> Interface::* channel,
-    Request request = Request(),
-    TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
-    bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically
-    QueueModel* model = NULL)
-{
+Future<REPLY_TYPE(Request)> loadBalance(
+    Reference<MultiInterface<Multi>> alternatives, RequestStream<Request> Interface::*channel,
+    Request request = Request(), TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
+    bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically
+    QueueModel* model = NULL) {
     state Future<Optional<REPLY_TYPE(Request)>> firstRequest;
     state Optional<uint64_t> firstRequestEndpoint;
     state Future<Optional<REPLY_TYPE(Request)>> secondRequest;
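The ``atMostOnce`` flag preserved in the reflowed signature changes failure handling: rather than transparently retrying another replica, ``loadBalance`` surfaces ``request_maybe_delivered()`` so callers issuing non-idempotent requests can decide what to do. A self-contained sketch of that contract (toy exception-based code, not Flow)::

    #include <cstdio>
    #include <functional>
    #include <stdexcept>

    struct request_maybe_delivered : std::runtime_error {
        request_maybe_delivered() : std::runtime_error("request_maybe_delivered") {}
    };

    int loadBalanceToy(std::function<int()> send, bool atMostOnce) {
        for (;;) {
            try {
                return send();
            } catch (const std::runtime_error&) {
                if (atMostOnce) throw request_maybe_delivered(); // don't risk a duplicate
                // otherwise fall through and retry on the next alternative
            }
        }
    }

    int main() {
        int calls = 0;
        auto flaky = [&]() -> int {
            if (++calls < 2) throw std::runtime_error("transient failure");
            return 42;
        };
        int result = loadBalanceToy(flaky, false); // retried transparently
        std::printf("got %d after %d calls\n", result, calls);
        return 0;
    }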
@@ -76,7 +76,9 @@ void CounterCollection::logToTraceEvent(TraceEvent &te) const {
     }
 }
 
-ACTOR Future<Void> traceCounters(std::string traceEventName, UID traceEventID, double interval, CounterCollection* counters, std::string trackLatestName) {
+ACTOR Future<Void> traceCounters(std::string traceEventName, UID traceEventID, double interval,
+                                 CounterCollection* counters, std::string trackLatestName,
+                                 std::function<void(TraceEvent&)> decorator) {
     wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
 
     for (ICounter* c : counters->counters)

@@ -89,6 +91,7 @@ ACTOR Future<Void> traceCounters(std::string traceEventName, UID traceEventID, d
         te.detail("Elapsed", now() - last_interval);
 
         counters->logToTraceEvent(te);
+        decorator(te);
 
         if (!trackLatestName.empty()) {
             te.trackLatest(trackLatestName);
@@ -132,7 +132,9 @@ struct SpecialCounter : ICounter, FastAllocated<SpecialCounter<F>>, NonCopyable
 template <class F>
 static void specialCounter(CounterCollection& collection, std::string const& name, F && f) { new SpecialCounter<F>(collection, name, std::move(f)); }
 
-Future<Void> traceCounters(std::string const& traceEventName, UID const& traceEventID, double const& interval, CounterCollection* const& counters, std::string const& trackLatestName = std::string());
+Future<Void> traceCounters(std::string const& traceEventName, UID const& traceEventID, double const& interval,
+                           CounterCollection* const& counters, std::string const& trackLatestName = std::string(),
+                           std::function<void(TraceEvent&)> const& decorator = [](TraceEvent& te) {});
 
 class LatencyBands {
 public:
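The new ``decorator`` parameter lets a caller append extra fields to the periodic counters event without touching ``traceCounters`` itself; the default ``[](TraceEvent& te) {}`` keeps existing call sites unchanged. A self-contained analogue of the hook (toy ``TraceEvent``, not FDB's)::

    #include <functional>
    #include <iostream>
    #include <string>

    struct TraceEvent { // toy stand-in
        TraceEvent& detail(const std::string& k, double v) {
            std::cout << k << "=" << v << " ";
            return *this;
        }
    };

    void traceCountersToy(const std::string& name,
                          std::function<void(TraceEvent&)> decorator = [](TraceEvent&) {}) {
        std::cout << name << ": ";
        TraceEvent te;
        te.detail("Elapsed", 5.0); // fields the tracer always emits
        decorator(te);             // caller-supplied extras
        std::cout << "\n";
    }

    int main() {
        traceCountersToy("ProxyMetrics"); // default no-op decorator
        traceCountersToy("StorageMetrics",
                         [](TraceEvent& te) { te.detail("BytesDurable", 1234.0); });
        return 0;
    }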
@@ -102,8 +102,6 @@ bool onlyBeforeSimulatorInit() {
 
-const UID TOKEN_ENDPOINT_NOT_FOUND(-1, -1);
-
 ISimulator* g_pSimulator = 0;
 thread_local ISimulator::ProcessInfo* ISimulator::currentProcess = 0;
 int openCount = 0;
 
 struct SimClogging {
@@ -23,6 +23,7 @@
 #pragma once
 
 #include "flow/flow.h"
+#include "flow/Histogram.h"
 #include "fdbrpc/FailureMonitor.h"
 #include "fdbrpc/Locality.h"
 #include "fdbrpc/IAsyncFile.h"
@@ -54,6 +55,7 @@ public:
     LocalityData locality;
     ProcessClass startingClass;
     TDMetricCollection tdmetrics;
+    HistogramRegistry histograms;
     std::map<NetworkAddress, Reference<IListener>> listenerMap;
     bool failed;
     bool excluded;
@@ -146,8 +146,10 @@ public:
     vector<Reference<TCMachineInfo>> machines;
     vector<Standalone<StringRef>> machineIDs;
     vector<Reference<TCTeamInfo>> serverTeams;
+    UID id;
 
-    explicit TCMachineTeamInfo(vector<Reference<TCMachineInfo>> const& machines) : machines(machines) {
+    explicit TCMachineTeamInfo(vector<Reference<TCMachineInfo>> const& machines)
+      : machines(machines), id(deterministicRandom()->randomUniqueID()) {
         machineIDs.reserve(machines.size());
         for (int i = 0; i < machines.size(); i++) {
             machineIDs.push_back(machines[i]->machineID);
@@ -180,6 +182,7 @@ class TCTeamInfo : public ReferenceCounted<TCTeamInfo>, public IDataDistribution
 private:
     vector< Reference<TCServerInfo> > servers;
     vector<UID> serverIDs;
+    UID id;
 
 public:
     Reference<TCMachineTeamInfo> machineTeam;
@@ -189,7 +192,8 @@ public:
     int priority;
 
     explicit TCTeamInfo(vector<Reference<TCServerInfo>> const& servers)
-      : servers(servers), healthy(true), priority(SERVER_KNOBS->PRIORITY_TEAM_HEALTHY), wrongConfiguration(false) {
+      : servers(servers), healthy(true), priority(SERVER_KNOBS->PRIORITY_TEAM_HEALTHY), wrongConfiguration(false),
+        id(deterministicRandom()->randomUniqueID()) {
         if (servers.empty()) {
             TraceEvent(SevInfo, "ConstructTCTeamFromEmptyServers");
         }
@@ -199,6 +203,8 @@ public:
         }
     }
 
+    std::string getTeamID() override { return id.shortString(); }
+
     virtual vector<StorageServerInterface> getLastKnownServerInterfaces() {
         vector<StorageServerInterface> v;
         v.reserve(servers.size());
@@ -631,6 +637,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
     int highestUtilizationTeam;
 
     AsyncTrigger printDetailedTeamsInfo;
+    PromiseStream<GetMetricsRequest> getShardMetrics;
 
     void resetLocalitySet() {
         storageServerSet = Reference<LocalitySet>(new LocalityMap<UID>());
@@ -662,7 +669,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
                      DatabaseConfiguration configuration, std::vector<Optional<Key>> includedDCs,
                      Optional<std::vector<Optional<Key>>> otherTrackedDCs, Future<Void> readyToStart,
                      Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary,
-                     Reference<AsyncVar<bool>> processingUnhealthy)
+                     Reference<AsyncVar<bool>> processingUnhealthy, PromiseStream<GetMetricsRequest> getShardMetrics)
       : cx(cx), distributorId(distributorId), lock(lock), output(output),
         shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), lastBuildTeamsFailed(false),
         teamBuilder(Void()), badTeamRemover(Void()), redundantMachineTeamRemover(Void()),
@@ -675,8 +682,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
         initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)),
         optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
         unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs),
-        zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO),
-        lastMedianAvailableSpaceUpdate(0), processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0) {
+        zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary),
+        medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO), lastMedianAvailableSpaceUpdate(0),
+        processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0),
+        getShardMetrics(getShardMetrics) {
         if(!primary || configuration.usableRegions == 1) {
             TraceEvent("DDTrackerStarting", distributorId)
                 .detail( "State", "Inactive" )
@@ -1352,7 +1361,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
                 .detail("TeamIndex", i++)
                 .detail("Healthy", team->isHealthy())
                 .detail("TeamSize", team->size())
-                .detail("MemberIDs", team->getServerIDsStr());
+                .detail("MemberIDs", team->getServerIDsStr())
+                .detail("TeamID", team->getTeamID());
         }
     }
 
@@ -2094,7 +2104,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
             .detail("Primary", primary)
             .detail("AddedTeams", 0)
             .detail("TeamsToBuild", 0)
-            .detail("CurrentTeams", teams.size())
+            .detail("CurrentServerTeams", teams.size())
             .detail("DesiredTeams", desiredServerTeams)
             .detail("MaxTeams", maxServerTeams)
             .detail("StorageTeamSize", configuration.storageTeamSize)
@@ -2143,11 +2153,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
             }
         }
         uniqueMachines = machines.size();
-        TraceEvent("BuildTeams")
-            .detail("ServerCount", self->server_info.size())
-            .detail("UniqueMachines", uniqueMachines)
-            .detail("Primary", self->primary)
-            .detail("StorageTeamSize", self->configuration.storageTeamSize);
+        TraceEvent("BuildTeams", self->distributorId)
+            .detail("ServerCount", self->server_info.size())
+            .detail("UniqueMachines", uniqueMachines)
+            .detail("Primary", self->primary)
+            .detail("StorageTeamSize", self->configuration.storageTeamSize);
 
         // If there are too few machines to even build teams or there are too few represented datacenters, build no new teams
         if( uniqueMachines >= self->configuration.storageTeamSize ) {
@@ -2174,11 +2184,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
                 .detail("TeamsToBuild", teamsToBuild)
                 .detail("DesiredTeams", desiredTeams)
                 .detail("MaxTeams", maxTeams)
-                .detail("BadTeams", self->badTeams.size())
+                .detail("BadServerTeams", self->badTeams.size())
                 .detail("UniqueMachines", uniqueMachines)
                 .detail("TeamSize", self->configuration.storageTeamSize)
                 .detail("Servers", serverCount)
-                .detail("CurrentTrackedTeams", self->teams.size())
+                .detail("CurrentTrackedServerTeams", self->teams.size())
                 .detail("HealthyTeamCount", teamCount)
                 .detail("TotalTeamCount", totalTeamCount)
                 .detail("MachineTeamCount", self->machineTeams.size())
@@ -2195,9 +2205,9 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
             int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams);
 
             if (addedTeams <= 0 && self->teams.size() == 0) {
-                TraceEvent(SevWarn, "NoTeamAfterBuildTeam")
-                    .detail("TeamNum", self->teams.size())
-                    .detail("Debug", "Check information below");
+                TraceEvent(SevWarn, "NoTeamAfterBuildTeam", self->distributorId)
+                    .detail("ServerTeamNum", self->teams.size())
+                    .detail("Debug", "Check information below");
                 // Debug: set true for traceAllInfo() to print out more information
                 self->traceAllInfo();
             }
@@ -2215,7 +2225,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
                 .detail("Primary", self->primary)
                 .detail("AddedTeams", 0)
                 .detail("TeamsToBuild", teamsToBuild)
-                .detail("CurrentTeams", self->teams.size())
+                .detail("CurrentServerTeams", self->teams.size())
                 .detail("DesiredTeams", desiredTeams)
                 .detail("MaxTeams", maxTeams)
                 .detail("StorageTeamSize", self->configuration.storageTeamSize)
@@ -2254,9 +2264,9 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
         }
 
         TraceEvent(SevWarn, "NoHealthyTeams", distributorId)
-            .detail("CurrentTeamCount", teams.size())
-            .detail("ServerCount", server_info.size())
-            .detail("NonFailedServerCount", desiredServerSet.size());
+            .detail("CurrentServerTeamCount", teams.size())
+            .detail("ServerCount", server_info.size())
+            .detail("NonFailedServerCount", desiredServerSet.size());
     }
 
     bool shouldHandleServer(const StorageServerInterface &newServer) {
@@ -2284,7 +2294,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
     }
 
     bool removeTeam( Reference<TCTeamInfo> team ) {
-        TraceEvent("RemovedTeam", distributorId).detail("Team", team->getDesc());
+        TraceEvent("RemovedServerTeam", distributorId).detail("Team", team->getDesc());
         bool found = false;
         for(int t=0; t<teams.size(); t++) {
             if( teams[t] == team ) {
@@ -2478,9 +2488,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
         int removedCount = 0;
         for (int t = 0; t < teams.size(); t++) {
             if ( std::count( teams[t]->getServerIDs().begin(), teams[t]->getServerIDs().end(), removedServer ) ) {
-                TraceEvent("TeamRemoved")
+                TraceEvent("ServerTeamRemoved")
                     .detail("Primary", primary)
-                    .detail("TeamServerIDs", teams[t]->getServerIDsStr());
+                    .detail("TeamServerIDs", teams[t]->getServerIDsStr())
+                    .detail("TeamID", teams[t]->getTeamID());
                 // removeTeam also needs to remove the team from the machine team info.
                 removeTeam(teams[t]);
                 t--;
@@ -2547,8 +2558,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
         restartTeamBuilder.trigger();
 
         TraceEvent("DataDistributionTeamCollectionUpdate", distributorId)
-            .detail("Teams", teams.size())
-            .detail("BadTeams", badTeams.size())
+            .detail("ServerTeams", teams.size())
+            .detail("BadServerTeams", badTeams.size())
             .detail("Servers", allServers.size())
             .detail("Machines", machine_info.size())
             .detail("MachineTeams", machineTeams.size())
@@ -2772,7 +2783,7 @@ ACTOR Future<Void> removeBadTeams(DDTeamCollection* self) {
     wait(self->initialFailureReactionDelay);
     wait(waitUntilHealthy(self));
     wait(self->addSubsetComplete.getFuture());
-    TraceEvent("DDRemovingBadTeams", self->distributorId).detail("Primary", self->primary);
+    TraceEvent("DDRemovingBadServerTeams", self->distributorId).detail("Primary", self->primary);
     for(auto it : self->badTeams) {
         it->tracker.cancel();
     }
@@ -2842,9 +2853,9 @@ ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
             // Check if a server will have 0 team after the team is removed
             for (auto& s : team->getServers()) {
                 if (s->teams.size() == 0) {
-                    TraceEvent(SevError, "TeamRemoverTooAggressive")
+                    TraceEvent(SevError, "MachineTeamRemoverTooAggressive", self->distributorId)
                         .detail("Server", s->id)
-                        .detail("Team", team->getServerIDsStr());
+                        .detail("ServerTeam", team->getDesc());
                     self->traceAllInfo(true);
                 }
             }
@@ -2867,6 +2878,7 @@ ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
             }
 
             TraceEvent("MachineTeamRemover", self->distributorId)
+                .detail("MachineTeamIDToRemove", mt->id.shortString())
                 .detail("MachineTeamToRemove", mt->getMachineIDsStr())
                 .detail("NumProcessTeamsOnTheMachineTeam", minNumProcessTeams)
                 .detail("CurrentMachineTeams", self->machineTeams.size())
@@ -2882,7 +2894,7 @@ ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
         } else {
             if (numMachineTeamRemoved > 0) {
                 // Only trace the information when we remove a machine team
-                TraceEvent("TeamRemoverDone")
+                TraceEvent("MachineTeamRemoverDone", self->distributorId)
                     .detail("HealthyMachines", healthyMachineCount)
                     // .detail("CurrentHealthyMachineTeams", currentHealthyMTCount)
                     .detail("CurrentMachineTeams", self->machineTeams.size())
@@ -2946,6 +2958,7 @@ ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
 
             TraceEvent("ServerTeamRemover", self->distributorId)
                 .detail("ServerTeamToRemove", st->getServerIDsStr())
+                .detail("ServerTeamID", st->getTeamID())
                 .detail("NumProcessTeamsOnTheServerTeam", maxNumProcessTeams)
                 .detail("CurrentServerTeams", self->teams.size())
                 .detail("DesiredServerTeams", desiredServerTeams);
@@ -2965,6 +2978,35 @@ ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
     }
 }
 
+ACTOR Future<Void> zeroServerLeftLogger_impl(DDTeamCollection* self, Reference<TCTeamInfo> team) {
+    wait(delay(SERVER_KNOBS->DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY));
+    state vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
+        ShardsAffectedByTeamFailure::Team(team->getServerIDs(), self->primary));
+    state std::vector<Future<StorageMetrics>> sizes;
+    sizes.reserve(shards.size());
+
+    for (auto const& shard : shards) {
+        sizes.emplace_back(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(shard))));
+        TraceEvent(SevWarnAlways, "DDShardLost", self->distributorId)
+            .detail("ServerTeamID", team->getTeamID())
+            .detail("ShardBegin", shard.begin)
+            .detail("ShardEnd", shard.end);
+    }
+
+    wait(waitForAll(sizes));
+
+    int64_t bytesLost = 0;
+    for (auto const& size : sizes) {
+        bytesLost += size.get().bytes;
+    }
+
+    TraceEvent(SevWarnAlways, "DDZeroServerLeftInTeam", self->distributorId)
+        .detail("Team", team->getDesc())
+        .detail("TotalBytesLost", bytesLost);
+
+    return Void();
+}
+
 // Track a team and issue RelocateShards when the level of degradation changes
 // A badTeam can be unhealthy or just a redundantTeam removed by machineTeamRemover() or serverTeamRemover()
 ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> team, bool badTeam, bool redundantTeam) {
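``zeroServerLeftLogger_impl`` above is a fan-out/fan-in: it kicks off one metrics request per shard, waits for all of them, then sums the bytes that would be lost. A standalone analogue of that shape using ``std::async`` in place of Flow futures (toy values, not FDB code)::

    #include <cstdint>
    #include <cstdio>
    #include <future>
    #include <vector>

    int main() {
        std::vector<int64_t> shardSizes = {100, 250, 42}; // stand-ins for StorageMetrics
        std::vector<std::future<int64_t>> sizes;
        sizes.reserve(shardSizes.size());
        for (int64_t s : shardSizes) // one "getShardMetrics" per shard
            sizes.push_back(std::async(std::launch::async, [s] { return s; }));

        int64_t bytesLost = 0;
        for (auto& f : sizes) bytesLost += f.get(); // waitForAll, then accumulate
        std::printf("TotalBytesLost=%lld\n", (long long)bytesLost);
        return 0;
    }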
@@ -2979,18 +3021,22 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
     state bool lastZeroHealthy = self->zeroHealthyTeams->get();
     state bool firstCheck = true;
 
+    state Future<Void> zeroServerLeftLogger;
+
     if(logTeamEvents) {
-        TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc());
+        TraceEvent("ServerTeamTrackerStarting", self->distributorId)
+            .detail("Reason", "Initial wait complete (sc)")
+            .detail("ServerTeam", team->getDesc());
     }
     self->priority_teams[team->getPriority()]++;
 
     try {
         loop {
             if(logTeamEvents) {
-                TraceEvent("TeamHealthChangeDetected", self->distributorId)
-                    .detail("Team", team->getDesc())
-                    .detail("Primary", self->primary)
-                    .detail("IsReady", self->initialFailureReactionDelay.isReady());
+                TraceEvent("ServerTeamHealthChangeDetected", self->distributorId)
+                    .detail("ServerTeam", team->getDesc())
+                    .detail("Primary", self->primary)
+                    .detail("IsReady", self->initialFailureReactionDelay.isReady());
                 self->traceTeamCollectionInfo();
             }
             // Check if the number of degraded machines has changed
@@ -3053,10 +3099,13 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
             if (serversLeft != lastServersLeft || anyUndesired != lastAnyUndesired ||
                 anyWrongConfiguration != lastWrongConfiguration || recheck) { // NOTE: do not check wrongSize
                 if(logTeamEvents) {
-                    TraceEvent("TeamHealthChanged", self->distributorId)
-                        .detail("Team", team->getDesc()).detail("ServersLeft", serversLeft)
-                        .detail("LastServersLeft", lastServersLeft).detail("ContainsUndesiredServer", anyUndesired)
-                        .detail("HealthyTeamsCount", self->healthyTeamCount).detail("IsWrongConfiguration", anyWrongConfiguration);
+                    TraceEvent("ServerTeamHealthChanged", self->distributorId)
+                        .detail("ServerTeam", team->getDesc())
+                        .detail("ServersLeft", serversLeft)
+                        .detail("LastServersLeft", lastServersLeft)
+                        .detail("ContainsUndesiredServer", anyUndesired)
+                        .detail("HealthyTeamsCount", self->healthyTeamCount)
+                        .detail("IsWrongConfiguration", anyWrongConfiguration);
                 }
 
                 team->setWrongConfiguration( anyWrongConfiguration );
@@ -3078,18 +3127,18 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
                     self->zeroHealthyTeams->set(self->healthyTeamCount == 0);
 
                     if( self->healthyTeamCount == 0 ) {
-                        TraceEvent(SevWarn, "ZeroTeamsHealthySignalling", self->distributorId)
-                            .detail("SignallingTeam", team->getDesc())
-                            .detail("Primary", self->primary);
+                        TraceEvent(SevWarn, "ZeroServerTeamsHealthySignalling", self->distributorId)
+                            .detail("SignallingTeam", team->getDesc())
+                            .detail("Primary", self->primary);
                     }
 
                     if(logTeamEvents) {
-                        TraceEvent("TeamHealthDifference", self->distributorId)
-                            .detail("Team", team->getDesc())
-                            .detail("LastOptimal", lastOptimal)
-                            .detail("LastHealthy", lastHealthy)
-                            .detail("Optimal", optimal)
-                            .detail("OptimalTeamCount", self->optimalTeamCount);
+                        TraceEvent("ServerTeamHealthDifference", self->distributorId)
+                            .detail("ServerTeam", team->getDesc())
+                            .detail("LastOptimal", lastOptimal)
+                            .detail("LastHealthy", lastHealthy)
+                            .detail("Optimal", optimal)
+                            .detail("OptimalTeamCount", self->optimalTeamCount);
                     }
                 }
 
@@ -3126,12 +3175,24 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
                 if(lastPriority != team->getPriority()) {
                     self->priority_teams[lastPriority]--;
                     self->priority_teams[team->getPriority()]++;
+                    if (lastPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT &&
+                        team->getPriority() < SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) {
+                        zeroServerLeftLogger = Void();
+                    }
+                    if (logTeamEvents) {
+                        int dataLoss = team->getPriority() == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT;
+                        Severity severity = dataLoss ? SevWarnAlways : SevInfo;
+                        TraceEvent(severity, "ServerTeamPriorityChange", self->distributorId)
+                            .detail("Priority", team->getPriority())
+                            .detail("Info", team->getDesc())
+                            .detail("ZeroHealthyServerTeams", self->zeroHealthyTeams->get());
+                        if (team->getPriority() == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) {
+                            // 0 servers left in this team, data might be lost.
+                            zeroServerLeftLogger = zeroServerLeftLogger_impl(self, team);
+                        }
+                    }
                 }
 
-                if(logTeamEvents) {
-                    TraceEvent("TeamPriorityChange", self->distributorId).detail("Priority", team->getPriority())
-                        .detail("Info", team->getDesc()).detail("ZeroHealthyTeams", self->zeroHealthyTeams->get());
-                }
-
                 lastZeroHealthy = self->zeroHealthyTeams->get(); //set this again in case it changed from this teams health changing
                 if( self->initialFailureReactionDelay.isReady() && !self->zeroHealthyTeams->get() ) {
@@ -3185,17 +3246,19 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
                         self->output.send(rs);
                         if(deterministicRandom()->random01() < 0.01) {
                             TraceEvent("SendRelocateToDDQx100", self->distributorId)
-                                .detail("Team", team->getDesc())
-                                .detail("KeyBegin", rs.keys.begin)
-                                .detail("KeyEnd", rs.keys.end)
-                                .detail("Priority", rs.priority)
-                                .detail("TeamFailedMachines", team->size() - serversLeft)
-                                .detail("TeamOKMachines", serversLeft);
+                                .detail("ServerTeam", team->getDesc())
+                                .detail("KeyBegin", rs.keys.begin)
+                                .detail("KeyEnd", rs.keys.end)
+                                .detail("Priority", rs.priority)
+                                .detail("ServerTeamFailedMachines", team->size() - serversLeft)
+                                .detail("ServerTeamOKMachines", serversLeft);
                         }
                     }
                 } else {
                     if(logTeamEvents) {
-                        TraceEvent("TeamHealthNotReady", self->distributorId).detail("HealthyTeamCount", self->healthyTeamCount);
+                        TraceEvent("ServerTeamHealthNotReady", self->distributorId)
+                            .detail("HealthyServerTeamCount", self->healthyTeamCount)
+                            .detail("ServerTeamID", team->getTeamID());
                     }
                 }
             }
@@ -3206,7 +3269,9 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
         }
     } catch(Error& e) {
         if(logTeamEvents) {
-            TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()).detail("Priority", team->getPriority());
+            TraceEvent("ServerTeamTrackerStopping", self->distributorId)
+                .detail("ServerTeam", team->getDesc())
+                .detail("Priority", team->getPriority());
         }
         self->priority_teams[team->getPriority()]--;
         if (team->isHealthy()) {

@@ -3214,7 +3279,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
         ASSERT( self->healthyTeamCount >= 0 );
 
         if( self->healthyTeamCount == 0 ) {
-            TraceEvent(SevWarn, "ZeroTeamsHealthySignalling", self->distributorId).detail("SignallingTeam", team->getDesc());
+            TraceEvent(SevWarn, "ZeroServerTeamsHealthySignalling", self->distributorId)
+                .detail("SignallingServerTeam", team->getDesc());
             self->zeroHealthyTeams->set(true);
         }
     }
@@ -4352,7 +4418,9 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
     state DatabaseConfiguration configuration;
     state Reference<InitialDataDistribution> initData;
     state MoveKeysLock lock;
+    state bool trackerCancelled;
     loop {
+        trackerCancelled = false;
         try {
             loop {
                 TraceEvent("DDInitTakingMoveKeysLock", self->ddId);
@@ -4513,18 +4581,24 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
             }
 
             actors.push_back( pollMoveKeysLock(cx, lock) );
-            actors.push_back(
-                reportErrorsExcept(dataDistributionTracker(initData, cx, output, shardsAffectedByTeamFailure,
-                                                           getShardMetrics, getAverageShardBytes.getFuture(),
-                                                           readyToStart, anyZeroHealthyTeams, self->ddId, &shards),
-                                   "DDTracker", self->ddId, &normalDDQueueErrors()));
+            actors.push_back(reportErrorsExcept(
+                dataDistributionTracker(initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics,
+                                        getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId,
+                                        &shards, &trackerCancelled),
+                "DDTracker", self->ddId, &normalDDQueueErrors()));
             actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, configuration.storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) );
 
             vector<DDTeamCollection*> teamCollectionsPtrs;
-            Reference<DDTeamCollection> primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) );
+            Reference<DDTeamCollection> primaryTeamCollection(new DDTeamCollection(
+                cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId,
+                configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(), readyToStart.getFuture(),
+                zeroHealthyTeams[0], true, processingUnhealthy, getShardMetrics));
             teamCollectionsPtrs.push_back(primaryTeamCollection.getPtr());
             if (configuration.usableRegions > 1) {
-                Reference<DDTeamCollection> remoteTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds, Optional<std::vector<Optional<Key>>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo), zeroHealthyTeams[1], false, processingUnhealthy) );
+                Reference<DDTeamCollection> remoteTeamCollection(new DDTeamCollection(
+                    cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, remoteDcIds,
+                    Optional<std::vector<Optional<Key>>>(), readyToStart.getFuture() && remoteRecovered(self->dbInfo),
+                    zeroHealthyTeams[1], false, processingUnhealthy, getShardMetrics));
                 teamCollectionsPtrs.push_back(remoteTeamCollection.getPtr());
                 remoteTeamCollection->teamCollections = teamCollectionsPtrs;
                 actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( remoteTeamCollection, initData, tcis[1], self->dbInfo ), "DDTeamCollectionSecondary", self->ddId, &normalDDQueueErrors() ) );
@@ -4540,6 +4614,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self)
         }
         catch( Error &e ) {
             state Error err = e;
+            trackerCancelled = true;
             wait(shards.clearAsync());
             if (err.code() != error_code_movekeys_conflict) throw err;
             bool ddEnabled = wait( isDataDistributionEnabled(cx) );
@@ -4764,20 +4839,11 @@ DDTeamCollection* testTeamCollection(int teamSize, Reference<IReplicationPolicy>
     conf.storageTeamSize = teamSize;
     conf.storagePolicy = policy;
 
-    DDTeamCollection* collection = new DDTeamCollection(
-        database,
-        UID(0, 0),
-        MoveKeysLock(),
-        PromiseStream<RelocateShard>(),
-        Reference<ShardsAffectedByTeamFailure>(new ShardsAffectedByTeamFailure()),
-        conf,
-        {},
-        {},
-        Future<Void>(Void()),
-        Reference<AsyncVar<bool>>( new AsyncVar<bool>(true) ),
-        true,
-        Reference<AsyncVar<bool>>( new AsyncVar<bool>(false) )
-    );
+    DDTeamCollection* collection =
+        new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream<RelocateShard>(),
+                             Reference<ShardsAffectedByTeamFailure>(new ShardsAffectedByTeamFailure()), conf, {}, {},
+                             Future<Void>(Void()), Reference<AsyncVar<bool>>(new AsyncVar<bool>(true)), true,
+                             Reference<AsyncVar<bool>>(new AsyncVar<bool>(false)), PromiseStream<GetMetricsRequest>());
 
     for (int id = 1; id <= processCount; ++id) {
         UID uid(id, 0);

@@ -4805,9 +4871,8 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, Reference<IReplication
     DDTeamCollection* collection =
         new DDTeamCollection(database, UID(0, 0), MoveKeysLock(), PromiseStream<RelocateShard>(),
                              Reference<ShardsAffectedByTeamFailure>(new ShardsAffectedByTeamFailure()), conf, {}, {},
-                             Future<Void>(Void()),
-                             Reference<AsyncVar<bool>>(new AsyncVar<bool>(true)), true,
-                             Reference<AsyncVar<bool>>(new AsyncVar<bool>(false)));
+                             Future<Void>(Void()), Reference<AsyncVar<bool>>(new AsyncVar<bool>(true)), true,
+                             Reference<AsyncVar<bool>>(new AsyncVar<bool>(false)), PromiseStream<GetMetricsRequest>());
 
     for (int id = 1; id <= processCount; id++) {
         UID uid(id, 0);
@@ -59,10 +59,12 @@ struct IDataDistributionTeam {
     virtual bool isWrongConfiguration() = 0;
     virtual void setWrongConfiguration(bool) = 0;
     virtual void addServers(const vector<UID> &servers) = 0;
+    virtual std::string getTeamID() = 0;
 
     std::string getDesc() {
         const auto& servers = getLastKnownServerInterfaces();
-        std::string s = format("Size %d; ", servers.size());
+        std::string s = format("TeamID:%s", getTeamID().c_str());
+        s += format("Size %d; ", servers.size());
         for(int i=0; i<servers.size(); i++) {
             if (i) s += ", ";
             s += servers[i].address().toString() + " " + servers[i].id().shortString();
@ -186,7 +188,7 @@ struct InitialDataDistribution : ReferenceCounted<InitialDataDistribution> {
|
|||
struct ShardMetrics {
|
||||
StorageMetrics metrics;
|
||||
double lastLowBandwidthStartTime;
|
||||
int shardCount;
|
||||
int shardCount; // number of smaller shards whose metrics are aggregated in the ShardMetrics
|
||||
|
||||
bool operator==(ShardMetrics const& rhs) const {
|
||||
return metrics == rhs.metrics && lastLowBandwidthStartTime == rhs.lastLowBandwidthStartTime &&
|
||||
|
@ -209,7 +211,8 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
|
|||
PromiseStream<GetMetricsRequest> getShardMetrics,
|
||||
FutureStream<Promise<int64_t>> getAverageShardBytes,
|
||||
Promise<Void> readyToStart, Reference<AsyncVar<bool>> zeroHealthyTeams,
|
||||
UID distributorId, KeyRangeMap<ShardTrackedData>* shards);
|
||||
UID distributorId, KeyRangeMap<ShardTrackedData>* shards,
|
||||
bool const* trackerCancelled);
|
||||
|
||||
ACTOR Future<Void> dataDistributionQueue(
|
||||
Database cx, PromiseStream<RelocateShard> output, FutureStream<RelocateShard> input,
|
||||
|
|
|
@ -18,8 +18,9 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Util.h"
|
||||
|
@ -83,10 +84,10 @@ struct RelocateData {
|
|||
|
||||
class ParallelTCInfo : public ReferenceCounted<ParallelTCInfo>, public IDataDistributionTeam {
|
||||
public:
|
||||
vector<Reference<IDataDistributionTeam>> teams;
|
||||
vector<UID> tempServerIDs;
|
||||
std::vector<Reference<IDataDistributionTeam>> teams;
|
||||
std::vector<UID> tempServerIDs;
|
||||
|
||||
ParallelTCInfo() { }
|
||||
ParallelTCInfo() {}
|
||||
|
||||
void addTeam(Reference<IDataDistributionTeam> team) {
|
||||
teams.push_back(team);
|
||||
|
@ -105,11 +106,11 @@ public:
|
|||
}
|
||||
|
||||
template<class T>
|
||||
vector<T> collect(std::function < vector<T>(Reference<IDataDistributionTeam>)> func) {
|
||||
vector<T> result;
|
||||
std::vector<T> collect(std::function<std::vector<T>(Reference<IDataDistributionTeam>)> func) {
|
||||
std::vector<T> result;
|
||||
|
||||
for (auto it = teams.begin(); it != teams.end(); it++) {
|
||||
vector<T> newItems = func(*it);
|
||||
std::vector<T> newItems = func(*it);
|
||||
result.insert(result.end(), newItems.begin(), newItems.end());
|
||||
}
|
||||
return result;
|
||||
|
@ -130,7 +131,7 @@ public:
|
|||
});
|
||||
}
|
||||
|
||||
virtual vector<StorageServerInterface> getLastKnownServerInterfaces() {
|
||||
virtual std::vector<StorageServerInterface> getLastKnownServerInterfaces() {
|
||||
return collect<StorageServerInterface>([](Reference<IDataDistributionTeam> team) {
|
||||
return team->getLastKnownServerInterfaces();
|
||||
});
|
||||
|
@ -144,10 +145,10 @@ public:
|
|||
return totalSize;
|
||||
}
|
||||
|
||||
virtual vector<UID> const& getServerIDs() {
|
||||
virtual std::vector<UID> const& getServerIDs() {
|
||||
tempServerIDs.clear();
|
||||
for (auto it = teams.begin(); it != teams.end(); it++) {
|
||||
vector<UID> const& childIDs = (*it)->getServerIDs();
|
||||
std::vector<UID> const& childIDs = (*it)->getServerIDs();
|
||||
tempServerIDs.insert(tempServerIDs.end(), childIDs.begin(), childIDs.end());
|
||||
}
|
||||
return tempServerIDs;
|
||||
|
@ -194,7 +195,7 @@ public:
|
|||
}
|
||||
|
||||
virtual Future<Void> updateStorageMetrics() {
|
||||
vector<Future<Void>> futures;
|
||||
std::vector<Future<Void>> futures;
|
||||
|
||||
for (auto it = teams.begin(); it != teams.end(); it++) {
|
||||
futures.push_back((*it)->updateStorageMetrics());
|
||||
|
@ -250,10 +251,19 @@ public:
|
|||
ASSERT(!teams.empty());
|
||||
teams[0]->addServers(servers);
|
||||
}
|
||||
|
||||
std::string getTeamID() override {
|
||||
std::string id;
|
||||
for (int i = 0; i < teams.size(); i++) {
|
||||
auto const& team = teams[i];
|
||||
id += (i == teams.size() - 1) ? team->getTeamID() : format("%s, ", team->getTeamID().c_str());
|
||||
}
|
||||
return id;
|
||||
}
|
||||
};
|
||||
|
||||
struct Busyness {
|
||||
vector<int> ledger;
|
||||
std::vector<int> ledger;
|
||||
|
||||
Busyness() : ledger( 10, 0 ) {}
|
||||
|
||||
|
@ -553,7 +563,7 @@ struct DDQueueData {
|
|||
|
||||
if(keyServersEntries.size() < SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS) {
|
||||
for( int shard = 0; shard < keyServersEntries.size(); shard++ ) {
|
||||
vector<UID> src, dest;
|
||||
std::vector<UID> src, dest;
|
||||
decodeKeyServersValue( keyServersEntries[shard].value, src, dest );
|
||||
ASSERT( src.size() );
|
||||
for( int i = 0; i < src.size(); i++ ) {
|
||||
|
@ -852,7 +862,7 @@ struct DDQueueData {
|
|||
startedHere++;
|
||||
|
||||
// update both inFlightActors and inFlight key range maps, cancelling deleted RelocateShards
|
||||
vector<KeyRange> ranges;
|
||||
std::vector<KeyRange> ranges;
|
||||
inFlightActors.getRangesAffectedByInsertion( rd.keys, ranges );
|
||||
inFlightActors.cancel( KeyRangeRef( ranges.front().begin, ranges.back().end ) );
|
||||
inFlight.insert( rd.keys, rd );
|
||||
|
@ -1036,6 +1046,9 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
|
|||
} else {
|
||||
TraceEvent(relocateShardInterval.severity, "RelocateShardHasDestination", distributorId)
|
||||
.detail("PairId", relocateShardInterval.pairID)
|
||||
.detail("KeyBegin", rd.keys.begin)
|
||||
.detail("KeyEnd", rd.keys.end)
|
||||
.detail("SourceServers", describe(rd.src))
|
||||
.detail("DestinationTeam", describe(destIds))
|
||||
.detail("ExtraIds", describe(extraIds));
|
||||
}
|
||||
|
@ -1421,7 +1434,7 @@ ACTOR Future<Void> dataDistributionQueue(
|
|||
state RelocateData launchData;
|
||||
state Future<Void> recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL);
|
||||
|
||||
state vector<Future<Void>> balancingFutures;
|
||||
state std::vector<Future<Void>> balancingFutures;
|
||||
|
||||
state ActorCollectionNoErrors actors;
|
||||
state PromiseStream<KeyRange> rangesComplete;
|
||||
|
|
|
@ -76,14 +76,43 @@ struct DataDistributionTracker {
|
|||
Promise<Void> readyToStart;
|
||||
Reference<AsyncVar<bool>> anyZeroHealthyTeams;
|
||||
|
||||
// The reference to trackerCancelled must be extracted by actors,
// because by the time trackerCancelled == true this memory can no longer
// be accessed
bool const& trackerCancelled;

// This class extracts the trackerCancelled reference from a DataDistributionTracker object.
// Because some actors spawned by the dataDistributionTracker outlive the DataDistributionTracker
// object, we must guard against memory errors by using a SafeAccessor functor to access
// the DataDistributionTracker object.
class SafeAccessor {
bool const& trackerCancelled;
DataDistributionTracker& tracker;

public:
SafeAccessor(DataDistributionTracker* tracker)
: trackerCancelled(tracker->trackerCancelled), tracker(*tracker) {
ASSERT(!trackerCancelled);
}

DataDistributionTracker* operator()() {
if (trackerCancelled) {
TEST(true); // Trying to access DataDistributionTracker after tracker has been cancelled
throw dd_tracker_cancelled();
}
return &tracker;
}
};
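Long-lived tracker actors now take a SafeAccessor by value and go through self() on every access, so a cancelled tracker surfaces as a dd_tracker_cancelled error instead of a use-after-free. A hypothetical actor illustrating the calling convention (modeled on trackShardBytes and shardTracker below; a sketch, not part of the commit):

// Sketch only: every dereference re-checks trackerCancelled via operator().
ACTOR Future<Void> exampleTrackedWork(DataDistributionTracker::SafeAccessor self, KeyRange keys) {
    loop {
        Transaction tr(self()->cx); // throws dd_tracker_cancelled if the tracker is gone
        wait(delay(1.0, TaskPriority::DataDistribution));
    }
}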

DataDistributionTracker(Database cx, UID distributorId, Promise<Void> const& readyToStart,
PromiseStream<RelocateShard> const& output,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
Reference<AsyncVar<bool>> anyZeroHealthyTeams, KeyRangeMap<ShardTrackedData>& shards)
Reference<AsyncVar<bool>> anyZeroHealthyTeams, KeyRangeMap<ShardTrackedData>& shards,
bool const& trackerCancelled)
: cx(cx), distributorId(distributorId), dbSizeEstimate(new AsyncVar<int64_t>()), systemSizeEstimate(0),
maxShardSize(new AsyncVar<Optional<int64_t>>()), sizeChanges(false), readyToStart(readyToStart), output(output),
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), anyZeroHealthyTeams(anyZeroHealthyTeams),
shards(shards) {}
shards(shards), trackerCancelled(trackerCancelled) {}

~DataDistributionTracker()
{

@@ -134,11 +163,8 @@ int64_t getMaxShardSize( double dbSizeEstimate ) {
(int64_t)SERVER_KNOBS->MAX_SHARD_BYTES);
}

ACTOR Future<Void> trackShardBytes(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize)
{
ACTOR Future<Void> trackShardBytes(DataDistributionTracker::SafeAccessor self, KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize) {
state BandwidthStatus bandwidthStatus = shardSize->get().present() ? getBandwidthStatus( shardSize->get().get().metrics ) : BandwidthStatusNormal;
state double lastLowBandwidthStartTime = shardSize->get().present() ? shardSize->get().get().lastLowBandwidthStartTime : now();
state int shardCount = shardSize->get().present() ? shardSize->get().get().shardCount : 1;

@@ -188,7 +214,8 @@ ACTOR Future<Void> trackShardBytes(
bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity;

loop {
Transaction tr(self->cx);
Transaction tr(self()->cx);
// metrics.second is the number of key-ranges (i.e., shards) in the 'keys' key-range
std::pair<Optional<StorageMetrics>, int> metrics = wait( tr.waitStorageMetrics( keys, bounds.min, bounds.max, bounds.permittedError, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, shardCount ) );
if(metrics.first.present()) {
BandwidthStatus newBandwidthStatus = getBandwidthStatus( metrics.first.get() );

@@ -211,9 +238,11 @@ ACTOR Future<Void> trackShardBytes(
.detail("TrackerID", trackerID);*/

if( shardSize->get().present() ) {
self->dbSizeEstimate->set( self->dbSizeEstimate->get() + metrics.first.get().bytes - shardSize->get().get().metrics.bytes );
self()->dbSizeEstimate->set(self()->dbSizeEstimate->get() + metrics.first.get().bytes -
shardSize->get().get().metrics.bytes);
if(keys.begin >= systemKeys.begin) {
self->systemSizeEstimate += metrics.first.get().bytes - shardSize->get().get().metrics.bytes;
self()->systemSizeEstimate +=
metrics.first.get().bytes - shardSize->get().get().metrics.bytes;
}
}

@@ -230,8 +259,9 @@ ACTOR Future<Void> trackShardBytes(
}
}
} catch( Error &e ) {
if (e.code() != error_code_actor_cancelled)
self->output.sendError(e); // Propagate failure to dataDistributionTracker
if (e.code() != error_code_actor_cancelled && e.code() != error_code_dd_tracker_cancelled) {
self()->output.sendError(e); // Propagate failure to dataDistributionTracker
}
throw e;
}
}

@@ -486,6 +516,8 @@ Future<Void> shardMerger(
shardsMerged++;

auto shardBounds = getShardSizeBounds( merged, maxShardSize );
// If we only recently got the current shard's metrics (i.e., less than DD_LOW_BANDWIDTH_DELAY ago), they
// may not be stable yet, so we cannot continue merging in this direction.
if( endingStats.bytes >= shardBounds.min.bytes ||
getBandwidthStatus( endingStats ) != BandwidthStatusLow ||
now() - lastLowBandwidthStartTime < SERVER_KNOBS->DD_LOW_BANDWIDTH_DELAY ||

@@ -516,13 +548,21 @@ Future<Void> shardMerger(
// restarting the shard tracker will dereference values in the shard map, so make a copy
KeyRange mergeRange = merged;

// OldKeys: Shards in the key range are merged as one shard defined by NewKeys;
// NewKeys: New key range after shards are merged;
// EndingSize: The new merged shard size in bytes;
// BatchedMerges: The number of shards merged. Each shard is defined in self->shards;
// LastLowBandwidthStartTime: When a shard's bandwidth status became BandwidthStatusLow. If that happened
// less than DD_LOW_BANDWIDTH_DELAY ago, the merging logic stops at the shard;
// ShardCount: The number of non-splittable shards that are merged. Each shard defined in self->shards may
// contain more than one non-splittable shard.
TraceEvent("RelocateShardMergeMetrics", self->distributorId)
.detail("OldKeys", keys)
.detail("NewKeys", mergeRange)
.detail("EndingSize", endingStats.bytes)
.detail("BatchedMerges", shardsMerged)
.detail("LastLowBandwidthStartTime", lastLowBandwidthStartTime)
.detail("ShardCount", shardCount);
.detail("OldKeys", keys)
.detail("NewKeys", mergeRange)
.detail("EndingSize", endingStats.bytes)
.detail("BatchedMerges", shardsMerged)
.detail("LastLowBandwidthStartTime", lastLowBandwidthStartTime)
.detail("ShardCount", shardCount);

if(mergeRange.begin < systemKeys.begin) {
self->systemSizeEstimate -= systemBytes;

@@ -584,18 +624,14 @@ ACTOR Future<Void> shardEvaluator(
return Void();
}

ACTOR Future<Void> shardTracker(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize)
{
wait( yieldedFuture(self->readyToStart.getFuture()) );
ACTOR Future<Void> shardTracker(DataDistributionTracker::SafeAccessor self, KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize) {
wait(yieldedFuture(self()->readyToStart.getFuture()));

if( !shardSize->get().present() )
wait( shardSize->onChange() );

if( !self->maxShardSize->get().present() )
wait( yieldedFuture(self->maxShardSize->onChange()) );
if (!self()->maxShardSize->get().present()) wait(yieldedFuture(self()->maxShardSize->onChange()));

// Since maxShardSize will become present for all shards at once, avoid slow tasks with a short delay
wait( delay( 0, TaskPriority::DataDistribution ) );

@@ -603,26 +639,27 @@ ACTOR Future<Void> shardTracker(
// Survives multiple calls to shardEvaluator and keeps merges from happening too quickly.
state Reference<HasBeenTrueFor> wantsToMerge( new HasBeenTrueFor( shardSize->get() ) );

/*TraceEvent("ShardTracker", self->distributorId)
.detail("Begin", keys.begin)
.detail("End", keys.end)
.detail("TrackerID", trackerID)
.detail("MaxBytes", self->maxShardSize->get().get())
.detail("ShardSize", shardSize->get().get().bytes)
.detail("BytesPerKSec", shardSize->get().get().bytesPerKSecond);*/
/*TraceEvent("ShardTracker", self()->distributorId)
.detail("Begin", keys.begin)
.detail("End", keys.end)
.detail("TrackerID", trackerID)
.detail("MaxBytes", self()->maxShardSize->get().get())
.detail("ShardSize", shardSize->get().get().bytes)
.detail("BytesPerKSec", shardSize->get().get().bytesPerKSecond);*/

try {
loop {
// Use the current known size to check for (and start) splits and merges.
wait( shardEvaluator( self, keys, shardSize, wantsToMerge ) );
wait(shardEvaluator(self(), keys, shardSize, wantsToMerge));

// We could have a lot of actors being released from the previous wait at the same time. Immediately calling
// delay(0) mitigates the resulting SlowTask
wait( delay(0, TaskPriority::DataDistribution) );
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
self->output.sendError(e); // Propagate failure to dataDistributionTracker
if (e.code() != error_code_actor_cancelled && e.code() != error_code_dd_tracker_cancelled) {
self()->output.sendError(e); // Propagate failure to dataDistributionTracker
}
throw e;
}
}

@@ -653,8 +690,8 @@ void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optio

ShardTrackedData data;
data.stats = shardSize;
data.trackShard = shardTracker( self, ranges[i], shardSize );
data.trackBytes = trackShardBytes( self, ranges[i], shardSize );
data.trackShard = shardTracker(DataDistributionTracker::SafeAccessor(self), ranges[i], shardSize);
data.trackBytes = trackShardBytes(DataDistributionTracker::SafeAccessor(self), ranges[i], shardSize);
self->shards.insert( ranges[i], data );
}
}

@@ -728,9 +765,10 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
PromiseStream<GetMetricsRequest> getShardMetrics,
FutureStream<Promise<int64_t>> getAverageShardBytes,
Promise<Void> readyToStart, Reference<AsyncVar<bool>> anyZeroHealthyTeams,
UID distributorId, KeyRangeMap<ShardTrackedData>* shards) {
UID distributorId, KeyRangeMap<ShardTrackedData>* shards,
bool const* trackerCancelled) {
state DataDistributionTracker self(cx, distributorId, readyToStart, output, shardsAffectedByTeamFailure,
anyZeroHealthyTeams, *shards);
anyZeroHealthyTeams, *shards, *trackerCancelled);
state Future<Void> loggingTrigger = Void();
try {
wait( trackInitialShards( &self, initData ) );

@@ -221,6 +221,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( DD_ENABLE_VERBOSE_TRACING, false ); if( randomize && BUGGIFY ) DD_ENABLE_VERBOSE_TRACING = true;
init( DD_TEAMS_INFO_PRINT_INTERVAL, 60 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_INTERVAL = 10;
init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1;
init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5;

// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true

@@ -343,6 +344,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( MAX_PROXY_COMPUTE, 2.0 );
init( PROXY_COMPUTE_BUCKETS, 20000 );
init( PROXY_COMPUTE_GROWTH_RATE, 0.01 );
init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true );

init( RESET_MASTER_BATCHES, 200 );
init( RESET_RESOLVER_BATCHES, 200 );

@@ -508,6 +510,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( BEHIND_CHECK_COUNT, 2 );
init( BEHIND_CHECK_VERSIONS, 5 * VERSIONS_PER_SECOND );
init( WAIT_METRICS_WRONG_SHARD_CHANCE, isSimulated ? 1.0 : 0.1 );
init( REPORT_DD_METRICS, true );
init( DD_METRICS_REPORT_INTERVAL, 30.0 );
init( FETCH_KEYS_TOO_LONG_TIME_CRITERIA, 300.0 );

//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;

@@ -536,6 +541,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( MAX_STATUS_REQUESTS_PER_SECOND, 256.0 );
init( CONFIGURATION_ROWS_TO_FETCH, 20000 );
init( DISABLE_DUPLICATE_LOG_WARNING, false );
init( HISTOGRAM_REPORT_INTERVAL, 300.0 );

// IPager
init( PAGER_RESERVED_PAGES, 1 );

@@ -184,6 +184,7 @@ public:
bool DD_ENABLE_VERBOSE_TRACING;
int DD_TEAMS_INFO_PRINT_INTERVAL;
int DD_TEAMS_INFO_PRINT_YIELD_COUNT;
int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY;

// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor

@@ -288,6 +289,7 @@ public:
double MAX_PROXY_COMPUTE;
int PROXY_COMPUTE_BUCKETS;
double PROXY_COMPUTE_GROWTH_RATE;
bool PROXY_REJECT_BATCH_QUEUED_TOO_LONG;

int RESET_MASTER_BATCHES;
int RESET_RESOLVER_BATCHES;

@@ -451,6 +453,9 @@ public:
int BEHIND_CHECK_COUNT;
int64_t BEHIND_CHECK_VERSIONS;
double WAIT_METRICS_WRONG_SHARD_CHANCE;
bool REPORT_DD_METRICS;
double DD_METRICS_REPORT_INTERVAL;
double FETCH_KEYS_TOO_LONG_TIME_CRITERIA;

//Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;

@@ -479,6 +484,7 @@ public:
double MAX_STATUS_REQUESTS_PER_SECOND;
int CONFIGURATION_ROWS_TO_FETCH;
bool DISABLE_DUPLICATE_LOG_WARNING;
double HISTOGRAM_REPORT_INTERVAL;

// IPager
int PAGER_RESERVED_PAGES;

@@ -30,6 +30,8 @@
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/RecoveryState.h"
#include "fdbclient/Atomic.h"
#include "flow/Arena.h"
#include "flow/Histogram.h"
#include "flow/TDMetric.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

@@ -75,20 +77,26 @@ struct LogRouterData {

UID dbgid;
Reference<AsyncVar<Reference<ILogSystem>>> logSystem;
NotifiedVersion version;
NotifiedVersion minPopped;
Optional<UID> primaryPeekLocation;
NotifiedVersion version; // The largest version at which the log router has peeked mutations
// from satellite tLog or primary tLogs.
NotifiedVersion minPopped; // The minimum version among all tags that has been popped by remote tLogs.
Version startVersion;
Version minKnownCommittedVersion;
Version minKnownCommittedVersion; // The minimum durable version among all LRs.
// A LR's durable version is the maximum version of mutations that have been
// popped by remote tLog.
Version poppedVersion;
Deque<std::pair<Version, Standalone<VectorRef<uint8_t>>>> messageBlocks;
Tag routerTag;
bool allowPops;
LogSet logSet;
bool foundEpochEnd;
double waitForVersionTime = 0;
double maxWaitForVersionTime = 0;
double getMoreTime = 0;
double maxGetMoreTime = 0;
bool foundEpochEnd; // Cluster is not fully recovered yet. LR has to handle recovery
double waitForVersionTime = 0; // The total amount of time LR waits for remote tLog to peek and pop its data.
double maxWaitForVersionTime = 0; // The max one-instance wait time when LR must wait for remote tLog to pop data.
double getMoreTime = 0; // The total amount of time LR waits for satellite tLog's data to become available.
double maxGetMoreTime = 0; // The max wait time LR spent in a pull-data-request to satellite tLog.
int64_t generation = -1;
Reference<Histogram> peekLatencyDist;

struct PeekTrackerData {
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;

@@ -98,7 +106,9 @@ struct LogRouterData {
std::map<UID, PeekTrackerData> peekTracker;

CounterCollection cc;
Counter getMoreCount, getMoreBlockedCount;
Counter getMoreCount; // Incremented each time the LR tries to pull data from the satellite tLog.
Counter getMoreBlockedCount; // Incremented when data is not yet available as the LR tries to pull it from the satellite tLog.
Future<Void> logger;
Reference<EventCacheHolder> eventCacheHolder;

@@ -119,9 +129,14 @@ struct LogRouterData {
return newTagData;
}

LogRouterData(UID dbgid, const InitializeLogRouterRequest& req) : dbgid(dbgid), routerTag(req.routerTag), logSystem(new AsyncVar<Reference<ILogSystem>>()),
version(req.startVersion-1), minPopped(0), startVersion(req.startVersion), allowPops(false), minKnownCommittedVersion(0), poppedVersion(0), foundEpochEnd(false),
cc("LogRouter", dbgid.toString()), getMoreCount("GetMoreCount", cc), getMoreBlockedCount("GetMoreBlockedCount", cc) {
LogRouterData(UID dbgid, const InitializeLogRouterRequest& req)
: dbgid(dbgid), routerTag(req.routerTag), logSystem(new AsyncVar<Reference<ILogSystem>>()),
version(req.startVersion - 1), minPopped(0), generation(req.recoveryCount), startVersion(req.startVersion),
allowPops(false), minKnownCommittedVersion(0), poppedVersion(0), foundEpochEnd(false),
cc("LogRouter", dbgid.toString()), getMoreCount("GetMoreCount", cc),
getMoreBlockedCount("GetMoreBlockedCount", cc),
peekLatencyDist(Histogram::getHistogram(LiteralStringRef("LogRouter"), LiteralStringRef("PeekTLogLatency"),
Histogram::Unit::microseconds)) {
//setup just enough of a logSet to be able to call getPushLocations
logSet.logServers.resize(req.tLogLocalities.size());
logSet.tLogPolicy = req.tLogPolicy;

@@ -138,8 +153,10 @@ struct LogRouterData {

eventCacheHolder = Reference<EventCacheHolder>( new EventCacheHolder(dbgid.shortString() + ".PeekLocation") );

specialCounter(cc, "Version", [this](){ return this->version.get(); });
// FetchedVersions: How many versions of mutations are buffered at the LR and have not been popped by remote tLogs
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "MinPopped", [this](){ return this->minPopped.get(); });
// TODO: Add minPopped locality and minPoppedId, similar to the tLog metrics
specialCounter(cc, "FetchedVersions", [this](){ return std::max<Version>(0, std::min<Version>(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS, this->version.get() - this->minPopped.get())); });
specialCounter(cc, "MinKnownCommittedVersion", [this](){ return this->minKnownCommittedVersion; });
specialCounter(cc, "PoppedVersion", [this](){ return this->poppedVersion; });

@@ -148,7 +165,12 @@ struct LogRouterData {
specialCounter(cc, "WaitForVersionMaxMS", [this](){ double val = this->maxWaitForVersionTime; this->maxWaitForVersionTime = 0; return 1000*val; });
specialCounter(cc, "GetMoreMS", [this](){ double val = this->getMoreTime; this->getMoreTime = 0; return 1000*val; });
specialCounter(cc, "GetMoreMaxMS", [this](){ double val = this->maxGetMoreTime; this->maxGetMoreTime = 0; return 1000*val; });
logger = traceCounters("LogRouterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LogRouterMetrics");
specialCounter(cc, "Generation", [this]() { return this->generation; });
logger = traceCounters("LogRouterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc,
"LogRouterMetrics", [this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
}
};

@@ -207,8 +229,15 @@ ACTOR Future<Void> waitForVersion( LogRouterData *self, Version ver ) {
// Since one set of log routers is created per generation of transaction logs, the gap caused by epoch end will be within MAX_VERSIONS_IN_FLIGHT of the log routers start version.
state double startTime = now();
if(self->version.get() < self->startVersion) {
// The log router must wait for remote tLogs to process data whose versions are less than self->startVersion
// before it can pull more data (i.e., data after self->startVersion) from the satellite tLog;
// this prevents the LR from running out of memory by pulling too much data from the satellite tLog at once.
// Note: each commit writes data to both the primary tLog and the satellite tLog, so the satellite tLog can be
// viewed as a part of the primary tLogs.
if(ver > self->startVersion) {
self->version.set(self->startVersion);
// Wait for remote tLog to peek and pop from LR,
// so that LR's minPopped version can increase to self->startVersion
wait(self->minPopped.whenAtLeast(self->version.get()));
}
self->waitForVersionTime += now() - startTime;

@@ -216,6 +245,9 @@ ACTOR Future<Void> waitForVersion( LogRouterData *self, Version ver ) {
return Void();
}
if(!self->foundEpochEnd) {
// Similar to the proxy, which does not keep more than MAX_READ_TRANSACTION_LIFE_VERSIONS transactions outstanding,
// the log router does not keep more than MAX_READ_TRANSACTION_LIFE_VERSIONS transactions outstanding, because
// remote SSes cannot roll back to more than MAX_READ_TRANSACTION_LIFE_VERSIONS ago.
wait(self->minPopped.whenAtLeast(std::min(self->version.get(), ver - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS)));
} else {
while(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < ver) {

@@ -235,6 +267,7 @@ ACTOR Future<Void> waitForVersion( LogRouterData *self, Version ver ) {
return Void();
}

// The log router pulls data from the satellite tLog
ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
state Future<Void> dbInfoChange = Void();
state Reference<ILogSystem::IPeekCursor> r;

@@ -256,14 +289,17 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
state double startTime = now();
choose {
when(wait( getMoreF ) ) {
self->getMoreTime += now() - startTime;
self->maxGetMoreTime = std::max(self->maxGetMoreTime, now() - startTime);
double peekTime = now() - startTime;
self->peekLatencyDist->sampleSeconds(peekTime);
self->getMoreTime += peekTime;
self->maxGetMoreTime = std::max(self->maxGetMoreTime, peekTime);
break;
}
when( wait( dbInfoChange ) ) { //FIXME: does this actually happen?
if(r) tagPopped = std::max(tagPopped, r->popped());
if( self->logSystem->get() ) {
r = self->logSystem->get()->peekLogRouter( self->dbgid, tagAt, self->routerTag );
self->primaryPeekLocation = r->getPrimaryPeekLocation();
TraceEvent("LogRouterPeekLocation", self->dbgid).detail("LogID", r->getPrimaryPeekLocation()).trackLatest(self->eventCacheHolder->trackingKey);
} else {
r = Reference<ILogSystem::IPeekCursor>();
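The peekLatencyDist wiring above follows the flow/Histogram.h pattern: fetch a named histogram once, then feed it one sample per measured interval. A minimal sketch of that pattern, assuming only the getHistogram/sampleSeconds calls that appear in this diff:

// Sketch of the Histogram usage pattern introduced by this commit.
Reference<Histogram> latencyDist = Histogram::getHistogram(
    LiteralStringRef("LogRouter"), LiteralStringRef("PeekTLogLatency"), Histogram::Unit::microseconds);

double start = now();
// ... perform the peek ...
latencyDist->sampleSeconds(now() - start); // converted and bucketed by the histogram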

@@ -564,6 +600,7 @@ ACTOR Future<Void> logRouterCore(
addActor.send( logRouterPeekMessages( &logRouterData, req ) );
}
when( TLogPopRequest req = waitNext( interf.popMessages.getFuture() ) ) {
// Request from remote tLog to pop data from LR
addActor.send( logRouterPop( &logRouterData, req ) );
}
when (wait(error)) {}

@@ -138,8 +138,20 @@ ACTOR Future<Void> resetChecker( ILogSystem::ServerPeekCursor* self, NetworkAddr
self->unknownReplies = 0;
self->fastReplies = 0;
wait(delay(SERVER_KNOBS->PEEK_STATS_INTERVAL));
TraceEvent("SlowPeekStats").detail("PeerAddress", addr).detail("SlowReplies", self->slowReplies).detail("FastReplies", self->fastReplies).detail("UnknownReplies", self->unknownReplies);
if(self->slowReplies >= SERVER_KNOBS->PEEK_STATS_SLOW_AMOUNT && self->slowReplies/double(self->slowReplies+self->fastReplies) >= SERVER_KNOBS->PEEK_STATS_SLOW_RATIO) {
TraceEvent("SlowPeekStats", self->randomID)
.detail("PeerAddress", addr)
.detail("SlowReplies", self->slowReplies)
.detail("FastReplies", self->fastReplies)
.detail("UnknownReplies", self->unknownReplies);

if (self->slowReplies >= SERVER_KNOBS->PEEK_STATS_SLOW_AMOUNT &&
self->slowReplies / double(self->slowReplies + self->fastReplies) >= SERVER_KNOBS->PEEK_STATS_SLOW_RATIO) {

TraceEvent("ConnectionResetSlowPeek", self->randomID)
.detail("PeerAddress", addr)
.detail("SlowReplies", self->slowReplies)
.detail("FastReplies", self->fastReplies)
.detail("UnknownReplies", self->unknownReplies);
FlowTransport::transport().resetConnection(addr);
self->lastReset = now();
}

@@ -63,6 +63,14 @@ struct ProxyStats {
Counter conflictRanges;
Counter keyServerLocationIn, keyServerLocationOut, keyServerLocationErrors;
Version lastCommitVersionAssigned;
double transactionRateAllowed, batchTransactionRateAllowed;
double transactionLimit, batchTransactionLimit;
// How much of the GRV request queue was processed in one attempt to hand out read versions.
double percentageOfDefaultGRVQueueProcessed;
double percentageOfBatchGRVQueueProcessed;

LatencySample defaultTxnGRVTimeInQueue;
LatencySample batchTxnGRVTimeInQueue;

LatencySample commitLatencySample;
LatencySample grvLatencySample;

@@ -72,24 +80,56 @@ struct ProxyStats {

Future<Void> logger;

explicit ProxyStats(UID id, Version* pVersion, NotifiedVersion* pCommittedVersion, int64_t *commitBatchesMemBytesCountPtr)
: cc("ProxyStats", id.toString()), txnRequestIn("TxnRequestIn", cc), txnRequestOut("TxnRequestOut", cc), txnRequestErrors("TxnRequestErrors", cc),
txnStartIn("TxnStartIn", cc), txnStartOut("TxnStartOut", cc), txnStartBatch("TxnStartBatch", cc), txnSystemPriorityStartIn("TxnSystemPriorityStartIn", cc), txnSystemPriorityStartOut("TxnSystemPriorityStartOut", cc), txnBatchPriorityStartIn("TxnBatchPriorityStartIn", cc), txnBatchPriorityStartOut("TxnBatchPriorityStartOut", cc),
txnDefaultPriorityStartIn("TxnDefaultPriorityStartIn", cc), txnDefaultPriorityStartOut("TxnDefaultPriorityStartOut", cc), txnCommitIn("TxnCommitIn", cc), txnCommitVersionAssigned("TxnCommitVersionAssigned", cc), txnCommitResolving("TxnCommitResolving", cc), txnCommitResolved("TxnCommitResolved", cc), txnCommitOut("TxnCommitOut", cc),
txnCommitOutSuccess("TxnCommitOutSuccess", cc), txnCommitErrors("TxnCommitErrors", cc), txnConflicts("TxnConflicts", cc), commitBatchIn("CommitBatchIn", cc), commitBatchOut("CommitBatchOut", cc), mutationBytes("MutationBytes", cc), mutations("Mutations", cc), conflictRanges("ConflictRanges", cc), keyServerLocationIn("KeyServerLocationIn", cc), keyServerLocationOut("KeyServerLocationOut", cc), keyServerLocationErrors("KeyServerLocationErrors", cc),
lastCommitVersionAssigned(0), commitLatencySample("CommitLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE), grvLatencySample("GRVLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
commitLatencyBands("CommitLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY), grvLatencyBands("GRVLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY)
{
explicit ProxyStats(UID id, Version* pVersion, NotifiedVersion* pCommittedVersion,
int64_t* commitBatchesMemBytesCountPtr)
: cc("ProxyStats", id.toString()), txnRequestIn("TxnRequestIn", cc), txnRequestOut("TxnRequestOut", cc),
txnRequestErrors("TxnRequestErrors", cc), txnStartIn("TxnStartIn", cc), txnStartOut("TxnStartOut", cc),
txnStartBatch("TxnStartBatch", cc), txnSystemPriorityStartIn("TxnSystemPriorityStartIn", cc),
txnSystemPriorityStartOut("TxnSystemPriorityStartOut", cc),
txnBatchPriorityStartIn("TxnBatchPriorityStartIn", cc),
txnBatchPriorityStartOut("TxnBatchPriorityStartOut", cc),
txnDefaultPriorityStartIn("TxnDefaultPriorityStartIn", cc),
txnDefaultPriorityStartOut("TxnDefaultPriorityStartOut", cc), txnCommitIn("TxnCommitIn", cc),
txnCommitVersionAssigned("TxnCommitVersionAssigned", cc), txnCommitResolving("TxnCommitResolving", cc),
txnCommitResolved("TxnCommitResolved", cc), txnCommitOut("TxnCommitOut", cc),
txnCommitOutSuccess("TxnCommitOutSuccess", cc), txnCommitErrors("TxnCommitErrors", cc),
txnConflicts("TxnConflicts", cc), commitBatchIn("CommitBatchIn", cc), commitBatchOut("CommitBatchOut", cc),
mutationBytes("MutationBytes", cc), mutations("Mutations", cc), conflictRanges("ConflictRanges", cc),
keyServerLocationIn("KeyServerLocationIn", cc), keyServerLocationOut("KeyServerLocationOut", cc),
keyServerLocationErrors("KeyServerLocationErrors", cc), lastCommitVersionAssigned(0),
commitLatencySample("CommitLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
grvLatencySample("GRVLatencyMetrics", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
commitLatencyBands("CommitLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
grvLatencyBands("GRVLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
defaultTxnGRVTimeInQueue("DefaultTxnGRVTimeInQueue", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
batchTxnGRVTimeInQueue("BatchTxnGRVTimeInQueue", id, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
transactionRateAllowed(0), batchTransactionRateAllowed(0), transactionLimit(0), batchTransactionLimit(0),
percentageOfDefaultGRVQueueProcessed(0), percentageOfBatchGRVQueueProcessed(0) {
specialCounter(cc, "LastAssignedCommitVersion", [this](){return this->lastCommitVersionAssigned;});
specialCounter(cc, "Version", [pVersion](){return *pVersion; });
specialCounter(cc, "CommittedVersion", [pCommittedVersion](){ return pCommittedVersion->get(); });
specialCounter(cc, "CommitBatchesMemBytesCount", [commitBatchesMemBytesCountPtr]() { return *commitBatchesMemBytesCountPtr; });
// The rate at which the limit (budget) is allowed to grow.
specialCounter(cc, "SystemAndDefaultTxnRateAllowed", [this]() { return this->transactionRateAllowed; });
specialCounter(cc, "BatchTransactionRateAllowed", [this]() { return this->batchTransactionRateAllowed; });
specialCounter(cc, "SystemAndDefaultTxnLimit", [this]() { return this->transactionLimit; });
specialCounter(cc, "BatchTransactionLimit", [this]() { return this->batchTransactionLimit; });
specialCounter(cc, "PercentageOfDefaultGRVQueueProcessed",
[this]() { return this->percentageOfDefaultGRVQueueProcessed; });
specialCounter(cc, "PercentageOfBatchGRVQueueProcessed",
[this]() { return this->percentageOfBatchGRVQueueProcessed; });
logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics");
}
};

ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, double* outTransactionRate,
double* outBatchTransactionRate, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) {
ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount,
int64_t* inBatchTransactionCount, double* outTransactionRate,
double* outBatchTransactionRate, GetHealthMetricsReply* healthMetricsReply,
GetHealthMetricsReply* detailedHealthMetricsReply, ProxyStats* stats) {
state Future<Void> nextRequestTimer = Never();
state Future<Void> leaseTimeout = Never();
state Future<GetRateInfoReply> reply = Never();

@@ -120,7 +160,14 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
reply = Never();
*outTransactionRate = rep.transactionRate;
*outBatchTransactionRate = rep.batchTransactionRate;
//TraceEvent("MasterProxyRate", myID).detail("Rate", rep.transactionRate).detail("BatchRate", rep.batchTransactionRate).detail("Lease", rep.leaseDuration).detail("ReleasedTransactions", *inTransactionCount - lastTC);
stats->transactionRateAllowed = rep.transactionRate;
stats->batchTransactionRateAllowed = rep.batchTransactionRate;
// TraceEvent("MasterProxyTxRate", myID)
// .detail("RKID", db->get().ratekeeper.get().id())
// .detail("RateAllowed", rep.transactionRate)
// .detail("BatchRateAllowed", rep.batchTransactionRate)
// .detail("Lease", rep.leaseDuration)
// .detail("ReleasedTransactions", *inTransactionCount - lastTC);
lastTC = *inTransactionCount;
leaseTimeout = delay(rep.leaseDuration);
nextRequestTimer = delayJittered(rep.leaseDuration / 2);

@@ -520,6 +567,20 @@ ACTOR Future<Void> releaseResolvingAfter(ProxyCommitData* self, Future<Void> rel
return Void();
}

// Try to identify recovery transactions and backup's apply-mutations (blind writes).
// Neither may be rejected; both are approximated by looking for a first mutation
// that starts with 0xff.
bool canReject(const std::vector<CommitTransactionRequest>& trs) {
for (const auto& tr : trs) {
if (tr.transaction.mutations.empty()) continue;
if (tr.transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff")) ||
tr.transaction.read_conflict_ranges.empty()) {
return false;
}
}
return true;
}
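As a concrete reading of this check: a batch containing one transaction whose first mutation writes a key beginning with \xff (a recovery-style blind write to the system keyspace) makes canReject return false, and so does any transaction with no read conflict ranges; in both cases the whole batch must proceed to commit rather than be rejected.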

ACTOR Future<Void> commitBatch(
ProxyCommitData* self,
vector<CommitTransactionRequest> trs,

@@ -565,9 +626,13 @@ ACTOR Future<Void> commitBatch(

if (debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.Before");
state double timeStart = now();

if(localBatchNumber-self->latestLocalCommitBatchResolving.get()>SERVER_KNOBS->RESET_MASTER_BATCHES && now()-self->lastMasterReset>SERVER_KNOBS->RESET_MASTER_DELAY) {
TraceEvent(SevWarnAlways, "ResetMasterNetwork").detail("CurrentBatch", localBatchNumber).detail("InProcessBatch", self->latestLocalCommitBatchResolving.get());
TraceEvent(SevWarnAlways, "ConnectionResetMaster", self->dbgid)
.detail("PeerAddress", self->master.address())
.detail("CurrentBatch", localBatchNumber)
.detail("InProcessBatch", self->latestLocalCommitBatchResolving.get());
FlowTransport::transport().resetConnection(self->master.address());
self->lastMasterReset=now();
}

@@ -575,6 +640,32 @@ ACTOR Future<Void> commitBatch(
/////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield)
TEST(self->latestLocalCommitBatchResolving.get() < localBatchNumber-1); // Queuing pre-resolution commit processing
wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1));
double queuingDelay = g_network->now() - timeStart;
if ((queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND ||
(g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01))) &&
SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) {
// Disabled for the recovery transaction; otherwise, recovery can't finish and keeps doing more recoveries.
TEST(true); // Reject transactions in the batch
TraceEvent(SevWarnAlways, "ProxyReject", self->dbgid)
.suppressFor(0.1)
.detail("QDelay", queuingDelay)
.detail("Transactions", trs.size())
.detail("BatchNumber", localBatchNumber);
ASSERT(self->latestLocalCommitBatchResolving.get() == localBatchNumber - 1);
self->latestLocalCommitBatchResolving.set(localBatchNumber);

wait(self->latestLocalCommitBatchLogging.whenAtLeast(localBatchNumber-1));
ASSERT(self->latestLocalCommitBatchLogging.get() == localBatchNumber - 1);
self->latestLocalCommitBatchLogging.set(localBatchNumber);
for (const auto& tr : trs) {
tr.reply.sendError(transaction_too_old());
}
++self->stats.commitBatchOut;
self->stats.txnCommitOut += trs.size();
self->stats.txnConflicts += trs.size();
return Void();
}
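For a sense of scale: assuming the usual knob defaults of VERSIONS_PER_SECOND = 1e6 and MAX_READ_TRANSACTION_LIFE_VERSIONS = 5 * VERSIONS_PER_SECOND, the rejection threshold works out to 5e6 / 1e6 = 5 seconds. A batch that has already waited in the resolution queue longer than the read-transaction lifetime would largely fail with transaction_too_old anyway, so rejecting it up front sheds load without changing outcomes.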

state Future<Void> releaseDelay = delay(std::min(SERVER_KNOBS->MAX_PROXY_COMPUTE, batchOperations*self->commitComputePerOperation[latencyBucket]), TaskPriority::ProxyMasterVersionReply);

if (debugID.present())

@@ -628,9 +719,14 @@ ACTOR Future<Void> commitBatch(
state vector<vector<int>> transactionResolverMap = std::move( requests.transactionResolverMap );
state Future<Void> releaseFuture = releaseResolvingAfter(self, releaseDelay, localBatchNumber);

if(localBatchNumber-self->latestLocalCommitBatchLogging.get()>SERVER_KNOBS->RESET_RESOLVER_BATCHES && now()-self->lastResolverReset>SERVER_KNOBS->RESET_RESOLVER_DELAY) {
TraceEvent(SevWarnAlways, "ResetResolverNetwork").detail("CurrentBatch", localBatchNumber).detail("InProcessBatch", self->latestLocalCommitBatchLogging.get());
if (localBatchNumber - self->latestLocalCommitBatchLogging.get() > SERVER_KNOBS->RESET_RESOLVER_BATCHES &&
now() - self->lastResolverReset > SERVER_KNOBS->RESET_RESOLVER_DELAY) {

for (int r = 0; r<self->resolvers.size(); r++) {
TraceEvent(SevWarnAlways, "ConnectionResetResolver", self->dbgid)
.detail("PeerAddr", self->resolvers[r].address())
.detail("CurrentBatch", localBatchNumber)
.detail("InProcessBatch", self->latestLocalCommitBatchLogging.get());
FlowTransport::transport().resetConnection(self->resolvers[r].address());
}
self->lastResolverReset=now();

@@ -1258,13 +1354,10 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture, std::
return Void();
}

ACTOR static Future<Void> transactionStarter(
MasterProxyInterface proxy,
Reference<AsyncVar<ServerDBInfo>> db,
PromiseStream<Future<Void>> addActor,
ProxyCommitData* commitData, GetHealthMetricsReply* healthMetricsReply,
GetHealthMetricsReply* detailedHealthMetricsReply)
{
ACTOR static Future<Void> transactionStarter(MasterProxyInterface proxy, Reference<AsyncVar<ServerDBInfo>> db,
PromiseStream<Future<Void>> addActor, ProxyCommitData* commitData,
GetHealthMetricsReply* healthMetricsReply,
GetHealthMetricsReply* detailedHealthMetricsReply, ProxyStats* stats) {
state double lastGRVTime = 0;
state PromiseStream<Void> GRVTimer;
state double GRVBatchTime = SERVER_KNOBS->START_TRANSACTION_BATCH_INTERVAL_MIN;

@@ -1280,7 +1373,8 @@ ACTOR static Future<Void> transactionStarter(
state vector<MasterProxyInterface> otherProxies;

state PromiseStream<double> replyTimes;
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo.rate, &batchRateInfo.rate, healthMetricsReply, detailedHealthMetricsReply));
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo.rate,
&batchRateInfo.rate, healthMetricsReply, detailedHealthMetricsReply, stats));
addActor.send(queueTransactionStartRequests(&systemQueue, &defaultQueue, &batchQueue, proxy.getConsistentReadVersion.getFuture(), GRVTimer, &lastGRVTime, &GRVBatchTime, replyTimes.getFuture(), &commitData->stats));

// Get a list of the other proxies that go together with us

@@ -1307,6 +1401,9 @@ ACTOR static Future<Void> transactionStarter(
normalRateInfo.reset(elapsed);
batchRateInfo.reset(elapsed);

stats->transactionLimit = normalRateInfo.limit;
stats->batchTransactionLimit = batchRateInfo.limit;

int transactionsStarted[2] = {0,0};
int systemTransactionsStarted[2] = {0,0};
int defaultPriTransactionsStarted[2] = { 0, 0 };

@@ -1317,6 +1414,8 @@ ACTOR static Future<Void> transactionStarter(

int requestsToStart = 0;

uint32_t defaultQueueSize = defaultQueue.size();
uint32_t batchQueueSize = batchQueue.size();
while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) {
Deque<GetReadVersionRequest>* transactionQueue;
if(!systemQueue.empty()) {

@@ -1345,12 +1444,16 @@ ACTOR static Future<Void> transactionStarter(
}

transactionsStarted[req.flags&1] += tc;
if (req.priority() >= GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE)
double currentTime = g_network->timer();
if (req.priority() >= GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE) {
systemTransactionsStarted[req.flags & 1] += tc;
else if (req.priority() >= GetReadVersionRequest::PRIORITY_DEFAULT)
} else if (req.priority() >= GetReadVersionRequest::PRIORITY_DEFAULT) {
defaultPriTransactionsStarted[req.flags & 1] += tc;
else
stats->defaultTxnGRVTimeInQueue.addMeasurement(currentTime - req.requestTime());
} else {
batchPriTransactionsStarted[req.flags & 1] += tc;
stats->batchTxnGRVTimeInQueue.addMeasurement(currentTime - req.requestTime());
}

start[req.flags & 1].push_back(std::move(req)); static_assert(GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY == 1, "Implementation dependent on flag value");
transactionQueue->pop_front();

@@ -1382,6 +1485,8 @@ ACTOR static Future<Void> transactionStarter(
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.masterProxyServerCore.Broadcast");
}

int defaultGRVProcessed = 0;
int batchGRVProcessed = 0;
for (int i = 0; i < start.size(); i++) {
if (start[i].size()) {
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(commitData, i, &otherProxies, debugID, transactionsStarted[i], systemTransactionsStarted[i], defaultPriTransactionsStarted[i], batchPriTransactionsStarted[i]);

@@ -1391,8 +1496,12 @@ ACTOR static Future<Void> transactionStarter(
if (i == 0) {
addActor.send(timeReply(readVersionReply, replyTimes));
}
defaultGRVProcessed += defaultPriTransactionsStarted[i];
batchGRVProcessed += batchPriTransactionsStarted[i];
}
}
stats->percentageOfDefaultGRVQueueProcessed = (double)defaultGRVProcessed / defaultQueueSize;
stats->percentageOfBatchGRVQueueProcessed = (double)batchGRVProcessed / batchQueueSize;
}
}

@@ -1747,7 +1856,8 @@ ACTOR Future<Void> masterProxyServerCore(
TraceEvent(SevInfo, "CommitBatchesMemoryLimit").detail("BytesLimit", commitBatchesMemoryLimit);

addActor.send(monitorRemoteCommitted(&commitData));
addActor.send(transactionStarter(proxy, commitData.db, addActor, &commitData, &healthMetricsReply, &detailedHealthMetricsReply));
addActor.send(transactionStarter(proxy, commitData.db, addActor, &commitData, &healthMetricsReply,
&detailedHealthMetricsReply, &commitData.stats));
addActor.send(readRequestServer(proxy, addActor, &commitData));
addActor.send(rejoinServer(proxy, &commitData));
addActor.send(healthMetricsRequestServer(proxy, &healthMetricsReply, &detailedHealthMetricsReply));

@@ -27,6 +27,11 @@
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // This must be the last #include.

const StringRef STORAGESERVER_HISTOGRAM_GROUP = LiteralStringRef("StorageServer");
const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysLatency");
const StringRef FETCH_KEYS_BYTES_HISTOGRAM = LiteralStringRef("FetchKeysSize");
const StringRef FETCH_KEYS_BYTES_PER_SECOND_HISTOGRAM = LiteralStringRef("FetchKeysBandwidth");

struct StorageMetricSample {
IndexedSet<Key, int64_t> sample;
int64_t metricUnitsPerSample;

@@ -40,6 +40,7 @@
#include "fdbserver/WaitFailure.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "flow/Histogram.h"
#include "flow/actorcompiler.h" // This must be the last #include.

using std::pair;

@@ -325,6 +326,7 @@ struct TLogData : NonCopyable {
FlowLock concurrentLogRouterReads;
FlowLock persistentDataCommitLock;

// Beginning of fields used by snapshot based backup and restore
bool ignorePopRequest; // ignore pop request from storage servers
double ignorePopDeadline; // time until which the ignorePopRequest will be
// honored

@@ -336,19 +338,26 @@ struct TLogData : NonCopyable {
std::map<Tag, Version> toBePopped; // map of Tag->Version for all the pops
// that came when ignorePopRequest was set
Reference<AsyncVar<bool>> degraded;
// End of fields used by snapshot based backup and restore

std::vector<TagsAndMessage> tempTagMessages;

TLogData(UID dbgid, UID workerID, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded, std::string folder)
: dbgid(dbgid), workerID(workerID), instanceID(deterministicRandom()->randomUniqueID().first()),
persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)),
dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0),
diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), targetVolatileBytes(SERVER_KNOBS->TLOG_SPILL_THRESHOLD), overheadBytesInput(0), overheadBytesDurable(0),
peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES),
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS),
ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped()
{
cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true);
}
Reference<Histogram> commitLatencyDist;

TLogData(UID dbgid, UID workerID, IKeyValueStore* persistentData, IDiskQueue* persistentQueue,
Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded, std::string folder)
: dbgid(dbgid), workerID(workerID), instanceID(deterministicRandom()->randomUniqueID().first()),
persistentData(persistentData), rawPersistentQueue(persistentQueue),
persistentQueue(new TLogQueue(persistentQueue, dbgid)), dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0),
queueCommitEnd(0), diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0),
targetVolatileBytes(SERVER_KNOBS->TLOG_SPILL_THRESHOLD), overheadBytesInput(0), overheadBytesDurable(0),
peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES),
concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false),
ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped(),
commitLatencyDist(Histogram::getHistogram(LiteralStringRef("tLog"), LiteralStringRef("commit"),
Histogram::Unit::microseconds)) {
cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true);
}
};

struct LogData : NonCopyable, public ReferenceCounted<LogData> {

@@ -432,13 +441,19 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
bool stopped, initialized;
DBRecoveryCount recoveryCount;

// If persistentDataVersion != persistentDurableDataVersion,
// then spilling is happening from persistentDurableDataVersion to persistentDataVersion.
// Data less than persistentDataDurableVersion is spilled on disk (or fully popped from the TLog);
VersionMetricHandle persistentDataVersion, persistentDataDurableVersion; // The last version number in the portion of the log (written|durable) to persistentData
NotifiedVersion version, queueCommittedVersion;
NotifiedVersion version;
NotifiedVersion queueCommittedVersion; // The disk queue has committed up through this version.
Version queueCommittingVersion;
Version knownCommittedVersion, durableKnownCommittedVersion, minKnownCommittedVersion;
Version queuePoppedVersion;
Version knownCommittedVersion; // The maximum version that a proxy has told us that is committed (all TLogs have
// ack'd a commit for this version).
Version durableKnownCommittedVersion, minKnownCommittedVersion;
Version queuePoppedVersion; // The disk queue has been popped up until the location which represents this version.
Version minPoppedTagVersion;
Tag minPoppedTag;
Tag minPoppedTag; // The tag that makes tLog hold its data and cause tLog's disk queue increasing.
|
||||
|
||||
Deque<std::pair<Version, Standalone<VectorRef<uint8_t>>>> messageBlocks;
|
||||
std::vector<std::vector<Reference<TagData>>> tag_data; //tag.locality | tag.id
|
||||
|
@ -481,7 +496,8 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
Version unrecoveredBefore, recoveredAt;
|
||||
|
||||
struct PeekTrackerData {
|
||||
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;
|
||||
std::map<int, Promise<std::pair<Version, bool>>>
|
||||
sequence_version; // second: Version is peeked begin version. bool is onlySpilled
|
||||
double lastUpdate;
|
||||
|
||||
Tag tag;
|
||||
|
@ -554,12 +570,15 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
queueCommittedVersion.initMetric(LiteralStringRef("TLog.QueueCommittedVersion"), cc.id);
|
||||
|
||||
specialCounter(cc, "Version", [this](){ return this->version.get(); });
|
||||
specialCounter(cc, "QueueCommittedVersion", [this](){ return this->queueCommittedVersion.get(); });
|
||||
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
|
||||
specialCounter(cc, "PersistentDataVersion", [this](){ return this->persistentDataVersion; });
|
||||
specialCounter(cc, "PersistentDataDurableVersion", [this](){ return this->persistentDataDurableVersion; });
|
||||
specialCounter(cc, "KnownCommittedVersion", [this](){ return this->knownCommittedVersion; });
|
||||
specialCounter(cc, "QueuePoppedVersion", [this](){ return this->queuePoppedVersion; });
|
||||
specialCounter(cc, "MinPoppedTagVersion", [this](){ return this->minPoppedTagVersion; });
|
||||
specialCounter(cc, "MinPoppedTagVersion", [this]() { return this->minPoppedTagVersion; });
|
||||
// The locality and id of the tag that is responsible for making the TLog hold onto its oldest piece of data.
|
||||
// If disk queues are growing and no one is sure why, then you shall look at this to find the tag responsible
|
||||
// for why the TLog thinks it can't throw away data.
|
||||
specialCounter(cc, "MinPoppedTagLocality", [this](){ return this->minPoppedTag.locality; });
|
||||
specialCounter(cc, "MinPoppedTagId", [this](){ return this->minPoppedTag.id; });
|
||||
specialCounter(cc, "SharedBytesInput", [tLogData](){ return tLogData->bytesInput; });
|
||||
|
@ -576,6 +595,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
|
|||
specialCounter(cc, "QueueDiskBytesTotal", [tLogData](){ return tLogData->rawPersistentQueue->getStorageBytes().total; });
|
||||
specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); });
|
||||
specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); });
|
||||
specialCounter(cc, "Geneartion", [this]() { return this->recoveryCount; });
|
||||
}
|
||||
|
||||
~LogData() {
|
||||
|
@ -759,6 +779,9 @@ ACTOR Future<Void> updatePoppedLocation( TLogData* self, Reference<LogData> logD
|
|||
return Void();
|
||||
}
|
||||
|
||||
// It runs against the oldest TLog instance, calculates the first location in the disk queue that contains un-popped
|
||||
// data, and then issues a pop to the disk queue at that location so that anything earlier can be
|
||||
// removed/forgotten/overwritten. In effect, it applies the effect of TLogPop RPCs to disk.
|
||||
ACTOR Future<Void> popDiskQueue( TLogData* self, Reference<LogData> logData ) {
|
||||
if (!logData->initialized) return Void();
|
||||
|
||||
|
@ -973,9 +996,11 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
|
|||
return Void();
|
||||
}
|
||||
|
||||
// This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources.
|
||||
// For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important
|
||||
// work (e.g. commits).
|
||||
// This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all
|
||||
// CPU resources. For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce
|
||||
// latencies for more important work (e.g. commits).
|
||||
// This actor is just a loop that calls updatePersistentData and popDiskQueue whenever
|
||||
// (a) there's data to be spilled or (b) we should update metadata after some commits have been fully popped.
|
||||
ACTOR Future<Void> updateStorage( TLogData* self ) {
|
||||
while(self->spillOrder.size() && !self->id_data.count(self->spillOrder.front())) {
|
||||
self->spillOrder.pop_front();
|
||||
|
@ -1823,7 +1848,11 @@ ACTOR Future<Void> tLogCommit(
|
|||
return Void();
|
||||
}
|
||||
|
||||
if (logData->version.get() == req.prevVersion) { // Not a duplicate (check relies on critical section between here self->version.set() below!)
|
||||
state double beforeCommitT = now();
|
||||
|
||||
// Not a duplicate (check relies on critical section between here self->version.set() below!)
|
||||
state bool isNotDuplicate = (logData->version.get() == req.prevVersion);
|
||||
if (isNotDuplicate) {
|
||||
if(req.debugID.present())
|
||||
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.Before");
|
||||
|
||||
|
@ -1861,6 +1890,10 @@ ACTOR Future<Void> tLogCommit(
|
|||
return Void();
|
||||
}
|
||||
|
||||
if (isNotDuplicate) {
|
||||
self->commitLatencyDist->sampleSeconds(now() - beforeCommitT);
|
||||
}
|
||||
|
||||
if(req.debugID.present())
|
||||
g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.After");
|
||||
|
||||
|
@ -2235,6 +2268,7 @@ void removeLog( TLogData* self, Reference<LogData> logData ) {
|
|||
}
|
||||
}
|
||||
|
||||
// remote tLog pull data from log routers
|
||||
ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, std::vector<Tag> tags, Version beginVersion, Optional<Version> endVersion, bool poppedIsKnownCommitted ) {
|
||||
state Future<Void> dbInfoChange = Void();
|
||||
state Reference<ILogSystem::IPeekCursor> r;
|
||||
|
|
|
@ -160,7 +160,7 @@ OldTLogCoreData::OldTLogCoreData(const OldLogData& oldData) :
|
|||
struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem> {
|
||||
UID dbgid;
|
||||
LogSystemType logSystemType;
|
||||
std::vector<Reference<LogSet>> tLogs;
|
||||
std::vector<Reference<LogSet>> tLogs; // LogSets in different locations: primary, satellite, or remote
|
||||
int expectedLogSets;
|
||||
int logRouterTags;
|
||||
int txsTags;
|
||||
|
@ -168,7 +168,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
int repopulateRegionAntiQuorum;
|
||||
bool stopped;
|
||||
std::set<int8_t> pseudoLocalities;
|
||||
std::map<int8_t, Version> pseudoLocalityPopVersion;
|
||||
std::map<int8_t, Version> pseudoLocalityPopVersion; // first:locality, second:popped version at the locality
|
||||
|
||||
// new members
|
||||
Future<Void> rejoins;
|
||||
|
@ -184,7 +184,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
Optional<Version> recoveredAt;
|
||||
Version knownCommittedVersion;
|
||||
LocalityData locality;
|
||||
std::map< std::pair<UID, Tag>, std::pair<Version, Version> > outstandingPops; // For each currently running popFromLog actor, (log server #, tag)->popped version
|
||||
// For each currently running popFromLog actor, outstandingPops is
|
||||
// (logID, tag)->(max popped version, durableKnownCommittedVersion).
|
||||
// Why do we need durableKnownCommittedVersion? knownCommittedVersion gives the lower bound of what data
|
||||
// will need to be copied into the next generation to restore the replication factor.
|
||||
// Guess: It probably serves as a minimum version of what data should be on a TLog in the next generation and
|
||||
// sending a pop for anything less than durableKnownCommittedVersion for the TLog will be absurd.
|
||||
std::map<std::pair<UID, Tag>, std::pair<Version, Version>> outstandingPops;
|
||||
|
||||
Optional<PromiseStream<Future<Void>>> addActor;
|
||||
ActorCollection popActors;
|
||||
std::vector<OldLogData> oldLogData;
|
||||
|
@ -245,11 +252,15 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
return pseudoLocalities.count(locality) > 0;
|
||||
}
|
||||
|
||||
// Return the max version that can be popped for the locality;
|
||||
Version popPseudoLocalityTag(int8_t locality, Version upTo) override {
|
||||
ASSERT(isPseudoLocality(locality));
|
||||
auto& localityVersion = pseudoLocalityPopVersion[locality];
|
||||
localityVersion = std::max(localityVersion, upTo);
|
||||
Version minVersion = localityVersion;
|
||||
// Why do we need to use the minimum popped version among all tags? Reason: for example,
|
||||
// 2 pseudo tags pop 100 or 150, respectively. It's only safe to pop min(100, 150),
|
||||
// because [101,150) is needed by another pseudo tag.
|
||||
for (const auto& it : pseudoLocalityPopVersion) {
|
||||
minVersion = std::min(minVersion, it.second);
|
||||
}
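
Editor's note: to make the comment's example concrete, here is a small, self-contained sketch of the same min computation (plain C++; safePopVersion and the bare std::map are stand-ins for the method and pseudoLocalityPopVersion above):

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <map>

    typedef int64_t Version;

    // Two pseudo localities have popped to 100 and 150 respectively; only
    // min(100, 150) = 100 may actually be discarded, because [101, 150) is
    // still needed by the locality that has only reached 100.
    Version safePopVersion(const std::map<int8_t, Version>& popVersions) {
        Version minVersion = std::numeric_limits<Version>::max();
        for (const auto& it : popVersions) {
            minVersion = std::min(minVersion, it.second);
        }
        return minVersion;
    }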

@ -1045,6 +1056,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
}

// pop 'tag.locality' type data up to the 'upTo' version
virtual void pop( Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality ) {
if (upTo <= 0) return;
if( tag.locality == tagLocalityRemoteLog) {

@ -1057,18 +1069,22 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
Version prev = outstandingPops[std::make_pair(log->get().id(),tag)].first;
if (prev < upTo)
outstandingPops[std::make_pair(log->get().id(),tag)] = std::make_pair(upTo, durableKnownCommittedVersion);
if (prev == 0)
popActors.add( popFromLog( this, log, tag, 1.0 ) ); //< FIXME: knob

if (prev == 0) {
popActors.add(popFromLog(this, log, tag, 1.0)); //< FIXME: knob // TODO: Knobify it
}
}
}
}
}
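
Editor's note: the prev == 0 test above is a coalescing trick. pop() only raises the target version recorded in outstandingPops and spawns a popFromLog actor solely on the first request for a given (log, tag) pair; that single actor then drains whatever target is current each time around its loop. A reduced sketch of the invariant (plain C++; the standalone LogId/Tag types and requestPop are hypothetical, actor machinery elided):

    #include <map>
    #include <utility>

    typedef long long Version;
    struct LogId { int id; bool operator<(const LogId& o) const { return id < o.id; } };
    struct Tag   { int id; bool operator<(const Tag& o) const { return id < o.id; } };

    std::map<std::pair<LogId, Tag>, std::pair<Version, Version>> outstandingPops;

    void requestPop(LogId log, Tag tag, Version upTo, Version durableKnownCommittedVersion) {
        Version prev = outstandingPops[{ log, tag }].first;
        if (prev < upTo)
            outstandingPops[{ log, tag }] = { upTo, durableKnownCommittedVersion };
        if (prev == 0) {
            // First request for this (log, tag): start the one long-lived
            // popFromLog actor. Later requests only bump the map entry; the
            // actor re-reads it each iteration, so pops are naturally batched.
        }
    }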

// pop tag from log up to the version defined in self->outstandingPops[].first
ACTOR static Future<Void> popFromLog( TagPartitionedLogSystem* self, Reference<AsyncVar<OptionalInterface<TLogInterface>>> log, Tag tag, double time ) {
state Version last = 0;
loop {
wait( delay(time, TaskPriority::TLogPop) );

// to: first is upto version, second is durableKnownComittedVersion
state std::pair<Version,Version> to = self->outstandingPops[ std::make_pair(log->get().id(),tag) ];

if (to.first <= last) {

@ -373,6 +373,14 @@ void failAfter( Future<Void> trigger, Endpoint e ) {
failAfter( trigger, g_simulator.getProcess( e ) );
}

ACTOR Future<Void> histogramReport() {
loop {
wait(delay(SERVER_KNOBS->HISTOGRAM_REPORT_INTERVAL));

GetHistogramRegistry().logReport();
}
}
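
Editor's note: histogramReport only has an effect while it is kept running; later hunks in this same commit push it onto fdbserver's main actor list:

    actors.push_back(histogramReport()); // flushes the registry every HISTOGRAM_REPORT_INTERVAL seconds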

void testSerializationSpeed() {
double tstart;
double build = 0, serialize = 0, deserialize = 0, copy = 0, deallocate = 0;

@ -492,8 +500,10 @@ ACTOR Future<Void> dumpDatabase( Database cx, std::string outputFilename, KeyRan
void memoryTest();
void skipListTest();

Future<Void> startSystemMonitor(std::string dataFolder, Optional<Standalone<StringRef>> zoneId, Optional<Standalone<StringRef>> machineId) {
initializeSystemMonitorMachineState(SystemMonitorMachineState(dataFolder, zoneId, machineId, g_network->getLocalAddress().ip));
Future<Void> startSystemMonitor(std::string dataFolder, Optional<Standalone<StringRef>> dcId,
Optional<Standalone<StringRef>> zoneId, Optional<Standalone<StringRef>> machineId) {
initializeSystemMonitorMachineState(
    SystemMonitorMachineState(dataFolder, dcId, zoneId, machineId, g_network->getLocalAddress().ip));

systemMonitor();
return recurring( &systemMonitor, 5.0, TaskPriority::FlushTrace );

@ -1656,6 +1666,8 @@ int main(int argc, char* argv[]) {
if (role == Simulation) {
TraceEvent("Simulation").detail("TestFile", testFile);

auto histogramReportActor = histogramReport();

clientKnobs->trace();
flowKnobs->trace();
serverKnobs->trace();

@ -1786,6 +1798,7 @@ int main(int argc, char* argv[]) {

vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
actors.push_back( fdbd(connectionFile, localities, processClass, dataFolder, dataFolder, storageMemLimit, metricsConnFile, metricsPrefix, rsssize, whitelistBinPaths) );
actors.push_back(histogramReport());
//actors.push_back( recurring( []{}, .001 ) ); // for ASIO latency measurement

f = stopAfter( waitForAll(actors) );

@ -1794,13 +1807,13 @@ int main(int argc, char* argv[]) {
f = stopAfter( runTests( connectionFile, TEST_TYPE_FROM_FILE, testOnServers ? TEST_ON_SERVERS : TEST_ON_TESTERS, minTesterCount, testFile, StringRef(), localities ) );
g_network->run();
} else if (role == Test) {
auto m = startSystemMonitor(dataFolder, zoneId, zoneId);
auto m = startSystemMonitor(dataFolder, dcId, zoneId, zoneId);
f = stopAfter( runTests( connectionFile, TEST_TYPE_FROM_FILE, TEST_HERE, 1, testFile, StringRef(), localities ) );
g_network->run();
} else if (role == ConsistencyCheck) {
setupSlowTaskProfiler();

auto m = startSystemMonitor(dataFolder, zoneId, zoneId);
auto m = startSystemMonitor(dataFolder, dcId, zoneId, zoneId);
f = stopAfter( runTests( connectionFile, TEST_TYPE_CONSISTENCY_CHECK, TEST_HERE, 1, testFile, StringRef(), localities ) );
g_network->run();
} else if (role == CreateTemplateDatabase) {

@ -19,11 +19,16 @@
*/

#include <cinttypes>
#include <functional>
#include <type_traits>
#include <unordered_map>

#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/LoadBalance.h"
#include "flow/IndexedSet.h"
#include "flow/Hash3.h"
#include "flow/ActorCollection.h"
#include "flow/Hash3.h"
#include "flow/Histogram.h"
#include "flow/IndexedSet.h"
#include "flow/SystemMonitor.h"
#include "flow/Util.h"
#include "fdbclient/Atomic.h"

@ -52,11 +57,8 @@
#include "fdbrpc/Smoother.h"
#include "fdbrpc/Stats.h"
#include "flow/TDMetric.actor.h"
#include <type_traits>
#include "flow/actorcompiler.h" // This must be the last #include.

using std::pair;
using std::make_pair;
#include "flow/actorcompiler.h" // This must be the last #include.

#pragma region Data Structures

@ -240,13 +242,13 @@ struct UpdateEagerReadInfo {
void finishKeyBegin() {
std::sort(keyBegin.begin(), keyBegin.end());
keyBegin.resize( std::unique(keyBegin.begin(), keyBegin.end()) - keyBegin.begin() );
std::sort(keys.begin(), keys.end(), [](const pair<KeyRef, int>& lhs, const pair<KeyRef, int>& rhs) { return (lhs.first < rhs.first) || (lhs.first == rhs.first && lhs.second > rhs.second); } );
keys.resize(std::unique(keys.begin(), keys.end(), [](const pair<KeyRef, int>& lhs, const pair<KeyRef, int>& rhs) { return lhs.first == rhs.first; } ) - keys.begin());
std::sort(keys.begin(), keys.end(), [](const std::pair<KeyRef, int>& lhs, const std::pair<KeyRef, int>& rhs) { return (lhs.first < rhs.first) || (lhs.first == rhs.first && lhs.second > rhs.second); } );
keys.resize(std::unique(keys.begin(), keys.end(), [](const std::pair<KeyRef, int>& lhs, const std::pair<KeyRef, int>& rhs) { return lhs.first == rhs.first; } ) - keys.begin());
//value gets populated in doEagerReads
}

Optional<Value>& getValue(KeyRef key) {
int i = std::lower_bound(keys.begin(), keys.end(), pair<KeyRef, int>(key, 0), [](const pair<KeyRef, int>& lhs, const pair<KeyRef, int>& rhs) { return lhs.first < rhs.first; } ) - keys.begin();
int i = std::lower_bound(keys.begin(), keys.end(),std::pair<KeyRef, int>(key, 0), [](const std::pair<KeyRef, int>& lhs, const std::pair<KeyRef, int>& rhs) { return lhs.first < rhs.first; } ) - keys.begin();
ASSERT( i < keys.size() && keys[i].first == key );
return value[i];
}

@ -296,9 +298,63 @@ private:
std::map<Version, Standalone<VersionUpdateRef>> mutationLog; // versions (durableVersion, version]

public:
// Histograms
struct FetchKeysHistograms {
const Reference<Histogram> latency;
const Reference<Histogram> bytes;
const Reference<Histogram> bandwidth;

FetchKeysHistograms()
  : latency(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, FETCH_KEYS_LATENCY_HISTOGRAM,
                                    Histogram::Unit::microseconds)),
    bytes(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, FETCH_KEYS_BYTES_HISTOGRAM,
                                  Histogram::Unit::bytes)),
    bandwidth(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, FETCH_KEYS_BYTES_PER_SECOND_HISTOGRAM,
                                      Histogram::Unit::bytes_per_second)) {}
} fetchKeysHistograms;

class CurrentRunningFetchKeys {
std::unordered_map<UID, double> startTimeMap;
std::unordered_map<UID, KeyRangeRef> keyRangeMap;

static const StringRef emptyString;
static const KeyRangeRef emptyKeyRange;
public:
void recordStart(const UID id, const KeyRange keyRange) {
startTimeMap[id] = now();
keyRangeMap[id] = keyRange;
}

void recordFinish(const UID id) {
startTimeMap.erase(id);
keyRangeMap.erase(id);
}

std::pair<double, KeyRangeRef> longestTime() const {
if (numRunning() == 0) {
return {-1, emptyKeyRange};
}

const double currentTime = now();
double longest = 0;
UID UIDofLongest;
for (const auto kv: startTimeMap) {
const double currentRunningTime = currentTime - kv.second;
if (longest < currentRunningTime) {
longest = currentRunningTime;
UIDofLongest = kv.first;
}
}
return {longest, keyRangeMap.at(UIDofLongest)};
}

int numRunning() const { return startTimeMap.size(); }
} currentRunningFetchKeys;
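
Editor's note: CurrentRunningFetchKeys is plain bookkeeping; a caller brackets each fetch with recordStart/recordFinish and can ask for the longest-running fetch at any time. A hypothetical sequence (names from this diff; the UID and range are invented for illustration):

    StorageServer::CurrentRunningFetchKeys running;  // normally the currentRunningFetchKeys member
    UID id = deterministicRandom()->randomUniqueID();
    running.recordStart(id, keys);                   // a fetchKeys begins
    std::pair<double, KeyRangeRef> longest =
        running.longestTime();                       // {seconds running, range}, or {-1, empty} when idle
    running.recordFinish(id);                        // the fetchKeys ends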

Tag tag;
vector<pair<Version,Tag>> history;
vector<pair<Version,Tag>> allHistory;
vector<std::pair<Version,Tag>> history;
vector<std::pair<Version,Tag>> allHistory;
Version poppedAllAfter;
std::map<Version, Arena> freeable; // for each version, an Arena that must be held until that version is < oldestVersion
Arena lastArena;

@ -345,8 +401,8 @@ public:
poppedAllAfter = std::numeric_limits<Version>::max();
}

vector<pair<Version,Tag>>* hist = &history;
vector<pair<Version,Tag>> allHistoryCopy;
vector<std::pair<Version,Tag>>* hist = &history;
vector<std::pair<Version,Tag>> allHistoryCopy;
if(popAllTags) {
allHistoryCopy = allHistory;
hist = &allHistoryCopy;

@ -535,22 +591,18 @@ public:
}
} counters;

StorageServer(IKeyValueStore* storage, Reference<AsyncVar<ServerDBInfo>> const& db, StorageServerInterface const& ssi)
: instanceID(deterministicRandom()->randomUniqueID().first()),
storage(this, storage), db(db),
lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0),
rebootAfterDurableVersion(std::numeric_limits<Version>::max()),
durableInProgress(Void()),
versionLag(0), primaryLocality(tagLocalityInvalid),
updateEagerReads(0),
shardChangeCounter(0),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES),
shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0),
logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()),
readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")),
behind(false), versionBehind(false), byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false),
lastUpdate(now()), poppedAllAfter(std::numeric_limits<Version>::max()), cpuUsage(0.0), diskUsage(0.0)
{
StorageServer(IKeyValueStore* storage, Reference<AsyncVar<ServerDBInfo>> const& db,
              StorageServerInterface const& ssi)
  : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage),
    db(db), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0),
    rebootAfterDurableVersion(std::numeric_limits<Version>::max()), durableInProgress(Void()), versionLag(0),
    primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0),
    fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false),
    debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0),
    counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()),
    readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")), behind(false), versionBehind(false),
    byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false), lastUpdate(now()),
    poppedAllAfter(std::numeric_limits<Version>::max()), cpuUsage(0.0), diskUsage(0.0) {
version.initMetric(LiteralStringRef("StorageServer.Version"), counters.cc.id);
oldestVersion.initMetric(LiteralStringRef("StorageServer.OldestVersion"), counters.cc.id);
durableVersion.initMetric(LiteralStringRef("StorageServer.DurableVersion"), counters.cc.id);

@ -662,6 +714,9 @@ public:
}
};

const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = LiteralStringRef("");
const KeyRangeRef StorageServer::CurrentRunningFetchKeys::emptyKeyRange = KeyRangeRef(StorageServer::CurrentRunningFetchKeys::emptyString, StorageServer::CurrentRunningFetchKeys::emptyString);

// If and only if key:=value is in (storage+versionedData), // NOT ACTUALLY: and key < allKeys.end,
// and H(key) < |key+value|/bytesPerSample,
// let sampledSize = max(|key+value|,bytesPerSample)

@ -1622,7 +1677,7 @@ bool changeDurableVersion( StorageServer* data, Version desiredDurableVersion )
setDataDurableVersion(data->thisServerID, data->durableVersion.get());
if (checkFatalError.isReady()) checkFatalError.get();

//TraceEvent("ForgotVersionsBefore", data->thisServerID).detail("Version", nextDurableVersion);
// TraceEvent("ForgotVersionsBefore", data->thisServerID).detail("Version", nextDurableVersion);
validate(data);

return nextDurableVersion == desiredDurableVersion;

@ -1942,16 +1997,56 @@ ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
loop {
state double waitSeconds = BUGGIFY ? 5.0 : 600.0;
wait(delay(waitSeconds));
TraceEvent(waitSeconds > 300.0 ? SevWarnAlways : SevInfo, "FetchKeysTooLong").detail("Duration", now() - startTime).detail("Phase", shard->phase).detail("Begin", shard->keys.begin.printable()).detail("End", shard->keys.end.printable());

const auto traceEventLevel = waitSeconds > SERVER_KNOBS->FETCH_KEYS_TOO_LONG_TIME_CRITERIA ? SevWarnAlways : SevInfo;
TraceEvent(traceEventLevel, "FetchKeysTooLong")
    .detail("Duration", now() - startTime)
    .detail("Phase", shard->phase)
    .detail("Begin", shard->keys.begin.printable())
    .detail("End", shard->keys.end.printable());
}
}

class FetchKeysMetricReporter {
const UID uid;
const double startTime;
int fetchedBytes;
StorageServer::FetchKeysHistograms& histograms;
StorageServer::CurrentRunningFetchKeys& currentRunning;

public:
FetchKeysMetricReporter(const UID& uid_, const double startTime_, const KeyRange& keyRange, StorageServer::FetchKeysHistograms& histograms_, StorageServer::CurrentRunningFetchKeys& currentRunning_)
  : uid(uid_), startTime(startTime_), fetchedBytes(0), histograms(histograms_), currentRunning(currentRunning_) {

currentRunning.recordStart(uid, keyRange);
}

void addFetchedBytes(const int bytes) { fetchedBytes += bytes; }

~FetchKeysMetricReporter() {
double latency = now() - startTime;

// If fetchKeys is *NOT* run, i.e. returning immediately, still report a record.
if (latency == 0) latency = 1e6;

const uint32_t bandwidth = fetchedBytes / latency;

histograms.latency->sampleSeconds(latency);
histograms.bytes->sample(fetchedBytes);
histograms.bandwidth->sample(bandwidth);

currentRunning.recordFinish(uid);
}
};
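
Editor's note: FetchKeysMetricReporter is deliberately RAII: the destructor does the reporting, so every exit path from fetchKeys (success, error throw, actor cancellation) emits exactly one latency/bytes/bandwidth sample and balances recordStart with recordFinish. The actor below simply holds one as a state variable and feeds it as blocks arrive (both lines taken from this diff):

    state FetchKeysMetricReporter metricReporter(fetchKeysID, startTime, keys,
                                                 data->fetchKeysHistograms, data->currentRunningFetchKeys);
    // per fetched block:
    metricReporter.addFetchedBytes(expectedSize);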

ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
state const UID fetchKeysID = deterministicRandom()->randomUniqueID();
state TraceInterval interval("FetchKeys");
state KeyRange keys = shard->keys;
state Future<Void> warningLogger = logFetchKeysWarning(shard);
state double startt = now();
state const double startTime = now();
state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;
state FetchKeysMetricReporter metricReporter(fetchKeysID, startTime, keys, data->fetchKeysHistograms, data->currentRunningFetchKeys);

// delay(0) to force a return to the run loop before the work of fetchKeys is started.
// This allows adding->start() to be called inline with CSK.

@ -1989,7 +2084,7 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {

state double executeStart = now();
++data->counters.fetchWaitingCount;
data->counters.fetchWaitingMS += 1000*(executeStart - startt);
data->counters.fetchWaitingMS += 1000 * (executeStart - startTime);

// Fetch keys gets called while the update actor is processing mutations. data->version will not be updated until all mutations for a version
// have been processed. We need to take the durableVersionLock to ensure data->version is greater than the version of the mutation which caused

@ -2029,6 +2124,7 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
debugKeyRange("fetchRange", fetchVersion, keys);
for(auto k = this_block.begin(); k != this_block.end(); ++k) debugMutation("fetch", fetchVersion, MutationRef(MutationRef::SetValue, k->key, k->value));

metricReporter.addFetchedBytes(expectedSize);
data->counters.bytesFetched += expectedSize;
if( fetchBlockBytes > expectedSize ) {
holdingFKPL.release( fetchBlockBytes - expectedSize );

@ -2096,8 +2192,9 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
while (!shard->updates.empty() && shard->updates[0].version <= fetchVersion) shard->updates.pop_front();

//FIXME: remove when we no longer support upgrades from 5.X
if(debug_getRangeRetries >= 100) {
if (debug_getRangeRetries >= 100) {
data->cx->enableLocalityLoadBalance = false;
// TODO: Add SevWarnAlways to say it was disabled.
}

debug_getRangeRetries++;

@ -2214,7 +2311,7 @@ ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {

TraceEvent(SevError, "FetchKeysError", data->thisServerID)
    .error(e)
    .detail("Elapsed", now()-startt)
    .detail("Elapsed", now() - startTime)
    .detail("KeyBegin", keys.begin)
    .detail("KeyEnd",keys.end);
if (e.code() != error_code_actor_cancelled)

@ -3014,7 +3111,9 @@ bool StorageServerDisk::makeVersionMutationsDurable( Version& prevStorageVersion
void StorageServerDisk::makeVersionDurable( Version version ) {
storage->set( KeyValueRef(persistVersion, BinaryWriter::toValue(version, Unversioned())) );

//TraceEvent("MakeDurable", data->thisServerID).detail("FromVersion", prevStorageVersion).detail("ToVersion", version);
// TraceEvent("MakeDurable", data->thisServerID)
//     .detail("FromVersion", prevStorageVersion)
//     .detail("ToVersion", version);
}

void StorageServerDisk::changeLogProtocol(Version version, ProtocolVersion protocol) {

@ -3406,7 +3505,10 @@ ACTOR Future<Void> metricsCore( StorageServer* self, StorageServerInterface ssi

wait( self->byteSampleRecovery );

actors.add(traceCounters("StorageMetrics", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics"));
Tag tag = self->tag;
actors.add(traceCounters("StorageMetrics", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY,
                         &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics",
                         [tag](TraceEvent& te) { te.detail("Tag", tag.toString()); }));

loop {
choose {

@ -3472,6 +3574,35 @@ ACTOR Future<Void> checkBehind( StorageServer* self ) {
}
}

ACTOR Future<Void> reportStorageServerState(StorageServer* self) {
if (!SERVER_KNOBS->REPORT_DD_METRICS) {
return Void();
}

loop {
wait(delay(SERVER_KNOBS->DD_METRICS_REPORT_INTERVAL));

const auto numRunningFetchKeys = self->currentRunningFetchKeys.numRunning();
if (numRunningFetchKeys == 0) {
continue;
}

const auto longestRunningFetchKeys = self->currentRunningFetchKeys.longestTime();

auto level = SevInfo;
if (longestRunningFetchKeys.first >= SERVER_KNOBS->FETCH_KEYS_TOO_LONG_TIME_CRITERIA) {
level = SevWarnAlways;
}

TraceEvent(level, "FetchKeyCurrentStatus")
    .detail("Timestamp", now())
    .detail("LongestRunningTime", longestRunningFetchKeys.first)
    .detail("StartKey", longestRunningFetchKeys.second.begin.printable())
    .detail("EndKey", longestRunningFetchKeys.second.end.printable())
    .detail("NumRunning", numRunningFetchKeys);
}
}

ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterface ssi )
{
state Future<Void> doUpdate = Void();

@ -3489,6 +3620,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
actors.add(metricsCore(self, ssi));
actors.add(logLongByteSampleRecovery(self->byteSampleRecovery));
actors.add(checkBehind(self));
actors.add(reportStorageServerState(self));

self->coreStarted.send( Void() );

@ -835,7 +835,8 @@ ACTOR Future<Void> workerServer(

filesClosed.add(stopping.getFuture());

initializeSystemMonitorMachineState(SystemMonitorMachineState(folder, locality.zoneId(), locality.machineId(), g_network->getLocalAddress().ip));
initializeSystemMonitorMachineState(SystemMonitorMachineState(
    folder, locality.dcId(), locality.zoneId(), locality.machineId(), g_network->getLocalAddress().ip));

{
auto recruited = interf; //ghetto! don't we all love a good #define
@ -28,9 +28,9 @@

extern IKeyValueStore *makeDummyKeyValueStore();

template <class T>
class Histogram {
class TestHistogram {
public:
Histogram(int minSamples = 100) : minSamples(minSamples) { reset(); }
TestHistogram(int minSamples = 100) : minSamples(minSamples) { reset(); }

void reset(){
N = 0;

@ -153,7 +153,7 @@ struct KVTest {
}
};

ACTOR Future<Void> testKVRead( KVTest* test, Key key, Histogram<float>* latency, PerfIntCounter* count ) {
ACTOR Future<Void> testKVRead(KVTest* test, Key key, TestHistogram<float>* latency, PerfIntCounter* count) {
//state Version s1 = test->lastCommit;
state Version s2 = test->lastDurable;

@ -171,7 +171,7 @@ ACTOR Future<Void> testKVRead( KVTest* test, Key key, Histogram<float>* latency,
return Void();
}

ACTOR Future<Void> testKVReadSaturation( KVTest* test, Histogram<float>* latency, PerfIntCounter* count ) {
ACTOR Future<Void> testKVReadSaturation(KVTest* test, TestHistogram<float>* latency, PerfIntCounter* count) {
while (true) {
state double begin = timer();
Optional<Value> val = wait( test->store->readValue(test->randomKey()) );

@ -181,7 +181,7 @@ ACTOR Future<Void> testKVReadSaturation( KVTest* test, Histogram<float>* latency
}
}

ACTOR Future<Void> testKVCommit( KVTest* test, Histogram<float>* latency, PerfIntCounter* count ) {
ACTOR Future<Void> testKVCommit(KVTest* test, TestHistogram<float>* latency, PerfIntCounter* count) {
state Version v = test->lastSet;
test->lastCommit = v;
state double begin = timer();

@ -202,7 +202,7 @@ struct KVStoreTestWorkload : TestWorkload {
bool doSetup, doClear, doCount;
std::string filename;
PerfIntCounter reads, sets, commits;
Histogram<float> readLatency, commitLatency;
TestHistogram<float> readLatency, commitLatency;
double setupTook;
std::string storeType;

@ -232,7 +232,7 @@ struct KVStoreTestWorkload : TestWorkload {
return Void();
}
virtual Future<bool> check( Database const& cx ) { return true; }
void metricsFromHistogram(vector<PerfMetric>& m, std::string name, Histogram<float>& h){
void metricsFromHistogram(vector<PerfMetric>& m, std::string name, TestHistogram<float>& h) {
m.push_back( PerfMetric( "Min " + name, 1000.0 * h.min(), true) );
m.push_back( PerfMetric( "Average " + name, 1000.0 * h.mean(), true) );
m.push_back( PerfMetric( "Median " + name, 1000.0 * h.medianEstimate(), true) );

@ -24,6 +24,8 @@ set(FLOW_SRCS
FileTraceLogWriter.h
Hash3.c
Hash3.h
Histogram.cpp
Histogram.h
IDispatched.h
IRandom.h
IThreadPool.cpp

@ -0,0 +1,187 @@
/*
 * Histogram.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <flow/Histogram.h>
#include <flow/flow.h>
#include <flow/UnitTest.h>
// TODO: remove dependency on fdbrpc.

// we need to be able to check if we're in simulation so that the histograms are properly
// scoped to the right "machine".
// either we pull g_simulator into flow, or flow (and the I/O path) will be unable to log performance
// metrics.
#include <fdbrpc/simulator.h>

// pull in some global pointers too: These types are implemented in fdbrpc/sim2.actor.cpp, which is not available here.
// Yuck. If you're not using the simulator, these will remain null, and all should be well.

// TODO: create a execution context abstraction that allows independent flow instances within a process.
// The simulator would be the main user of it, and histogram would be the only other user (for now).
ISimulator* g_pSimulator = nullptr;
thread_local ISimulator::ProcessInfo* ISimulator::currentProcess = nullptr;

// Fallback registry when we're not in simulation -- if we had execution contexts we wouldn't need to check if
// we have a simulated contex here; we'd just use the current context regardless.
static HistogramRegistry* globalHistograms = nullptr;

#pragma region HistogramRegistry

HistogramRegistry& GetHistogramRegistry() {
ISimulator::ProcessInfo* info = g_simulator.getCurrentProcess();

if (info) {
// in simulator; scope histograms to simulated process
return info->histograms;
}
// avoid link order issues where the registry hasn't been initialized, but we're
// instantiating a histogram
if (globalHistograms == nullptr) {
// Note: This will show up as a leak on shutdown, but we're OK with that.
globalHistograms = new HistogramRegistry();
}
return *globalHistograms;
}

void HistogramRegistry::registerHistogram(Histogram* h) {
if (histograms.find(h->name()) != histograms.end()) {
TraceEvent(SevError, "HistogramDoubleRegistered").detail("group", h->group).detail("op", h->op);
ASSERT(false);
}
histograms.insert(std::pair<std::string, Histogram*>(h->name(), h));
}

void HistogramRegistry::unregisterHistogram(Histogram* h) {
std::string name = h->name();
if (histograms.find(name) == histograms.end()) {
TraceEvent(SevError, "HistogramNotRegistered").detail("group", h->group).detail("op", h->op);
}
int count = histograms.erase(name);
ASSERT(count == 1);
}

Histogram* HistogramRegistry::lookupHistogram(std::string name) {
auto h = histograms.find(name);
if (h == histograms.end()) {
return nullptr;
}
return h->second;
}

void HistogramRegistry::logReport() {
for (auto& i : histograms) {
i.second->writeToLog();
i.second->clear();
}
}

#pragma endregion // HistogramRegistry

#pragma region Histogram

const std::unordered_map<Histogram::Unit, std::string> Histogram::UnitToStringMapper = {
	{ Histogram::Unit::microseconds, "microseconds" },
	{ Histogram::Unit::bytes, "bytes" },
	{ Histogram::Unit::bytes_per_second, "bytes_per_second" }
};

void Histogram::writeToLog() {
bool active = false;
for (uint32_t i = 0; i < 32; i++) {
if (buckets[i]) {
active = true;
break;
}
}
if (!active) {
return;
}

TraceEvent e(SevInfo, "Histogram");
e.detail("Group", group).detail("Op", op).detail("Unit", UnitToStringMapper.at(unit));

for (uint32_t i = 0; i < 32; i++) {
uint32_t value = ((uint32_t)1) << (i + 1);

if (buckets[i]) {
switch (unit) {
case Unit::microseconds:
e.detail(format("LessThan%u.%03u", value / 1000, value % 1000), buckets[i]);
break;
case Unit::bytes:
case Unit::bytes_per_second:
e.detail(format("LessThan%u", value), buckets[i]);
break;
default:
ASSERT(false);
}
}
}
}

#pragma endregion // Histogram

TEST_CASE("/flow/histogram/smoke_test") {

{
Reference<Histogram> h =
    Histogram::getHistogram(LiteralStringRef("smoke_test"), LiteralStringRef("counts"), Histogram::Unit::bytes);

h->sample(0);
ASSERT(h->buckets[0] == 1);
h->sample(1);
ASSERT(h->buckets[0] == 2);

h->sample(2);
ASSERT(h->buckets[1] == 1);

GetHistogramRegistry().logReport();

ASSERT(h->buckets[0] == 0);
h->sample(0);
ASSERT(h->buckets[0] == 1);
h = Histogram::getHistogram(LiteralStringRef("smoke_test"), LiteralStringRef("counts2"),
                            Histogram::Unit::bytes);

// confirm that old h was deallocated.
h = Histogram::getHistogram(LiteralStringRef("smoke_test"), LiteralStringRef("counts"), Histogram::Unit::bytes);
ASSERT(h->buckets[0] == 0);

h = Histogram::getHistogram(LiteralStringRef("smoke_test"), LiteralStringRef("times"),
                            Histogram::Unit::microseconds);

h->sampleSeconds(0.000000);
h->sampleSeconds(0.0000019);
ASSERT(h->buckets[0] == 2);
h->sampleSeconds(0.0000021);
ASSERT(h->buckets[1] == 1);
h->sampleSeconds(0.000015);
ASSERT(h->buckets[3] == 1);

h->sampleSeconds(4400.0);
ASSERT(h->buckets[31] == 1);

GetHistogramRegistry().logReport();
}

// h has been deallocated. Does this crash?
GetHistogramRegistry().logReport();

return Void();
}

@ -0,0 +1,137 @@
/*
 * Histogram.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef FLOW_HISTOGRAM_H
#define FLOW_HISTOGRAM_H
#pragma once

#include <flow/Arena.h>

#include <string>
#include <map>
#include <unordered_map>

#ifdef _WIN32
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
#endif

class Histogram;

class HistogramRegistry {
public:
void registerHistogram(Histogram* h);
void unregisterHistogram(Histogram* h);
Histogram* lookupHistogram(std::string name);
void logReport();

private:
// This map is ordered by key so that ops within the same group end up
// next to each other in the trace log.
std::map<std::string, Histogram*> histograms;
};

HistogramRegistry& GetHistogramRegistry();

/*
 * A fast histogram with power-of-two spaced buckets.
 *
 * For more information about this technique, see:
 * https://www.fsl.cs.stonybrook.edu/project-osprof.html
 */
class Histogram sealed : public ReferenceCounted<Histogram> {
public:
enum class Unit { microseconds, bytes, bytes_per_second };

private:
static const std::unordered_map<Unit, std::string> UnitToStringMapper;

Histogram(std::string group, std::string op, Unit unit, HistogramRegistry& registry)
  : group(group), op(op), unit(unit), registry(registry), ReferenceCounted<Histogram>() {

ASSERT(UnitToStringMapper.find(unit) != UnitToStringMapper.end());

clear();
}

static std::string generateName(std::string group, std::string op) { return group + ":" + op; }

public:
~Histogram() { registry.unregisterHistogram(this); }

static Reference<Histogram> getHistogram(StringRef group, StringRef op, Unit unit) {
std::string group_str = group.toString();
std::string op_str = op.toString();
std::string name = generateName(group_str, op_str);
HistogramRegistry& registry = GetHistogramRegistry();
Histogram* h = registry.lookupHistogram(name);
if (!h) {
h = new Histogram(group_str, op_str, unit, registry);
registry.registerHistogram(h);
return Reference<Histogram>(h);
} else {
return Reference<Histogram>::addRef(h);
}
}

// This histogram buckets samples into powers of two.
inline void sample(uint32_t sample) {
size_t idx;
#ifdef _WIN32
unsigned long index;
// _BitScanReverse sets index to the position of the first non-zero bit, so
// _BitScanReverse(sample) ~= log_2(sample). _BitScanReverse returns false if
// sample is zero.
idx = _BitScanReverse(&index, sample) ? index : 0;
#else
// __builtin_clz counts the leading zeros in its uint32_t argument. So, 31-clz ~= log_2(sample).
// __builtin_clz(0) is undefined.
idx = sample ? (31 - __builtin_clz(sample)) : 0;
#endif
ASSERT(idx < 32);
buckets[idx]++;
}
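// Editor's note -- a worked example of the bucketing above (annotation, not part of the original header):
//   sample(1)    -> idx = 31 - __builtin_clz(1)    = 0  (bucket [1, 2);      for Unit::bytes logged as "LessThan2")
//   sample(1000) -> idx = 31 - __builtin_clz(1000) = 9  (bucket [512, 1024); logged as "LessThan1024")
//   sample(0)    -> idx = 0 via the explicit check, since __builtin_clz(0) / _BitScanReverse(0) are undefined/false
// So bucket idx counts samples in [2^idx, 2^(idx+1)), matching writeToLog()'s label value = 1 << (i + 1).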

inline void sampleSeconds(double delta) {
uint64_t delta_usec = (delta * 1000000);
if (delta_usec > UINT32_MAX) {
sample(UINT32_MAX);
} else {
sample((uint32_t)(delta * 1000000)); // convert to microseconds and truncate to integer
}
}

void clear() {
for (uint32_t& i : buckets) {
i = 0;
}
}
void writeToLog();

std::string name() { return generateName(this->group, this->op); }

std::string const group;
std::string const op;
Unit const unit;
HistogramRegistry& registry;
uint32_t buckets[32];
};

#endif // FLOW_HISTOGRAM_H
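
Editor's note: putting the pieces together, a minimal use of the new API looks like this (a sketch distilled from the smoke test in Histogram.cpp above; the group/op names are arbitrary):

    #include "flow/Histogram.h"

    Reference<Histogram> h = Histogram::getHistogram(
        LiteralStringRef("myGroup"), LiteralStringRef("myOp"), Histogram::Unit::microseconds);
    h->sampleSeconds(0.000015);         // 15 us lands in bucket [8, 16), logged as "LessThan0.016"
    GetHistogramRegistry().logReport(); // one "Histogram" trace event per active histogram, then counts reset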

@ -74,6 +74,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 );
init( PING_LOGGING_INTERVAL, 3.0 );
init( PING_SAMPLE_AMOUNT, 100 );
init( NETWORK_CONNECT_SAMPLE_AMOUNT, 100 );

init( TLS_CERT_REFRESH_DELAY_SECONDS, 12*60*60 );
init( TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT, 9.0 );

@ -150,6 +151,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
init( TRACE_RETRY_OPEN_INTERVAL, 1.00 );
init( MIN_TRACE_SEVERITY, isSimulated ? 0 : 10 ); // Related to the trace severity in Trace.h
init( MAX_TRACE_SUPPRESSIONS, 1e4 );
init( TRACE_DATETIME_ENABLED, true ); // trace time in human readable format (always real time)
init( TRACE_SYNC_ENABLED, 0 );
init( TRACE_EVENT_METRIC_UNITS_PER_SAMPLE, 500 );
init( TRACE_EVENT_THROTTLER_SAMPLE_EXPIRY, 1800.0 ); // 30 mins

@ -92,6 +92,7 @@ public:
int USE_OBJECT_SERIALIZER;
double PING_LOGGING_INTERVAL;
int PING_SAMPLE_AMOUNT;
int NETWORK_CONNECT_SAMPLE_AMOUNT;

int TLS_CERT_REFRESH_DELAY_SECONDS;
double TLS_SERVER_CONNECTION_THROTTLE_TIMEOUT;

@ -172,6 +173,7 @@ public:
double TRACE_RETRY_OPEN_INTERVAL;
int MIN_TRACE_SEVERITY;
int MAX_TRACE_SUPPRESSIONS;
bool TRACE_DATETIME_ENABLED;
int TRACE_SYNC_ENABLED;
int TRACE_EVENT_METRIC_UNITS_PER_SAMPLE;
int TRACE_EVENT_THROTTLER_SAMPLE_EXPIRY;

@ -150,6 +150,8 @@ void UnsentPacketQueue::sent(int bytes) {
bytes -= b->bytes_written - b->bytes_sent;
b->bytes_sent = b->bytes_written;
ASSERT(b->bytes_written <= b->size());
double queue_time = now() - b->enqueue_time;
sendQueueLatencyHistogram->sampleSeconds(queue_time);
unsent_first = b->nextPacketBuffer();
if (!unsent_first) unsent_last = NULL;
b->delref();

@ -23,6 +23,7 @@
#pragma once

#include "flow/flow.h"
#include "flow/Histogram.h"

// PacketWriter and PacketBuffer are in serialize.h because they are needed by the SerializeSource<> template

@ -40,8 +41,17 @@ struct ReliablePacket : FastAllocated<ReliablePacket> {

class UnsentPacketQueue : NonCopyable {
public:
UnsentPacketQueue() : unsent_first(0), unsent_last(0) {}
~UnsentPacketQueue() { discardAll(); }
UnsentPacketQueue()
  : unsent_first(0), unsent_last(0),
    sendQueueLatencyHistogram(Histogram::getHistogram(
        LiteralStringRef("UnsentPacketQueue"), LiteralStringRef("QueueWait"), Histogram::Unit::microseconds)) {}

~UnsentPacketQueue() {
discardAll();
unsent_first = (PacketBuffer*)0xDEADBEEF;
unsent_last = (PacketBuffer*)0xCAFEBABE;
sendQueueLatencyHistogram = Reference<Histogram>(nullptr);
}

// Get a PacketBuffer to write new packets into
PacketBuffer* getWriteBuffer() {

@ -70,6 +80,7 @@ public:

private:
PacketBuffer *unsent_first, *unsent_last; // Both NULL, or inclusive range of PacketBuffers that haven't been sent. The last one may have space for more packets to be written.
Reference<Histogram> sendQueueLatencyHistogram;
};

class ReliablePacketList : NonCopyable {

@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "flow/flow.h"
|
||||
#include "flow/Histogram.h"
|
||||
#include "flow/Platform.h"
|
||||
#include "flow/TDMetric.actor.h"
|
||||
#include "flow/SystemMonitor.h"
|
||||
|
@ -60,87 +61,116 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
|
|||
if (!DEBUG_DETERMINISM && currentStats.initialized) {
|
||||
{
|
||||
TraceEvent(eventName.c_str())
|
||||
.detail("Elapsed", currentStats.elapsed)
|
||||
.detail("CPUSeconds", currentStats.processCPUSeconds)
|
||||
.detail("MainThreadCPUSeconds", currentStats.mainThreadCPUSeconds)
|
||||
.detail("UptimeSeconds", now() - machineState.monitorStartTime)
|
||||
.detail("Memory", currentStats.processMemory)
|
||||
.detail("ResidentMemory", currentStats.processResidentMemory)
|
||||
.detail("UnusedAllocatedMemory", getTotalUnusedAllocatedMemory())
|
||||
.detail("MbpsSent", ((netData.bytesSent - statState->networkState.bytesSent) * 8e-6) / currentStats.elapsed)
|
||||
.detail("MbpsReceived", ((netData.bytesReceived - statState->networkState.bytesReceived) * 8e-6) / currentStats.elapsed)
|
||||
.detail("DiskTotalBytes", currentStats.processDiskTotalBytes)
|
||||
.detail("DiskFreeBytes", currentStats.processDiskFreeBytes)
|
||||
.detail("DiskQueueDepth", currentStats.processDiskQueueDepth)
|
||||
.detail("DiskIdleSeconds", currentStats.processDiskIdleSeconds)
|
||||
.detail("DiskReads", currentStats.processDiskRead)
|
||||
.detail("DiskWrites", currentStats.processDiskWrite)
|
||||
.detail("DiskReadsCount", currentStats.processDiskReadCount)
|
||||
.detail("DiskWritesCount", currentStats.processDiskWriteCount)
|
||||
.detail("DiskWriteSectors", currentStats.processDiskWriteSectors)
|
||||
.detail("DiskReadSectors", currentStats.processDiskReadSectors)
|
||||
.detail("FileWrites", netData.countFileLogicalWrites - statState->networkState.countFileLogicalWrites)
|
||||
.detail("FileReads", netData.countFileLogicalReads - statState->networkState.countFileLogicalReads)
|
||||
.detail("CacheReadBytes", netData.countFileCacheReadBytes - statState->networkState.countFileCacheReadBytes)
|
||||
.detail("CacheFinds", netData.countFileCacheFinds - statState->networkState.countFileCacheFinds)
|
||||
.detail("CacheWritesBlocked", netData.countFileCacheWritesBlocked - statState->networkState.countFileCacheWritesBlocked)
|
||||
.detail("CacheReadsBlocked", netData.countFileCacheReadsBlocked - statState->networkState.countFileCacheReadsBlocked)
|
||||
.detail("CachePageReadsMerged", netData.countFileCachePageReadsMerged - statState->networkState.countFileCachePageReadsMerged)
|
||||
.detail("CacheWrites", netData.countFileCacheWrites - statState->networkState.countFileCacheWrites)
|
||||
.detail("CacheReads", netData.countFileCacheReads - statState->networkState.countFileCacheReads)
|
||||
.detail("CacheHits", netData.countFilePageCacheHits - statState->networkState.countFilePageCacheHits)
|
||||
.detail("CacheMisses", netData.countFilePageCacheMisses - statState->networkState.countFilePageCacheMisses)
|
||||
.detail("CacheEvictions", netData.countFilePageCacheEvictions - statState->networkState.countFilePageCacheEvictions)
|
.detail("ZoneID", machineState.zoneId)
.detail("MachineID", machineState.machineId)
.detail("AIOSubmitCount", netData.countAIOSubmit - statState->networkState.countAIOSubmit)
.detail("AIOCollectCount", netData.countAIOCollect - statState->networkState.countAIOCollect)
.detail("AIOSubmitLag", (g_network->networkInfo.metrics.secSquaredSubmit - statState->networkMetricsState.secSquaredSubmit) / currentStats.elapsed)
.detail("AIODiskStall", (g_network->networkInfo.metrics.secSquaredDiskStall - statState->networkMetricsState.secSquaredDiskStall) / currentStats.elapsed)
.detail("CurrentConnections", netData.countConnEstablished - netData.countConnClosedWithError - netData.countConnClosedWithoutError)
.detail("ConnectionsEstablished", (double) (netData.countConnEstablished - statState->networkState.countConnEstablished) / currentStats.elapsed)
.detail("ConnectionsClosed", ((netData.countConnClosedWithError - statState->networkState.countConnClosedWithError) + (netData.countConnClosedWithoutError - statState->networkState.countConnClosedWithoutError)) / currentStats.elapsed)
.detail("ConnectionErrors", (netData.countConnClosedWithError - statState->networkState.countConnClosedWithError) / currentStats.elapsed)
.detail("TLSPolicyFailures", (netData.countTLSPolicyFailures - statState->networkState.countTLSPolicyFailures) / currentStats.elapsed)
.trackLatest(eventName);
.detail("Elapsed", currentStats.elapsed)
.detail("CPUSeconds", currentStats.processCPUSeconds)
.detail("MainThreadCPUSeconds", currentStats.mainThreadCPUSeconds)
.detail("UptimeSeconds", now() - machineState.monitorStartTime)
.detail("Memory", currentStats.processMemory)
.detail("ResidentMemory", currentStats.processResidentMemory)
.detail("UnusedAllocatedMemory", getTotalUnusedAllocatedMemory())
.detail("MbpsSent",
((netData.bytesSent - statState->networkState.bytesSent) * 8e-6) / currentStats.elapsed)
.detail("MbpsReceived",
((netData.bytesReceived - statState->networkState.bytesReceived) * 8e-6) / currentStats.elapsed)
.detail("DiskTotalBytes", currentStats.processDiskTotalBytes)
.detail("DiskFreeBytes", currentStats.processDiskFreeBytes)
.detail("DiskQueueDepth", currentStats.processDiskQueueDepth)
.detail("DiskIdleSeconds", currentStats.processDiskIdleSeconds)
.detail("DiskReads", currentStats.processDiskRead)
.detail("DiskWrites", currentStats.processDiskWrite)
.detail("DiskReadsCount", currentStats.processDiskReadCount)
.detail("DiskWritesCount", currentStats.processDiskWriteCount)
.detail("DiskWriteSectors", currentStats.processDiskWriteSectors)
.detail("DiskReadSectors", currentStats.processDiskReadSectors)
.detail("FileWrites", netData.countFileLogicalWrites - statState->networkState.countFileLogicalWrites)
.detail("FileReads", netData.countFileLogicalReads - statState->networkState.countFileLogicalReads)
.detail("CacheReadBytes",
netData.countFileCacheReadBytes - statState->networkState.countFileCacheReadBytes)
.detail("CacheFinds", netData.countFileCacheFinds - statState->networkState.countFileCacheFinds)
.detail("CacheWritesBlocked",
netData.countFileCacheWritesBlocked - statState->networkState.countFileCacheWritesBlocked)
.detail("CacheReadsBlocked",
netData.countFileCacheReadsBlocked - statState->networkState.countFileCacheReadsBlocked)
.detail("CachePageReadsMerged",
netData.countFileCachePageReadsMerged - statState->networkState.countFileCachePageReadsMerged)
.detail("CacheWrites", netData.countFileCacheWrites - statState->networkState.countFileCacheWrites)
.detail("CacheReads", netData.countFileCacheReads - statState->networkState.countFileCacheReads)
.detail("CacheHits", netData.countFilePageCacheHits - statState->networkState.countFilePageCacheHits)
.detail("CacheMisses",
netData.countFilePageCacheMisses - statState->networkState.countFilePageCacheMisses)
.detail("CacheEvictions",
netData.countFilePageCacheEvictions - statState->networkState.countFilePageCacheEvictions)
.detail("DCID", machineState.dcId)
.detail("ZoneID", machineState.zoneId)
.detail("MachineID", machineState.machineId)
.detail("AIOSubmitCount", netData.countAIOSubmit - statState->networkState.countAIOSubmit)
.detail("AIOCollectCount", netData.countAIOCollect - statState->networkState.countAIOCollect)
.detail("AIOSubmitLag", (g_network->networkInfo.metrics.secSquaredSubmit -
statState->networkMetricsState.secSquaredSubmit) /
currentStats.elapsed)
.detail("AIODiskStall", (g_network->networkInfo.metrics.secSquaredDiskStall -
statState->networkMetricsState.secSquaredDiskStall) /
currentStats.elapsed)
.detail("CurrentConnections", netData.countConnEstablished - netData.countConnClosedWithError -
netData.countConnClosedWithoutError)
.detail("ConnectionsEstablished",
(double)(netData.countConnEstablished - statState->networkState.countConnEstablished) /
currentStats.elapsed)
.detail("ConnectionsClosed",
((netData.countConnClosedWithError - statState->networkState.countConnClosedWithError) +
(netData.countConnClosedWithoutError - statState->networkState.countConnClosedWithoutError)) /
currentStats.elapsed)
.detail("ConnectionErrors",
(netData.countConnClosedWithError - statState->networkState.countConnClosedWithError) /
currentStats.elapsed)
.detail("TLSPolicyFailures",
(netData.countTLSPolicyFailures - statState->networkState.countTLSPolicyFailures) /
currentStats.elapsed)
.trackLatest(eventName);

TraceEvent("MemoryMetrics")
.DETAILALLOCATORMEMUSAGE(16)
.DETAILALLOCATORMEMUSAGE(32)
.DETAILALLOCATORMEMUSAGE(64)
.DETAILALLOCATORMEMUSAGE(96)
.DETAILALLOCATORMEMUSAGE(128)
.DETAILALLOCATORMEMUSAGE(256)
.DETAILALLOCATORMEMUSAGE(512)
.DETAILALLOCATORMEMUSAGE(1024)
.DETAILALLOCATORMEMUSAGE(2048)
.DETAILALLOCATORMEMUSAGE(4096)
.DETAILALLOCATORMEMUSAGE(8192)
.detail("HugeArenaMemory", g_hugeArenaMemory.load());
.DETAILALLOCATORMEMUSAGE(16)
.DETAILALLOCATORMEMUSAGE(32)
.DETAILALLOCATORMEMUSAGE(64)
.DETAILALLOCATORMEMUSAGE(96)
.DETAILALLOCATORMEMUSAGE(128)
.DETAILALLOCATORMEMUSAGE(256)
.DETAILALLOCATORMEMUSAGE(512)
.DETAILALLOCATORMEMUSAGE(1024)
.DETAILALLOCATORMEMUSAGE(2048)
.DETAILALLOCATORMEMUSAGE(4096)
.DETAILALLOCATORMEMUSAGE(8192)
.detail("HugeArenaMemory", g_hugeArenaMemory.load())
.detail("DCID", machineState.dcId)
.detail("ZoneID", machineState.zoneId)
.detail("MachineID", machineState.machineId);

TraceEvent n("NetworkMetrics");
n
.detail("Elapsed", currentStats.elapsed)
.detail("CantSleep", netData.countCantSleep - statState->networkState.countCantSleep)
.detail("WontSleep", netData.countWontSleep - statState->networkState.countWontSleep)
.detail("Yields", netData.countYields - statState->networkState.countYields)
.detail("YieldCalls", netData.countYieldCalls - statState->networkState.countYieldCalls)
.detail("YieldCallsTrue", netData.countYieldCallsTrue - statState->networkState.countYieldCallsTrue)
.detail("SlowTaskSignals", netData.countSlowTaskSignals - statState->networkState.countSlowTaskSignals)
.detail("YieldBigStack", netData.countYieldBigStack - statState->networkState.countYieldBigStack)
.detail("RunLoopIterations", netData.countRunLoop - statState->networkState.countRunLoop)
.detail("TimersExecuted", netData.countTimers - statState->networkState.countTimers)
.detail("TasksExecuted", netData.countTasks - statState->networkState.countTasks)
.detail("ASIOEventsProcessed", netData.countASIOEvents - statState->networkState.countASIOEvents)
.detail("ReadCalls", netData.countReads - statState->networkState.countReads)
.detail("WriteCalls", netData.countWrites - statState->networkState.countWrites)
.detail("ReadProbes", netData.countReadProbes - statState->networkState.countReadProbes)
.detail("WriteProbes", netData.countWriteProbes - statState->networkState.countWriteProbes)
.detail("PacketsRead", netData.countPacketsReceived - statState->networkState.countPacketsReceived)
.detail("PacketsGenerated", netData.countPacketsGenerated - statState->networkState.countPacketsGenerated)
.detail("WouldBlock", netData.countWouldBlock - statState->networkState.countWouldBlock)
.detail("LaunchTime", netData.countLaunchTime - statState->networkState.countLaunchTime)
.detail("ReactTime", netData.countReactTime - statState->networkState.countReactTime);
n.detail("Elapsed", currentStats.elapsed)
.detail("CantSleep", netData.countCantSleep - statState->networkState.countCantSleep)
.detail("WontSleep", netData.countWontSleep - statState->networkState.countWontSleep)
.detail("Yields", netData.countYields - statState->networkState.countYields)
.detail("YieldCalls", netData.countYieldCalls - statState->networkState.countYieldCalls)
.detail("YieldCallsTrue", netData.countYieldCallsTrue - statState->networkState.countYieldCallsTrue)
.detail("SlowTaskSignals", netData.countSlowTaskSignals - statState->networkState.countSlowTaskSignals)
.detail("YieldBigStack", netData.countYieldBigStack - statState->networkState.countYieldBigStack)
.detail("RunLoopIterations", netData.countRunLoop - statState->networkState.countRunLoop)
.detail("TimersExecuted", netData.countTimers - statState->networkState.countTimers)
.detail("TasksExecuted", netData.countTasks - statState->networkState.countTasks)
.detail("ASIOEventsProcessed", netData.countASIOEvents - statState->networkState.countASIOEvents)
.detail("ReadCalls", netData.countReads - statState->networkState.countReads)
.detail("WriteCalls", netData.countWrites - statState->networkState.countWrites)
.detail("ReadProbes", netData.countReadProbes - statState->networkState.countReadProbes)
.detail("WriteProbes", netData.countWriteProbes - statState->networkState.countWriteProbes)
.detail("PacketsRead", netData.countPacketsReceived - statState->networkState.countPacketsReceived)
.detail("PacketsGenerated",
netData.countPacketsGenerated - statState->networkState.countPacketsGenerated)
.detail("WouldBlock", netData.countWouldBlock - statState->networkState.countWouldBlock)
.detail("LaunchTime", netData.countLaunchTime - statState->networkState.countLaunchTime)
.detail("ReactTime", netData.countReactTime - statState->networkState.countReactTime)
.detail("DCID", machineState.dcId)
.detail("ZoneID", machineState.zoneId)
.detail("MachineID", machineState.machineId);

for (int i = 0; i<NetworkMetrics::SLOW_EVENT_BINS; i++) {
if (int c = g_network->networkInfo.metrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) {
@@ -165,18 +195,20 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
}

if(machineMetrics) {
TraceEvent("MachineMetrics").detail("Elapsed", currentStats.elapsed)
.detail("MbpsSent", currentStats.machineMegabitsSent / currentStats.elapsed)
.detail("MbpsReceived", currentStats.machineMegabitsReceived / currentStats.elapsed)
.detail("OutSegs", currentStats.machineOutSegs)
.detail("RetransSegs", currentStats.machineRetransSegs)
.detail("CPUSeconds", currentStats.machineCPUSeconds)
.detail("TotalMemory", currentStats.machineTotalRAM)
.detail("CommittedMemory", currentStats.machineCommittedRAM)
.detail("AvailableMemory", currentStats.machineAvailableRAM)
.detail("ZoneID", machineState.zoneId)
.detail("MachineID", machineState.machineId)
.trackLatest("MachineMetrics");
TraceEvent("MachineMetrics")
.detail("Elapsed", currentStats.elapsed)
.detail("MbpsSent", currentStats.machineMegabitsSent / currentStats.elapsed)
.detail("MbpsReceived", currentStats.machineMegabitsReceived / currentStats.elapsed)
.detail("OutSegs", currentStats.machineOutSegs)
.detail("RetransSegs", currentStats.machineRetransSegs)
.detail("CPUSeconds", currentStats.machineCPUSeconds)
.detail("TotalMemory", currentStats.machineTotalRAM)
.detail("CommittedMemory", currentStats.machineCommittedRAM)
.detail("AvailableMemory", currentStats.machineAvailableRAM)
.detail("DCID", machineState.dcId)
.detail("ZoneID", machineState.zoneId)
.detail("MachineID", machineState.machineId)
.trackLatest("MachineMetrics");
}
}

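Every per-process field above follows one sampling pattern: customSystemMonitor keeps the previous counter snapshot in statState and emits the delta divided by the elapsed window, so each traced value is a rate for the interval rather than a lifetime total. A minimal sketch of that pattern with illustrative names (only the delta-over-elapsed idea comes from the code above):

    #include <cstdint>

    // Illustrative only: rate sampling as used by the .detail() chains above.
    struct Counters { int64_t bytesSent; };

    double sampleRate(const Counters& current, Counters& prev, double elapsed) {
        double rate = (current.bytesSent - prev.bytesSent) / elapsed; // delta over window
        prev = current; // remember this snapshot for the next sample
        return rate;
    }

The `* 8e-6` in MbpsSent/MbpsReceived is the same idea with a bytes-to-megabits conversion folded in.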
@@ -27,6 +27,7 @@

struct SystemMonitorMachineState {
Optional<std::string> folder;
Optional<Standalone<StringRef>> dcId;
Optional<Standalone<StringRef>> zoneId;
Optional<Standalone<StringRef>> machineId;
Optional<IPAddress> ip;
@@ -35,9 +36,10 @@ struct SystemMonitorMachineState {

SystemMonitorMachineState() : monitorStartTime(0) {}
explicit SystemMonitorMachineState(const IPAddress& ip) : ip(ip), monitorStartTime(0) {}
SystemMonitorMachineState(std::string folder, Optional<Standalone<StringRef>> zoneId,
Optional<Standalone<StringRef>> machineId, const IPAddress& ip)
: folder(folder), zoneId(zoneId), machineId(machineId), ip(ip), monitorStartTime(0) {}
SystemMonitorMachineState(std::string folder, Optional<Standalone<StringRef>> dcId,
Optional<Standalone<StringRef>> zoneId, Optional<Standalone<StringRef>> machineId,
const IPAddress& ip)
: folder(folder), dcId(dcId), zoneId(zoneId), machineId(machineId), ip(ip), monitorStartTime(0) {}
};

void initializeSystemMonitorMachineState(SystemMonitorMachineState machineState);
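The new four-argument constructor slots the datacenter ID in front of the zone ID. A sketch of a call site under that signature; the folder, IDs, and address below are hypothetical placeholders, not values from this diff:

    // All values are hypothetical; only the parameter order comes from the diff.
    SystemMonitorMachineState machineState("/var/lib/foundationdb/data",
                                           Standalone<StringRef>(LiteralStringRef("dc0")),   // dcId (new)
                                           Standalone<StringRef>(LiteralStringRef("zone0")), // zoneId
                                           Standalone<StringRef>(LiteralStringRef("m0")),    // machineId
                                           IPAddress(0x7f000001));                           // 127.0.0.1
    initializeSystemMonitorMachineState(machineState);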
@@ -29,7 +29,8 @@
#include <stdarg.h>
#include <cctype>
#include <time.h>

#include <set>
#include <iomanip>
#include "flow/IThreadPool.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/FastRef.h"
@@ -422,6 +423,7 @@ public:

if (roll) {
auto o = new WriterThread::Roll;
double time = 0;
writer->post(o);

std::vector<TraceEventFields> events = latestEventCache.getAllUnsafe();
@@ -430,9 +432,15 @@
TraceEventFields rolledFields;
for(auto itr = events[idx].begin(); itr != events[idx].end(); ++itr) {
if(itr->first == "Time") {
rolledFields.addField("Time", format("%.6f", TraceEvent::getCurrentTime()));
time = TraceEvent::getCurrentTime();
rolledFields.addField("Time", format("%.6f", time));
rolledFields.addField("OriginalTime", itr->second);
}
else if (itr->first == "DateTime") {
UNSTOPPABLE_ASSERT(time > 0); // "Time" field should always come first
rolledFields.addField("DateTime", TraceEvent::printRealTime(time));
rolledFields.addField("OriginalDateTime", itr->second);
}
else if(itr->first == "TrackLatestType") {
rolledFields.addField("TrackLatestType", "Rolled");
}
@@ -676,6 +684,13 @@ TraceEvent::TraceEvent(TraceEvent &&ev) {
tmpEventMetric = ev.tmpEventMetric;
trackingKey = ev.trackingKey;
type = ev.type;
timeIndex = ev.timeIndex;

for (int i = 0; i < 5; i++) {
eventCounts[i] = ev.eventCounts[i];
}

networkThread = ev.networkThread;

ev.initialized = true;
ev.enabled = false;
@@ -684,6 +699,7 @@ TraceEvent::TraceEvent(TraceEvent &&ev) {
}

TraceEvent& TraceEvent::operator=(TraceEvent &&ev) {
// Note: still broken if ev and this are the same memory address.
enabled = ev.enabled;
err = ev.err;
fields = std::move(ev.fields);
@@ -696,6 +712,13 @@ TraceEvent& TraceEvent::operator=(TraceEvent &&ev) {
tmpEventMetric = ev.tmpEventMetric;
trackingKey = ev.trackingKey;
type = ev.type;
timeIndex = ev.timeIndex;

for (int i = 0; i < 5; i++) {
eventCounts[i] = ev.eventCounts[i];
}

networkThread = ev.networkThread;

ev.initialized = true;
ev.enabled = false;
@@ -782,6 +805,9 @@ bool TraceEvent::init() {
detail("Severity", int(severity));
detail("Time", "0.000000");
timeIndex = fields.size() - 1;
if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) {
detail("DateTime", "");
}

detail("Type", type);
if(g_network && g_network->isSimulated()) {
@@ -982,7 +1008,7 @@ TraceEvent& TraceEvent::GetLastError() {

// We're cheating in counting, as in practice, we only use {10,20,30,40}.
static_assert(SevMaxUsed / 10 + 1 == 5, "Please bump eventCounts[5] to SevMaxUsed/10+1");
unsigned long TraceEvent::eventCounts[5] = {0,0,0,0,0};
unsigned long TraceEvent::eventCounts[5] = { 0, 0, 0, 0, 0 };

unsigned long TraceEvent::CountEventsLoggedAt(Severity sev) {
return TraceEvent::eventCounts[sev/10];
@@ -1000,7 +1026,11 @@ void TraceEvent::log() {
++g_allocation_tracing_disabled;
try {
if (enabled) {
fields.mutate(timeIndex).second = format("%.6f", TraceEvent::getCurrentTime());
double time = TraceEvent::getCurrentTime();
fields.mutate(timeIndex).second = format("%.6f", time);
if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) {
fields.mutate(timeIndex + 1).second = TraceEvent::printRealTime(time);
}

if (this->severity == SevError) {
severity = SevInfo;
@@ -1071,6 +1101,31 @@ double TraceEvent::getCurrentTime() {
}
}

// converts the given flow time into a string
// with format: %Y-%m-%dT%H:%M:%S
// This only has second-resolution for the simple reason
// that std::put_time does not support higher resolution.
// This is fine since we always log the flow time as well.
std::string TraceEvent::printRealTime(double time) {
using Clock = std::chrono::system_clock;
time_t ts = Clock::to_time_t(Clock::time_point(
std::chrono::duration_cast<Clock::duration>(std::chrono::duration<double, std::ratio<1>>(time))));
if (g_network && g_network->isSimulated()) {
// The clock is simulated, so return the real time
ts = Clock::to_time_t(Clock::now());
}
std::stringstream ss;
#ifdef _WIN32
// MSVC gmtime is threadsafe
ss << std::put_time(::gmtime(&ts), "%Y-%m-%dT%H:%M:%SZ");
#else
// use threadsafe gmt
struct tm result;
ss << std::put_time(::gmtime_r(&ts, &result), "%Y-%m-%dT%H:%M:%SZ");
#endif
return ss.str();
}

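Note that printRealTime is UTC and deliberately second-granular; the sub-second part lives in the Time field it accompanies. A quick sketch of the expected output shape (outside simulation, where the wall clock is not substituted):

    // 0.0 seconds after the Unix epoch formats as the epoch itself, in UTC.
    ASSERT(TraceEvent::printRealTime(0.0) == "1970-01-01T00:00:00Z");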
TraceInterval& TraceInterval::begin() {
pairID = nondeterministicRandom()->randomUniqueID();
count = 0;
@@ -1138,6 +1193,9 @@ void TraceBatch::dump() {
TraceBatch::EventInfo::EventInfo(double time, const char *name, uint64_t id, const char *location) {
fields.addField("Severity", format("%d", (int)SevInfo));
fields.addField("Time", format("%.6f", time));
if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) {
fields.addField("DateTime", TraceEvent::printRealTime(time));
}
fields.addField("Type", name);
fields.addField("ID", format("%016" PRIx64, id));
fields.addField("Location", location);
@@ -1146,6 +1204,9 @@ TraceBatch::EventInfo::EventInfo(double time, const char *name, uint64_t id, con
TraceBatch::AttachInfo::AttachInfo(double time, const char *name, uint64_t id, uint64_t to) {
fields.addField("Severity", format("%d", (int)SevInfo));
fields.addField("Time", format("%.6f", time));
if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) {
fields.addField("DateTime", TraceEvent::printRealTime(time));
}
fields.addField("Type", name);
fields.addField("ID", format("%016" PRIx64, id));
fields.addField("To", format("%016" PRIx64, to));
@@ -1154,6 +1215,9 @@ TraceBatch::AttachInfo::AttachInfo(double time, const char *name, uint64_t id, u
TraceBatch::BuggifyInfo::BuggifyInfo(double time, int activated, int line, std::string file) {
fields.addField("Severity", format("%d", (int)SevInfo));
fields.addField("Time", format("%.6f", time));
if (FLOW_KNOBS->TRACE_DATETIME_ENABLED) {
fields.addField("DateTime", TraceEvent::printRealTime(time));
}
fields.addField("Type", "BuggifySection");
fields.addField("Activated", format("%d", activated));
fields.addField("File", std::move(file));
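All three TraceBatch constructors gate the extra field on the same FLOW_KNOBS->TRACE_DATETIME_ENABLED knob as TraceEvent::init(), so batched and ordinary events stay uniform. With the knob on, an emitted XML trace line would carry both clocks side by side, roughly like this (attribute values invented for illustration):

    <Event Severity="10" Time="1586217600.000000" DateTime="2020-04-07T00:00:00Z" Type="Net2Starting" ... />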
@@ -26,6 +26,7 @@
#include <stdarg.h>
#include <stdint.h>
#include <string>
#include <chrono>
#include <map>
#include <type_traits>
#include "flow/IRandom.h"
@@ -388,6 +389,7 @@ struct TraceEvent {
static bool isNetworkThread();

static double getCurrentTime();
static std::string printRealTime(double time);

//Must be called directly after constructing the trace event
TraceEvent& error(const class Error& e, bool includeCancelled=false) {
@@ -86,6 +86,7 @@ ERROR( please_reboot_delete, 1208, "Reboot of server process requested, with del
ERROR( master_proxy_failed, 1209, "Master terminating because a Proxy failed" )
ERROR( master_resolver_failed, 1210, "Master terminating because a Resolver failed" )
ERROR( server_overloaded, 1211, "Server is under too much load and cannot respond" )
ERROR( dd_tracker_cancelled, 1215, "The data distribution tracker has been cancelled" )

// 15xx Platform errors
ERROR( platform_error, 1500, "Platform error" )
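Each ERROR() entry expands into a generated factory function and error-code constant, so the new 1211 code is thrown and matched like any other flow error. A sketch of typical use (the surrounding function names are illustrative):

    // Illustrative only: server_overloaded() and error_code_server_overloaded
    // are generated from the ERROR() entry above.
    void admitRequest(bool tooBusy) {
        if (tooBusy) throw server_overloaded();
    }

    bool shouldClientRetryLater(const Error& e) {
        return e.code() == error_code_server_overloaded;
    }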
@@ -22,11 +22,13 @@
<ClCompile Include="FileTraceLogWriter.cpp" />
<ClCompile Include="XmlTraceLogFormatter.cpp" />
<ClCompile Include="JsonTraceLogFormatter.cpp" />
<ClCompile Include="Histogram.cpp" />
<ClInclude Include="FileTraceLogWriter.h" />
<ClInclude Include="XmlTraceLogFormatter.h" />
<ClInclude Include="JsonTraceLogFormatter.h" />
<ClInclude Include="MetricSample.h" />
<ClInclude Include="Profiler.h" />
<ClInclude Include="Histogram.h" />
<ActorCompiler Include="Profiler.actor.cpp" />
<ActorCompiler Include="Net2.actor.cpp" />
<ClCompile Include="IThreadPool.cpp" />
@@ -30,8 +30,9 @@
#ifndef TLS_DISABLED
#include "boost/asio/ssl.hpp"
#endif
#include "flow/serialize.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"

enum class TaskPriority {
Max = 1000000,
@@ -111,8 +112,6 @@ inline TaskPriority incrementPriorityIfEven(TaskPriority p) {

class Void;

template<class T> class Optional;

struct IPAddress {
typedef boost::asio::ip::address_v6::bytes_type IPAddressStore;
static_assert(std::is_same<IPAddressStore, std::array<uint8_t, 16>>::value,
@@ -351,6 +350,12 @@ public:
virtual Future<int64_t> read() = 0;
};

struct SendBuffer {
uint8_t const* data;
SendBuffer* next;
int bytes_written, bytes_sent;
};

class IConnection {
public:
// IConnection is reference-counted (use Reference<IConnection>), but the caller must explicitly call close()
@@ -30,6 +30,7 @@
#include "flow/Arena.h"
#include "flow/FileIdentifier.h"
#include "flow/ObjectSerializer.h"
#include "flow/network.h"
#include <algorithm>

// Though similar, is_binary_serializable cannot be replaced by std::is_pod, as doing so would prefer
@@ -664,24 +665,20 @@ private:
ProtocolVersion m_protocolVersion;
};

struct SendBuffer {
uint8_t const* data;
SendBuffer* next;
int bytes_written, bytes_sent;
};

struct PacketBuffer : SendBuffer {
private:
static constexpr size_t PACKET_BUFFER_OVERHEAD = 40;
int reference_count;
uint32_t size_;
static constexpr size_t PACKET_BUFFER_OVERHEAD = 32;
uint32_t const size_;

public:
double const enqueue_time;

uint8_t* data() { return const_cast<uint8_t*>(static_cast<SendBuffer*>(this)->data); }
size_t size() { return size_; }

private:
explicit PacketBuffer(size_t size) : reference_count(1), size_(size) {
explicit PacketBuffer(size_t size) : reference_count(1), size_(size), enqueue_time(g_network->now()) {
next = 0;
bytes_written = bytes_sent = 0;
((SendBuffer*)this)->data = reinterpret_cast<uint8_t*>(this + 1);
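Moving SendBuffer next to IConnection and stamping every PacketBuffer with enqueue_time (set to g_network->now() at construction) lets the transport observe how long a packet sat queued before reaching the wire. A sketch of the measurement this enables (the helper is illustrative, not part of the diff):

    // Illustrative: seconds a buffer has been queued since construction.
    double queuedSeconds(const PacketBuffer& b) {
        return g_network->now() - b.enqueue_time;
    }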
@@ -32,7 +32,7 @@

<Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
<Product Name='$(var.Title)'
Id='{7DDBF1DA-C17A-4519-A893-6CED9B1D9B5A}'
Id='{88AA3058-920F-4DB3-8E3E-492E35F13DDE}'
UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
Version='$(var.Version)'
Manufacturer='$(var.Manufacturer)'
@@ -1,7 +1,7 @@
<?xml version="1.0"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Version>6.2.28</Version>
<Version>6.2.29</Version>
<PackageName>6.2</PackageName>
</PropertyGroup>
</Project>