From 88e765b9e6e9b45941fc9847dbf764ec4f1d84e0 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 14 Jun 2019 15:11:34 -0700 Subject: [PATCH 001/136] Fix: the binding tester was taking the min() of a list of tuples, but that could fail if the tuple contained incomparable types. Instead, use fdb.tuple.compare() to do the comparison. --- bindings/bindingtester/bindingtester.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 5a60d1112a..559244233b 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -68,6 +68,10 @@ class ResultSet(object): self.tester_results[name] = results + @staticmethod + def _min_tuple(t1, t2): + return t1 if fdb.tuple.compare(t1, t2) < 0 else t2 + def check_for_errors(self): if len(self.tester_results) == 1: return (0, False) @@ -97,7 +101,7 @@ class ResultSet(object): # If these results aren't using sequence numbers, then we match two results based on whether they share the same key else: - min_key = min([r.key(self.specification) for r in results.values()]) + min_key = reduce(ResultSet._min_tuple, [r.key(self.specification) for r in results.values()]) results = {i: r for i, r in results.items() if Result.tuples_match(r.key(self.specification), min_key)} # Increment the indices for those testers which produced a result in this iteration From 7b12374a87d11ea103e9be89a13766685d5c8590 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Tue, 18 Jun 2019 18:36:12 -0700 Subject: [PATCH 002/136] Fixes #1690: Server docker image hard-codes 4500 in a few places This makes the default public port for starting FDB processes the same as the FDB_PORT. This is probably necessary given #1714, especially for coordinators, though it might not be necessary for other processes in the cluster. 
This can *almost* be used to start up multiple FDB processes locally and then access them from the same machine, but that (unfortunately) requires both the other processes in the docker compose network and the host machine to agree on what IP to use for the coordinator. But as that machine has different IPs in those networks, they cannot be made to agree. --- packaging/docker/Dockerfile | 1 + packaging/docker/README.md | 9 +++- packaging/docker/create_cluster_file.bash | 4 +- .../docker/create_server_environment.bash | 2 +- packaging/docker/fdb.bash | 6 +-- packaging/docker/samples/local/README.md | 45 +++++++++++++++++++ .../docker/samples/local/docker-compose.yml | 32 +++++++++++++ packaging/docker/samples/local/start.bash | 39 ++++++++++++++++ packaging/docker/samples/local/stop.bash | 28 ++++++++++++ .../docker/samples/python/app/Dockerfile | 8 ++-- .../docker/samples/python/docker-compose.yml | 28 +++++++++--- 11 files changed, 185 insertions(+), 17 deletions(-) create mode 100644 packaging/docker/samples/local/README.md create mode 100644 packaging/docker/samples/local/docker-compose.yml create mode 100755 packaging/docker/samples/local/start.bash create mode 100755 packaging/docker/samples/local/stop.bash diff --git a/packaging/docker/Dockerfile b/packaging/docker/Dockerfile index 101ba295ab..dc514870f3 100644 --- a/packaging/docker/Dockerfile +++ b/packaging/docker/Dockerfile @@ -70,5 +70,6 @@ ENV FDB_PORT 4500 ENV FDB_CLUSTER_FILE /var/fdb/fdb.cluster ENV FDB_NETWORKING_MODE container ENV FDB_COORDINATOR "" +ENV FDB_COORDINATOR_PORT 4500 ENV FDB_CLUSTER_FILE_CONTENTS "" ENV FDB_PROCESS_CLASS unset diff --git a/packaging/docker/README.md b/packaging/docker/README.md index a8d6f48de8..39fc94844a 100644 --- a/packaging/docker/README.md +++ b/packaging/docker/README.md @@ -57,6 +57,13 @@ helpful when setting up a larger cluster inside a docker network, for instance when using Docker Compose. 
The name you provide must be resolvable through the DNS on the container you are running. +### FDB_COORDINATOR_PORT + +The port to use for connecting to the FDB coordinator process. This should be +set by other processes in a multi-process cluster to the same value as the +`FDB_PORT` environment variable of the coordinator process. It will default +to 4500, which is also the default for `FDB_PORT`. + # Copying Into Other Images You can also use this image to provide files for images that are clients of a @@ -68,4 +75,4 @@ files you may want to copy are: library, which you can use if you are setting up a multiversion client. * `/var/fdb/scripts/create_cluster_file.bash`: A script for setting up the cluster file based on an `FDB_COORDINATOR` environment variable. -* `/usr/bin/fdbcli`: The FoundationDB CLI. \ No newline at end of file +* `/usr/bin/fdbcli`: The FoundationDB CLI. diff --git a/packaging/docker/create_cluster_file.bash b/packaging/docker/create_cluster_file.bash index b701b03d1a..863ca43ac8 100644 --- a/packaging/docker/create_cluster_file.bash +++ b/packaging/docker/create_cluster_file.bash @@ -39,7 +39,7 @@ function create_cluster_file() { echo "Failed to look up coordinator address for $FDB_COORDINATOR" 1>&2 exit 1 fi - echo "docker:docker@$coordinator_ip:4500" > $FDB_CLUSTER_FILE + echo "docker:docker@$coordinator_ip:$FDB_COORDINATOR_PORT" > $FDB_CLUSTER_FILE else echo "FDB_COORDINATOR environment variable not defined" 1>&2 exit 1 @@ -48,4 +48,4 @@ function create_cluster_file() { if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then create_cluster_file "$@" -fi \ No newline at end of file +fi diff --git a/packaging/docker/create_server_environment.bash b/packaging/docker/create_server_environment.bash index 67979839b9..54d90f0854 100644 --- a/packaging/docker/create_server_environment.bash +++ b/packaging/docker/create_server_environment.bash @@ -43,4 +43,4 @@ function create_server_environment() { fi create_cluster_file -} \ No newline at end of file +} diff 
--git a/packaging/docker/fdb.bash b/packaging/docker/fdb.bash index 3fb322c431..3bf1c6a680 100644 --- a/packaging/docker/fdb.bash +++ b/packaging/docker/fdb.bash @@ -23,7 +23,7 @@ source /var/fdb/scripts/create_server_environment.bash create_server_environment source /var/fdb/.fdbenv -echo "Starting FDB server on $PUBLIC_IP:4500" -fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:4500 \ +echo "Starting FDB server on $PUBLIC_IP:$FDB_PORT" +fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:$FDB_PORT \ --datadir /var/fdb/data --logdir /var/fdb/logs \ - --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS \ No newline at end of file + --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS diff --git a/packaging/docker/samples/local/README.md b/packaging/docker/samples/local/README.md new file mode 100644 index 0000000000..f7f5b3e979 --- /dev/null +++ b/packaging/docker/samples/local/README.md @@ -0,0 +1,45 @@ +# Local Docker-based FoundationDB Cluster + +This contains a sample `docker-compose.yaml` and some simple startup and teardown +scripts for running a simple single-instance FoundationDB using the Docker image +specified in this repository. This uses the `host` networking option to expose +the server process to its host machine. + +This depends on having the FoundationDB client installed on your host machine +to work properly. This can be done using one of the client packages available +on our [Download](https://www.foundationdb.org/download/) page. The startup +scripts included here depend on `fdbcli` from one of those packages, and any +client that wishes to connect will need a copy of the FoundationDB native client +in addition to its binding of choice. 
Both the CLI and the native client
+are installed in all of our client packages.
+
+Once those dependencies are installed, one can build the FoundationDB Docker
+image:
+
+```
+docker build --build-arg FDB_VERSION=6.1.8 -t foundationdb:6.1.8 ../..
+```
+
+Then one can start the cluster by running:
+
+```
+./start.bash
+```
+
+This starts up a single instance FoundationDB cluster using the `docker-compose.yaml`
+and configures it as a new database. This will write the cluster file information to
+`docker.cluster`. One should then be able to access the cluster through the CLI
+or one of the bindings by using this cluster file. For example:
+
+```
+fdbcli --exec status -C docker.cluster
+```
+
+To stop the cluster, one can run:
+
+```
+./stop.bash
+```
+
+Note that all data are lost between reboots of the processes as they have not
+been configured to use a persistent volume (but write to Docker's temporary file system).
diff --git a/packaging/docker/samples/local/docker-compose.yml b/packaging/docker/samples/local/docker-compose.yml
new file mode 100644
index 0000000000..3ce177afb5
--- /dev/null
+++ b/packaging/docker/samples/local/docker-compose.yml
@@ -0,0 +1,32 @@
+# docker-compose.yaml
+#
+# This source file is part of the FoundationDB open source project
+#
+# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Specification for a one node cluster that can be accessed from the host.
+# The user must specify the FDB_PORT on which it is run. + +version: '3' +services: + fdb: + image: foundationdb:6.1.8 + ports: + - $FDB_PORT:$FDB_PORT/tcp + environment: + FDB_NETWORKING_MODE: host + FDB_COORDINATOR_PORT: $FDB_PORT + FDB_PORT: $FDB_PORT diff --git a/packaging/docker/samples/local/start.bash b/packaging/docker/samples/local/start.bash new file mode 100755 index 0000000000..64def42f51 --- /dev/null +++ b/packaging/docker/samples/local/start.bash @@ -0,0 +1,39 @@ +#! /bin/bash + +# +# start.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu + +FDB_CLUSTER_FILE="${FDB_CLUSTER_FILE:-docker.cluster}" +FDB_PORT="${FDB_PORT:-4550}" + +FDB_PORT=$FDB_PORT docker-compose up -d fdb +echo "docker:docker@127.0.0.1:$FDB_PORT" > $FDB_CLUSTER_FILE + +# Attempt to connect. Configure the database if necessary. +if ! fdbcli -C $FDB_CLUSTER_FILE --exec status --timeout 1 ; then + if ! fdbcli -C $FDB_CLUSTER_FILE --exec "configure new single memory ; status" --timeout 10 ; then + echo "Unable to configure new FDB cluster." + exit 1 + fi +fi + +echo "Can now connect to docker-based FDB cluster using $FDB_CLUSTER_FILE." 
diff --git a/packaging/docker/samples/local/stop.bash b/packaging/docker/samples/local/stop.bash new file mode 100755 index 0000000000..55acc50953 --- /dev/null +++ b/packaging/docker/samples/local/stop.bash @@ -0,0 +1,28 @@ +#! /bin/bash + +# +# stop.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu + +FDB_PORT="${FDB_PORT:-4550}" + +FDB_PORT=$FDB_PORT docker-compose down +echo "Docker-based FDB cluster is now down." 
diff --git a/packaging/docker/samples/python/app/Dockerfile b/packaging/docker/samples/python/app/Dockerfile index 8172f5aaea..7a3ed818a2 100644 --- a/packaging/docker/samples/python/app/Dockerfile +++ b/packaging/docker/samples/python/app/Dockerfile @@ -24,9 +24,9 @@ RUN apt-get update; apt-get install -y dnsutils RUN mkdir -p /app WORKDIR /app -COPY --from=foundationdb:5.2.5 /usr/lib/libfdb_c.so /usr/lib -COPY --from=foundationdb:5.2.5 /usr/bin/fdbcli /usr/bin/ -COPY --from=foundationdb:5.2.5 /var/fdb/scripts/create_cluster_file.bash /app +COPY --from=foundationdb:6.1.8 /usr/lib/libfdb_c.so /usr/lib +COPY --from=foundationdb:6.1.8 /usr/bin/fdbcli /usr/bin/ +COPY --from=foundationdb:6.1.8 /var/fdb/scripts/create_cluster_file.bash /app COPY requirements.txt /app RUN pip install -r requirements.txt @@ -38,4 +38,4 @@ RUN chmod u+x /app/start.bash CMD /app/start.bash ENV FLASK_APP=server.py -ENV FLASK_ENV=development \ No newline at end of file +ENV FLASK_ENV=development diff --git a/packaging/docker/samples/python/docker-compose.yml b/packaging/docker/samples/python/docker-compose.yml index 2280414688..34c62914a1 100644 --- a/packaging/docker/samples/python/docker-compose.yml +++ b/packaging/docker/samples/python/docker-compose.yml @@ -19,18 +19,34 @@ version: '3' services: - fdb: - image: foundationdb:5.2.5 - environment: - FDB_COORDINATOR: fdb-coordinator + # Specify three fdbserver processes. fdb-coordinator: - image: foundationdb:5.2.5 + image: foundationdb:6.1.8 environment: FDB_COORDINATOR: fdb-coordinator + fdb-server-1: + depends_on: + - fdb-coordinator + image: foundationdb:6.1.8 + environment: + FDB_COORDINATOR: fdb-coordinator + fdb-server-2: + depends_on: + - fdb-coordinator + image: foundationdb:6.1.8 + environment: + FDB_COORDINATOR: fdb-coordinator + + # Bring up the application so that it depends on the cluster. 
app: + depends_on: + - fdb-coordinator + - fdb-server-1 + - fdb-server-2 build: context: app ports: - 5000:5000 environment: - FDB_COORDINATOR: fdb-coordinator \ No newline at end of file + FDB_COORDINATOR: fdb-coordinator + FDB_COORDINATOR_PORT: 4550 From e0be6314145688e01c97720812c74dd9a823eb03 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 19 Jun 2019 18:15:09 -0700 Subject: [PATCH 003/136] shard the txs tag so that more transaction logs are involved in its recovery --- fdbclient/FDBTypes.h | 1 + fdbserver/LogSystem.h | 24 +++-- fdbserver/LogSystemDiskQueueAdapter.actor.cpp | 12 +-- fdbserver/LogSystemDiskQueueAdapter.h | 15 ++- fdbserver/LogSystemPeekCursor.actor.cpp | 17 ++-- fdbserver/MasterProxyServer.actor.cpp | 10 +- fdbserver/OldTLogServer_4_6.actor.cpp | 2 +- fdbserver/OldTLogServer_6_0.actor.cpp | 35 ++++--- fdbserver/TLogServer.actor.cpp | 49 ++++++---- fdbserver/TagPartitionedLogSystem.actor.cpp | 97 ++++++++++++++++--- fdbserver/WorkerInterface.actor.h | 3 +- fdbserver/masterserver.actor.cpp | 11 ++- 12 files changed, 193 insertions(+), 83 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index edb83f5f92..ee19f11961 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -43,6 +43,7 @@ enum { tagLocalityUpgraded = -4, tagLocalitySatellite = -5, tagLocalityLogRouterMapped = -6, + tagLocalityTxs = -7, tagLocalityInvalid = -99 }; //The TLog and LogRouter require these number to be as compact as possible diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index dad3779938..caa957f8d6 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -89,9 +89,9 @@ public: return result; } - void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags) { + void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) { satelliteTagLocations.clear(); - satelliteTagLocations.resize(std::max(logRouterTags,oldLogRouterTags) + 1); + 
satelliteTagLocations.resize(std::max({logRouterTags,oldLogRouterTags,txsTags,oldTxsTags})+1); std::map server_usedBest; std::set> used_servers; @@ -235,7 +235,7 @@ public: bool allLocations = false) { if(locality == tagLocalitySatellite) { for(auto& t : tags) { - if(t == txsTag || t.locality == tagLocalityLogRouter) { + if(t == txsTag || t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) { for(int loc : satelliteTagLocations[t == txsTag ? 0 : t.id + 1]) { locations.push_back(locationOffset + loc); } @@ -520,8 +520,9 @@ struct ILogSystem { std::vector> cursors; std::vector epochEnds; Version poppedVersion; + bool needsPopped; - MultiCursor( std::vector> cursors, std::vector epochEnds ); + MultiCursor( std::vector> cursors, std::vector epochEnds, bool needsPopped = true ); virtual Reference cloneNoMore(); virtual void setProtocolVersion( ProtocolVersion version ); @@ -575,13 +576,14 @@ struct ILogSystem { LogMessageVersion messageVersion; Version end; bool hasNextMessage; + bool withTags; //FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade. bool collectTags; std::vector tags; void combineMessages(); - BufferedCursor( std::vector> cursors, Version begin, Version end, bool collectTags ); + BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags = false ); virtual Reference cloneNoMore(); virtual void setProtocolVersion( ProtocolVersion version ); @@ -652,13 +654,15 @@ struct ILogSystem { // Same contract as peek(), but can only peek from the logs elected in the same generation. // If the preferred log server is down, a different log from the same generation will merge results locally before sending them to the log router. 
- virtual Reference peekSpecial( UID dbgid, Version begin, Tag tag, int8_t peekLocality, Version localEnd ) = 0; - // Same contract as peek(), but it allows specifying a preferred peek locality for tags that do not have locality + virtual Reference peekTxs( UID dbgid, Version begin, int8_t peekLocality, Version localEnd ) = 0; + // Same contract as peek(), but only for peeking the txsLocality. It allows specifying a preferred peek locality. virtual Version getKnownCommittedVersion() = 0; virtual Future onKnownCommittedVersionChange() = 0; + virtual void popTxs( Version upTo, int8_t popLocality = tagLocalityInvalid ) = 0; + virtual void pop( Version upTo, Tag tag, Version knownCommittedVersion = 0, int8_t popLocality = tagLocalityInvalid ) = 0; // Permits, but does not require, the log subsystem to strip `tag` from any or all messages with message versions < (upTo,0) // The popping of any given message may be arbitrarily delayed. @@ -705,6 +709,8 @@ struct ILogSystem { virtual Tag getRandomRouterTag() = 0; + virtual Tag getRandomTxsTag() = 0; + virtual void stopRejoins() = 0; // Returns the pseudo tag to be popped for the given process class. If the @@ -752,6 +758,10 @@ struct LogPushData : NonCopyable { } } + void addTxsTag() { + next_message_tags.push_back( logSystem->getRandomTxsTag() ); + } + // addTag() adds a tag for the *next* message to be added void addTag( Tag tag ) { next_message_tags.push_back( tag ); diff --git a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp index 9fed1af178..b145b8db84 100644 --- a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp +++ b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp @@ -42,19 +42,19 @@ public: break; } when( wait( self->localityChanged ) ) { - self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? 
self->peekLocality->get().knownCommittedVersion : invalidVersion ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); self->localityChanged = self->peekLocality->onChange(); } when( wait( delay(self->peekTypeSwitches==0 ? SERVER_KNOBS->DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME : SERVER_KNOBS->DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME)) ) { self->peekTypeSwitches++; if(self->peekTypeSwitches%3==1) { - self->cursor = self->logSystem->peek( UID(), self->recoveryLoc, self->tag, true ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, tagLocalityInvalid, invalidVersion ); self->localityChanged = Never(); } else if(self->peekTypeSwitches%3==2) { - self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().secondaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().secondaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); self->localityChanged = self->peekLocality->onChange(); } else { - self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? 
self->peekLocality->get().knownCommittedVersion : invalidVersion ); self->localityChanged = self->peekLocality->onChange(); } } @@ -168,6 +168,6 @@ Future LogSystemDiskQueueAdapter::getC return pcm.getFuture(); } -LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Tag tag, Reference> peekLocality ) { - return new LogSystemDiskQueueAdapter( logSystem, tag, peekLocality ); +LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Reference> peekLocality ) { + return new LogSystemDiskQueueAdapter( logSystem, peekLocality ); } diff --git a/fdbserver/LogSystemDiskQueueAdapter.h b/fdbserver/LogSystemDiskQueueAdapter.h index c4ebc2ccbe..d652ba9a5b 100644 --- a/fdbserver/LogSystemDiskQueueAdapter.h +++ b/fdbserver/LogSystemDiskQueueAdapter.h @@ -25,16 +25,16 @@ #include "fdbclient/FDBTypes.h" #include "fdbserver/IDiskQueue.h" -struct PeekSpecialInfo { +struct PeekTxsInfo { int8_t primaryLocality; int8_t secondaryLocality; Version knownCommittedVersion; - bool operator == (const PeekSpecialInfo& r) const { + bool operator == (const PeekTxsInfo& r) const { return primaryLocality == r.primaryLocality && secondaryLocality == r.secondaryLocality && knownCommittedVersion == r.knownCommittedVersion; } - PeekSpecialInfo(int8_t primaryLocality, int8_t secondaryLocality, Version knownCommittedVersion) : primaryLocality(primaryLocality), secondaryLocality(secondaryLocality), knownCommittedVersion(knownCommittedVersion) {} + PeekTxsInfo(int8_t primaryLocality, int8_t secondaryLocality, Version knownCommittedVersion) : primaryLocality(primaryLocality), secondaryLocality(secondaryLocality), knownCommittedVersion(knownCommittedVersion) {} }; class LogSystemDiskQueueAdapter : public IDiskQueue { @@ -52,10 +52,10 @@ public: // It does, however, peek the specified tag directly at recovery time. 
- LogSystemDiskQueueAdapter( Reference logSystem, Tag tag, Reference> peekLocality, bool recover=true ) : logSystem(logSystem), tag(tag), peekLocality(peekLocality), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0), peekTypeSwitches(0) { + LogSystemDiskQueueAdapter( Reference logSystem, Reference> peekLocality, bool recover=true ) : logSystem(logSystem), peekLocality(peekLocality), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0), peekTypeSwitches(0) { if (enableRecovery) { localityChanged = peekLocality ? peekLocality->onChange() : Never(); - cursor = logSystem->peekSpecial( UID(), 1, tag, peekLocality ? peekLocality->get().primaryLocality : tagLocalityInvalid, peekLocality ? peekLocality->get().knownCommittedVersion : invalidVersion ); + cursor = logSystem->peekTxs( UID(), 1, peekLocality ? peekLocality->get().primaryLocality : tagLocalityInvalid, peekLocality ? peekLocality->get().knownCommittedVersion : invalidVersion ); } } @@ -92,11 +92,10 @@ public: virtual int getCommitOverhead() { return 0; } //SOMEDAY: could this be more accurate? 
private: - Reference> peekLocality; + Reference> peekLocality; Future localityChanged; Reference cursor; int peekTypeSwitches; - Tag tag; // Recovery state (used while readNext() is being called repeatedly) bool enableRecovery; @@ -114,6 +113,6 @@ private: friend class LogSystemDiskQueueAdapterImpl; }; -LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Tag tag, Reference> peekLocality ); +LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Reference> peekLocality ); #endif diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index ecf1877536..e58d365204 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -797,7 +797,7 @@ Version ILogSystem::SetPeekCursor::popped() { return poppedVersion; } -ILogSystem::MultiCursor::MultiCursor( std::vector> cursors, std::vector epochEnds ) : cursors(cursors), epochEnds(epochEnds), poppedVersion(0) { +ILogSystem::MultiCursor::MultiCursor( std::vector> cursors, std::vector epochEnds, bool needsPopped ) : cursors(cursors), epochEnds(epochEnds), needsPopped(needsPopped), poppedVersion(0) { for(int i = 0; i < std::min(cursors.size(),SERVER_KNOBS->MULTI_CURSOR_PRE_FETCH_LIMIT); i++) { cursors[cursors.size()-i-1]->getMore(); } @@ -841,7 +841,7 @@ const std::vector& ILogSystem::MultiCursor::getTags() { void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { while( cursors.size() > 1 && n >= epochEnds.back() ) { - poppedVersion = std::max(poppedVersion, cursors.back()->popped()); + if(needsPopped) poppedVersion = std::max(poppedVersion, cursors.back()->popped()); cursors.pop_back(); epochEnds.pop_back(); } @@ -851,7 +851,7 @@ void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { Future ILogSystem::MultiCursor::getMore(int taskID) { LogMessageVersion startVersion = cursors.back()->version(); while( cursors.size() > 1 && cursors.back()->version() >= epochEnds.back() ) { - poppedVersion = 
std::max(poppedVersion, cursors.back()->popped()); + if(needsPopped) poppedVersion = std::max(poppedVersion, cursors.back()->popped()); cursors.pop_back(); epochEnds.pop_back(); } @@ -882,10 +882,11 @@ Version ILogSystem::MultiCursor::getMinKnownCommittedVersion() { } Version ILogSystem::MultiCursor::popped() { + ASSERT(needsPopped); return std::max(poppedVersion, cursors.back()->popped()); } -ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool collectTags ) : cursors(cursors), messageVersion(begin), end(end), collectTags(collectTags), hasNextMessage(false), messageIndex(0) { +ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0) { messages.reserve(10000); } @@ -948,15 +949,17 @@ void ILogSystem::BufferedCursor::nextMessage() { } StringRef ILogSystem::BufferedCursor::getMessage() { - ASSERT(false); - return StringRef(); + ASSERT(!withTags); + return messages[messageIndex].message; } StringRef ILogSystem::BufferedCursor::getMessageWithTags() { + ASSERT(withTags); return messages[messageIndex].message; } const std::vector& ILogSystem::BufferedCursor::getTags() { + ASSERT(withTags); return messages[messageIndex].tags; } @@ -971,7 +974,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe return Void(); } while(cursor->hasMessage()) { - self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), self->collectTags ? cursor->getMessage() : cursor->getMessageWithTags(), cursor->getTags(), cursor->version())); + self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? 
std::vector() : cursor->getTags(), cursor->version())); cursor->nextMessage(); if(cursor->version().version >= maxVersion) { return Void(); diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 3fc4665a15..7b68e4d646 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -986,7 +986,7 @@ ACTOR Future commitBatch( bool firstMessage = true; for(auto m : msg.messages) { if(firstMessage) { - toCommit.addTag(txsTag); + toCommit.addTxsTag(); } toCommit.addMessage(StringRef(m.begin(), m.size()), !firstMessage); firstMessage = false; @@ -1033,7 +1033,7 @@ ACTOR Future commitBatch( self->txsPopVersions.emplace_back(commitVersion, msg.popTo); } - self->logSystem->pop(msg.popTo, txsTag); + self->logSystem->popTxs(msg.popTo); /////// Phase 5: Replies (CPU bound; no particular order required, though ordered execution would be best for latency) if ( prevVersion && commitVersion - prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT/2 ) @@ -1505,7 +1505,7 @@ ACTOR Future monitorRemoteCommitted(ProxyCommitData* self) { while(self->txsPopVersions.size() && self->txsPopVersions.front().first <= minVersion) { self->lastTxsPop = self->txsPopVersions.front().second; - self->logSystem->pop(self->txsPopVersions.front().second, txsTag, 0, tagLocalityRemoteLog); + self->logSystem->popTxs(self->txsPopVersions.front().second, tagLocalityRemoteLog); self->txsPopVersions.pop_front(); } @@ -1563,7 +1563,7 @@ ACTOR Future masterProxyServerCore( r->value().emplace_back(0,0); commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor); - commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, txsTag, Reference>(), false); + commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, Reference>(), false); commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter, proxy.id(), 2e9, true, true, true); 
createWhitelistBinPathVec(whitelistBinPaths, commitData.whitelistedBinPathVec); @@ -1595,7 +1595,7 @@ ACTOR Future masterProxyServerCore( for(auto it : commitData.tag_popped) { commitData.logSystem->pop(it.second, it.first); } - commitData.logSystem->pop(commitData.lastTxsPop, txsTag, 0, tagLocalityRemoteLog); + commitData.logSystem->popTxs(commitData.lastTxsPop, tagLocalityRemoteLog); } Optional newLatencyBandConfig = commitData.db->get().latencyBandConfig; diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index fd2be1f08f..7be4599a7e 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -48,7 +48,7 @@ namespace oldTLog_4_6 { typedef int16_t OldTag; OldTag convertTag( Tag tag ) { - if(tag == invalidTag) return invalidTagOld; + if(tag == invalidTag || tag.locality == tagLocalityTxs) return invalidTagOld; if(tag == txsTag) return txsTagOld; ASSERT(tag.id >= 0); return tag.id; diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index fc9251ec78..6e3034821e 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -195,6 +195,7 @@ static const KeyRangeRef persistCurrentVersionKeys = KeyRangeRef( LiteralStringR static const KeyRangeRef persistKnownCommittedVersionKeys = KeyRangeRef( LiteralStringRef( "knownCommitted/" ), LiteralStringRef( "knownCommitted0" ) ); static const KeyRangeRef persistLocalityKeys = KeyRangeRef( LiteralStringRef( "Locality/" ), LiteralStringRef( "Locality0" ) ); static const KeyRangeRef persistLogRouterTagsKeys = KeyRangeRef( LiteralStringRef( "LogRouterTags/" ), LiteralStringRef( "LogRouterTags0" ) ); +static const KeyRangeRef persistTxsTagsKeys = KeyRangeRef( LiteralStringRef( "TxsTags/" ), LiteralStringRef( "TxsTags0" ) ); static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/")); static const KeyRange persistTagPoppedKeys = 
prefixRange(LiteralStringRef("TagPop/")); @@ -333,7 +334,7 @@ struct LogData : NonCopyable, public ReferenceCounted { auto const& m = self->versionMessages.front(); ++messagesErased; - if(self->tag != txsTag) { + if(self->tag.locality != tagLocalityTxs && self->tag != txsTag) { sizes.first -= m.second.expectedSize(); } else { sizes.second -= m.second.expectedSize(); @@ -433,9 +434,10 @@ struct LogData : NonCopyable, public ReferenceCounted { Future terminated; FlowLock execOpLock; bool execOpCommitInProgress; + int txsTags; - explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), - cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID), + explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, int txsTags, UID recruitmentID, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), + cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), txsTags(txsTags), recruitmentID(recruitmentID), logSystem(new AsyncVar>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), // These are initialized differently on init() or recovery recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0), @@ -482,6 +484,7 @@ struct LogData : NonCopyable, public ReferenceCounted { tLogData->persistentData->clear( 
singleKeyRange(logIdKey.withPrefix(persistKnownCommittedVersionKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLocalityKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLogRouterTagsKeys.begin)) ); + tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistTxsTagsKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)) ); Key msgKey = logIdKey.withPrefix(persistTagMessagesKeys.begin); tLogData->persistentData->clear( KeyRangeRef( msgKey, strinc(msgKey) ) ); @@ -814,7 +817,7 @@ void commitMessages( TLogData* self, Reference logData, Version version block.append(block.arena(), msg.message.begin(), msg.message.size()); for(auto tag : msg.tags) { if(logData->locality == tagLocalitySatellite) { - if(!(tag == txsTag || tag.locality == tagLocalityLogRouter)) { + if(!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) { continue; } } else if(!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) { @@ -827,6 +830,9 @@ void commitMessages( TLogData* self, Reference logData, Version version } tag.id = tag.id % logData->logRouterTags; } + if(tag.locality == tagLocalityTxs) { + tag.id = tag.id % logData->txsTags; + } Reference tagData = logData->getTagData(tag); if(!tagData) { tagData = logData->createTagData(tag, 0, true, true, false); @@ -837,7 +843,7 @@ void commitMessages( TLogData* self, Reference logData, Version version if(tagData->versionMessages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) { TraceEvent(SevWarnAlways, "LargeMessage").detail("Size", tagData->versionMessages.back().second.expectedSize()); } - if (tag != txsTag) { + if (tag.locality != tagLocalityTxs && tag != txsTag) { expectedBytes += tagData->versionMessages.back().second.expectedSize(); } else { txsBytes += 
tagData->versionMessages.back().second.expectedSize(); @@ -905,7 +911,7 @@ std::deque> & getVersionMessages( Re }; ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if (self->ignorePopRequest && inputTag != txsTag) { + if (self->ignorePopRequest && inputTag.locality != tagLocalityTxs && inputTag != txsTag) { TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); if (self->toBePopped.find(inputTag) == self->toBePopped.end() @@ -1062,7 +1068,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait( delay(0.0, TaskLowPriority) ); } - if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { + if( req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
@@ -1303,7 +1309,7 @@ void execProcessingHelper(TLogData* self, rd >> messageLength >> sub >> tagCount; for (int i = 0; i < tagCount; i++) { rd >> tmpTag; - if (tmpTag == txsTag) { + if (tmpTag.locality == tagLocalityTxs || tmpTag == txsTag) { hasTxsTag = true; } execTags->push_back(execTags->arena(), tmpTag); @@ -1632,6 +1638,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLocalityKeys.begin), BinaryWriter::toValue(logData->locality, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLogRouterTagsKeys.begin), BinaryWriter::toValue(logData->logRouterTags, Unversioned()) ) ); + storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistTxsTagsKeys.begin), BinaryWriter::toValue(logData->txsTags, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistRecoveryCountKeys.begin), BinaryWriter::toValue(logData->recoveryCount, Unversioned()) ) ); for(auto tag : logData->allTags) { @@ -2039,12 +2046,13 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality state Future>> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys); state Future>> fLocality = storage->readRange(persistLocalityKeys); state Future>> fLogRouterTags = storage->readRange(persistLogRouterTagsKeys); + state Future>> fTxsTags = storage->readRange(persistTxsTagsKeys); state Future>> fRecoverCounts = storage->readRange(persistRecoveryCountKeys); // FIXME: metadata in queue? 
wait( waitForAll( (vector>>(), fFormat ) ) ); - wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fRecoverCounts) ) ); + wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fTxsTags, fRecoverCounts) ) ); if (fFormat.get().present() && !persistFormatReadableRange.contains( fFormat.get().get() )) { //FIXME: remove when we no longer need to test upgrades from 4.X releases @@ -2096,6 +2104,11 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality id_logRouterTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistLogRouterTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); } + state std::map id_txsTags; + for(auto it : fTxsTags.get()) { + id_txsTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistTxsTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); + } + state std::map id_knownCommitted; for(auto it : fKnownCommitted.get()) { id_knownCommitted[ BinaryReader::fromStringRef(it.key.removePrefix(persistKnownCommittedVersionKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); @@ -2121,7 +2134,7 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality DUMPTOKEN( recruited.confirmRunning ); //We do not need the remoteTag, because we will not be loading any additional data - logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], UID(), std::vector()) ); + logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], id_txsTags[id1], UID(), std::vector()) ); logData->locality = id_locality[id1]; logData->stopped = true; self->id_data[id1] = logData; @@ -2304,7 +2317,7 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit it.second->stopCommit.trigger(); } - state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, 
req.recruitmentID, req.allTags) ); + state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.txsTags, req.recruitmentID, req.allTags) ); self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 52d0079ab7..e34052a0a4 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -205,6 +205,7 @@ static const KeyRangeRef persistKnownCommittedVersionKeys = KeyRangeRef( Literal static const KeyRef persistRecoveryLocationKey = KeyRef( LiteralStringRef( "recoveryLocation" ) ); static const KeyRangeRef persistLocalityKeys = KeyRangeRef( LiteralStringRef( "Locality/" ), LiteralStringRef( "Locality0" ) ); static const KeyRangeRef persistLogRouterTagsKeys = KeyRangeRef( LiteralStringRef( "LogRouterTags/" ), LiteralStringRef( "LogRouterTags0" ) ); +static const KeyRangeRef persistTxsTagsKeys = KeyRangeRef( LiteralStringRef( "TxsTags/" ), LiteralStringRef( "TxsTags0" ) ); static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/")); static const KeyRange persistTagMessageRefsKeys = prefixRange(LiteralStringRef("TagMsgRef/")); static const KeyRange persistTagPoppedKeys = prefixRange(LiteralStringRef("TagPop/")); @@ -389,7 +390,7 @@ struct LogData : NonCopyable, public ReferenceCounted { auto const& m = self->versionMessages.front(); ++messagesErased; - if(self->tag != txsTag) { + if(self->tag.locality != tagLocalityTxs && self->tag != txsTag) { sizes.first -= m.second.expectedSize(); } else { sizes.second -= m.second.expectedSize(); @@ -491,9 +492,10 @@ struct LogData : NonCopyable, public ReferenceCounted { Future terminated; FlowLock execOpLock; bool execOpCommitInProgress; + int txsTags; - explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, ProtocolVersion 
protocolVersion, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), - cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID), protocolVersion(protocolVersion), + explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, int txsTags, UID recruitmentID, ProtocolVersion protocolVersion, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), + cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), txsTags(txsTags), recruitmentID(recruitmentID), protocolVersion(protocolVersion), logSystem(new AsyncVar>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), queuePoppedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), // These are initialized differently on init() or recovery recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0), @@ -542,6 +544,7 @@ struct LogData : NonCopyable, public ReferenceCounted { tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistKnownCommittedVersionKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLocalityKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLogRouterTagsKeys.begin)) ); + tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistTxsTagsKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)) ); tLogData->persistentData->clear( 
singleKeyRange(logIdKey.withPrefix(persistProtocolVersionKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryLocationKey)) ); @@ -637,7 +640,7 @@ void updatePersistentPopped( TLogData* self, Reference logData, Referen if (data->nothingPersistent) return; - if (data->tag == txsTag) { + if (data->tag.locality == tagLocalityTxs || data->tag == txsTag) { self->persistentData->clear( KeyRangeRef( persistTagMessagesKey( logData->logId, data->tag, Version(0) ), persistTagMessagesKey( logData->logId, data->tag, data->popped ) ) ); @@ -654,7 +657,7 @@ void updatePersistentPopped( TLogData* self, Reference logData, Referen ACTOR Future updatePoppedLocation( TLogData* self, Reference logData, Reference data ) { // txsTag is spilled by value, so we do not need to track its popped location. - if (data->tag == txsTag) { + if (data->tag.locality == tagLocalityTxs || data->tag == txsTag) { return Void(); } @@ -724,7 +727,7 @@ ACTOR Future popDiskQueue( TLogData* self, Reference logData ) { for(int tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(int tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { Reference tagData = logData->tag_data[tagLocality][tagId]; - if (tagData && tagData->tag != txsTag && !tagData->nothingPersistent) { + if (tagData && tagData->tag.locality != tagLocalityTxs && tagData->tag != txsTag && !tagData->nothingPersistent) { minLocation = std::min(minLocation, tagData->poppedLocation); minVersion = std::min(minVersion, tagData->popped); } @@ -783,7 +786,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD anyData = true; tagData->nothingPersistent = false; - if (tagData->tag == txsTag) { + if (tagData->tag.locality == tagLocalityTxs || tagData->tag == txsTag) { // spill txsTag by value wr = BinaryWriter( Unversioned() ); for(; msg != tagData->versionMessages.end() && msg->first == currentVersion; ++msg) { @@ -889,7 +892,7 @@ ACTOR Future 
updatePersistentData( TLogData* self, Reference logD for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { Reference tagData = logData->tag_data[tagLocality][tagId]; if (tagData) { - if (tagData->tag == txsTag) { + if (tagData->tag.locality == tagLocalityTxs || tagData->tag == txsTag) { minVersion = std::min(minVersion, newPersistentDataVersion); } else { minVersion = std::min(minVersion, tagData->popped); @@ -1064,7 +1067,7 @@ void commitMessages( TLogData* self, Reference logData, Version version block.append(block.arena(), msg.message.begin(), msg.message.size()); for(auto tag : msg.tags) { if(logData->locality == tagLocalitySatellite) { - if(!(tag == txsTag || tag.locality == tagLocalityLogRouter)) { + if(!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) { continue; } } else if(!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) { @@ -1077,6 +1080,9 @@ void commitMessages( TLogData* self, Reference logData, Version version } tag.id = tag.id % logData->logRouterTags; } + if(tag.locality == tagLocalityTxs) { + tag.id = tag.id % logData->txsTags; + } Reference tagData = logData->getTagData(tag); if(!tagData) { tagData = logData->createTagData(tag, 0, true, true, false); @@ -1087,7 +1093,7 @@ void commitMessages( TLogData* self, Reference logData, Version version if(tagData->versionMessages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) { TraceEvent(SevWarnAlways, "LargeMessage").detail("Size", tagData->versionMessages.back().second.expectedSize()); } - if (tag != txsTag) { + if (tag.locality != tagLocalityTxs && tag != txsTag) { expectedBytes += tagData->versionMessages.back().second.expectedSize(); } else { txsBytes += tagData->versionMessages.back().second.expectedSize(); @@ -1155,7 +1161,7 @@ std::deque> & getVersionMessages( Re }; ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if 
(self->ignorePopRequest && inputTag != txsTag) { + if (self->ignorePopRequest && inputTag.locality != tagLocalityTxs && inputTag != txsTag) { TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); if (self->toBePopped.find(inputTag) == self->toBePopped.end() @@ -1296,7 +1302,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere state BinaryWriter messages2(Unversioned()); state int sequence = -1; state UID peekId; - + if(req.sequence.present()) { try { peekId = req.sequence.get().first; @@ -1349,7 +1355,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait( delay(0.0, TaskLowPriority) ); } - if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { + if( req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
@@ -1402,7 +1408,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere peekMessagesFromMemory( logData, req, messages2, endVersion ); - if (req.tag == txsTag) { + if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) { Standalone> kvs = wait( self->persistentData->readRange(KeyRangeRef( persistTagMessagesKey(logData->logId, req.tag, req.begin), @@ -1670,7 +1676,7 @@ void execProcessingHelper(TLogData* self, rd >> messageLength >> sub >> tagCount; for (int i = 0; i < tagCount; i++) { rd >> tmpTag; - if (tmpTag == txsTag) { + if (tmpTag.locality == tagLocalityTxs || tmpTag == txsTag) { hasTxsTag = true; } execTags->push_back(execTags->arena(), tmpTag); @@ -2001,6 +2007,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLocalityKeys.begin), BinaryWriter::toValue(logData->locality, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLogRouterTagsKeys.begin), BinaryWriter::toValue(logData->logRouterTags, Unversioned()) ) ); + storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistTxsTagsKeys.begin), BinaryWriter::toValue(logData->txsTags, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistRecoveryCountKeys.begin), BinaryWriter::toValue(logData->recoveryCount, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistProtocolVersionKeys.begin), BinaryWriter::toValue(logData->protocolVersion, Unversioned()) ) ); @@ -2417,13 +2424,14 @@ ACTOR Future restorePersistentState( TLogData* self, 
LocalityData locality state Future>> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys); state Future>> fLocality = storage->readRange(persistLocalityKeys); state Future>> fLogRouterTags = storage->readRange(persistLogRouterTagsKeys); + state Future>> fTxsTags = storage->readRange(persistTxsTagsKeys); state Future>> fRecoverCounts = storage->readRange(persistRecoveryCountKeys); state Future>> fProtocolVersions = storage->readRange(persistProtocolVersionKeys); // FIXME: metadata in queue? wait( waitForAll( (vector>>(), fFormat, fRecoveryLocation ) ) ); - wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fRecoverCounts, fProtocolVersions ) ) ); + wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fTxsTags, fRecoverCounts, fProtocolVersions ) ) ); if (fFormat.get().present() && !persistFormatReadableRange.contains( fFormat.get().get() )) { //FIXME: remove when we no longer need to test upgrades from 4.X releases @@ -2465,6 +2473,11 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality id_logRouterTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistLogRouterTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); } + state std::map id_txsTags; + for(auto it : fTxsTags.get()) { + id_txsTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistTxsTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); + } + state std::map id_knownCommitted; for(auto it : fKnownCommitted.get()) { id_knownCommitted[ BinaryReader::fromStringRef(it.key.removePrefix(persistKnownCommittedVersionKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); @@ -2498,7 +2511,7 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality ProtocolVersion protocolVersion = BinaryReader::fromStringRef( fProtocolVersions.get()[idx].value, Unversioned() ); //We do not need 
the remoteTag, because we will not be loading any additional data - logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], UID(), protocolVersion, std::vector()) ); + logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], id_txsTags[id1], UID(), protocolVersion, std::vector()) ); logData->locality = id_locality[id1]; logData->stopped = true; self->id_data[id1] = logData; @@ -2700,7 +2713,7 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit it.second->stopCommit.trigger(); } - state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.recruitmentID, currentProtocolVersion, req.allTags) ); + state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.txsTags, req.recruitmentID, currentProtocolVersion, req.allTags) ); self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..eafb2e9554 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -454,7 +454,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -481,7 +481,7 @@ struct 
TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -624,7 +624,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? end.get() + 1 : getPeekEnd(), tLogs[0]->locality == tagLocalityUpgraded) ); + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? end.get() + 1 : getPeekEnd(), true, tLogs[0]->locality == tagLocalityUpgraded) ); } Reference peekLocal( UID dbgid, Tag tag, Version begin, Version end, bool useMergePeekCursors, int8_t peekLocality = tagLocalityInvalid ) { @@ -682,7 +682,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted peekSpecial( UID dbgid, Version begin, Tag tag, int8_t peekLocality, Version localEnd ) { + virtual Reference peekTxs( UID dbgid, Version begin, int8_t peekLocality, Version localEnd ) { Version end = getEnd(); - TraceEvent("TLogPeekSpecial", dbgid).detail("Begin", begin).detail("End", end).detail("LocalEnd", localEnd).detail("PeekLocality", peekLocality); + if(!tLogs.size()) { + TraceEvent("TLogPeekTxsNoLogs", dbgid); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), txsTag, begin, end, false, false ) ); + } + TraceEvent("TLogPeekTxs", dbgid).detail("Begin", begin).detail("End", end).detail("LocalEnd", localEnd).detail("PeekLocality", peekLocality); + if(peekLocality < 0 || localEnd == invalidVersion || localEnd 
<= begin) { - return peekAll(dbgid, begin, end, tag, true); + std::vector< Reference > cursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); + + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } try { if(localEnd >= end) { - return peekLocal(dbgid, tag, begin, end, true, peekLocality); + std::vector< Reference > cursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + cursors.push_back(peekLocal(dbgid, txsTag, begin, end, true, peekLocality)); + + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } std::vector< Reference > cursors; std::vector< LogMessageVersion > epochEnds; cursors.resize(2); - cursors[1] = peekLocal(dbgid, tag, begin, localEnd, true, peekLocality); - cursors[0] = peekAll(dbgid, localEnd, end, tag, true); + + std::vector< Reference > localCursors; + std::vector< Reference > allCursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); + allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + localCursors.push_back(peekLocal(dbgid, txsTag, begin, localEnd, true, peekLocality)); + allCursors.push_back(peekAll(dbgid, localEnd, end, txsTag, true)); + + cursors[1] = Reference( new ILogSystem::BufferedCursor(localCursors, begin, localEnd, false) ); + cursors[0] = Reference( new ILogSystem::BufferedCursor(allCursors, localEnd, end, false) ); epochEnds.emplace_back(localEnd); - return 
Reference( new ILogSystem::MultiCursor(cursors, epochEnds) ); + return Reference( new ILogSystem::MultiCursor(cursors, epochEnds, false) ); } catch( Error& e ) { if(e.code() == error_code_worker_removed) { - return peekAll(dbgid, begin, end, tag, true); + std::vector< Reference > cursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); + + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } throw; } @@ -909,6 +946,16 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); i++) { + pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); + } + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + pop(upTo, txsTag, 0, popLocality); + } + virtual void pop( Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality ) { if (upTo <= 0) return; if( tag.locality == tagLocalityRemoteLog) { @@ -1126,6 +1173,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrandomInt(0, logRouterTags)); } + virtual Tag getRandomTxsTag() { + ASSERT(tLogs.size()); + return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, tLogs[0]->logServers.size())); + } + ACTOR static Future monitorLog(Reference>> logServer, Reference> failed) { state Future waitFailure; loop { @@ -1730,6 +1782,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedstartVersion; req.logRouterTags = 0; + req.txsTags = self->tLogs[0]->logServers.size(); } logSet->tLogLocalities.resize( remoteWorkers.remoteTLogs.size() ); @@ -1823,7 +1876,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->logServers.resize( recr.satelliteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size 
logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags); + logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),oldLogSystem->tLogs.size() ? oldLogSystem->tLogs[0]->logServers.size() : 0); logSystem->expectedLogSets++; } @@ -1903,6 +1956,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->startVersion; req.logRouterTags = logSystem->logRouterTags; + req.txsTags = recr.tLogs.size(); } logSystem->tLogs[0]->tLogLocalities.resize( recr.tLogs.size() ); @@ -1927,7 +1981,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> recoveryComplete; if(region.satelliteTLogReplicationFactor > 0) { - std::vector satelliteTags(1, txsTag); + std::vector satelliteTags; + for(int i = 0; i < recr.tLogs.size(); i++) { + satelliteTags.push_back(Tag(tagLocalityTxs, i)); + } + satelliteTags.push_back(txsTag); state vector> satelliteInitializationReplies; vector< InitializeTLogRequest > sreqs( recr.satelliteTLogs.size() ); @@ -1947,6 +2005,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedknownCommittedVersion + 1; req.logRouterTags = logSystem->logRouterTags; + req.txsTags = recr.tLogs.size(); } for(int i = -1; i < oldLogSystem->logRouterTags; i++) { @@ -1957,6 +2016,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } + for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) satelliteInitializationReplies.push_back( transformErrors( throwErrorOr( recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); diff --git a/fdbserver/WorkerInterface.actor.h 
b/fdbserver/WorkerInterface.actor.h index 8370e7fdde..d739f8770c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -108,6 +108,7 @@ struct InitializeTLogRequest { bool isPrimary; Version startVersion; int logRouterTags; + int txsTags; ReplyPromise< struct TLogInterface > reply; @@ -115,7 +116,7 @@ struct InitializeTLogRequest { template void serialize( Ar& ar ) { - serializer(ar, recruitmentID, recoverFrom, recoverAt, knownCommittedVersion, epoch, recoverTags, allTags, storeType, remoteTag, locality, isPrimary, startVersion, logRouterTags, reply, logVersion, spillType); + serializer(ar, recruitmentID, recoverFrom, recoverAt, knownCommittedVersion, epoch, recoverTags, allTags, storeType, remoteTag, locality, isPrimary, startVersion, logRouterTags, reply, logVersion, spillType, txsTags); } }; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 205b1dbc19..55de67106a 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -605,21 +605,21 @@ ACTOR Future>> recruitEverything( Refere return confChanges; } -ACTOR Future updateLocalityForDcId(Optional dcId, Reference oldLogSystem, Reference> locality) { +ACTOR Future updateLocalityForDcId(Optional dcId, Reference oldLogSystem, Reference> locality) { loop { std::pair loc = oldLogSystem->getLogSystemConfig().getLocalityForDcId(dcId); Version ver = locality->get().knownCommittedVersion; if(ver == invalidVersion) { ver = oldLogSystem->getKnownCommittedVersion(); } - locality->set( PeekSpecialInfo(loc.first,loc.second,ver) ); + locality->set( PeekTxsInfo(loc.first,loc.second,ver) ); TraceEvent("UpdatedLocalityForDcId").detail("DcId", dcId).detail("Locality0", loc.first).detail("Locality1", loc.second).detail("Version", ver); wait( oldLogSystem->onLogSystemConfigChange() || oldLogSystem->onKnownCommittedVersionChange() ); } } ACTOR Future readTransactionSystemState( Reference self, Reference oldLogSystem ) { - state 
Reference> myLocality = Reference>( new AsyncVar(PeekSpecialInfo(tagLocalityInvalid,tagLocalityInvalid,invalidVersion) ) ); + state Reference> myLocality = Reference>( new AsyncVar(PeekTxsInfo(tagLocalityInvalid,tagLocalityInvalid,invalidVersion) ) ); state Future localityUpdater = updateLocalityForDcId(self->myInterface.locality.dcId(), oldLogSystem, myLocality); // Peek the txnStateTag in oldLogSystem and recover self->txnStateStore @@ -630,7 +630,7 @@ ACTOR Future readTransactionSystemState( Reference self, Refer // Recover transaction state store if(self->txnStateStore) self->txnStateStore->close(); - self->txnStateLogAdapter = openDiskQueueAdapter( oldLogSystem, txsTag, myLocality ); + self->txnStateLogAdapter = openDiskQueueAdapter( oldLogSystem, myLocality ); self->txnStateStore = keyValueStoreLogSystem( self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true ); // Versionstamped operations (particularly those applied from DR) define a minimum commit version @@ -676,6 +676,9 @@ ACTOR Future readTransactionSystemState( Reference self, Refer Standalone> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) ); self->allTags.clear(); if(self->lastEpochEnd > 0) { + for(int i = 0; i < oldLogSystem->getLogSystemConfig().tLogs[0].tLogs.size(); i++) { + self->allTags.push_back(Tag(tagLocalityTxs, i)); + } self->allTags.push_back(txsTag); } From c92324b8948cfef41a94acd59c744826e5959f1c Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Thu, 20 Jun 2019 08:54:12 -0700 Subject: [PATCH 004/136] python sample docker app uses default coordinator port --- packaging/docker/create_cluster_file.bash | 5 +++-- packaging/docker/samples/python/docker-compose.yml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packaging/docker/create_cluster_file.bash b/packaging/docker/create_cluster_file.bash index 863ca43ac8..c1bb959b8e 100644 --- a/packaging/docker/create_cluster_file.bash +++ b/packaging/docker/create_cluster_file.bash @@ 
-39,7 +39,8 @@ function create_cluster_file() { echo "Failed to look up coordinator address for $FDB_COORDINATOR" 1>&2 exit 1 fi - echo "docker:docker@$coordinator_ip:$FDB_COORDINATOR_PORT" > $FDB_CLUSTER_FILE + coordinator_port=${FDB_COORDINATOR_PORT:-4500} + echo "docker:docker@$coordinator_ip:$coordinator_port" > $FDB_CLUSTER_FILE else echo "FDB_COORDINATOR environment variable not defined" 1>&2 exit 1 @@ -47,5 +48,5 @@ function create_cluster_file() { } if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - create_cluster_file "$@" + create_cluster_file "$@" fi diff --git a/packaging/docker/samples/python/docker-compose.yml b/packaging/docker/samples/python/docker-compose.yml index 34c62914a1..e239bff80f 100644 --- a/packaging/docker/samples/python/docker-compose.yml +++ b/packaging/docker/samples/python/docker-compose.yml @@ -46,7 +46,6 @@ services: build: context: app ports: - - 5000:5000 + - 5000:5000/tcp environment: FDB_COORDINATOR: fdb-coordinator - FDB_COORDINATOR_PORT: 4550 From 76ba4e60b70ee9f7993869a311fdc0c1c591dd65 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 24 Jun 2019 13:03:35 -0700 Subject: [PATCH 005/136] fixed a stack overflow bug --- fdbserver/LogSystemDiskQueueAdapter.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp index b145b8db84..af2323923a 100644 --- a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp +++ b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp @@ -66,6 +66,7 @@ public: } if(!self->cursor->hasMessage()) { self->recoveryLoc = self->cursor->version().version; + wait(delay(0)); continue; } } From 7a500cd37f10c135321e27eac5b541a4e504efd1 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 02:47:35 -0700 Subject: [PATCH 006/136] A giant translation of TaskFooPriority -> TaskPriority::Foo This is so that APIs that take priorities don't take ints, which are common and easy to accidentally pass the wrong thing. 
--- bindings/flow/fdb_flow.actor.cpp | 4 +- fdbclient/BackupAgentBase.actor.cpp | 10 +- fdbclient/ClusterInterface.h | 12 +- fdbclient/DatabaseContext.h | 6 +- fdbclient/FailureMonitorClient.actor.cpp | 14 +-- fdbclient/HTTP.actor.cpp | 4 +- fdbclient/ManagementAPI.actor.cpp | 4 +- fdbclient/MasterProxyInterface.h | 8 +- fdbclient/MonitorLeader.actor.cpp | 4 +- fdbclient/NativeAPI.actor.cpp | 48 ++++---- fdbclient/NativeAPI.actor.h | 6 +- fdbclient/StatusClient.actor.cpp | 2 +- fdbclient/StorageServerInterface.h | 6 +- fdbclient/VersionedMap.actor.h | 2 +- fdbclient/VersionedMap.h | 2 +- fdbrpc/AsyncFileEIO.actor.h | 16 +-- fdbrpc/AsyncFileKAIO.actor.h | 6 +- fdbrpc/AsyncFileNonDurable.actor.cpp | 4 +- fdbrpc/AsyncFileNonDurable.actor.h | 22 ++-- fdbrpc/FlowTests.actor.cpp | 12 +- fdbrpc/FlowTransport.actor.cpp | 42 +++---- fdbrpc/FlowTransport.h | 4 +- fdbrpc/LoadBalance.actor.h | 2 +- fdbrpc/batcher.actor.h | 2 +- fdbrpc/fdbrpc.h | 28 ++--- fdbrpc/genericactors.actor.h | 2 +- fdbrpc/sim2.actor.cpp | 48 ++++---- fdbrpc/simulator.h | 4 +- fdbserver/ClusterController.actor.cpp | 4 +- fdbserver/ClusterRecruitmentInterface.h | 14 +-- fdbserver/Coordination.actor.cpp | 10 +- fdbserver/CoroFlow.actor.cpp | 2 +- fdbserver/DataDistribution.actor.cpp | 40 +++---- fdbserver/DataDistributionQueue.actor.cpp | 22 ++-- fdbserver/DataDistributionTracker.actor.cpp | 16 +-- fdbserver/KeyValueStoreSQLite.actor.cpp | 8 +- fdbserver/LeaderElection.actor.cpp | 8 +- fdbserver/LogRouter.actor.cpp | 16 +-- fdbserver/LogSystem.h | 12 +- fdbserver/LogSystemPeekCursor.actor.cpp | 22 ++-- fdbserver/MasterInterface.h | 2 +- fdbserver/MasterProxyServer.actor.cpp | 22 ++-- fdbserver/MoveKeys.actor.cpp | 24 ++-- fdbserver/OldTLogServer_4_6.actor.cpp | 44 ++++---- fdbserver/OldTLogServer_6_0.actor.cpp | 58 +++++----- fdbserver/Orderer.actor.h | 4 +- fdbserver/Ratekeeper.actor.cpp | 4 +- fdbserver/Resolver.actor.cpp | 6 +- fdbserver/ResolverInterface.h | 4 +- fdbserver/RestoreInterface.h | 2 +- 
fdbserver/SimulatedCluster.actor.cpp | 4 +- fdbserver/Status.actor.cpp | 2 +- fdbserver/TLogInterface.h | 10 +- fdbserver/TLogServer.actor.cpp | 62 +++++------ fdbserver/TagPartitionedLogSystem.actor.cpp | 4 +- fdbserver/VFSAsync.cpp | 2 +- fdbserver/WaitFailure.actor.cpp | 6 +- fdbserver/WaitFailure.h | 8 +- fdbserver/WorkerInterface.actor.h | 2 +- fdbserver/fdbserver.actor.cpp | 2 +- fdbserver/masterserver.actor.cpp | 10 +- fdbserver/networktest.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 34 +++--- fdbserver/worker.actor.cpp | 6 +- flow/IThreadPool.h | 6 +- flow/Net2.actor.cpp | 68 ++++++------ flow/Profiler.actor.cpp | 2 +- flow/ThreadHelper.actor.h | 6 +- flow/Trace.cpp | 2 +- flow/flow.h | 14 +-- flow/genericactors.actor.h | 18 +-- flow/network.h | 115 +++++++++++--------- 72 files changed, 531 insertions(+), 522 deletions(-) diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 96512a0ce4..99af1a665e 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -85,7 +85,7 @@ void fdb_flow_test() { openTraceFile(NetworkAddress(), 1000000, 1000000, "."); systemMonitor(); - uncancellable(recurring(&systemMonitor, 5.0, TaskFlushTrace)); + uncancellable(recurring(&systemMonitor, 5.0, TaskPriority::FlushTrace)); Future t = _test(); @@ -179,7 +179,7 @@ namespace FDB { } void backToFutureCallback( FDBFuture* f, void* data ) { - g_network->onMainThread( Promise((SAV*)data), TaskDefaultOnMainThread ); // SOMEDAY: think about this priority + g_network->onMainThread( Promise((SAV*)data), TaskPriority::DefaultOnMainThread ); // SOMEDAY: think about this priority } // backToFuture( FDBFuture*, (FDBFuture* -> Type) ) -> Future diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 1de08c64f8..25bc58c71d 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -419,7 +419,7 @@ ACTOR Future readCommitted(Database cx, 
PromiseStreamtake(TaskDefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT)); + wait(lock->take(TaskPriority::DefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT)); releaser = FlowLock::Releaser(*lock, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT); state Standalone values = wait(tr.getRange(begin, end, limits)); @@ -495,7 +495,7 @@ ACTOR Future readCommitted(Database cx, PromiseStream results, Fu //add lock wait(active); releaser.release(); - wait(lock->take(TaskDefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize())); + wait(lock->take(TaskPriority::DefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize())); releaser = FlowLock::Releaser(*lock, rangevalue.expectedSize() + rcGroup.items.expectedSize()); for (auto & s : rangevalue){ @@ -613,7 +613,7 @@ ACTOR Future dumpData(Database cx, PromiseStream results, Referenc req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; totalBytes += mutationSize; - wait( commitLock->take(TaskDefaultYield, mutationSize) ); + wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) ); addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) ); if(endOfStream) { @@ -653,7 +653,7 @@ ACTOR Future coalesceKeyVersionCache(Key uid, Version endVersion, Referenc req.transaction.read_snapshot = committedVersion->get(); req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; - wait( commitLock->take(TaskDefaultYield, mutationSize) ); + wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) ); addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) ); } @@ -671,7 +671,7 @@ ACTOR Future applyMutations(Database cx, Key uid, Key addPrefix, Key remov try { loop { if(beginVersion >= *endVersion) { - wait( commitLock.take(TaskDefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) ); + wait( 
commitLock.take(TaskPriority::DefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) ); commitLock.release(CLIENT_KNOBS->BACKUP_LOCK_BYTES); if(beginVersion >= *endVersion) { return Void(); diff --git a/fdbclient/ClusterInterface.h b/fdbclient/ClusterInterface.h index bb51ce74f2..5e17807c4d 100644 --- a/fdbclient/ClusterInterface.h +++ b/fdbclient/ClusterInterface.h @@ -52,12 +52,12 @@ struct ClusterInterface { } void initEndpoints() { - openDatabase.getEndpoint( TaskClusterController ); - failureMonitoring.getEndpoint( TaskFailureMonitor ); - databaseStatus.getEndpoint( TaskClusterController ); - ping.getEndpoint( TaskClusterController ); - getClientWorkers.getEndpoint( TaskClusterController ); - forceRecovery.getEndpoint( TaskClusterController ); + openDatabase.getEndpoint( TaskPriority::ClusterController ); + failureMonitoring.getEndpoint( TaskPriority::FailureMonitor ); + databaseStatus.getEndpoint( TaskPriority::ClusterController ); + ping.getEndpoint( TaskPriority::ClusterController ); + getClientWorkers.getEndpoint( TaskPriority::ClusterController ); + forceRecovery.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 0245c2abdb..606952fb9c 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -54,7 +54,7 @@ public: // For internal (fdbserver) use only static Database create( Reference>> clusterInterface, Reference connFile, LocalityData const& clientLocality ); - static Database create( Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID=TaskDefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST ); + static Database create( Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID=TaskPriority::DefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST ); 
~DatabaseContext(); @@ -97,7 +97,7 @@ public: //private: explicit DatabaseContext( Reference cluster, Reference> clientDBInfo, - Future clientInfoMonitor, Standalone dbId, int taskID, LocalityData const& clientLocality, + Future clientInfoMonitor, Standalone dbId, TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion = Database::API_VERSION_LATEST ); explicit DatabaseContext( const Error &err ); @@ -161,7 +161,7 @@ public: Future logger; - int taskID; + TaskPriority taskID; Int64MetricHandle getValueSubmitted; EventMetricHandle getValueCompleted; diff --git a/fdbclient/FailureMonitorClient.actor.cpp b/fdbclient/FailureMonitorClient.actor.cpp index 3be7a4dccd..7cb1a3144e 100644 --- a/fdbclient/FailureMonitorClient.actor.cpp +++ b/fdbclient/FailureMonitorClient.actor.cpp @@ -41,7 +41,7 @@ ACTOR Future failureMonitorClientLoop( { state Version version = 0; state Future request = Never(); - state Future nextRequest = delay(0, TaskFailureMonitor); + state Future nextRequest = delay(0, TaskPriority::FailureMonitor); state Future requestTimeout = Never(); state double before = now(); state double waitfor = 0; @@ -61,7 +61,7 @@ ACTOR Future failureMonitorClientLoop( loop { choose { when( FailureMonitoringReply reply = wait( request ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); request = Never(); requestTimeout = Never(); if (reply.allOthersFailed) { @@ -122,10 +122,10 @@ ACTOR Future failureMonitorClientLoop( } before = now(); waitfor = reply.clientRequestIntervalMS * .001; - nextRequest = delayJittered( waitfor, TaskFailureMonitor ); + nextRequest = delayJittered( waitfor, TaskPriority::FailureMonitor ); } when( wait( requestTimeout ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); requestTimeout = Never(); TraceEvent(SevWarn, 
"FailureMonitoringServerDown").detail("OldServerID",controller.id()); monitor->setStatus(controlAddr.address, FailureStatus(true)); @@ -136,7 +136,7 @@ ACTOR Future failureMonitorClientLoop( } } when( wait( nextRequest ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); nextRequest = Never(); double elapsed = now() - before; @@ -152,9 +152,9 @@ ACTOR Future failureMonitorClientLoop( req.addresses = g_network->getLocalAddresses(); if (trackMyStatus) req.senderStatus = FailureStatus(false); - request = controller.failureMonitoring.getReply( req, TaskFailureMonitor ); + request = controller.failureMonitoring.getReply( req, TaskPriority::FailureMonitor ); if(!controller.failureMonitoring.getEndpoint().isLocal()) - requestTimeout = delay( fmState->serverFailedTimeout, TaskFailureMonitor ); + requestTimeout = delay( fmState->serverFailedTimeout, TaskPriority::FailureMonitor ); } } } diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index 00cece10a1..5893588406 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -93,7 +93,7 @@ namespace HTTP { loop { // Wait for connection to have something to read wait(conn->onReadable()); - wait( delay( 0, TaskReadSocket ) ); + wait( delay( 0, TaskPriority::ReadSocket ) ); // Read into buffer int originalSize = buf->size(); @@ -353,7 +353,7 @@ namespace HTTP { loop { wait(conn->onWritable()); - wait( delay( 0, TaskWriteSocket ) ); + wait( delay( 0, TaskPriority::WriteSocket ) ); // If we already got a response, before finishing sending the request, then close the connection, // set the Connection header to "close" as a hint to the caller that this connection can't be used diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index a371ac2624..afc64d62c2 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -967,7 +967,7 @@ ACTOR Future changeQuorum( Database cx, Reference>> 
leaderServers; ClientCoordinators coord( Reference( new ClusterConnectionFile( conn ) ) ); for( int i = 0; i < coord.clientLeaderServers.size(); i++ ) - leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) ); + leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) ); choose { when( wait( waitForAll( leaderServers ) ) ) {} @@ -1047,7 +1047,7 @@ struct AutoQuorumChange : IQuorumChange { ClientCoordinators coord(ccf); vector>> leaderServers; for( int i = 0; i < coord.clientLeaderServers.size(); i++ ) - leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) ); + leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) ); Optional>> results = wait( timeout( getAll(leaderServers), CLIENT_KNOBS->IS_ACCEPTABLE_DELAY ) ); if (!results.present()) return false; // Not all responded for(auto& r : results.get()) diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index 9b65ec572c..dea0d8b797 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -67,10 +67,10 @@ struct MasterProxyInterface { } void initEndpoints() { - getConsistentReadVersion.getEndpoint(TaskProxyGetConsistentReadVersion); - getRawCommittedVersion.getEndpoint(TaskProxyGetRawCommittedVersion); - commit.getEndpoint(TaskProxyCommitDispatcher); - getStorageServerRejoinInfo.getEndpoint(TaskProxyStorageRejoin); + getConsistentReadVersion.getEndpoint(TaskPriority::ProxyGetConsistentReadVersion); + getRawCommittedVersion.getEndpoint(TaskPriority::ProxyGetRawCommittedVersion); + commit.getEndpoint(TaskPriority::ProxyCommitDispatcher); + 
getStorageServerRejoinInfo.getEndpoint(TaskPriority::ProxyStorageRejoin); //getKeyServersLocations.getEndpoint(TaskProxyGetKeyServersLocations); //do not increase the priority of these requests, because clients cans bring down the cluster with too many of these messages. } }; diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 6210eb8810..b066b03b13 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -371,7 +371,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( NetworkAddress remote ) } ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) { - getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskCoordination ); + getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination ); } // Nominee is the worker among all workers that are considered as leader by a coordinator @@ -380,7 +380,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) { ACTOR Future monitorNominee( Key key, ClientLeaderRegInterface coord, AsyncTrigger* nomineeChange, Optional *info, int generation, Reference> connectedCoordinatorsNum ) { state bool hasCounted = false; loop { - state Optional li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskCoordinationReply ) ); + state Optional li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? 
info->get().changeID : UID() ), TaskPriority::CoordinationReply ) ); if (li.present() && !hasCounted && connectedCoordinatorsNum.isValid()) { connectedCoordinatorsNum->set(connectedCoordinatorsNum->get() + 1); hasCounted = true; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6fbf778997..38b373c954 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -509,7 +509,7 @@ Future DatabaseContext::getHealthMetrics(bool detailed = false) { DatabaseContext::DatabaseContext( Reference cluster, Reference> clientInfo, Future clientInfoMonitor, Standalone dbId, - int taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) + TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), dbId(dbId), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), lockAware(lockAware), apiVersion(apiVersion), provisional(false), transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), @@ -629,10 +629,10 @@ Database DatabaseContext::create(Reference>> Reference> clientInfo(new AsyncVar()); Future clientInfoMonitor = delayedAsyncVar(connectedCoordinatorsNum, connectedCoordinatorsNumDelayed, CLIENT_KNOBS->CHECK_CONNECTED_COORDINATOR_NUM_DELAY) || monitorClientInfo(clusterInterface, connFile, clientInfo, connectedCoordinatorsNumDelayed); - return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false)); + return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false)); } -Database DatabaseContext::create(Reference> 
clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID, bool lockAware, int apiVersion) { +Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID, bool lockAware, int apiVersion) { return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, LiteralStringRef(""), taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion ) ); } @@ -820,10 +820,10 @@ Database Database::createDatabase( Reference connFile, in DatabaseContext *db; if(preallocatedDb) { - db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion); + db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); } else { - db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion); + db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); } return Database(db); @@ -879,7 +879,7 @@ void Cluster::init( Reference connFile, bool startClientI initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP))); systemMonitor(); - uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskFlushTrace ) ); + uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace ) ); } failMon = failureMonitorClient( clusterInterface, false ); @@ -1235,7 +1235,7 @@ ACTOR Future< pair> > getKeyLocation_internal( loop { choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} - when ( GetKeyServerLocationsReply rep = wait( 
loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskDefaultPromiseEndpoint ) ) ) { + when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT( rep.results.size() == 1 ); @@ -1272,7 +1272,7 @@ ACTOR Future< vector< pair> > > getKeyRangeLoca loop { choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} - when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskDefaultPromiseEndpoint ) ) ) { + when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { state GetKeyServerLocationsReply rep = _rep; if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.After"); @@ -1393,7 +1393,7 @@ ACTOR Future> getValue( Future version, Key key, Databa } state GetValueReply reply = wait( loadBalance(ssi.second, &StorageServerInterface::getValue, GetValueRequest(key, ver, getValueID), - TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL)); + TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? 
&cx->queueModel : NULL)); double latency = now() - startTimeD; cx->readLatencies.addSample(latency); if (trLogInfo) { @@ -1456,7 +1456,7 @@ ACTOR Future getKey( Database cx, KeySelector k, Future version, T if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++cx->transactionPhysicalReads; - GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; @@ -1519,7 +1519,7 @@ ACTOR Future< Void > watchValue( Future version, Key key, OptionalgetCurrentTask()); } - state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskDefaultPromiseEndpoint ) ); + state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskPriority::DefaultPromiseEndpoint ) ); if( info.debugID.present() ) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); //.detail("TaskID", g_network->getCurrentTask()); } @@ -1611,7 +1611,7 @@ ACTOR Future> getExactRange( Database cx, Version ver .detail("Servers", locations[shard].second->description());*/ } ++cx->transactionPhysicalReads; - GetKeyValuesReply rep = wait( loadBalance( 
locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyValuesReply rep = wait( loadBalance( locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After"); output.arena().dependsOn( rep.arena ); @@ -1888,7 +1888,7 @@ ACTOR Future> getRange( Database cx, ReferenceenableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyValuesReply rep = wait( loadBalance(beginServer.second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.After");//.detail("SizeOf", rep.data.size()); @@ -2698,7 +2698,7 @@ ACTOR static Future tryCommit( Database cx, Reference const std::vector& proxies = cx->clientInfo->get().proxies; reply = proxies.size() ? 
throwErrorOr ( brokenPromiseToMaybeDelivered ( proxies[0].commit.tryGetReply(req) ) ) : Never(); } else { - reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskDefaultPromiseEndpoint, true ); + reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, true ); } choose { @@ -3073,7 +3073,7 @@ ACTOR Future readVersionBatcher( DatabaseContext *cx, FutureStream< std::p if (requests.size() == CLIENT_KNOBS->MAX_BATCH_SIZE) send_batch = true; else if (!timeout.isValid()) - timeout = delay(batchTime, TaskProxyGetConsistentReadVersion); + timeout = delay(batchTime, TaskPriority::ProxyGetConsistentReadVersion); } when(wait(timeout.isValid() ? timeout : Never())) { send_batch = true; @@ -3240,7 +3240,7 @@ ACTOR Future< StorageMetrics > waitStorageMetricsMultipleLocations( WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; - fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution ); + fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution ); } wait( waitForAll(fx) ); @@ -3271,7 +3271,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( int shardLimit ) { loop { - vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskDataDistribution) ) ); + vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) ); //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this. 
if(locations.size() < shardLimit) { @@ -3281,7 +3281,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( fx = waitStorageMetricsMultipleLocations( locations, min, max, permittedError ); } else { WaitMetricsRequest req( keys, min, max ); - fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution ); + fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution ); } StorageMetrics x = wait(fx); return x; @@ -3291,14 +3291,14 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( throw; } cx->invalidateCache(keys); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } else { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) .detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); - wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution)); + wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range cx->invalidateCache( keys ); } @@ -3324,13 +3324,13 @@ Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, i ACTOR Future< Standalone> > splitStorageMetrics( Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated ) { loop { - state vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskDataDistribution) ) ); + state vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskPriority::DataDistribution) 
) ); state StorageMetrics used; state Standalone> results; //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this. if(locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { - wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); cx->invalidateCache(keys); } else { @@ -3341,7 +3341,7 @@ ACTOR Future< Standalone> > splitStorageMetrics( Database cx, state int i = 0; for(; i> > splitStorageMetrics( Database cx, throw; } cx->invalidateCache( keys ); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 0f59d368c5..e4310e9721 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -164,10 +164,10 @@ struct TransactionOptions { struct TransactionInfo { Optional debugID; - int taskID; + TaskPriority taskID; bool useProvisionalProxies; - explicit TransactionInfo( int taskID ) : taskID(taskID), useProvisionalProxies(false) {} + explicit TransactionInfo( TaskPriority taskID ) : taskID(taskID), useProvisionalProxies(false) {} }; struct TransactionLogInfo : public ReferenceCounted, NonCopyable { @@ -287,7 +287,7 @@ public: void flushTrLogsIfEnabled(); // These are to permit use as state variables in actors: - Transaction() : info( TaskDefaultEndpoint ) {} + Transaction() : info( TaskPriority::DefaultEndpoint ) {} void operator=(Transaction&& r) BOOST_NOEXCEPT; void reset(); diff --git a/fdbclient/StatusClient.actor.cpp b/fdbclient/StatusClient.actor.cpp index d4b06a5182..8e706987a9 100644 --- a/fdbclient/StatusClient.actor.cpp +++ b/fdbclient/StatusClient.actor.cpp @@ -291,7 +291,7 @@ ACTOR Future> clientCoordinatorsStatusFetcher(Reference>> leaderServers; for 
(int i = 0; i < coord.clientLeaderServers.size(); i++) - leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskCoordinationReply)); + leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply)); wait( smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.5) || delay(2.0) ); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 6225fd50f7..ebc880f8ce 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -80,9 +80,9 @@ struct StorageServerInterface { bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } bool operator < (StorageServerInterface const& s) const { return uniqueID < s.uniqueID; } void initEndpoints() { - getValue.getEndpoint( TaskLoadBalancedEndpoint ); - getKey.getEndpoint( TaskLoadBalancedEndpoint ); - getKeyValues.getEndpoint( TaskLoadBalancedEndpoint ); + getValue.getEndpoint( TaskPriority::LoadBalancedEndpoint ); + getKey.getEndpoint( TaskPriority::LoadBalancedEndpoint ); + getKeyValues.getEndpoint( TaskPriority::LoadBalancedEndpoint ); } }; diff --git a/fdbclient/VersionedMap.actor.h b/fdbclient/VersionedMap.actor.h index cfb9e650f6..953c2f4c1f 100644 --- a/fdbclient/VersionedMap.actor.h +++ b/fdbclient/VersionedMap.actor.h @@ -31,7 +31,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
ACTOR template -Future deferredCleanupActor( std::vector toFree, int taskID = 7000 ) { +Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = 7000 ) { state int freeCount = 0; while (!toFree.empty()) { Tree a = std::move( toFree.back() ); diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index 705108ce72..58c440c679 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -511,7 +511,7 @@ public: oldestVersion = newOldestVersion; } - Future forgetVersionsBeforeAsync( Version newOldestVersion, int taskID = 7000 ) { + Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = 7000 ) { ASSERT( newOldestVersion <= latestVersion ); roots[newOldestVersion] = getRoot(newOldestVersion); diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index 12ca1866ad..f786266888 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -266,7 +266,7 @@ private: } ACTOR static Future read_impl( int fd, void* data, int length, int64_t offset ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; //fprintf(stderr, "eio_read (fd=%d length=%d offset=%lld)\n", fd, length, offset); state eio_req* r = eio_read(fd, data, length, offset, 0, eio_callback, &p); @@ -289,7 +289,7 @@ private: } ACTOR static Future write_impl( int fd, Reference err, StringRef data, int64_t offset ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_write(fd, (void*)data.begin(), data.size(), offset, 0, eio_callback, &p); try { wait( p.getFuture() ); } catch (...) 
{ g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -299,7 +299,7 @@ private: } ACTOR static Future truncate_impl( int fd, Reference err, int64_t size ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_ftruncate(fd, size, 0, eio_callback, &p); try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -330,7 +330,7 @@ private: } ACTOR static Future sync_impl( int fd, Reference err, bool sync_metadata=false ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = start_fsync( fd, p, sync_metadata ); @@ -350,7 +350,7 @@ private: } ACTOR static Future size_impl( int fd ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_fstat( fd, 0, eio_callback, &p ); try { wait( p.getFuture() ); } catch (...) 
{ g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -363,7 +363,7 @@ private: } ACTOR static Future stat_impl( std::string filename ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state EIO_STRUCT_STAT statdata; state eio_req* r = eio_stat( filename.c_str(), 0, eio_callback, &p ); @@ -377,7 +377,7 @@ private: ACTOR template static Future dispatch_impl( std::function func) { state Dispatch data( func ); - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state eio_req* r = eio_custom( [](eio_req* req) { // Runs on the eio thread pool @@ -418,7 +418,7 @@ private: static void eio_want_poll() { want_poll = 1; // SOMEDAY: NULL for deferred error, no analysis of correctness (itp) - onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPollEIO); + onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPriority::PollEIO); } static int eio_callback( eio_req* req ) { diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index ac66605be3..14495a6cdf 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -472,9 +472,9 @@ private: #endif } - int getTask() const { return (prio>>32)+1; } + TaskPriority getTask() const { return static_cast((prio>>32)+1); } - ACTOR static void deliver( Promise result, bool failed, int r, int task ) { + ACTOR static void deliver( Promise result, bool failed, int r, TaskPriority task ) { wait( delay(0, task) ); if (failed) result.sendError(io_timeout()); else if (r < 0) result.sendError(io_error()); @@ -649,7 +649,7 @@ private: loop { wait(success(ev->read())); - wait(delay(0, TaskDiskIOComplete)); + wait(delay(0, TaskPriority::DiskIOComplete)); linux_ioresult ev[FLOW_KNOBS->MAX_OUTSTANDING]; timespec tm; tm.tv_sec = 0; tm.tv_nsec = 0; diff --git a/fdbrpc/AsyncFileNonDurable.actor.cpp b/fdbrpc/AsyncFileNonDurable.actor.cpp index a3257f1fa8..6ea0129a27 100644 --- 
a/fdbrpc/AsyncFileNonDurable.actor.cpp +++ b/fdbrpc/AsyncFileNonDurable.actor.cpp @@ -23,13 +23,13 @@ std::map> AsyncFileNonDurable::filesBeingDeleted; -ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, int taskID ) { +ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ) { wait( g_simulator.onProcess( process, taskID ) ); promise.send(Void()); return Void(); } -ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, int taskID ) { +ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ) { wait( g_simulator.onProcess( process, taskID ) ); promise.sendError(e); return Void(); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 03fe8e852c..7e8e551b3e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -38,8 +38,8 @@ #undef max #undef min -Future sendOnProcess( ISimulator::ProcessInfo* const& process, Promise const& promise, int const& taskID ); -Future sendErrorOnProcess( ISimulator::ProcessInfo* const& process, Promise const& promise, Error const& e, int const& taskID ); +ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ); +ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ); ACTOR template Future sendErrorOnShutdown( Future in ) { @@ -198,7 +198,7 @@ public: //Creates a new AsyncFileNonDurable which wraps the provided IAsyncFile ACTOR static Future> open(std::string filename, std::string actualFilename, Future> wrappedFile, Reference diskParameters) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); state Future shutdown = 
success(currentProcess->shutdownSignal.getFuture()); //TraceEvent("AsyncFileNonDurableOpenBegin").detail("Filename", filename).detail("Addr", g_simulator.getCurrentProcess()->address); @@ -391,7 +391,7 @@ private: ACTOR Future read(AsyncFileNonDurable *self, void *data, int length, int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { @@ -411,7 +411,7 @@ private: //or none of the write. It may also corrupt parts of sectors which have not been written correctly ACTOR Future write(AsyncFileNonDurable *self, Promise writeStarted, Future> ownFuture, void const* data, int length, int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; @@ -535,7 +535,7 @@ private: //If a kill interrupts the delay, then the truncate may or may not be performed ACTOR Future truncate(AsyncFileNonDurable *self, Promise truncateStarted, Future> ownFuture, int64_t size) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; @@ -573,8 +573,8 @@ private: } } - if(g_network->check_yield(TaskDefaultYield)) { - wait(delay(0, TaskDefaultYield)); + if(g_network->check_yield(TaskPriority::DefaultYield)) { + wait(delay(0, TaskPriority::DefaultYield)); } //If performing a durable truncate, then pass it 
through to the file. Otherwise, pass it through with a 1/2 chance @@ -663,7 +663,7 @@ private: ACTOR Future sync(AsyncFileNonDurable *self, bool durable) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { @@ -695,7 +695,7 @@ private: ACTOR Future size(AsyncFileNonDurable *self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); @@ -714,7 +714,7 @@ private: //Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it ACTOR Future deleteFile(AsyncFileNonDurable *self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); state std::string filename = self->filename; wait( g_simulator.onMachine( currentProcess ) ); diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index dabc9800f7..46ca17f8e7 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -172,28 +172,28 @@ struct YieldMockNetwork : INetwork, ReferenceCounted { t.send(Void()); } - virtual Future delay(double seconds, int taskID) { + virtual Future delay(double seconds, TaskPriority taskID) { return nextTick.getFuture(); } - virtual Future yield(int taskID) { + virtual Future yield(TaskPriority taskID) { if (check_yield(taskID)) return delay(0,taskID); return Void(); } - virtual bool check_yield(int taskID) { + virtual bool check_yield(TaskPriority taskID) { if (nextYield > 0) --nextYield; return nextYield == 0; } // Delegate everything else. 
TODO: Make a base class NetworkWrapper for delegating everything in INetwork - virtual int getCurrentTask() { return baseNetwork->getCurrentTask(); } - virtual void setCurrentTask(int taskID) { baseNetwork->setCurrentTask(taskID); } + virtual TaskPriority getCurrentTask() { return baseNetwork->getCurrentTask(); } + virtual void setCurrentTask(TaskPriority taskID) { baseNetwork->setCurrentTask(taskID); } virtual double now() { return baseNetwork->now(); } virtual void stop() { return baseNetwork->stop(); } virtual bool isSimulated() const { return baseNetwork->isSimulated(); } - virtual void onMainThread(Promise&& signal, int taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } + virtual void onMainThread(Promise&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } virtual THREAD_HANDLE startThread(THREAD_FUNC_RETURN(*func) (void *), void *arg) { return baseNetwork->startThread(func,arg); } virtual Future< Reference > open(std::string filename, int64_t flags, int64_t mode) { return IAsyncFileSystem::filesystem()->open(filename,flags,mode); } virtual Future< Void > deleteFile(std::string filename, bool mustBeDurable) { return IAsyncFileSystem::filesystem()->deleteFile(filename,mustBeDurable); } diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index b08ef9756a..ae709cd675 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -49,7 +49,7 @@ public: EndpointMap(); void insert( NetworkMessageReceiver* r, Endpoint::Token& token, uint32_t priority ); NetworkMessageReceiver* get( Endpoint::Token const& token ); - uint32_t getPriority( Endpoint::Token const& token ); + TaskPriority getPriority( Endpoint::Token const& token ); void remove( Endpoint::Token const& token, NetworkMessageReceiver* r ); private: @@ -99,11 +99,11 @@ NetworkMessageReceiver* EndpointMap::get( Endpoint::Token const& token ) { return 0; } -uint32_t EndpointMap::getPriority( 
Endpoint::Token const& token ) { +TaskPriority EndpointMap::getPriority( Endpoint::Token const& token ) { uint32_t index = token.second(); if ( index < data.size() && data[index].token().first() == token.first() && ((data[index].token().second()&0xffffffff00000000LL)|index)==token.second() ) - return data[index].token().second(); - return TaskUnknownEndpoint; + return static_cast(data[index].token().second()); + return TaskPriority::UnknownEndpoint; } void EndpointMap::remove( Endpoint::Token const& token, NetworkMessageReceiver* r ) { @@ -119,7 +119,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver { EndpointNotFoundReceiver(EndpointMap& endpoints) { //endpoints[WLTOKEN_ENDPOINT_NOT_FOUND] = this; Endpoint::Token e = WLTOKEN_ENDPOINT_NOT_FOUND; - endpoints.insert(this, e, TaskDefaultEndpoint); + endpoints.insert(this, e, static_cast(TaskPriority::DefaultEndpoint)); ASSERT( e == WLTOKEN_ENDPOINT_NOT_FOUND ); } virtual void receive( ArenaReader& reader ) { @@ -138,7 +138,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver { struct PingReceiver : NetworkMessageReceiver { PingReceiver(EndpointMap& endpoints) { Endpoint::Token e = WLTOKEN_PING_PACKET; - endpoints.insert(this, e, TaskReadSocket); + endpoints.insert(this, e, static_cast(TaskPriority::ReadSocket)); ASSERT( e == WLTOKEN_PING_PACKET ); } virtual void receive( ArenaReader& reader ) { @@ -435,10 +435,10 @@ struct Peer : NonCopyable { ACTOR static Future connectionWriter( Peer* self, Reference conn ) { state double lastWriteTime = now(); loop { - //wait( delay(0, TaskWriteSocket) ); - wait( delayJittered(std::max(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), TaskWriteSocket) ); - //wait( delay(500e-6, TaskWriteSocket) ); - //wait( yield(TaskWriteSocket) ); + //wait( delay(0, TaskPriority::WriteSocket) ); + wait( delayJittered(std::max(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), 
TaskPriority::WriteSocket) ); + //wait( delay(500e-6, TaskPriority::WriteSocket) ); + //wait( yield(TaskPriority::WriteSocket) ); // Send until there is nothing left to send loop { @@ -453,7 +453,7 @@ struct Peer : NonCopyable { TEST(true); // We didn't write everything, so apparently the write buffer is full. Wait for it to be nonfull. wait( conn->onWritable() ); - wait( yield(TaskWriteSocket) ); + wait( yield(TaskPriority::WriteSocket) ); } // Wait until there is something to send @@ -599,8 +599,8 @@ TransportData::~TransportData() { } ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader reader, bool inReadSocket) { - int priority = self->endpoints.getPriority(destination.token); - if (priority < TaskReadSocket || !inReadSocket) { + TaskPriority priority = self->endpoints.getPriority(destination.token); + if (priority < TaskPriority::ReadSocket || !inReadSocket) { wait( delay(0, priority) ); } else { g_network->setCurrentTask( priority ); @@ -634,7 +634,7 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader } if( inReadSocket ) - g_network->setCurrentTask( TaskReadSocket ); + g_network->setCurrentTask( TaskPriority::ReadSocket ); } static void scanPackets(TransportData* transport, uint8_t*& unprocessed_begin, uint8_t* e, Arena& arena, @@ -884,11 +884,11 @@ ACTOR static Future connectionReader( if (readWillBlock) break; - wait(yield(TaskReadSocket)); + wait(yield(TaskPriority::ReadSocket)); } wait( conn->onReadable() ); - wait(delay(0, TaskReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time + wait(delay(0, TaskPriority::ReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time } } catch (Error& e) { @@ -932,7 +932,7 @@ ACTOR static Future listen( TransportData* self, NetworkAddress listenAddr .detail("FromAddress", conn->getPeerAddress()) 
.detail("ListenAddress", listenAddr.toString()); incoming.add( connectionIncoming(self, conn) ); - wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskWriteSocket)); + wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskPriority::WriteSocket)); } } catch (Error& e) { TraceEvent(SevError, "ListenError").error(e); @@ -1054,7 +1054,7 @@ void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessag } } -void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) { +void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) { endpoint.token = deterministicRandom()->randomUniqueID(); if (receiver->isStream()) { endpoint.addresses = self->localAddresses; @@ -1063,18 +1063,18 @@ void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* rec endpoint.addresses = NetworkAddressList(); endpoint.token = UID( endpoint.token.first() & ~TOKEN_STREAM_FLAG, endpoint.token.second() ); } - self->endpoints.insert( receiver, endpoint.token, taskID ); + self->endpoints.insert( receiver, endpoint.token, static_cast(taskID) ); } void FlowTransport::removeEndpoint( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { self->endpoints.remove(endpoint.token, receiver); } -void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) { +void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) { endpoint.addresses = self->localAddresses; ASSERT( ((endpoint.token.first() & TOKEN_STREAM_FLAG)!=0) == receiver->isStream() ); Endpoint::Token otoken = endpoint.token; - self->endpoints.insert( receiver, endpoint.token, taskID ); + self->endpoints.insert( receiver, endpoint.token, static_cast(taskID) ); ASSERT( endpoint.token == otoken ); } diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index 827d2727e6..d1be8c3411 
100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -137,13 +137,13 @@ public: void removePeerReference( const Endpoint&, NetworkMessageReceiver* ); // Signal that a peer connection is no longer being used - void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID ); + void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to be a new local endpoint which delivers messages to the given receiver void removeEndpoint( const Endpoint&, NetworkMessageReceiver* ); // The given local endpoint no longer delivers messages to the given receiver or uses resources - void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID ); + void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver // Implementations may have limitations on when this function is called and what endpoint.token may be! 
diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 557759d9a5..903a197f58 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -178,7 +178,7 @@ Future< REPLY_TYPE(Request) > loadBalance( Reference> alternatives, RequestStream Interface::* channel, Request request = Request(), - int taskID = TaskDefaultPromiseEndpoint, + TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = NULL) { diff --git a/fdbrpc/batcher.actor.h b/fdbrpc/batcher.actor.h index 7e276ad574..72a9bc9094 100644 --- a/fdbrpc/batcher.actor.h +++ b/fdbrpc/batcher.actor.h @@ -47,7 +47,7 @@ bool firstInBatch(CommitTransactionRequest x) { } ACTOR template -Future batcher(PromiseStream, int> > out, FutureStream in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, int taskID = TaskDefaultDelay, Counter* counter = 0) +Future batcher(PromiseStream, int> > out, FutureStream in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, TaskPriority taskID = TaskPriority::DefaultDelay, Counter* counter = 0) { wait( delayJittered(*avgMaxDelay, taskID) ); // smooth out // This is set up to deliver even zero-size batches if emptyBatchTimeout elapses, because that's what master proxy wants. The source control history diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 9853cbe968..470cec10d9 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -48,7 +48,7 @@ struct FlowReceiver : private NetworkMessageReceiver { // If already a remote endpoint, returns that. Otherwise makes this // a local endpoint and returns that. 
- const Endpoint& getEndpoint(int taskID) { + const Endpoint& getEndpoint(TaskPriority taskID) { if (!endpoint.isValid()) { m_isLocalEndpoint = true; FlowTransport::transport().addEndpoint(endpoint, this, taskID); @@ -56,7 +56,7 @@ struct FlowReceiver : private NetworkMessageReceiver { return endpoint; } - void makeWellKnownEndpoint(Endpoint::Token token, int taskID) { + void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) { ASSERT(!endpoint.isValid()); m_isLocalEndpoint = true; endpoint.token = token; @@ -128,7 +128,7 @@ public: ~ReplyPromise() { if (sav) sav->delPromiseRef(); } ReplyPromise(const Endpoint& endpoint) : sav(new NetSAV(0, 1, endpoint)) {} - const Endpoint& getEndpoint(int taskID = TaskDefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); } + const Endpoint& getEndpoint(TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); } void operator=(const ReplyPromise& rhs) { if (rhs.sav) rhs.sav->addPromiseRef(); @@ -204,19 +204,19 @@ template void resetReply(ReplyPromise & p) { p.reset(); } template -void resetReply(Request& r, int taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); } +void resetReply(Request& r, TaskPriority taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); } template -void resetReply(ReplyPromise & p, int taskID) { p.reset(); p.getEndpoint(taskID); } +void resetReply(ReplyPromise & p, TaskPriority taskID) { p.reset(); p.getEndpoint(taskID); } template -void setReplyPriority(Request& r, int taskID) { r.reply.getEndpoint(taskID); } +void setReplyPriority(Request& r, TaskPriority taskID) { r.reply.getEndpoint(taskID); } template -void setReplyPriority(ReplyPromise & p, int taskID) { p.getEndpoint(taskID); } +void setReplyPriority(ReplyPromise & p, TaskPriority taskID) { p.getEndpoint(taskID); } template -void setReplyPriority(const ReplyPromise & p, int taskID) { p.getEndpoint(taskID); } +void setReplyPriority(const ReplyPromise & p, TaskPriority taskID) { 
p.getEndpoint(taskID); } @@ -281,7 +281,7 @@ public: return reportEndpointFailure(getReplyPromise(value).getFuture(), getEndpoint()); } template - Future getReply(const X& value, int taskID) const { + Future getReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); return getReply(value); } @@ -290,7 +290,7 @@ public: return getReply(ReplyPromise()); } template - Future getReplyWithTaskID(int taskID) const { + Future getReplyWithTaskID(TaskPriority taskID) const { ReplyPromise reply; reply.getEndpoint(taskID); return getReply(reply); @@ -302,7 +302,7 @@ public: // If cancelled or returns failure, request was or will be delivered zero or one times. // The caller must be capable of retrying if this request returns failure template - Future> tryGetReply(const X& value, int taskID) const { + Future> tryGetReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); if (queue->isRemoteEndpoint()) { Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); @@ -344,7 +344,7 @@ public: // If it returns failure, the failure detector considers the endpoint failed permanently or for the given amount of time // See IFailureMonitor::onFailedFor() for an explanation of the duration and slope parameters. 
template - Future> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, int taskID) const { + Future> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, TaskPriority taskID) const { // If it is local endpoint, no need for failure monitoring return waitValueOrSignal(getReply(value, taskID), makeDependent(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope), @@ -388,8 +388,8 @@ public: //queue = (NetNotifiedQueue*)0xdeadbeef; } - Endpoint getEndpoint(int taskID = TaskDefaultEndpoint) const { return queue->getEndpoint(taskID); } - void makeWellKnownEndpoint(Endpoint::Token token, int taskID) { + Endpoint getEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) const { return queue->getEndpoint(taskID); } + void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) { queue->makeWellKnownEndpoint(token, taskID); } diff --git a/fdbrpc/genericactors.actor.h b/fdbrpc/genericactors.actor.h index 810ccdb731..744abaeebe 100644 --- a/fdbrpc/genericactors.actor.h +++ b/fdbrpc/genericactors.actor.h @@ -50,7 +50,7 @@ Future retryBrokenPromise( RequestStream to, Req request ) } ACTOR template -Future retryBrokenPromise( RequestStream to, Req request, int taskID ) { +Future retryBrokenPromise( RequestStream to, Req request, TaskPriority taskID ) { // Like to.getReply(request), except that a broken_promise exception results in retrying request immediately. // Suitable for use with well known endpoints, which are likely to return to existence after the other process restarts. // Not normally useful for ordinary endpoints, which conventionally are permanently destroyed after replying with broken_promise. 
diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 04aa0684ba..7b0547e922 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -422,7 +422,7 @@ public: ACTOR static Future> open( std::string filename, int flags, int mode, Reference diskParameters = Reference(new DiskParameters(25000, 150000000)), bool delayOnWrite = true ) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); if(++openCount >= 3000) { TraceEvent(SevError, "TooManyFiles"); @@ -741,11 +741,11 @@ public: // Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating machines and time virtual double now() { return time; } - virtual Future delay( double seconds, int taskID ) { - ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority); + virtual Future delay( double seconds, TaskPriority taskID ) { + ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); return delay( seconds, taskID, currentProcess ); } - Future delay( double seconds, int taskID, ProcessInfo* machine ) { + Future delay( double seconds, TaskPriority taskID, ProcessInfo* machine ) { ASSERT( seconds >= -0.0001 ); seconds = std::max(0.0, seconds); Future f; @@ -760,13 +760,13 @@ public: return f; } - ACTOR static Future checkShutdown(Sim2 *self, int taskID) { + ACTOR static Future checkShutdown(Sim2 *self, TaskPriority taskID) { wait(success(self->getCurrentProcess()->shutdownSignal.getFuture())); self->setCurrentTask(taskID); return Void(); } - virtual Future yield( int taskID ) { - if (taskID == TaskDefaultYield) taskID = currentTaskID; + virtual Future yield( TaskPriority taskID ) { + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID)) { // We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), but // don't 
want to prevent instantaneous shutdown of "rebooted" machines. @@ -775,7 +775,7 @@ public: setCurrentTask(taskID); return Void(); } - virtual bool check_yield( int taskID ) { + virtual bool check_yield( TaskPriority taskID ) { if (yielded) return true; if (--yield_limit <= 0) { yield_limit = deterministicRandom()->randomInt(1, 150); // If yield returns false *too* many times in a row, there could be a stack overflow, since we can't deterministically check stack size as the real network does @@ -783,10 +783,10 @@ public: } return yielded = BUGGIFY_WITH_PROB(0.01); } - virtual int getCurrentTask() { + virtual TaskPriority getCurrentTask() { return currentTaskID; } - virtual void setCurrentTask(int taskID ) { + virtual void setCurrentTask(TaskPriority taskID ) { currentTaskID = taskID; } // Sets the taskID/priority of the current task, without yielding @@ -923,7 +923,7 @@ public: } if ( mustBeDurable || deterministicRandom()->random01() < 0.5 ) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { wait( ::delay(0.05 * deterministicRandom()->random01()) ); @@ -949,7 +949,7 @@ public: ACTOR static Future runLoop(Sim2 *self) { state ISimulator::ProcessInfo *callingMachine = self->currentProcess; while ( !self->isStopped ) { - wait( self->net2->yield(TaskDefaultYield) ); + wait( self->net2->yield(TaskPriority::DefaultYield) ); self->mutex.enter(); if( self->tasks.size() == 0 ) { @@ -1579,23 +1579,23 @@ public: machines.erase(machineId); } - Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(-1) { + Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(TaskPriority::Zero) { // Not letting currentProcess be NULL eliminates some annoying special cases currentProcess = new 
ProcessInfo("NoMachine", LocalityData(Optional>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", ""); g_network = net2 = newNet2(false, true, objSerializer); Net2FileSystem::newFileSystem(); - check_yield(0); + check_yield(TaskPriority::Zero); } // Implementation struct Task { - int taskID; + TaskPriority taskID; double time; uint64_t stable; ProcessInfo* machine; Promise action; - Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} - Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } + Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} + Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time), taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine), action(std::move(rhs.action)) {} void operator= ( Task const& rhs ) { taskID = rhs.taskID; time = rhs.time; stable = rhs.stable; machine = rhs.machine; action = rhs.action; } Task( Task const& rhs ) : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {} @@ -1642,20 +1642,20 @@ public: } } - virtual void onMainThread( Promise&& signal, int taskID ) { + virtual void onMainThread( Promise&& signal, TaskPriority taskID ) { // This is presumably coming from either a "fake" thread pool thread, i.e. 
it is actually on this thread // or a thread created with g_network->startThread ASSERT(getCurrentProcess()); mutex.enter(); - ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority); + ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); tasks.push( Task( time, taskID, taskCount++, getCurrentProcess(), std::move(signal) ) ); mutex.leave(); } - virtual Future onProcess( ISimulator::ProcessInfo *process, int taskID ) { + virtual Future onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID ) { return delay( 0, taskID, process ); } - virtual Future onMachine( ISimulator::ProcessInfo *process, int taskID ) { + virtual Future onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID ) { if( process->machine == 0 ) return Void(); return delay( 0, taskID, process->machine->machineProcess ); @@ -1664,7 +1664,7 @@ public: //time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because //time should only be modified from the main thread. 
double time; - int currentTaskID; + TaskPriority currentTaskID; //taskCount is guarded by ISimulator::mutex uint64_t taskCount; @@ -1694,9 +1694,9 @@ void startNewSimulator(bool objSerializer) { } ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { - TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay); + TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriority::DefaultDelay", TaskPriority::DefaultDelay); - wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question + wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question try { ASSERT( kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || kt == ISimulator::RebootProcessAndDelete ); diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 81e3ecc4f6..403db9ce57 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -137,8 +137,8 @@ public: ProcessInfo* getProcess( Endpoint const& endpoint ) { return getProcessByAddress(endpoint.getPrimaryAddress()); } ProcessInfo* getCurrentProcess() { return currentProcess; } - virtual Future onProcess( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0; - virtual Future onMachine( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0; + virtual Future onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0; + virtual Future 
onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0; virtual ProcessInfo* newProcess(const char* name, IPAddress ip, uint16_t port, uint16_t listenPerProcess, LocalityData locality, ProcessClass startingClass, const char* dataFolder, diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 9fc12d502e..f4b07cdbe5 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -107,7 +107,7 @@ public: DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), clientInfo( new AsyncVar( ClientDBInfo() ) ), serverInfo( new AsyncVar( ServerDBInfo() ) ), - db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskDefaultEndpoint, true ) ) // SOMEDAY: Locality! + db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality! 
{ } @@ -1171,7 +1171,7 @@ public: serverInfo.clusterInterface = ccInterface; serverInfo.myLocality = locality; db.serverInfo->set( serverInfo ); - cx = openDBOnServer(db.serverInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true); } ~ClusterControllerData() { diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index dc9b41e5a6..d8432c7d1e 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -63,13 +63,13 @@ struct ClusterControllerFullInterface { void initEndpoints() { clientInterface.initEndpoints(); - recruitFromConfiguration.getEndpoint( TaskClusterController ); - recruitRemoteFromConfiguration.getEndpoint( TaskClusterController ); - recruitStorage.getEndpoint( TaskClusterController ); - registerWorker.getEndpoint( TaskClusterController ); - getWorkers.getEndpoint( TaskClusterController ); - registerMaster.getEndpoint( TaskClusterController ); - getServerDBInfo.getEndpoint( TaskClusterController ); + recruitFromConfiguration.getEndpoint( TaskPriority::ClusterController ); + recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterController ); + recruitStorage.getEndpoint( TaskPriority::ClusterController ); + registerWorker.getEndpoint( TaskPriority::ClusterController ); + getWorkers.getEndpoint( TaskPriority::ClusterController ); + registerMaster.getEndpoint( TaskPriority::ClusterController ); + getServerDBInfo.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index b4e7283592..641ded30a0 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -52,8 +52,8 @@ GenerationRegInterface::GenerationRegInterface( NetworkAddress remote ) GenerationRegInterface::GenerationRegInterface( INetwork* local ) { - read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskCoordination ); - 
write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskCoordination ); + read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskPriority::Coordination ); + write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskPriority::Coordination ); } LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote) @@ -67,9 +67,9 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote) LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local) : ClientLeaderRegInterface(local) { - candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskCoordination ); - leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskCoordination ); - forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskCoordination ); + candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskPriority::Coordination ); + leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskPriority::Coordination ); + forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskPriority::Coordination ); } ServerCoordinators::ServerCoordinators( Reference cf ) diff --git a/fdbserver/CoroFlow.actor.cpp b/fdbserver/CoroFlow.actor.cpp index af9b5ac565..22eaab2b0f 100644 --- a/fdbserver/CoroFlow.actor.cpp +++ b/fdbserver/CoroFlow.actor.cpp @@ -263,7 +263,7 @@ typedef WorkPool CoroPool; -ACTOR void coroSwitcher( Future what, int taskID, Coro* coro ) { +ACTOR void coroSwitcher( Future what, TaskPriority taskID, Coro* coro ) { try { // state double t = now(); wait(what); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index ef8c25b2b6..9fc6c04ccd 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -88,7 +88,7 @@ struct TCMachineInfo : public ReferenceCounted { ACTOR Future updateServerMetrics( TCServerInfo *server ) { state StorageServerInterface ssi = 
server->lastKnownInterface; - state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); + state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch ); state Future resetRequest = Never(); state Future> interfaceChanged( server->onInterfaceChanged ); state Future serverRemoved( server->onRemoved ); @@ -104,7 +104,7 @@ ACTOR Future updateServerMetrics( TCServerInfo *server ) { return Void(); } metricsRequest = Never(); - resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskDataDistributionLaunch ); + resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch ); } when( std::pair _ssi = wait( interfaceChanged ) ) { ssi = _ssi.first; @@ -120,7 +120,7 @@ ACTOR Future updateServerMetrics( TCServerInfo *server ) { } else { resetRequest = Never(); - metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); + metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch ); } } } @@ -635,9 +635,9 @@ struct DDTeamCollection : ReferenceCounted { shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()), badTeamRemover(Void()), redundantTeamRemover(Void()), configuration(configuration), readyToStart(readyToStart), clearHealthyZoneFuture(Void()), - checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)), + checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistribution)), initialFailureReactionDelay( - delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)), + delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)), healthyTeamCount(0), storageServerSet(new LocalityMap()), initializationDoneActor(logOnCompletion(readyToStart && 
initialFailureReactionDelay, this)), optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), @@ -671,7 +671,7 @@ struct DDTeamCollection : ReferenceCounted { ACTOR static Future logOnCompletion( Future signal, DDTeamCollection* self ) { wait(signal); - wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskDataDistribution)); + wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution)); if(!self->primary || self->configuration.usableRegions == 1) { TraceEvent("DDTrackerStarting", self->distributorId) @@ -1919,7 +1919,7 @@ struct DDTeamCollection : ReferenceCounted { //Building teams can cause servers to become undesired, which can make teams unhealthy. //Let all of these changes get worked out before responding to the get team request - wait( delay(0, TaskDataDistributionLaunch) ); + wait( delay(0, TaskPriority::DataDistributionLaunch) ); return Void(); } @@ -2232,7 +2232,7 @@ ACTOR Future waitUntilHealthy(DDTeamCollection* self) { TraceEvent("WaitUntilHealthyStalled", self->distributorId).detail("Primary", self->primary).detail("ZeroHealthy", self->zeroHealthyTeams->get()).detail("ProcessingUnhealthy", self->processingUnhealthy->get()); wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange()); } - wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. + wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskPriority::Low)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. 
if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { return Void(); } @@ -2638,7 +2638,7 @@ ACTOR Future trackExcludedServers( DDTeamCollection* self ) { if (nchid != lastChangeID) break; - wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskDataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead + wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskPriority::DataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead tr = Transaction(self->cx); } catch (Error& e) { wait( tr.onError(e) ); @@ -2757,14 +2757,14 @@ ACTOR Future serverMetricsPolling( TCServerInfo *server) { state double lastUpdate = now(); loop { wait( updateServerMetrics( server ) ); - wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskDataDistributionLaunch ) ); + wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskPriority::DataDistributionLaunch ) ); lastUpdate = now(); } } //Returns the KeyValueStoreType of server if it is different from self->storeType ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo *server) { - state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskDataDistribution))); + state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskPriority::DataDistribution))); if(type == self->configuration.storageServerStoreType && (self->includedDCs.empty() || std::find(self->includedDCs.begin(), self->includedDCs.end(), server->lastKnownInterface.locality.dcId()) != self->includedDCs.end()) ) wait(Future(Never())); @@ -2787,7 +2787,7 @@ ACTOR Future waitForAllDataRemoved( Database cx, UID serverID, Version add } // Wait for any change to 
the serverKeys for this server - wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskDataDistribution) ); + wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution) ); tr.reset(); } catch (Error& e) { wait( tr.onError(e) ); @@ -2830,7 +2830,7 @@ ACTOR Future storageServerFailureTracker( ASSERT(!inHealthyZone); healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false)); } else if(!inHealthyZone) { - healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution); + healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskPriority::DataDistribution); } choose { when ( wait(healthChanged) ) { @@ -3120,7 +3120,7 @@ ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { loop { choose { when( wait( self->recruitingStream.onChange() ) ) {} - when( wait( self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskDataDistribution) : Future(Never()) ) ) { break; } + when( wait( self->recruitingStream.get() == 0 ? 
delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution) : Future(Never()) ) ) { break; } } } TraceEvent("StorageServerRecruitment", self->distributorId) @@ -3147,12 +3147,12 @@ ACTOR Future initializeStorage( DDTeamCollection* self, RecruitStorageRepl self->recruitingIds.insert(interfaceId); self->recruitingLocalities.insert(candidateWorker.worker.address()); - state ErrorOr newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskDataDistribution ) ); + state ErrorOr newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskPriority::DataDistribution ) ); if(newServer.isError()) { TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError()); if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) ) throw newServer.getError(); - wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskDataDistribution) ); + wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution) ); } self->recruitingIds.erase(interfaceId); self->recruitingLocalities.erase(candidateWorker.worker.address()); @@ -3217,7 +3217,7 @@ ACTOR Future storageRecruiter( DDTeamCollection* self, Referenceget().clusterInterface.recruitStorage.getReply( rsr, TaskDataDistribution ) ); + fCandidateWorker = brokenPromiseToNever( db->get().clusterInterface.recruitStorage.getReply( rsr, TaskPriority::DataDistribution ) ); } choose { @@ -3388,7 +3388,7 @@ ACTOR Future dataDistributionTeamCollection( ACTOR Future waitForDataDistributionEnabled( Database cx ) { state Transaction tr(cx); loop { - wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskDataDistribution)); + wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskPriority::DataDistribution)); try { Optional mode = wait( tr.get( dataDistributionModeKey ) ); @@ -3516,7 +3516,7 @@ ACTOR Future dataDistribution(Reference self) state double lastLimited = 0; self->addActor.send( monitorBatchLimitedTime(self->dbInfo, 
&lastLimited) ); - state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DataDistributionLaunch, true, true); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; //cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); @@ -3646,7 +3646,7 @@ ACTOR Future dataDistribution(Reference self) } output.send( RelocateShard( keys, unhealthy ? PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) ); } - wait( yield(TaskDataDistribution) ); + wait( yield(TaskPriority::DataDistribution) ); } vector tcis; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index e155254850..d11fc63146 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -512,9 +512,9 @@ struct DDQueueData { // FIXME: is the merge case needed if( input.priority == PRIORITY_MERGE_SHARD ) { - wait( delay( 0.5, TaskDataDistribution - 2 ) ); + wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution )) ) ); } else { - wait( delay( 0.0001, TaskDataDistributionLaunch ) ); + wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) ); } loop { @@ -933,7 +933,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd .detail("Count", stuckCount) .detail("TeamCollectionId", tciIndex) .detail("NumOfTeamCollections", self->teamCollections.size()); - wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) ); + wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) ); } state std::vector destIds; @@ -993,7 +993,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd state Error error = success(); state Promise dataMovementComplete; state Future doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, 
dataMovementComplete, &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->teamCollections.size() > 1, relocateShardInterval.pairID ); - state Future pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch ); + state Future pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch ); try { loop { choose { @@ -1016,7 +1016,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd self->dataTransferComplete.send(rd); } } - pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch ); + pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch ); } when( wait( signalledTransferComplete ? Never() : dataMovementComplete.getFuture() ) ) { self->fetchKeysComplete.insert( rd ); @@ -1066,7 +1066,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } else { TEST(true); // move to removed server healthyDestinations.addDataInFlightToTeam( -metrics.bytes ); - wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskDataDistributionLaunch ) ); + wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch ) ); } } } catch (Error& e) { @@ -1125,7 +1125,7 @@ ACTOR Future BgDDMountainChopper( DDQueueData* self, int teamCollectionInd state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; loop { - wait( delay(checkDelay, TaskDataDistributionLaunch) ); + wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) ); if (self->priority_relocations[PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { state Optional> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( 
GetTeamRequest( true, false, true ) ) ) ); if( randomTeam.present() ) { @@ -1160,7 +1160,7 @@ ACTOR Future BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex) state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; loop { - wait( delay(checkDelay, TaskDataDistributionLaunch) ); + wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) ); if (self->priority_relocations[PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { state Optional> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( GetTeamRequest( true, false, false ) ) ) ); if( randomTeam.present() ) { @@ -1244,7 +1244,7 @@ ACTOR Future dataDistributionQueue( bool wasEmpty = serversToLaunchFrom.empty(); self.queueRelocation( rs, serversToLaunchFrom ); if(wasEmpty && !serversToLaunchFrom.empty()) - launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch); + launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); } when ( wait(launchQueuedWorkTimeout) ) { self.launchQueuedWork( serversToLaunchFrom ); @@ -1258,7 +1258,7 @@ ACTOR Future dataDistributionQueue( when ( RelocateData done = waitNext( self.dataTransferComplete.getFuture() ) ) { complete( done, self.busymap ); if(serversToLaunchFrom.empty() && !done.src.empty()) - launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch); + launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); serversToLaunchFrom.insert(done.src.begin(), done.src.end()); } when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) { @@ -1266,7 +1266,7 @@ ACTOR Future dataDistributionQueue( self.finishRelocation(done.priority); self.fetchKeysComplete.erase( done ); //self.logRelocation( done, "ShardRelocatorDone" ); - actors.add( tag( delay(0, TaskDataDistributionLaunch), done.keys, rangesComplete ) ); + actors.add( tag( delay(0, 
TaskPriority::DataDistributionLaunch), done.keys, rangesComplete ) ); if( g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60 ) { TraceEvent(SevWarnAlways, "RelocationDurationTooLong").detail("Duration", now() - done.startTime); debug_setCheckRelocationDuration(false); diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index c4c8329754..ca4a849a33 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -140,7 +140,7 @@ ACTOR Future trackShardBytes( Reference>> shardSize, bool addToSizeEstimate = true) { - wait( delay( 0, TaskDataDistribution ) ); + wait( delay( 0, TaskPriority::DataDistribution ) ); /*TraceEvent("TrackShardBytesStarting") .detail("TrackerID", trackerID) @@ -260,7 +260,7 @@ ACTOR Future changeSizes( DataDistributionTracker* self, KeyRangeRef keys, } wait( waitForAll( sizes ) ); - wait( yield(TaskDataDistribution) ); + wait( yield(TaskPriority::DataDistribution) ); int64_t newShardsStartingSize = 0; for ( int i = 0; i < sizes.size(); i++ ) @@ -281,7 +281,7 @@ struct HasBeenTrueFor : NonCopyable { Future set() { if( !trigger.isValid() ) { cleared = Promise(); - trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, TaskDataDistribution - 1 ) || cleared.getFuture(); + trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture(); } return trigger; } @@ -361,7 +361,7 @@ ACTOR Future shardSplitter( self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().bytes ) ); } else { - wait( delay(1.0, TaskDataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers + wait( delay(1.0, TaskPriority::DataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers } return Void(); } @@ -529,7 +529,7 @@ ACTOR Future 
shardTracker( wait( yieldedFuture(self->maxShardSize->onChange()) ); // Since maxShardSize will become present for all shards at once, avoid slow tasks with a short delay - wait( delay( 0, TaskDataDistribution ) ); + wait( delay( 0, TaskPriority::DataDistribution ) ); /*TraceEvent("ShardTracker", self->distributorId) .detail("Begin", keys.begin) @@ -546,7 +546,7 @@ ACTOR Future shardTracker( // We could have a lot of actors being released from the previous wait at the same time. Immediately calling // delay(0) mitigates the resulting SlowTask - wait( delay(0, TaskDataDistribution) ); + wait( delay(0, TaskPriority::DataDistribution) ); } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) @@ -593,12 +593,12 @@ ACTOR Future trackInitialShards(DataDistributionTracker *self, Referenceshards.size()-1; s++) { restartShardTrackers( self, KeyRangeRef( initData->shards[s].key, initData->shards[s+1].key ) ); - wait( yield( TaskDataDistribution ) ); + wait( yield( TaskPriority::DataDistribution ) ); } Future initialSize = changeSizes( self, KeyRangeRef(allKeys.begin, allKeys.end), 0 ); diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index e53fa5a29a..7ce1a5c9b0 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1937,8 +1937,8 @@ KeyValueStoreSQLite::KeyValueStoreSQLite(std::string const& filename, UID id, Ke readCursors.resize(64); //< number of read threads sqlite3_soft_heap_limit64( SERVER_KNOBS->SOFT_HEAP_LIMIT ); // SOMEDAY: Is this a performance issue? Should we drop the cache sizes for individual threads? 
- int taskId = g_network->getCurrentTask(); - g_network->setCurrentTask(TaskDiskWrite); + TaskPriority taskId = g_network->getCurrentTask(); + g_network->setCurrentTask(TaskPriority::DiskWrite); writeThread->addThread( new Writer(filename, type==KeyValueStoreType::SSD_BTREE_V2, checkChecksums, checkIntegrity, writesComplete, springCleaningStats, diskBytesUsed, freeListPages, id, &readCursors) ); g_network->setCurrentTask(taskId); auto p = new Writer::InitAction(); @@ -1963,8 +1963,8 @@ StorageBytes KeyValueStoreSQLite::getStorageBytes() { void KeyValueStoreSQLite::startReadThreads() { int nReadThreads = readCursors.size(); - int taskId = g_network->getCurrentTask(); - g_network->setCurrentTask(TaskDiskRead); + TaskPriority taskId = g_network->getCurrentTask(); + g_network->setCurrentTask(TaskPriority::DiskRead); for(int i=0; iaddThread( new Reader(filename, type==KeyValueStoreType::SSD_BTREE_V2, readsComplete, logID, &readCursors[i]) ); g_network->setCurrentTask(taskId); diff --git a/fdbserver/LeaderElection.actor.cpp b/fdbserver/LeaderElection.actor.cpp index 3cc50609d3..5a97b6358f 100644 --- a/fdbserver/LeaderElection.actor.cpp +++ b/fdbserver/LeaderElection.actor.cpp @@ -30,7 +30,7 @@ Optional> getLeader( const vector submitCandidacy( Key key, LeaderElectionRegInterface coord, LeaderInfo myInfo, UID prevChangeID, Reference>>> nominees, int index ) { loop { auto const& nom = nominees->get()[index]; - Optional li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID ), TaskCoordinationReply ) ); + Optional li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? 
nom.get().changeID : UID(), prevChangeID ), TaskPriority::CoordinationReply ) ); if (li != nominees->get()[index]) { vector> v = nominees->get(); @@ -150,7 +150,7 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu // we might be breaking the leader election process for someone with better communications but lower ID, so change IDs. if ((!leader.present() || !leader.get().second) && std::count( nominees->get().begin(), nominees->get().end(), myInfo )) { if (!badCandidateTimeout.isValid()) - badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskCoordinationReply ); + badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskPriority::CoordinationReply ); } else badCandidateTimeout = Future(); @@ -183,12 +183,12 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu state vector> true_heartbeats; state vector> false_heartbeats; for(int i=0; i hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskCoordinationReply ); + Future hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskPriority::CoordinationReply ); true_heartbeats.push_back( onEqual(hb, true) ); false_heartbeats.push_back( onEqual(hb, false) ); } - state Future rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskCoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side? + state Future rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskPriority::CoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side? 
choose { when ( wait( quorum( true_heartbeats, true_heartbeats.size()/2+1 ) ) ) { diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 2dc8194d3a..eae38b50a7 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -51,7 +51,7 @@ struct LogRouterData { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, TaskPriority taskID ) { while(!self->version_messages.empty() && self->version_messages.front().first < before) { Version version = self->version_messages.front().first; int64_t messagesErased = 0; @@ -68,7 +68,7 @@ struct LogRouterData { return Void(); } - Future eraseMessagesBefore(Version before, LogRouterData *tlogData, int taskID) { + Future eraseMessagesBefore(Version before, LogRouterData *tlogData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, taskID); } }; @@ -197,7 +197,7 @@ ACTOR Future waitForVersion( LogRouterData *self, Version ver ) { while(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < ver) { if(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS > self->version.get()) { self->version.set( self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); } else { wait(self->minPopped.whenAtLeast((self->minPopped.get()+1))); } @@ -220,7 +220,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { loop { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { //FIXME: does this actually happen? 
@@ -247,7 +247,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { commitMessages(self, ver, messages); self->version.set( ver ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); //TraceEvent("LogRouterVersion").detail("Ver",ver); } lastVer = ver; @@ -260,7 +260,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { wait( waitForVersion(self, ver) ); self->version.set( ver ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); } break; } @@ -370,7 +370,7 @@ ACTOR Future logRouterPop( LogRouterData* self, TLogPopRequest req ) { } else if (req.to > tagData->popped) { tagData->popped = req.to; tagData->durableKnownCommittedVersion = req.durableKnownCommittedVersion; - wait(tagData->eraseMessagesBefore( req.to, self, TaskTLogPop )); + wait(tagData->eraseMessagesBefore( req.to, self, TaskPriority::TLogPop )); } state Version minPopped = std::numeric_limits::max(); @@ -384,7 +384,7 @@ ACTOR Future logRouterPop( LogRouterData* self, TLogPopRequest req ) { while(!self->messageBlocks.empty() && self->messageBlocks.front().first < minPopped) { self->messageBlocks.pop_front(); - wait(yield(TaskTLogPop)); + wait(yield(TaskPriority::TLogPop)); } self->poppedVersion = std::min(minKnownCommittedVersion, self->minKnownCommittedVersion); diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index dad3779938..a261354fbe 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -341,7 +341,7 @@ struct ILogSystem { //returns immediately if hasMessage() returns true. //returns when either the result of hasMessage() or version() has changed. 
- virtual Future getMore(int taskID = TaskTLogPeekReply) = 0; + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0; //returns when the failure monitor detects that the servers associated with the cursor are failed virtual Future onFailed() = 0; @@ -406,7 +406,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -454,7 +454,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -499,7 +499,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -533,7 +533,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -593,7 +593,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = 
TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index ecf1877536..dee74c2dde 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -133,7 +133,7 @@ void ILogSystem::ServerPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) { +ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) { if( !self->interf || self->messageVersion >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -192,7 +192,7 @@ ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self } } -ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) { +ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) { if( !self->interf || self->messageVersion >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -225,7 +225,7 @@ ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int ta } } -Future ILogSystem::ServerPeekCursor::getMore(int taskID) { +Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { //TraceEvent("SPC_GetMore", randomID).detail("HasMessage", hasMessage()).detail("More", !more.isValid() || more.isReady()).detail("MessageVersion", messageVersion.toString()).detail("End", end.toString()); if( hasMessage() ) return Void(); @@ -431,7 +431,7 @@ void ILogSystem::MergedPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, int taskID) { +ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) { loop { 
//TraceEvent("MPC_GetMoreA", self->randomID).detail("Start", startVersion.toString()); if(self->bestServer >= 0 && self->serverCursors[self->bestServer]->isActive()) { @@ -452,7 +452,7 @@ ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess } } -Future ILogSystem::MergedPeekCursor::getMore(int taskID) { +Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { if(!serverCursors.size()) return Never(); @@ -692,7 +692,7 @@ void ILogSystem::SetPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, int taskID) { +ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) { loop { //TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag); if(self->bestServer >= 0 && self->bestSet >= 0 && self->serverCursors[self->bestSet][self->bestServer]->isActive()) { @@ -753,7 +753,7 @@ ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer } } -Future ILogSystem::SetPeekCursor::getMore(int taskID) { +Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { auto startVersion = version(); calcHasMessage(); if( hasMessage() ) @@ -848,7 +848,7 @@ void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { cursors.back()->advanceTo(n); } -Future ILogSystem::MultiCursor::getMore(int taskID) { +Future ILogSystem::MultiCursor::getMore(TaskPriority taskID) { LogMessageVersion startVersion = cursors.back()->version(); while( cursors.size() > 1 && cursors.back()->version() >= epochEnds.back() ) { poppedVersion = std::max(poppedVersion, cursors.back()->popped()); @@ -964,7 +964,7 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) { ASSERT(false); } -ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, int taskID ) { +ACTOR Future bufferedGetMoreLoader( 
ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, TaskPriority taskID ) { loop { wait(yield()); if(cursor->version().version >= maxVersion) { @@ -981,7 +981,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe } } -ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID ) { +ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriority taskID ) { if( self->messageVersion.version >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -1015,7 +1015,7 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID return Void(); } -Future ILogSystem::BufferedCursor::getMore(int taskID) { +Future ILogSystem::BufferedCursor::getMore(TaskPriority taskID) { if( hasMessage() ) return Void(); return bufferedGetMore(this, taskID); diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index 44674ec3bb..91a0d2444d 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -50,7 +50,7 @@ struct MasterInterface { } void initEndpoints() { - getCommitVersion.getEndpoint( TaskProxyGetConsistentReadVersion ); + getCommitVersion.getEndpoint( TaskPriority::ProxyGetConsistentReadVersion ); } }; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 3fc4665a15..57d2211fd8 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -158,7 +158,7 @@ ACTOR Future queueTransactionStartRequests( if (now() - *lastGRVTime > *GRVBatchTime) *lastGRVTime = now() - *GRVBatchTime; - forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskProxyGRVTimer)); + forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskPriority::ProxyGRVTimer)); } transactionQueue->push(std::make_pair(req, counter--)); @@ -263,7 +263,7 @@ struct ProxyCommitData { lastVersionTime(0), commitVersionRequestNumber(1), 
mostRecentProcessedRequestNumber(0), getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0), localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), - firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), db(db), + firstProxy(firstProxy), cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true)), db(db), singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0) {} }; @@ -350,7 +350,7 @@ struct ResolutionRequestBuilder { }; ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStream, int> > out, FutureStream in, int desiredBytes, int64_t memBytesLimit) { - wait(delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher)); + wait(delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher)); state double lastBatch = 0; @@ -363,7 +363,7 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamMAX_COMMIT_BATCH_INTERVAL, TaskProxyCommitBatcher); + timeout = delayJittered(SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL, TaskPriority::ProxyCommitBatcher); } while(!timeout.isReady() && !(batch.size() == SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_COUNT_MAX || batchBytes >= desiredBytes)) { @@ -387,10 +387,10 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamcommitBatchStartNotifications.send(Void()); if(now() - lastBatch > commitData->commitBatchInterval) { - timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskProxyCommitBatcher); + timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskPriority::ProxyCommitBatcher); } else { - timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskProxyCommitBatcher); + timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskPriority::ProxyCommitBatcher); } } @@ -398,7 +398,7 
@@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamcommitBatchStartNotifications.send(Void()); - timeout = delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher); + timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher); batch = std::vector(); batchBytes = 0; } @@ -457,7 +457,7 @@ ACTOR Future commitBatch( ASSERT(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS <= SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT); // since we are using just the former to limit the number of versions actually in flight! // Active load balancing runs at a very high priority (to obtain accurate estimate of memory used by commit batches) so we need to downgrade here - wait(delay(0, TaskProxyCommit)); + wait(delay(0, TaskPriority::ProxyCommit)); self->lastVersionTime = t1; @@ -534,7 +534,7 @@ ACTOR Future commitBatch( vector< Future > replies; for (int r = 0; rresolvers.size(); r++) { requests.requests[r].debugID = debugID; - replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskProxyResolverReply))); + replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskPriority::ProxyResolverReply))); } state vector> transactionResolverMap = std::move( requests.transactionResolverMap ); @@ -1135,7 +1135,7 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi state vector> proxyVersions; for (auto const& p : *otherProxies) - proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskTLogConfirmRunningReply))); + proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskPriority::TLogConfirmRunningReply))); if (!(flags&GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)) { @@ -1292,7 +1292,7 @@ ACTOR static Future transactionStarter( } if (!transactionQueue.empty()) - forwardPromise(GRVTimer, 
delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskProxyGRVTimer)); + forwardPromise(GRVTimer, delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskPriority::ProxyGRVTimer)); /*TraceEvent("GRVBatch", proxy.id()) .detail("Elapsed", elapsed) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 4893f3c6a1..6a979e3cc5 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -130,12 +130,12 @@ ACTOR Future> addReadWriteDestinations(KeyRangeRef shard, vector> > srcChecks; for(int s=0; sSERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), srcInterfs[s].id(), 0 ) ); + srcChecks.push_back( checkReadWrite( srcInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), srcInterfs[s].id(), 0 ) ); } state vector< Future> > destChecks; for(int s=0; sSERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), destInterfs[s].id(), version ) ); + destChecks.push_back( checkReadWrite( destInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), destInterfs[s].id(), version ) ); } wait( waitForAll(srcChecks) && waitForAll(destChecks) ); @@ -225,7 +225,7 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve state TraceInterval interval("RelocateShard_StartMoveKeys"); //state TraceInterval waitInterval(""); - wait( startMoveKeysLock->take( TaskDataDistributionLaunch ) ); + wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) ); state FlowLock::Releaser releaser( *startMoveKeysLock ); TraceEvent(SevDebug, interval.begin(), relocationIntervalId); @@ -255,7 +255,7 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve //Keep track of shards for all src servers so that we can preserve their values in 
serverKeys state Map> shardMap; - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); wait( checkMoveKeysLock(&tr, lock) ); @@ -394,11 +394,11 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve ACTOR Future waitForShardReady( StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode ) { loop { try { - std::pair rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskMoveKeys ) ); + std::pair rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskPriority::MoveKeys ) ); if (rep.first >= minVersion) { return Void(); } - wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskMoveKeys ) ); + wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys ) ); } catch (Error& e) { if( e.code() != error_code_timed_out ) { @@ -419,7 +419,7 @@ ACTOR Future checkFetchingState( Database cx, vector dest, KeyRange k try { if (BUGGIFY) wait(delay(5)); - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); vector< Future< Optional > > serverListEntries; @@ -439,7 +439,7 @@ ACTOR Future checkFetchingState( Database cx, vector dest, KeyRange k } wait( timeoutError( waitForAll( requests ), - SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskMoveKeys ) ); + SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys ) ); dataMovementComplete.send(Void()); return Void(); @@ -480,11 +480,11 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest //printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { try { - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); releaser.release(); - wait( finishMoveKeysParallelismLock->take( 
TaskDataDistributionLaunch ) ); + wait( finishMoveKeysParallelismLock->take( TaskPriority::DataDistributionLaunch ) ); releaser = FlowLock::Releaser( *finishMoveKeysParallelismLock ); wait( checkMoveKeysLock(&tr, lock) ); @@ -632,7 +632,7 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest for(int s=0; sSERVER_READY_QUORUM_TIMEOUT, Void(), TaskMoveKeys ) ); + wait( timeout( waitForAll( serverReady ), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys ) ); int count = dest.size() - newDestinations.size(); for(int s=0; s removeStorageServer( Database cx, UID serverID, MoveKeysLock if (!canRemove) { TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to reverse its mistake. TraceEvent(SevWarn,"NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID); - wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskDataDistributionLaunch) ); + wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch) ); tr.reset(); TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove); } else { diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index fd2be1f08f..bd8db636a1 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -333,7 +333,7 @@ namespace oldTLog_4_6 { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference tlogData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference tlogData, TaskPriority taskID ) { while(!self->version_messages.empty() && self->version_messages.front().first < before) { Version version = self->version_messages.front().first; std::pair &sizes = tlogData->version_sizes[version]; @@ -359,7 +359,7 @@ 
namespace oldTLog_4_6 { return Void(); } - Future eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference tlogData, int taskID) { + Future eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference tlogData, TaskPriority taskID) { return eraseMessagesBefore(this, before, gBytesErased, tlogData, taskID); } }; @@ -526,21 +526,21 @@ namespace oldTLog_4_6 { self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tag->key, currentVersion ), wr.toValue() ) ); - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tag->value.version_messages.begin(), tag->value.version_messages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); } } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } self->persistentData->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistCurrentVersionKeys.begin), BinaryWriter::toValue(newPersistentDataVersion, Unversioned()) ) ); logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
@@ -548,20 +548,20 @@ namespace oldTLog_4_6 { logData->persistentDataDurableVersion = newPersistentDataVersion; for(tag = logData->tag_data.begin(); tag != logData->tag_data.end(); ++tag) { - wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -586,7 +586,7 @@ namespace oldTLog_4_6 { } if(!self->queueOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -621,14 +621,14 @@ namespace oldTLog_4_6 { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { self->updatePersist = updatePersistentData(self, logData, nextVersion); wait( self->updatePersist ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -639,9 +639,9 @@ namespace oldTLog_4_6 { if(logData->persistentDataDurableVersion == logData->version.get()) { self->queueOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -650,7 +650,7 @@ namespace oldTLog_4_6 { while( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT && sizeItr != logData->version_sizes.end() && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD || sizeItr->value.first == 0) ) { - wait( yield(TaskUpdateStorage) ); + wait( yield(TaskPriority::UpdateStorage) ); ++sizeItr; nextVersion = sizeItr == logData->version_sizes.end() ? logData->version.get() : sizeItr->key; @@ -662,7 +662,7 @@ namespace oldTLog_4_6 { totalSize += it->second.expectedSize(); } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } prevVersion = nextVersion; @@ -673,7 +673,7 @@ namespace oldTLog_4_6 { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { self->updatePersist = updatePersistentData(self, logData, nextVersion); @@ -681,21 +681,21 @@ namespace oldTLog_4_6 { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. 
- wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -823,7 +823,7 @@ namespace oldTLog_4_6 { ti->value.popped_recently = true; //if (to.epoch == self->epoch()) if ( req.to > logData->persistentDataDurableVersion ) - wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskTLogPop )); + wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskPriority::TLogPop )); } req.reply.send(Void()); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index fc9251ec78..c9837d6814 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -297,7 +297,7 @@ struct TLogData : NonCopyable { concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { - cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); } }; @@ -323,7 +323,7 @@ struct LogData : NonCopyable, public ReferenceCounted { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, TaskPriority taskID ) 
{ while(!self->versionMessages.empty() && self->versionMessages.front().first < before) { Version version = self->versionMessages.front().first; std::pair &sizes = logData->version_sizes[version]; @@ -352,7 +352,7 @@ struct LogData : NonCopyable, public ReferenceCounted { return Void(); } - Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, int taskID) { + Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, logData, taskID); } }; @@ -607,14 +607,14 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tagData->tag, currentVersion ), wr.toValue() ) ); - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); } } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } } } @@ -624,7 +624,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
@@ -634,22 +634,22 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { if(logData->tag_data[tagLocality][tagId]) { - wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } } } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -674,7 +674,7 @@ ACTOR Future updateStorage( TLogData* self ) { } if(!self->queueOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -698,7 +698,7 @@ ACTOR Future updateStorage( TLogData* self ) { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { @@ -707,7 +707,7 @@ ACTOR Future updateStorage( TLogData* self ) { wait( updatePersistentData(self, logData, nextVersion) ); commitLockReleaser.release(); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -718,9 +718,9 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->persistentDataDurableVersion == logData->version.get()) { self->queueOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -741,7 +741,7 @@ ACTOR Future updateStorage( TLogData* self ) { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { wait( self->persistentDataCommitLock.take() ); @@ -751,21 +751,21 @@ ACTOR Future updateStorage( TLogData* self ) { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -943,7 +943,7 @@ ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere } if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop)); + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); } return Void(); @@ -1059,7 +1059,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if( req.tag.locality == tagLocalityLogRouter ) { wait( self->concurrentLogRouterReads.take() ); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait( delay(0.0, TaskLowPriority) ); + wait( delay(0.0, TaskPriority::Low) ); } if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { @@ -1068,7 +1068,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere // slightly faster over keeping the rest of the cluster operating normally. // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests // that impact recovery duration. 
- wait(delay(0, TaskTLogSpilledPeekReply)); + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } Version poppedVer = poppedVersion(logData, req.tag); @@ -1173,7 +1173,7 @@ ACTOR Future watchDegraded(TLogData* self) { //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask state int loopCount = 0; while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { - wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low)); loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); @@ -1509,7 +1509,7 @@ ACTOR Future tLogCommit( .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } // while exec op is being committed, no new transactions will be admitted. @@ -1849,7 +1849,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? 
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { @@ -1872,7 +1872,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } state Version ver = 0; @@ -1912,7 +1912,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } lastVer = ver; ver = r->version().version; @@ -1949,7 +1949,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } break; } diff --git a/fdbserver/Orderer.actor.h b/fdbserver/Orderer.actor.h index cd9d3d5a19..71f970ce45 100644 --- a/fdbserver/Orderer.actor.h +++ b/fdbserver/Orderer.actor.h @@ -38,7 +38,7 @@ public: ready = NotifiedVersion(s); started = false; } - Future order( Seq s, int taskID = TaskDefaultYield ) { + Future order( Seq s, TaskPriority taskID = TaskPriority::DefaultYield ) { if ( ready.get() < s ) return waitAndOrder( this, s, taskID ); else @@ -54,7 +54,7 @@ public: return ready.whenAtLeast(v); } private: - ACTOR static Future waitAndOrder( Orderer* self, Seq s, int taskID ) { + ACTOR static Future waitAndOrder( Orderer* self, Seq s, TaskPriority taskID ) { wait( self->ready.whenAtLeast(s) ); wait( yield( taskID ) || self->shutdown.getFuture() ); return self->dedup(s); diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 9813592fc9..6de6f31f82 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ 
b/fdbserver/Ratekeeper.actor.cpp @@ -300,7 +300,7 @@ ACTOR Future trackEachStorageServer( ACTOR Future monitorServerListChange( Reference> dbInfo, PromiseStream< std::pair> > serverChanges) { - state Database db = openDBOnServer(dbInfo, TaskRatekeeper, true, true); + state Database db = openDBOnServer(dbInfo, TaskPriority::Ratekeeper, true, true); state std::map oldServers; state Transaction tr(db); @@ -629,7 +629,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { } ACTOR Future configurationMonitor(Reference> dbInfo, DatabaseConfiguration* conf) { - state Database cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index db49433692..41834bb163 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -114,9 +114,9 @@ ACTOR Future resolveBatch( } } - if (check_yield(TaskDefaultEndpoint)) { - wait( delay( 0, TaskLowPriority ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? - g_network->setCurrentTask(TaskDefaultEndpoint); + if (check_yield(TaskPriority::DefaultEndpoint)) { + wait( delay( 0, TaskPriority::Low ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? + g_network->setCurrentTask(TaskPriority::DefaultEndpoint); } if (self->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!) 
diff --git a/fdbserver/ResolverInterface.h b/fdbserver/ResolverInterface.h index 2bb808d84b..65b46a5941 100644 --- a/fdbserver/ResolverInterface.h +++ b/fdbserver/ResolverInterface.h @@ -44,8 +44,8 @@ struct ResolverInterface { bool operator != ( ResolverInterface const& r ) const { return id() != r.id(); } NetworkAddress address() const { return resolve.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - metrics.getEndpoint( TaskResolutionMetrics ); - split.getEndpoint( TaskResolutionMetrics ); + metrics.getEndpoint( TaskPriority::ResolutionMetrics ); + split.getEndpoint( TaskPriority::ResolutionMetrics ); } template diff --git a/fdbserver/RestoreInterface.h b/fdbserver/RestoreInterface.h index 847e940d9b..670946d9ab 100644 --- a/fdbserver/RestoreInterface.h +++ b/fdbserver/RestoreInterface.h @@ -37,7 +37,7 @@ struct RestoreInterface { NetworkAddress address() const { return test.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - test.getEndpoint( TaskClusterController ); + test.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 81330eac10..95e14136f9 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -215,7 +215,7 @@ ACTOR Future simulatedFDBDRebooter(Referencec_str(), coordFolder->c_str()); wait(g_simulator.onProcess(process, - TaskDefaultYield)); // Now switch execution to the process on which we will run + TaskPriority::DefaultYield)); // Now switch execution to the process on which we will run state Future onShutdown = process->onShutdown(); try { @@ -1399,7 +1399,7 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot Standalone(deterministicRandom()->randomUniqueID().toString()), Optional>()), ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource), "", ""), - TaskDefaultYield)); + TaskPriority::DefaultYield)); Sim2FileSystem::newFileSystem(); 
FlowTransport::createInstance(true, 1); if (tlsOptions->enabled()) { diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..e4c31fdf3d 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1809,7 +1809,7 @@ ACTOR Future layerStatusFetcher(Database cx, JsonBuilderArray ACTOR Future lockedStatusFetcher(Reference> db, JsonBuilderArray *messages, std::set *incomplete_reasons) { state JsonBuilderObject statusObj; - state Database cx = openDBOnServer(db, TaskDefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware + state Database cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware state Transaction tr(cx); state int timeoutSeconds = 5; state Future getTimeout = delay(timeoutSeconds); diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index aa0c8a622e..641843f7fe 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -56,11 +56,11 @@ struct TLogInterface { bool operator == ( TLogInterface const& r ) const { return id() == r.id(); } NetworkAddress address() const { return peekMessages.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - getQueuingMetrics.getEndpoint( TaskTLogQueuingMetrics ); - popMessages.getEndpoint( TaskTLogPop ); - peekMessages.getEndpoint( TaskTLogPeek ); - confirmRunning.getEndpoint( TaskTLogConfirmRunning ); - commit.getEndpoint( TaskTLogCommit ); + getQueuingMetrics.getEndpoint( TaskPriority::TLogQueuingMetrics ); + popMessages.getEndpoint( TaskPriority::TLogPop ); + peekMessages.getEndpoint( TaskPriority::TLogPeek ); + confirmRunning.getEndpoint( TaskPriority::TLogConfirmRunning ); + commit.getEndpoint( TaskPriority::TLogCommit ); } template diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 52d0079ab7..98dab7d489 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -349,7 +349,7 @@ 
struct TLogData : NonCopyable { concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { - cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); } }; @@ -379,7 +379,7 @@ struct LogData : NonCopyable, public ReferenceCounted { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, TaskPriority taskID ) { while(!self->versionMessages.empty() && self->versionMessages.front().first < before) { Version version = self->versionMessages.front().first; std::pair &sizes = logData->version_sizes[version]; @@ -408,7 +408,7 @@ struct LogData : NonCopyable, public ReferenceCounted { return Void(); } - Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, int taskID) { + Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, logData, taskID); } }; @@ -766,7 +766,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { state Reference tagData = logData->tag_data[tagLocality][tagId]; if(tagData) { - wait(tagData->eraseMessagesBefore( tagData->popped, self, logData, TaskUpdateStorage )); + wait(tagData->eraseMessagesBefore( tagData->popped, self, logData, TaskPriority::UpdateStorage )); state Version currentVersion = 0; // Clear recently popped versions from persistentData if necessary updatePersistentPopped( self, logData, tagData ); @@ -819,7 +819,7 @@ ACTOR Future updatePersistentData( TLogData* self, 
Reference logD wr << uint32_t(0); } - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); @@ -832,7 +832,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD tagData->poppedLocation = std::min(tagData->poppedLocation, firstLocation); } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } } } @@ -847,7 +847,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
@@ -857,22 +857,22 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { if(logData->tag_data[tagLocality][tagId]) { - wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } } } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -915,7 +915,7 @@ ACTOR Future updateStorage( TLogData* self ) { } if(!self->spillOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -940,7 +940,7 @@ ACTOR Future updateStorage( TLogData* self ) { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { @@ -953,7 +953,7 @@ ACTOR Future updateStorage( TLogData* self ) { } commitLockReleaser.release(); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -964,9 +964,9 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->persistentDataDurableVersion == logData->version.get()) { self->spillOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -988,7 +988,7 @@ ACTOR Future updateStorage( TLogData* self ) { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { wait( self->persistentDataCommitLock.take() ); @@ -1001,21 +1001,21 @@ ACTOR Future updateStorage( TLogData* self ) { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -1194,7 +1194,7 @@ ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere } if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop)); + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); } return Void(); @@ -1346,7 +1346,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if( req.tag.locality == tagLocalityLogRouter ) { wait( self->concurrentLogRouterReads.take() ); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait( delay(0.0, TaskLowPriority) ); + wait( delay(0.0, TaskPriority::Low) ); } if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { @@ -1355,7 +1355,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere // slightly faster over keeping the rest of the cluster operating normally. // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests // that impact recovery duration. 
- wait(delay(0, TaskTLogSpilledPeekReply)); + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } Version poppedVer = poppedVersion(logData, req.tag); @@ -1456,7 +1456,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if (earlyEnd) break; } earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK+1); - wait( self->peekMemoryLimiter.take(TaskTLogSpilledPeekReply, commitBytes) ); + wait( self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes) ); state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); state std::vector>> messageReads; messageReads.reserve( commitLocations.size() ); @@ -1540,7 +1540,7 @@ ACTOR Future watchDegraded(TLogData* self) { //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask state int loopCount = 0; while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { - wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low)); loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); @@ -1876,7 +1876,7 @@ ACTOR Future tLogCommit( .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } // while exec op is being committed, no new transactions will be admitted. @@ -2223,7 +2223,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? 
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { @@ -2246,7 +2246,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } state Version ver = 0; @@ -2286,7 +2286,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } lastVer = ver; ver = r->version().version; @@ -2323,7 +2323,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } break; } diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..d9ba5637b2 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -431,7 +431,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> tLogCommitResults; for(int loc=0; loc< it->logServers.size(); loc++) { Standalone msg = data.getMessages(location); - allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskTLogCommitReply ) ); + allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskPriority::TLogCommitReply ) ); Future 
commitSuccess = success(allReplies.back()); addActor.get().send(commitSuccess); tLogCommitResults.push_back(commitSuccess); @@ -961,7 +961,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedget().present() ) { alive.push_back( brokenPromiseToNever( t->get().interf().confirmRunning.getReply( TLogConfirmRunningRequest(debugID), - TaskTLogConfirmRunningReply ) ) ); + TaskPriority::TLogConfirmRunningReply ) ) ); numPresent++; } else { alive.push_back( Never() ); diff --git a/fdbserver/VFSAsync.cpp b/fdbserver/VFSAsync.cpp index 95e6b958a4..3d53aaccfb 100644 --- a/fdbserver/VFSAsync.cpp +++ b/fdbserver/VFSAsync.cpp @@ -713,7 +713,7 @@ static int asyncSleep(sqlite3_vfs *pVfs, int microseconds){ waitFor( delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY) ); return 0; } - waitFor( g_network->delay( microseconds*1e-6, TaskDefaultDelay ) || simCancel ); + waitFor( g_network->delay( microseconds*1e-6, TaskPriority::DefaultDelay ) || simCancel ); return microseconds; } catch( Error &e ) { TraceEvent(SevError, "AsyncSleepError").error(e,true); diff --git a/fdbserver/WaitFailure.actor.cpp b/fdbserver/WaitFailure.actor.cpp index 778128f830..6ab6efeb74 100644 --- a/fdbserver/WaitFailure.actor.cpp +++ b/fdbserver/WaitFailure.actor.cpp @@ -37,7 +37,7 @@ ACTOR Future waitFailureServer(FutureStream> waitFailur } } -ACTOR Future waitFailureClient(RequestStream> waitFailure, double reactionTime, double reactionSlope, int taskID){ +ACTOR Future waitFailureClient(RequestStream> waitFailure, double reactionTime, double reactionSlope, TaskPriority taskID){ loop { try { state double start = now(); @@ -55,7 +55,7 @@ ACTOR Future waitFailureClient(RequestStream> waitFailu } } -ACTOR Future waitFailureClientStrict(RequestStream> waitFailure, double failureReactionTime, int taskID){ +ACTOR Future waitFailureClientStrict(RequestStream> waitFailure, double failureReactionTime, TaskPriority taskID){ loop { wait(waitFailureClient(waitFailure, 0, 0, taskID)); wait(delay(failureReactionTime, 
taskID) || IFailureMonitor::failureMonitor().onStateEqual( waitFailure.getEndpoint(), FailureStatus(false))); @@ -65,7 +65,7 @@ ACTOR Future waitFailureClientStrict(RequestStream> wai } } -ACTOR Future waitFailureTracker(RequestStream> waitFailure, Reference> failed, double reactionTime, double reactionSlope, int taskID){ +ACTOR Future waitFailureTracker(RequestStream> waitFailure, Reference> failed, double reactionTime, double reactionSlope, TaskPriority taskID){ loop { try { failed->set( IFailureMonitor::failureMonitor().getState(waitFailure.getEndpoint()).isFailed() ); diff --git a/fdbserver/WaitFailure.h b/fdbserver/WaitFailure.h index 9ef3b4c3a0..413dc9a56a 100644 --- a/fdbserver/WaitFailure.h +++ b/fdbserver/WaitFailure.h @@ -26,13 +26,13 @@ Future waitFailureServer(const FutureStream>& waitFailu // talks to a wait failure server, returns Void on failure Future waitFailureClient(const RequestStream>& waitFailure, - double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); + double const& failureReactionTime=0, double const& failureReactionSlope=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); // talks to a wait failure server, returns Void on failure, reaction time is always waited -Future waitFailureClientStrict(const RequestStream>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint); +Future waitFailureClientStrict(const RequestStream>& waitFailure, double const& failureReactionTime=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); // talks to a wait failure server, updates failed to be true or false based on failure status. 
Future waitFailureTracker(const RequestStream>& waitFailure, Reference> const& failed, - double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); + double const& failureReactionTime=0, double const& failureReactionSlope=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); -#endif \ No newline at end of file +#endif diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 8370e7fdde..ffd373194c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -392,7 +392,7 @@ void endRole(const Role &role, UID id, std::string reason, bool ok = true, Error struct ServerDBInfo; -class Database openDBOnServer( Reference> const& db, int taskID = TaskDefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); +class Database openDBOnServer( Reference> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); ACTOR Future extractClusterInterface(Reference>> a, Reference>> b); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 4d7f58796e..1d785b5d22 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -493,7 +493,7 @@ Future startSystemMonitor(std::string dataFolder, OptionalgetLocalAddress().ip)); systemMonitor(); - return recurring( &systemMonitor, 5.0, TaskFlushTrace ); + return recurring( &systemMonitor, 5.0, TaskPriority::FlushTrace ); } void testIndexedSet(); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 205b1dbc19..84365d455f 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -464,7 +464,7 @@ Future sendMasterRegistration( MasterData* self, LogSystemConfig const& lo } ACTOR Future updateRegistration( Reference self, Reference logSystem ) { - state Database cx = openDBOnServer(self->dbInfo, TaskDefaultEndpoint, true, 
true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, true, true); state Future trigger = self->registrationTrigger.onTrigger(); state Future updateLogsKey; @@ -1017,12 +1017,12 @@ ACTOR Future resolutionBalancing(Reference self) { state CoalescedKeyRangeMap key_resolver; key_resolver.insert(allKeys, 0); loop { - wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskResolutionMetrics)); + wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics)); while(self->resolverChanges.get().size()) wait(self->resolverChanges.onChange()); state std::vector> futures; for (auto& p : self->resolvers) - futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskResolutionMetrics))); + futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskPriority::ResolutionMetrics))); wait( waitForAll(futures) ); state IndexedSet, NoMetric> metrics; @@ -1047,7 +1047,7 @@ ACTOR Future resolutionBalancing(Reference self) { req.offset = amount; req.range = range.first; - ResolutionSplitReply split = wait( brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply(req, TaskResolutionMetrics)) ); + ResolutionSplitReply split = wait( brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply(req, TaskPriority::ResolutionMetrics)) ); KeyRangeRef moveRange = range.second ? 
KeyRangeRef( range.first.begin, split.key ) : KeyRangeRef( split.key, range.first.end ); movedRanges.push_back_deep(movedRanges.arena(), ResolverMoveRef(moveRange, dest)); TraceEvent("MovingResolutionRange").detail("Src", src).detail("Dest", dest).detail("Amount", amount).detail("StartRange", range.first).detail("MoveRange", moveRange).detail("Used", split.used).detail("KeyResolverRanges", key_resolver.size()); @@ -1185,7 +1185,7 @@ ACTOR Future trackTlogRecovery( Reference self, Reference configurationMonitor( Reference self ) { - state Database cx = openDBOnServer(self->dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); diff --git a/fdbserver/networktest.actor.cpp b/fdbserver/networktest.actor.cpp index 61bf80ed55..795dd769c5 100644 --- a/fdbserver/networktest.actor.cpp +++ b/fdbserver/networktest.actor.cpp @@ -30,7 +30,7 @@ NetworkTestInterface::NetworkTestInterface( NetworkAddress remote ) NetworkTestInterface::NetworkTestInterface( INetwork* local ) { - test.makeWellKnownEndpoint( WLTOKEN_NETWORKTEST, TaskDefaultEndpoint ); + test.makeWellKnownEndpoint( WLTOKEN_NETWORKTEST, TaskPriority::DefaultEndpoint ); } ACTOR Future networkTestServer() { diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 28d47fa9cb..090f3f5df5 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -550,7 +550,7 @@ public: newestDirtyVersion.insert(allKeys, invalidVersion); addShard( ShardInfo::newNotAssigned( allKeys ) ); - cx = openDBOnServer(db, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true); } //~StorageServer() { fclose(log); } @@ -828,7 +828,7 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to 
downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); @@ -1345,7 +1345,7 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { if( req.debugID.present() ) @@ -1458,7 +1458,7 @@ ACTOR Future getKey( StorageServer* data, GetKeyRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { state Version version = wait( waitForVersion( data, req.version ) ); @@ -2003,7 +2003,7 @@ ACTOR Future fetchKeys( StorageServer *data, AddingShard* shard ) { TraceEvent(SevDebug, "FetchKeysVersionSatisfied", data->thisServerID).detail("FKID", interval.pairID); - wait( data->fetchKeysParallelismLock.take( TaskDefaultYield, fetchBlockBytes ) ); + wait( data->fetchKeysParallelismLock.take( TaskPriority::DefaultYield, fetchBlockBytes ) ); state FlowLock::Releaser holdingFKPL( data->fetchKeysParallelismLock, fetchBlockBytes ); state double executeStart = now(); @@ -2590,7 +2590,7 @@ ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) } data->behind = true; - wait( delayJittered(.005, TaskTLogPeekReply) ); + wait( delayJittered(.005, TaskPriority::TLogPeekReply) ); } while( data->byteSampleClearsTooLarge.get() ) { @@ -2617,7 +2617,7 @@ ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) *pReceivedUpdate = true; start = now(); - wait( data->durableVersionLock.take(TaskTLogPeekReply,1) ); + wait( 
data->durableVersionLock.take(TaskPriority::TLogPeekReply,1) ); state FlowLock::Releaser holdingDVL( data->durableVersionLock ); if(now() - start > 0.1) TraceEvent("SSSlowTakeLock1", data->thisServerID).detailf("From", "%016llx", debug_lastLoadBalanceResultEndpointToken).detail("Duration", now() - start).detail("Version", data->version.get()); @@ -2865,11 +2865,11 @@ ACTOR Future updateStorage(StorageServer* data) { if (g_network->isSimulated()) { double endTime = g_simulator.checkDisabled(format("%s/updateStorage", data->thisServerID.toString().c_str())); if(endTime > now()) { - wait(delay(endTime - now(), TaskUpdateStorage)); + wait(delay(endTime - now(), TaskPriority::UpdateStorage)); } } wait( data->desiredOldestVersion.whenAtLeast( data->storageVersion()+1 ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); state Promise durableInProgress; data->durableInProgress = durableInProgress.getFuture(); @@ -2882,10 +2882,10 @@ ACTOR Future updateStorage(StorageServer* data) { state bool done = data->storage.makeVersionMutationsDurable(newOldestVersion, desiredVersion, bytesLeft); // We want to forget things from these data structures atomically with changing oldestVersion (and "before", since oldestVersion.set() may trigger waiting actors) // forgetVersionsBeforeAsync visibly forgets immediately (without waiting) but asynchronously frees memory. 
- Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskUpdateStorage ); + Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskPriority::UpdateStorage ); data->oldestVersion.set( newOldestVersion ); wait( finishedForgetting ); - wait( yield(TaskUpdateStorage) ); + wait( yield(TaskPriority::UpdateStorage) ); if (done) break; } @@ -2916,7 +2916,7 @@ ACTOR Future updateStorage(StorageServer* data) { } durableInProgress.send(Void()); - wait( delay(0, TaskUpdateStorage) ); //Setting durableInProgess could cause the storage server to shut down, so delay to check for cancellation + wait( delay(0, TaskPriority::UpdateStorage) ); //Setting durableInProgess could cause the storage server to shut down, so delay to check for cancellation // Taking and releasing the durableVersionLock ensures that no eager reads both begin before the commit was effective and // are applied after we change the durable version. Also ensure that we have to lock while calling changeDurableVersion, @@ -2925,9 +2925,9 @@ ACTOR Future updateStorage(StorageServer* data) { data->popVersion( data->durableVersion.get() + 1 ); while (!changeDurableVersion( data, newOldestVersion )) { - if(g_network->check_yield(TaskUpdateStorage)) { + if(g_network->check_yield(TaskPriority::UpdateStorage)) { data->durableVersionLock.release(); - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); wait( data->durableVersionLock.take() ); } } @@ -3537,7 +3537,7 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac } } when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work if( req.debugID.present() ) 
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.recieved"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3552,11 +3552,11 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac actors.add(self->readGuard(req, watchValueQ)); } when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work actors.add(self->readGuard(req , getKey)); } when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work actors.add(self->readGuard(req , getKeyValues)); } when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 7205da04c4..b8fc706128 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -75,7 +75,7 @@ ACTOR static Future extractClientInfo( Reference> d } } -Database openDBOnServer( Reference> const& db, int taskID, bool enableLocalityLoadBalance, bool lockAware ) { +Database openDBOnServer( Reference> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) { Reference> info( new AsyncVar ); return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? 
db->get().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware ); } @@ -737,7 +737,7 @@ ACTOR Future workerServer( } } else { bool lockAware = metricsPrefix.size() && metricsPrefix[0] == '\xff'; - metricsLogger = runMetrics( openDBOnServer( dbInfo, TaskDefaultEndpoint, true, lockAware ), KeyRef(metricsPrefix) ); + metricsLogger = runMetrics( openDBOnServer( dbInfo, TaskPriority::DefaultEndpoint, true, lockAware ), KeyRef(metricsPrefix) ); } } @@ -1169,7 +1169,7 @@ ACTOR Future workerServer( } when( wait( loggingTrigger ) ) { systemMonitor(); - loggingTrigger = delay( loggingDelay, TaskFlushTrace ); + loggingTrigger = delay( loggingDelay, TaskPriority::FlushTrace ); } when(state ExecuteRequest req = waitNext(interf.execReq.getFuture())) { state ExecCmdValueString execArg(req.execPayload); diff --git a/flow/IThreadPool.h b/flow/IThreadPool.h index 5da60d2930..c5be41f87a 100644 --- a/flow/IThreadPool.h +++ b/flow/IThreadPool.h @@ -92,12 +92,12 @@ public: void send( T const& t ) { // Can be called safely from another thread. Call send or sendError at most once. Promise signal; tagAndForward( &promise, t, signal.getFuture() ); - g_network->onMainThread( std::move(signal), g_network->getCurrentTask() | 1 ); + g_network->onMainThread( std::move(signal), incrementPriority( g_network->getCurrentTask() ) ); } void sendError( Error const& e ) { // Can be called safely from another thread. Call send or sendError at most once. 
Promise signal; tagAndForwardError( &promise, e, signal.getFuture() ); - g_network->onMainThread( std::move(signal), g_network->getCurrentTask() | 1 ); + g_network->onMainThread( std::move(signal), incrementPriority( g_network->getCurrentTask() ) ); } private: Promise promise; @@ -106,4 +106,4 @@ private: Reference createGenericThreadPool(); -#endif \ No newline at end of file +#endif diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 2dcf9783ed..0c3db011ed 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -100,9 +100,9 @@ public: struct OrderedTask { int64_t priority; - int taskID; + TaskPriority taskID; Task *task; - OrderedTask(int64_t priority, int taskID, Task* task) : priority(priority), taskID(taskID), task(task) {} + OrderedTask(int64_t priority, TaskPriority taskID, Task* task) : priority(priority), taskID(taskID), task(task) {} bool operator < (OrderedTask const& rhs) const { return priority < rhs.priority; } }; @@ -122,12 +122,12 @@ public: // INetwork interface virtual double now() { return currentTime; }; - virtual Future delay( double seconds, int taskId ); - virtual Future yield( int taskID ); - virtual bool check_yield(int taskId); - virtual int getCurrentTask() { return currentTaskID; } - virtual void setCurrentTask(int taskID ) { priorityMetric = currentTaskID = taskID; } - virtual void onMainThread( Promise&& signal, int taskID ); + virtual Future delay( double seconds, TaskPriority taskId ); + virtual Future yield( TaskPriority taskID ); + virtual bool check_yield(TaskPriority taskId); + virtual TaskPriority getCurrentTask() { return currentTaskID; } + virtual void setCurrentTask(TaskPriority taskID ) { currentTaskID = taskID; priorityMetric = (int64_t)taskID; } + virtual void onMainThread( Promise&& signal, TaskPriority taskID ); virtual void stop() { if ( thread_network == this ) stopImmediately(); @@ -157,7 +157,7 @@ public: int64_t tsc_begin, tsc_end; double taskBegin; - int currentTaskID; + TaskPriority currentTaskID; 
uint64_t tasksIssued; TDMetricCollection tdmetrics; double currentTime; @@ -167,7 +167,7 @@ public: uint64_t numYields; double lastPriorityTrackTime; - int lastMinTaskID; + TaskPriority lastMinTaskID; double priorityTimer[NetworkMetrics::PRIORITY_BINS]; std::priority_queue> ready; @@ -175,15 +175,15 @@ public: struct DelayedTask : OrderedTask { double at; - DelayedTask(double at, int64_t priority, int taskID, Task* task) : at(at), OrderedTask(priority, taskID, task) {} + DelayedTask(double at, int64_t priority, TaskPriority taskID, Task* task) : at(at), OrderedTask(priority, taskID, task) {} bool operator < (DelayedTask const& rhs) const { return at > rhs.at; } // Ordering is reversed for priority_queue }; std::priority_queue> timers; - void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, int64_t priority); - bool check_yield(int taskId, bool isRunLoop); + void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority); + bool check_yield(TaskPriority taskId, bool isRunLoop); void processThreadReady(); - void trackMinPriority( int minTaskID, double now ); + void trackMinPriority( TaskPriority minTaskID, double now ); void stopImmediately() { stopped=true; decltype(ready) _1; ready.swap(_1); decltype(timers) _2; timers.swap(_2); } @@ -489,8 +489,8 @@ Net2::Net2(bool useThreadPool, bool useMetrics, bool useObjectSerializer) stopped(false), tasksIssued(0), // Until run() is called, yield() will always yield - tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskDefaultYield), - lastMinTaskID(0), + tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield), + lastMinTaskID(TaskPriority::Zero), numYields(0) { TraceEvent("Net2Starting"); @@ -511,7 +511,7 @@ Net2::Net2(bool useThreadPool, bool useMetrics, bool useObjectSerializer) int priBins[] = { 1, 2050, 3050, 4050, 4950, 5050, 7050, 8050, 10050 }; static_assert( sizeof(priBins) == sizeof(int)*NetworkMetrics::PRIORITY_BINS, "Fix priority 
bins"); for(int i=0; i(priBins[i]); updateNow(); } @@ -579,7 +579,7 @@ void Net2::run() { tsc_begin = __rdtsc(); taskBegin = timer_monotonic(); runFunc(); - checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskRunCycleFunction); + checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskPriority::RunCycleFunction); } double sleepTime = 0; @@ -607,7 +607,7 @@ void Net2::run() { if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow); - if (sleepTime) trackMinPriority( 0, now ); + if (sleepTime) trackMinPriority( TaskPriority::Zero, now ); while (!timers.empty() && timers.top().at < now) { ++countTimers; ready.push( timers.top() ); @@ -620,12 +620,12 @@ void Net2::run() { tsc_end = tsc_begin + FLOW_KNOBS->TSC_YIELD_TIME; taskBegin = timer_monotonic(); numYields = 0; - int minTaskID = TaskMaxPriority; + TaskPriority minTaskID = TaskPriority::Max; while (!ready.empty()) { ++countTasks; currentTaskID = ready.top().taskID; - priorityMetric = currentTaskID; + priorityMetric = static_cast(currentTaskID); minTaskID = std::min(minTaskID, currentTaskID); Task* task = ready.top().task; ready.pop(); @@ -638,7 +638,7 @@ void Net2::run() { TraceEvent(SevError, "TaskError").error(unknown_error()); } - if (check_yield(TaskMaxPriority, true)) { ++countYields; break; } + if (check_yield(TaskPriority::Max, true)) { ++countYields; break; } } nnow = timer_monotonic(); @@ -697,10 +697,10 @@ void Net2::run() { #endif } -void Net2::trackMinPriority( int minTaskID, double now ) { +void Net2::trackMinPriority( TaskPriority minTaskID, double now ) { if (minTaskID != lastMinTaskID) for(int c=0; c= minTaskID && pri < lastMinTaskID) { // busy -> idle double busyFor = lastPriorityTrackTime - priorityTimer[c]; networkMetrics.secSquaredPriorityBlocked[c] += busyFor*busyFor; @@ -723,7 +723,7 @@ void 
Net2::processThreadReady() { } } -void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, int64_t priority) { +void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority) { int64_t elapsed = tscEnd-tscBegin; if (elapsed > FLOW_KNOBS->TSC_YIELD_TIME && tscBegin > 0) { int i = std::min(NetworkMetrics::SLOW_EVENT_BINS-1, log( elapsed/1e6 ) / log(2.)); @@ -734,7 +734,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, i slowTaskMetric->clocks = elapsed; slowTaskMetric->duration = (int64_t)(duration*1e9); - slowTaskMetric->priority = priority; + slowTaskMetric->priority = static_cast(priority); slowTaskMetric->numYields = numYields; slowTaskMetric->log(); @@ -748,7 +748,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, i } } -bool Net2::check_yield( int taskID, bool isRunLoop ) { +bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) { if(!isRunLoop && numYields > 0) { ++numYields; return true; @@ -761,8 +761,8 @@ bool Net2::check_yield( int taskID, bool isRunLoop ) { processThreadReady(); - if (taskID == TaskDefaultYield) taskID = currentTaskID; - if (!ready.empty() && ready.top().priority > (int64_t(taskID)<<32)) { + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; + if (!ready.empty() && ready.top().priority > int64_t(taskID)<<32) { return true; } @@ -787,13 +787,13 @@ bool Net2::check_yield( int taskID, bool isRunLoop ) { return false; } -bool Net2::check_yield( int taskID ) { +bool Net2::check_yield( TaskPriority taskID ) { return check_yield(taskID, false); } -Future Net2::yield( int taskID ) { +Future Net2::yield( TaskPriority taskID ) { ++countYieldCalls; - if (taskID == TaskDefaultYield) taskID = currentTaskID; + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID, false)) { ++countYieldCallsTrue; return delay(0, taskID); @@ -802,7 +802,7 @@ Future Net2::yield( int taskID 
) { return Void(); } -Future Net2::delay( double seconds, int taskId ) { +Future Net2::delay( double seconds, TaskPriority taskId ) { if (seconds <= 0.) { PromiseTask* t = new PromiseTask; this->ready.push( OrderedTask( (int64_t(taskId)<<32)-(++tasksIssued), taskId, t) ); @@ -817,7 +817,7 @@ Future Net2::delay( double seconds, int taskId ) { return t->promise.getFuture(); } -void Net2::onMainThread(Promise&& signal, int taskID) { +void Net2::onMainThread(Promise&& signal, TaskPriority taskID) { if (stopped) return; PromiseTask* p = new PromiseTask( std::move(signal) ); int64_t priority = int64_t(taskID)<<32; diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ef63f13c17..87befe9bb7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -248,7 +248,7 @@ struct Profiler { outOffset += self->environmentInfoWriter.getLength(); loop { - wait( self->network->delay(1.0, TaskMinPriority) || self->network->delay(2.0, TaskMaxPriority) ); + wait( self->network->delay(1.0, TaskPriority::Min) || self->network->delay(2.0, TaskPriority::Max) ); self->enableSignal(false); std::swap( self->output_buffer, otherBuffer ); diff --git a/flow/ThreadHelper.actor.h b/flow/ThreadHelper.actor.h index 4fdd3c26ff..ed6a9cdc7d 100644 --- a/flow/ThreadHelper.actor.h +++ b/flow/ThreadHelper.actor.h @@ -35,11 +35,11 @@ // void onMainThreadVoid( F f ) { // Promise signal; // doOnMainThreadVoid( signal.getFuture(), f ); -// g_network->onMainThread( std::move(signal), TaskDefaultOnMainThread ); +// g_network->onMainThread( std::move(signal), TaskPriority::DefaultOnMainThread ); // } template -void onMainThreadVoid( F f, Error* err, int taskID = TaskDefaultOnMainThread ) { +void onMainThreadVoid( F f, Error* err, TaskPriority taskID = TaskPriority::DefaultOnMainThread ) { Promise signal; doOnMainThreadVoid( signal.getFuture(), f, err ); g_network->onMainThread( std::move(signal), taskID ); @@ -585,7 +585,7 @@ template ThreadFuture< decltype(fake()().getValue()) > 
onMainThread returnValue->addref(); // For the ThreadFuture we return Future cancelFuture = doOnMainThread()().getValue()), F>( signal.getFuture(), f, returnValue ); returnValue->setCancel( std::move(cancelFuture) ); - g_network->onMainThread( std::move(signal), TaskDefaultOnMainThread ); + g_network->onMainThread( std::move(signal), TaskPriority::DefaultOnMainThread ); return ThreadFuture()().getValue())>( returnValue ); } diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 45fcce8d2e..4e70a5d29b 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -630,7 +630,7 @@ void openTraceFile(const NetworkAddress& na, uint64_t rollsize, uint64_t maxLogs std::string baseName = format("%s.%s.%d", baseOfBase.c_str(), ip.c_str(), na.port); g_traceLog.open( directory, baseName, logGroup, format("%lld", time(NULL)), rollsize, maxLogsSize, !g_network->isSimulated() ? na : Optional()); - uncancellable(recurring(&flushTraceFile, FLOW_KNOBS->TRACE_FLUSH_INTERVAL, TaskFlushTrace)); + uncancellable(recurring(&flushTraceFile, FLOW_KNOBS->TRACE_FLUSH_INTERVAL, TaskPriority::FlushTrace)); g_traceBatch.dump(); } diff --git a/flow/flow.h b/flow/flow.h index 7ce23eade7..53f35516eb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -817,7 +817,7 @@ public: return getReplyPromise(value).getFuture(); } template - Future getReply(const X& value, int taskID) const { + Future getReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); return getReplyPromise(value).getFuture(); } @@ -827,7 +827,7 @@ public: return getReply(Promise()); } template - Future getReplyWithTaskID(int taskID) const { + Future getReplyWithTaskID(TaskPriority taskID) const { Promise reply; reply.getEndpoint(taskID); return getReply(reply); @@ -908,11 +908,11 @@ struct ActorSingleCallback : SingleCallback { } }; inline double now() { return g_network->now(); } -inline Future delay(double seconds, int taskID = TaskDefaultDelay) { return g_network->delay(seconds, taskID); } -inline Future delayUntil(double 
time, int taskID = TaskDefaultDelay) { return g_network->delay(std::max(0.0, time - g_network->now()), taskID); } -inline Future delayJittered(double seconds, int taskID = TaskDefaultDelay) { return g_network->delay(seconds*(FLOW_KNOBS->DELAY_JITTER_OFFSET + FLOW_KNOBS->DELAY_JITTER_RANGE*deterministicRandom()->random01()), taskID); } -inline Future yield(int taskID = TaskDefaultYield) { return g_network->yield(taskID); } -inline bool check_yield(int taskID = TaskDefaultYield) { return g_network->check_yield(taskID); } +inline Future delay(double seconds, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(seconds, taskID); } +inline Future delayUntil(double time, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(std::max(0.0, time - g_network->now()), taskID); } +inline Future delayJittered(double seconds, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(seconds*(FLOW_KNOBS->DELAY_JITTER_OFFSET + FLOW_KNOBS->DELAY_JITTER_RANGE*deterministicRandom()->random01()), taskID); } +inline Future yield(TaskPriority taskID = TaskPriority::DefaultYield) { return g_network->yield(taskID); } +inline bool check_yield(TaskPriority taskID = TaskPriority::DefaultYield) { return g_network->check_yield(taskID); } #include "flow/genericactors.actor.h" #endif diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7b577b2e4c..fdf02a30d2 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -183,7 +183,7 @@ Future waitForAllReady( std::vector> results ) { } ACTOR template -Future timeout( Future what, double time, T timedoutValue, int taskID = TaskDefaultDelay ) { +Future timeout( Future what, double time, T timedoutValue, TaskPriority taskID = TaskPriority::DefaultDelay ) { Future end = delay( time, taskID ); choose { when( T t = wait( what ) ) { return t; } @@ -201,7 +201,7 @@ Future> timeout( Future what, double time ) { } ACTOR template -Future timeoutError( Future 
what, double time, int taskID = TaskDefaultDelay ) { +Future timeoutError( Future what, double time, TaskPriority taskID = TaskPriority::DefaultDelay ) { Future end = delay( time, taskID ); choose { when( T t = wait( what ) ) { return t; } @@ -210,7 +210,7 @@ Future timeoutError( Future what, double time, int taskID = TaskDefaultDel } ACTOR template -Future delayed( Future what, double time = 0.0, int taskID = TaskDefaultDelay ) { +Future delayed( Future what, double time = 0.0, TaskPriority taskID = TaskPriority::DefaultDelay ) { try { state T t = wait( what ); wait( delay( time, taskID ) ); @@ -223,7 +223,7 @@ Future delayed( Future what, double time = 0.0, int taskID = TaskDefaultDe } ACTOR template -Future recurring( Func what, double interval, int taskID = TaskDefaultDelay ) { +Future recurring( Func what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay ) { loop choose { when ( wait( delay( interval, taskID ) ) ) { what(); } } @@ -951,7 +951,7 @@ Future quorum(std::vector> const& results, int n) { } ACTOR template -Future smartQuorum( std::vector> results, int required, double extraSeconds, int taskID = TaskDefaultDelay ) { +Future smartQuorum( std::vector> results, int required, double extraSeconds, TaskPriority taskID = TaskPriority::DefaultDelay ) { if (results.empty() && required == 0) return Void(); wait(quorum(results, required)); choose { @@ -1259,7 +1259,7 @@ struct FlowLock : NonCopyable, public ReferenceCounted { FlowLock() : permits(1), active(0) {} explicit FlowLock(int64_t permits) : permits(permits), active(0) {} - Future take(int taskID = TaskDefaultYield, int64_t amount = 1) { + Future take(TaskPriority taskID = TaskPriority::DefaultYield, int64_t amount = 1) { if (active + amount <= permits || active == 0) { active += amount; return safeYieldActor(this, taskID, amount); @@ -1298,7 +1298,7 @@ private: int64_t active; Promise broken_on_destruct; - ACTOR static Future takeActor(FlowLock* lock, int taskID, int64_t amount) { + 
ACTOR static Future takeActor(FlowLock* lock, TaskPriority taskID, int64_t amount) { state std::list, int64_t>>::iterator it = lock->takers.insert(lock->takers.end(), std::make_pair(Promise(), amount)); try { @@ -1330,7 +1330,7 @@ private: return Void(); } - ACTOR static Future safeYieldActor(FlowLock* lock, int taskID, int64_t amount) { + ACTOR static Future safeYieldActor(FlowLock* lock, TaskPriority taskID, int64_t amount) { try { choose{ when(wait(yield(taskID))) {} @@ -1351,7 +1351,7 @@ private: }; ACTOR template -Future yieldPromiseStream( FutureStream input, PromiseStream output, int taskID = TaskDefaultYield ) { +Future yieldPromiseStream( FutureStream input, PromiseStream output, TaskPriority taskID = TaskPriority::DefaultYield ) { loop { T f = waitNext( input ); output.send( f ); diff --git a/flow/network.h b/flow/network.h index 256bc89b40..bb4841a97d 100644 --- a/flow/network.h +++ b/flow/network.h @@ -31,55 +31,64 @@ #include "flow/IRandom.h" #include "fdbrpc/crc32c.h" -enum { - TaskMaxPriority = 1000000, - TaskRunCycleFunction = 20000, - TaskFlushTrace = 10500, - TaskWriteSocket = 10000, - TaskPollEIO = 9900, - TaskDiskIOComplete = 9150, - TaskLoadBalancedEndpoint = 9000, - TaskReadSocket = 9000, - TaskCoordinationReply = 8810, - TaskCoordination = 8800, - TaskFailureMonitor = 8700, - TaskResolutionMetrics = 8700, - TaskClusterController = 8650, - TaskProxyStorageRejoin = 8645, - TaskProxyCommitDispatcher = 8640, - TaskTLogQueuingMetrics = 8620, - TaskTLogPop = 8610, - TaskTLogPeekReply = 8600, - TaskTLogPeek = 8590, - TaskTLogCommitReply = 8580, - TaskTLogCommit = 8570, - TaskProxyGetRawCommittedVersion = 8565, - TaskProxyResolverReply = 8560, - TaskProxyCommitBatcher = 8550, - TaskProxyCommit = 8540, - TaskTLogConfirmRunningReply = 8530, - TaskTLogConfirmRunning = 8520, - TaskProxyGRVTimer = 8510, - TaskProxyGetConsistentReadVersion = 8500, - TaskDefaultPromiseEndpoint = 8000, - TaskDefaultOnMainThread = 7500, - TaskDefaultDelay = 7010, - 
TaskDefaultYield = 7000, - TaskDiskRead = 5010, - TaskDefaultEndpoint = 5000, - TaskUnknownEndpoint = 4000, - TaskMoveKeys = 3550, - TaskDataDistributionLaunch = 3530, - TaskRatekeeper = 3510, - TaskDataDistribution = 3500, - TaskDiskWrite = 3010, - TaskUpdateStorage = 3000, - TaskTLogSpilledPeekReply = 2800, - TaskLowPriority = 2000, +enum class TaskPriority { + Max = 1000000, + RunCycleFunction = 20000, + FlushTrace = 10500, + WriteSocket = 10000, + PollEIO = 9900, + DiskIOComplete = 9150, + LoadBalancedEndpoint = 9000, + ReadSocket = 9000, + CoordinationReply = 8810, + Coordination = 8800, + FailureMonitor = 8700, + ResolutionMetrics = 8700, + ClusterController = 8650, + ProxyStorageRejoin = 8645, + ProxyCommitDispatcher = 8640, + TLogQueuingMetrics = 8620, + TLogPop = 8610, + TLogPeekReply = 8600, + TLogPeek = 8590, + TLogCommitReply = 8580, + TLogCommit = 8570, + ProxyGetRawCommittedVersion = 8565, + ProxyResolverReply = 8560, + ProxyCommitBatcher = 8550, + ProxyCommit = 8540, + TLogConfirmRunningReply = 8530, + TLogConfirmRunning = 8520, + ProxyGRVTimer = 8510, + ProxyGetConsistentReadVersion = 8500, + DefaultPromiseEndpoint = 8000, + DefaultOnMainThread = 7500, + DefaultDelay = 7010, + DefaultYield = 7000, + DiskRead = 5010, + DefaultEndpoint = 5000, + UnknownEndpoint = 4000, + MoveKeys = 3550, + DataDistributionLaunch = 3530, + Ratekeeper = 3510, + DataDistribution = 3500, + DiskWrite = 3010, + UpdateStorage = 3000, + TLogSpilledPeekReply = 2800, + Low = 2000, - TaskMinPriority = 1000 + Min = 1000, + Zero = 0 }; +inline TaskPriority incrementPriority(TaskPriority p) { + return static_cast( static_cast(p) + 1 ); +} + +inline TaskPriority decrementPriority(TaskPriority p) { + return static_cast( static_cast(p) + 1 ); +} + class Void; template class Optional; @@ -270,7 +279,7 @@ struct NetworkMetrics { uint64_t countSlowEvents[SLOW_EVENT_BINS]; enum { PRIORITY_BINS = 9 }; - int priorityBins[ PRIORITY_BINS ]; + TaskPriority priorityBins[ PRIORITY_BINS ]; double 
secSquaredPriorityBlocked[PRIORITY_BINS]; double oldestAlternativesFailure; @@ -372,19 +381,19 @@ public: // Provides a clock that advances at a similar rate on all connected endpoints // FIXME: Return a fixed point Time class - virtual Future delay( double seconds, int taskID ) = 0; + virtual Future delay( double seconds, TaskPriority taskID ) = 0; // The given future will be set after seconds have elapsed - virtual Future yield( int taskID ) = 0; + virtual Future yield( TaskPriority taskID ) = 0; // The given future will be set immediately or after higher-priority tasks have executed - virtual bool check_yield( int taskID ) = 0; + virtual bool check_yield( TaskPriority taskID ) = 0; // Returns true if a call to yield would result in a delay - virtual int getCurrentTask() = 0; + virtual TaskPriority getCurrentTask() = 0; // Gets the taskID/priority of the current task - virtual void setCurrentTask(int taskID ) = 0; + virtual void setCurrentTask(TaskPriority taskID ) = 0; // Sets the taskID/priority of the current task, without yielding virtual flowGlobalType global(int id) = 0; @@ -396,7 +405,7 @@ public: virtual bool isSimulated() const = 0; // Returns true if this network is a local simulation - virtual void onMainThread( Promise&& signal, int taskID ) = 0; + virtual void onMainThread( Promise&& signal, TaskPriority taskID ) = 0; // Executes signal.send(Void()) on a/the thread belonging to this network virtual THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void *), void *arg) = 0; From 8e28930d12b1a94a2c62b827b1ba43b5d31bb013 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 10:36:32 -0700 Subject: [PATCH 007/136] Fix another hardcoded priority. 
--- fdbclient/VersionedMap.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/VersionedMap.actor.h b/fdbclient/VersionedMap.actor.h index 953c2f4c1f..53ba85097f 100644 --- a/fdbclient/VersionedMap.actor.h +++ b/fdbclient/VersionedMap.actor.h @@ -31,7 +31,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. ACTOR template -Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = 7000 ) { +Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = TaskPriority::DefaultYield ) { state int freeCount = 0; while (!toFree.empty()) { Tree a = std::move( toFree.back() ); From d7c00f9cd29d50d099c5ebd25346a1cc8715e339 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 14:19:56 -0700 Subject: [PATCH 008/136] And another. --- fdbclient/VersionedMap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index 58c440c679..f56b883892 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -511,7 +511,7 @@ public: oldestVersion = newOldestVersion; } - Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = 7000 ) { + Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = TaskPriority::DefaultYield ) { ASSERT( newOldestVersion <= latestVersion ); roots[newOldestVersion] = getRoot(newOldestVersion); From b5af601a8a618b779635c12d3c5995757cc52787 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 21:41:43 -0700 Subject: [PATCH 009/136] Fix ExternalWorkload not being a part of the old build/test system. 
--- fdbserver/fdbserver.vcxproj | 1 + fdbserver/workloads/ExternalWorkload.actor.cpp | 2 +- tests/CMakeLists.txt | 2 +- tests/{fast => }/SimpleExternalTest.txt | 0 4 files changed, 3 insertions(+), 2 deletions(-) rename tests/{fast => }/SimpleExternalTest.txt (100%) diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 44e752cdb3..2dd4cb17db 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -157,6 +157,7 @@ + diff --git a/fdbserver/workloads/ExternalWorkload.actor.cpp b/fdbserver/workloads/ExternalWorkload.actor.cpp index c967bc1655..69715def42 100644 --- a/fdbserver/workloads/ExternalWorkload.actor.cpp +++ b/fdbserver/workloads/ExternalWorkload.actor.cpp @@ -21,7 +21,7 @@ #include "flow/ThreadHelper.actor.h" #include "flow/Platform.h" #include "fdbclient/ThreadSafeTransaction.h" -#include "foundationdb/ClientWorkload.h" +#include "bindings/c/foundationdb/ClientWorkload.h" #include "fdbserver/workloads/workloads.actor.h" #include "flow/actorcompiler.h" // has to be last include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 80ef93456a..c90fc36ceb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -63,6 +63,7 @@ add_fdb_test(TEST_FILES ReadAbsent.txt IGNORE) add_fdb_test(TEST_FILES ReadHalfAbsent.txt IGNORE) add_fdb_test(TEST_FILES RedwoodCorrectness.txt IGNORE) add_fdb_test(TEST_FILES RedwoodPerfTests.txt IGNORE) +add_fdb_test(TEST_FILES SimpleExternalTest.txt) add_fdb_test(TEST_FILES SlowTask.txt IGNORE) add_fdb_test(TEST_FILES SpecificUnitTest.txt IGNORE) add_fdb_test(TEST_FILES StreamingWrite.txt IGNORE) @@ -109,7 +110,6 @@ add_fdb_test(TEST_FILES fast/RandomUnitTests.txt) add_fdb_test(TEST_FILES fast/SelectorCorrectness.txt) add_fdb_test(TEST_FILES fast/Sideband.txt) add_fdb_test(TEST_FILES fast/SidebandWithStatus.txt) -add_fdb_test(TEST_FILES fast/SimpleExternalTest.txt) add_fdb_test(TEST_FILES fast/SnapTestFailAndDisablePop.txt) add_fdb_test(TEST_FILES 
fast/SwizzledRollbackSideband.txt) add_fdb_test(TEST_FILES fast/SystemRebootTestCycle.txt) diff --git a/tests/fast/SimpleExternalTest.txt b/tests/SimpleExternalTest.txt similarity index 100% rename from tests/fast/SimpleExternalTest.txt rename to tests/SimpleExternalTest.txt From 7f2381484147057ef2cf8618c7fd944183610150 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 26 Jun 2019 14:03:02 -0700 Subject: [PATCH 010/136] Track run loop busyness and report it in status. --- .../source/mr-status-json-schemas.rst.inc | 3 +- documentation/sphinx/source/release-notes.rst | 2 + fdbclient/Schemas.cpp | 3 +- fdbserver/Status.actor.cpp | 97 +++++++++++-------- flow/Net2.actor.cpp | 34 ++++--- flow/SystemMonitor.cpp | 29 ++++-- flow/network.h | 3 + 7 files changed, 109 insertions(+), 62 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 5b0099f142..ad5d6d95b5 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -187,7 +187,8 @@ "megabits_received":{ "hz":0.0 } - } + }, + "run_loop_busy":0.2 // fraction of time the run loop was busy } }, "old_logs":[ diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index f2a9813030..05654f5d14 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -17,6 +17,8 @@ Fixes Status ------ +* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #) `_. 
+ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 2e3db10c40..f1f2c5e305 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -207,7 +207,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "megabits_received":{ "hz":0.0 } - } + }, + "run_loop_busy":0.2 } }, "old_logs":[ diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..2a8e8cf27c 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -315,10 +315,10 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector 0){ - cpuObj["logical_core_utilization"] = std::max(0.0, std::min(cpu_seconds / elapsed, 1.0)); + cpuObj["logical_core_utilization"] = std::max(0.0, std::min(cpuSeconds / elapsed, 1.0)); } statusObj["cpu"] = cpuObj; @@ -541,8 +541,8 @@ struct RolesInfo { ACTOR static Future processStatusFetcher( Reference> db, std::vector workers, WorkerEvents pMetrics, - WorkerEvents mMetrics, WorkerEvents errors, WorkerEvents traceFileOpenErrors, WorkerEvents programStarts, - std::map> processIssues, + WorkerEvents mMetrics, WorkerEvents nMetrics, WorkerEvents errors, WorkerEvents traceFileOpenErrors, + WorkerEvents programStarts, std::map> processIssues, vector> storageServers, vector> tLogs, vector> proxies, Database cx, Optional configuration, Optional healthyZone, std::set* incomplete_reasons) { @@ -668,84 +668,84 @@ ACTOR static Future processStatusFetcher( ASSERT(pMetrics.count(workerItr->interf.address())); NetworkAddress address = workerItr->interf.address(); - const TraceEventFields& event = pMetrics[workerItr->interf.address()]; + const TraceEventFields& processMetrics = pMetrics[workerItr->interf.address()]; statusObj["address"] = address.toString(); JsonBuilderObject memoryObj; - if (event.size() > 0) { - std::string zoneID = event.getValue("ZoneID"); + if (processMetrics.size() > 0) { + std::string zoneID = processMetrics.getValue("ZoneID"); 
statusObj["fault_domain"] = zoneID; if(healthyZone.present() && healthyZone == workerItr->interf.locality.zoneId()) { statusObj["under_maintenance"] = true; } - std::string MachineID = event.getValue("MachineID"); + std::string MachineID = processMetrics.getValue("MachineID"); statusObj["machine_id"] = MachineID; statusObj["locality"] = getLocalityInfo(workerItr->interf.locality); - statusObj.setKeyRawNumber("uptime_seconds",event.getValue("UptimeSeconds")); + statusObj.setKeyRawNumber("uptime_seconds", processMetrics.getValue("UptimeSeconds")); // rates are calculated over the last elapsed seconds - double elapsed = event.getDouble("Elapsed"); - double cpu_seconds = event.getDouble("CPUSeconds"); - double diskIdleSeconds = event.getDouble("DiskIdleSeconds"); - double diskReads = event.getDouble("DiskReads"); - double diskWrites = event.getDouble("DiskWrites"); + double processMetricsElapsed = processMetrics.getDouble("Elapsed"); + double cpuSeconds = processMetrics.getDouble("CPUSeconds"); + double diskIdleSeconds = processMetrics.getDouble("DiskIdleSeconds"); + double diskReads = processMetrics.getDouble("DiskReads"); + double diskWrites = processMetrics.getDouble("DiskWrites"); JsonBuilderObject diskObj; - if (elapsed > 0){ + if (processMetricsElapsed > 0) { JsonBuilderObject cpuObj; - cpuObj["usage_cores"] = std::max(0.0, cpu_seconds / elapsed); + cpuObj["usage_cores"] = std::max(0.0, cpuSeconds / processMetricsElapsed); statusObj["cpu"] = cpuObj; - diskObj["busy"] = std::max(0.0, std::min((elapsed - diskIdleSeconds) / elapsed, 1.0)); + diskObj["busy"] = std::max(0.0, std::min((processMetricsElapsed - diskIdleSeconds) / processMetricsElapsed, 1.0)); JsonBuilderObject readsObj; - readsObj.setKeyRawNumber("counter",event.getValue("DiskReadsCount")); - if (elapsed > 0) - readsObj["hz"] = diskReads / elapsed; - readsObj.setKeyRawNumber("sectors",event.getValue("DiskReadSectors")); + readsObj.setKeyRawNumber("counter", processMetrics.getValue("DiskReadsCount")); + 
if (processMetricsElapsed > 0) + readsObj["hz"] = diskReads / processMetricsElapsed; + readsObj.setKeyRawNumber("sectors", processMetrics.getValue("DiskReadSectors")); JsonBuilderObject writesObj; - writesObj.setKeyRawNumber("counter",event.getValue("DiskWritesCount")); - if (elapsed > 0) - writesObj["hz"] = diskWrites / elapsed; - writesObj.setKeyRawNumber("sectors",event.getValue("DiskWriteSectors")); + writesObj.setKeyRawNumber("counter", processMetrics.getValue("DiskWritesCount")); + if (processMetricsElapsed > 0) + writesObj["hz"] = diskWrites / processMetricsElapsed; + writesObj.setKeyRawNumber("sectors", processMetrics.getValue("DiskWriteSectors")); diskObj["reads"] = readsObj; diskObj["writes"] = writesObj; } - diskObj.setKeyRawNumber("total_bytes",event.getValue("DiskTotalBytes")); - diskObj.setKeyRawNumber("free_bytes",event.getValue("DiskFreeBytes")); + diskObj.setKeyRawNumber("total_bytes", processMetrics.getValue("DiskTotalBytes")); + diskObj.setKeyRawNumber("free_bytes", processMetrics.getValue("DiskFreeBytes")); statusObj["disk"] = diskObj; JsonBuilderObject networkObj; - networkObj.setKeyRawNumber("current_connections",event.getValue("CurrentConnections")); + networkObj.setKeyRawNumber("current_connections", processMetrics.getValue("CurrentConnections")); JsonBuilderObject connections_established; - connections_established.setKeyRawNumber("hz",event.getValue("ConnectionsEstablished")); + connections_established.setKeyRawNumber("hz", processMetrics.getValue("ConnectionsEstablished")); networkObj["connections_established"] = connections_established; JsonBuilderObject connections_closed; - connections_closed.setKeyRawNumber("hz",event.getValue("ConnectionsClosed")); + connections_closed.setKeyRawNumber("hz", processMetrics.getValue("ConnectionsClosed")); networkObj["connections_closed"] = connections_closed; JsonBuilderObject connection_errors; - connection_errors.setKeyRawNumber("hz",event.getValue("ConnectionErrors")); + 
connection_errors.setKeyRawNumber("hz", processMetrics.getValue("ConnectionErrors")); networkObj["connection_errors"] = connection_errors; JsonBuilderObject megabits_sent; - megabits_sent.setKeyRawNumber("hz",event.getValue("MbpsSent")); + megabits_sent.setKeyRawNumber("hz", processMetrics.getValue("MbpsSent")); networkObj["megabits_sent"] = megabits_sent; JsonBuilderObject megabits_received; - megabits_received.setKeyRawNumber("hz",event.getValue("MbpsReceived")); + megabits_received.setKeyRawNumber("hz", processMetrics.getValue("MbpsReceived")); networkObj["megabits_received"] = megabits_received; statusObj["network"] = networkObj; - memoryObj.setKeyRawNumber("used_bytes",event.getValue("Memory")); - memoryObj.setKeyRawNumber("unused_allocated_memory",event.getValue("UnusedAllocatedMemory")); + memoryObj.setKeyRawNumber("used_bytes", processMetrics.getValue("Memory")); + memoryObj.setKeyRawNumber("unused_allocated_memory", processMetrics.getValue("UnusedAllocatedMemory")); } if (programStarts.count(address)) { @@ -820,6 +820,19 @@ ACTOR static Future processStatusFetcher( if(workerItr->degraded) { statusObj["degraded"] = true; } + + const TraceEventFields& networkMetrics = nMetrics[workerItr->interf.address()]; + double networkMetricsElapsed = networkMetrics.getDouble("Elapsed"); + + try { + double runLoopBusy = networkMetrics.getDouble("PriorityBusy1"); + statusObj["run_loop_busy"] = runLoopBusy / networkMetricsElapsed; + } + catch(Error &e) { + // This should only happen very early in the process lifetime before priority bin info has been populated + incomplete_reasons->insert("Cannot retrieve run loop busyness."); + } + } catch (Error& e){ // Something strange occurred, process list is incomplete but what was built so far, if anything, will be returned. 
@@ -1905,6 +1918,7 @@ ACTOR Future clusterGetStatus( std::vector< Future< Optional >> > > futures; futures.push_back(latestEventOnWorkers(workers, "MachineMetrics")); futures.push_back(latestEventOnWorkers(workers, "ProcessMetrics")); + futures.push_back(latestEventOnWorkers(workers, "NetworkMetrics")); futures.push_back(latestErrorOnWorkers(workers)); futures.push_back(latestEventOnWorkers(workers, "TraceFileOpenError")); futures.push_back(latestEventOnWorkers(workers, "ProgramStart")); @@ -1944,9 +1958,10 @@ ACTOR Future clusterGetStatus( state WorkerEvents mMetrics = workerEventsVec[0].present() ? workerEventsVec[0].get().first : WorkerEvents(); // process metrics state WorkerEvents pMetrics = workerEventsVec[1].present() ? workerEventsVec[1].get().first : WorkerEvents(); - state WorkerEvents latestError = workerEventsVec[2].present() ? workerEventsVec[2].get().first : WorkerEvents(); - state WorkerEvents traceFileOpenErrors = workerEventsVec[3].present() ? workerEventsVec[3].get().first : WorkerEvents(); - state WorkerEvents programStarts = workerEventsVec[4].present() ? workerEventsVec[4].get().first : WorkerEvents(); + state WorkerEvents networkMetrics = workerEventsVec[2].present() ? workerEventsVec[2].get().first : WorkerEvents(); + state WorkerEvents latestError = workerEventsVec[3].present() ? workerEventsVec[3].get().first : WorkerEvents(); + state WorkerEvents traceFileOpenErrors = workerEventsVec[4].present() ? workerEventsVec[4].get().first : WorkerEvents(); + state WorkerEvents programStarts = workerEventsVec[5].present() ? 
workerEventsVec[5].get().first : WorkerEvents(); state JsonBuilderObject statusObj; if(db->get().recoveryCount > 0) { @@ -2089,7 +2104,11 @@ ACTOR Future clusterGetStatus( statusObj["layers"] = layers; } - JsonBuilderObject processStatus = wait(processStatusFetcher(db, workers, pMetrics, mMetrics, latestError, traceFileOpenErrors, programStarts, processIssues, storageServers, tLogs, proxies, cx, configuration, loadResult.present() ? loadResult.get().healthyZone : Optional(), &status_incomplete_reasons)); + JsonBuilderObject processStatus = wait(processStatusFetcher(db, workers, pMetrics, mMetrics, networkMetrics, + latestError, traceFileOpenErrors, programStarts, + processIssues, storageServers, tLogs, proxies, cx, + configuration, loadResult.present() ? loadResult.get().healthyZone : Optional(), + &status_incomplete_reasons)); statusObj["processes"] = processStatus; statusObj["clients"] = clientStatusFetcher(clientVersionMap, clientStatusInfoMap); diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 2dcf9783ed..ef6426936d 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -168,7 +168,6 @@ public: double lastPriorityTrackTime; int lastMinTaskID; - double priorityTimer[NetworkMetrics::PRIORITY_BINS]; std::priority_queue> ready; ThreadSafeQueue threadReady; @@ -577,7 +576,8 @@ void Net2::run() { if (runFunc) { tsc_begin = __rdtsc(); - taskBegin = timer_monotonic(); + taskBegin = nnow; + trackMinPriority(TaskRunCycleFunction, taskBegin); runFunc(); checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskRunCycleFunction); } @@ -591,8 +591,11 @@ void Net2::run() { ++countWontSleep; if (b) { sleepTime = 1e99; - if (!timers.empty()) - sleepTime = timers.top().at - timer_monotonic(); // + 500e-6? + double sleepStart = timer_monotonic(); + if (!timers.empty()) { + sleepTime = timers.top().at - sleepStart; // + 500e-6? 
+ } + trackMinPriority(0, sleepStart); } awakeMetric = false; @@ -607,7 +610,6 @@ void Net2::run() { if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow); - if (sleepTime) trackMinPriority( 0, now ); while (!timers.empty() && timers.top().at < now) { ++countTimers; ready.push( timers.top() ); @@ -641,7 +643,7 @@ void Net2::run() { if (check_yield(TaskMaxPriority, true)) { ++countYields; break; } } - nnow = timer_monotonic(); + trackMinPriority(minTaskID, now); #if defined(__linux__) if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) { @@ -685,11 +687,10 @@ void Net2::run() { net2liveness.fetch_add(1); } #endif + nnow = timer_monotonic(); if ((nnow-now) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (nnow-now)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopBottom").detail("Elapsed", nnow - now); // This includes the time spent running tasks - - trackMinPriority( minTaskID, nnow ); } #ifdef WIN32 @@ -698,17 +699,22 @@ void Net2::run() { } void Net2::trackMinPriority( int minTaskID, double now ) { - if (minTaskID != lastMinTaskID) + if (minTaskID != lastMinTaskID) { for(int c=0; c= minTaskID && pri < lastMinTaskID) { // busy -> idle - double busyFor = lastPriorityTrackTime - priorityTimer[c]; - networkMetrics.secSquaredPriorityBlocked[c] += busyFor*busyFor; + if (pri > minTaskID && pri <= lastMinTaskID) { // busy -> idle + double busyFor = lastPriorityTrackTime - networkMetrics.priorityTimer[c]; + networkMetrics.priorityBlocked[c] = false; + networkMetrics.priorityBlockedDuration[c] += busyFor; + networkMetrics.secSquaredPriorityBlocked[c] += busyFor * busyFor; } - if (pri < minTaskID && pri >= lastMinTaskID) { // idle -> busy - priorityTimer[c] = now; + if (pri <= minTaskID && pri > lastMinTaskID) { // idle -> busy + networkMetrics.priorityBlocked[c] = true; + 
networkMetrics.priorityTimer[c] = now; } } + } + lastMinTaskID = minTaskID; lastPriorityTrackTime = now; } diff --git a/flow/SystemMonitor.cpp b/flow/SystemMonitor.cpp index fc778717e9..4481cba0f0 100644 --- a/flow/SystemMonitor.cpp +++ b/flow/SystemMonitor.cpp @@ -59,8 +59,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta netData.init(); if (!DEBUG_DETERMINISM && currentStats.initialized) { { - TraceEvent e(eventName.c_str()); - e + TraceEvent(eventName.c_str()) .detail("Elapsed", currentStats.elapsed) .detail("CPUSeconds", currentStats.processCPUSeconds) .detail("MainThreadCPUSeconds", currentStats.mainThreadCPUSeconds) @@ -120,6 +119,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta TraceEvent n("NetworkMetrics"); n + .detail("Elapsed", currentStats.elapsed) .detail("CantSleep", netData.countCantSleep - statState->networkState.countCantSleep) .detail("WontSleep", netData.countWontSleep - statState->networkState.countWontSleep) .detail("Yields", netData.countYields - statState->networkState.countYields) @@ -139,12 +139,27 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta .detail("PacketsGenerated", netData.countPacketsGenerated - statState->networkState.countPacketsGenerated) .detail("WouldBlock", netData.countWouldBlock - statState->networkState.countWouldBlock); - for (int i = 0; inetworkMetrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) + for (int i = 0; inetworkMetrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) { n.detail(format("SlowTask%dM", 1 << i).c_str(), c); - for (int i = 0; inetworkMetrics.secSquaredPriorityBlocked[i] - statState->networkMetricsState.secSquaredPriorityBlocked[i]) - n.detail(format("S2Pri%d", g_network->networkMetrics.priorityBins[i]).c_str(), x); + } + } + + for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != 0; i++) { + 
if(g_network->networkMetrics.priorityBlocked[i]) { + double lastSegment = std::min(currentStats.elapsed, now() - g_network->networkMetrics.priorityTimer[i]); + g_network->networkMetrics.priorityBlockedDuration[i] += lastSegment; + g_network->networkMetrics.secSquaredPriorityBlocked[i] += lastSegment * lastSegment; + g_network->networkMetrics.priorityTimer[i] = now(); + } + + double blocked = g_network->networkMetrics.priorityBlockedDuration[i] - statState->networkMetricsState.priorityBlockedDuration[i]; + double s2Blocked = g_network->networkMetrics.secSquaredPriorityBlocked[i] - statState->networkMetricsState.secSquaredPriorityBlocked[i]; + n.detail(format("PriorityBusy%d", g_network->networkMetrics.priorityBins[i]).c_str(), blocked); + n.detail(format("SumOfSquaredPriorityBusy%d", g_network->networkMetrics.priorityBins[i]).c_str(), s2Blocked); + } + + n.trackLatest("NetworkMetrics"); } if(machineMetrics) { diff --git a/flow/network.h b/flow/network.h index 256bc89b40..55284fb39f 100644 --- a/flow/network.h +++ b/flow/network.h @@ -271,7 +271,10 @@ struct NetworkMetrics { enum { PRIORITY_BINS = 9 }; int priorityBins[ PRIORITY_BINS ]; + bool priorityBlocked[PRIORITY_BINS]; + double priorityBlockedDuration[PRIORITY_BINS]; double secSquaredPriorityBlocked[PRIORITY_BINS]; + double priorityTimer[PRIORITY_BINS]; double oldestAlternativesFailure; double newestAlternativesFailure; From 7e70fa7fcb6f0ec3d8eaba3c6ba4a12755b5110c Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 26 Jun 2019 14:10:08 -0700 Subject: [PATCH 011/136] Add pull request number to release notes. 
--- documentation/sphinx/source/release-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 05654f5d14..c8bf7fbac7 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -17,7 +17,7 @@ Fixes Status ------ -* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #) `_. +* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) `_. Bindings -------- From 08f28e99f96e9339339364fc9524c8326e06e664 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 13:47:45 -0700 Subject: [PATCH 012/136] TeamCollection:Test no server or machine has incorrect team number Add test for simulation test which make sure the server team number per server will be no less than the desired_teams_per_server defined in knobs and no larger than the max_teams_per_server. Add similar test for machine teams number per machine as well. 
--- fdbserver/DataDistribution.actor.cpp | 49 ++++++++++++++++++++++++++++ fdbserver/QuietDatabase.actor.cpp | 19 +++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 7b29ce0bdf..2f6a9bbcd2 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1591,6 +1591,34 @@ struct DDTeamCollection : ReferenceCounted { return totalHealthyMachineCount; } + std::pair calculateMinMaxServerTeamNumOnServer() { + int minTeamNumber = std::numeric_limits::max(); + int maxTeamNumber = std::numeric_limits::min(); + for (auto& server : server_info ) { + if ( server.second->teams.size() < minTeamNumber ) { + minTeamNumber = server.second->teams.size(); + } + if ( server.second->teams.size() > maxTeamNumber ) { + maxTeamNumber = server.second->teams.size(); + } + } + return std::make_pair(minTeamNumber, maxTeamNumber); + } + + std::pair calculateMinMaxMachineTeamNumOnMachine() { + int minTeamNumber = std::numeric_limits::max(); + int maxTeamNumber = std::numeric_limits::min(); + for (auto& machine : machine_info) { + if ( machine.second->machineTeams.size() < minTeamNumber ) { + minTeamNumber = machine.second->machineTeams.size(); + } + if ( machine.second->machineTeams.size() > maxTeamNumber ) { + maxTeamNumber = machine.second->machineTeams.size(); + } + } + return std::make_pair(minTeamNumber, maxTeamNumber); + } + // Sanity check bool isServerTeamNumberCorrect(Reference& mt) { int num = 0; @@ -1762,6 +1790,9 @@ struct DDTeamCollection : ReferenceCounted { healthyMachineTeamCount = getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) .detail("AddedTeamNumber", addedTeams) @@ -1775,6 +1806,10 @@ struct DDTeamCollection : 
ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1791,6 +1826,9 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) .detail("AddedTeamNumber", 0) @@ -1804,6 +1842,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -1901,6 +1943,9 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = self->getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); + 
TraceEvent("TeamCollectionInfo", self->distributorId) .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) @@ -1914,6 +1959,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); } } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index b5be5335dc..126779c4bf 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -289,6 +289,11 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr int64_t healthyMachineTeamCount = boost::lexical_cast(teamCollectionInfoMessage.getValue("CurrentHealthyMachineTeamNumber")); int64_t desiredMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("DesiredMachineTeams")); int64_t maxMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeams")); + + int64_t minServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); + int64_t maxServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); + int64_t minMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); + int64_t maxMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); // Team number is always valid when we disable teamRemover. 
This avoids false positive in simulation test if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) { @@ -299,7 +304,11 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || + minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || + minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || + maxServerTeamOnServer > SERVER_KNOBS->MAX_TEAMS_PER_SERVER || + maxMachineTeamOnMachine > SERVER_KNOBS->MAX_TEAMS_PER_SERVER) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) @@ -307,7 +316,13 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeamNumber) .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) - .detail("MaxMachineTeams", maxMachineTeamNumber); + .detail("MaxMachineTeams", maxMachineTeamNumber) + .detail("MinTeamNumberOnServer", minServerTeamOnServer) + .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) + .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) + .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) + .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) + .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); return false; } else { return true; From 21664742a6f6388d4ba0bdf4145bb7ce865bcdaa Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 14:59:02 -0700 Subject: [PATCH 013/136] TeamCollection:Desired team number may be larger than the max possible team number For example, we have 3 servers for replica factor 3. 
We can have only 1 team but the desired team number is 3 times 5 equal to 15. Instead of sanity checking the absolute team number per server, we check the difference between the minServerTeamOnServer and maxServerTeamOnServer. --- fdbserver/DataDistribution.actor.cpp | 12 ++++++------ fdbserver/QuietDatabase.actor.cpp | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 2f6a9bbcd2..db75ce76e1 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1591,9 +1591,9 @@ struct DDTeamCollection : ReferenceCounted { return totalHealthyMachineCount; } - std::pair calculateMinMaxServerTeamNumOnServer() { - int minTeamNumber = std::numeric_limits::max(); - int maxTeamNumber = std::numeric_limits::min(); + std::pair calculateMinMaxServerTeamNumOnServer() { + uint32_t minTeamNumber = std::numeric_limits::max(); + uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info ) { if ( server.second->teams.size() < minTeamNumber ) { minTeamNumber = server.second->teams.size(); @@ -1605,9 +1605,9 @@ struct DDTeamCollection : ReferenceCounted { return std::make_pair(minTeamNumber, maxTeamNumber); } - std::pair calculateMinMaxMachineTeamNumOnMachine() { - int minTeamNumber = std::numeric_limits::max(); - int maxTeamNumber = std::numeric_limits::min(); + std::pair calculateMinMaxMachineTeamNumOnMachine() { + uint32_t minTeamNumber = std::numeric_limits::max(); + uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& machine : machine_info) { if ( machine.second->machineTeams.size() < minTeamNumber ) { minTeamNumber = machine.second->machineTeams.size(); diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 126779c4bf..927bf08440 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -305,10 +305,10 @@ ACTOR Future 
getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. if (healthyMachineTeamCount > desiredMachineTeamNumber || - minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || - minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || - maxServerTeamOnServer > SERVER_KNOBS->MAX_TEAMS_PER_SERVER || - maxMachineTeamOnMachine > SERVER_KNOBS->MAX_TEAMS_PER_SERVER) { + minServerTeamOnServer <= 0 || + minMachineTeamOnMachine <= 0 || + ( maxServerTeamOnServer > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) || + ( maxMachineTeamOnMachine > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From ee916b337d2c2cf457b91920ceb932c1bcc81ef9 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 15:31:05 -0700 Subject: [PATCH 014/136] TeamCollection:Change the target team number to build When team collection (TC) build server teams and machine teams, it needs to build enough teams such that each server and machine has the DESIRED_TEAMS_PER_SERVER server teams and machine teams. This change calculate the number of teams (server team and machine teams) needed to get each teams for each server and machine. 
--- fdbserver/DataDistribution.actor.cpp | 79 ++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index db75ce76e1..5511d57b8d 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1310,7 +1310,7 @@ struct DDTeamCollection : ReferenceCounted { // Five steps to create each machine team, which are document in the function // Reuse ReplicationPolicy selectReplicas func to select machine team // return number of added machine teams - int addBestMachineTeams(int targetMachineTeamsToBuild) { + int addBestMachineTeams(int targetMachineTeamsToBuild, int remainingMachineTeamBudget) { int addedMachineTeams = 0; int totalServerIndex = 0; int machineTeamsToBuild = 0; @@ -1329,7 +1329,7 @@ struct DDTeamCollection : ReferenceCounted { int loopCount = 0; // Add a team in each iteration - while (addedMachineTeams < machineTeamsToBuild) { + while (addedMachineTeams < machineTeamsToBuild || addedMachineTeams < remainingMachineTeamBudget) { // Step 2: Get least used machines from which we choose machines as a machine team std::vector> leastUsedMachines; // A less used machine has less number of teams int minTeamCount = std::numeric_limits::max(); @@ -1432,6 +1432,8 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); addedMachineTeams++; + // Update the remaining machine team budget because the budget may decrease by any value between 1 and storageTeamSize + remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) .detail("Primary", primary) @@ -1669,11 +1671,48 @@ struct DDTeamCollection : ReferenceCounted { return healthyTeamCount; } + // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has 
SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int getRemainingMachineTeamBudget() { + int remainingMachineTeamBudget = 0; + for ( auto& m : machine_info ) { + int healthyMTCount = 0; + for ( auto& mt : m.second->machineTeams ) { + if ( isMachineTeamHealthy(mt) ) { + ++healthyMTCount; + } + } + remainingMachineTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); + } + + // We over-provision the remainingMachineTeamBudget because we do not know when a new machine team is built, how many times it can be counted into the budget + // For example, when a new machine is added, a new machine team only consume 1 such budget + return remainingMachineTeamBudget; + } + + // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + int getRemainingServerTeamBudget() { + // remainingTeamBudget is the number of teams needed to ensure every server has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int remainingTeamBudget = 0; + for ( auto& s : server_info ) { + int numValidTeams = 0; + for ( auto& team : s.second->teams ) { + if ( !team->isWrongConfiguration() && team->isHealthy() ) { + ++numValidTeams; + } + } + remainingTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); + } + + return remainingTeamBudget; + } + + // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, // build an extra machine team and record the event in trace - int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber) { + int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) { ASSERT(teamsToBuild > 0); ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); @@ -1685,8 +1724,8 @@ struct DDTeamCollection : ReferenceCounted { // When we 
change configuration, we may have machine teams with storageTeamSize in the old configuration. int healthyMachineTeamCount = getHealthyMachineTeamCount(); int totalMachineTeamCount = machineTeams.size(); - int totalHealthyMachineCount = calculateHealthyMachineCount(); + int remainingMachineTeamBudget = getRemainingMachineTeamBudget(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; @@ -1699,13 +1738,14 @@ struct DDTeamCollection : ReferenceCounted { .detail("HealthyMachineTeamCount", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) - .detail("MachineTeamsToBuild", machineTeamsToBuild); + .detail("MachineTeamsToBuild", machineTeamsToBuild) + .detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); // Pre-build all machine teams until we have the desired number of machine teams if (machineTeamsToBuild > 0) { - addedMachineTeams = addBestMachineTeams(machineTeamsToBuild); + addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget); } - while (addedTeams < teamsToBuild) { + while (addedTeams < teamsToBuild || addedTeams < remainingTeamBudget) { // Step 1: Create 1 best machine team std::vector bestServerTeam; int bestScore = std::numeric_limits::max(); @@ -1782,6 +1822,7 @@ struct DDTeamCollection : ReferenceCounted { // Step 4: Add the server team addTeam(bestServerTeam.begin(), bestServerTeam.end(), false); addedTeams++; + remainingTeamBudget = getRemainingServerTeamBudget(); if (++loopCount > 2 * teamsToBuild * (configuration.storageTeamSize + 1)) { break; @@ -1901,6 +1942,9 @@ struct DDTeamCollection : ReferenceCounted { totalTeamCount++; } } + // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + // remainingTeamBudget is the number of teams needed to ensure every server has 
SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int remainingTeamBudget = self->getRemainingServerTeamBudget(); // teamsToBuild is calculated such that we will not build too many teams in the situation // when all (or most of) teams become unhealthy temporarily and then healthy again @@ -1927,7 +1971,7 @@ struct DDTeamCollection : ReferenceCounted { // addTeamsBestOf() will not add more teams than needed. // If the team number is more than the desired, the extra teams are added in the code path when // a team is added as an initial team - int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams); + int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams, remainingTeamBudget); if (addedTeams <= 0 && self->teams.size() == 0) { TraceEvent(SevWarn, "NoTeamAfterBuildTeam") @@ -3005,8 +3049,9 @@ ACTOR Future storageServerTracker( if(hasWrongStoreTypeOrDC) self->restartRecruiting.trigger(); - if ( lastIsUnhealthy && !status.isUnhealthy() && !server->teams.size() ) { + if ( lastIsUnhealthy && !status.isUnhealthy() && server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) { self->doBuildTeams = true; + self->restartTeamBuilder.trigger(); } lastIsUnhealthy = status.isUnhealthy(); @@ -3894,7 +3939,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") { Reference policy = Reference(new PolicyAcross(teamSize, "zoneid", Reference(new PolicyOne()))); state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize); - int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams); + collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30); ASSERT(collection->sanityCheckTeams() == true); @@ -3919,8 +3964,8 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") { return Void(); } - collection->addBestMachineTeams(30); // Create machine teams to help debug - int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams); + collection->addBestMachineTeams(30, 30); // Create machine 
teams to help debug + collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30); collection->sanityCheckTeams(); // Server team may happen to be on the same machine team, although unlikely if (collection) delete (collection); @@ -3935,7 +3980,7 @@ TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") { state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize; state DDTeamCollection* collection = testTeamCollection(3, policy, processSize); - int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams, 200); delete(collection); @@ -3955,7 +4000,7 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") { state DDTeamCollection* collection = testTeamCollection(3, policy, processSize); - int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10); delete(collection); @@ -3975,7 +4020,7 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); - int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8); ASSERT(result == 8); @@ -4005,8 +4050,8 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); - int resultMachineTeams = collection->addBestMachineTeams(10); - int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams); + collection->addBestMachineTeams(10, 10); + int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10); if (collection->machineTeams.size() != 10 || result != 8) { collection->traceAllInfo(true); // Debug message From e1d459075a940719040101f89581933157809baf Mon 
Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 17:30:29 -0700 Subject: [PATCH 015/136] TeamCollection:Count healthy machine teams only Team collection should prioritize to build machine teams for a machine that has the least number of healthy machine teams, instead of just machine teams, because unhealthy machine team will not be able to produce more server teams. --- fdbserver/DataDistribution.actor.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 5511d57b8d..334b380cd2 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1342,7 +1342,12 @@ struct DDTeamCollection : ReferenceCounted { // Invariant: We only create correct size machine teams. // When configuration (e.g., team size) is changed, the DDTeamCollection will be destroyed and rebuilt // so that the invariant will not be violated. - int teamCount = machine.second->machineTeams.size(); + int teamCount = 0; + for (auto& mt : machine.second->machineTeams) { + if ( isMachineTeamHealthy(mt) ) { + ++teamCount; + } + } if (teamCount < minTeamCount) { leastUsedMachines.clear(); From 02cdcc0b0c09994c1904b01b07f3aa02285eb204 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 17:35:57 -0700 Subject: [PATCH 016/136] TeamCollectionTest: Only ensure each server and machine have a team --- fdbserver/QuietDatabase.actor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 927bf08440..9edb9d06e3 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -306,9 +306,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // if redundant teams exist. 
if (healthyMachineTeamCount > desiredMachineTeamNumber || minServerTeamOnServer <= 0 || - minMachineTeamOnMachine <= 0 || - ( maxServerTeamOnServer > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) || - ( maxMachineTeamOnMachine > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) ) { + minMachineTeamOnMachine <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From c23d89c98aa272ec926bea362ee7d3c795d93741 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 17:56:54 -0700 Subject: [PATCH 017/136] TeamCollection:Only count healthy teams for a server When team collection add new server teams, it picks a team with the least number of teams. We should only consider the healthy teams because the unhealthy ones will not be useful. --- fdbserver/DataDistribution.actor.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 334b380cd2..721dcd11df 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1507,7 +1507,12 @@ struct DDTeamCollection : ReferenceCounted { // Only pick healthy server, which is not failed or excluded. 
if (server_status.get(server.first).isUnhealthy()) continue; - int numTeams = server.second->teams.size(); + int numTeams = 0; + for (auto& t : server.second->teams) { + if (!t->isWrongConfiguration() && t->isHealthy()) { + ++numTeams; + } + } if (numTeams < minTeamNumber) { minTeamNumber = numTeams; leastUsedServers.clear(); @@ -3056,7 +3061,7 @@ ACTOR Future storageServerTracker( if ( lastIsUnhealthy && !status.isUnhealthy() && server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) { self->doBuildTeams = true; - self->restartTeamBuilder.trigger(); + self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams } lastIsUnhealthy = status.isUnhealthy(); From cc6a0e9bcdae581fae748b3129c22dbd5ab3d014 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 19:33:38 -0700 Subject: [PATCH 018/136] TeamCollectionTest:Do not enforce minServerTeamOnServer larger than 0 In ConfigureTest, one server may be left with 0 server teams, even if we call buildTeams in the storageServerTracker. 
--- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/QuietDatabase.actor.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 721dcd11df..443c47fc7b 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1682,7 +1682,7 @@ struct DDTeamCollection : ReferenceCounted { } // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, - // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int getRemainingMachineTeamBudget() { int remainingMachineTeamBudget = 0; for ( auto& m : machine_info ) { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 9edb9d06e3..7350420775 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -305,7 +305,6 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. 
if (healthyMachineTeamCount > desiredMachineTeamNumber || - minServerTeamOnServer <= 0 || minMachineTeamOnMachine <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) From 53324e4db753d4738069e3a664a2dfd5fa7a8922 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 19:38:12 -0700 Subject: [PATCH 019/136] TeamCollectionInfo: clang format --- fdbserver/DataDistribution.actor.cpp | 95 +++++++++++++++------------- fdbserver/QuietDatabase.actor.cpp | 29 +++++---- 2 files changed, 66 insertions(+), 58 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 443c47fc7b..3dd70b15e7 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1344,7 +1344,7 @@ struct DDTeamCollection : ReferenceCounted { // so that the invariant will not be violated. int teamCount = 0; for (auto& mt : machine.second->machineTeams) { - if ( isMachineTeamHealthy(mt) ) { + if (isMachineTeamHealthy(mt)) { ++teamCount; } } @@ -1437,7 +1437,8 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); addedMachineTeams++; - // Update the remaining machine team budget because the budget may decrease by any value between 1 and storageTeamSize + // Update the remaining machine team budget because the budget may decrease by any value between 1 and + // storageTeamSize remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) @@ -1509,9 +1510,9 @@ struct DDTeamCollection : ReferenceCounted { int numTeams = 0; for (auto& t : server.second->teams) { - if (!t->isWrongConfiguration() && t->isHealthy()) { - ++numTeams; - } + if (!t->isWrongConfiguration() && t->isHealthy()) { + ++numTeams; + } } if (numTeams < minTeamNumber) { minTeamNumber = numTeams; @@ -1606,11 +1607,11 @@ struct DDTeamCollection : ReferenceCounted { std::pair calculateMinMaxServerTeamNumOnServer() { uint32_t 
minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); - for (auto& server : server_info ) { - if ( server.second->teams.size() < minTeamNumber ) { + for (auto& server : server_info) { + if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } - if ( server.second->teams.size() > maxTeamNumber ) { + if (server.second->teams.size() > maxTeamNumber) { maxTeamNumber = server.second->teams.size(); } } @@ -1621,10 +1622,10 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& machine : machine_info) { - if ( machine.second->machineTeams.size() < minTeamNumber ) { + if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } - if ( machine.second->machineTeams.size() > maxTeamNumber ) { + if (machine.second->machineTeams.size() > maxTeamNumber) { maxTeamNumber = machine.second->machineTeams.size(); } } @@ -1682,41 +1683,43 @@ struct DDTeamCollection : ReferenceCounted { } // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, - // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int getRemainingMachineTeamBudget() { - int remainingMachineTeamBudget = 0; - for ( auto& m : machine_info ) { + int remainingMachineTeamBudget = 0; + for (auto& m : machine_info) { int healthyMTCount = 0; - for ( auto& mt : m.second->machineTeams ) { - if ( isMachineTeamHealthy(mt) ) { + for (auto& mt : m.second->machineTeams) { + if (isMachineTeamHealthy(mt)) { ++healthyMTCount; } } - remainingMachineTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); + 
remainingMachineTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); } - // We over-provision the remainingMachineTeamBudget because we do not know when a new machine team is built, how many times it can be counted into the budget - // For example, when a new machine is added, a new machine team only consume 1 such budget + // We over-provision the remainingMachineTeamBudget because we do not know, when a new machine team is built, + // how many times it can be counted into the budget. For example, when a new machine is added, + // a new machine team only consume 1 such budget return remainingMachineTeamBudget; } // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, int getRemainingServerTeamBudget() { - // remainingTeamBudget is the number of teams needed to ensure every server has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingTeamBudget is the number of teams needed to ensure every server has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int remainingTeamBudget = 0; - for ( auto& s : server_info ) { + for (auto& s : server_info) { int numValidTeams = 0; - for ( auto& team : s.second->teams ) { - if ( !team->isWrongConfiguration() && team->isHealthy() ) { + for (auto& team : s.second->teams) { + if (!team->isWrongConfiguration() && team->isHealthy()) { ++numValidTeams; } } - remainingTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); + remainingTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); } return remainingTeamBudget; } - // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team @@ -1749,7 +1752,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("MachineTeamsToBuild", machineTeamsToBuild) - 
.detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); + .detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); // Pre-build all machine teams until we have the desired number of machine teams if (machineTeamsToBuild > 0) { addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget); @@ -1841,8 +1844,8 @@ struct DDTeamCollection : ReferenceCounted { healthyMachineTeamCount = getHealthyMachineTeamCount(); - std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) @@ -1857,10 +1860,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) - .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) - .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) - .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1877,8 +1880,8 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = getHealthyMachineTeamCount(); - std::pair 
minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) @@ -1893,10 +1896,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) - .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) - .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) - .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -1953,7 +1956,8 @@ struct DDTeamCollection : ReferenceCounted { } } // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, - // remainingTeamBudget is the number of teams needed to ensure every server has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingTeamBudget is the number of teams needed to ensure every server has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int remainingTeamBudget = self->getRemainingServerTeamBudget(); // teamsToBuild is calculated such that we will not build too many teams in the situation @@ -1997,8 +2001,8 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int 
healthyMachineTeamCount = self->getHealthyMachineTeamCount(); - std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); TraceEvent("TeamCollectionInfo", self->distributorId) .detail("Primary", self->primary) @@ -2013,10 +2017,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) - .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) - .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) - .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); } } @@ -3059,7 +3063,8 @@ ACTOR Future storageServerTracker( if(hasWrongStoreTypeOrDC) self->restartRecruiting.trigger(); - if ( lastIsUnhealthy && !status.isUnhealthy() && server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) { + if (lastIsUnhealthy && !status.isUnhealthy() && + server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) { self->doBuildTeams = true; self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 7350420775..72bda5cba6 100644 --- 
a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -289,11 +289,15 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr int64_t healthyMachineTeamCount = boost::lexical_cast(teamCollectionInfoMessage.getValue("CurrentHealthyMachineTeamNumber")); int64_t desiredMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("DesiredMachineTeams")); int64_t maxMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeams")); - - int64_t minServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); - int64_t maxServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); - int64_t minMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); - int64_t maxMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); + + int64_t minServerTeamOnServer = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); + int64_t maxServerTeamOnServer = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); + int64_t minMachineTeamOnMachine = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); + int64_t maxMachineTeamOnMachine = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); // Team number is always valid when we disable teamRemover. This avoids false positive in simulation test if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) { @@ -304,8 +308,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. 
- if (healthyMachineTeamCount > desiredMachineTeamNumber || - minMachineTeamOnMachine <= 0 ) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) @@ -314,12 +317,12 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("DesiredMachineTeams", desiredMachineTeamNumber) .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) .detail("MaxMachineTeams", maxMachineTeamNumber) - .detail("MinTeamNumberOnServer", minServerTeamOnServer) - .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) - .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) - .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) - .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) - .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); + .detail("MinTeamNumberOnServer", minServerTeamOnServer) + .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) + .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) + .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) + .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) + .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); return false; } else { return true; From aaf97542e9e409b5989bb75437547ff2dcf030ec Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 22:37:34 -0700 Subject: [PATCH 020/136] TeamCollectionTest: Update unit test --- fdbserver/DataDistribution.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3dd70b15e7..a7b1f99d11 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4019,7 +4019,7 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") { 
delete(collection); - ASSERT(result == 10); + ASSERT(result >= 10); return Void(); } @@ -4037,7 +4037,7 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8); - ASSERT(result == 8); + ASSERT(result >= 8); for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) { auto teamCount = process->second->teams.size(); From 90c158984c6b0fec6870b0468750bbdee26c67ba Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 11:12:48 -0700 Subject: [PATCH 021/136] TeamCollection:Add extra trace events --- fdbserver/DataDistribution.actor.cpp | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index a7b1f99d11..f89e7d5cc9 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1669,6 +1669,26 @@ struct DDTeamCollection : ReferenceCounted { return std::pair, int>(retMT, minNumProcessTeams); } + // Find the machine team with the largest number of server teams + std::pair, int> getMachineTeamWithMostProcessTeams() { + Reference retMT; + int maxNumProcessTeams = std::numeric_limits::min(); + + for (auto& mt : machineTeams) { + if (EXPENSIVE_VALIDATION) { + ASSERT(isServerTeamNumberCorrect(mt)); + } + int size = mt->serverTeams.size(); + if ( size > maxNumProcessTeams) { + maxNumProcessTeams = mt->serverTeams.size(); + retMT = mt; + } + } + + return std::pair, int>(retMT, maxNumProcessTeams); + } + + int getHealthyMachineTeamCount() { int healthyTeamCount = 0; for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { @@ -1864,6 +1884,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", 
minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1900,6 +1921,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -2021,6 +2043,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -2418,6 +2441,14 @@ ACTOR Future teamRemover(DDTeamCollection* self) { team = mt->serverTeams[teamIndex]; ASSERT(team->machineTeam->machineIDs == mt->machineIDs); // Sanity check + // Check if a server will have 0 team after the team is removed + for (auto& s : team->getServers()) { + if ( s->teams.size() == 0 ) { + TraceEvent(SevError, "TeamRemoverTooAggressive").detail("Server", s->id).detail("Team", team->getServerIDsStr()); + self->traceAllInfo(true); + } + } + // The team will be marked as a bad team bool foundTeam = self->removeTeam(team); ASSERT(foundTeam == true); From 5f5c4042919793996b65a3bbf9b3da5d08c46125 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 13:47:46 -0700 Subject: [PATCH 022/136] BugFix:ReplicationPolicy always fails when teamSize is 1 Whenever use selectReplicas function, be careful that it may have bugs! This bug is that it always return false (not able to find candidates) when the storage team size is 1. This is wrong because when storage team size is 1, the selectReplicas should return an empty result. 
--- fdbserver/DataDistribution.actor.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index f89e7d5cc9..b93aecc76f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1367,6 +1367,7 @@ struct DDTeamCollection : ReferenceCounted { if (leastUsedMachines.size()) { // Randomly choose 1 least used machine Reference tcMachineInfo = g_random->randomChoice(leastUsedMachines); + TraceEvent("MXDEBUG", distributorId).detail("MachineID", tcMachineInfo->machineID.contents().toString()).detail("Servers", tcMachineInfo->getServersIDStr()); ASSERT(!tcMachineInfo->serversOnMachine.empty()); LocalityEntry process = tcMachineInfo->localityEntry; forcedAttributes.push_back(process); @@ -1384,9 +1385,13 @@ struct DDTeamCollection : ReferenceCounted { // that have the least-utilized server team.clear(); auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); - if (!success) { + if (!success && configuration.storageTeamSize > 1) { // NOTE: selectReplicas() returns false always when storageTeamSize == 1 + TraceEvent("MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize); break; } + if ( !success && configuration.storageTeamSize == 1 && forcedAttributes.size() > 0 ) { + TraceEvent(SevError, "MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize).detail("Success", success); + } ASSERT(forcedAttributes.size() > 0); team.push_back((UID*)machineLocalityMap.getObject(forcedAttributes[0])); @@ -2043,7 +2048,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + .detail("DoBuildTeams", 
self->doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -3215,7 +3220,9 @@ ACTOR Future storageServerTracker( //Restart the storeTracker for the new interface storeTracker = keyValueStoreTypeTracker(self, server); hasWrongStoreTypeOrDC = false; + self->doBuildTeams = true; self->restartTeamBuilder.trigger(); + self->traceTeamCollectionInfo(); if(restartRecruiting) self->restartRecruiting.trigger(); } From 2993a96de8f325f34381cc01bfe872a8d40c2123 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 14:15:51 -0700 Subject: [PATCH 023/136] TeamCollectionInfo: Remove debug trace and apply clang format --- fdbserver/DataDistribution.actor.cpp | 44 +++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index b93aecc76f..58b62832d1 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1367,7 +1367,6 @@ struct DDTeamCollection : ReferenceCounted { if (leastUsedMachines.size()) { // Randomly choose 1 least used machine Reference tcMachineInfo = g_random->randomChoice(leastUsedMachines); - TraceEvent("MXDEBUG", distributorId).detail("MachineID", tcMachineInfo->machineID.contents().toString()).detail("Servers", tcMachineInfo->getServersIDStr()); ASSERT(!tcMachineInfo->serversOnMachine.empty()); LocalityEntry process = tcMachineInfo->localityEntry; forcedAttributes.push_back(process); @@ -1385,13 +1384,10 @@ struct DDTeamCollection : ReferenceCounted { // that have the least-utilized server team.clear(); auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); - if (!success && configuration.storageTeamSize > 1) { // NOTE: selectReplicas() returns false always when storageTeamSize == 1 - TraceEvent("MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize); + // NOTE: selectReplicas() returns false always when storageTeamSize == 1 + if (!success 
&& configuration.storageTeamSize > 1) { break; } - if ( !success && configuration.storageTeamSize == 1 && forcedAttributes.size() > 0 ) { - TraceEvent(SevError, "MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize).detail("Success", success); - } ASSERT(forcedAttributes.size() > 0); team.push_back((UID*)machineLocalityMap.getObject(forcedAttributes[0])); @@ -1442,8 +1438,8 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); addedMachineTeams++; - // Update the remaining machine team budget because the budget may decrease by any value between 1 and - // storageTeamSize + // Update the remaining machine team budget because the budget may decrease by + // any value between 1 and storageTeamSize remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) @@ -1674,26 +1670,6 @@ struct DDTeamCollection : ReferenceCounted { return std::pair, int>(retMT, minNumProcessTeams); } - // Find the machine team with the largest number of server teams - std::pair, int> getMachineTeamWithMostProcessTeams() { - Reference retMT; - int maxNumProcessTeams = std::numeric_limits::min(); - - for (auto& mt : machineTeams) { - if (EXPENSIVE_VALIDATION) { - ASSERT(isServerTeamNumberCorrect(mt)); - } - int size = mt->serverTeams.size(); - if ( size > maxNumProcessTeams) { - maxNumProcessTeams = mt->serverTeams.size(); - retMT = mt; - } - } - - return std::pair, int>(retMT, maxNumProcessTeams); - } - - int getHealthyMachineTeamCount() { int healthyTeamCount = 0; for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { @@ -1889,7 +1865,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + 
.detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1926,7 +1902,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -2048,7 +2024,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", self->doBuildTeams) + .detail("DoBuildTeams", self->doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -2448,8 +2424,10 @@ ACTOR Future teamRemover(DDTeamCollection* self) { // Check if a server will have 0 team after the team is removed for (auto& s : team->getServers()) { - if ( s->teams.size() == 0 ) { - TraceEvent(SevError, "TeamRemoverTooAggressive").detail("Server", s->id).detail("Team", team->getServerIDsStr()); + if (s->teams.size() == 0) { + TraceEvent(SevError, "TeamRemoverTooAggressive") + .detail("Server", s->id) + .detail("Team", team->getServerIDsStr()); self->traceAllInfo(true); } } From 8d5e8488081940fd4d4c4031fd0ee45a3ee4dc9f Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 14:22:41 -0700 Subject: [PATCH 024/136] QuitDatabase test: Check each server has at least 1 team --- fdbserver/QuietDatabase.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 72bda5cba6..4245495f07 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -308,7 +308,7 @@ ACTOR Future 
getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 || minServerTeamOnServer <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From 52efcfd136f0561469a6d04bb320b02343a689bc Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 27 Jun 2019 15:15:05 -0700 Subject: [PATCH 025/136] fix: properly create the right number for txsTags when changing between different numbers of logs --- fdbserver/TagPartitionedLogSystem.actor.cpp | 134 ++++++++++++-------- fdbserver/masterserver.actor.cpp | 6 - flow/ProtocolVersion.h | 2 +- 3 files changed, 83 insertions(+), 59 deletions(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index eafb2e9554..2abd10c8c5 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -1764,8 +1764,37 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted localTags = getLocalTags(remoteLocality, allTags); LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); + logSet->tLogLocalities.resize( remoteWorkers.remoteTLogs.size() ); + logSet->logServers.resize( remoteWorkers.remoteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSet->updateLocalitySet(localities); + state vector> remoteTLogInitializationReplies; vector< InitializeTLogRequest > remoteTLogReqs( remoteWorkers.remoteTLogs.size() ); + + if(oldLogSystem->logRouterTags == 0) { + std::vector locations; + for( Tag tag : localTags ) { + locations.clear(); + logSet->getPushLocations( vector(1, tag), locations, 0 ); 
+ for(int loc : locations) + remoteTLogReqs[ loc ].recoverTags.push_back( tag ); + } + + if(oldLogSystem->tLogs.size()) { + for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); + locations.clear(); + logSet->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + remoteTLogReqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < self->tLogs[0]->logServers.size(); i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } + localTags.push_back(txsTag); + } + } + for( int i = 0; i < remoteWorkers.remoteTLogs.size(); i++ ) { InitializeTLogRequest &req = remoteTLogReqs[i]; req.recruitmentID = self->recruitmentID; @@ -1785,20 +1814,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->logServers.size(); } - logSet->tLogLocalities.resize( remoteWorkers.remoteTLogs.size() ); - logSet->logServers.resize( remoteWorkers.remoteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSet->updateLocalitySet(localities); - - if(oldLogSystem->logRouterTags == 0) { - std::vector locations; - for( Tag tag : localTags ) { - locations.clear(); - logSet->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - remoteTLogReqs[ loc ].recoverTags.push_back( tag ); - } - } - for( int i = 0; i < remoteWorkers.remoteTLogs.size(); i++ ) remoteTLogInitializationReplies.push_back( transformErrors( throwErrorOr( remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor( remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); @@ -1940,6 +1955,36 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> initializationReplies; vector< InitializeTLogRequest > reqs( recr.tLogs.size() ); + + logSystem->tLogs[0]->tLogLocalities.resize( recr.tLogs.size() ); + logSystem->tLogs[0]->logServers.resize( recr.tLogs.size() ); // 
Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSystem->tLogs[0]->updateLocalitySet(localities); + + std::vector locations; + for( Tag tag : localTags ) { + locations.clear(); + logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + reqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + reqs[ logSystem->tLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); + } + if(oldLogSystem->tLogs.size()) { + for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); + locations.clear(); + logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + reqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < recr.tLogs.size(); i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } + localTags.push_back(txsTag); + } + for( int i = 0; i < recr.tLogs.size(); i++ ) { InitializeTLogRequest &req = reqs[i]; req.recruitmentID = logSystem->recruitmentID; @@ -1959,36 +2004,37 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->tLogLocalities.resize( recr.tLogs.size() ); - logSystem->tLogs[0]->logServers.resize( recr.tLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSystem->tLogs[0]->updateLocalitySet(localities); - - for(int i = 0; i < oldLogSystem->logRouterTags; i++) { - Tag tag = Tag(tagLocalityLogRouter, i); - reqs[ logSystem->tLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); - } - std::vector locations; - for( Tag tag : localTags ) { - locations.clear(); - logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - reqs[ loc ].recoverTags.push_back( tag ); - } - for( int i = 0; i < recr.tLogs.size(); i++ ) initializationReplies.push_back( 
transformErrors( throwErrorOr( recr.tLogs[i].tLog.getReplyUnlessFailedFor( reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); state std::vector> recoveryComplete; if(region.satelliteTLogReplicationFactor > 0) { - std::vector satelliteTags; - for(int i = 0; i < recr.tLogs.size(); i++) { - satelliteTags.push_back(Tag(tagLocalityTxs, i)); - } - satelliteTags.push_back(txsTag); - state vector> satelliteInitializationReplies; vector< InitializeTLogRequest > sreqs( recr.satelliteTLogs.size() ); + std::vector satelliteTags; + + for(int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } + if(oldLogSystem->tLogs.size()) { + for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < recr.tLogs.size(); i++) { + satelliteTags.push_back(Tag(tagLocalityTxs, i)); + } + satelliteTags.push_back(txsTag); + } + for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) { InitializeTLogRequest &req = sreqs[i]; req.recruitmentID = logSystem->recruitmentID; @@ -2008,22 +2054,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags; i++) { - Tag tag = i == -1 ? 
txsTag : Tag(tagLocalityLogRouter, i); - locations.clear(); - logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - sreqs[ loc ].recoverTags.push_back( tag ); - } - - for(int i = 0; i < recr.tLogs.size(); i++) { - Tag tag = Tag(tagLocalityTxs, i); - locations.clear(); - logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - sreqs[ loc ].recoverTags.push_back( tag ); - } - for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) satelliteInitializationReplies.push_back( transformErrors( throwErrorOr( recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 55de67106a..abb001d95e 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -675,12 +675,6 @@ ACTOR Future readTransactionSystemState( Reference self, Refer Standalone> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) ); self->allTags.clear(); - if(self->lastEpochEnd > 0) { - for(int i = 0; i < oldLogSystem->getLogSystemConfig().tLogs[0].tLogs.size(); i++) { - self->allTags.push_back(Tag(tagLocalityTxs, i)); - } - self->allTags.push_back(txsTag); - } if(self->forceRecovery) { self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality; diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index 95842aae94..f236a4fabf 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -85,7 +85,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B061020000LL, EndpointAddrList); PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, IPv6); PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, TLogVersion); - PROTOCOL_VERSION_FEATURE(0x0FDB00B061060000LL, PseudoLocalities); + PROTOCOL_VERSION_FEATURE(0x0FDB00B061070000LL, PseudoLocalities); }; // These 
impact both communications and the deserialization of certain database and IKeyValueStore keys. From ee41311a54b44163ea721c0ac8e54787c576af8e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 15:06:17 -0700 Subject: [PATCH 026/136] TeamCollection:Call addTeamsBestOf when remainingTeamBudget is not 0 --- fdbserver/DataDistribution.actor.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 58b62832d1..6316c2cc41 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1727,7 +1727,7 @@ struct DDTeamCollection : ReferenceCounted { // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, // build an extra machine team and record the event in trace int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) { - ASSERT(teamsToBuild > 0); + ASSERT(teamsToBuild >= 0); ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); int addedMachineTeams = 0; @@ -1852,6 +1852,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("Primary", primary) .detail("AddedTeamNumber", addedTeams) .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("RemainingTeamBudget", remainingTeamBudget) .detail("CurrentTeamNumber", teams.size()) .detail("DesiredTeamNumber", desiredTeamNumber) .detail("MaxTeamNumber", maxTeamNumber) @@ -1889,6 +1890,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("Primary", primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", 0) + .detail("RemainingTeamBudget", 0) .detail("CurrentTeamNumber", teams.size()) .detail("DesiredTeamNumber", desiredServerTeams) .detail("MaxTeamNumber", maxServerTeams) @@ -1965,7 +1967,7 @@ struct DDTeamCollection : ReferenceCounted { // teamsToBuild is calculated such that we will not build too many teams in the situation // when all (or most of) 
teams become unhealthy temporarily and then healthy again - state int teamsToBuild = std::min(desiredTeams - teamCount, maxTeams - totalTeamCount); + state int teamsToBuild = std::max(0, std::min(desiredTeams - teamCount, maxTeams - totalTeamCount)); TraceEvent("BuildTeamsBegin", self->distributorId) .detail("TeamsToBuild", teamsToBuild) @@ -1982,7 +1984,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MachineCount", self->machine_info.size()) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER); - if (teamsToBuild > 0) { + if (teamsToBuild > 0 || remainingTeamBudget > 0) { state vector> builtTeams; // addTeamsBestOf() will not add more teams than needed. @@ -2011,6 +2013,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("RemainingTeamBudget", remainingTeamBudget) .detail("CurrentTeamNumber", self->teams.size()) .detail("DesiredTeamNumber", desiredTeams) .detail("MaxTeamNumber", maxTeams) From 42620e4831d3af5ac489578e2a8563d0fb8c3dac Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 16:52:36 -0700 Subject: [PATCH 027/136] TeamCollectionTest:GetTeamCollectionValid wait until values are correct --- fdbserver/DataDistribution.actor.cpp | 5 +++-- fdbserver/QuietDatabase.actor.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 6316c2cc41..28da690d09 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3190,7 +3190,7 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - self->traceTeamCollectionInfo(); + //self->traceTeamCollectionInfo(); } } @@ -3458,7 +3458,6 @@ ACTOR Future dataDistributionTeamCollection( 
self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } - self->traceTeamCollectionInfo(); if(self->includedDCs.size()) { //start this actor before any potential recruitments can happen @@ -3472,6 +3471,8 @@ ACTOR Future dataDistributionTeamCollection( self->addActor.send(monitorHealthyTeams( self )); self->addActor.send(waitHealthyZoneChange( self )); + self->traceTeamCollectionInfo(); + // SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them loop choose { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 4245495f07..f5868713f1 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -323,7 +323,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); - return false; + wait(delay(10.0)); } else { return true; } From bc3e83363409e1bef67426cb5c1c3e31b18d9d4e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 16:53:01 -0700 Subject: [PATCH 028/136] TeamCollection: Add release note --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 2e07e729ae..6ca2490060 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -14,6 +14,7 @@ Fixes ----- * The ``fdbrestore`` commands ``abort``, ``wait``, and ``status`` would use a default cluster file instead of the destination cluster file argument. `(PR #1701) `_ +* Ensure new added machines are used to build teams and host data from existing machines when a cluster is expanded. 
`(PR #1764) `_ 6.1.9 ===== From 4fe3c7f749890c3812e43cc4aed5ece00c70fcc1 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 17:09:21 -0700 Subject: [PATCH 029/136] TeamCollectionInfo:Revert to original version where it is --- fdbserver/DataDistribution.actor.cpp | 5 ++--- fdbserver/QuietDatabase.actor.cpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 28da690d09..6316c2cc41 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3190,7 +3190,7 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - //self->traceTeamCollectionInfo(); + self->traceTeamCollectionInfo(); } } @@ -3458,6 +3458,7 @@ ACTOR Future dataDistributionTeamCollection( self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } + self->traceTeamCollectionInfo(); if(self->includedDCs.size()) { //start this actor before any potential recruitments can happen @@ -3471,8 +3472,6 @@ ACTOR Future dataDistributionTeamCollection( self->addActor.send(monitorHealthyTeams( self )); self->addActor.send(waitHealthyZoneChange( self )); - self->traceTeamCollectionInfo(); - // SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them loop choose { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index f5868713f1..2c7d3c80f6 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -323,7 +323,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); - 
wait(delay(10.0)); + wait(delay(5.0)); } else { return true; } From bc4548e0d32754cd57011f67c51e78f9492d63df Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 27 Jun 2019 17:55:41 -0700 Subject: [PATCH 030/136] Fix sed accidentally rewriting a trace event to have an invalid field name. --- fdbrpc/sim2.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 044fc790db..a7ee2623e9 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1698,7 +1698,7 @@ void startNewSimulator(bool objSerializer) { } ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { - TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriority::DefaultDelay", TaskPriority::DefaultDelay); + TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay); wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question From f889843332853a83699a804457f09ab9086ea22d Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 18:24:18 -0700 Subject: [PATCH 031/136] Change traceTeamCollectionInfo to actor There are cases where traceTeamCollectionInfo was called within the same execution block, i.e., no wait between the two traceTeamCollectionInfo calls. 
Because simulation uses the same time for all execution instructions in the same execution block, having more than one traceTeamCollectionInfo at the same time will mess up the trackLatest semantics. When one of them is always chosen by simulator, simulation test will report false positive error. Changing this function to actor and adding a small delay inside the function can solve this problem. --- fdbserver/DataDistribution.actor.cpp | 48 ++++++++++++++++++---------- fdbserver/QuietDatabase.actor.cpp | 2 +- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 6316c2cc41..689c4650f2 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -535,6 +535,7 @@ Future storageServerTracker( Version const& addedVersion); Future teamTracker(struct DDTeamCollection* const& self, Reference const& team, bool const& badTeam, bool const& redundantTeam); +ACTOR static Future traceTeamCollectionInfo(DDTeamCollection* self); struct DDTeamCollection : ReferenceCounted { enum { REQUESTING_WORKER = 0, GETTING_WORKER = 1, GETTING_STORAGE = 2 }; @@ -958,7 +959,7 @@ struct DDTeamCollection : ReferenceCounted { } // Trace and record the current number of teams for correctness test - self->traceTeamCollectionInfo(); + wait( self->traceTeamCollectionInfo(self) ); return Void(); } @@ -1873,29 +1874,29 @@ struct DDTeamCollection : ReferenceCounted { } // Check if the number of server (and machine teams) is larger than the maximum allowed number - void traceTeamCollectionInfo() { - int totalHealthyServerCount = calculateHealthyServerCount(); + ACTOR static Future traceTeamCollectionInfo(DDTeamCollection* self) { + int totalHealthyServerCount = self->calculateHealthyServerCount(); int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; int maxServerTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyServerCount; - int 
totalHealthyMachineCount = calculateHealthyMachineCount(); + int totalHealthyMachineCount = self->calculateHealthyMachineCount(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; - int healthyMachineTeamCount = getHealthyMachineTeamCount(); + int healthyMachineTeamCount = self->getHealthyMachineTeamCount(); - std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); - TraceEvent("TeamCollectionInfo", distributorId) - .detail("Primary", primary) + TraceEvent("TeamCollectionInfo", self->distributorId) + .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", 0) .detail("RemainingTeamBudget", 0) - .detail("CurrentTeamNumber", teams.size()) + .detail("CurrentTeamNumber", self->teams.size()) .detail("DesiredTeamNumber", desiredServerTeams) .detail("MaxTeamNumber", maxServerTeams) - .detail("StorageTeamSize", configuration.storageTeamSize) - .detail("CurrentMachineTeamNumber", machineTeams.size()) + .detail("StorageTeamSize", self->configuration.storageTeamSize) + .detail("CurrentMachineTeamNumber", self->machineTeams.size()) .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) @@ -1904,15 +1905,20 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + 
.detail("DoBuildTeams", self->doBuildTeams) .trackLatest("TeamCollectionInfo"); + // Advance time so that we will not have multiple TeamCollectionInfo at the same time, otherwise + // simulation test will randomly pick one TeamCollectionInfo trace, which could be the one before build teams + wait( delay(0.01) ); + // Debug purpose // if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) { // // When the number of machine teams is over the limit, print out the current team info. // traceAllInfo(true); // } + return Void(); } // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. @@ -2474,7 +2480,7 @@ ACTOR Future teamRemover(DDTeamCollection* self) { .detail("CurrentMachineTeamNumber", self->machineTeams.size()) .detail("DesiredMachineTeam", desiredMachineTeams) .detail("NumMachineTeamRemoved", numMachineTeamRemoved); - self->traceTeamCollectionInfo(); + wait( self->traceTeamCollectionInfo(self) ); } } } @@ -3087,6 +3093,7 @@ ACTOR Future storageServerTracker( } lastIsUnhealthy = status.isUnhealthy(); + state bool recordTeamCollectionInfo = false; choose { when( wait( failureTracker ) ) { // The server is failed AND all data has been removed from it, so permanently remove it. 
@@ -3190,7 +3197,8 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - self->traceTeamCollectionInfo(); + // wait( traceTeamCollectionInfo(self) ); + recordTeamCollectionInfo = true; } } @@ -3198,12 +3206,14 @@ ACTOR Future storageServerTracker( // We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality ); + // wait( traceTeamCollectionInfo(self) ); + recordTeamCollectionInfo = true; //Restart the storeTracker for the new interface storeTracker = keyValueStoreTypeTracker(self, server); hasWrongStoreTypeOrDC = false; self->doBuildTeams = true; self->restartTeamBuilder.trigger(); - self->traceTeamCollectionInfo(); + if(restartRecruiting) self->restartRecruiting.trigger(); } @@ -3224,6 +3234,10 @@ ACTOR Future storageServerTracker( server->wakeUpTracker = Promise(); } } + + if ( recordTeamCollectionInfo ) { + wait( self->traceTeamCollectionInfo(self) ); + } } } catch( Error &e ) { if (e.code() != error_code_actor_cancelled && errorOut.canBeSet()) @@ -3458,7 +3472,7 @@ ACTOR Future dataDistributionTeamCollection( self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } - self->traceTeamCollectionInfo(); + wait( self->traceTeamCollectionInfo(self) ); if(self->includedDCs.size()) { //start this actor before any potential recruitments can happen diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 2c7d3c80f6..4245495f07 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -323,7 +323,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) .detail("DesiredTeamsPerServer", 
SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); - wait(delay(5.0)); + return false; } else { return true; } From ce7eb10cacc1d07875d473e22bde0c09249d0adb Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 19:04:22 -0700 Subject: [PATCH 032/136] TeamCollectionInfo: Only count team number for healthy server and machine --- fdbserver/DataDistribution.actor.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 689c4650f2..5a848b4714 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1610,6 +1610,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info) { + if ( server_status.get(server.first).isUnhealthy() ) { + continue; + } if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } @@ -1624,6 +1627,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& machine : machine_info) { + if ( !isMachineHealthy(machine.second) ) { + continue; + } if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } From 4da345f7d2e1eeb1d7ebfabac6b4b3c4639cbea0 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 19:05:10 -0700 Subject: [PATCH 033/136] TeamCollectionTest:Remove test on minTeamOnServer --- fdbserver/QuietDatabase.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 4245495f07..886f7ad099 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -308,7 +308,7 @@ ACTOR Future getTeamCollectionValid(Database 
cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 || minServerTeamOnServer <= 0 ) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From 235697f688a31dc7bcb5da5c67aade1bc9efa6d7 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 27 Jun 2019 23:18:26 -0700 Subject: [PATCH 034/136] fix: txsTags are not popped at the recovery version --- fdbserver/OldTLogServer_6_0.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 2 +- fdbserver/workloads/LocalRatekeeper.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 6e3034821e..8337977ca1 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -401,7 +401,7 @@ struct LogData : NonCopyable, public ReferenceCounted { //only callable after getTagData returns a null reference Reference createTagData(Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered) { - if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { + if(tag.locality != tagLocalityLogRouter && tag.locality != tagLocalityTxs && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { popped = recoveredAt + 1; } Reference newTagData = Reference( new TagData(tag, popped, nothingPersistent, poppedRecently, unpoppedRecovered) ); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index e34052a0a4..83a89cb9b8 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -458,7 +458,7 @@ struct LogData : NonCopyable, 
public ReferenceCounted { //only callable after getTagData returns a null reference Reference createTagData(Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered) { - if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { + if(tag.locality != tagLocalityLogRouter && tag.locality != tagLocalityTxs && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { popped = recoveredAt + 1; } Reference newTagData = Reference( new TagData(tag, popped, 0, nothingPersistent, poppedRecently, unpoppedRecovered) ); diff --git a/fdbserver/workloads/LocalRatekeeper.actor.cpp b/fdbserver/workloads/LocalRatekeeper.actor.cpp index 95c7eea701..53c7f339f0 100644 --- a/fdbserver/workloads/LocalRatekeeper.actor.cpp +++ b/fdbserver/workloads/LocalRatekeeper.actor.cpp @@ -61,7 +61,7 @@ struct LocalRatekeeperWorkload : TestWorkload { state std::vector> requests; requests.reserve(100); loop { - state StorageQueuingMetricsReply metrics = wait(ssi.getQueuingMetrics.getReply(StorageQueuingMetricsRequest{})); + state StorageQueuingMetricsReply metrics = wait(brokenPromiseToNever(ssi.getQueuingMetrics.getReply(StorageQueuingMetricsRequest{}))); auto durabilityLag = metrics.version - metrics.durableVersion; double expectedRateLimit = 1.0; if (durabilityLag >= SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX) { From 2113d6d01e1ec5f1f9a2611a864f18c6d2f60251 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 27 Jun 2019 23:39:19 -0700 Subject: [PATCH 035/136] fix: peek all possible txsTags which could have been used by old log sets --- fdbserver/TagPartitionedLogSystem.actor.cpp | 33 ++++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2abd10c8c5..b47f32586d 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -746,9 
+746,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); + for(auto& it : oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + } + if(peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { std::vector< Reference > cursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } //SOMEDAY: remove once upgrades from 6.2 are no longer supported @@ -760,7 +765,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted= end) { std::vector< Reference > cursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); } //SOMEDAY: remove once upgrades from 6.2 are no longer supported @@ -776,7 +781,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted > localCursors; std::vector< Reference > allCursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); } @@ -792,7 +797,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted > cursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } //SOMEDAY: remove once upgrades from 6.2 are no longer supported @@ -1781,7 +1786,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + int maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); + for(auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, 
it.tLogs[0]->logServers.size()); + } + for(int i = -1; i < maxTxsTags; i++) { Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSet->getPushLocations( vector(1, tag), locations, 0 ); @@ -1867,6 +1876,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { + maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); + for(auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + } + } + if(region.satelliteTLogReplicationFactor > 0) { logSystem->tLogs.emplace_back(new LogSet()); if(recr.satelliteFallback) { @@ -1891,7 +1908,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->logServers.resize( recr.satelliteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),oldLogSystem->tLogs.size() ? oldLogSystem->tLogs[0]->logServers.size() : 0); + logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),maxTxsTags); logSystem->expectedLogSets++; } @@ -1972,7 +1989,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); } if(oldLogSystem->tLogs.size()) { - for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + for(int i = -1; i < maxTxsTags; i++) { Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); @@ -2022,7 +2039,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + for(int i = -1; i < maxTxsTags; i++) { Tag tag = i==-1 ? 
txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); From cb681693df715cc757cdf02e6792baae082f1628 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Fri, 28 Jun 2019 09:50:40 -0700 Subject: [PATCH 036/136] TeamCollection:Do NOT consider healthyness in counting team number If a team is removed from DD, it will be marked as failed and eventually removed from the global teams data structure. Team healthyness is likely to be a temporary state which can be changed rather quickly. --- fdbserver/DataDistribution.actor.cpp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 5a848b4714..11ce0063ff 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1510,12 +1510,7 @@ struct DDTeamCollection : ReferenceCounted { // Only pick healthy server, which is not failed or excluded. 
if (server_status.get(server.first).isUnhealthy()) continue; - int numTeams = 0; - for (auto& t : server.second->teams) { - if (!t->isWrongConfiguration() && t->isHealthy()) { - ++numTeams; - } - } + int numTeams = server.second->teams.size(); if (numTeams < minTeamNumber) { minTeamNumber = numTeams; leastUsedServers.clear(); @@ -1610,9 +1605,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info) { - if ( server_status.get(server.first).isUnhealthy() ) { - continue; - } + // if ( server_status.get(server.first).isUnhealthy() ) { + // continue; + // } if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } @@ -1623,13 +1618,13 @@ struct DDTeamCollection : ReferenceCounted { return std::make_pair(minTeamNumber, maxTeamNumber); } - std::pair calculateMinMaxMachineTeamNumOnMachine() { - uint32_t minTeamNumber = std::numeric_limits::max(); - uint32_t maxTeamNumber = std::numeric_limits::min(); + std::pair calculateMinMaxMachineTeamNumOnMachine() { + int minTeamNumber = std::numeric_limits::max(); + int maxTeamNumber = 0; for (auto& machine : machine_info) { - if ( !isMachineHealthy(machine.second) ) { - continue; - } + // if ( !isMachineHealthy(machine.second) ) { + // continue; + // } if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } From 7f4586ad497105f9f67b01bdb9de89475ffa889d Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 28 Jun 2019 12:33:24 -0700 Subject: [PATCH 037/136] the number of txsTags needs to be tracked separately from the number of transaction logs because of forced recoveries --- fdbserver/DBCoreState.h | 14 ++- fdbserver/LogSystemConfig.h | 14 ++- fdbserver/TagPartitionedLogSystem.actor.cpp | 115 +++++++++++++------- flow/ProtocolVersion.h | 1 + 4 files changed, 90 insertions(+), 54 deletions(-) diff --git 
a/fdbserver/DBCoreState.h b/fdbserver/DBCoreState.h index cef22c0b60..b5006d2e72 100644 --- a/fdbserver/DBCoreState.h +++ b/fdbserver/DBCoreState.h @@ -76,14 +76,15 @@ struct CoreTLogSet { struct OldTLogCoreData { std::vector tLogs; int32_t logRouterTags; + int32_t txsTags; Version epochEnd; std::set pseudoLocalities; - OldTLogCoreData() : epochEnd(0), logRouterTags(0) {} + OldTLogCoreData() : epochEnd(0), logRouterTags(0), txsTags(0) {} explicit OldTLogCoreData(const OldLogData&); bool operator == (OldTLogCoreData const& rhs) const { - return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && epochEnd == rhs.epochEnd && pseudoLocalities == rhs.pseudoLocalities; + return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && epochEnd == rhs.epochEnd && pseudoLocalities == rhs.pseudoLocalities; } template @@ -97,7 +98,7 @@ struct OldTLogCoreData { tLogs[0].tLogVersion = TLogVersion::V2; } if (ar.protocolVersion().hasPseudoLocalities()) { - serializer(ar, pseudoLocalities); + serializer(ar, pseudoLocalities, txsTags); } } }; @@ -105,12 +106,13 @@ struct OldTLogCoreData { struct DBCoreState { std::vector tLogs; int32_t logRouterTags; + int32_t txsTags; std::vector oldTLogData; DBRecoveryCount recoveryCount; // Increases with sequential successful recoveries. 
LogSystemType logSystemType; std::set pseudoLocalities; - DBCoreState() : logRouterTags(0), recoveryCount(0), logSystemType(LogSystemType::empty) {} + DBCoreState() : logRouterTags(0), txsTags(0), recoveryCount(0), logSystemType(LogSystemType::empty) {} vector getPriorCommittedLogServers() { vector priorCommittedLogServers; @@ -130,7 +132,7 @@ struct DBCoreState { } bool isEqual(DBCoreState const& r) const { - return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags && pseudoLocalities == r.pseudoLocalities; + return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags && txsTags == r.txsTags && pseudoLocalities == r.pseudoLocalities; } bool operator == ( const DBCoreState& rhs ) const { return isEqual(rhs); } @@ -146,7 +148,7 @@ struct DBCoreState { if(ar.protocolVersion().hasTagLocality()) { serializer(ar, tLogs, logRouterTags, oldTLogData, recoveryCount, logSystemType); if (ar.protocolVersion().hasPseudoLocalities()) { - serializer(ar, pseudoLocalities); + serializer(ar, pseudoLocalities, txsTags); } } else if(ar.isDeserializing) { tLogs.push_back(CoreTLogSet()); diff --git a/fdbserver/LogSystemConfig.h b/fdbserver/LogSystemConfig.h index b1947d1457..c7b0a592d2 100644 --- a/fdbserver/LogSystemConfig.h +++ b/fdbserver/LogSystemConfig.h @@ -157,9 +157,10 @@ struct OldTLogConf { std::vector tLogs; Version epochEnd; int32_t logRouterTags; + int32_t txsTags; std::set pseudoLocalities; - OldTLogConf() : epochEnd(0), logRouterTags(0) {} + OldTLogConf() : epochEnd(0), logRouterTags(0), txsTags(0) {} explicit OldTLogConf(const OldLogData&); std::string toString() const { @@ -167,7 +168,7 @@ struct OldTLogConf { } bool operator == ( const OldTLogConf& rhs ) const { - return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags && 
pseudoLocalities == rhs.pseudoLocalities; + return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && pseudoLocalities == rhs.pseudoLocalities; } bool isEqualIds(OldTLogConf const& r) const { @@ -184,7 +185,7 @@ struct OldTLogConf { template void serialize( Ar& ar ) { - serializer(ar, tLogs, epochEnd, logRouterTags, pseudoLocalities); + serializer(ar, tLogs, epochEnd, logRouterTags, pseudoLocalities, txsTags); } }; @@ -199,6 +200,7 @@ struct LogSystemConfig { LogSystemType logSystemType; std::vector tLogs; int32_t logRouterTags; + int32_t txsTags; std::vector oldTLogs; int32_t expectedLogSets; UID recruitmentID; @@ -206,7 +208,7 @@ struct LogSystemConfig { Optional recoveredAt; std::set pseudoLocalities; - LogSystemConfig() : logSystemType(LogSystemType::empty), logRouterTags(0), expectedLogSets(0), stopped(false) {} + LogSystemConfig() : logSystemType(LogSystemType::empty), logRouterTags(0), txsTags(0), expectedLogSets(0), stopped(false) {} std::string toString() const { return format("type: %d oldGenerations: %d tags: %d %s", logSystemType, oldTLogs.size(), logRouterTags, describe(tLogs).c_str()); @@ -327,7 +329,7 @@ struct LogSystemConfig { bool operator == ( const LogSystemConfig& rhs ) const { return isEqual(rhs); } bool isEqual(LogSystemConfig const& r) const { - return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && pseudoLocalities == r.pseudoLocalities; + return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && txsTags == r.txsTags && recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && pseudoLocalities == r.pseudoLocalities; } bool 
isEqualIds(LogSystemConfig const& r) const { @@ -358,7 +360,7 @@ struct LogSystemConfig { template void serialize( Ar& ar ) { - serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt, pseudoLocalities); + serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt, pseudoLocalities, txsTags); } }; diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index b47f32586d..ede5d4bb7b 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -44,15 +44,16 @@ ACTOR Future minVersionWhenReady( Future f, std::vector> tLogs; int32_t logRouterTags; + int32_t txsTags; Version epochEnd; std::set pseudoLocalities; - OldLogData() : epochEnd(0), logRouterTags(0) {} + OldLogData() : epochEnd(0), logRouterTags(0), txsTags(0) {} // Constructor for T of OldTLogConf and OldTLogCoreData template explicit OldLogData(const T& conf) - : logRouterTags(conf.logRouterTags), epochEnd(conf.epochEnd), + : logRouterTags(conf.logRouterTags), txsTags(conf.txsTags), epochEnd(conf.epochEnd), pseudoLocalities(conf.pseudoLocalities) { tLogs.resize(conf.tLogs.size()); @@ -123,7 +124,7 @@ TLogSet::TLogSet(const LogSet& rhs) : } OldTLogConf::OldTLogConf(const OldLogData& oldLogData) : - logRouterTags(oldLogData.logRouterTags), epochEnd(oldLogData.epochEnd), + logRouterTags(oldLogData.logRouterTags), txsTags(oldLogData.txsTags), epochEnd(oldLogData.epochEnd), pseudoLocalities(oldLogData.pseudoLocalities) { for (const Reference& logSet : oldLogData.tLogs) { @@ -146,7 +147,7 @@ CoreTLogSet::CoreTLogSet(const LogSet& logset) : } OldTLogCoreData::OldTLogCoreData(const OldLogData& oldData) : - logRouterTags(oldData.logRouterTags), epochEnd(oldData.epochEnd), + logRouterTags(oldData.logRouterTags), txsTags(oldData.txsTags), epochEnd(oldData.epochEnd), pseudoLocalities(oldData.pseudoLocalities) { for 
(const Reference& logSet : oldData.tLogs) { @@ -162,6 +163,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> tLogs; int expectedLogSets; int logRouterTags; + int txsTags; UID recruitmentID; int repopulateRegionAntiQuorum; bool stopped; @@ -188,7 +190,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted oldLogData; AsyncTrigger logSystemConfigChanged; - TagPartitionedLogSystem( UID dbgid, LocalityData locality, Optional>> addActor = Optional>>() ) : dbgid(dbgid), locality(locality), addActor(addActor), popActors(false), recoveryCompleteWrittenToCoreState(false), remoteLogsWrittenToCoreState(false), logSystemType(LogSystemType::empty), logRouterTags(0), expectedLogSets(0), hasRemoteServers(false), stopped(false), repopulateRegionAntiQuorum(0) {} + TagPartitionedLogSystem( UID dbgid, LocalityData locality, Optional>> addActor = Optional>>() ) : dbgid(dbgid), locality(locality), addActor(addActor), popActors(false), recoveryCompleteWrittenToCoreState(false), remoteLogsWrittenToCoreState(false), logSystemType(LogSystemType::empty), logRouterTags(0), txsTags(0), expectedLogSets(0), hasRemoteServers(false), stopped(false), repopulateRegionAntiQuorum(0) {} virtual void stopRejoins() { rejoins = Future(); @@ -264,6 +266,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.reserve(lsConf.tLogs.size()); logSystem->expectedLogSets = lsConf.expectedLogSets; logSystem->logRouterTags = lsConf.logRouterTags; + logSystem->txsTags = lsConf.txsTags; logSystem->recruitmentID = lsConf.recruitmentID; logSystem->stopped = lsConf.stopped; if(useRecoveredAt) { @@ -294,6 +297,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.emplace_back(new LogSet(tLogSet)); } logSystem->logRouterTags = lsConf.oldTLogs[0].logRouterTags; + logSystem->txsTags = lsConf.oldTLogs[0].txsTags; //logSystem->epochEnd = lsConf.oldTLogs[0].epochEnd; for (int i = 1; i < lsConf.oldTLogs.size(); i++ ) { @@ -316,6 +320,7 @@ struct 
TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size()) { @@ -746,10 +751,13 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); + int maxTxsTags = txsTags; + bool needsOldTxs = txsTags==0; for(auto& it : oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.txsTags==0; } + if(peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { std::vector< Reference > cursors; @@ -757,7 +765,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } @@ -769,7 +779,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } @@ -786,8 +798,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(localCursors, begin, localEnd, false) ); cursors[0] = Reference( new ILogSystem::BufferedCursor(allCursors, localEnd, end, false) ); @@ -801,7 +815,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } @@ -952,13 +968,13 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); i++) { + if(txsTags == 0) { + pop(upTo, txsTag, 0, popLocality); + } else { + for(int i = 0; i < txsTags; i++) { pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); } } - //SOMEDAY: remove once upgrades from 6.2 are no longer supported - pop(upTo, txsTag, 0, popLocality); } virtual void pop( Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality ) { @@ -1086,6 +1102,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrandomInt(0, tLogs[0]->logServers.size())); + if(txsTags==0) { + return txsTag; + } + return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, txsTags)); } ACTOR static Future monitorLog(Reference>> 
logServer, Reference> failed) { @@ -1529,6 +1548,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs = logServers; logSystem->logRouterTags = prevState.logRouterTags; + logSystem->txsTags = prevState.txsTags; logSystem->oldLogData = oldLogData; logSystem->logSystemType = prevState.logSystemType; logSystem->rejoins = rejoins; @@ -1786,21 +1806,26 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - int maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); + int maxTxsTags = oldLogSystem->txsTags; + bool needsOldTxs = oldLogSystem->txsTags==0; for(auto& it : oldLogSystem->oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.txsTags==0; } - for(int i = -1; i < maxTxsTags; i++) { + for(int i = needsOldTxs?-1:0; i < maxTxsTags; i++) { Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSet->getPushLocations( vector(1, tag), locations, 0 ); for(int loc : locations) remoteTLogReqs[ loc ].recoverTags.push_back( tag ); } - for(int i = 0; i < self->tLogs[0]->logServers.size(); i++) { - localTags.push_back(Tag(tagLocalityTxs, i)); + if(self->txsTags == 0) { + localTags.push_back(txsTag); + } else { + for(int i = 0; i < self->txsTags; i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } } - localTags.push_back(txsTag); } } @@ -1820,7 +1845,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedstartVersion; req.logRouterTags = 0; - req.txsTags = self->tLogs[0]->logServers.size(); + req.txsTags = self->txsTags; } for( int i = 0; i < remoteWorkers.remoteTLogs.size(); i++ ) @@ -1858,6 +1883,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrecoveredAt = oldLogSystem->recoverAt; logSystem->repopulateRegionAntiQuorum = configuration.repopulateRegionAntiQuorum; logSystem->recruitmentID = deterministicRandom()->randomUniqueID(); + logSystem->txsTags = recr.tLogs.size(); 
oldLogSystem->recruitmentID = logSystem->recruitmentID; if(configuration.usableRegions > 1) { @@ -1876,12 +1902,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); - for(auto& it : oldLogSystem->oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); - } + state int maxTxsTags = oldLogSystem->txsTags; + state bool needsOldTxs = oldLogSystem->txsTags==0; + for(auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.txsTags==0; } if(region.satelliteTLogReplicationFactor > 0) { @@ -1908,7 +1933,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->logServers.resize( recr.satelliteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),maxTxsTags); + logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,logSystem->txsTags,maxTxsTags); logSystem->expectedLogSets++; } @@ -1989,17 +2014,20 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); } if(oldLogSystem->tLogs.size()) { - for(int i = -1; i < maxTxsTags; i++) { + for(int i = needsOldTxs?-1:0; i < maxTxsTags; i++) { Tag tag = i==-1 ? 
txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); for(int loc : locations) reqs[ loc ].recoverTags.push_back( tag ); } - for(int i = 0; i < recr.tLogs.size(); i++) { - localTags.push_back(Tag(tagLocalityTxs, i)); + if(logSystem->txsTags == 0) { + localTags.push_back(txsTag); + } else { + for(int i = 0; i < logSystem->txsTags; i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } } - localTags.push_back(txsTag); } for( int i = 0; i < recr.tLogs.size(); i++ ) { @@ -2018,7 +2046,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->startVersion; req.logRouterTags = logSystem->logRouterTags; - req.txsTags = recr.tLogs.size(); + req.txsTags = logSystem->txsTags; } for( int i = 0; i < recr.tLogs.size(); i++ ) @@ -2039,17 +2067,20 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - for(int i = -1; i < maxTxsTags; i++) { + for(int i = needsOldTxs?-1:0; i < maxTxsTags; i++) { Tag tag = i==-1 ? 
txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); for(int loc : locations) sreqs[ loc ].recoverTags.push_back( tag ); } - for(int i = 0; i < recr.tLogs.size(); i++) { - satelliteTags.push_back(Tag(tagLocalityTxs, i)); + if(logSystem->txsTags == 0) { + satelliteTags.push_back(txsTag); + } else { + for(int i = 0; i < logSystem->txsTags; i++) { + satelliteTags.push_back(Tag(tagLocalityTxs, i)); + } } - satelliteTags.push_back(txsTag); } for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) { @@ -2068,7 +2099,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedknownCommittedVersion + 1; req.logRouterTags = logSystem->logRouterTags; - req.txsTags = recr.tLogs.size(); + req.txsTags = logSystem->txsTags; } for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index f236a4fabf..e35b7f197b 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -86,6 +86,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, IPv6); PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, TLogVersion); PROTOCOL_VERSION_FEATURE(0x0FDB00B061070000LL, PseudoLocalities); + PROTOCOL_VERSION_FEATURE(0x0FDB00B061070000LL, ShardedTxsTags); }; // These impact both communications and the deserialization of certain database and IKeyValueStore keys. From 2035b362573107f44ab57f837eb25a6cb938f4c7 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 28 Jun 2019 13:24:32 -0700 Subject: [PATCH 038/136] Make default and persistent options specifyable via annotations to fdb.options. Fix some issues with persisting these options in the multi-version client. Make size limit option not persistent. 
--- bindings/go/src/fdb/generated.go | 28 ++++- bindings/python/tests/size_limit.py | 66 ++++++----- bindings/python/tests/tester.py | 4 + .../sphinx/source/api-common.rst.inc | 2 +- fdbclient/DatabaseContext.h | 6 +- fdbclient/FDBOptions.h | 37 +++++- fdbclient/MultiVersionTransaction.actor.cpp | 47 +++++++- fdbclient/MultiVersionTransaction.h | 8 +- fdbclient/NativeAPI.actor.cpp | 112 ++++++++---------- fdbclient/NativeAPI.actor.h | 2 + fdbclient/ReadYourWrites.actor.cpp | 65 +++++----- fdbclient/ReadYourWrites.h | 5 +- fdbclient/ThreadSafeTransaction.actor.cpp | 4 + fdbclient/vexillographer/cpp.cs | 4 +- fdbclient/vexillographer/fdb.options | 23 ++-- fdbclient/vexillographer/vexillographer.cs | 13 +- 16 files changed, 276 insertions(+), 150 deletions(-) diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index 2a7c40f6aa..782b108fda 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -228,6 +228,30 @@ func (o NetworkOptions) SetEnableSlowTaskProfiling() error { return o.setOpt(71, nil) } +// Enable client buggify - will make requests randomly fail (intended for client testing) +func (o NetworkOptions) SetClientBuggifyEnable() error { + return o.setOpt(80, nil) +} + +// Disable client buggify +func (o NetworkOptions) SetClientBuggifyDisable() error { + return o.setOpt(81, nil) +} + +// Set the probability of a CLIENT_BUGGIFY section being active for the current execution. +// +// Parameter: probability expressed as a percentage between 0 and 100 +func (o NetworkOptions) SetClientBuggifySectionActivatedProbability(param int64) error { + return o.setOpt(82, int64ToBytes(param)) +} + +// Set the probability of an active CLIENT_BUGGIFY section being fired. 
A section will only fire if it was activated +// +// Parameter: probability expressed as a percentage between 0 and 100 +func (o NetworkOptions) SetClientBuggifySectionFiredProbability(param int64) error { + return o.setOpt(83, int64ToBytes(param)) +} + // Set the size of the client location cache. Raising this value can boost performance in very large databases where clients access data in a near-random pattern. Defaults to 100000. // // Parameter: Max location cache entries @@ -277,7 +301,7 @@ func (o DatabaseOptions) SetTransactionMaxRetryDelay(param int64) error { return o.setOpt(502, int64ToBytes(param)) } -// Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Default to 10,000,000 bytes. +// Set the maximum transaction size in bytes. This sets the ``size_limit`` option on each transaction created by this database. See the transaction option description for more information. // // Parameter: value in bytes func (o DatabaseOptions) SetTransactionSizeLimit(param int64) error { @@ -409,7 +433,7 @@ func (o TransactionOptions) SetMaxRetryDelay(param int64) error { return o.setOpt(502, int64ToBytes(param)) } -// Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Valid parameter values are ``[32, 10,000,000]```. +// Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. 
// // Parameter: value in bytes func (o TransactionOptions) SetSizeLimit(param int64) error { diff --git a/bindings/python/tests/size_limit.py b/bindings/python/tests/size_limit.py index 6d08f15efc..3072e153f8 100644 --- a/bindings/python/tests/size_limit.py +++ b/bindings/python/tests/size_limit.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -# size_limit.py +# size_limit_tests.py # # This source file is part of the FoundationDB open source project # @@ -21,44 +21,56 @@ import fdb import sys -fdb.api_version(610) +if __name__ == '__main__': + fdb.api_version(610) @fdb.transactional def setValue(tr, key, value): - tr[key] = value + tr[key] = value @fdb.transactional def setValueWithLimit(tr, key, value, limit): - tr.options.set_size_limit(limit) - tr[key] = value + tr.options.set_size_limit(limit) + tr[key] = value -def run(clusterFile): - db = fdb.open(clusterFile) - db.options.set_transaction_timeout(2000) # 2 seconds - db.options.set_transaction_retry_limit(3) - value = 'a' * 1024 +def test_size_limit_option(db): + db.options.set_transaction_timeout(2000) # 2 seconds + db.options.set_transaction_retry_limit(3) + value = 'a' * 1024 - setValue(db, 't1', value) - assert(value == db['t1']) + setValue(db, 't1', value) + assert(value == db['t1']) - try: - db.options.set_transaction_size_limit(1000) - setValue(db, 't2', value) - assert(False) # not reached - except fdb.impl.FDBError as e: - assert(e.code == 2101) # Transaction exceeds byte limit (2101) + try: + db.options.set_transaction_size_limit(1000) + setValue(db, 't2', value) + assert(False) # not reached + except fdb.FDBError as e: + assert(e.code == 2101) # Transaction exceeds byte limit (2101) - # Per transaction option overrides database option - db.options.set_transaction_size_limit(1000000) - try: - setValueWithLimit(db, 't3', value, 1000) - assert(False) # not reached - except fdb.impl.FDBError as e: - assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # Per transaction option overrides database 
option + db.options.set_transaction_size_limit(1000000) + try: + setValueWithLimit(db, 't3', value, 1000) + assert(False) # not reached + except fdb.FDBError as e: + assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # DB default survives on_error reset + db.options.set_transaction_size_limit(1000) + tr = db.create_transaction() + try: + tr['t4'] = 'bar' + tr.on_error(fdb.FDBError(1007)).wait() + setValue(tr, 't4', value) + tr.commit().wait() + assert(False) # not reached + except fdb.FDBError as e: + assert(e.code == 2101) # Transaction exceeds byte limit (2101) # Expect a cluster file as input. This test will write to the FDB cluster, so # be aware of potential side effects. if __name__ == '__main__': - clusterFile = sys.argv[1] - run(clusterFile) \ No newline at end of file + clusterFile = sys.argv[1] + db = fdb.open(clusterFile) + test_size_limit_option(db) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 3023cc5cb8..95aa36ea3e 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -48,6 +48,8 @@ from cancellation_timeout_tests import test_retry_limits from cancellation_timeout_tests import test_db_retry_limits from cancellation_timeout_tests import test_combinations +from size_limit_tests import test_size_limit_option + random.seed(0) if len(sys.argv) == 4: @@ -557,6 +559,8 @@ class Tester: test_locality(db) test_predicates() + test_size_limit_option(db) + except fdb.FDBError as e: print("Unit tests failed: %s" % e.description) traceback.print_exc() diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 292cd36ec5..3c99c45382 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -399,7 +399,7 @@ .. |option-set-size-limit-blurb| replace:: - Set the maximum transaction size limit in bytes. 
The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. The value set by this limit will persist across transaction resets. + Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. .. |option-set-timeout-blurb1| replace:: diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index eca185e8f8..537d329fce 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -154,10 +154,6 @@ public: int outstandingWatches; int maxOutstandingWatches; - double transactionTimeout; - int transactionMaxRetries; - double transactionMaxBackoff; - int transactionMaxSize; // Max size in bytes. 
int snapshotRywEnabled; Future logger; @@ -180,6 +176,8 @@ public: HealthMetrics healthMetrics; double healthMetricsLastUpdated; double detailedHealthMetricsLastUpdated; + + UniqueOrderedOptionList transactionDefaults; }; #endif diff --git a/fdbclient/FDBOptions.h b/fdbclient/FDBOptions.h index e23cab582a..dc3a1d0075 100644 --- a/fdbclient/FDBOptions.h +++ b/fdbclient/FDBOptions.h @@ -23,8 +23,11 @@ #define FDBCLIENT_FDBOPTIONS_H #include +#include #include +#include "flow/Arena.h" + struct FDBOptionInfo { std::string name; std::string comment; @@ -32,9 +35,14 @@ struct FDBOptionInfo { bool hasParameter; bool hidden; + bool persistent; - FDBOptionInfo(std::string name, std::string comment, std::string parameterComment, bool hasParameter, bool hidden) - : name(name), comment(comment), parameterComment(parameterComment), hasParameter(hasParameter), hidden(hidden) { } + // If non-negative, this specifies the code for the transaction option that this option is the default value for. + int defaultFor; + + FDBOptionInfo(std::string name, std::string comment, std::string parameterComment, bool hasParameter, bool hidden, bool persistent, int defaultFor) + : name(name), comment(comment), parameterComment(parameterComment), hasParameter(hasParameter), hidden(hidden), persistent(persistent), + defaultFor(defaultFor) { } FDBOptionInfo() { } }; @@ -54,6 +62,29 @@ public: FDBOptionInfoMap() { T::init(); } }; -#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden); +template +class UniqueOrderedOptionList { +public: + typedef std::list>>> OptionList; + +private: + OptionList options; + std::map optionsIndexMap; + +public: + void addOption(typename T::Option option, Optional> value) { + auto itr = optionsIndexMap.find(option); + if(itr != optionsIndexMap.end()) { + options.erase(itr->second); + } + options.push_back(std::make_pair(option, value)); + 
optionsIndexMap[option] = --options.end(); + } + + typename OptionList::const_iterator begin() const { return options.cbegin(); } + typename OptionList::const_iterator end() const { return options.cend(); } +}; + +#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor); #endif \ No newline at end of file diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index c2ef5ad1b6..1f51dd3680 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -408,17 +408,36 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void *hookParame } // MultiVersionTransaction -MultiVersionTransaction::MultiVersionTransaction(Reference db) : db(db) { +MultiVersionTransaction::MultiVersionTransaction(Reference db, UniqueOrderedOptionList defaultOptions) : db(db) { + setDefaultOptions(defaultOptions); updateTransaction(); } -// SOMEDAY: This function is unsafe if it's possible to set Database options that affect subsequently created transactions. There are currently no such options. 
+void MultiVersionTransaction::setDefaultOptions(UniqueOrderedOptionList options) { + MutexHolder holder(db->dbState->optionLock); + std::copy(options.begin(), options.end(), std::back_inserter(persistentOptions)); +} + void MultiVersionTransaction::updateTransaction() { auto currentDb = db->dbState->dbVar->get(); TransactionInfo newTr; if(currentDb.value) { newTr.transaction = currentDb.value->createTransaction(); + + Optional timeout; + for (auto option : persistentOptions) { + if(option.first == FDBTransactionOptions::TIMEOUT) { + timeout = option.second.castTo(); + } + else { + newTr.transaction->setOption(option.first, option.second.castTo()); + } + } + + if(timeout.present()) { + newTr.transaction->setOption(FDBTransactionOptions::TIMEOUT, timeout); + } } newTr.onChange = currentDb.onChange; @@ -574,6 +593,9 @@ Version MultiVersionTransaction::getCommittedVersion() { } void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Optional value) { + if(MultiVersionApi::apiVersionAtLeast(610) && FDBTransactionOptions::optionInfo[option].persistent) { + persistentOptions.push_back(std::make_pair(option, value.castTo>())); + } auto tr = getTransaction(); if(tr.transaction) { tr.transaction->setOption(option, value); @@ -593,6 +615,8 @@ ThreadFuture MultiVersionTransaction::onError(Error const& e) { } void MultiVersionTransaction::reset() { + persistentOptions.clear(); + setDefaultOptions(db->dbState->transactionDefaultOptions); updateTransaction(); } @@ -630,13 +654,12 @@ Reference MultiVersionDatabase::debugCreateFromExistingDatabase(Refer } Reference MultiVersionDatabase::createTransaction() { - return Reference(new MultiVersionTransaction(Reference::addRef(this))); + return Reference(new MultiVersionTransaction(Reference::addRef(this), dbState->transactionDefaultOptions)); } void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional value) { MutexHolder holder(dbState->optionLock); - auto itr = 
FDBDatabaseOptions::optionInfo.find(option); if(itr != FDBDatabaseOptions::optionInfo.end()) { TraceEvent("SetDatabaseOption").detail("Option", itr->second.name); @@ -646,11 +669,18 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional throw invalid_option(); } - if(dbState->db) { - dbState->db->setOption(option, value); + int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + if (defaultFor >= 0) { + ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != + FDBTransactionOptions::optionInfo.end()); + dbState->transactionDefaultOptions.addOption((FDBTransactionOptions::Option)defaultFor, value.castTo>()); } dbState->options.push_back(std::make_pair(option, value.castTo>())); + + if(dbState->db) { + dbState->db->setOption(option, value); + } } void MultiVersionDatabase::Connector::connect() { @@ -811,6 +841,11 @@ void MultiVersionDatabase::DatabaseState::cancelConnections() { // MultiVersionApi +bool MultiVersionApi::apiVersionAtLeast(int minVersion) { + ASSERT(MultiVersionApi::api->apiVersion != 0); + return MultiVersionApi::api->apiVersion >= minVersion; +} + // runOnFailedClients should be used cautiously. Some failed clients may not have successfully loaded all symbols. 
void MultiVersionApi::runOnExternalClients(std::function)> func, bool runOnFailedClients) { bool newFailure = false; diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 6ddaeac8fa..b1a1c3372a 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -210,7 +210,7 @@ class MultiVersionDatabase; class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted { public: - MultiVersionTransaction(Reference db); + MultiVersionTransaction(Reference db, UniqueOrderedOptionList defaultOptions); void cancel(); void setVersion(Version v); @@ -261,6 +261,9 @@ private: TransactionInfo getTransaction(); void updateTransaction(); + void setDefaultOptions(UniqueOrderedOptionList options); + + std::vector>>> persistentOptions; }; struct ClientInfo : ThreadSafeReferenceCounted { @@ -341,6 +344,7 @@ private: std::vector> connectionAttempts; std::vector>>> options; + UniqueOrderedOptionList transactionDefaultOptions; Mutex optionLock; }; @@ -370,6 +374,8 @@ public: bool callbackOnMainThread; bool localClientDisabled; + static bool apiVersionAtLeast(int minVersion); + private: MultiVersionApi(); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6cea540491..4f0770f680 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -279,7 +279,6 @@ struct TrInfoChunk { ACTOR static Future transactionInfoCommitActor(Transaction *tr, std::vector *chunks) { state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); - state int retryCount = 0; loop{ try { tr->reset(); @@ -296,9 +295,6 @@ ACTOR static Future transactionInfoCommitActor(Transaction *tr, std::vecto return Void(); } catch (Error& e) { - retryCount++; - if (retryCount == 10) - throw; wait(tr->onError(e)); } } @@ -516,15 +512,13 @@ DatabaseContext::DatabaseContext( lockAware(lockAware), apiVersion(apiVersion), provisional(false), 
transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), - transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1), + transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0) { metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; - transactionMaxBackoff = CLIENT_KNOBS->FAILURE_MAX_DELAY; - transactionMaxSize = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; snapshotRywEnabled = apiVersionAtLeast(300) ? 1 : 0; logger = databaseLogger( this ); @@ -745,52 +739,43 @@ uint64_t extractHexOption( StringRef value ) { } void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional value) { - switch(option) { - case FDBDatabaseOptions::LOCATION_CACHE_SIZE: - locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits::max()); - break; - case FDBDatabaseOptions::MACHINE_ID: - clientLocality = LocalityData( clientLocality.processId(), value.present() ? 
Standalone(value.get()) : Optional>(), clientLocality.machineId(), clientLocality.dcId() ); - if( clientInfo->get().proxies.size() ) - masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality ) ); - server_interf.clear(); - locationCache.insert( allKeys, Reference() ); - break; - case FDBDatabaseOptions::MAX_WATCHES: - maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES); - break; - case FDBDatabaseOptions::DATACENTER_ID: - clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? Standalone(value.get()) : Optional>()); - if( clientInfo->get().proxies.size() ) - masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality )); - server_interf.clear(); - locationCache.insert( allKeys, Reference() ); - break; - case FDBDatabaseOptions::TRANSACTION_TIMEOUT: - if( !apiVersionAtLeast(610) ) { - throw invalid_option(); - } - transactionTimeout = extractIntOption(value, 0, std::numeric_limits::max())/1000.0; - break; - case FDBDatabaseOptions::TRANSACTION_RETRY_LIMIT: - transactionMaxRetries = (int)extractIntOption(value, -1, std::numeric_limits::max()); - break; - case FDBDatabaseOptions::TRANSACTION_MAX_RETRY_DELAY: - validateOptionValue(value, true); - transactionMaxBackoff = extractIntOption(value, 0, std::numeric_limits::max()) / 1000.0; - break; - case FDBDatabaseOptions::TRANSACTION_SIZE_LIMIT: - validateOptionValue(value, true); - transactionMaxSize = extractIntOption(value, 32, CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT); - break; - case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE: - validateOptionValue(value, false); - snapshotRywEnabled++; - break; - case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE: - validateOptionValue(value, false); - snapshotRywEnabled--; - break; + int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + if (defaultFor >= 0) { + 
ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != + FDBTransactionOptions::optionInfo.end()); + transactionDefaults.addOption((FDBTransactionOptions::Option)option, value.castTo>()); + } + else { + switch(option) { + case FDBDatabaseOptions::LOCATION_CACHE_SIZE: + locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits::max()); + break; + case FDBDatabaseOptions::MACHINE_ID: + clientLocality = LocalityData( clientLocality.processId(), value.present() ? Standalone(value.get()) : Optional>(), clientLocality.machineId(), clientLocality.dcId() ); + if( clientInfo->get().proxies.size() ) + masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality ) ); + server_interf.clear(); + locationCache.insert( allKeys, Reference() ); + break; + case FDBDatabaseOptions::MAX_WATCHES: + maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES); + break; + case FDBDatabaseOptions::DATACENTER_ID: + clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? 
Standalone(value.get()) : Optional>()); + if( clientInfo->get().proxies.size() ) + masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality )); + server_interf.clear(); + locationCache.insert( allKeys, Reference() ); + break; + case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE: + validateOptionValue(value, false); + snapshotRywEnabled++; + break; + case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE: + validateOptionValue(value, false); + snapshotRywEnabled--; + break; + } } } @@ -839,6 +824,11 @@ Database Database::createDatabase( std::string connFileName, int apiVersion, Loc return Database::createDatabase(rccf, apiVersion, clientLocality); } +const UniqueOrderedOptionList& Database::getTransactionDefaults() const { + ASSERT(db); + return db->transactionDefaults; +} + extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs); Cluster::Cluster( Reference connFile, Reference> connectedCoordinatorsNum, int apiVersion ) @@ -2457,8 +2447,6 @@ double Transaction::getBackoff(int errCode) { } TransactionOptions::TransactionOptions(Database const& cx) { - maxBackoff = cx->transactionMaxBackoff; - sizeLimit = cx->transactionMaxSize; reset(cx); if (BUGGIFY) { commitOnFirstProxy = true; @@ -2472,11 +2460,9 @@ TransactionOptions::TransactionOptions() { } void TransactionOptions::reset(Database const& cx) { - double oldMaxBackoff = maxBackoff; - uint32_t oldSizeLimit = sizeLimit; memset(this, 0, sizeof(*this)); - maxBackoff = cx->apiVersionAtLeast(610) ? 
oldMaxBackoff : cx->transactionMaxBackoff; - sizeLimit = oldSizeLimit; + maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF; + sizeLimit = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; lockAware = cx->lockAware; } @@ -2503,7 +2489,6 @@ void Transaction::reset() { void Transaction::fullReset() { reset(); backoff = CLIENT_KNOBS->DEFAULT_BACKOFF; - options.maxBackoff = getDatabase()->transactionMaxBackoff; } int Transaction::apiVersionAtLeast(int minVersion) const { @@ -3150,8 +3135,7 @@ Future> Transaction::getVersionstamp() { } Future Transaction::onError( Error const& e ) { - if (e.code() == error_code_success) - { + if (e.code() == error_code_success) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || @@ -3175,7 +3159,7 @@ Future Transaction::onError( Error const& e ) { double backoff = getBackoff(e.code()); reset(); - return delay( backoff, info.taskID ); + return delay(backoff, info.taskID); } if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version) @@ -3187,7 +3171,7 @@ Future Transaction::onError( Error const& e ) { double maxBackoff = options.maxBackoff; reset(); - return delay( std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID ); + return delay(std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID); } if(g_network->isSimulated() && ++numErrors % 10 == 0) diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 0fbf76cfe4..4419646da3 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -90,6 +90,8 @@ public: inline DatabaseContext* extractPtr() { return db.extractPtr(); } DatabaseContext* operator->() const { return db.getPtr(); } + const UniqueOrderedOptionList& getTransactionDefaults() const; + private: Reference db; }; diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 7d006dd85a..8bc1f3683c 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ 
b/fdbclient/ReadYourWrites.actor.cpp @@ -1124,7 +1124,8 @@ public: }; ReadYourWritesTransaction::ReadYourWritesTransaction( Database const& cx ) : cache(&arena), writes(&arena), tr(cx), retries(0), creationTime(now()), commitStarted(false), options(tr), deferredError(cx->deferredError) { - resetTimeout(); + std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions)); + applyPersistentOptions(); } ACTOR Future timebomb(double endTime, Promise resetPromise) { @@ -1473,36 +1474,16 @@ void ReadYourWritesTransaction::writeRangeToNativeTransaction( KeyRangeRef const } ReadYourWritesTransactionOptions::ReadYourWritesTransactionOptions(Transaction const& tr) { - Database cx = tr.getDatabase(); - timeoutInSeconds = cx->transactionTimeout; - maxRetries = cx->transactionMaxRetries; reset(tr); } void ReadYourWritesTransactionOptions::reset(Transaction const& tr) { - double oldTimeout = timeoutInSeconds; - int oldMaxRetries = maxRetries; memset(this, 0, sizeof(*this)); - if( tr.apiVersionAtLeast(610) ) { - // Starting in API version 610, these options are not cleared after reset. 
- timeoutInSeconds = oldTimeout; - maxRetries = oldMaxRetries; - } - else { - Database cx = tr.getDatabase(); - maxRetries = cx->transactionMaxRetries; - timeoutInSeconds = cx->transactionTimeout; - } + timeoutInSeconds = 0.0; + maxRetries = -1; snapshotRywEnabled = tr.getDatabase()->snapshotRywEnabled; } -void ReadYourWritesTransactionOptions::fullReset(Transaction const& tr) { - reset(tr); - Database cx = tr.getDatabase(); - maxRetries = cx->transactionMaxRetries; - timeoutInSeconds = cx->transactionTimeout; -} - bool ReadYourWritesTransactionOptions::getAndResetWriteConflictDisabled() { bool disabled = nextWriteDisableConflictRange; nextWriteDisableConflictRange = false; @@ -1777,7 +1758,15 @@ Future> ReadYourWritesTransaction::getVersionstamp() { return waitOrError(tr.getVersionstamp(), resetPromise.getFuture()); } -void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { +void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { + setOptionImpl(option, value); + + if(FDBTransactionOptions::optionInfo[option].persistent) { + persistentOptions.push_back(std::make_pair(option, value.castTo>())); + } +} + +void ReadYourWritesTransaction::setOptionImpl( FDBTransactionOptions::Option option, Optional value ) { switch(option) { case FDBTransactionOptions::READ_YOUR_WRITES_DISABLE: validateOptionValue(value, false); @@ -1815,8 +1804,8 @@ void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, case FDBTransactionOptions::TIMEOUT: options.timeoutInSeconds = extractIntOption(value, 0, std::numeric_limits::max())/1000.0; - resetTimeout(); - break; + resetTimeout(); + break; case FDBTransactionOptions::RETRY_LIMIT: options.maxRetries = (int)extractIntOption(value, -1, std::numeric_limits::max()); @@ -1872,6 +1861,7 @@ void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) BOOST_N transactionDebugInfo = r.transactionDebugInfo; cache.arena = &arena; 
writes.arena = &arena; + persistentOptions = std::move(r.persistentOptions); } ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) BOOST_NOEXCEPT : @@ -1894,12 +1884,29 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& readConflicts = std::move(r.readConflicts); watchMap = std::move( r.watchMap ); r.resetPromise = Promise(); + persistentOptions = std::move(r.persistentOptions); } Future ReadYourWritesTransaction::onError(Error const& e) { return RYWImpl::onError( this, e ); } +void ReadYourWritesTransaction::applyPersistentOptions() { + Optional timeout; + for (auto option : persistentOptions) { + if(option.first == FDBTransactionOptions::TIMEOUT) { + timeout = option.second.castTo(); + } + else { + setOptionImpl(option.first, option.second.castTo()); + } + } + + if(timeout.present()) { + setOptionImpl(FDBTransactionOptions::TIMEOUT, timeout); + } +} + void ReadYourWritesTransaction::resetRyow() { Promise oldReset = resetPromise; resetPromise = Promise(); @@ -1917,7 +1924,7 @@ void ReadYourWritesTransaction::resetRyow() { if(tr.apiVersionAtLeast(16)) { options.reset(tr); - resetTimeout(); + applyPersistentOptions(); } if ( !oldReset.isSet() ) @@ -1933,9 +1940,11 @@ void ReadYourWritesTransaction::reset() { retries = 0; creationTime = now(); timeoutActor.cancel(); - options.fullReset(tr); + persistentOptions.clear(); + options.reset(tr); transactionDebugInfo.clear(); tr.fullReset(); + std::copy(tr.getDatabase().getTransactionDefaults().begin(), tr.getDatabase().getTransactionDefaults().end(), std::back_inserter(persistentOptions)); resetRyow(); } diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index c5d4e0fafc..f4b93eabcc 100644 --- a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -44,7 +44,6 @@ struct ReadYourWritesTransactionOptions { ReadYourWritesTransactionOptions() {} explicit ReadYourWritesTransactionOptions(Transaction const& tr); void reset(Transaction 
const& tr); - void fullReset(Transaction const& tr); bool getAndResetWriteConflictDisabled(); }; @@ -160,6 +159,10 @@ private: void debugLogRetries(Optional error = Optional()); + void setOptionImpl( FDBTransactionOptions::Option option, Optional value = Optional() ); + void applyPersistentOptions(); + + std::vector>>> persistentOptions; ReadYourWritesTransactionOptions options; }; diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 130b1652ce..d341ac3c5d 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -53,6 +53,8 @@ Reference ThreadSafeDatabase::createTransaction() { void ThreadSafeDatabase::setOption( FDBDatabaseOptions::Option option, Optional value) { DatabaseContext *db = this->db; Standalone> passValue = value; + + // ThreadSafeDatabase is not allowed to do anything with options except pass them through to RYW. onMainThreadVoid( [db, option, passValue](){ db->checkDeferredError(); db->setOption(option, passValue.contents()); @@ -274,6 +276,8 @@ ThreadFuture> ThreadSafeTransaction::getVersionstamp() { void ThreadSafeTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { ReadYourWritesTransaction *tr = this->tr; Standalone> passValue = value; + + // ThreadSafeTransaction is not allowed to do anything with options except pass them through to RYW. 
onMainThreadVoid( [tr, option, passValue](){ tr->setOption(option, passValue.contents()); }, &tr->deferredError ); } diff --git a/fdbclient/vexillographer/cpp.cs b/fdbclient/vexillographer/cpp.cs index 0d17cbdb5c..4ea844f6ca 100644 --- a/fdbclient/vexillographer/cpp.cs +++ b/fdbclient/vexillographer/cpp.cs @@ -47,8 +47,8 @@ namespace vexillographer private static string getCInfoLine(Option o, string indent, string structName) { - return String.Format("{0}ADD_OPTION_INFO({1}, {2}, \"{2}\", \"{3}\", \"{4}\", {5}, {6})", - indent, structName, o.name.ToUpper(), o.comment, o.getParameterComment(), (o.paramDesc != null).ToString().ToLower(), o.hidden.ToString().ToLower()); + return String.Format("{0}ADD_OPTION_INFO({1}, {2}, \"{2}\", \"{3}\", \"{4}\", {5}, {6}, {7}, {8})", + indent, structName, o.name.ToUpper(), o.comment, o.getParameterComment(), (o.paramDesc != null).ToString().ToLower(), o.hidden.ToString().ToLower(), o.persistent.ToString().ToLower(), o.defaultFor); } private static void writeCppInfo(TextWriter outFile, Scope scope, IEnumerable - diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index b69faad5a0..5f7f45ea19 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -282,6 +282,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( PROXY_SPIN_DELAY, 0.01 ); init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 ); init( MAX_TXS_POP_VERSION_HISTORY, 1e5 ); + init( PROXY_FORWARD_DELAY, 10.0 ); // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. 
DataDistibution) diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 1953b547b4..f8517ababe 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -227,6 +227,7 @@ public: double PROXY_SPIN_DELAY; double UPDATE_REMOTE_LOG_VERSION_INTERVAL; int MAX_TXS_POP_VERSION_HISTORY; + double PROXY_FORWARD_DELAY; // Master Server double COMMIT_SLEEP_TIME; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 8051ddb662..83cf0d51db 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -965,7 +965,7 @@ ACTOR Future commitBatch( break; } when(GetReadVersionReply v = wait(self->getConsistentReadVersion.getReply(GetReadVersionRequest(0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE | GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)))) { - if(v.version > self->committedVersion.get()) { + if(!v.newClientInfo.present() && v.version > self->committedVersion.get()) { self->locked = v.locked; self->metadataVersion = v.metadataVersion; self->committedVersion.set(v.version); @@ -1782,33 +1782,69 @@ ACTOR Future masterProxyServerCore( ACTOR Future checkRemoved(Reference> db, uint64_t recoveryCount, MasterProxyInterface myInterface) { loop{ - if (db->get().recoveryCount >= recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), myInterface)) - throw worker_removed(); + if (db->get().recoveryCount >= recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), myInterface)) { + throw worker_removed(); + } wait(db->onChange()); } } +ACTOR Future forwardProxy(ClientDBInfo info, RequestStream commit, RequestStream getConsistentReadVersion, RequestStream getKeyServersLocations) { + loop { + choose { + when(CommitTransactionRequest req = waitNext(commit.getFuture())) { + CommitID rep; + rep.newClientInfo = info; + req.reply.send(rep); + } + when(GetReadVersionRequest req = waitNext(getConsistentReadVersion.getFuture())) { + 
GetReadVersionReply rep; + rep.newClientInfo = info; + req.reply.send(rep); + } + when(GetKeyServerLocationsRequest req = waitNext(getKeyServersLocations.getFuture())) { + GetKeyServerLocationsReply rep; + rep.newClientInfo = info; + req.reply.send(rep); + } + } + wait(yield()); + } +} + ACTOR Future masterProxyServer( MasterProxyInterface proxy, InitializeMasterProxyRequest req, Reference> db, std::string whitelistBinPaths) { + state Future core; try { - state Future core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths); - loop choose{ - when(wait(core)) { return Void(); } - when(wait(checkRemoved(db, req.recoveryCount, proxy))) {} - } + core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths); + wait(core || checkRemoved(db, req.recoveryCount, proxy)); } catch (Error& e) { - if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed || e.code() == error_code_tlog_stopped || - e.code() == error_code_master_tlog_failed || e.code() == error_code_coordinators_changed || e.code() == error_code_coordinated_state_conflict || - e.code() == error_code_new_coordinators_timed_out) - { - TraceEvent("MasterProxyTerminated", proxy.id()).error(e, true); + TraceEvent("MasterProxyTerminated", proxy.id()).error(e, true); + + if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped && + e.code() != error_code_master_tlog_failed && e.code() != error_code_coordinators_changed && + e.code() != error_code_coordinated_state_conflict && e.code() != error_code_new_coordinators_timed_out) { + throw; + } + } + core.cancel(); + state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY); + loop { + if(finishForward.isReady()) { return Void(); } - throw; + if(db->get().client.proxies.size() > 0 && !db->get().client.proxies[0].provisional && db->get().recoveryCount >= 
req.recoveryCount + && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), proxy)) { + core = forwardProxy(db->get().client, proxy.commit, proxy.getConsistentReadVersion, proxy.getKeyServersLocations); + proxy = MasterProxyInterface(); + wait(finishForward); + return Void(); + } + wait(db->onChange() || finishForward); } } diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index d0ac5392a5..d7502b5f35 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -1357,7 +1357,7 @@ ACTOR Future masterCore( Reference self ) { // SOMEDAY: For faster recovery, do this and setDBState asynchronously and don't wait for them // unless we want to change TLogs wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)) ); - if(recoveryCommit.isReady() && recoveryCommit.get().isError()) { + if(recoveryCommit.isReady() && ( recoveryCommit.get().isError() || recoveryCommit.get().get().newClientInfo.present() )) { TEST(true); // Master recovery failed because of the initial commit failed throw master_recovery_failed(); } diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 0f4f064bf4..f7a67f09dc 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -354,7 +354,7 @@ struct ConsistencyCheckWorkload : TestWorkload ErrorOr shards = keyServerLocationFutures[i].get(); //If performing quiescent check, then all master proxies should be reachable. 
Otherwise, only one needs to be reachable - if (self->performQuiescentChecks && !shards.present()) + if (self->performQuiescentChecks && (!shards.present() || shards.get().newClientInfo.present())) { TraceEvent("ConsistencyCheck_MasterProxyUnavailable").detail("MasterProxyID", proxyInfo->getId(i)); self->testFailure("Master proxy unavailable"); @@ -363,7 +363,7 @@ struct ConsistencyCheckWorkload : TestWorkload //Get the list of shards if one was returned. If not doing a quiescent check, we can break if it is. //If we are doing a quiescent check, then we only need to do this for the first shard. - if (shards.present() && !keyServersInsertedForThisIteration) + if (shards.present() && !shards.get().newClientInfo.present() && !keyServersInsertedForThisIteration) { keyServers.insert(keyServers.end(), shards.get().results.begin(), shards.get().results.end()); keyServersInsertedForThisIteration = true; From a5a6f8431cb2404b69d58a15b0cbe6e9a7146dce Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 8 Jul 2019 14:01:04 -0700 Subject: [PATCH 082/136] Add a random UID to TransactionMetrics in case a client opens multiple connections and also a field to indicate whether the connection is internal. Convert some of the metrics to our Counter object instead of running totals. 
--- fdbbackup/backup.actor.cpp | 4 +- fdbcli/fdbcli.actor.cpp | 2 +- fdbclient/DatabaseContext.h | 40 +++++----- fdbclient/NativeAPI.actor.cpp | 93 ++++++++++++----------- fdbclient/NativeAPI.actor.h | 4 +- fdbclient/ThreadSafeTransaction.actor.cpp | 2 +- fdbserver/Restore.actor.cpp | 2 +- fdbserver/tester.actor.cpp | 2 +- fdbserver/worker.actor.cpp | 2 +- flow/Stats.actor.cpp | 20 +++-- flow/Stats.h | 21 +++++ 11 files changed, 114 insertions(+), 78 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 1e4fd786e2..5d62be9a8b 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3244,7 +3244,7 @@ int main(int argc, char* argv[]) { } try { - db = Database::createDatabase(ccf, -1, localities); + db = Database::createDatabase(ccf, -1, true, localities); } catch (Error& e) { fprintf(stderr, "ERROR: %s\n", e.what()); @@ -3266,7 +3266,7 @@ int main(int argc, char* argv[]) { } try { - sourceDb = Database::createDatabase(sourceCcf, -1, localities); + sourceDb = Database::createDatabase(sourceCcf, -1, true, localities); } catch (Error& e) { fprintf(stderr, "ERROR: %s\n", e.what()); diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index ebb63a82b9..a84712ddfd 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -2516,7 +2516,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { TraceEvent::setNetworkThread(); try { - db = Database::createDatabase(ccf, -1); + db = Database::createDatabase(ccf, -1, false); if (!opt.exec.present()) { printf("Using cluster file `%s'.\n", ccf->getFilename().c_str()); } diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 4c1c21dc6a..35eb8e6f71 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -58,7 +58,7 @@ public: ~DatabaseContext(); - Database clone() const { return Database(new DatabaseContext( cluster, clientInfo, clientInfoMonitor, dbId, taskID, clientLocality, enableLocalityLoadBalance, 
lockAware, apiVersion )); } + Database clone() const { return Database(new DatabaseContext( cluster, clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, internal, apiVersion )); } std::pair> getCachedLocation( const KeyRef&, bool isBackward = false ); bool getCachedLocations( const KeyRangeRef&, vector>>&, int limit, bool reverse ); @@ -97,8 +97,8 @@ public: //private: explicit DatabaseContext( Reference cluster, Reference> clientDBInfo, - Future clientInfoMonitor, Standalone dbId, TaskPriority taskID, LocalityData const& clientLocality, - bool enableLocalityLoadBalance, bool lockAware, int apiVersion = Database::API_VERSION_LATEST ); + Future clientInfoMonitor, TaskPriority taskID, LocalityData const& clientLocality, + bool enableLocalityLoadBalance, bool lockAware, bool internal = true, int apiVersion = Database::API_VERSION_LATEST ); explicit DatabaseContext( const Error &err ); @@ -133,22 +133,26 @@ public: std::map< UID, StorageServerInfo* > server_interf; - Standalone dbId; + UID dbId; + bool internal; + + CounterCollection cc; + + Counter transactionReadVersions; + Counter transactionLogicalReads; + Counter transactionPhysicalReads; + Counter transactionCommittedMutations; + Counter transactionCommittedMutationBytes; + Counter transactionsCommitStarted; + Counter transactionsCommitCompleted; + Counter transactionsTooOld; + Counter transactionsFutureVersions; + Counter transactionsNotCommitted; + Counter transactionsMaybeCommitted; + Counter transactionsResourceConstrained; + Counter transactionsProcessBehind; + Counter transactionWaitsForFullRecovery; - int64_t transactionReadVersions; - int64_t transactionLogicalReads; - int64_t transactionPhysicalReads; - int64_t transactionCommittedMutations; - int64_t transactionCommittedMutationBytes; - int64_t transactionsCommitStarted; - int64_t transactionsCommitCompleted; - int64_t transactionsTooOld; - int64_t transactionsFutureVersions; - int64_t transactionsNotCommitted; - 
int64_t transactionsMaybeCommitted; - int64_t transactionsResourceConstrained; - int64_t transactionsProcessBehind; - int64_t transactionWaitsForFullRecovery; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit; int outstandingWatches; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8ada99503f..b9cea4c1ad 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -208,24 +208,18 @@ template <> void addref( DatabaseContext* ptr ) { ptr->addref(); } template <> void delref( DatabaseContext* ptr ) { ptr->delref(); } ACTOR Future databaseLogger( DatabaseContext *cx ) { + state double lastLogged = 0; loop { - wait( delay( CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, cx->taskID ) ); - TraceEvent("TransactionMetrics") + wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, cx->taskID)); + TraceEvent ev("TransactionMetrics", cx->dbId); + + ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) .detail("Cluster", cx->cluster && cx->getConnectionFile() ? 
cx->getConnectionFile()->getConnectionString().clusterKeyName().toString() : "") - .detail("ReadVersions", cx->transactionReadVersions) - .detail("LogicalUncachedReads", cx->transactionLogicalReads) - .detail("PhysicalReadRequests", cx->transactionPhysicalReads) - .detail("CommittedMutations", cx->transactionCommittedMutations) - .detail("CommittedMutationBytes", cx->transactionCommittedMutationBytes) - .detail("CommitStarted", cx->transactionsCommitStarted) - .detail("CommitCompleted", cx->transactionsCommitCompleted) - .detail("TooOld", cx->transactionsTooOld) - .detail("FutureVersions", cx->transactionsFutureVersions) - .detail("NotCommitted", cx->transactionsNotCommitted) - .detail("MaybeCommitted", cx->transactionsMaybeCommitted) - .detail("ResourceConstrained", cx->transactionsResourceConstrained) - .detail("ProcessBehind", cx->transactionsProcessBehind) - .detail("MeanLatency", cx->latencies.mean()) + .detail("Internal", cx->internal); + + cx->cc.logToTraceEvent(ev); + + ev.detail("MeanLatency", cx->latencies.mean()) .detail("MedianLatency", cx->latencies.median()) .detail("Latency90", cx->latencies.percentile(0.90)) .detail("Latency98", cx->latencies.percentile(0.98)) @@ -245,12 +239,15 @@ ACTOR Future databaseLogger( DatabaseContext *cx ) { .detail("MeanBytesPerCommit", cx->bytesPerCommit.mean()) .detail("MedianBytesPerCommit", cx->bytesPerCommit.median()) .detail("MaxBytesPerCommit", cx->bytesPerCommit.max()); + cx->latencies.clear(); cx->readLatencies.clear(); cx->GRVLatencies.clear(); cx->commitLatencies.clear(); cx->mutationsPerCommit.clear(); cx->bytesPerCommit.clear(); + + lastLogged = now(); } } @@ -508,18 +505,21 @@ ACTOR static Future getHealthMetricsActor(DatabaseContext *cx, bo Future DatabaseContext::getHealthMetrics(bool detailed = false) { return getHealthMetricsActor(this, detailed); } - DatabaseContext::DatabaseContext( - Reference cluster, Reference> clientInfo, Future clientInfoMonitor, Standalone dbId, - TaskPriority taskID, LocalityData 
const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) - : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), dbId(dbId), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), - lockAware(lockAware), apiVersion(apiVersion), provisional(false), - transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), - transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), - transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1), + Reference cluster, Reference> clientInfo, Future clientInfoMonitor, + TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, bool internal, int apiVersion ) + : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), + lockAware(lockAware), apiVersion(apiVersion), provisional(false), cc("TransactionMetrics"), + transactionReadVersions("ReadVersions", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), + transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), + transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), + transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), + transactionsProcessBehind("ProcessBehind", cc), 
transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), - healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0) + healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal) { + dbId = deterministicRandom()->randomUniqueID(); + metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; @@ -539,7 +539,14 @@ DatabaseContext::DatabaseContext( clientStatusUpdater.actor = clientStatusUpdateActor(this); } -DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000) {} +DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("TransactionMetrics"), + transactionReadVersions("ReadVersions", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), + transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), + transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), + transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), + transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), + GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), + internal(false) {} ACTOR static Future monitorClientInfo( Reference>> clusterInterface, Reference ccf, Reference> 
outInfo, Reference> connectedCoordinatorsNumDelayed ) { try { @@ -632,11 +639,11 @@ Database DatabaseContext::create(Reference>> Reference> clientInfo(new AsyncVar()); Future clientInfoMonitor = delayedAsyncVar(connectedCoordinatorsNum, connectedCoordinatorsNumDelayed, CLIENT_KNOBS->CHECK_CONNECTED_COORDINATOR_NUM_DELAY) || monitorClientInfo(clusterInterface, connFile, clientInfo, connectedCoordinatorsNumDelayed); - return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false)); + return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, true)); } Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID, bool lockAware, int apiVersion) { - return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, LiteralStringRef(""), taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion ) ); + return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, true, apiVersion ) ); } DatabaseContext::~DatabaseContext() { @@ -816,7 +823,7 @@ Reference DatabaseContext::getConnectionFile() { return cluster->getConnectionFile(); } -Database Database::createDatabase( Reference connFile, int apiVersion, LocalityData const& clientLocality, DatabaseContext *preallocatedDb ) { +Database Database::createDatabase( Reference connFile, int apiVersion, bool internal, LocalityData const& clientLocality, DatabaseContext *preallocatedDb ) { Reference> connectedCoordinatorsNum(new AsyncVar(0)); // Number of connected coordinators for the client Reference> connectedCoordinatorsNumDelayed(new AsyncVar(0)); Reference cluster(new Cluster(connFile, connectedCoordinatorsNum, apiVersion)); @@ -825,18 
+832,18 @@ Database Database::createDatabase( Reference connFile, in DatabaseContext *db; if(preallocatedDb) { - db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); + db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, internal, apiVersion); } else { - db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); + db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, internal, apiVersion); } return Database(db); } -Database Database::createDatabase( std::string connFileName, int apiVersion, LocalityData const& clientLocality ) { +Database Database::createDatabase( std::string connFileName, int apiVersion, bool internal, LocalityData const& clientLocality ) { Reference rccf = Reference(new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFileName).first)); - return Database::createDatabase(rccf, apiVersion, clientLocality); + return Database::createDatabase(rccf, apiVersion, internal, clientLocality); } extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs); @@ -2718,7 +2725,7 @@ ACTOR static Future tryCommit( Database cx, Reference tr->versionstampPromise.send(ret); tr->numErrors = 0; - cx->transactionsCommitCompleted++; + ++cx->transactionsCommitCompleted; cx->transactionCommittedMutations += req.transaction.mutations.size(); cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize(); @@ -2793,7 +2800,7 @@ Future Transaction::commitMutations() { return Void(); } - cx->transactionsCommitStarted++; + ++cx->transactionsCommitStarted; if(options.readOnly) return transaction_read_only(); @@ -3126,7 +3133,7 @@ ACTOR 
Future extractReadVersion(DatabaseContext* cx, Reference Transaction::getReadVersion(uint32_t flags) { - cx->transactionReadVersions++; + ++cx->transactionReadVersions; flags |= options.getReadVersionFlags; auto& batcher = cx->versionBatcher[ flags ]; @@ -3162,15 +3169,15 @@ Future Transaction::onError( Error const& e ) { e.code() == error_code_cluster_not_fully_recovered) { if(e.code() == error_code_not_committed) - cx->transactionsNotCommitted++; + ++cx->transactionsNotCommitted; if(e.code() == error_code_commit_unknown_result) - cx->transactionsMaybeCommitted++; + ++cx->transactionsMaybeCommitted; if (e.code() == error_code_proxy_memory_limit_exceeded) - cx->transactionsResourceConstrained++; + ++cx->transactionsResourceConstrained; if (e.code() == error_code_process_behind) - cx->transactionsProcessBehind++; + ++cx->transactionsProcessBehind; if (e.code() == error_code_cluster_not_fully_recovered) { - cx->transactionWaitsForFullRecovery++; + ++cx->transactionWaitsForFullRecovery; } double backoff = getBackoff(e.code()); @@ -3181,9 +3188,9 @@ Future Transaction::onError( Error const& e ) { e.code() == error_code_future_version) { if( e.code() == error_code_transaction_too_old ) - cx->transactionsTooOld++; + ++cx->transactionsTooOld; else if( e.code() == error_code_future_version ) - cx->transactionsFutureVersions++; + ++cx->transactionsFutureVersions; double maxBackoff = options.maxBackoff; reset(); diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index b7c3aa6d71..92bde7817b 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -74,8 +74,8 @@ class Database { public: enum { API_VERSION_LATEST = -1 }; - static Database createDatabase( Reference connFile, int apiVersion, LocalityData const& clientLocality=LocalityData(), DatabaseContext *preallocatedDb=nullptr ); - static Database createDatabase( std::string connFileName, int apiVersion, LocalityData const& clientLocality=LocalityData() ); + static Database 
createDatabase( Reference connFile, int apiVersion, bool internal=true, LocalityData const& clientLocality=LocalityData(), DatabaseContext *preallocatedDb=nullptr ); + static Database createDatabase( std::string connFileName, int apiVersion, bool internal=true, LocalityData const& clientLocality=LocalityData() ); Database() {} // an uninitialized database can be destructed or reassigned safely; that's it void operator= ( Database const& rhs ) { db = rhs.db; } diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 130b1652ce..d074515dba 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -68,7 +68,7 @@ ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) onMainThreadVoid([db, connFile, apiVersion](){ try { - Database::createDatabase(connFile, apiVersion, LocalityData(), db).extractPtr(); + Database::createDatabase(connFile, apiVersion, false, LocalityData(), db).extractPtr(); } catch(Error &e) { new (db) DatabaseContext(e); diff --git a/fdbserver/Restore.actor.cpp b/fdbserver/Restore.actor.cpp index a221a8593d..81f82ae387 100644 --- a/fdbserver/Restore.actor.cpp +++ b/fdbserver/Restore.actor.cpp @@ -24,7 +24,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
ACTOR Future restoreWorker(Reference ccf, LocalityData locality) { - state Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST,locality); + state Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST, true, locality); state RestoreInterface interf; interf.initEndpoints(); state Optional leaderInterf; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 3e977c9493..2904d96015 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -505,7 +505,7 @@ ACTOR Future testerServerWorkload( WorkloadRequest work, Reference workerServer( if(metricsPrefix.size() > 0) { if( metricsConnFile.size() > 0) { try { - state Database db = Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, locality); + state Database db = Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, true, locality); metricsLogger = runMetrics( db, KeyRef(metricsPrefix) ); } catch(Error &e) { TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile); diff --git a/flow/Stats.actor.cpp b/flow/Stats.actor.cpp index 8d0afa7455..751130bc25 100644 --- a/flow/Stats.actor.cpp +++ b/flow/Stats.actor.cpp @@ -69,6 +69,13 @@ void Counter::clear() { metric = 0; } +void CounterCollection::logToTraceEvent(TraceEvent &te) const { + for (ICounter* c : counters) { + te.detail(c->getName().c_str(), c); + c->resetInterval(); + } +} + ACTOR Future traceCounters(std::string traceEventName, UID traceEventID, double interval, CounterCollection* counters, std::string trackLatestName) { wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized @@ -80,15 +87,12 @@ ACTOR Future traceCounters(std::string traceEventName, UID traceEventID, d loop{ TraceEvent te(traceEventName.c_str(), traceEventID); te.detail("Elapsed", now() - last_interval); - for (ICounter* c : counters->counters) { - if (c->hasRate() && 
c->hasRoughness()) - te.detailf(c->getName().c_str(), "%g %g %lld", c->getRate(), c->getRoughness(), (long long)c->getValue()); - else - te.detail(c->getName().c_str(), c->getValue()); - c->resetInterval(); - } - if (!trackLatestName.empty()) + + counters->logToTraceEvent(te); + + if (!trackLatestName.empty()) { te.trackLatest(trackLatestName.c_str()); + } last_interval = now(); wait(delay(interval)); diff --git a/flow/Stats.h b/flow/Stats.h index 8044c4d802..24481c3024 100644 --- a/flow/Stats.h +++ b/flow/Stats.h @@ -62,12 +62,26 @@ struct ICounter { virtual void remove() {} }; +template<> +struct Traceable : std::true_type { + static std::string toString(ICounter const *counter) { + if (counter->hasRate() && counter->hasRoughness()) { + return format("%g %g %lld", counter->getRate(), counter->getRoughness(), (long long)counter->getValue()); + } + else { + return format("%lld", (long long)counter->getValue()); + } + } +}; + struct CounterCollection { CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {} std::vector counters, counters_to_remove; ~CounterCollection() { for (auto c : counters_to_remove) c->remove(); } std::string name; std::string id; + + void logToTraceEvent(TraceEvent& te) const; }; struct Counter : ICounter, NonCopyable { @@ -97,6 +111,13 @@ private: Int64MetricHandle metric; }; +template<> +struct Traceable : std::true_type { + static std::string toString(Counter const& counter) { + return Traceable::toString((ICounter const*)&counter); + } +}; + template struct SpecialCounter : ICounter, FastAllocated>, NonCopyable { SpecialCounter(CounterCollection& collection, std::string const& name, F && f) : name(name), f(f) { collection.counters.push_back(this); collection.counters_to_remove.push_back(this); } From 6c8f50ca669b0c72fc9f7149988bce9c431957aa Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 8 Jul 2019 22:13:09 -0700 Subject: [PATCH 083/136] Improve the behavior of parallelPeekMore+onlySpilled. 
When onlySpilled transitions from true (don't peek memory) to false (do peek memory) as part of a parallel peek, we'll end up wasting the rest of the replies because we'll honor their onlySpilled=true setting and thus not have any additional data to return. Instead, we thread the onlySpilled back through in the same way that the ending version of the last peek is used to override the requested starting version of the next peek. This simulates the same behavior that the client has, where the value of onlySpilled that we reply with comes back in the next request. I haven't actually seen it be a problem, but this should help make sure the onlySpilled transition when catching up doesn't ever cause any ill effects if a process starts riding the line between onlySpilled settings. --- fdbserver/OldTLogServer_6_0.actor.cpp | 15 ++++++++------- fdbserver/TLogServer.actor.cpp | 15 ++++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 227578d49f..07a3bee98d 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -265,7 +265,7 @@ struct TLogData : NonCopyable { int64_t overheadBytesDurable; struct PeekTrackerData { - std::map> sequence_version; + std::map>> sequence_version; double lastUpdate; }; @@ -1030,8 +1030,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = ver; + std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + req.begin = prevPeekData.first; + req.onlySpilled = prevPeekData.second; wait(yield()); } } catch( Error &e ) { @@ -1089,13 +1090,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - 
if(sequenceData.getFuture().get() != rep.end) { + if(sequenceData.getFuture().get().first != rep.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(rep.end); + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } rep.begin = req.begin; } @@ -1163,13 +1164,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - if(sequenceData.getFuture().get() != reply.end) { + if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(reply.end); + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } reply.begin = req.begin; } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 96a63c1d39..dc4728882f 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -315,7 +315,7 @@ struct TLogData : NonCopyable { int64_t overheadBytesDurable; struct PeekTrackerData { - std::map> sequence_version; + std::map>> sequence_version; double lastUpdate; }; @@ -1317,8 +1317,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = ver; + std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + req.begin = prevPeekData.first; + req.onlySpilled = prevPeekData.second; wait(yield()); } } catch( Error &e ) { @@ -1376,13 +1377,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - if(sequenceData.getFuture().get() != rep.end) { + if(sequenceData.getFuture().get().first != 
rep.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(rep.end); + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } rep.begin = req.begin; } @@ -1537,13 +1538,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - if(sequenceData.getFuture().get() != reply.end) { + if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(reply.end); + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } reply.begin = req.begin; } From d2ef84a8f964372f971f8ab9ecdf7bc8584df1cb Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 8 Jul 2019 22:22:45 -0700 Subject: [PATCH 084/136] Add a TLogVersion::V4 And refactor some code to make adding more TLogVersions easier. 
--- fdbclient/FDBTypes.h | 4 ++- fdbserver/SimulatedCluster.actor.cpp | 24 ++++++------------ fdbserver/worker.actor.cpp | 25 ++++++++++++++++--- .../workloads/ConfigureDatabase.actor.cpp | 6 ++++- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 31c246ffcb..e78a5f1813 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -601,8 +601,9 @@ struct TLogVersion { // V1 = 1, // 4.6 is dispatched to via 6.0 V2 = 2, // 6.0 V3 = 3, // 6.1 + V4 = 4, // 6.2 MIN_SUPPORTED = V2, - MAX_SUPPORTED = V3, + MAX_SUPPORTED = V4, MIN_RECRUITABLE = V2, DEFAULT = V3, } version; @@ -624,6 +625,7 @@ struct TLogVersion { static ErrorOr FromStringRef( StringRef s ) { if (s == LiteralStringRef("2")) return V2; if (s == LiteralStringRef("3")) return V3; + if (s == LiteralStringRef("4")) return V4; return default_error_or(); } }; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 81330eac10..0257563af1 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -850,23 +850,15 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR } if (deterministicRandom()->random01() < 0.5) { - if (deterministicRandom()->random01() < 0.5) { - set_config("log_spill:=1"); // VALUE - } - int logVersion = deterministicRandom()->randomInt( 0, 3 ); - switch (logVersion) { - case 0: - break; - case 1: - set_config("log_version:=2"); // 6.0 - break; - case 2: - set_config("log_version:=3"); // 6.1 - break; - } + int logSpill = deterministicRandom()->randomInt( TLogSpillType::VALUE, TLogSpillType::END ); + set_config(format("log_spill:=%d", logSpill)); + int logVersion = deterministicRandom()->randomInt( TLogVersion::MIN_RECRUITABLE, TLogVersion::MAX_SUPPORTED+1 ); + set_config(format("log_version:=%d", logVersion)); } else { - set_config("log_version:=3"); // 6.1 - set_config("log_spill:=2"); // REFERENCE + if 
(deterministicRandom()->random01() < 0.7) + set_config(format("log_version:=%d", TLogVersion::MAX_SUPPORTED)); + if (deterministicRandom()->random01() < 0.5) + set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); } if(generateFearless || (datacenters == 2 && deterministicRandom()->random01() < 0.5)) { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 5f22334b44..0025e0aebc 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -278,10 +278,27 @@ struct TLogOptions { TLogFn tLogFnForOptions( TLogOptions options ) { auto tLogFn = tLog; - if ( options.version == TLogVersion::V2 && options.spillType == TLogSpillType::VALUE) return oldTLog_6_0::tLog; - if ( options.version == TLogVersion::V2 && options.spillType == TLogSpillType::REFERENCE) ASSERT(false); - if ( options.version == TLogVersion::V3 && options.spillType == TLogSpillType::VALUE ) return oldTLog_6_0::tLog; - if ( options.version == TLogVersion::V3 && options.spillType == TLogSpillType::REFERENCE) return tLog; + if ( options.spillType == TLogSpillType::VALUE ) { + switch (options.version) { + case TLogVersion::V2: + case TLogVersion::V3: + case TLogVersion::V4: + return oldTLog_6_0::tLog; + default: + ASSERT(false); + } + } + if ( options.spillType == TLogSpillType::REFERENCE ) { + switch (options.version) { + case TLogVersion::V2: + ASSERT(false); + case TLogVersion::V3: + case TLogVersion::V4: + return tLog; + default: + ASSERT(false); + } + } ASSERT(false); return tLogFn; } diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index b49b7b8f82..6ebabc899c 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -27,7 +27,11 @@ // "ssd" is an alias to the preferred type which skews the random distribution toward it but that's okay. 
static const char* storeTypes[] = { "ssd", "ssd-1", "ssd-2", "memory", "memory-1", "memory-2" }; -static const char* logTypes[] = { "log_engine:=1", "log_engine:=2", "log_spill:=1", "log_spill:=2", "log_version:=2", "log_version:=3" }; +static const char* logTypes[] = { + "log_engine:=1", "log_engine:=2", + "log_spill:=1", "log_spill:=2", + "log_version:=2", "log_version:=3", "log_version:=4" +}; static const char* redundancies[] = { "single", "double", "triple" }; std::string generateRegions() { From 44f11702a864f1219aa288543cb126b1ea4c7bf6 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 8 Jul 2019 22:25:01 -0700 Subject: [PATCH 085/136] Log Routers will prefer to peek from satellite logs. Formerly, they would prefer to peek from the primary's logs. Testing of a failed region rejoining the cluster revealed that this becomes quite a strain on the primary logs when extremely large volumes of peek requests are coming from the Log Routers. It happens that we have satellites that contain the same mutations with Log Router tags, that have no other peeking load, so we can prefer to use the satellite to peek rather than the primary to distribute load across TLogs better. Unfortunately, this revealed a latent bug in how tagged mutations in the KnownCommittedVersion->RecoveryVersion gap were copied across generations when the number of log router tags were decreased. Satellite TLogs would be assigned log router tags using the team-building based logic in getPushLocations(), whereas TLogs would internally re-index tags according to tag.id%logRouterTags. This mismatch would mean that we could have: Log0 -2:0 ----- -2:0 Log 0 Log1 -2:1 \ >--- -2:1,-2:0 (-2:2 mod 2 becomes -2:0) Log 1 Log2 -2:2 / And now we have data that's tagged as -2:0 on a TLog that's not the preferred location for -2:0, and therefore a BestLocationOnly cursor would miss the mutations. This was never noticed before, as we never used a satellite as a preferred location to peek from. 
Merge cursors always peek from all locations, and thus a peek for -2:0 that needed data from the satellites would have gone to both TLogs and merged the results. We now take this mod-based re-indexing into account when assigning which TLogs need to recover which tags from the previous generation, to make sure that tag.id%logRouterTags always results in the assigned TLog being the preferred location. Unfortunately, previously existing clusters will potentially have satellites with log router tags indexed incorrectly, so this transition needs to be gated on a `log_version` transition. Old LogSets will have an old LogVersion, and we won't prefer the satellite for peeking. Log Sets post-6.2 (opt-in) or post-6.3 (default) will be indexed correctly, and therefore we can safely offload peeking onto the satellites. --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/TLogServer.actor.cpp | 4 ++ fdbserver/TagPartitionedLogSystem.actor.cpp | 79 ++++++++++++++++----- 4 files changed, 69 insertions(+), 16 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 06798a485a..a20f1c7fa9 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -68,6 +68,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; init( VERSIONS_PER_BATCH, VERSIONS_PER_SECOND/20 ); if( randomize && BUGGIFY ) VERSIONS_PER_BATCH = std::max(1,VERSIONS_PER_SECOND/1000); init( CONCURRENT_LOG_ROUTER_READS, 1 ); + init( LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED, 1 ); if( randomize && BUGGIFY ) LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED = 0; init( DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME, 1.0 ); init( DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME, 5.0 ); init( TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES, 2e9 ); if ( randomize && BUGGIFY ) TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES = 2e6; diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index dab19d5108..1342184cab 100644 --- 
a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -72,6 +72,7 @@ public: int64_t MAX_QUEUE_COMMIT_BYTES; int64_t VERSIONS_PER_BATCH; int CONCURRENT_LOG_ROUTER_READS; + int LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED; // 0==peek from primary, non-zero==peek from satellites double DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME; double DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME; int64_t TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 96a63c1d39..20e753ed9c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -2236,6 +2236,10 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st state Version tagAt = beginVersion; state Version lastVer = 0; + if (endVersion.present()) { + TraceEvent("TLogRestoreReplicationFactor", self->dbgid).detail("LogId", logData->logId).detail("Locality", logData->locality).detail("RecoverFrom", beginVersion).detail("RecoverTo", endVersion.get()); + } + while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..e5ae6550ee 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -803,27 +803,52 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> localSets; - int bestSet = 0; + int bestPrimarySet = 0; + int bestSatelliteSet = -1; for(auto& log : tLogs) { if(log->isLocal && log->logServers.size()) { TraceEvent("TLogPeekLogRouterLocalSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogServers", log->logServerString()); localSets.push_back(log); - if(log->locality != tagLocalitySatellite) { - bestSet = localSets.size() - 1; + if(log->locality == tagLocalitySatellite) { + bestSatelliteSet = localSets.size() - 1; + } else { + bestPrimarySet = localSets.size() - 1; } } } + int bestSet = bestPrimarySet; + if 
(SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && + bestSatelliteSet != -1 && + tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4 ) { + bestSet = bestSatelliteSet; + } TraceEvent("TLogPeekLogRouterSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); //FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies across the WAN return Reference( new ILogSystem::SetPeekCursor( localSets, bestSet, localSets[bestSet]->bestLocationFor( tag ), tag, begin, getPeekEnd(), true ) ); } else { - for( auto& log : tLogs ) { - if(log->logServers.size() && log->isLocal && log->locality != tagLocalitySatellite) { - TraceEvent("TLogPeekLogRouterBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogId", log->logServers[log->bestLocationFor( tag )]->get().id()); - return Reference( new ILogSystem::ServerPeekCursor( log->logServers[log->bestLocationFor( tag )], tag, begin, getPeekEnd(), false, true ) ); + int bestPrimarySet = -1; + int bestSatelliteSet = -1; + for( int i = 0; i < tLogs.size(); i++ ) { + const auto& log = tLogs[i]; + if(log->logServers.size() && log->isLocal) { + if (log->locality == tagLocalitySatellite) { + bestSatelliteSet = i; + break; + } else { + if (bestPrimarySet == -1) bestPrimarySet = i; + } } } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && + bestSatelliteSet != -1 && + tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4 ) { + bestSet = bestSatelliteSet; + } + const auto& log = tLogs[bestSet]; + TraceEvent("TLogPeekLogRouterBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogId", log->logServers[log->bestLocationFor( tag )]->get().id()); + return Reference( new ILogSystem::ServerPeekCursor( log->logServers[log->bestLocationFor( tag )], tag, begin, getPeekEnd(), false, true ) ); } } bool firstOld = true; @@ -836,17 +861,26 @@ struct TagPartitionedLogSystem : ILogSystem, 
ReferenceCounted> localSets; for(auto& log : old.tLogs) { if(log->isLocal && log->logServers.size()) { TraceEvent("TLogPeekLogRouterOldLocalSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogServers", log->logServerString()); localSets.push_back(log); - if(log->locality != tagLocalitySatellite) { - bestSet = localSets.size()-1; + if(log->locality == tagLocalitySatellite) { + bestSatelliteSet = localSets.size() - 1; + } else { + bestPrimarySet = localSets.size() - 1; } } } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && + bestSatelliteSet != -1 && + old.tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4 ) { + bestSet = bestSatelliteSet; + } TraceEvent("TLogPeekLogRouterOldSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("OldEpoch", old.epochEnd).detail("RecoveredAt", recoveredAt.present() ? recoveredAt.get() : -1).detail("FirstOld", firstOld); //FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies across the WAN @@ -1949,12 +1983,25 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags; } - for(int i = -1; i < oldLogSystem->logRouterTags; i++) { - Tag tag = i == -1 ? txsTag : Tag(tagLocalityLogRouter, i); - locations.clear(); - logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - sreqs[ loc ].recoverTags.push_back( tag ); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( {txsTag}, locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( txsTag ); + + if (logSystem->logRouterTags) { + for(int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + // Sattelite logs will index a mutation with tagLocalityLogRouter with an id greater than + // the number of log routers as having an id mod the number of log routers. 
We thus need + // to make sure that if we're going from more log routers in the previous generation to + // less log routers in the newer one, that we map the log router tags onto satellites that + // are the preferred location for id%logRouterTags. + Tag pushLocation = Tag(tagLocalityLogRouter, i%logSystem->logRouterTags); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( {pushLocation}, locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } } for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) From 23963328cc82ef3d9724f840c286db96533694d9 Mon Sep 17 00:00:00 2001 From: mpilman Date: Tue, 9 Jul 2019 12:11:51 -0700 Subject: [PATCH 086/136] Compile relative paths into the debug info This is a suggestion to resolve #1780 This change introduces a new cmake flag `RELATIVE_DEBUG_PATHS`. If this flag is set or FDB is compiled with `-DFDB_RELEASE=ON`, the resulting binary will have debug information using relative file paths to source files. This simulates the behavior of the old build system but might break local debugging (making the debugger aware of build and source directory will be required). 
--- cmake/ConfigureCompiler.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 0d7d6a67a8..2da36b788c 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -8,6 +8,12 @@ set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release") set(USE_LD "LD" CACHE STRING "The linker to use for building: can be LD (system default, default choice), GOLD, or LLD") set(USE_LIBCXX OFF CACHE BOOL "Use libc++") set(USE_CCACHE OFF CACHE BOOL "Use ccache for compilation if available") +set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info") + +set(rel_debug_paths OFF) +if(RELATIVE_DEBUG_PATHS OR FDB_RELEASE) + set(rel_debug_paths ON) +endif() if(USE_GPERFTOOLS) find_package(Gperftools REQUIRED) @@ -103,6 +109,10 @@ else() set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -Wl,--disable-new-dtags") endif() + if(rel_debug_paths) + add_compile_options("-fdebug-prefix-map=${CMAKE_SOURCE_DIR}=." "-fdebug-prefix-map=${CMAKE_BINARY_DIR}=.") + endif() + # we always compile with debug symbols. CPack will strip them out # and create a debuginfo rpm add_compile_options(-ggdb -fno-omit-frame-pointer) From 1bac04509e7066d6390f8422af6478868ce17624 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sat, 15 Jun 2019 15:02:43 -0700 Subject: [PATCH 087/136] Track the local ratekeeper rate as a percentage This value is reported in status for each storage server. 
--- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 1 + fdbclient/Schemas.cpp | 1 + fdbserver/Status.actor.cpp | 1 + fdbserver/storageserver.actor.cpp | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 5b0099f142..cb18fa4cb1 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -36,6 +36,7 @@ "roles":[ { "query_queue_max":0, + "local_rate":0, "input_bytes":{ "hz":0.0, "counter":0, diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 2e3db10c40..87d8b97ed1 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -56,6 +56,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "roles":[ { "query_queue_max":0, + "local_rate":0, "input_bytes":{ "hz":0.0, "counter":0, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..92d1498589 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -429,6 +429,7 @@ struct RolesInfo { obj["keys_queried"] = StatusCounter(storageMetrics.getValue("RowsQueried")).getStatus(); obj["mutation_bytes"] = StatusCounter(storageMetrics.getValue("MutationBytes")).getStatus(); obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus(); + obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate")); Version version = storageMetrics.getInt64("Version"); Version durableVersion = storageMetrics.getInt64("DurableVersion"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 7d5c59f0ba..ade11ab8a0 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -507,7 +507,7 @@ public: specialCounter(cc, "DurableVersion", [self](){ return self->durableVersion.get(); }); specialCounter(cc, "DesiredOldestVersion", [self](){ 
return self->desiredOldestVersion.get(); }); specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); - specialCounter(cc, "LocalRatekeeper", [self]{ return self->currentRate(); }); + specialCounter(cc, "LocalRate", [self]{ return self->currentRate() * 100; }); specialCounter(cc, "FetchKeysFetchActive", [self](){ return self->fetchKeysParallelismLock.activePermits(); }); specialCounter(cc, "FetchKeysWaiting", [self](){ return self->fetchKeysParallelismLock.waiters(); }); From 764a4591ada255cecc0e63fd0f1ebbbcba07a1c2 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 9 Jul 2019 14:17:26 -0700 Subject: [PATCH 088/136] Add a comment to internal flag --- fdbclient/DatabaseContext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 35eb8e6f71..dfd4ef0dd0 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -134,7 +134,7 @@ public: std::map< UID, StorageServerInfo* > server_interf; UID dbId; - bool internal; + bool internal; // Only contexts created through the C client and fdbcli are non-internal CounterCollection cc; From 705059dea129c9771bb4ef2ddb384a19c8218ee3 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 24 Jun 2019 17:36:55 -0700 Subject: [PATCH 089/136] Trace: Add support to print pointers --- flow/Trace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/Trace.h b/flow/Trace.h index 6ba7e0e7db..a3957bddf4 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -210,6 +210,7 @@ FORMAT_TRACEABLE(unsigned long int, "%lu"); FORMAT_TRACEABLE(long long int, "%lld"); FORMAT_TRACEABLE(unsigned long long int, "%llu"); FORMAT_TRACEABLE(double, "%g"); +FORMAT_TRACEABLE(void*, "%p"); FORMAT_TRACEABLE(volatile long, "%ld"); FORMAT_TRACEABLE(volatile unsigned long, "%lu"); FORMAT_TRACEABLE(volatile long long, "%lld"); From 3f4f71ff9f119eb7dcded13dec32fa73ef1f485a Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 24 Jun 2019 17:37:57 -0700 
Subject: [PATCH 090/136] fdbrpc: Increment peerReferences correctly The constructor of FlowReceiver which handled reference counting peerReferences relied on calling a virtual method from constructor whose behaviour isn't correct. This patch, bubbles down result of that virtual method from derived constructor to base contructor. --- fdbrpc/FlowTransport.actor.cpp | 9 ++++----- fdbrpc/FlowTransport.h | 4 ++-- fdbrpc/fdbrpc.h | 22 +++++++++++++++------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index c20aa607a6..303699a750 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -1047,12 +1047,11 @@ Endpoint FlowTransport::loadedEndpoint( const UID& token ) { return Endpoint(g_currentDeliveryPeerAddress, token); } -void FlowTransport::addPeerReference( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { +void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) { if (FlowTransport::transport().isClient()) { IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); } - - if (!receiver->isStream() || !endpoint.getPrimaryAddress().isValid()) return; + if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; Peer* peer = self->getPeer(endpoint.getPrimaryAddress()); if(peer->peerReferences == -1) { peer->peerReferences = 1; @@ -1061,8 +1060,8 @@ void FlowTransport::addPeerReference( const Endpoint& endpoint, NetworkMessageRe } } -void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { - if (!receiver->isStream() || !endpoint.getPrimaryAddress().isValid()) return; +void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream) { + if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; Peer* peer = self->getPeer(endpoint.getPrimaryAddress(), false); if(peer) { peer->peerReferences--; diff --git 
a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index 5bda279de3..73425b4ec6 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -132,10 +132,10 @@ public: std::map>* getIncompatiblePeers(); // Returns the same of all peers that have attempted to connect, but have incompatible protocol versions - void addPeerReference( const Endpoint&, NetworkMessageReceiver* ); + void addPeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is being used, even if no messages are currently being sent to the peer - void removePeerReference( const Endpoint&, NetworkMessageReceiver* ); + void removePeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is no longer being used void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 75e0a9a551..08c544ab7d 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -31,15 +31,19 @@ struct FlowReceiver : private NetworkMessageReceiver { // Common endpoint code for NetSAV<> and NetNotifiedQueue<> - FlowReceiver() : m_isLocalEndpoint(false) {} - FlowReceiver(Endpoint const& remoteEndpoint) : endpoint(remoteEndpoint), m_isLocalEndpoint(false) { - FlowTransport::transport().addPeerReference(endpoint, this); + FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) { } + + FlowReceiver(Endpoint const& remoteEndpoint, bool stream) + : endpoint(remoteEndpoint), m_isLocalEndpoint(false), m_stream(stream) { + FlowTransport::transport().addPeerReference(endpoint, m_stream); + } + ~FlowReceiver() { if (m_isLocalEndpoint) { FlowTransport::transport().removeEndpoint(endpoint, this); } else { - FlowTransport::transport().removePeerReference(endpoint, this); + FlowTransport::transport().removePeerReference(endpoint, m_stream); } } @@ -63,9 +67,10 @@ struct FlowReceiver : private NetworkMessageReceiver { FlowTransport::transport().addWellKnownEndpoint(endpoint, this, taskID); } -protected: +private: Endpoint 
endpoint; bool m_isLocalEndpoint; + bool m_stream; }; template @@ -74,7 +79,9 @@ struct NetSAV : SAV, FlowReceiver, FastAllocated> { using FastAllocated>::operator delete; NetSAV(int futures, int promises) : SAV(futures, promises) {} - NetSAV(int futures, int promises, const Endpoint& remoteEndpoint) : SAV(futures, promises), FlowReceiver(remoteEndpoint) {} + NetSAV(int futures, int promises, const Endpoint& remoteEndpoint) + : SAV(futures, promises), FlowReceiver(remoteEndpoint, false) { + } virtual void destroy() { delete this; } virtual void receive(ArenaReader& reader) { @@ -228,7 +235,8 @@ struct NetNotifiedQueue : NotifiedQueue, FlowReceiver, FastAllocated>::operator delete; NetNotifiedQueue(int futures, int promises) : NotifiedQueue(futures, promises) {} - NetNotifiedQueue(int futures, int promises, const Endpoint& remoteEndpoint) : NotifiedQueue(futures, promises), FlowReceiver(remoteEndpoint) {} + NetNotifiedQueue(int futures, int promises, const Endpoint& remoteEndpoint) + : NotifiedQueue(futures, promises), FlowReceiver(remoteEndpoint, true) {} virtual void destroy() { delete this; } virtual void receive(ArenaReader& reader) { From 78a1b2defc033b0a1693589baafae65fba13c2c6 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Tue, 25 Jun 2019 16:25:42 -0700 Subject: [PATCH 091/136] simulator: Destroy each process individually in its context When simulation ends, all the actors are cancelled, and the destructions which rely on `globals` may not have access to right globals (instead of the default simulator process globals). This patch, calls destroy on each process individually after we context switch to that process so that the globals acceses in destructor are its own. This issue arised when trying to get `Peer::peerReferences` in NetNotifiedQueue, resulting in decrementing the reference count of peers in FlowTransport object of '0.0.0.0'. 
--- fdbserver/SimulatedCluster.actor.cpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 705f401d39..fe7c5a1ac2 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1373,6 +1373,28 @@ void checkExtraDB(const char *testFile, int &extraDB, int &minimumReplication, i ifs.close(); } +// To be called after we stop simulator, so that destructors of each process is +// called with right context, with access to right globals. At this point, we +// also no longer have to protect coordinator addresses. +// TODO: Investigate why this doesn't work when we call before stop(). Some +// earlier permanently failed processes seems to be the reason. +ACTOR Future destroyAllProcesses() { + state ISimulator::ProcessInfo* simProcess = g_simulator.getCurrentProcess(); + state vector processes = g_simulator.getAllProcesses(); + state std::vector::iterator it; + + g_simulator.protectedAddresses.clear(); + for (it = processes.begin(); it != processes.end(); ++it) { + if (*it == simProcess || (*it)->failed) continue; + wait (g_simulator.onProcess(*it, TaskPriority::DefaultYield)); + (*it)->shutdownSignal.send(ISimulator::KillInstantly); + g_simulator.destroyProcess(*it); + } + + wait (g_simulator.onProcess(simProcess, TaskPriority::DefaultYield)); + return Void(); +} + ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, bool restoring, std::string whitelistBinPaths, Reference tlsOptions) { state vector> systemActors; state Optional connFile; @@ -1427,8 +1449,8 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot } TraceEvent("SimulatedSystemDestruct"); - destructed = true; - systemActors.clear(); - g_simulator.stop(); + destructed = true; + wait(destroyAllProcesses()); + systemActors.clear(); } From 7647d3e3c0f303b1b1a8fa9f716263ff0fe403bc Mon Sep 17 
00:00:00 2001 From: Vishesh Yadav Date: Fri, 28 Jun 2019 00:38:28 -0700 Subject: [PATCH 092/136] fdbrpc: Don't use RequestStream for pings in ConnectionMonitor RequestStream add another count to peerReference, which means as long as ConnectionMonitor is alive, we'll never get peerReference=0 keeping unnecessary connections potentially alive. --- fdbrpc/FlowTransport.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 303699a750..752f6426cd 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -396,7 +396,7 @@ struct Peer : NonCopyable { } ACTOR static Future connectionMonitor( Peer *peer ) { - state RequestStream< ReplyPromise > remotePing( Endpoint( {peer->destination}, WLTOKEN_PING_PACKET ) ); + state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty()) { @@ -408,7 +408,7 @@ struct Peer : NonCopyable { // SOMEDAY: Stop monitoring and close the connection after a long period of inactivity with no reliable or onDisconnect requests outstanding state ReplyPromise reply; - FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePing.getEndpoint() ); + FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); state int64_t startingBytes = peer->bytesReceived; state int timeouts = 0; loop { From 867986cdeae0b712e414b89ce139b81195346ef6 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 28 Jun 2019 00:39:51 -0700 Subject: [PATCH 093/136] fdbrpc: Reduced connection monitoring from clients This patch does two changes to connection monitoring: 1. Connection monitoring at client side will check if the connection has been stayed idle for some time. If connection is unused for a while, we close the connection. 
There is some weirdness involved here as ping messages are by themselves are connection traffic. We get over this by making it two-phase process, first being checking idle reliable traffic, followed by disabling pings and then checking for idle unreliable traffic. 2. Connection monitoring of clients from server will no longer send pings to clients. Instead, it keep monitor the received bytes and close after certain period of inactivity. --- fdbrpc/FlowTransport.actor.cpp | 74 ++++++++++++++++++++++++++++------ fdbrpc/sim2.actor.cpp | 9 ++++- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + flow/error_definitions.h | 1 + 5 files changed, 72 insertions(+), 14 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 752f6426cd..78d164cdd1 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -305,15 +305,18 @@ struct Peer : NonCopyable { int peerReferences; bool incompatibleProtocolVersionNewer; int64_t bytesReceived; + double lastSentTime; explicit Peer( TransportData* transport, NetworkAddress const& destination ) : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), - compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0) + compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastSentTime(now()) { connect = connectionKeeper(this); } void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) { + if (rp) + lastSentTime = now(); unsent.setWriteBuffer(pb); if (rp) reliable.insert(rp); if (firstUnsent) dataToSend.trigger(); @@ -396,17 +399,47 @@ struct Peer : NonCopyable { } ACTOR static Future connectionMonitor( Peer *peer ) { - state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); + if (!peer->destination.isPublic()) { + // Don't send ping messages to clients. 
Instead monitor incoming client pings. + state double lastRefreshed = now(); + state int64_t lastBytesReceived = peer->bytesReceived; + loop { + wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + if (lastBytesReceived < peer->bytesReceived) { + lastRefreshed = now(); + lastBytesReceived = peer->bytesReceived; + } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT*2.5) { + throw connection_idle(); + } + } + } + state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { - if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty()) { + const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); + + if (peer->peerReferences == 0 && pendingPacketsEmpty) { throw connection_unreferenced(); } - wait( delayJittered( FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME ) ); + // TODO: Investigate connection idling at server-side peer too. + const bool monitorStateActive = peer->destination.isPublic() && + (peer->lastSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && + (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT); + if (!monitorStateActive) { + choose { + when(wait(peer->dataToSend.onTrigger())){ + peer->lastSentTime = now(); + } + when(wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT))) { + throw connection_idle(); + } + } + } - // SOMEDAY: Stop monitoring and close the connection after a long period of inactivity with no reliable or onDisconnect requests outstanding + wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + // TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding state ReplyPromise reply; FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); state int64_t startingBytes = peer->bytesReceived; @@ -414,12 +447,17 @@ struct Peer : NonCopyable { loop { choose { when (wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) )) { + // TODO: Since 
server will not ping clients (but will respond to incoming pings), is this + // a safe metric, or instead we should fail after multiple timeouts? if(startingBytes == peer->bytesReceived) { TraceEvent("ConnectionTimeout").suppressFor(1.0).detail("WithAddr", peer->destination); throw connection_failed(); } if(timeouts > 1) { - TraceEvent(SevWarnAlways, "ConnectionSlowPing").suppressFor(1.0).detail("WithAddr", peer->destination).detail("Timeouts", timeouts); + TraceEvent(SevWarnAlways, "ConnectionSlowPing") + .suppressFor(1.0) + .detail("WithAddr", peer->destination) + .detail("Timeouts", timeouts); } startingBytes = peer->bytesReceived; timeouts++; @@ -550,14 +588,21 @@ struct Peer : NonCopyable { self->discardUnreliablePackets(); reader = Future(); bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled || - e.code() == error_code_connection_unreferenced || + e.code() == error_code_connection_unreferenced || e.code() == error_code_connection_idle || (g_network->isSimulated() && e.code() == error_code_checksum_failed); if(self->compatible) { - TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination); + TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID()) + .error(e, true) + .suppressFor(1.0) + .detail("PeerAddr", self->destination); } else { - TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination); + TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", + conn ? 
conn->getDebugID() : UID()) + .error(e, true) + .suppressFor(1.0) + .detail("PeerAddr", self->destination); } if(self->destination.isPublic() && IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()) { @@ -565,20 +610,25 @@ struct Peer : NonCopyable { if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) { it.first = now(); } else if(now() - it.first > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT) { - TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()).suppressFor(5.0).detail("PeerAddr", self->destination); + TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()) + .suppressFor(5.0) + .detail("PeerAddr", self->destination); self->transport->degraded->set(true); } it.second = now(); } if (conn) { - if (FlowTransport::transport().isClient()) { + if (FlowTransport::transport().isClient() && e.code() != error_code_connection_idle) { clientReconnectDelay = true; } conn->close(); conn = Reference(); } - IFailureMonitor::failureMonitor().notifyDisconnect( self->destination ); //< Clients might send more packets in response, which needs to go out on the next connection + + // Clients might send more packets in response, which needs to go out on the next connection + IFailureMonitor::failureMonitor().notifyDisconnect( self->destination ); + if (e.code() == error_code_actor_cancelled) throw; // Try to recover, even from serious errors, by retrying diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index a7ee2623e9..42e3e32930 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -381,8 +381,13 @@ private: ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { wait( g_simulator.onProcess( self->process ) ); // SOMEDAY: Make this value variable? Dependent on buggification status? 
- wait( delay( 20.0 ) ); - TraceEvent(SevError, "LeakedConnection", self->dbgid).error(connection_leaked()).detail("MyAddr", self->process->address).detail("PeerAddr", self->peerEndpoint).detail("PeerId", self->peerId).detail("Opened", self->opened); + wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 4 ) ); + TraceEvent(SevError, "LeakedConnection", self->dbgid) + .error(connection_leaked()) + .detail("MyAddr", self->process->address) + .detail("PeerAddr", self->peerEndpoint) + .detail("PeerId", self->peerId) + .detail("Opened", self->opened); return Void(); } }; diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index a79dbed1ca..db6d24cb67 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -55,6 +55,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //connectionMonitor init( CONNECTION_MONITOR_LOOP_TIME, isSimulated ? 0.75 : 1.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_LOOP_TIME = 6.0; init( CONNECTION_MONITOR_TIMEOUT, isSimulated ? 1.50 : 2.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_TIMEOUT = 6.0; + init( CONNECTION_MONITOR_IDLE_TIMEOUT, 10.0 ); //FlowTransport init( CONNECTION_REJECTED_MESSAGE_DELAY, 1.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 2268e6dfad..910bdec66f 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -73,6 +73,7 @@ public: //connectionMonitor double CONNECTION_MONITOR_LOOP_TIME; double CONNECTION_MONITOR_TIMEOUT; + double CONNECTION_MONITOR_IDLE_TIMEOUT; //FlowTransport double CONNECTION_REJECTED_MESSAGE_DELAY; diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 25f000935b..a505641694 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -69,6 +69,7 @@ ERROR( transaction_not_permitted, 1045, "Operation not permitted") ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered") ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured") ERROR( connection_unreferenced, 1048, "No peer references for connection" 
) +ERROR( connection_idle, 1049, "Connection closed after idle timeout" ) ERROR( broken_promise, 1100, "Broken promise" ) ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" ) From ae6c3e013a27b4ca559c2db55abe82a9817ad625 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 28 Jun 2019 13:18:49 -0700 Subject: [PATCH 094/136] monitorClientInfo: Wait for master proxy endpoint failures than triggers This will not initiate request to get get new set of proxy unless we know for a fact that endpoint has indeed failed, not just because the connection to Peer was closed as it was sitting idle. --- fdbclient/NativeAPI.actor.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8ada99503f..9b2685508f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -592,13 +592,13 @@ ACTOR static Future monitorClientInfo( Reference Date: Fri, 28 Jun 2019 13:43:22 -0700 Subject: [PATCH 095/136] fdbrpc: Instead of tracking last sent data, track last sent non-ping data * This will allow client to continue monitoring peer connections while connection stays open, so that there is no period of "uncertainity" without previous no-monitoring approach. * Use multiplier for incoming connection idle timeout * Update idle connection timeout values and leaked connection timeout in simulator. 
--- fdbrpc/FlowTransport.actor.cpp | 49 +++++++++++++--------------------- fdbrpc/sim2.actor.cpp | 6 ++++- flow/Knobs.cpp | 3 ++- flow/Knobs.h | 1 + 4 files changed, 26 insertions(+), 33 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 78d164cdd1..5392cbede2 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -305,18 +305,16 @@ struct Peer : NonCopyable { int peerReferences; bool incompatibleProtocolVersionNewer; int64_t bytesReceived; - double lastSentTime; + double lastDataPacketSentTime; - explicit Peer( TransportData* transport, NetworkAddress const& destination ) - : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), - compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastSentTime(now()) - { + explicit Peer(TransportData* transport, NetworkAddress const& destination) + : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), + reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), + incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) { connect = connectionKeeper(this); } void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) { - if (rp) - lastSentTime = now(); unsent.setWriteBuffer(pb); if (rp) reliable.insert(rp); if (firstUnsent) dataToSend.trigger(); @@ -408,7 +406,8 @@ struct Peer : NonCopyable { if (lastBytesReceived < peer->bytesReceived) { lastRefreshed = now(); lastBytesReceived = peer->bytesReceived; - } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT*2.5) { + } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * + FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) { throw connection_idle(); } } @@ -417,28 +416,14 @@ struct Peer 
: NonCopyable { state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); - - if (peer->peerReferences == 0 && pendingPacketsEmpty) { - throw connection_unreferenced(); + if (pendingPacketsEmpty && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && + (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { + if (peer->peerReferences == 0) + throw connection_unreferenced(); + else if (peer->destination.isPublic()) + throw connection_idle(); } - // TODO: Investigate connection idling at server-side peer too. - const bool monitorStateActive = peer->destination.isPublic() && - (peer->lastSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && - (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT); - if (!monitorStateActive) { - choose { - when(wait(peer->dataToSend.onTrigger())){ - peer->lastSentTime = now(); - } - when(wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT))) { - throw connection_idle(); - } - } - } - - wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); - // TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding state ReplyPromise reply; FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); @@ -447,8 +432,6 @@ struct Peer : NonCopyable { loop { choose { when (wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) )) { - // TODO: Since server will not ping clients (but will respond to incoming pings), is this - // a safe metric, or instead we should fail after multiple timeouts? 
if(startingBytes == peer->bytesReceived) { TraceEvent("ConnectionTimeout").suppressFor(1.0).detail("WithAddr", peer->destination); throw connection_failed(); @@ -470,6 +453,8 @@ struct Peer : NonCopyable { } } } + + wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); } } @@ -1262,7 +1247,9 @@ static PacketID sendPacket( TransportData* self, ISerializeSource const& what, c #endif peer->send(pb, rp, firstUnsent); - + if (destination.token != WLTOKEN_PING_PACKET) { + peer->lastDataPacketSentTime = now(); + } return (PacketID)rp; } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 42e3e32930..92402fd56a 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -381,7 +381,11 @@ private: ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { wait( g_simulator.onProcess( self->process ) ); // SOMEDAY: Make this value variable? Dependent on buggification status? - wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 4 ) ); + if (self->process->address.isPublic()) { + wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); + } else { + wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); + } TraceEvent(SevError, "LeakedConnection", self->dbgid) .error(connection_leaked()) .detail("MyAddr", self->process->address) diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index db6d24cb67..12d9b25cc1 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -55,7 +55,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //connectionMonitor init( CONNECTION_MONITOR_LOOP_TIME, isSimulated ? 0.75 : 1.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_LOOP_TIME = 6.0; init( CONNECTION_MONITOR_TIMEOUT, isSimulated ? 
1.50 : 2.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_TIMEOUT = 6.0; - init( CONNECTION_MONITOR_IDLE_TIMEOUT, 10.0 ); + init( CONNECTION_MONITOR_IDLE_TIMEOUT, 180.0 ); + init( CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER, 1.2 ); //FlowTransport init( CONNECTION_REJECTED_MESSAGE_DELAY, 1.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 910bdec66f..c2adec0bd6 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -74,6 +74,7 @@ public: double CONNECTION_MONITOR_LOOP_TIME; double CONNECTION_MONITOR_TIMEOUT; double CONNECTION_MONITOR_IDLE_TIMEOUT; + double CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER; //FlowTransport double CONNECTION_REJECTED_MESSAGE_DELAY; From 22678267cdb8fa519866c52dd91c20d82f54df30 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 5 Jul 2019 16:27:17 -0700 Subject: [PATCH 096/136] fdbrpc: Don't drop idle connections from server Instead try pinging the client and let that decide whether the client is alive or not. Ideally, it should always be failed since a well behaved client would have closed the connection. --- fdbrpc/FlowTransport.actor.cpp | 44 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 5392cbede2..605edbbfc3 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -397,33 +397,39 @@ struct Peer : NonCopyable { } ACTOR static Future connectionMonitor( Peer *peer ) { - if (!peer->destination.isPublic()) { - // Don't send ping messages to clients. Instead monitor incoming client pings. 
- state double lastRefreshed = now(); - state int64_t lastBytesReceived = peer->bytesReceived; - loop { - wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); - if (lastBytesReceived < peer->bytesReceived) { - lastRefreshed = now(); - lastBytesReceived = peer->bytesReceived; - } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * - FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) { - throw connection_idle(); - } - } - } - state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { + if (!FlowTransport::transport().isClient() && !peer->destination.isPublic()) { + // Don't send ping messages to clients unless necessary. Instead monitor incoming client pings. + state double lastRefreshed = now(); + state int64_t lastBytesReceived = peer->bytesReceived; + loop { + wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + if (lastBytesReceived < peer->bytesReceived) { + lastRefreshed = now(); + lastBytesReceived = peer->bytesReceived; + } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * + FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) { + // If we have not received anything in this period, client must have closed + // connection by now. Break loop to check if it is still alive by sending a ping. + break; + } + } + } + const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); if (pendingPacketsEmpty && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { - if (peer->peerReferences == 0) + if (peer->peerReferences == 0) { throw connection_unreferenced(); - else if (peer->destination.isPublic()) + } else if (FlowTransport::transport().isClient() && peer->destination.isPublic()) { + // First condition is necessary because we may get here if we are server. 
throw connection_idle(); + } } + wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + // TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding state ReplyPromise reply; FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); @@ -453,8 +459,6 @@ struct Peer : NonCopyable { } } } - - wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); } } From 983343978e5e25b1d6867b11e5c346cbc0b754b1 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 8 Jul 2019 19:13:53 -0700 Subject: [PATCH 097/136] fdbrpc: ConnectionMonitor should close unreferenced after delay Potentially for cases, where it goes up to 1 immediately. --- fdbrpc/FlowTransport.actor.cpp | 12 +++++++----- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 605edbbfc3..d0f69162aa 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -417,12 +417,14 @@ struct Peer : NonCopyable { } } - const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); - if (pendingPacketsEmpty && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && - (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { - if (peer->peerReferences == 0) { + if (peer->reliable.empty() && peer->unsent.empty()) { + if (peer->peerReferences == 0 && + (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) { + // TODO: What about when peerReference == -1? 
throw connection_unreferenced(); - } else if (FlowTransport::transport().isClient() && peer->destination.isPublic()) { + } else if (FlowTransport::transport().isClient() && peer->destination.isPublic() && + (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && + (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { // First condition is necessary because we may get here if we are server. throw connection_idle(); } diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 12d9b25cc1..90f2e078bd 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -57,6 +57,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( CONNECTION_MONITOR_TIMEOUT, isSimulated ? 1.50 : 2.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_TIMEOUT = 6.0; init( CONNECTION_MONITOR_IDLE_TIMEOUT, 180.0 ); init( CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER, 1.2 ); + init( CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY, 2.0 ); //FlowTransport init( CONNECTION_REJECTED_MESSAGE_DELAY, 1.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index c2adec0bd6..99ac9df386 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -75,6 +75,7 @@ public: double CONNECTION_MONITOR_TIMEOUT; double CONNECTION_MONITOR_IDLE_TIMEOUT; double CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER; + double CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY; //FlowTransport double CONNECTION_REJECTED_MESSAGE_DELAY; From 2f29b2c3d13d832edd1d8cdb31a9d3db5a30ec75 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Tue, 9 Jul 2019 14:01:05 -0700 Subject: [PATCH 098/136] simulator: Just do a wait() in setupAndRun to avoid destruction It get us out of the ACTOR, never clearing the systemActors, and let simulator call exit(). 
--- fdbserver/SimulatedCluster.actor.cpp | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index fe7c5a1ac2..44a80342ab 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1373,28 +1373,6 @@ void checkExtraDB(const char *testFile, int &extraDB, int &minimumReplication, i ifs.close(); } -// To be called after we stop simulator, so that destructors of each process is -// called with right context, with access to right globals. At this point, we -// also no longer have to protect coordinator addresses. -// TODO: Investigate why this doesn't work when we call before stop(). Some -// earlier permanently failed processes seems to be the reason. -ACTOR Future destroyAllProcesses() { - state ISimulator::ProcessInfo* simProcess = g_simulator.getCurrentProcess(); - state vector processes = g_simulator.getAllProcesses(); - state std::vector::iterator it; - - g_simulator.protectedAddresses.clear(); - for (it = processes.begin(); it != processes.end(); ++it) { - if (*it == simProcess || (*it)->failed) continue; - wait (g_simulator.onProcess(*it, TaskPriority::DefaultYield)); - (*it)->shutdownSignal.send(ISimulator::KillInstantly); - g_simulator.destroyProcess(*it); - } - - wait (g_simulator.onProcess(simProcess, TaskPriority::DefaultYield)); - return Void(); -} - ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, bool restoring, std::string whitelistBinPaths, Reference tlsOptions) { state vector> systemActors; state Optional connFile; @@ -1451,6 +1429,6 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot TraceEvent("SimulatedSystemDestruct"); g_simulator.stop(); destructed = true; - wait(destroyAllProcesses()); - systemActors.clear(); + wait(Never()); + ASSERT(false); } From fdd580c8788b4d4b27a90908ee8c05c9a9bf59e6 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Tue, 9 Jul 2019 15:00:11 -0700 Subject: [PATCH 099/136] Restore some variable initializations that were unintentionally removed. --- fdbclient/NativeAPI.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index b9cea4c1ad..41be26ccfe 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -514,7 +514,7 @@ DatabaseContext::DatabaseContext( transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), - transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), + transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal) { From 4b8eb27134ee242a97f6451f14b6dcaef6529896 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Tue, 9 Jul 2019 14:57:38 -0700 Subject: [PATCH 100/136] fdbrpc: Move setStatus line in addPeerReference --- fdbrpc/FlowTransport.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index d0f69162aa..c0e78e9a32 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -1089,10 +1089,11 @@ 
Endpoint FlowTransport::loadedEndpoint( const UID& token ) { } void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) { - if (FlowTransport::transport().isClient()) { + if (!isStream || !endpoint.getPrimaryAddress().isValid()) + return; + else if (FlowTransport::transport().isClient()) IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); - } - if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; + Peer* peer = self->getPeer(endpoint.getPrimaryAddress()); if(peer->peerReferences == -1) { peer->peerReferences = 1; From c8cf7f88ef5cd9b4b8c569c7c49cf09a818071b9 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 9 Jul 2019 15:25:32 -0700 Subject: [PATCH 101/136] Add a release note for option fix, in particular noting that a 6.2 client must be used as the primary for the behavior to work correctly. --- documentation/sphinx/source/release-notes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index f2a9813030..bb59d59559 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -14,6 +14,8 @@ Performance Fixes ----- +* During an upgrade, the multi-version client now persists database default options and transaction options that aren't reset on retry (e.g. transaction timeout). In order for these options to function correctly during an upgrade, a 6.2 or later client should be used as the primary client. `(PR #1767) `_. 
+ Status ------ From d032d7fcf93748bd41bfd3ac089a2712a99de261 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:37:54 -0700 Subject: [PATCH 102/136] fix: if we get a broken_promise from the actor, wait to get the real error from the store --- fdbserver/worker.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index f23d889c37..af948b9b87 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -129,6 +129,9 @@ ACTOR Future handleIOErrors( Future actor, IClosable* store, UID id, } else { wait(onClosed); } + if(e.isError() && e.getError().code() == error_code_broken_promise && !storeError.isReady()) { + wait(delay(0.00001 + FLOW_KNOBS->MAX_BUGGIFIED_DELAY)); + } if(storeError.isReady()) throw storeError.get().getError(); if (e.isError()) throw e.getError(); else return e.get(); } From b27a909f3af2aadc0b012a912683ee6dc84c2aa2 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:38:59 -0700 Subject: [PATCH 103/136] fix: onDisconnectOrFailure can spuriously trigger --- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 797aa85a13..d4af96c99d 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -251,7 +251,7 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { ACTOR Future serverPeekOnFailed( ILogSystem::ServerPeekCursor* self ) { loop { choose { - when( wait( self->interf->get().present() ? IFailureMonitor::failureMonitor().onDisconnectOrFailure( self->interf->get().interf().peekMessages.getEndpoint() ) : Never() ) ) { return Void(); } + when( wait( self->interf->get().present() ? 
IFailureMonitor::failureMonitor().onStateEqual( self->interf->get().interf().peekMessages.getEndpoint(), FailureStatus() ) : Never() ) ) { return Void(); } when( wait( self->interf->onChange() ) ) {} } } From 64aee73c4f3544216029d98e363e16dce518a63f Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:47:56 -0700 Subject: [PATCH 104/136] we only need to hold the ReplyPromise for messages that we are going to forward to new proxies --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/MasterProxyServer.actor.cpp | 36 +++++++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 5f7f45ea19..2b4fb5e87f 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -283,6 +283,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 ); init( MAX_TXS_POP_VERSION_HISTORY, 1e5 ); init( PROXY_FORWARD_DELAY, 10.0 ); + init( MAX_FORWARD_MESSAGES, 1e6 ); // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. 
DataDistibution) diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f8517ababe..f0f11d5277 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -228,6 +228,7 @@ public: double UPDATE_REMOTE_LOG_VERSION_INTERVAL; int MAX_TXS_POP_VERSION_HISTORY; double PROXY_FORWARD_DELAY; + int MAX_FORWARD_MESSAGES; // Master Server double COMMIT_SLEEP_TIME; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 83cf0d51db..81b86ec2c0 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -1789,23 +1789,34 @@ ACTOR Future checkRemoved(Reference> db, uint64_t r } } -ACTOR Future forwardProxy(ClientDBInfo info, RequestStream commit, RequestStream getConsistentReadVersion, RequestStream getKeyServersLocations) { +ACTOR template Future stripRequests( RequestStream in, PromiseStream> out, int* count) { + loop { + X req = waitNext(in.getFuture()); + out.send(req.reply); + if((*count) >= 0 && ++(*count) >= SERVER_KNOBS->MAX_FORWARD_MESSAGES) { + TraceEvent(SevWarnAlways, "TooManyProxyForwardRequests"); + return Void(); + } + } +} + +ACTOR Future forwardProxy(ClientDBInfo info, PromiseStream> commitReplies, PromiseStream> grvReplies, PromiseStream> locationReplies) { loop { choose { - when(CommitTransactionRequest req = waitNext(commit.getFuture())) { + when(ReplyPromise req = waitNext(commitReplies.getFuture())) { CommitID rep; rep.newClientInfo = info; - req.reply.send(rep); + req.send(rep); } - when(GetReadVersionRequest req = waitNext(getConsistentReadVersion.getFuture())) { + when(ReplyPromise req = waitNext(grvReplies.getFuture())) { GetReadVersionReply rep; rep.newClientInfo = info; - req.reply.send(rep); + req.send(rep); } - when(GetKeyServerLocationsRequest req = waitNext(getKeyServersLocations.getFuture())) { + when(ReplyPromise req = waitNext(locationReplies.getFuture())) { GetKeyServerLocationsReply rep; rep.newClientInfo = info; - req.reply.send(rep); + req.send(rep); } } 
wait(yield()); @@ -1833,15 +1844,20 @@ ACTOR Future masterProxyServer( } } core.cancel(); - state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY); + state PromiseStream> commitReplies; + state PromiseStream> grvReplies; + state PromiseStream> locationReplies; + state int replyCount = 0; + state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequest(proxy.commit, commitReplies, &replyCount) || stripRequest(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequest(proxy.getKeyServersLocations, locationReplies, &replyCount); + proxy = MasterProxyInterface(); loop { if(finishForward.isReady()) { return Void(); } if(db->get().client.proxies.size() > 0 && !db->get().client.proxies[0].provisional && db->get().recoveryCount >= req.recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), proxy)) { - core = forwardProxy(db->get().client, proxy.commit, proxy.getConsistentReadVersion, proxy.getKeyServersLocations); - proxy = MasterProxyInterface(); + replyCount = -1; + core = forwardProxy(db->get().client, commitReplies, grvReplies, locationReplies); wait(finishForward); return Void(); } From 001abec29d7fa9445dacb7121b1db296106ee5ae Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:50:59 -0700 Subject: [PATCH 105/136] fixed a compiler error, buggified a new knob --- fdbserver/Knobs.cpp | 2 +- fdbserver/MasterProxyServer.actor.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 2b4fb5e87f..5ea999b472 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -283,7 +283,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 ); init( MAX_TXS_POP_VERSION_HISTORY, 1e5 ); init( PROXY_FORWARD_DELAY, 10.0 ); - init( MAX_FORWARD_MESSAGES, 1e6 ); + init( MAX_FORWARD_MESSAGES, 1e6 ); if( randomize && BUGGIFY ) MAX_FORWARD_MESSAGES = 10; // 
Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 81b86ec2c0..07c3bbc931 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -1848,7 +1848,7 @@ ACTOR Future masterProxyServer( state PromiseStream> grvReplies; state PromiseStream> locationReplies; state int replyCount = 0; - state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequest(proxy.commit, commitReplies, &replyCount) || stripRequest(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequest(proxy.getKeyServersLocations, locationReplies, &replyCount); + state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequests(proxy.commit, commitReplies, &replyCount) || stripRequests(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequests(proxy.getKeyServersLocations, locationReplies, &replyCount); proxy = MasterProxyInterface(); loop { if(finishForward.isReady()) { From a53bf9289ae31393f8b37efb92474fefb9beddf7 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 17:13:24 -0700 Subject: [PATCH 106/136] remove SnapTestAttrition because it is causing correctness errors --- .../from_6.2.0/SnapTestAttrition-1.txt | 45 ------------------- .../from_6.2.0/SnapTestAttrition-2.txt | 7 --- 2 files changed, 52 deletions(-) delete mode 100644 tests/restarting/from_6.2.0/SnapTestAttrition-1.txt delete mode 100644 tests/restarting/from_6.2.0/SnapTestAttrition-2.txt diff --git a/tests/restarting/from_6.2.0/SnapTestAttrition-1.txt b/tests/restarting/from_6.2.0/SnapTestAttrition-1.txt deleted file mode 100644 index d3ceed1584..0000000000 --- a/tests/restarting/from_6.2.0/SnapTestAttrition-1.txt +++ /dev/null @@ -1,45 +0,0 @@ -testTitle=SnapTestPre -;write 1000 Keys ending with even numbers - testName=SnapTest - numSnaps=1 - maxSnapDelay=3.0 
- testID=0 - clearAfterTest=false - -testTitle=SnapTestTakeSnap -;Take snap and do read/write - testName=ReadWrite - testDuration=10.0 - transactionsPerSecond=10000 - writesPerTransactionA=0 - readsPerTransactionA=10 - writesPerTransactionB=10 - readsPerTransactionB=1 - alpha=0.5 - nodeCount=100000 - valueBytes=16 - discardEdgeMeasurements=false - - testName=SnapTest - numSnaps=1 - maxSnapDelay=10.0 - testID=1 - clearAfterTest=false - - testName=Attrition - testDuration=10.0 - -testTitle=SnapTestPost -;write 1000 Keys ending with odd numbers - testName=SnapTest - numSnaps=1 - maxSnapDelay=25.0 - testID=2 - clearAfterTest=false - -; save and shutdown -testTitle=SnapSimpleShutdown - testName=SaveAndKill - restartInfoLocation=simfdb/restartInfo.ini - testDuration=10.0 - isRestoring=1 diff --git a/tests/restarting/from_6.2.0/SnapTestAttrition-2.txt b/tests/restarting/from_6.2.0/SnapTestAttrition-2.txt deleted file mode 100644 index 07d71073e1..0000000000 --- a/tests/restarting/from_6.2.0/SnapTestAttrition-2.txt +++ /dev/null @@ -1,7 +0,0 @@ -; verify all keys are even numbered -testTitle=SnapTestVerify -testName=SnapTest -numSnaps=1 -maxSnapDelay=3.0 -testID=3 -restartInfoLocation=simfdb/restartInfo.ini From 38ae352fc55e887d5f33e18b2f0e607e2457d951 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 10 Jul 2019 09:46:23 -0700 Subject: [PATCH 107/136] Fix a merge issue --- flow/SystemMonitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/SystemMonitor.cpp b/flow/SystemMonitor.cpp index 4481cba0f0..c391c93db1 100644 --- a/flow/SystemMonitor.cpp +++ b/flow/SystemMonitor.cpp @@ -145,7 +145,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta } } - for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != 0; i++) { + for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != TaskPriority::Zero; i++) { if(g_network->networkMetrics.priorityBlocked[i]) { double lastSegment = std::min(currentStats.elapsed, now() - g_network->networkMetrics.priorityTimer[i]); g_network->networkMetrics.priorityBlockedDuration[i] += lastSegment; From c694931e33ce4daaedf73c03947fc66763f5a801 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Wed, 10 Jul 2019 14:06:06 -0700 Subject: [PATCH 108/136] sim2: Remove obsolete comment --- fdbrpc/sim2.actor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 92402fd56a..e71959d831 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -380,7 +380,6 @@ private: ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { wait( g_simulator.onProcess( self->process ) ); - // SOMEDAY: Make this value variable? Dependent on buggification status? if (self->process->address.isPublic()) { wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); } else { From b4dbc6d7fad9c932981ee3cb3c13fa6453435dda Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 10 Jul 2019 14:43:20 -0700 Subject: [PATCH 109/136] Change the way cache hits and misses are tracked to avoid counting blind page writes as misses and count the results of partial page writes. Report cache hit rate in status. 
--- .../source/mr-status-json-schemas.rst.inc | 4 ++ fdbclient/Schemas.cpp | 4 ++ fdbrpc/AsyncFileCached.actor.h | 62 +++++++++++------ fdbserver/Status.actor.cpp | 68 +++++++++++++++++++ flow/SystemMonitor.h | 4 +- 5 files changed, 119 insertions(+), 23 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index ed07092ab1..1ba9d1652d 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -297,6 +297,10 @@ } ] }, + "page_cache":{ + "log_hit_rate":0.5, + "storage_hit_rate":0.5 + }, "messages":[ { "reasons":[ diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index ec4488741e..9eebf116dc 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -319,6 +319,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( } ] }, + "page_cache":{ + "log_hit_rate":0.5, + "storage_hit_rate":0.5 + }, "messages":[ { "reasons":[ diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index 031ba79ab9..d9b192b662 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -67,8 +67,6 @@ struct EvictablePageCache : ReferenceCounted { EvictablePageCache() : pageSize(0), maxPages(0), cacheEvictionType(RANDOM) {} explicit EvictablePageCache(int pageSize, int64_t maxSize) : pageSize(pageSize), maxPages(maxSize / pageSize), cacheEvictionType(evictionPolicyStringToEnum(FLOW_KNOBS->CACHE_EVICTION_POLICY)) { - cacheHits.init(LiteralStringRef("EvictablePageCache.CacheHits")); - cacheMisses.init(LiteralStringRef("EvictablePageCache.CacheMisses")); cacheEvictions.init(LiteralStringRef("EvictablePageCache.CacheEvictions")); } @@ -82,7 +80,6 @@ struct EvictablePageCache : ReferenceCounted { } else { lruPages.push_back(*page); // new page is considered the most recently used (placed at LRU tail) } - ++cacheMisses; } void 
updateHit(EvictablePage* page) { @@ -91,7 +88,6 @@ struct EvictablePageCache : ReferenceCounted { lruPages.erase(List::s_iterator_to(*page)); lruPages.push_back(*page); } - ++cacheHits; } void try_evict() { @@ -126,8 +122,6 @@ struct EvictablePageCache : ReferenceCounted { List lruPages; int pageSize; int64_t maxPages; - Int64MetricHandle cacheHits; - Int64MetricHandle cacheMisses; Int64MetricHandle cacheEvictions; const CacheEvictionType cacheEvictionType; }; @@ -278,6 +272,8 @@ private: Int64MetricHandle countFileCacheWrites; Int64MetricHandle countFileCacheReadsBlocked; Int64MetricHandle countFileCacheWritesBlocked; + Int64MetricHandle countFileCachePageReadsHit; + Int64MetricHandle countFileCachePageReadsMissed; Int64MetricHandle countFileCachePageReadsMerged; Int64MetricHandle countFileCacheReadBytes; @@ -286,28 +282,33 @@ private: Int64MetricHandle countCacheWrites; Int64MetricHandle countCacheReadsBlocked; Int64MetricHandle countCacheWritesBlocked; + Int64MetricHandle countCachePageReadsHit; + Int64MetricHandle countCachePageReadsMissed; Int64MetricHandle countCachePageReadsMerged; Int64MetricHandle countCacheReadBytes; - AsyncFileCached( Reference uncached, const std::string& filename, int64_t length, Reference pageCache ) + AsyncFileCached( Reference uncached, const std::string& filename, int64_t length, Reference pageCache ) : uncached(uncached), filename(filename), length(length), prevLength(length), pageCache(pageCache), currentTruncate(Void()), currentTruncateSize(0) { if( !g_network->isSimulated() ) { - countFileCacheWrites.init( LiteralStringRef("AsyncFile.CountFileCacheWrites"), filename); - countFileCacheReads.init( LiteralStringRef("AsyncFile.CountFileCacheReads"), filename); - countFileCacheWritesBlocked.init( LiteralStringRef("AsyncFile.CountFileCacheWritesBlocked"), filename); - countFileCacheReadsBlocked.init( LiteralStringRef("AsyncFile.CountFileCacheReadsBlocked"), filename); + 
countFileCacheWrites.init(LiteralStringRef("AsyncFile.CountFileCacheWrites"), filename); + countFileCacheReads.init(LiteralStringRef("AsyncFile.CountFileCacheReads"), filename); + countFileCacheWritesBlocked.init(LiteralStringRef("AsyncFile.CountFileCacheWritesBlocked"), filename); + countFileCacheReadsBlocked.init(LiteralStringRef("AsyncFile.CountFileCacheReadsBlocked"), filename); + countFileCachePageReadsHit.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsHit"), filename); + countFileCachePageReadsMissed.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsMissed"), filename); countFileCachePageReadsMerged.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsMerged"), filename); - countFileCacheFinds.init( LiteralStringRef("AsyncFile.CountFileCacheFinds"), filename); - countFileCacheReadBytes.init( LiteralStringRef("AsyncFile.CountFileCacheReadBytes"), filename); + countFileCacheFinds.init(LiteralStringRef("AsyncFile.CountFileCacheFinds"), filename); + countFileCacheReadBytes.init(LiteralStringRef("AsyncFile.CountFileCacheReadBytes"), filename); - countCacheWrites.init( LiteralStringRef("AsyncFile.CountCacheWrites")); - countCacheReads.init( LiteralStringRef("AsyncFile.CountCacheReads")); - countCacheWritesBlocked.init( LiteralStringRef("AsyncFile.CountCacheWritesBlocked")); - countCacheReadsBlocked.init( LiteralStringRef("AsyncFile.CountCacheReadsBlocked")); + countCacheWrites.init(LiteralStringRef("AsyncFile.CountCacheWrites")); + countCacheReads.init(LiteralStringRef("AsyncFile.CountCacheReads")); + countCacheWritesBlocked.init(LiteralStringRef("AsyncFile.CountCacheWritesBlocked")); + countCacheReadsBlocked.init(LiteralStringRef("AsyncFile.CountCacheReadsBlocked")); + countCachePageReadsHit.init(LiteralStringRef("AsyncFile.CountCachePageReadsHit")); + countCachePageReadsMissed.init(LiteralStringRef("AsyncFile.CountCachePageReadsMissed")); countCachePageReadsMerged.init(LiteralStringRef("AsyncFile.CountCachePageReadsMerged")); - 
countCacheFinds.init( LiteralStringRef("AsyncFile.CountCacheFinds")); - countCacheReadBytes.init( LiteralStringRef("AsyncFile.CountCacheReadBytes")); - + countCacheFinds.init(LiteralStringRef("AsyncFile.CountCacheFinds")); + countCacheReadBytes.init(LiteralStringRef("AsyncFile.CountCacheReadBytes")); } } @@ -387,11 +388,18 @@ struct AFCPage : public EvictablePage, public FastAllocated { // If there are no active readers then if data is valid or we're replacing all of it we can write directly if (valid || fullPage) { + if(!fullPage) { + ++owner->countFileCachePageReadsHit; + ++owner->countCachePageReadsHit; + } valid = true; memcpy( static_cast(this->data) + offset, data, length ); return yield(); } + ++owner->countFileCachePageReadsMissed; + ++owner->countCachePageReadsMissed; + // If data is not valid but no read is in progress, start reading if (notReading.isReady()) { notReading = readThrough( this ); @@ -410,7 +418,14 @@ struct AFCPage : public EvictablePage, public FastAllocated { Future readZeroCopy() { ++zeroCopyRefCount; - if (valid) return yield(); + if (valid) { + ++owner->countFileCachePageReadsHit; + ++owner->countCachePageReadsHit; + return yield(); + } + + ++owner->countFileCachePageReadsMissed; + ++owner->countCachePageReadsMissed; if (notReading.isReady()) { notReading = readThrough( this ); @@ -428,12 +443,17 @@ struct AFCPage : public EvictablePage, public FastAllocated { Future read( void* data, int length, int offset ) { if (valid) { + ++owner->countFileCachePageReadsHit; + ++owner->countCachePageReadsHit; owner->countFileCacheReadBytes += length; owner->countCacheReadBytes += length; memcpy( data, static_cast(this->data) + offset, length ); return yield(); } + ++owner->countFileCachePageReadsMissed; + ++owner->countCachePageReadsMissed; + if (notReading.isReady()) { notReading = readThrough( this ); } else { diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index ceb929f894..01a74ab763 100644 --- a/fdbserver/Status.actor.cpp 
+++ b/fdbserver/Status.actor.cpp @@ -1578,6 +1578,68 @@ ACTOR static Future workloadStatusFetcher(Reference clusterSummaryStatisticsFetcher(WorkerEvents pMetrics, Future>>> storageServerFuture, + Future>>> tlogFuture, std::set *incomplete_reasons) +{ + state JsonBuilderObject statusObj; + try { + state JsonBuilderObject cacheStatistics; + + ErrorOr>> storageServers = wait(storageServerFuture); + + if (!storageServers.present()) { + throw storageServers.getError(); + } + + double storageCacheHitsHz = 0; + double storageCacheMissesHz = 0; + + for(auto &ss : storageServers.get()) { + auto processMetrics = pMetrics.find(ss.first.address()); + if(processMetrics != pMetrics.end()) { + int64_t hits = processMetrics->second.getInt64("CacheHits"); + int64_t misses = processMetrics->second.getInt64("CacheMisses"); + double elapsed = processMetrics->second.getDouble("Elapsed"); + storageCacheHitsHz += hits / elapsed; + storageCacheMissesHz += misses / elapsed; + } + } + + cacheStatistics["storage_hit_rate"] = (storageCacheMissesHz == 0) ? 1.0 : storageCacheHitsHz / (storageCacheHitsHz + storageCacheMissesHz); + + ErrorOr>> tlogServers = wait(tlogFuture); + + if(!tlogServers.present()) { + throw tlogServers.getError(); + } + + double logCacheHitsHz = 0; + double logCacheMissesHz = 0; + + for(auto &log : tlogServers.get()) { + auto processMetrics = pMetrics.find(log.first.address()); + if(processMetrics != pMetrics.end()) { + int64_t hits = processMetrics->second.getInt64("CacheHits"); + int64_t misses = processMetrics->second.getInt64("CacheMisses"); + double elapsed = processMetrics->second.getDouble("Elapsed"); + logCacheHitsHz += hits / elapsed; + logCacheMissesHz += misses / elapsed; + } + } + + cacheStatistics["log_hit_rate"] = (logCacheMissesHz == 0) ? 
1.0 : logCacheHitsHz / (logCacheHitsHz + logCacheMissesHz); + statusObj["page_cache"] = cacheStatistics; + } + catch (Error& e) { + if (e.code() == error_code_actor_cancelled) + throw; + + incomplete_reasons->insert("Unknown cache statistics."); + } + + return statusObj; +} + static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference> db, std::unordered_map const& address_workers) { JsonBuilderArray oldTlogsArray; @@ -2025,6 +2087,7 @@ ACTOR Future clusterGetStatus( futures2.push_back(workloadStatusFetcher(db, workers, mWorker, rkWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture)); futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons)); futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons)); + futures2.push_back(clusterSummaryStatisticsFetcher(pMetrics, storageServerFuture, tLogFuture, &status_incomplete_reasons)); state std::vector workerStatuses = wait(getAll(futures2)); @@ -2069,6 +2132,11 @@ ACTOR Future clusterGetStatus( statusObj.addContents(workerStatuses[3]); } + // Insert cluster summary statistics + if(!workerStatuses[4].empty()) { + statusObj.addContents(workerStatuses[4]); + } + // Need storage servers now for processStatusFetcher() below. 
ErrorOr>> _storageServers = wait(storageServerFuture); if (_storageServers.present()) { diff --git a/flow/SystemMonitor.h b/flow/SystemMonitor.h index 4c0585cd69..afc3584c36 100644 --- a/flow/SystemMonitor.h +++ b/flow/SystemMonitor.h @@ -124,8 +124,8 @@ struct NetworkData { countFileCachePageReadsMerged = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsMerged")); countFileCacheFinds = getValue(LiteralStringRef("AsyncFile.CountCacheFinds")); countFileCacheReadBytes = getValue(LiteralStringRef("AsyncFile.CountCacheReadBytes")); - countFilePageCacheHits = getValue(LiteralStringRef("EvictablePageCache.CacheHits")); - countFilePageCacheMisses = getValue(LiteralStringRef("EvictablePageCache.CacheMisses")); + countFilePageCacheHits = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsHit")); + countFilePageCacheMisses = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsMissed")); countFilePageCacheEvictions = getValue(LiteralStringRef("EvictablePageCache.CacheEvictions")); } }; From a380dda5e8e006c62f9a811cc1771eebc58671f6 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 10 Jul 2019 18:41:12 -0700 Subject: [PATCH 110/136] fixed a typo --- fdbserver/TagPartitionedLogSystem.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2dd94d7ac5..5fe2b28d74 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -2109,7 +2109,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags) { for(int i = 0; i < oldLogSystem->logRouterTags; i++) { Tag tag = Tag(tagLocalityLogRouter, i); - // Sattelite logs will index a mutation with tagLocalityLogRouter with an id greater than + // Satellite logs will index a mutation with tagLocalityLogRouter with an id greater than // the number of log routers as having an id mod the number of log routers. 
We thus need // to make sure that if we're going from more log routers in the previous generation to // less log routers in the newer one, that we map the log router tags onto satellites that From bbef631872ee6f7dc7b1babcf4d0a09512146f30 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 10 Jul 2019 18:48:54 -0700 Subject: [PATCH 111/136] fix: do not access optionInfo unless the option already exists in the map --- fdbclient/MultiVersionTransaction.actor.cpp | 11 ++++++++--- fdbclient/NativeAPI.actor.cpp | 8 +++++++- fdbclient/ReadYourWrites.actor.cpp | 8 +++++++- fdbclient/ThreadSafeTransaction.actor.cpp | 6 ++++++ fdbserver/workloads/FuzzApiCorrectness.actor.cpp | 1 + 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 16bf8afd0a..eb973b2659 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -596,9 +596,14 @@ Version MultiVersionTransaction::getCommittedVersion() { } void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Optional value) { - if(MultiVersionApi::apiVersionAtLeast(610) && FDBTransactionOptions::optionInfo[option].persistent) { + auto itr = FDBTransactionOptions::optionInfo.find(option); + if(itr == FDBTransactionOptions::optionInfo.end()) { + TraceEvent("UnknownTransactionOption").detail("Option", option); + throw invalid_option(); + } + + if(MultiVersionApi::apiVersionAtLeast(610) && itr->second.persistent) { persistentOptions.emplace_back(option, value.castTo>()); - } auto tr = getTransaction(); if(tr.transaction) { @@ -683,7 +688,7 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional throw invalid_option(); } - int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + int defaultFor = itr->second.defaultFor; if (defaultFor >= 0) { 
ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1d44b5a3cc..eec7ce36cd 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -756,7 +756,13 @@ uint64_t extractHexOption( StringRef value ) { } void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional value) { - int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + auto itr = FDBDatabaseOptions::optionInfo.find(option); + if(itr == FDBDatabaseOptions::optionInfo.end()) { + TraceEvent("UnknownDatabaseOption").detail("Option", option); + throw invalid_option(); + } + + int defaultFor = itr->second.defaultFor; if (defaultFor >= 0) { ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 013fd61596..709fd5a734 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1759,9 +1759,15 @@ Future> ReadYourWritesTransaction::getVersionstamp() { } void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { + auto itr = FDBTransactionOptions::optionInfo.find(option); + if(itr == FDBTransactionOptions::optionInfo.end()) { + TraceEvent("UnknownTransactionOption").detail("Option", option); + throw invalid_option(); + } + setOptionImpl(option, value); - if(FDBTransactionOptions::optionInfo[option].persistent) { + if(itr->second.persistent) { persistentOptions.emplace_back(option, value.castTo>()); } diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 0a47c43407..c26170c8c8 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -283,6 +283,12 @@ ThreadFuture> 
ThreadSafeTransaction::getVersionstamp() { } void ThreadSafeTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { + auto itr = FDBTransactionOptions::optionInfo.find(option); + if(itr == FDBTransactionOptions::optionInfo.end()) { + TraceEvent("UnknownTransactionOption").detail("Option", option); + throw invalid_option(); + } + ReadYourWritesTransaction *tr = this->tr; Standalone> passValue = value; diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index 0c02bbb556..80fe24c6ed 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -1092,6 +1092,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { } contract = { + std::make_pair( error_code_invalid_option, ExceptionContract::Possible ), std::make_pair( error_code_invalid_option_value, ExceptionContract::Possible ), std::make_pair( error_code_client_invalid_operation, ExceptionContract::possibleIf((FDBTransactionOptions::Option)op == FDBTransactionOptions::READ_YOUR_WRITES_DISABLE || (FDBTransactionOptions::Option)op == FDBTransactionOptions::LOG_TRANSACTION) ), From f4366e69caa3da8983d296df41425a902f181c1a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 11:25:39 -0700 Subject: [PATCH 112/136] Unknown options should not be used internally (i.e. underneath thread-safe API). This commit removes various checks that options exist and replaces them with an ASSERT. --- fdbcli/fdbcli.actor.cpp | 6 +++--- fdbclient/FDBOptions.h | 18 +++++++++++++----- fdbclient/NativeAPI.actor.cpp | 8 +------- fdbclient/ReadYourWrites.actor.cpp | 9 +-------- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index a84712ddfd..67611efab1 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -181,7 +181,7 @@ public: private: //Sets a transaction option. 
If intrans == true, then this option is also applied to the passed in transaction. void setTransactionOption(Reference tr, FDBTransactionOptions::Option option, bool enabled, Optional arg, bool intrans) { - if(enabled && arg.present() != FDBTransactionOptions::optionInfo[option].hasParameter) { + if(enabled && arg.present() != FDBTransactionOptions::optionInfo.getMustExist(option).hasParameter) { printf("ERROR: option %s a parameter\n", arg.present() ? "did not expect" : "expected"); throw invalid_option_value(); } @@ -237,7 +237,7 @@ private: //Returns true if the specified option is documented bool isDocumented(typename T::Option option) { - FDBOptionInfo info = T::optionInfo[option]; + FDBOptionInfo info = T::optionInfo.getMustExist(option); std::string deprecatedStr = "Deprecated"; return !info.comment.empty() && info.comment.substr(0, deprecatedStr.size()) != deprecatedStr; @@ -259,7 +259,7 @@ private: void printHelpString() { for(auto itr = legalOptions.begin(); itr != legalOptions.end(); ++itr) { if(isDocumented(itr->second)) { - FDBOptionInfo info = T::optionInfo[itr->second]; + FDBOptionInfo info = T::optionInfo.getMustExist(itr->second); std::string helpStr = info.name + " - " + info.comment; if(info.hasParameter) helpStr += " " + info.parameterComment; diff --git a/fdbclient/FDBOptions.h b/fdbclient/FDBOptions.h index 677a54ee6a..80e00903ba 100644 --- a/fdbclient/FDBOptions.h +++ b/fdbclient/FDBOptions.h @@ -54,11 +54,19 @@ private: std::map optionInfo; public: - typename std::map::iterator begin() { return optionInfo.begin(); } - typename std::map::iterator end() { return optionInfo.end(); } - typename std::map::iterator find(const typename T::Option& key) { return optionInfo.find(key); } + typename std::map::const_iterator begin() const { return optionInfo.begin(); } + typename std::map::const_iterator end() const { return optionInfo.end(); } + typename std::map::const_iterator find(const typename T::Option& key) const { return optionInfo.find(key); } 
- FDBOptionInfo& operator[] (const typename T::Option& key) { return optionInfo[key]; } + void insert(const typename T::Option& key, FDBOptionInfo info) { + optionInfo[key] = info; + } + + FDBOptionInfo const& getMustExist(const typename T::Option& key) const { + auto itr = optionInfo.find(key); + ASSERT(itr != optionInfo.end()); + return itr->second; + } FDBOptionInfoMap() { T::init(); } }; @@ -88,6 +96,6 @@ public: typename OptionList::const_iterator end() const { return options.cend(); } }; -#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor); +#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo.insert(var, FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor)); #endif \ No newline at end of file diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index eec7ce36cd..c00fffafa9 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -756,13 +756,7 @@ uint64_t extractHexOption( StringRef value ) { } void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional value) { - auto itr = FDBDatabaseOptions::optionInfo.find(option); - if(itr == FDBDatabaseOptions::optionInfo.end()) { - TraceEvent("UnknownDatabaseOption").detail("Option", option); - throw invalid_option(); - } - - int defaultFor = itr->second.defaultFor; + int defaultFor = FDBDatabaseOptions::optionInfo.getMustExist(option).defaultFor; if (defaultFor >= 0) { ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 709fd5a734..2f862cb338 100644 --- a/fdbclient/ReadYourWrites.actor.cpp 
+++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1759,17 +1759,10 @@ Future> ReadYourWritesTransaction::getVersionstamp() { } void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { - auto itr = FDBTransactionOptions::optionInfo.find(option); - if(itr == FDBTransactionOptions::optionInfo.end()) { - TraceEvent("UnknownTransactionOption").detail("Option", option); - throw invalid_option(); - } - setOptionImpl(option, value); - if(itr->second.persistent) { + if (FDBTransactionOptions::optionInfo.getMustExist(option).persistent) { persistentOptions.emplace_back(option, value.castTo>()); - } } From 8e3c8ee0c40478a978d339bcd9017a1e0a576b1c Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 12:07:45 -0700 Subject: [PATCH 113/136] Fix a tab/space mismatch that I was responsible for. --- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index ed07092ab1..90c5763e36 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -188,7 +188,7 @@ "hz":0.0 } }, - "run_loop_busy":0.2 // fraction of time the run loop was busy + "run_loop_busy":0.2 // fraction of time the run loop was busy } }, "old_logs":[ From 97609ad9917a8cb934b0a70f0b1c4ba76c93e653 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 13:54:44 -0700 Subject: [PATCH 114/136] Add information about transaction starts at different priorities to status. 
--- .../sphinx/source/mr-status-json-schemas.rst.inc | 15 +++++++++++++++ fdbclient/Schemas.cpp | 15 +++++++++++++++ fdbserver/Status.actor.cpp | 15 ++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 90c5763e36..fc8f33dafb 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -411,6 +411,21 @@ "counter":0, "roughness":0.0 }, + "started_immediate_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_default_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_batch_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "conflicted":{ "hz":0.0, "counter":0, diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index ec4488741e..c78fbbbb26 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -434,6 +434,21 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "counter":0, "roughness":0.0 }, + "started_immediate_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_default_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_batch_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "conflicted":{ "hz":0.0, "counter":0, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index ceb929f894..40d210271b 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1462,13 +1462,23 @@ ACTOR static Future workloadStatusFetcher(Reference proxyStats = wait(getAll(proxyStatFutures)); - StatusCounter mutations, mutationBytes, txnConflicts, txnStartOut, txnCommitOutSuccess; + StatusCounter mutations; + StatusCounter mutationBytes; + StatusCounter txnConflicts; + StatusCounter txnStartOut; + StatusCounter txnSystemPriorityStartOut; + StatusCounter txnDefaultPriorityStartOut; + 
StatusCounter txnBatchPriorityStartOut; + StatusCounter txnCommitOutSuccess; for (auto &ps : proxyStats) { mutations.updateValues( StatusCounter(ps.getValue("Mutations")) ); mutationBytes.updateValues( StatusCounter(ps.getValue("MutationBytes")) ); txnConflicts.updateValues( StatusCounter(ps.getValue("TxnConflicts")) ); txnStartOut.updateValues( StatusCounter(ps.getValue("TxnStartOut")) ); + txnSystemPriorityStartOut.updateValues(StatusCounter(ps.getValue("TxnSystemPriorityStartOut"))); + txnDefaultPriorityStartOut.updateValues(StatusCounter(ps.getValue("TxnDefaultPriorityStartOut"))); + txnBatchPriorityStartOut.updateValues(StatusCounter(ps.getValue("TxnBatchPriorityStartOut"))); txnCommitOutSuccess.updateValues( StatusCounter(ps.getValue("TxnCommitOutSuccess")) ); } @@ -1478,6 +1488,9 @@ ACTOR static Future workloadStatusFetcher(Reference Date: Thu, 11 Jul 2019 13:59:54 -0700 Subject: [PATCH 115/136] Add release note. --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 8be4172363..e82ee58f14 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -21,6 +21,7 @@ Status ------ * Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) `_. +* Added transaction start counts by priority to ``cluster.workload.transactions``. The new counters are named ``started_immediate_priority``, ``started_default_priority``, and ``started_batch_priority``. `(PR #1836) `_. * Remove ``cluster.datacenter_version_difference`` and replace it with ``cluster.datacenter_lag`` that has subfields ``versions`` and ``seconds``. `(PR #1800) `_. Bindings From 46d670b261cfaff777f70ec1bfe2542286906769 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 14:02:34 -0700 Subject: [PATCH 116/136] Add release note. 
--- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 9123615929..04ec8d171c 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -20,6 +20,7 @@ Status ------ * Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) `_. +* Added ``cluster.page_cache`` section to status. In this section, added two new statistics ``storage_hit_rate`` and ``log_hit_rate`` that indicate the fraction of recent page reads that were served by cache. `(PR #1823) `_. * Remove ``cluster.datacenter_version_difference`` and replace it with ``cluster.datacenter_lag`` that has subfields ``versions`` and ``seconds``. `(PR #1800) `_. Bindings From 9c3591ff43fe6f14365b8828f2c968db461c2142 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 10 Jul 2019 21:55:24 -0700 Subject: [PATCH 117/136] Fix python3 test failure Both key and value has to be of type bytes. 
--- bindings/python/tests/size_limit_tests.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bindings/python/tests/size_limit_tests.py b/bindings/python/tests/size_limit_tests.py index 3072e153f8..6445e02e2d 100644 --- a/bindings/python/tests/size_limit_tests.py +++ b/bindings/python/tests/size_limit_tests.py @@ -36,14 +36,14 @@ def setValueWithLimit(tr, key, value, limit): def test_size_limit_option(db): db.options.set_transaction_timeout(2000) # 2 seconds db.options.set_transaction_retry_limit(3) - value = 'a' * 1024 + value = b'a' * 1024 - setValue(db, 't1', value) - assert(value == db['t1']) + setValue(db, b't1', value) + assert(value == db[b't1']) try: db.options.set_transaction_size_limit(1000) - setValue(db, 't2', value) + setValue(db, b't2', value) assert(False) # not reached except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) @@ -51,7 +51,7 @@ def test_size_limit_option(db): # Per transaction option overrides database option db.options.set_transaction_size_limit(1000000) try: - setValueWithLimit(db, 't3', value, 1000) + setValueWithLimit(db, b't3', value, 1000) assert(False) # not reached except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) @@ -60,9 +60,9 @@ def test_size_limit_option(db): db.options.set_transaction_size_limit(1000) tr = db.create_transaction() try: - tr['t4'] = 'bar' + tr[b't4'] = b'bar' tr.on_error(fdb.FDBError(1007)).wait() - setValue(tr, 't4', value) + setValue(tr, b't4', value) tr.commit().wait() assert(False) # not reached except fdb.FDBError as e: From 1cf036fc9f5cdaea6e7d2de8b162737c08ad5b8c Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 12 Jul 2019 10:43:42 -0700 Subject: [PATCH 118/136] Remove SnapTestAttrition from ctest SnapTestAttrition the test file was removed in a53bf928 --- tests/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 
f0913f2a26..3964790878 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -156,9 +156,6 @@ add_fdb_test( add_fdb_test( TEST_FILES restarting/from_6.2.0/SnapCycleRestart-1.txt restarting/from_6.2.0/SnapCycleRestart-2.txt) -add_fdb_test( - TEST_FILES restarting/from_6.2.0/SnapTestAttrition-1.txt - restarting/from_6.2.0/SnapTestAttrition-2.txt) add_fdb_test( TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt restarting/from_5.1.7/DrUpgradeRestart-2.txt IGNORE) From d5051b08dd1178963197a84585f4e4c386c4589b Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 12 Jul 2019 16:12:35 -0700 Subject: [PATCH 119/136] Make trace event field lengths (and total event sizes) default knobified and configurable. Add a transaction option to control the field length of transaction debug logging. Make the program start command line field less likely to be truncated. --- bindings/flow/tester/Tester.actor.cpp | 7 +- bindings/go/src/_stacktester/stacktester.go | 6 +- bindings/go/src/fdb/generated.go | 34 ++++-- .../foundationdb/test/AsyncStackTester.java | 6 +- .../apple/foundationdb/test/StackTester.java | 6 +- bindings/python/tests/tester.py | 9 +- bindings/ruby/tests/tester.rb | 6 +- documentation/sphinx/source/api-c.rst | 2 + .../sphinx/source/api-common.rst.inc | 10 +- documentation/sphinx/source/api-python.rst | 10 ++ documentation/sphinx/source/api-ruby.rst | 10 ++ documentation/sphinx/source/data-modeling.rst | 2 + .../sphinx/source/developer-guide.rst | 2 + documentation/sphinx/source/release-notes.rst | 1 + fdbbackup/backup.actor.cpp | 3 + fdbcli/fdbcli.actor.cpp | 2 + fdbclient/ClientLogEvents.h | 102 ++++++++++++++---- fdbclient/NativeAPI.actor.cpp | 15 +++ fdbclient/NativeAPI.actor.h | 11 +- fdbclient/vexillographer/fdb.options | 17 +-- fdbserver/fdbserver.actor.cpp | 3 + flow/Knobs.cpp | 2 + flow/Knobs.h | 2 + flow/Trace.cpp | 51 +++++++-- flow/Trace.h | 14 +++ 25 files changed, 268 insertions(+), 65 deletions(-) diff --git 
a/bindings/flow/tester/Tester.actor.cpp b/bindings/flow/tester/Tester.actor.cpp index fa309a59e8..1d6702be05 100644 --- a/bindings/flow/tester/Tester.actor.cpp +++ b/bindings/flow/tester/Tester.actor.cpp @@ -1551,19 +1551,21 @@ struct UnitTestsFunc : InstructionFunc { const uint64_t noRetryLimit = -1; const uint64_t maxRetryDelay = 100; const uint64_t sizeLimit = 100000; + const uint64_t maxFieldLength = 1000; data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_LOCATION_CACHE_SIZE, Optional(StringRef((const uint8_t*)&locationCacheSize, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MAX_WATCHES, Optional(StringRef((const uint8_t*)&maxWatches, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, Optional(LiteralStringRef("dc_id"))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, Optional(LiteralStringRef("machine_id"))); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, Optional(StringRef((const uint8_t*)&maxFieldLength, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_TIMEOUT, Optional(StringRef((const uint8_t*)&timeout, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_TIMEOUT, Optional(StringRef((const uint8_t*)&noTimeout, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_MAX_RETRY_DELAY, Optional(StringRef((const uint8_t*)&maxRetryDelay, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_SIZE_LIMIT, Optional(StringRef((const uint8_t*)&sizeLimit, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_RETRY_LIMIT, Optional(StringRef((const uint8_t*)&retryLimit, 8))); 
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_RETRY_LIMIT, Optional(StringRef((const uint8_t*)&noRetryLimit, 8))); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE); state Reference tr = data->db->createTransaction(); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_PRIORITY_SYSTEM_IMMEDIATE); @@ -1574,6 +1576,7 @@ struct UnitTestsFunc : InstructionFunc { tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, Optional(StringRef((const uint8_t*)&maxFieldLength, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TIMEOUT, Optional(StringRef((const uint8_t*)&timeout, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_RETRY_LIMIT, Optional(StringRef((const uint8_t*)&retryLimit, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_MAX_RETRY_DELAY, Optional(StringRef((const uint8_t*)&maxRetryDelay, 8))); diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index 5b5d988e6b..aef3f5ceda 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -793,13 +793,14 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { db.Options().SetMaxWatches(10001) db.Options().SetDatacenterId("dc_id") db.Options().SetMachineId("machine_id") + db.Options().SetSnapshotRywEnable() + db.Options().SetSnapshotRywDisable() + db.Options().SetTransactionLoggingMaxFieldLength(1000) db.Options().SetTransactionTimeout(100000) db.Options().SetTransactionTimeout(0) db.Options().SetTransactionMaxRetryDelay(100) db.Options().SetTransactionRetryLimit(10) 
db.Options().SetTransactionRetryLimit(-1) - db.Options().SetSnapshotRywEnable() - db.Options().SetSnapshotRywDisable() if !fdb.IsAPIVersionSelected() { log.Fatal("API version should be selected") @@ -836,6 +837,7 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { tr.Options().SetReadYourWritesDisable() tr.Options().SetReadSystemKeys() tr.Options().SetAccessSystemKeys() + tr.Options().SetTransactionLoggingMaxFieldLength(1000) tr.Options().SetTimeout(60 * 1000) tr.Options().SetRetryLimit(50) tr.Options().SetMaxRetryDelay(100) diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index 782b108fda..3435613de6 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -280,6 +280,23 @@ func (o DatabaseOptions) SetDatacenterId(param string) error { return o.setOpt(22, []byte(param)) } +// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior. +func (o DatabaseOptions) SetSnapshotRywEnable() error { + return o.setOpt(26, nil) +} + +// Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300. +func (o DatabaseOptions) SetSnapshotRywDisable() error { + return o.setOpt(27, nil) +} + +// Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This sets the ``transaction_logging_max_field_length`` option of each transaction created by this database. See the transaction option description for more information. +// +// Parameter: Maximum length of escaped key and value fields. +func (o DatabaseOptions) SetTransactionLoggingMaxFieldLength(param int64) error { + return o.setOpt(405, int64ToBytes(param)) +} + // Set a timeout in milliseconds which, when elapsed, will cause each transaction automatically to be cancelled. This sets the ``timeout`` option of each transaction created by this database. 
See the transaction option description for more information. Using this option requires that the API version is 610 or higher. // // Parameter: value in milliseconds of timeout @@ -308,16 +325,6 @@ func (o DatabaseOptions) SetTransactionSizeLimit(param int64) error { return o.setOpt(503, int64ToBytes(param)) } -// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior. -func (o DatabaseOptions) SetSnapshotRywEnable() error { - return o.setOpt(26, nil) -} - -// Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300. -func (o DatabaseOptions) SetSnapshotRywDisable() error { - return o.setOpt(27, nil) -} - // The transaction, if not self-conflicting, may be committed a second time after commit succeeds, in the event of a fault func (o TransactionOptions) SetCausalWriteRisky() error { return o.setOpt(10, nil) @@ -412,6 +419,13 @@ func (o TransactionOptions) SetLogTransaction() error { return o.setOpt(404, nil) } +// Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation. +// +// Parameter: Maximum length of escaped key and value fields. +func (o TransactionOptions) SetTransactionLoggingMaxFieldLength(param int64) error { + return o.setOpt(405, int64ToBytes(param)) +} + // Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. 
If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option. // // Parameter: value in milliseconds of timeout diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index 6e2b6e9318..7920d9d9dc 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -481,13 +481,14 @@ public class AsyncStackTester { db.options().setMaxWatches(10001); db.options().setDatacenterId("dc_id"); db.options().setMachineId("machine_id"); + db.options().setSnapshotRywEnable(); + db.options().setSnapshotRywDisable(); + db.options().setTransactionLoggingMaxFieldLength(1000); db.options().setTransactionTimeout(100000); db.options().setTransactionTimeout(0); db.options().setTransactionMaxRetryDelay(100); db.options().setTransactionRetryLimit(10); db.options().setTransactionRetryLimit(-1); - db.options().setSnapshotRywEnable(); - db.options().setSnapshotRywDisable(); tr.options().setPrioritySystemImmediate(); tr.options().setPriorityBatch(); @@ -496,6 +497,7 @@ public class AsyncStackTester { tr.options().setReadYourWritesDisable(); tr.options().setReadSystemKeys(); tr.options().setAccessSystemKeys(); + tr.options().setTransactionLoggingMaxFieldLength(1000); 
tr.options().setTimeout(60*1000); tr.options().setRetryLimit(50); tr.options().setMaxRetryDelay(100); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 20c9770bc2..a9cf47320f 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -434,13 +434,14 @@ public class StackTester { db.options().setMaxWatches(10001); db.options().setDatacenterId("dc_id"); db.options().setMachineId("machine_id"); + db.options().setSnapshotRywEnable(); + db.options().setSnapshotRywDisable(); + db.options().setTransactionLoggingMaxFieldLength(1000); db.options().setTransactionTimeout(100000); db.options().setTransactionTimeout(0); db.options().setTransactionMaxRetryDelay(100); db.options().setTransactionRetryLimit(10); db.options().setTransactionRetryLimit(-1); - db.options().setSnapshotRywEnable(); - db.options().setSnapshotRywDisable(); tr.options().setPrioritySystemImmediate(); tr.options().setPriorityBatch(); @@ -449,6 +450,7 @@ public class StackTester { tr.options().setReadYourWritesDisable(); tr.options().setReadSystemKeys(); tr.options().setAccessSystemKeys(); + tr.options().setTransactionLoggingMaxFieldLength(1000); tr.options().setTimeout(60*1000); tr.options().setRetryLimit(50); tr.options().setMaxRetryDelay(100); diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 95aa36ea3e..b5dc84dbd3 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -128,9 +128,13 @@ class Instruction: def test_db_options(db): + db.options.set_location_cache_size(100001) db.options.set_max_watches(100001) db.options.set_datacenter_id("dc_id") db.options.set_machine_id("machine_id") + db.options.set_snapshot_ryw_enable() + db.options.set_snapshot_ryw_disable() + db.options.set_transaction_logging_max_field_length(1000) 
db.options.set_transaction_timeout(100000) db.options.set_transaction_timeout(0) db.options.set_transaction_timeout(0) @@ -138,8 +142,6 @@ def test_db_options(db): db.options.set_transaction_size_limit(100000) db.options.set_transaction_retry_limit(10) db.options.set_transaction_retry_limit(-1) - db.options.set_snapshot_ryw_enable() - db.options.set_snapshot_ryw_disable() @fdb.transactional @@ -151,6 +153,7 @@ def test_options(tr): tr.options.set_read_your_writes_disable() tr.options.set_read_system_keys() tr.options.set_access_system_keys() + tr.options.set_transaction_logging_max_field_length(1000) tr.options.set_timeout(60 * 1000) tr.options.set_retry_limit(50) tr.options.set_max_retry_delay(100) @@ -545,8 +548,6 @@ class Tester: inst.push(b"WAITED_FOR_EMPTY") elif inst.op == six.u("UNIT_TESTS"): try: - db.options.set_location_cache_size(100001) - test_db_options(db) test_options(db) test_watches(db) diff --git a/bindings/ruby/tests/tester.rb b/bindings/ruby/tests/tester.rb index 829ecf8a5f..c199eddc09 100755 --- a/bindings/ruby/tests/tester.rb +++ b/bindings/ruby/tests/tester.rb @@ -456,14 +456,15 @@ class Tester @db.options.set_max_watches(10001) @db.options.set_datacenter_id("dc_id") @db.options.set_machine_id("machine_id") + @db.options.set_snapshot_ryw_enable() + @db.options.set_snapshot_ryw_disable() + @db.options.set_transaction_logging_max_field_length(1000) @db.options.set_transaction_timeout(100000) @db.options.set_transaction_timeout(0) @db.options.set_transaction_max_retry_delay(100) @db.options.set_transaction_size_limit(100000) @db.options.set_transaction_retry_limit(10) @db.options.set_transaction_retry_limit(-1) - @db.options.set_snapshot_ryw_enable() - @db.options.set_snapshot_ryw_disable() @db.transact do |tr| tr.options.set_priority_system_immediate @@ -473,6 +474,7 @@ class Tester tr.options.set_read_your_writes_disable tr.options.set_read_system_keys tr.options.set_access_system_keys + 
tr.options.set_transaction_logging_max_field_length(1000) tr.options.set_timeout(60*1000) tr.options.set_retry_limit(50) tr.options.set_max_retry_delay(100) diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 1e5d0243dc..a57b9a052b 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -49,6 +49,8 @@ .. |max-retry-delay-database-option| replace:: FIXME .. |transaction-size-limit-database-option| replace:: FIXME .. |timeout-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME .. include:: api-common.rst.inc diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 3c99c45382..3862fea779 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -326,6 +326,10 @@ If this option has been set more times with this database than the disable option, snapshot reads will *not* see the effects of prior writes in the same transaction. Disabling this option is equivalent to calling |snapshot-ryw-disable-transaction-option| on each transaction created by this database. +.. |option-db-tr-transaction-logging-max-field-length-blurb| replace:: + + Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This is equivalent to calling |transaction-logging-max-field-length-transaction-option| on each transaction created by this database. + .. |transaction-options-blurb| replace:: Transaction options alter the behavior of FoundationDB transactions. FoundationDB defaults to extremely safe transaction behavior, and we have worked hard to make the performance excellent with the default setting, so you should not often need to use transaction options. @@ -411,7 +415,7 @@ .. 
|option-set-timeout-blurb3| replace:: - Prior to API version 610, like all other transaction options, a timeout must be reset after a call to |on-error-func|. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured. If the API version is 610 or newer, then the timeout is not reset. This allows the user to specify a timeout for specific transactions that is longer than the timeout specified by |timeout-database-option|. Note that at all API versions, it is safe and legal to call this option after each call to |on-error-func|, so most code written assuming the older behavior can be upgraded without requiring any modification. This also means that there is no need to introduce logic to conditionally set this option within retry loops. One can set the default timeout for all transactions by calling |timeout-database-option|. + Prior to API version 610, like all other transaction options, a timeout must be reset after a call to |on-error-func|. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured. If the API version is 610 or newer, then the timeout is not reset. This allows the user to specify a timeout for specific transactions that is longer than the timeout specified by |timeout-database-option|. Note that at all API versions, it is safe and legal to call this option after each call to |on-error-func|, so most code written assuming the older behavior can be upgraded without requiring any modification. This also means that there is no need to introduce logic to conditionally set this option within retry loops. One can set the default timeout for all transactions by calling |timeout-database-option|. .. |option-next-write-no-write-conflict-range-blurb| replace:: @@ -421,6 +425,10 @@ Care needs to be taken when using this option on a transaction that is shared between multiple threads. 
When setting this option, write conflict ranges will be disabled on the next write operation, regardless of what thread it is on. +.. |option-set-transaction-logging-max-field-length-blurb| replace:: + + Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation. One can set the default max field length for all transactions by calling |transaction-logging-max-field-length-database-option|. + .. |future-blurb1| replace:: Many FoundationDB API functions return "future" objects. A brief overview of futures is included in the :doc:`class scheduling tutorial `. Most future objects behave just like a normal object, but block when you use them for the first time if the asynchronous function which returned the future has not yet completed its action. A future object is considered ready when either a value is available, or when an error has occurred. diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index 68ae70b1fa..a063c9f234 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -25,6 +25,7 @@ .. |timeout-database-option| replace:: :func:`Database.options.set_transaction_timeout` .. |max-retry-delay-database-option| replace:: :func:`Database.options.set_transaction_max_retry_delay` .. |transaction-size-limit-database-option| replace:: :func:`Database.options.set_transaction_size_limit` +.. |transaction-logging-max-field-length-database-option| replace:: :func:`Database.options.set_transaction_logging_max_field_length` .. |snapshot-ryw-enable-database-option| replace:: :func:`Database.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-database-option| replace:: :func:`Database.options.set_snapshot_ryw_disable` .. |future-type-string| replace:: a :ref:`future ` @@ -35,6 +36,7 @@ .. 
|size-limit-transaction-option| replace:: :func:`Transaction.options.set_size_limit` .. |snapshot-ryw-enable-transaction-option| replace:: :func:`Transaction.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-transaction-option| replace:: :func:`Transaction.options.set_snapshot_ryw_disable` +.. |transaction-logging-max-field-length-transaction-option| replace:: :func:`Transaction.options.set_transaction_logging_max_field_length` .. |lazy-iterator-object| replace:: generator .. |key-meth| replace:: :meth:`Subspace.key` .. |directory-subspace| replace:: :ref:`DirectorySubspace ` @@ -384,6 +386,10 @@ Database options |option-db-tr-size-limit-blurb| +.. method:: Database.options.set_transaction_logging_max_field_length(size_limit) + + |option-db-tr-transaction-logging-max-field-length-blurb| + .. method:: Database.options.set_snapshot_ryw_enable() |option-db-snapshot-ryw-enable-blurb| @@ -855,6 +861,10 @@ Transaction options |option-set-timeout-blurb3| +.. method:: Transaction.options.set_transaction_logging_max_field_length(size_limit) + + |option-set-transaction-logging-max-field-length-blurb| + .. _api-python-future: Future objects diff --git a/documentation/sphinx/source/api-ruby.rst b/documentation/sphinx/source/api-ruby.rst index cc35ad68b8..d363feecd1 100644 --- a/documentation/sphinx/source/api-ruby.rst +++ b/documentation/sphinx/source/api-ruby.rst @@ -25,6 +25,7 @@ .. |transaction-size-limit-database-option| replace:: :func:`Database.options.set_transaction_size_limit` .. |snapshot-ryw-enable-database-option| replace:: :meth:`Database.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-database-option| replace:: :meth:`Database.options.set_snapshot_ryw_disable` +.. |transaction-logging-max-field-length-database-option| replace:: :meth:`Database.options.set_transaction_logging_max_field_length` .. |future-type-string| replace:: a :class:`Future` .. 
|read-your-writes-disable-option| replace:: :meth:`Transaction.options.set_read_your_writes_disable` .. |retry-limit-transaction-option| replace:: :meth:`Transaction.options.set_retry_limit` @@ -33,6 +34,7 @@ .. |size-limit-transaction-option| replace:: :meth:`Transaction.options.set_size_limit` .. |snapshot-ryw-enable-transaction-option| replace:: :meth:`Transaction.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-transaction-option| replace:: :meth:`Transaction.options.set_snapshot_ryw_disable` +.. |transaction-logging-max-field-length-transaction-option| replace:: :meth:`Transaction.options.set_transaction_logging_max_field_length` .. |lazy-iterator-object| replace:: :class:`Enumerator` .. |key-meth| replace:: :meth:`Subspace.key` .. |directory-subspace| replace:: :class:`DirectorySubspace` @@ -380,6 +382,10 @@ Database options |option-db-tr-size-limit-blurb| +.. method:: Database.options.set_transaction_logging_max_field_length(size_limit) -> nil + + |option-db-tr-transaction-logging-max-field-length-blurb| + .. method:: Database.options.set_snapshot_ryw_enable() -> nil |option-db-snapshot-ryw-enable-blurb| @@ -797,6 +803,10 @@ Transaction options |option-set-timeout-blurb3| +.. method:: Transaction.options.set_transaction_logging_max_field_length(size_limit) -> nil + + |option-set-transaction-logging-max-field-length-blurb| + .. _transact: The transact method diff --git a/documentation/sphinx/source/data-modeling.rst b/documentation/sphinx/source/data-modeling.rst index 51867006af..8fcde06958 100644 --- a/documentation/sphinx/source/data-modeling.rst +++ b/documentation/sphinx/source/data-modeling.rst @@ -51,6 +51,8 @@ .. |max-retry-delay-database-option| replace:: FIXME .. |transaction-size-limit-database-option| replace:: FIXME .. |timeout-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME +.. |transaction-logging-max-field-length-database-option| replace:: FIXME ..
include:: api-common.rst.inc diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index e8d6335b6f..0d7f24f661 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -51,6 +51,8 @@ .. |max-retry-delay-database-option| replace:: FIXME .. |transaction-size-limit-database-option| replace:: FIXME .. |timeout-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME +.. |transaction-logging-max-field-length-database-option| replace:: FIXME .. include:: api-common.rst.inc diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 9817574b41..e53eeab2ca 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -30,6 +30,7 @@ Bindings * Go: The Go bindings now require Go version 1.11 or later. * Go: Fix issue with finalizers running too early that could lead to undefined behavior. `(PR #1451) `_. +* Added transaction option to control the field length of keys and values in debug transaction logging in order to avoid truncation. `(PR #) `_. Other Changes ------------- diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 5d62be9a8b..a3bf12c493 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3194,11 +3194,14 @@ int main(int argc, char* argv[]) { } TraceEvent("ProgramStart") + .setMaxEventLength(12000) .detail("SourceVersion", getHGVersion()) .detail("Version", FDB_VT_VERSION ) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 
0 : time(NULL)) + .setMaxFieldLength(10000) .detail("CommandLine", commandLine) + .setMaxFieldLength(0) .detail("MemoryLimit", memLimit) .trackLatest("ProgramStart"); diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 67611efab1..9bbd0977a4 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -2529,12 +2529,14 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (opt.trace) { TraceEvent("CLIProgramStart") + .setMaxEventLength(12000) .detail("SourceVersion", getHGVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL)) .detail("ClusterFile", ccf->getFilename().c_str()) .detail("ConnectionString", ccf->getConnectionString().toString()) + .setMaxFieldLength(10000) .detail("CommandLine", opt.commandLine) .trackLatest("ProgramStart"); } diff --git a/fdbclient/ClientLogEvents.h b/fdbclient/ClientLogEvents.h index 14191bf209..9be2b78798 100644 --- a/fdbclient/ClientLogEvents.h +++ b/fdbclient/ClientLogEvents.h @@ -44,7 +44,7 @@ namespace FdbClientLogEvents { EventType type{ EVENTTYPEEND }; double startTs{ 0 }; - void logEvent(std::string id) const {} + void logEvent(std::string id, int maxFieldLength) const {} }; struct EventGetVersion : public Event { @@ -60,8 +60,10 @@ namespace FdbClientLogEvents { double latency; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetVersion").detail("TransactionID", id).detail("Latency", latency); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetVersion") + .detail("TransactionID", id) + .detail("Latency", latency); } }; @@ -80,8 +82,14 @@ namespace FdbClientLogEvents { int valueSize; Key key; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_Get").detail("TransactionID", id).detail("Latency", latency).detail("ValueSizeBytes", valueSize).detail("Key", key); + void logEvent(std::string id, int 
maxFieldLength) const override { + TraceEvent("TransactionTrace_Get") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("Latency", latency) + .detail("ValueSizeBytes", valueSize) + .setMaxFieldLength(maxFieldLength) + .detail("Key", key); } }; @@ -101,8 +109,15 @@ namespace FdbClientLogEvents { Key startKey; Key endKey; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetRange").detail("TransactionID", id).detail("Latency", latency).detail("RangeSizeBytes", rangeSize).detail("StartKey", startKey).detail("EndKey", endKey); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("Latency", latency) + .detail("RangeSizeBytes", rangeSize) + .setMaxFieldLength(maxFieldLength) + .detail("StartKey", startKey) + .detail("EndKey", endKey); } }; @@ -122,20 +137,38 @@ namespace FdbClientLogEvents { int commitBytes; CommitTransactionRequest req; // Only CommitTransactionRef and Arena object within CommitTransactionRequest is serialized - void logEvent(std::string id) const { + void logEvent(std::string id, int maxFieldLength) const override { for (auto &read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_ReadConflictRange").detail("TransactionID", id).detail("Begin", read_range.begin).detail("End", read_range.end); + TraceEvent("TransactionTrace_Commit_ReadConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Begin", read_range.begin) + .detail("End", read_range.end); } for (auto &write_range : req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_WriteConflictRange").detail("TransactionID", id).detail("Begin", write_range.begin).detail("End", write_range.end); + TraceEvent("TransactionTrace_Commit_WriteConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + 
.setMaxFieldLength(maxFieldLength) + .detail("Begin", write_range.begin) + .detail("End", write_range.end); } for (auto &mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_Commit_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString()); + TraceEvent("TransactionTrace_Commit_Mutation") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Mutation", mutation.toString()); } - TraceEvent("TransactionTrace_Commit").detail("TransactionID", id).detail("Latency", latency).detail("NumMutations", numMutations).detail("CommitSizeBytes", commitBytes); + TraceEvent("TransactionTrace_Commit") + .detail("TransactionID", id) + .detail("Latency", latency) + .detail("NumMutations", numMutations) + .detail("CommitSizeBytes", commitBytes); } }; @@ -153,8 +186,13 @@ namespace FdbClientLogEvents { int errCode; Key key; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetError").detail("TransactionID", id).detail("ErrCode", errCode).detail("Key", key); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetError") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("ErrCode", errCode) + .setMaxFieldLength(maxFieldLength) + .detail("Key", key); } }; @@ -173,8 +211,14 @@ namespace FdbClientLogEvents { Key startKey; Key endKey; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetRangeError").detail("TransactionID", id).detail("ErrCode", errCode).detail("StartKey", startKey).detail("EndKey", endKey); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetRangeError") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("ErrCode", errCode) + .setMaxFieldLength(maxFieldLength) + .detail("StartKey", startKey) + .detail("EndKey", endKey); } }; @@ -192,20 +236,36 @@ namespace FdbClientLogEvents { int errCode; CommitTransactionRequest req; // 
Only CommitTransactionRef and Arena object within CommitTransactionRequest is serialized - void logEvent(std::string id) const { + void logEvent(std::string id, int maxFieldLength) const override { for (auto &read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_CommitError_ReadConflictRange").detail("TransactionID", id).detail("Begin", read_range.begin).detail("End", read_range.end); + TraceEvent("TransactionTrace_CommitError_ReadConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Begin", read_range.begin) + .detail("End", read_range.end); } for (auto &write_range : req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_CommitError_WriteConflictRange").detail("TransactionID", id).detail("Begin", write_range.begin).detail("End", write_range.end); + TraceEvent("TransactionTrace_CommitError_WriteConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Begin", write_range.begin) + .detail("End", write_range.end); } for (auto &mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_CommitError_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString()); + TraceEvent("TransactionTrace_CommitError_Mutation") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Mutation", mutation.toString()); } - TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode); + TraceEvent("TransactionTrace_CommitError") + .detail("TransactionID", id) + .detail("ErrCode", errCode); } }; } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c00fffafa9..ea9923a462 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2994,6 +2994,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional(new 
TransactionLogInfo(value.get().printable(), TransactionLogInfo::DONT_LOG)); + trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength; } break; @@ -3008,6 +3009,20 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional::max()); + if(maxFieldLength == 0) { + throw invalid_option_value(); + } + options.maxTransactionLoggingFieldLength = maxFieldLength; + } + if(trLogInfo) { + trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength; + } + break; + case FDBTransactionOptions::MAX_RETRY_DELAY: validateOptionValue(value, true); options.maxBackoff = extractIntOption(value, 0, std::numeric_limits::max()) / 1000.0; diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 218f1f09d9..1338ffafca 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -149,6 +149,7 @@ struct TransactionOptions { double maxBackoff; uint32_t getReadVersionFlags; uint32_t sizeLimit; + int maxTransactionLoggingFieldLength; bool checkWritesEnabled : 1; bool causalWriteRisky : 1; bool commitOnFirstProxy : 1; @@ -174,17 +175,18 @@ struct TransactionInfo { struct TransactionLogInfo : public ReferenceCounted, NonCopyable { enum LoggingLocation { DONT_LOG = 0, TRACE_LOG = 1, DATABASE = 2 }; - TransactionLogInfo() : logLocation(DONT_LOG) {} - TransactionLogInfo(LoggingLocation location) : logLocation(location) {} - TransactionLogInfo(std::string id, LoggingLocation location) : logLocation(location), identifier(id) {} + TransactionLogInfo() : logLocation(DONT_LOG), maxFieldLength(0) {} + TransactionLogInfo(LoggingLocation location) : logLocation(location), maxFieldLength(0) {} + TransactionLogInfo(std::string id, LoggingLocation location) : logLocation(location), identifier(id), maxFieldLength(0) {} void setIdentifier(std::string id) { identifier = id; } void logTo(LoggingLocation loc) { logLocation = logLocation | loc; } + template void addLog(const T& event) { if(logLocation & TRACE_LOG) { 
ASSERT(!identifier.empty()) - event.logEvent(identifier); + event.logEvent(identifier, maxFieldLength); } if (flushed) { @@ -202,6 +204,7 @@ struct TransactionLogInfo : public ReferenceCounted, NonCopy bool logsAdded{ false }; bool flushed{ false }; int logLocation; + int maxFieldLength; std::string identifier; }; diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 775e443137..6044403dc6 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -146,6 +146,15 @@ description is not currently required but encouraged.