Merge branch 'master' into fdb_cache_subfeature2

negoyal 2020-05-04 17:29:43 -07:00
commit dd033736ed
182 changed files with 8408 additions and 4764 deletions


@ -504,4 +504,16 @@ Armon Dadgar (ART)
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (C) 2009 The Guava Authors
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied. See the License for the specific language governing permissions and limitations under
the License.


@ -146,6 +146,10 @@ set(SEED "0x${SEED_}" CACHE STRING "Random seed for testing")
# components
################################################################################
if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
include_directories(/usr/local/include)
endif()
include(CompileBoost)
add_subdirectory(flow)
add_subdirectory(fdbrpc)
@ -173,6 +177,10 @@ else()
include(CPack)
endif()
if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
add_link_options(-lexecinfo)
endif()
################################################################################
# process compile commands for IDE
################################################################################


@ -9,4 +9,4 @@ set(SRCS
FDBLibTLSVerify.h)
add_library(FDBLibTLS STATIC ${SRCS})
target_link_libraries(FDBLibTLS PUBLIC LibreSSL boost_target PRIVATE flow)
target_link_libraries(FDBLibTLS PUBLIC OpenSSL::SSL boost_target PRIVATE flow)


@ -123,6 +123,37 @@ cmake -G Xcode -DOPEN_FOR_IDE=ON <FDB_SOURCE_DIRECTORY>
You should create a second build-directory which you will use for building
(probably with make or ninja) and debugging.
#### FreeBSD
1. Check out this repo on your server.
1. Install compile-time dependencies from ports.
1. (Optional) Use tmpfs & ccache for significantly faster repeat builds
1. (Optional) Install a [JDK](https://www.freshports.org/java/openjdk8/)
for Java Bindings. FoundationDB currently builds with Java 8.
1. Navigate to the directory where you checked out the foundationdb
repo.
1. Build from source.
```shell
sudo pkg install -r FreeBSD \
shells/bash devel/cmake devel/ninja devel/ccache \
lang/mono lang/python3 \
devel/boost-libs devel/libeio \
security/openssl
mkdir .build && cd .build
cmake -G Ninja \
-DUSE_CCACHE=on \
-DDISABLE_TLS=off \
-DUSE_DTRACE=off \
..
ninja -j 10
# run fast tests
ctest -L fast
# run all tests
ctest --output-on-failure -v
```
### Linux
There are no special requirements for Linux. A docker image can be pulled from
@ -206,37 +237,3 @@ will automatically find it and build with TLS support.
If you installed WIX before running `cmake` you should find the
`FDBInstaller.msi` in your build directory under `packaging/msi`.
## Makefile (Deprecated - all users should transition to using cmake)
#### MacOS
1. Check out this repo on your Mac.
1. Install the Xcode command-line tools.
1. Download version 1.67.0 of [Boost](https://sourceforge.net/projects/boost/files/boost/1.67.0/).
1. Set the `BOOSTDIR` environment variable to the location containing this boost installation.
1. Install [Mono](http://www.mono-project.com/download/stable/).
1. Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8.
1. Navigate to the directory where you checked out the foundationdb repo.
1. Run `make`.
#### Linux
1. Install [Docker](https://www.docker.com/).
1. Check out the foundationdb repo.
1. Run the docker image interactively with [Docker Run](https://docs.docker.com/engine/reference/run/#general-form), and with the directory containing the foundationdb repo mounted via [Docker Mounts](https://docs.docker.com/storage/volumes/).
```shell
docker run -it -v '/local/dir/path/foundationdb:/docker/dir/path/foundationdb' foundationdb/foundationdb-build:latest
```
1. Run `$ scl enable devtoolset-8 python27 rh-python36 rh-ruby24 -- bash` within the running container. This enables a more modern compiler, which is required to build FoundationDB.
1. Navigate to the container's mounted directory which contains the foundationdb repo.
```shell
cd /docker/dir/path/foundationdb
```
1. Run `make`.
This will build the fdbserver binary and the python bindings. If you want to build our other bindings, you will need to install a runtime for the language whose binding you want to build. Each binding has an `.mk` file which provides specific targets for that binding.


@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
######################################################
#
# FoundationDB Binding Test Script


@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
LOGGING_LEVEL=WARNING


@ -61,7 +61,7 @@ def write_windows_asm(asmfile, functions):
def write_unix_asm(asmfile, functions, prefix):
asmfile.write(".intel_syntax noprefix\n")
if platform == "linux":
if platform == "linux" or platform == "freebsd":
asmfile.write("\n.data\n")
for f in functions:
asmfile.write("\t.extern fdb_api_ptr_%s\n" % f)


@ -12,6 +12,9 @@
#if defined(__linux__)
#include <linux/limits.h>
#elif defined(__FreeBSD__)
#include <sys/stat.h>
#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST
#elif defined(__APPLE__)
#include <sys/syslimits.h>
#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC


@ -99,6 +99,8 @@ function(build_go_package)
endif()
add_custom_command(OUTPUT ${outfile}
COMMAND ${CMAKE_COMMAND} -E env ${go_env}
${GO_EXECUTABLE} get -d ${GO_IMPORT_PATH}/${BGP_PATH} &&
${CMAKE_COMMAND} -E env ${go_env}
${GO_EXECUTABLE} install ${GO_IMPORT_PATH}/${BGP_PATH}
DEPENDS ${fdb_options_file}
COMMENT "Building ${BGP_NAME}")


@ -25,6 +25,9 @@ platform=$(uname)
if [[ "${platform}" == "Darwin" ]] ; then
FDBLIBDIR="${FDBLIBDIR:-/usr/local/lib}"
libfdbc="libfdb_c.dylib"
elif [[ "${platform}" == "FreeBSD" ]] ; then
FDBLIBDIR="${FDBLIBDIR:-/lib}"
libfdbc="libfdb_c.so"
elif [[ "${platform}" == "Linux" ]] ; then
libfdbc="libfdb_c.so"
custom_libdir="${FDBLIBDIR:-}"
@ -248,8 +251,11 @@ else
:
elif [[ "${status}" -eq 0 ]] ; then
echo "Building generated files."
if [[ "${platform}" == "FreeBSD" ]] ; then
cmd=( 'gmake' '-C' "${fdbdir}" 'bindings/c/foundationdb/fdb_c_options.g.h' )
else
cmd=( 'make' '-C' "${fdbdir}" 'bindings/c/foundationdb/fdb_c_options.g.h' )
fi
echo "${cmd[*]}"
if ! "${cmd[@]}" ; then
let status="${status} + 1"


@ -23,6 +23,8 @@
package directory
import (
"fmt"
"strings"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
)
@ -43,6 +45,18 @@ type directorySubspace struct {
layer []byte
}
// String implements the fmt.Stringer interface and returns human-readable
// string representation of this object.
func (ds directorySubspace) String() string {
var path string
if len(ds.path) > 0 {
path = "(" + strings.Join(ds.path, ",") + ")"
} else {
path = "nil"
}
return fmt.Sprintf("DirectorySubspace(%s, %s)", path, fdb.Printable(ds.Bytes()))
}
func (d directorySubspace) CreateOrOpen(t fdb.Transactor, path []string, layer []byte) (DirectorySubspace, error) {
return d.dl.CreateOrOpen(t, d.dl.partitionSubpath(d.path, path), layer)
}


@ -268,6 +268,7 @@ type futureKeyValueArray struct {
*future
}
//go:nocheckptr
func stringRefToSlice(ptr unsafe.Pointer) []byte {
size := *((*C.int)(unsafe.Pointer(uintptr(ptr) + 8)))


@ -304,7 +304,7 @@ func (o DatabaseOptions) SetTransactionTimeout(param int64) error {
return o.setOpt(500, int64ToBytes(param))
}
// Set a timeout in milliseconds which, when elapsed, will cause a transaction automatically to be cancelled. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information.
// Set a maximum number of retries after which additional calls to ``onError`` will throw the most recently seen error code. This sets the ``retry_limit`` option of each transaction created by this database. See the transaction option description for more information.
//
// Parameter: number of times to retry
func (o DatabaseOptions) SetTransactionRetryLimit(param int64) error {
@ -330,7 +330,7 @@ func (o DatabaseOptions) SetTransactionCausalReadRisky() error {
return o.setOpt(504, nil)
}
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect.
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect.
func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}
@ -350,7 +350,7 @@ func (o TransactionOptions) SetCausalReadDisable() error {
return o.setOpt(21, nil)
}
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect.
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect.
func (o TransactionOptions) SetIncludePortInAddress() error {
return o.setOpt(23, nil)
}
@ -429,7 +429,7 @@ func (o TransactionOptions) SetDebugTransactionIdentifier(param string) error {
return o.setOpt(403, []byte(param))
}
// Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled and to get log output.
// Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled to get log output.
func (o TransactionOptions) SetLogTransaction() error {
return o.setOpt(404, nil)
}
@ -479,7 +479,7 @@ func (o TransactionOptions) SetSnapshotRywDisable() error {
return o.setOpt(601, nil)
}
// The transaction can read and write to locked databases, and is resposible for checking that it took the lock.
// The transaction can read and write to locked databases, and is responsible for checking that it took the lock.
func (o TransactionOptions) SetLockAware() error {
return o.setOpt(700, nil)
}
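
For context on how these generated option setters are consumed, here is a minimal usage sketch. It is not part of this diff: `fdb.MustAPIVersion`, `fdb.MustOpenDefault`, and `db.Options()` are assumed from the wider Go binding.

```go
package main

import "github.com/apple/foundationdb/bindings/go/src/fdb"

func main() {
	fdb.MustAPIVersion(630)     // assumed API-version call from the Go binding
	db := fdb.MustOpenDefault() // assumed convenience opener, not shown in this diff

	// retry_limit as documented above: cap how many onError retries each
	// transaction created by this database will attempt before failing.
	if err := db.Options().SetTransactionRetryLimit(5); err != nil {
		panic(err)
	}
	// Timeout in milliseconds, likewise applied to every transaction.
	if err := db.Options().SetTransactionTimeout(10000); err != nil {
		panic(err)
	}
}
```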


@ -35,6 +35,8 @@ package subspace
import (
"bytes"
"errors"
"fmt"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/tuple"
)
@ -82,7 +84,7 @@ type Subspace interface {
}
type subspace struct {
b []byte
rawPrefix []byte
}
// AllKeys returns the Subspace corresponding to all keys in a FoundationDB
@ -105,40 +107,46 @@ func FromBytes(b []byte) Subspace {
return subspace{s}
}
// String implements the fmt.Stringer interface and return the subspace
// as a human readable byte string provided by fdb.Printable.
func (s subspace) String() string {
return fmt.Sprintf("Subspace(rawPrefix=%s)", fdb.Printable(s.rawPrefix))
}
func (s subspace) Sub(el ...tuple.TupleElement) Subspace {
return subspace{concat(s.Bytes(), tuple.Tuple(el).Pack()...)}
}
func (s subspace) Bytes() []byte {
return s.b
return s.rawPrefix
}
func (s subspace) Pack(t tuple.Tuple) fdb.Key {
return fdb.Key(concat(s.b, t.Pack()...))
return fdb.Key(concat(s.rawPrefix, t.Pack()...))
}
func (s subspace) PackWithVersionstamp(t tuple.Tuple) (fdb.Key, error) {
return t.PackWithVersionstamp(s.b)
return t.PackWithVersionstamp(s.rawPrefix)
}
func (s subspace) Unpack(k fdb.KeyConvertible) (tuple.Tuple, error) {
key := k.FDBKey()
if !bytes.HasPrefix(key, s.b) {
if !bytes.HasPrefix(key, s.rawPrefix) {
return nil, errors.New("key is not in subspace")
}
return tuple.Unpack(key[len(s.b):])
return tuple.Unpack(key[len(s.rawPrefix):])
}
func (s subspace) Contains(k fdb.KeyConvertible) bool {
return bytes.HasPrefix(k.FDBKey(), s.b)
return bytes.HasPrefix(k.FDBKey(), s.rawPrefix)
}
func (s subspace) FDBKey() fdb.Key {
return fdb.Key(s.b)
return fdb.Key(s.rawPrefix)
}
func (s subspace) FDBRangeKeys() (fdb.KeyConvertible, fdb.KeyConvertible) {
return fdb.Key(concat(s.b, 0x00)), fdb.Key(concat(s.b, 0xFF))
return fdb.Key(concat(s.rawPrefix, 0x00)), fdb.Key(concat(s.rawPrefix, 0xFF))
}
func (s subspace) FDBRangeKeySelectors() (fdb.Selectable, fdb.Selectable) {


@ -0,0 +1,15 @@
package subspace
import (
"fmt"
"testing"
)
func TestSubspaceString(t *testing.T) {
printed := fmt.Sprint(Sub([]byte("hello"), "world", 42, 0x99))
expected := "Subspace(rawPrefix=\\x01hello\\x00\\x02world\\x00\\x15*\\x15\\x99)"
if printed != expected {
t.Fatalf("printed subspace result differs, expected %v, got %v", expected, printed)
}
}


@ -406,6 +406,9 @@ func (t *transaction) getApproximateSize() FutureInt64 {
}
}
// Returns a future that is the approximate transaction size so far in this
// transaction, which is the summation of the estimated size of mutations,
// read conflict ranges, and write conflict ranges.
func (t Transaction) GetApproximateSize() FutureInt64 {
return t.getApproximateSize()
}
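
A brief sketch of calling the newly documented method inside a transaction; `db.Transact`, `MustOpenDefault`, and `FutureInt64.Get` are assumed from the existing binding rather than introduced by this change:

```go
package main

import (
	"fmt"

	"github.com/apple/foundationdb/bindings/go/src/fdb"
)

func main() {
	fdb.MustAPIVersion(630)     // assumed
	db := fdb.MustOpenDefault() // assumed

	size, err := db.Transact(func(tr fdb.Transaction) (interface{}, error) {
		tr.Set(fdb.Key("hello"), []byte("world"))
		// Approximate commit size so far: mutations plus read and write
		// conflict ranges, as the new comment describes.
		return tr.GetApproximateSize().Get()
	})
	if err != nil {
		panic(err)
	}
	fmt.Println("approximate size:", size)
}
```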


@ -43,6 +43,8 @@ import (
"fmt"
"math"
"math/big"
"strconv"
"strings"
"github.com/apple/foundationdb/bindings/go/src/fdb"
)
@ -66,6 +68,48 @@ type TupleElement interface{}
// packing T (modulo type normalization to []byte, uint64, and int64).
type Tuple []TupleElement
// String implements the fmt.Stringer interface and returns human-readable
// string representation of this tuple. For most elements, we use the
// object's default string representation.
func (tuple Tuple) String() string {
sb := strings.Builder{}
printTuple(tuple, &sb)
return sb.String()
}
func printTuple(tuple Tuple, sb *strings.Builder) {
sb.WriteString("(")
for i, t := range tuple {
switch t := t.(type) {
case Tuple:
printTuple(t, sb)
case nil:
sb.WriteString("<nil>")
case string:
sb.WriteString(strconv.Quote(t))
case UUID:
sb.WriteString("UUID(")
sb.WriteString(t.String())
sb.WriteString(")")
case []byte:
sb.WriteString("b\"")
sb.WriteString(fdb.Printable(t))
sb.WriteString("\"")
default:
// For user-defined and standard types, we use standard Go
// printer, which itself uses Stringer interface.
fmt.Fprintf(sb, "%v", t)
}
if (i < len(tuple) - 1) {
sb.WriteString(", ")
}
}
sb.WriteString(")")
}
// UUID wraps a basic byte array as a UUID. We do not provide any special
// methods for accessing or generating the UUID, but as Go does not provide
// a built-in UUID type, this simple wrapper allows for other libraries
@ -73,6 +117,10 @@ type Tuple []TupleElement
// an instance of this type.
type UUID [16]byte
func (uuid UUID) String() string {
return fmt.Sprintf("%x-%x-%x-%x-%x", uuid[0:4], uuid[4:6], uuid[6:8], uuid[8:10], uuid[10:])
}
// Versionstamp is struct for a FoundationDB verionstamp. Versionstamps are
// 12 bytes long composed of a 10 byte transaction version and a 2 byte user
// version. The transaction version is filled in at commit time and the user
@ -82,6 +130,11 @@ type Versionstamp struct {
UserVersion uint16
}
// Returns a human-readable string for this Versionstamp.
func (vs Versionstamp) String() string {
return fmt.Sprintf("Versionstamp(%s, %d)", fdb.Printable(vs.TransactionVersion[:]), vs.UserVersion)
}
var incompleteTransactionVersion = [10]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}
const versionstampLength = 12


@ -4,6 +4,7 @@ import (
"bytes"
"encoding/gob"
"flag"
"fmt"
"math/rand"
"os"
"testing"
@ -118,3 +119,38 @@ func BenchmarkTuplePacking(b *testing.B) {
})
}
}
func TestTupleString(t *testing.T) {
testCases := []struct {
input Tuple
expected string
}{
{
Tuple{[]byte("hello"), "world", 42, 0x99},
"(b\"hello\", \"world\", 42, 153)",
},
{
Tuple{nil, Tuple{"Ok", Tuple{1, 2}, "Go"}, 42, 0x99},
"(<nil>, (\"Ok\", (1, 2), \"Go\"), 42, 153)",
},
{
Tuple{"Bool", true, false},
"(\"Bool\", true, false)",
},
{
Tuple{"UUID", testUUID},
"(\"UUID\", UUID(1100aabb-ccdd-eeff-1100-aabbccddeeff))",
},
{
Tuple{"Versionstamp", Versionstamp{[10]byte{0, 0, 0, 0xaa, 0, 0xbb, 0, 0xcc, 0, 0xdd}, 620}},
"(\"Versionstamp\", Versionstamp(\\x00\\x00\\x00\\xaa\\x00\\xbb\\x00\\xcc\\x00\\xdd, 620))",
},
}
for _, testCase := range testCases {
printed := fmt.Sprint(testCase.input)
if printed != testCase.expected {
t.Fatalf("printed tuple result differs, expected %v, got %v", testCase.expected, printed)
}
}
}


@ -56,6 +56,7 @@ set(JAVA_BINDING_SRCS
src/main/com/apple/foundationdb/testing/Promise.java
src/main/com/apple/foundationdb/testing/PerfMetric.java
src/main/com/apple/foundationdb/tuple/ByteArrayUtil.java
src/main/com/apple/foundationdb/tuple/FastByteComparisons.java
src/main/com/apple/foundationdb/tuple/IterableComparator.java
src/main/com/apple/foundationdb/tuple/package-info.java
src/main/com/apple/foundationdb/tuple/StringUtil.java


@ -36,6 +36,11 @@ static JavaVM* g_jvm = nullptr;
static thread_local JNIEnv* g_thread_jenv = nullptr; // Defined for the network thread once it is running, and for any thread that has called registerCallback
static thread_local jmethodID g_IFutureCallback_call_methodID = JNI_NULL;
static thread_local bool is_external = false;
static jclass range_result_summary_class;
static jclass range_result_class;
static jclass string_class;
static jmethodID range_result_init;
static jmethodID range_result_summary_init;
void detachIfExternalThread(void *ignore) {
if(is_external && g_thread_jenv != nullptr) {
@ -275,10 +280,9 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureStrings_FutureString
return JNI_NULL;
}
jclass str_clazz = jenv->FindClass("java/lang/String");
if( jenv->ExceptionOccurred() )
return JNI_NULL;
jobjectArray arr = jenv->NewObjectArray(count, str_clazz, JNI_NULL);
jobjectArray arr = jenv->NewObjectArray(count, string_class, JNI_NULL);
if( !arr ) {
if( !jenv->ExceptionOccurred() )
throwOutOfMem(jenv);
@ -306,13 +310,6 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResult
throwParamNotNull(jenv);
return JNI_NULL;
}
jclass resultCls = jenv->FindClass("com/apple/foundationdb/RangeResultSummary");
if( jenv->ExceptionOccurred() )
return JNI_NULL;
jmethodID resultCtorId = jenv->GetMethodID(resultCls, "<init>", "([BIZ)V");
if( jenv->ExceptionOccurred() )
return JNI_NULL;
FDBFuture *f = (FDBFuture *)future;
@ -337,7 +334,7 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResult
jenv->SetByteArrayRegion(lastKey, 0, kvs[count - 1].key_length, (jbyte *)kvs[count - 1].key);
}
jobject result = jenv->NewObject(resultCls, resultCtorId, lastKey, count, (jboolean)more);
jobject result = jenv->NewObject(range_result_summary_class, range_result_summary_init, lastKey, count, (jboolean)more);
if( jenv->ExceptionOccurred() )
return JNI_NULL;
@ -350,9 +347,6 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResult
throwParamNotNull(jenv);
return JNI_NULL;
}
jclass resultCls = jenv->FindClass("com/apple/foundationdb/RangeResult");
jmethodID resultCtorId = jenv->GetMethodID(resultCls, "<init>", "([B[IZ)V");
FDBFuture *f = (FDBFuture *)future;
@ -414,7 +408,7 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResult
jenv->ReleaseByteArrayElements(keyValueArray, (jbyte *)keyvalues_barr, 0);
jenv->ReleaseIntArrayElements(lengthArray, length_barr, 0);
jobject result = jenv->NewObject(resultCls, resultCtorId, keyValueArray, lengthArray, (jboolean)more);
jobject result = jenv->NewObject(range_result_class, range_result_init, keyValueArray, lengthArray, (jboolean)more);
if( jenv->ExceptionOccurred() )
return JNI_NULL;
@ -1042,8 +1036,43 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDB_Network_1stop(JNIEnv *jen
}
jint JNI_OnLoad(JavaVM *vm, void *reserved) {
JNIEnv *env;
g_jvm = vm;
return JNI_VERSION_1_1;
if (vm->GetEnv((void**)&env, JNI_VERSION_1_6) != JNI_OK) {
return JNI_ERR;
} else {
jclass local_range_result_class = env->FindClass("com/apple/foundationdb/RangeResult");
range_result_init = env->GetMethodID(local_range_result_class, "<init>", "([B[IZ)V");
range_result_class = (jclass) (env)->NewGlobalRef(local_range_result_class);
jclass local_range_result_summary_class = env->FindClass("com/apple/foundationdb/RangeResultSummary");
range_result_summary_init = env->GetMethodID(local_range_result_summary_class, "<init>", "([BIZ)V");
range_result_summary_class = (jclass) (env)->NewGlobalRef(local_range_result_summary_class);
jclass local_string_class = env->FindClass("java/lang/String");
string_class = (jclass) (env)->NewGlobalRef(local_string_class);
return JNI_VERSION_1_6;
}
}
// Is automatically called once the Classloader is destroyed
void JNI_OnUnload(JavaVM *vm, void *reserved) {
JNIEnv* env;
if (vm->GetEnv((void**)&env, JNI_VERSION_1_6) != JNI_OK) {
return;
} else {
// delete global references so the GC can collect them
if (range_result_summary_class != NULL) {
env->DeleteGlobalRef(range_result_summary_class);
}
if (range_result_class != NULL) {
env->DeleteGlobalRef(range_result_class);
}
if (string_class != NULL) {
env->DeleteGlobalRef(string_class);
}
}
}
#ifdef __cplusplus


@ -304,4 +304,58 @@ public class ArrayUtilTests {
fail("Not yet implemented");
}
private static final int SAMPLE_COUNT = 1000000;
private static final int SAMPLE_MAX_SIZE = 2048;
private List<byte[]> unsafe;
private List<byte[]> java;
@Before
public void init() {
unsafe = new ArrayList(SAMPLE_COUNT);
java = new ArrayList(SAMPLE_COUNT);
Random random = new Random();
for (int i = 0; i <= SAMPLE_COUNT; i++) {
byte[] addition = new byte[random.nextInt(SAMPLE_MAX_SIZE)];
random.nextBytes(addition);
unsafe.add(addition);
java.add(addition);
}
}
@Test
public void testComparatorSort() {
Collections.sort(unsafe, FastByteComparisons.lexicographicalComparerUnsafeImpl());
Collections.sort(java, FastByteComparisons.lexicographicalComparerJavaImpl());
Assert.assertTrue(unsafe.equals(java));
}
@Test
public void testUnsafeComparison() {
for (int i =0; i< SAMPLE_COUNT; i++) {
Assert.assertEquals(FastByteComparisons.lexicographicalComparerUnsafeImpl().compare(unsafe.get(i), java.get(i)), 0);
}
}
@Test
public void testJavaComparison() {
for (int i =0; i< SAMPLE_COUNT; i++) {
Assert.assertEquals(FastByteComparisons.lexicographicalComparerJavaImpl().compare(unsafe.get(i), java.get(i)), 0);
}
}
@Test
public void testUnsafeComparisonWithOffet() {
for (int i =0; i< SAMPLE_COUNT; i++) {
if (unsafe.get(i).length > 5)
Assert.assertEquals(FastByteComparisons.lexicographicalComparerUnsafeImpl().compareTo(unsafe.get(i), 4, unsafe.get(i).length - 4, java.get(i), 4, java.get(i).length - 4), 0);
}
}
@Test
public void testJavaComparisonWithOffset() {
for (int i =0; i< SAMPLE_COUNT; i++) {
if (unsafe.get(i).length > 5)
Assert.assertEquals(FastByteComparisons.lexicographicalComparerJavaImpl().compareTo(unsafe.get(i), 4, unsafe.get(i).length - 4, java.get(i), 4, java.get(i).length - 4), 0);
}
}
}


@ -34,7 +34,7 @@ import com.apple.foundationdb.Transaction;
* {@link #printable(byte[])} for debugging non-text keys and values.
*
*/
public class ByteArrayUtil {
public class ByteArrayUtil extends FastByteComparisons {
/**
* Joins a set of byte arrays into a larger array. The {@code interlude} is placed
@ -135,11 +135,7 @@ public class ByteArrayUtil {
if(src.length < start + pattern.length)
return false;
for(int i = 0; i < pattern.length; i++)
if(pattern[i] != src[start + i])
return false;
return true;
return compareTo(src, start, pattern.length, pattern, 0, pattern.length) == 0;
}
/**
@ -307,14 +303,7 @@ public class ByteArrayUtil {
* {@code r}.
*/
public static int compareUnsigned(byte[] l, byte[] r) {
for(int idx = 0; idx < l.length && idx < r.length; ++idx) {
if(l[idx] != r[idx]) {
return (l[idx] & 0xFF) < (r[idx] & 0xFF) ? -1 : 1;
}
}
if(l.length == r.length)
return 0;
return l.length < r.length ? -1 : 1;
return compareTo(l, 0, l.length, r, 0, r.length);
}
/**
@ -328,15 +317,11 @@ public class ByteArrayUtil {
* @return {@code true} if {@code array} starts with {@code prefix}
*/
public static boolean startsWith(byte[] array, byte[] prefix) {
// Short Circuit
if(array.length < prefix.length) {
return false;
}
for(int i = 0; i < prefix.length; ++i) {
if(prefix[i] != array[i]) {
return false;
}
}
return true;
return compareTo(array, 0, prefix.length, prefix, 0, prefix.length) == 0;
}
/**


@ -0,0 +1,294 @@
/*
* ByteArrayUtil.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.apple.foundationdb.tuple;
import java.lang.reflect.Field;
import java.nio.ByteOrder;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Comparator;
import sun.misc.Unsafe;
/**
* Utility code to do optimized byte-array comparison.
* This is borrowed and slightly modified from Guava's {@link UnsignedBytes}
* class to be able to compare arrays that start at non-zero offsets.
*/
abstract class FastByteComparisons {
private static final int UNSIGNED_MASK = 0xFF;
/**
* Lexicographically compare two byte arrays.
*
* @param buffer1 left operand, expected to not be null
* @param buffer2 right operand, expected to not be null
* @param offset1 Where to start comparing in the left buffer, expected to be >= 0
* @param offset2 Where to start comparing in the right buffer, expected to be >= 0
* @param length1 How much to compare from the left buffer, expected to be >= 0
* @param length2 How much to compare from the right buffer, expected to be >= 0
* @return 0 if equal, < 0 if left is less than right, etc.
*/
public static int compareTo(byte[] buffer1, int offset1, int length1,
byte[] buffer2, int offset2, int length2) {
return LexicographicalComparerHolder.BEST_COMPARER.compareTo(
buffer1, offset1, length1, buffer2, offset2, length2);
}
/**
* Interface for both the java and unsafe comparators + offset based comparisons.
* @param <T>
*/
interface Comparer<T> extends Comparator<T> {
/**
* Lexicographically compare two byte arrays.
*
* @param buffer1 left operand
* @param buffer2 right operand
* @param offset1 Where to start comparing in the left buffer
* @param offset2 Where to start comparing in the right buffer
* @param length1 How much to compare from the left buffer
* @param length2 How much to compare from the right buffer
* @return 0 if equal, < 0 if left is less than right, etc.
*/
abstract public int compareTo(T buffer1, int offset1, int length1,
T buffer2, int offset2, int length2);
}
/**
* Pure Java Comparer
*
* @return
*/
static Comparer<byte[]> lexicographicalComparerJavaImpl() {
return LexicographicalComparerHolder.PureJavaComparer.INSTANCE;
}
/**
* Unsafe Comparer
*
* @return
*/
static Comparer<byte[]> lexicographicalComparerUnsafeImpl() {
return LexicographicalComparerHolder.UnsafeComparer.INSTANCE;
}
/**
* Provides a lexicographical comparer implementation; either a Java
* implementation or a faster implementation based on {@link Unsafe}.
*
* <p>Uses reflection to gracefully fall back to the Java implementation if
* {@code Unsafe} isn't available.
*/
private static class LexicographicalComparerHolder {
static final String UNSAFE_COMPARER_NAME =
LexicographicalComparerHolder.class.getName() + "$UnsafeComparer";
static final Comparer<byte[]> BEST_COMPARER = getBestComparer();
/**
* Returns the Unsafe-using Comparer, or falls back to the pure-Java
* implementation if unable to do so.
*/
static Comparer<byte[]> getBestComparer() {
String arch = System.getProperty("os.arch");
boolean unaligned = arch.equals("i386") || arch.equals("x86")
|| arch.equals("amd64") || arch.equals("x86_64");
if (!unaligned)
return lexicographicalComparerJavaImpl();
try {
Class<?> theClass = Class.forName(UNSAFE_COMPARER_NAME);
// yes, UnsafeComparer does implement Comparer<byte[]>
@SuppressWarnings("unchecked")
Comparer<byte[]> comparer =
(Comparer<byte[]>) theClass.getEnumConstants()[0];
return comparer;
} catch (Throwable t) { // ensure we really catch *everything*
return lexicographicalComparerJavaImpl();
}
}
/**
* Java Comparer doing byte by byte comparisons
*
*/
enum PureJavaComparer implements Comparer<byte[]> {
INSTANCE;
/**
*
* CompareTo looking at two buffers.
*
* @param buffer1 left operand
* @param buffer2 right operand
* @param offset1 Where to start comparing in the left buffer
* @param offset2 Where to start comparing in the right buffer
* @param length1 How much to compare from the left buffer
* @param length2 How much to compare from the right buffer
* @return 0 if equal, < 0 if left is less than right, etc.
*/
@Override
public int compareTo(byte[] buffer1, int offset1, int length1,
byte[] buffer2, int offset2, int length2) {
// Short circuit equal case
if (buffer1 == buffer2 &&
offset1 == offset2 &&
length1 == length2) {
return 0;
}
int end1 = offset1 + length1;
int end2 = offset2 + length2;
for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
int a = (buffer1[i] & UNSIGNED_MASK);
int b = (buffer2[j] & UNSIGNED_MASK);
if (a != b) {
return a - b;
}
}
return length1 - length2;
}
/**
* Supports Comparator
*
* @param o1
* @param o2
* @return comparison
*/
@Override
public int compare(byte[] o1, byte[] o2) {
return compareTo(o1, 0, o1.length, o2, 0, o2.length);
}
}
/**
*
* Takes advantage of word based comparisons
*
*/
@SuppressWarnings("unused") // used via reflection
enum UnsafeComparer implements Comparer<byte[]> {
INSTANCE;
static final Unsafe theUnsafe;
/**
* The offset to the first element in a byte array.
*/
static final int BYTE_ARRAY_BASE_OFFSET;
@Override
public int compare(byte[] o1, byte[] o2) {
return compareTo(o1, 0, o1.length, o2, 0, o2.length);
}
static {
theUnsafe = (Unsafe) AccessController.doPrivileged(
(PrivilegedAction<Object>) () -> {
try {
Field f = Unsafe.class.getDeclaredField("theUnsafe");
f.setAccessible(true);
return f.get(null);
} catch (NoSuchFieldException e) {
// It doesn't matter what we throw;
// it's swallowed in getBestComparer().
throw new Error();
} catch (IllegalAccessException e) {
throw new Error();
}
});
BYTE_ARRAY_BASE_OFFSET = theUnsafe.arrayBaseOffset(byte[].class);
// sanity check - this should never fail
if (theUnsafe.arrayIndexScale(byte[].class) != 1) {
throw new AssertionError();
}
}
static final boolean LITTLE_ENDIAN =
ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN);
/**
* Lexicographically compare two arrays.
*
* @param buffer1 left operand
* @param buffer2 right operand
* @param offset1 Where to start comparing in the left buffer
* @param offset2 Where to start comparing in the right buffer
* @param length1 How much to compare from the left buffer
* @param length2 How much to compare from the right buffer
* @return 0 if equal, < 0 if left is less than right, etc.
*/
@Override
public int compareTo(byte[] buffer1, int offset1, int length1,
byte[] buffer2, int offset2, int length2) {
// Short circuit equal case
if (buffer1 == buffer2 &&
offset1 == offset2 &&
length1 == length2) {
return 0;
}
final int stride = 8;
final int minLength = Math.min(length1, length2);
int strideLimit = minLength & ~(stride - 1);
final long offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET;
final long offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET;
int i;
/*
* Compare 8 bytes at a time. Benchmarking on x86 shows a stride of 8 bytes is no slower
* than 4 bytes even on 32-bit. On the other hand, it is substantially faster on 64-bit.
*/
for (i = 0; i < strideLimit; i += stride) {
long lw = theUnsafe.getLong(buffer1, offset1Adj + i);
long rw = theUnsafe.getLong(buffer2, offset2Adj + i);
if (lw != rw) {
if(!LITTLE_ENDIAN) {
return ((lw + Long.MIN_VALUE) < (rw + Long.MIN_VALUE)) ? -1 : 1;
}
/*
* We want to compare only the first index where left[index] != right[index]. This
* corresponds to the least significant nonzero byte in lw ^ rw, since lw and rw are
* little-endian. Long.numberOfTrailingZeros(diff) tells us the least significant
* nonzero bit, and zeroing out the first three bits of L.nTZ gives us the shift to get
* that least significant nonzero byte. This comparison logic is based on UnsignedBytes
* comparator from guava v21
*/
int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7;
return ((int) ((lw >>> n) & UNSIGNED_MASK)) - ((int) ((rw >>> n) & UNSIGNED_MASK));
}
}
// The epilogue to cover the last (minLength % stride) elements.
for (; i < minLength; i++) {
int a = (buffer1[offset1 + i] & UNSIGNED_MASK);
int b = (buffer2[offset2 + i] & UNSIGNED_MASK);
if (a != b) {
return a - b;
}
}
return length1 - length2;
}
}
}
}
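
To make the trailing-zeros trick in the comment above concrete, here is a small standalone sketch of the same arithmetic. It is written in Go only to keep the examples in this document uniform, and the input bytes are made-up values:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// compareWords mirrors the little-endian fast path: find the lowest-order
// byte where two 8-byte words differ and compare just that byte unsigned.
func compareWords(a, b []byte) int {
	lw := binary.LittleEndian.Uint64(a)
	rw := binary.LittleEndian.Uint64(b)
	if lw == rw {
		return 0
	}
	// The lowest set bit of lw^rw lies in the first differing byte; clearing
	// the low 3 bits of its index turns it into the shift for that byte.
	n := uint(bits.TrailingZeros64(lw^rw)) &^ 7
	return int((lw>>n)&0xFF) - int((rw>>n)&0xFF)
}

func main() {
	a := []byte{1, 2, 3, 4, 5, 6, 7, 8}
	b := []byte{1, 2, 3, 9, 5, 6, 7, 8} // first difference at index 3
	fmt.Println(compareWords(a, b))     // prints -5, i.e. 4 - 9
}
```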

207 bindings/python/LICENSE Normal file

@ -0,0 +1,207 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-------------------------------------------------------------------------------
SOFTWARE DISTRIBUTED WITH FOUNDATIONDB:
The FoundationDB software includes a number of subcomponents with separate
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
-------------------------------------------------------------------------------


@ -1231,6 +1231,8 @@ if platform.system() == 'Windows':
capi_name = 'fdb_c.dll'
elif platform.system() == 'Linux':
capi_name = 'libfdb_c.so'
elif platform.system() == 'FreeBSD':
capi_name = 'libfdb_c.so'
elif platform.system() == 'Darwin':
capi_name = 'libfdb_c.dylib'
elif sys.platform == 'win32':


@ -20,7 +20,7 @@ cd ${tmpdir}
echo
cat <<EOF >> Dockerfile
FROM foundationdb/foundationdb-build:latest
FROM foundationdb/foundationdb-dev:0.11.1
RUN yum install -y sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
RUN groupadd -g 1100 sudo
@ -64,18 +64,24 @@ then
ccache_args=\$args
fi
if [ -t 1 ] ; then
TERMINAL_ARGS=-it `# Run in interactive mode and simulate a TTY`
else
TERMINAL_ARGS=-i `# Run in interactive mode`
fi
sudo docker run --rm `# delete (temporary) image after return` \\
-it `# Run in interactive mode and simulate a TTY` \\
\${TERMINAL_ARGS} \\
--privileged=true `# Run in privileged mode ` \\
--cap-add=SYS_PTRACE \\
--security-opt seccomp=unconfined \\
-v "${HOME}:${HOME}" `# Mount home directory` \\
-w="\$(pwd)" \\
\${ccache_args} \\
${image} "\$@"
EOF
cat <<EOF $HOME/bin/clangd
cat <<EOF > $HOME/bin/clangd
#!/usr/bin/bash
fdb-dev scl enable devtoolset-8 rh-python36 rh-ruby24 -- clangd
@ -87,6 +93,7 @@ then
echo -e "\tThis can cause problems with some scripts (like fdb-clangd)"
fi
chmod +x $HOME/bin/fdb-dev
chmod +x $HOME/bin/clangd
echo "To start the dev docker image run $HOME/bin/fdb-dev"
echo "$HOME/bin/clangd can be used for IDE integration"
echo "You can edit these files but be aware that this script will overwrite your changes if you rerun it"


@ -87,6 +87,9 @@ function(add_fdb_test)
if (NOT "${ADD_FDB_TEST_TEST_NAME}" STREQUAL "")
set(test_name ${ADD_FDB_TEST_TEST_NAME})
endif()
if((NOT test_name MATCHES "${TEST_INCLUDE}") OR (test_name MATCHES "${TEST_EXCLUDE}"))
return()
endif()
math(EXPR test_idx "${CURRENT_TEST_INDEX} + ${NUM_TEST_FILES}")
set(CURRENT_TEST_INDEX "${test_idx}" PARENT_SCOPE)
# set(<var> <value> PARENT_SCOPE) doesn't set the
@ -160,8 +163,6 @@ function(create_test_package)
string(SUBSTRING ${file} ${base_length} -1 rel_out_file)
set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file})
list(APPEND out_files ${out_file})
get_filename_component(test_dir ${out_file} DIRECTORY)
file(MAKE_DIRECTORY packages/tests/${test_dir})
add_custom_command(
OUTPUT ${out_file}
DEPENDS ${file}
@ -277,7 +278,51 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bindings ${CMAKE_BINARY_DIR}/bindingtester/tests
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_BINARY_DIR}/bindingtester.touch"
COMMENT "Copy test files for bindingtester")
add_custom_target(copy_bindingtester_binaries DEPENDS ${outfiles} "${CMAKE_BINARY_DIR}/bindingtester.touch")
add_custom_target(copy_binding_output_files DEPENDS ${CMAKE_BINARY_DIR}/bindingtester.touch python_binding fdb_flow_tester)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:fdb_flow_tester> ${bdir}/tests/flow/bin/fdb_flow_tester
COMMENT "Copy Flow tester for bindingtester")
set(generated_binding_files python/fdb/fdboptions.py)
if(WITH_JAVA)
if(NOT FDB_RELEASE)
set(prerelease_string "-PRERELEASE")
else()
set(prerelease_string "")
endif()
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/fdb-java-${CMAKE_PROJECT_VERSION}${prerelease_string}.jar
${bdir}/tests/java/foundationdb-client.jar
COMMENT "Copy Java bindings for bindingtester")
add_dependencies(copy_binding_output_files fat-jar)
add_dependencies(copy_binding_output_files foundationdb-tests)
set(generated_binding_files ${generated_binding_files} java/foundationdb-tests.jar)
endif()
if(WITH_GO AND NOT OPEN_FOR_IDE)
add_dependencies(copy_binding_output_files fdb_go_tester fdb_go)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/go/bin/_stacktester ${bdir}/tests/go/build/bin/_stacktester
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bindings/go/src/github.com/apple/foundationdb/bindings/go/src/fdb/generated.go # SRC
${bdir}/tests/go/src/fdb/ # DEST
COMMENT "Copy generated.go for bindingtester")
endif()
foreach(generated IN LISTS generated_binding_files)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/${generated} ${bdir}/tests/${generated}
COMMENT "Copy ${generated} to bindingtester")
endforeach()
add_custom_target(copy_bindingtester_binaries
DEPENDS ${outfiles} "${CMAKE_BINARY_DIR}/bindingtester.touch" copy_binding_output_files)
add_dependencies(copy_bindingtester_binaries strip_only_fdbserver strip_only_fdbcli strip_only_fdb_c)
set(tar_file ${CMAKE_BINARY_DIR}/packages/bindingtester-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(


@ -0,0 +1,53 @@
include(CheckCXXCompilerFlag)
function(env_set var_name default_value type docstring)
set(val ${default_value})
if(DEFINED ENV{${var_name}})
set(val $ENV{${var_name}})
endif()
set(${var_name} ${val} CACHE ${type} "${docstring}")
endfunction()
function(default_linker var_name)
if(APPLE)
set("${var_name}" "DEFAULT" PARENT_SCOPE)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
find_program(lld_path ld.lld "Path to LLD - is only used to determine default linker")
if(lld_path)
set("${var_name}" "LLD" PARENT_SCOPE)
else()
set("${var_name}" "DEFAULT" PARENT_SCOPE)
endif()
else()
set("${var_name}" "DEFAULT" PARENT_SCOPE)
endif()
endfunction()
function(use_libcxx out)
if(APPLE OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set("${out}" ON PARENT_SCOPE)
else()
set("${out}" OFF PARENT_SCOPE)
endif()
endfunction()
function(static_link_libcxx out)
if(APPLE)
set("${out}" OFF PARENT_SCOPE)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
default_linker(linker)
if(NOT linker STREQUAL "LLD")
set("${out}" OFF PARENT_SCOPE)
return()
endif()
find_library(libcxx_a libc++.a)
find_library(libcxx_abi libc++abi.a)
if(libcxx_a AND libcxx_abi)
set("${out}" ON PARENT_SCOPE)
else()
set("${out}" OFF PARENT_SCOPE)
endif()
else()
set("${out}" ON PARENT_SCOPE)
endif()
endfunction()


@ -1,25 +1,24 @@
function(env_set var_name default_value type docstring)
set(val ${default_value})
if(DEFINED ENV{${var_name}})
set(val $ENV{${var_name}})
endif()
set(${var_name} ${val} CACHE ${type} "${docstring}")
endfunction()
include(CompilerChecks)
set(USE_GPERFTOOLS OFF CACHE BOOL "Use gperfools for profiling")
env_set(USE_GPERFTOOLS OFF BOOL "Use gperfools for profiling")
env_set(USE_DTRACE ON BOOL "Enable dtrace probes on supported platforms")
env_set(USE_VALGRIND OFF BOOL "Compile for valgrind usage")
set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} CACHE BOOL "Use valgrind for ctest")
set(ALLOC_INSTRUMENTATION OFF CACHE BOOL "Instrument alloc")
set(WITH_UNDODB OFF CACHE BOOL "Use rr or undodb")
set(USE_ASAN OFF CACHE BOOL "Compile with address sanitizer")
set(USE_UBSAN OFF CACHE BOOL "Compile with undefined behavior sanitizer")
set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release")
env_set(USE_LD "DEFAULT" STRING "The linker to use for building: can be LD (system default, default choice), BFD, GOLD, or LLD")
env_set(USE_LIBCXX OFF BOOL "Use libc++")
env_set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} BOOL "Use valgrind for ctest")
env_set(ALLOC_INSTRUMENTATION OFF BOOL "Instrument alloc")
env_set(WITH_UNDODB OFF BOOL "Use rr or undodb")
env_set(USE_ASAN OFF BOOL "Compile with address sanitizer")
env_set(USE_UBSAN OFF BOOL "Compile with undefined behavior sanitizer")
env_set(FDB_RELEASE OFF BOOL "This is a building of a final release")
env_set(USE_CCACHE OFF BOOL "Use ccache for compilation if available")
set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info")
set(STATIC_LINK_LIBCXX ON CACHE BOOL "Statically link libstdcpp/libc++")
set(USE_WERROR OFF CACHE BOOL "Compile with -Werror. Recommended for local development and CI.")
env_set(RELATIVE_DEBUG_PATHS OFF BOOL "Use relative file paths in debug info")
env_set(USE_WERROR OFF BOOL "Compile with -Werror. Recommended for local development and CI.")
default_linker(_use_ld)
env_set(USE_LD "${_use_ld}" STRING
"The linker to use for building: can be LD (system default and same as DEFAULT), BFD, GOLD, or LLD - will be LLD for Clang if available, DEFAULT otherwise")
use_libcxx(_use_libcxx)
env_set(USE_LIBCXX "${_use_libcxx}" BOOL "Use libc++")
static_link_libcxx(_static_link_libcxx)
env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstdcpp/libc++")
if(USE_LIBCXX AND STATIC_LINK_LIBCXX AND NOT USE_LD STREQUAL "LLD")
message(FATAL_ERROR "Unsupported configuration: STATIC_LINK_LIBCXX with libc+++ only works if USE_LD=LLD")
@ -257,7 +256,7 @@ else()
check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE)
check_symbol_exists(aligned_alloc stdlib.h HAS_ALIGNED_ALLOC)
message(STATUS "Has aligned_alloc: ${HAS_ALIGNED_ALLOC}")
if(SUPPORT_DTRACE)
if((SUPPORT_DTRACE) AND (USE_DTRACE))
add_compile_definitions(DTRACE_PROBES)
endif()
if(HAS_ALIGNED_ALLOC)


@ -11,26 +11,27 @@ endif()
################################################################################
# SSL
################################################################################
set(DISABLE_TLS OFF CACHE BOOL "Don't try to find LibreSSL and always build without TLS support")
include(CheckSymbolExists)
set(DISABLE_TLS OFF CACHE BOOL "Don't try to find OpenSSL and always build without TLS support")
if(DISABLE_TLS)
set(WITH_TLS OFF)
else()
set(OPENSSL_USE_STATIC_LIBS TRUE)
find_package(OpenSSL)
if(NOT OPENSSL_FOUND)
set(LIBRESSL_USE_STATIC_LIBS TRUE)
find_package(LibreSSL)
if (LIBRESSL_FOUND)
add_library(OpenSSL::SSL ALIAS LibreSSL)
endif()
endif()
if(OPENSSL_FOUND OR LIBRESSL_FOUND)
set(WITH_TLS ON)
add_compile_options(-DHAVE_OPENSSL)
if(OPENSSL_FOUND)
set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
check_symbol_exists("OPENSSL_INIT_NO_ATEXIT" "openssl/crypto.h" OPENSSL_HAS_NO_ATEXIT)
if(OPENSSL_HAS_NO_ATEXIT)
set(WITH_TLS ON)
add_compile_options(-DHAVE_OPENSSL)
else()
message(WARNING "An OpenSSL version was found, but it doesn't support OPENSSL_INIT_NO_ATEXIT - Will compile without TLS Support")
set(WITH_TLS OFF)
endif()
else()
message(STATUS "Neither OpenSSL nor LibreSSL were found - Will compile without TLS Support")
message(STATUS "You can set OPENSSL_ROOT_DIR or LibreSSL_ROOT to the LibreSSL install directory to help cmake find it")
message(STATUS "OpenSSL was not found - Will compile without TLS Support")
message(STATUS "You can set OPENSSL_ROOT_DIR to help cmake find it")
set(WITH_TLS OFF)
endif()
if(WIN32)
@ -46,7 +47,8 @@ endif()
set(WITH_JAVA OFF)
find_package(JNI 1.8)
find_package(Java 1.8 COMPONENTS Development)
if(JNI_FOUND AND Java_FOUND AND Java_Development_FOUND)
# leave FreeBSD JVM compat for later
if(JNI_FOUND AND Java_FOUND AND Java_Development_FOUND AND NOT (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD"))
set(WITH_JAVA ON)
include(UseJava)
enable_language(Java)


@ -185,12 +185,12 @@ function(add_flow_target)
if(WIN32)
add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated}"
COMMAND $<TARGET_FILE:actorcompiler> "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${generated}" ${actor_compiler_flags}
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" ${actor_exe}
COMMENT "Compile actor: ${src}")
else()
add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated}"
COMMAND ${MONO_EXECUTABLE} ${actor_exe} "${CMAKE_CURRENT_SOURCE_DIR}/${src}" "${CMAKE_CURRENT_BINARY_DIR}/${generated}" ${actor_compiler_flags} > /dev/null
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}" ${actor_exe}
COMMENT "Compile actor: ${src}")
endif()
else()


@ -131,9 +131,9 @@ set(install_destination_for_log_el6 "var/log/foundationdb")
set(install_destination_for_log_el7 "var/log/foundationdb")
set(install_destination_for_log_pm "")
set(install_destination_for_data_tgz "lib/foundationdb")
set(install_destination_for_data_deb "var/lib/foundationdb")
set(install_destination_for_data_el6 "var/lib/foundationdb")
set(install_destination_for_data_el7 "var/lib/foundationdb")
set(install_destination_for_data_deb "var/lib/foundationdb/data")
set(install_destination_for_data_el6 "var/lib/foundationdb/data")
set(install_destination_for_data_el7 "var/lib/foundationdb/data")
set(install_destination_for_data_pm "")
set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
@ -320,9 +320,14 @@ set(CPACK_RPM_SERVER-EL7_USER_FILELIST
"%config(noreplace) /etc/foundationdb/foundationdb.conf"
"%attr(0700,foundationdb,foundationdb) /var/log/foundationdb"
"%attr(0700, foundationdb, foundationdb) /var/lib/foundationdb")
set(CPACK_RPM_CLIENTS-EL6_USER_FILELIST "%dir /etc/foundationdb")
set(CPACK_RPM_CLIENTS-EL7_USER_FILELIST "%dir /etc/foundationdb")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
"/usr/sbin"
"/usr/share/java"
"/usr/lib64/cmake"
"/etc/foundationdb"
"/usr/lib64/pkgconfig"
"/usr/lib64/python2.7"
"/usr/lib64/python2.7/site-packages"
"/var"

View File

@ -1,10 +1,10 @@
## FDB Backup Data Format
### Introduction
This document describes the data format of the files generated by the FoundationDB (FDB) backup procedure.
The target readers who may benefit from reading this document are:
* those who make changes to the current backup or restore procedure;
* those who write tools to digest the backup data for analytical purposes;
* those who want to understand the internals of how backup and restore work.
The description of the backup data format is based on FDB 5.2 to FDB 6.1. The backup data format may (although unlikely) change after FDB 6.1.
@ -12,27 +12,27 @@ The description of the backup data format is based on FDB 5.2 to FDB 6.1. The ba
### Files generated by backup
The backup procedure generates two types of files: range files and log files.
* A range file describes key-value pairs in a range at the version when the backup process takes a snapshot of the range. Different range files have data for different ranges at different versions.
* A log file describes the mutations taken from a version v<sub>1</sub> to v<sub>2</sub> during the backup procedure.
With the key-value pairs in the range files and the mutations in the log files, the restore procedure can restore the database to a consistent state at a user-provided version v<sub>k</sub>, if the backup data is claimed by the restore as restorable at v<sub>k</sub>. (The details of determining whether a set of backup data is restorable at a version are out of the scope of this document and can be found at [backup.md](https://github.com/xumengpanda/foundationdb/blob/cd873831ecd18653c5bf459d6f72d14a99b619c4/design/backup.md).)
### Filename conventions
The backup files will be saved in a directory (i.e., url) specified by users. Under the directory, the range files are in the `snapshots` folder. The log files are in the `logs` folder.
The convention of the range filename is `snapshots/snapshot,beginVersion,beginVersion,blockSize`, where `beginVersion` is the version when the key-values in the range file are recorded, and `blockSize` is the size of data blocks in the range file.
The convention of the log filename is `logs/,versionPrefix/log,beginVersion,endVersion,randomUID,blockSize`, where the `versionPrefix` is a 2-level path (`x/y`) where `beginVersion` should go such that `x/y/*` contains (10^smallestBucket) possible versions; the `randomUID` is a random UID; the `beginVersion` and `endVersion` are the version range (left inclusive, right exclusive) when the mutations are recorded; and the `blockSize` is the data block size in the log file.
We will use an example to explain what each field in the range and log filename means.
Suppose under the backup directory, we have a range file `snapshots/snapshot,78994177,78994177,97` and a log file `logs/0000/0000/log,78655645,98655645,149a0bdfedecafa2f648219d5eba816e,1048576`.
The range file's filename tells us that all key-value pairs decoded from the file are the key-value pairs in the DB at version `78994177`. The data block size is `97` bytes.
The log file's filename tells us that the mutations in the log file were the mutations in the DB during the version range `[78655645,98655645)`, and the data block size is `1048576` bytes.
### Data format in a range file
A range file can have one to many data blocks. Each data block has a set of key-value pairs.
A data block is encoded as follows: `Header startKey k1v1 k2v2 Padding`.
@ -44,7 +44,7 @@ A data block is encoded as follows: `Header startKey k1v1 k2v2 Padding`.
H = header P = padding a...z = keys v = value | = block boundary
Encoded file: H a cv dv P | H e ev fv gv hv P | H h hv iv jv z
Decoded in blocks yields:
Block 1: range [a, e) with kv pairs cv, dv
Block 2: range [e, h) with kv pairs ev, fv, gv
@ -58,19 +58,19 @@ The code that decodes a range block is in `ACTOR Future<Standalone<VectorRef<Key
### Data format in a log file
A log file can have one to many data blocks.
Each block is encoded as `Header, [Param1, Param2]... padding`.
The first 32 bits of `Param1` and `Param2` specify the lengths of `Param1` and `Param2`.
`Param1` specifies the version when the mutations happened;
`Param2` encodes the group of mutations that happened at that version.
Note that if the group of mutations is bigger than the block size, the mutation group will be split across multiple data blocks.
For example, we may get `[Param1, Param2_part0]`, `[Param1, Param2_part1]`. By concatenating `Param2_part0` and `Param2_part1`, we can get the group of all mutations that happened at the version specified in `Param1`.
The encoding format for `Param1` is as follows:
`hashValue|commitVersion|part`,
where `hashValue` is the hash of the `commitVersion`, `commitVersion` is the version when the mutations in `Param2`(s) are taken, and `part` is the part number in case we need to concatenate the `Param2`(s) to get the group of all mutations.
`hashValue` takes 8 bits, `commitVersion` takes 64 bits, and `part` takes 32 bits.
Note that when concatenating the partial groups of mutations in `Param2`(s) to get the full group of all mutations, the part numbers must be continuous.
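For illustration only, the following is a minimal standalone C++ sketch (not FDB source code; the helper names are hypothetical) that decodes the `Param1` layout described above, assuming the 13-byte payload follows the 32-bit length field and the multi-byte integers are serialized in big endian:

```c++
#include <cstdint>
#include <cstdio>

// Read an n-byte big-endian unsigned integer starting at p.
static uint64_t readBigEndian(const uint8_t* p, int n) {
	uint64_t v = 0;
	for (int i = 0; i < n; ++i) v = (v << 8) | p[i];
	return v;
}

struct Param1 {
	uint8_t hashValue;      // 8-bit hash of the commit version
	uint64_t commitVersion; // version when the mutations in Param2 were taken
	uint32_t part;          // part number for split mutation groups
};

// buf points at the 13-byte Param1 payload: hashValue | commitVersion | part.
Param1 decodeParam1(const uint8_t* buf) {
	Param1 p;
	p.hashValue = buf[0];
	p.commitVersion = readBigEndian(buf + 1, 8);
	p.part = static_cast<uint32_t>(readBigEndian(buf + 9, 4));
	return p;
}

int main() {
	const uint8_t sample[13] = { 0x2a, 0, 0, 0, 0, 0x04, 0xb0, 0x5c, 0x7d, 0, 0, 0, 1 };
	Param1 p = decodeParam1(sample);
	std::printf("hash=%u version=%llu part=%u\n", p.hashValue, (unsigned long long)p.commitVersion, p.part);
	return 0;
}
```

Concatenating the `Param2` parts in increasing `part` order then yields the full group of mutations for `commitVersion`.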
@ -80,12 +80,12 @@ The `encoded_mutation_i` is encoded as follows
`type|kLen|vLen|Key|Value`
where `type` is the mutation type, such as Set or Clear; `kLen` and `vLen` are, respectively, the lengths of the key and the value in the mutation; and `Key` and `Value` are the serialized key and value of the mutation.
The code related to how a log file is written is in the `struct LogFileWriter` in `namespace fileBackup`.
The code that decodes a mutation block is in `ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeLogFileBlock(Reference<IAsyncFile> file, int64_t offset, int len)`.
### Endianness
When the restore decodes a serialized integer from the backup file, it needs to convert the serialized value from big endian to little endian.
The reason is as follows: when the backup procedure transfers the data to the remote blob store, the backup data is encoded in big endian. However, FoundationDB currently only runs on little-endian machines. The endianness affects the interpretation of an integer, so we must perform the endianness conversion.

View File

@ -0,0 +1,336 @@
# The New FDB Backup System: Requirements & Design
Github tracking issue: https://github.com/apple/foundationdb/issues/1003
## Purpose and Audience
The purpose of this document is to capture functional requirements as well as propose a high level design for implementation of the new backup system in FoundationDB. The intended audience for this document includes:
* **FDB users** - Users can understand what are the changes in the new backup system, especially how to start a backup using the new backup system. The restore for new backup is handled by the [Performant Restore System](https://github.com/apple/foundationdb/issues/1049).
* **SRE's and Support** - can understand the high level architecture and know the requirements, including the metrics, tooling, and documentation to ensure that the new FDB backup can be supported.
* **Developers** - can know why this feature is needed, what it does, and how it is to be implemented. The hope is that this document becomes the starting point for any developer wishing to understand or be involved in the related aspects of FDB.
## Functional Requirements
As an essential component of a database system, backup and restore is a commonly used technique for disaster recovery, reliability, audit and compliance purposes. The current FDB backup system consumes about half of the cluster's write bandwidth, causes write skew among storage servers, increases storage space usage, and results in data balancing. The new backup system aims to double the cluster's write bandwidth for *HA clusters* (old DR clusters still need the old style backup system).
## Background
The FDB backup system continuously scans the database's key-value space and saves key-value pairs and mutations at versions into range files and log files in blob storage. Specifically, mutation logs are generated at the Proxy and are written to transaction logs along with regular mutations. In production clusters like CK clusters, the backup system is always on, which means each mutation is written twice to transaction logs, consuming about half of the write bandwidth and about 40% of Proxy CPU time.
The design of old backup system is [here](https://github.com/apple/foundationdb/blob/master/design/backup.md), and the data format of range files and mutations files is [here](https://github.com/apple/foundationdb/blob/master/design/backup-dataFormat.md). The technical overview of FDB is [here](https://github.com/apple/foundationdb/wiki/Technical-Overview-of-the-Database). The FDB recovery is described in this [doc](https://github.com/apple/foundationdb/blob/master/design/recovery-internals.md).
## Terminology
* **Blob storage**: blob storage is an object storage for unstructured data. Backup files are encoded in binary format and saved in blob storage, e.g., Amazon S3.
* **Version**: FDB continuously generates an increasing number as the version and uses versions to decide mutation ordering. Version numbers typically advance by one million per second. To restore an FDB cluster to a specified date and time, the restore system first converts the date and time to the corresponding version number and restores the cluster to that version number.
* **Epoch**: A generation of FDB's transaction system. After a component of the transaction system fails, FDB automatically initiates a recovery and restores the system in a new healthy generation, which is called an epoch.
* **Backup worker**: is a new role added to the FDB cluster that is responsible for pulling mutations from transaction logs and saving them to blob storage.
* **Tag**: A tag is a short address for a mutation's destination, which includes a locality (`int8_t`, representing the data center ID; a negative number denotes a special system locality) and an ID (`int16_t`). The idea is that the tag is a small data structure that consumes fewer bytes than using IP addresses or storage servers' UIDs (16 bytes each), since tags are associated with each mutation and are stored both in memory and on disk.
* **Tag partitioned log system**: FDB's write-ahead log is a tag partitioned log system, where each mutation is assigned a number of tags.
* **Log router tag**: is a special system tag, e.g., `-2:0` where locality `-2` means log router tag and `0` means ID. If attached to a mutation, originally this tag means the mutation should be sent to a remote log router. In the new backup system, we reuse this tag for backup workers to receive all mutations in a number of partitioned streams.
* **Restorable version:** The version that a backup can be restored to. A version `v` is a restorable version if the entire key-space and mutations in version `[v1, v)` are recorded in backup files.
* **Node**: A node is a machine or a process in a cluster.
## Detailed Feature Requirements
Feature priorities: Features 1, 2, 3, 4, and 5 are must-have; Feature 6 is nice to have.
1. **Write bandwidth reduction by half**: removes the requirement to generate backup mutations at the Proxy, thus reducing TLog write bandwidth usage by half and significantly improving Proxy CPU usage;
2. **Correctness**: The restored database must be consistent: each *restored* state (i.e., key-value pair) at a version `v` must match the original state at version `v`.
3. **Performance**: The backup system should be performant, mostly measured as a small CPU overhead on transaction logs and backup workers. The version lag on backup workers is an indicator of performance.
4. **Fault-tolerant**: The backup system should be fault-tolerant to node failures in the FDB cluster.
5. **Restore ready**: The new backup should be restorable by the Performant Restore System. As a fallback for the new performant restore system, we can convert new backup logs into the format of old backup logs, thus enabling restore of the new backup with the existing old restore system.
6. **Backward compatibility**: The new backup system should allow both old style backup and DR (FDB 6.2 and below) to be performed, as well as support new backup in FDB 6.3 and above.
## Security and Privacy Requirements
**Security**: The backup system's components are assumed to be trusted components, because they are running on the nodes in an FDB cluster. The transmission from the cluster to the blob store is through SSL connections. Blob credentials are passed in from the “fdbserver” command line.
**Privacy**: Backup data is stored in the blob store with appropriate access control. A data retention policy can be set with the “fdbbackup” tool to delete older backup data.
## Operational and Maintainability Requirements
This section discusses changes that may need to be identified or accounted for on the back-end in order to support the feature from a monitoring or management perspective.
### Tooling / Front-End
A workflow is needed for DBAs to start, pause, resume, and abort the new type of backups. The difference from the old type of backups should be only a flag change for starting the backup. The FDB cluster then generates backups as specified by the flag.
A command line tool `fdbconvert` has been written to convert new backup logs into the format of old backup logs. Thus, if the new restore system has issues, we can still restore the new backup with existing old restore system.
**Deployment instructions for tooling development**
* A new stateless role “`Backup Worker`” (or “`BW`” for abbreviation) is introduced in an FDB cluster. The number of BW processes is based on the number of log routers (usually they are the same). If there are no log routers, the number of transaction logs is used. Note that occasionally the cluster may recruit more backup workers for version ranges in the old epoch. Since these version ranges are small, the resource requirements for these short-lived backup workers are very small.
* As in the old backup system, backup agents need to be started for saving snapshot files to blob storage. In contrast, backup workers in the new backup system running in the primary DC are responsible for saving mutation logs to blob storage.
* A backup worker's memory should be large enough to hold 10s of seconds' worth of mutation data from TLogs. The memory requirement can be calculated as `WriteThroughput * BufferPeriod / partitions + SafetyMargin`, where `WriteThroughput` is the aggregated TLog write bandwidth and `partitions` is the number of log router tags (a worked example follows this list).
* A new process class “backup” is defined for backup workers.
* How to start a new type of backup: e.g.,
```
fdbbackup start -C fdb.cluster -p -d blob_url
```
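To make the memory sizing formula above concrete, here is a purely illustrative calculation (the numbers are hypothetical, not a recommendation): with an aggregated TLog write throughput of 200 MB/s, a 10-second buffer period, 8 log router tags, and a 100 MB safety margin, each backup worker would need roughly `200 MB/s * 10 s / 8 + 100 MB = 350 MB` of buffer memory.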
### KPI's and Health
The solution must provide at least the following KPIs:
* How fast (MB/s) the transaction logs commit writes (already exists);
* How much backup data has been processed;
* An estimation of backup delay;
### Customer Care
The feature does not require any specific customer care awareness or interaction.
### Roll-out
The feature must follow the usual roll-out process. It needs to coexist with the existing backup system and periodically restore clusters to test its correctness. Only after we gain enough confidence will we deprecate the existing backup system.
Note the new backup system is designed for HA clusters. Existing DR clusters still use the old backup system. Thus, the roll-out of the new backup system is only for HA clusters.
### Quota
This feature requires a blob storage for saving all log files. The blob storage must have enough:
* disk capacity for all backup data;
* write bandwidth for uploading backup data;
* file count for backup data: the new backup system stores partitioned mutation logs, so a several-fold increase in file count is expected.
## Success Criteria
* Write bandwidth reduction meets the expectation: TLog write bandwidth is reduced by half;
* New backup workflow is available to SREs;
* Continuous backup and restore should be performed to validate the restore.
# Design
**One sentence summary**: the new backup system introduces a new role, backup worker, to pull mutations from transaction logs and save them, thus removing the burden of saving mutation logs into the database.
The old backup system writes the mutation log to the database itself, thus doubling the write bandwidth usage. Backup agents later fetch mutation logs from the database, upload them to blob storage, and then remove the mutation logs from the database.
This project saves the mutation log to blob storage directly from the FDB cluster, which should almost double the database's write bandwidth when backup is enabled. In FDB, every mutation already has exactly one log router tag, so the idea of the new system is to backup data for each log router tag individually (i.e., saving mutation logs into multiple partitioned logs). During restore time, these partitioned mutation logs are combined together to form a continuous mutation log stream.
## Design choices
**Design question 1**: Should backup workers be recruited as part of log system or not?
There are two design alternatives:
1. Backup worker is external to the log system. In other words, backup workers survive master recovery. Thus, backup workers are recruited and monitored by the cluster controller.
1. The advantage is that the failure of backup workers does not cause master recovery.
2. The disadvantage is that backup workers need to monitor master recovery, especially configuration changes. Because the number of log routers can change after a recovery, we might need to recruit more backup workers for an increase and need to pause/shut down backup workers for a decrease, which complicates the recruitment logic; or we might need to change the mapping of tags to backup workers, which is also complex. A further complication is that backup workers need to constantly monitor master recovery and be very careful about the version boundary between two consecutive epochs, because the number of tags may change.
2. Backup worker is recruited during master recovery as part of log system. The Master recruits a fixed number of backup workers, i.e., the same number as LogRouters.
1. The advantage is that recruiting and mapping from backup worker to LogRouter tags are simple, i.e., one tag per worker.
2. The disadvantage is that backup workers are tied to master recovery -- a failure of a backup worker results in a master recovery, and a master recovery stops old backup workers and starts new ones.
**Decision**: We choose the second approach for the simplicity of the recruiting process and handling of mapping of LogRouter tags to backup workers.
**Design question 2**: Place of backup workers on the primary or remote Data Center (DC)?
Placing backup workers on the primary side has the advantage of supporting any deployment configurations (single DC, multi DC).
Placing them on the remote side is desirable to reduce the workload on the primary DC's transaction logs. Since log routers on the remote side are already pulling mutations from the primary DC, backup workers can simply pull from these log routers.
**Decision**: We choose to recruit backup workers on the primary DC, because not all clusters are configured with multiple DCs and the backup system needs to support all types of deployment.
## Design Assumptions
The design proposed below is based upon the following assumptions:
* Blob system has enough write bandwidth and storage space for backup workers to save log files.
* FDB cluster has enough stateless processes to run as backup workers and these processes have memory capacity to buffer 10s of seconds of commit data.
## Design Challenges
The requirement of the new backup system raises several design challenges:
1. Correctness of the new backup files. Backup files must be complete and accurate to capture all data, otherwise we end up with corrupted data in the backup. The challenge here is to make sure no mutation is missing, even when the FDB cluster experiences failures and has to perform recovery.
2. Testing of the new backup system. How can we test the new backup system when there is no restore system available? We need to verify backup files are correct without performing a full restore.
## System components
**Backup Worker**: This is a new role introduced in the new backup system. A backup worker is a `fdbserver` process running inside a FDB cluster, responsible for pulling mutations from transaction logs and saving the mutations to blob storage.
**Master**: The master is responsible for coordinating the transition of the FDB transaction sub-system from one generation to the next. In particular, the master recruits backup workers during the recovery.
**Transaction Logs (TLogs)**: The transaction logs make mutations durable to disk for fast commit latencies. The logs receive commits from the proxy in version order, and only respond to the proxy once the data has been written and fsync'ed to an append only mutation log on disk. Storage servers retrieve mutations from TLogs. Once the storage servers have persisted mutations, storage servers then pop the mutations from the TLogs.
**Proxy**: The proxies are responsible for providing read versions, committing transactions, and tracking the storage servers responsible for each range of keys. In the old backup system, Proxies are responsible for grouping mutations into backup mutations and writing them to the database.
## System overview
From an end-to-end perspective, the new backup system works in the following steps:
1. Operators issue a new backup request via `fdbbackup` command line tool;
2. FDB cluster receives the request and registers the request in the database (internal `TaskBucket` and system keys);
3. Backup workers monitor changes to system keys, register the request in their own internal queues, and start logging mutations for the requested key range; at the same time, backup agents (scheduled by `TaskBucket`) start taking snapshots of key ranges in the database;
4. Periodically, backup workers upload mutations to the requested blob storage, and save the progress into the database;
5. The backup is restorable when backup workers have saved versions that are larger than the complete snapshot's end version, and the backup is stopped if a stop-on-restorable flag is set in the request.
The new backup has five major components: 1) backup workers; 2) recruitment of backup workers; 3) extension of the tag partitioned log system to support pseudo tags; 4) integration with the existing `TaskBucket` based backup command interface; and 5) integration with the Performant Restore System.
### Backup workers
Backup worker is a new role introduced in the new backup system. A backup worker is responsible for pulling mutations from transaction logs and saving the mutations to blob storage. Internally, a backup worker maintains a message buffer, which keeps mutations pulled from transaction logs that have not yet been saved to blob storage. Periodically, the backup worker parses mutations in the message buffer, extracts those mutations that are within user-specified key ranges, and then uploads the mutation data to blob storage. After the data is saved, the backup worker removes these messages from its internal buffer and saves its progress in the database, so that after a failure, a new backup worker starts from the previously saved version.
Backup worker has two modes of operation: *no-op* mode, and *working* mode. When there is no active backup in the cluster, backup worker operates in the no-op mode, which simply obtains the recently committed version from Proxies and then pops mutations from transaction logs. After operators submit a new backup request to the cluster, backup workers transition into the working mode that starts pulling mutations from transaction logs and saving the mutation data to blob storage.
In the working mode, the popping of backup workers needs to follow a strictly increasing version order. For the same tag, there could be multiple backup workers, each responsible for a different epoch. These backup workers must coordinate their popping order, otherwise the backup can miss some mutation data. This coordination among backup workers is achieved by deferring the popping of a later epoch and only allowing the oldest epoch to pop first. After the oldest epoch has finished, its backup workers notify the master, which then advances the oldest backup epoch so that the next epoch can proceed with popping.
A subtle issue for a displaced backup worker (i.e., one displaced because a new epoch begins) is that the last pop of the backup worker can cause missing version ranges in mutation logs. This is because the transaction for saving the progress may be delayed during recovery. As a result, the master could have already recruited a new backup worker for the old epoch starting at the previously saved progress version. Then the saving transaction succeeds, and the worker pops mutations that the new backup worker is supposed to save, resulting in missing data in the new backup worker's log. The solution to this problem can be: 1) the old backup worker aborts immediately after learning it is displaced, thus not trying to save its progress; or 2) the old backup worker skips its last pop, since the next epoch will pop versions larger than its progress. Because the second approach avoids doing duplicated work in the new epoch, we choose the second approach.
Finally, multiple concurrent backups are supported. Each backup worker keeps track of current backup jobs and saves mutations to corresponding backup containers for the same batch of mutations.
### Recruitment of Backup workers
Backup workers are recruited during master recovery as part of log system. The Master recruits a fixed number of backup workers, one for each log router tag. During the recruiting process, the master sends backup worker initialization request as:
```
struct InitializeBackupRequest {
UID reqId;
LogEpoch epoch; // epoch this worker is recruited
LogEpoch backupEpoch; // epoch that this worker actually works on
Tag routerTag;
Version startVersion;
Optional<Version> endVersion; // Only present for unfinished old epoch
ReplyPromise<struct BackupInterface> reply;
… // additional methods elided
};
```
Note we need two epochs here: one for the recruited epoch and one for the backing up epoch. The recruited epoch is the epoch of the log system, which is used by a backup worker to find out whether it still works for the current epoch. If not, the worker should save its progress and immediately exit. The `backupEpoch` is used for saving progress. The `backupEpoch` is usually the same as the epoch in which the worker is recruited. However, it can be an earlier epoch than the recruiting epoch, signifying that the worker is responsible for data in that earlier epoch. In this case, when the worker is done and exits, the master should not flag its departure as a trigger of recovery. This is solved by the following protocol:
1. The backup worker finishes its work, including saving progress to the key value store and uploading to cloud storage, and then sends a `BackupWorkerDoneRequest` to the master;
2. The master receives the request, removes the worker from its log system, and updates the oldest backing up epoch `oldestBackupEpoch`;
3. The master sends a reply message to the backup worker and registers the new log system with the cluster controller;
4. The backup worker exits after receiving the reply. Other backup workers in the system get the new log system from the cluster controller. If a backup worker's `backupEpoch` is equal to `oldestBackupEpoch`, then the worker may start popping from TLogs.
Note `oldestBackupEpoch` is introduced to prevent a backup worker for a newer epoch from popping when there are backup workers for older epochs. Otherwise, these older backup workers may lose data.
### Extension of tag partitioned log system to support pseudo tags
The tag partitioned log system is modeled like a FIFO queue, where Proxies push mutations to the queue and Storage Servers or Log Routers pop mutations from the queue. Specifically, consumers of the tag partitioned log system use two operations, `peek` and `pop`, to read mutations for a given tag and to pop mutations from the queue. Because Proxies assign each mutation a unique log router tag, the backup system reuses this tag to obtain the whole mutation stream. As a result, each log router tag now has two consumers, a log router and a backup worker.
To support multiple consumers of the log router tag, the peek and pop have been extended to support pseudo tags. In other words, each log router tag can be mapped to multiple pseudo tags. Log routers and Backup workers still `peek` mutations with the log router tag, but `pop` with different pseudo tags. Only after both pseudo tags are popped can TLogs pop the mutations from their internal queue.
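To make the pop rule concrete, below is a minimal sketch (hypothetical types and names, not the actual TLog implementation) of how a TLog could track per-pseudo-tag pop versions for one log router tag and only discard data up to the minimum of them:

```c++
#include <algorithm>
#include <cstdint>
#include <map>
#include <string>

using Version = int64_t;

// Tracks pop requests from the consumers (log router and backup worker)
// that share one underlying log router tag via pseudo tags.
struct PseudoTagPopTracker {
	std::map<std::string, Version> poppedVersion; // pseudo tag -> popped version

	void addPseudoTag(const std::string& tag) { poppedVersion.emplace(tag, 0); }

	// A consumer pops its pseudo tag up to version v.
	void pop(const std::string& tag, Version v) {
		auto it = poppedVersion.find(tag);
		if (it != poppedVersion.end()) it->second = std::max(it->second, v);
	}

	// The version up to which the TLog may actually discard data:
	// the minimum popped version across all pseudo tags.
	Version effectivePopVersion() const {
		if (poppedVersion.empty()) return 0;
		Version v = poppedVersion.begin()->second;
		for (const auto& kv : poppedVersion) v = std::min(v, kv.second);
		return v;
	}
};
```

With this rule, a slow backup worker holds back the effective pop version, which is exactly why mutations stay in the TLog until the backup worker pops its pseudo tag as well.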
Note the introduction of pseudo tags opens the possibility for more usage scenarios. For instance, a change stream can be implemented with a pseudo tag, where the new consumer can look at each mutation and emit mutations on specified key ranges.
### Integration with existing taskbucket based backup command interface
We strive to keep the operational interface the same as the old backup system. That is, the new backup is initiated by the client as before with an additional flag. FDB cluster receives the backup request, sees the flag being set, and uses the new system for generating mutation logs.
By default, backup workers are not enabled in the system. When operators submit a new backup request for the first time, the database performs a configuration change (`backup_worker_enabled:=1`) that enables backup workers.
The operator's backup request can indicate whether an old backup or a new backup is used. This is a command line option (i.e., `-p` or `--partitioned_log`) in the `fdbbackup` command. A backup request of the new type is started in the following steps:
1. Operators use `fdbbackup` tool to write the backup range to a system key, i.e., `\xff\x02/backupStarted`.
2. All backup workers monitor the key `\xff\x02/backupStarted`, see the change, and start logging mutations.
3. After all backup workers have started, the `fdbbackup` tool initiates the backup of all or specified key ranges by issuing a transaction `Ts`.
Compared to the old backup system, steps 1 and 2 above are new and are only triggered if the client requests the new type of backup. The purpose is to allow backup workers to function as no-ops if there are no ongoing backups. However, the backup workers should still continuously pop their corresponding tags, otherwise mutations will be kept in the TLog. In order to know the version up to which to pop, backup workers can obtain the read version from any proxy. Because the read version must be a committed version, popping to this version is safe.
**Backup Submission Protocol**
Protocol for `submitBackup()` to ensure that all backup workers of the current epoch have started logging mutations:
1. After the `submitBackup()` call, the task bucket (i.e., `StartFullBackupTaskFunc`) starts by creating a `BackupConfig` object in the system key space.
2. Each backup worker monitors the `\xff\x02/backupStarted` key and notices the new backup job. Then the backup worker inserts the new job into its internal queue and writes to the `startedBackupWorkers` key in the `BackupConfig` object if the worker's `backupEpoch` is the current epoch. Among these workers, the worker with Log Router Tag `-2:0` monitors the `startedBackupWorkers` key, and sets the `allWorkerStarted` key after all workers have updated the `startedBackupWorkers` key.
3. The task bucket watches change to the `startedBackupWorkers` key and declares the job submission successful.
This protocol was implemented after another, abandoned protocol: the `startedBackupWorkers` key is set after all backup workers have saved logs with versions larger than the version of the `submitBackup()` call. This protocol fails if there is already a backup job and there is a backup worker that doesn't notice the change to the `\xff\x02/backupStarted` key. As a result, the worker is saving versions larger than the new job's start version, but in the old backup container. Thus the new container misses some mutations.
**Protocol for Determining A Backup is Restorable**
1. Each backup worker independently logs mutations to a backup container and updates its progress in the system key space.
2. The worker with Log Router Tag `-2:0` of the current epoch monitors all workers' progress. If the oldest backup epoch is the current epoch (i.e., there are no backup workers for any old epochs, thus no version ranges are missing before this epoch), this worker updates the `latestBackupWorkerSavedVersion` key in the `BackupConfig` object with the minimum saved version among workers.
3. The client calls `describeBackup()`, which eventually calls `getLatestRestorableVersion` to read the value from the `latestBackupWorkerSavedVersion` key. If this version is larger than the first snapshot's end version, then the backup is restorable.
**Pause and Resume Backups**
The command line for pause or resume backups remains the same, but the implementation for the new backup system is different from the old one. This is because in the old backup system, both mutation logs and range logs are handled by `TaskBucket`, an asynchronous task scheduling framework that stores states in the FDB database. Thus, the old backup system simply pauses or resumes the `TaskBucket`. In the new backup system, mutation logs are generated by backup workers, thus the pause or resume command needs to tell all backup workers to pause or resume pulling mutations from TLogs. Specifically,
1. The operator issues a pause or resume request that updates both the `TaskBucket` and the `\xff\x02/backupPaused` key.
2. Each backup worker monitors the `\xff\x02/backupPaused` key and notices the change. Then the backup worker pauses or resumes pulling from TLogs.
**Backup Container Changes**
* Partitioned mutation logs are stored in the `plogs/XXXX/XXXX` directory and their names are in the format of `log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]`, where `M` is the total partition number and `N` can be any number from `0` to `M - 1`. In contrast, old mutation logs are stored in the `logs/XXXX/XXXX` directory and are named differently.
* To restore a version range, all partitioned logs for the range need to be available. The restore process should read all partitioned logs and combine mutations from different logs into one mutation stream, ordered by the `(commit_version, subsequence)` pair (a merge sketch follows this list). It is guaranteed that all mutations form a total order. Note that in the old backup files there is no subsequence number, as each version's mutations are serialized in order in one file.
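As mentioned above, the restore merges per-partition streams by `(commit_version, subsequence)`. The following is a hypothetical sketch (not the restore code) of such a k-way merge over already-sorted per-partition mutation lists:

```c++
#include <cstdint>
#include <functional>
#include <queue>
#include <string>
#include <tuple>
#include <vector>

struct VersionedMutation {
	int64_t version;      // commit version
	int32_t subsequence;  // order within the version, assigned by the proxy
	std::string mutation; // serialized type|kLen|vLen|Key|Value
};

// Merge per-partition streams (each already sorted) into one stream ordered
// by (commit_version, subsequence).
std::vector<VersionedMutation> mergePartitions(const std::vector<std::vector<VersionedMutation>>& partitions) {
	// Heap entry: (version, subsequence, partition index, index within partition)
	using Entry = std::tuple<int64_t, int32_t, size_t, size_t>;
	std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;

	for (size_t p = 0; p < partitions.size(); ++p)
		if (!partitions[p].empty())
			heap.emplace(partitions[p][0].version, partitions[p][0].subsequence, p, size_t(0));

	std::vector<VersionedMutation> merged;
	while (!heap.empty()) {
		Entry e = heap.top();
		heap.pop();
		size_t p = std::get<2>(e), i = std::get<3>(e);
		merged.push_back(partitions[p][i]);
		if (i + 1 < partitions[p].size())
			heap.emplace(partitions[p][i + 1].version, partitions[p][i + 1].subsequence, p, i + 1);
	}
	return merged;
}
```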
### Integration with the [Performant Restore System](https://github.com/apple/foundationdb/issues/1049)
As discussed above, the new backup system splits mutation logs into multiple partitions. Thus, the restore process must verify that the backup files are continuous for all partitions within the restore's version range. This is possible because each log file name has information about its partition number and the total number of partitions.
Once the restore system verifies that the version range is continuous, the restore system needs to filter out duplicated version ranges among different log files (both the log continuity analysis and the dedup logic are implemented in the `BackupContainer` abstraction). A given version range may be stored in **multiple** mutation log files. This can happen because a recruited backup worker can upload mutation files successfully, but doesn't save the progress before another recovery happens. As a result, the new epoch tries to back up this version range again, producing the same version ranges (though the file names are different).
Finally, the restore system loads the same version's mutations from all partitions, and then merges these mutations in the order of their subsequence numbers before they are applied on the restore cluster. Note that the mutations in the old backup system lack subsequence numbers. As a result, restoring old backups needs to assign subsequence numbers to mutations.
## Ordered and Complete Guarantee of Mutation Logs
The backup system must generate log files from which the restore system can apply all the mutations on the restore cluster in the same order exactly once.
**Ordering guarantee**. To maintain the ordering of mutations, each mutation is stored with its commit version and a subsequence number, both are assigned by Proxies during commit. The restore system can load all mutations and derive a total order among all the mutations.
**Completeness guarantee**. All mutations should be saved in log files. We cannot allow any mutations to be missing from the backup. This is guaranteed by the fault tolerance discussed below. Essentially all backup workers checkpoint their progress in the database. After a recovery, the new master reads the previous checkpoints and recruits new backup workers for any missing version ranges.
## Backup File Format
The old backup file format is documented [here](https://github.com/apple/foundationdb/blob/release-6.2/design/backup-dataFormat.md). We can't use this file format, because our backup files are created per log router tag. When there is more than one log router (almost always the case), the mutations in one transaction can be given different log router tags. As a result, for the same version, mutations are distributed across many files. Another subtle issue is that there can be two mutations (e.g., `a = 1` and `a = 2` in a transaction) which are given two different tags. We have to preserve the order of these two mutations in the restore process. Even though the order is saved in the sub-sequence number of a version, we still need to merge mutations from multiple files and apply them in the correct order.
In the new backup system, a mutation log file is named `log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]`, where `startVersion` is inclusive and `endVersion` is *not* inclusive, e.g., `log,332850851,332938927,7be23c0a3e80df8ab1530fa76fa66980,1-of-4,1048576`. With the information from all file names, the restore process can find all files for a version range, i.e., versions intersecting with the range and all log router tags. “`M`” is the total number of tags, and “`N`” ranges from `0` to `M - 1`. Note that a `tagId` is not required in the old backup filenames, since all mutations for a version are included in one file.
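As an illustration (a hypothetical helper, not part of the backup code), the fields of a partitioned log filename can be recovered by splitting on commas and on the `-of-` separator:

```c++
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

struct PartitionedLogFile {
	int64_t startVersion = 0; // inclusive
	int64_t endVersion = 0;   // exclusive
	std::string uid;
	int partitionId = 0;      // N, in [0, totalPartitions)
	int totalPartitions = 0;  // M
	int blockSize = 0;
};

// Parse "log,[startVersion],[endVersion],[UID],[N-of-M],[blockSize]".
bool parsePartitionedLogName(const std::string& name, PartitionedLogFile& out) {
	std::vector<std::string> fields;
	std::stringstream ss(name);
	for (std::string f; std::getline(ss, f, ',');) fields.push_back(f);
	if (fields.size() != 6 || fields[0] != "log") return false;

	out.startVersion = std::stoll(fields[1]);
	out.endVersion = std::stoll(fields[2]);
	out.uid = fields[3];
	size_t sep = fields[4].find("-of-");
	if (sep == std::string::npos) return false;
	out.partitionId = std::stoi(fields[4].substr(0, sep));
	out.totalPartitions = std::stoi(fields[4].substr(sep + 4));
	out.blockSize = std::stoi(fields[5]);
	return true;
}
```

For `log,332850851,332938927,7be23c0a3e80df8ab1530fa76fa66980,1-of-4,1048576`, this yields start version 332850851, end version 332938927, partition 1 of 4, and a block size of 1048576 bytes.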
Each file's content is a list of fixed-size blocks. Each block contains a sequence of mutations, where each mutation consists of a serialized `Version`, `int32_t`, `int32_t` (all three numbers are in big endian), and `Mutation`, where `Mutation` is of the format `type|kLen|vLen|Key|Value`: `type` is the mutation type (e.g., `Set` or `Clear`), `kLen` and `vLen` respectively are the lengths of the key and value in the mutation, and `Key` and `Value` are the serialized key and value of the mutation. The padding at the end of the block consists of bytes of `0xFF`.
```
`<BlockHeader>`
`<Version_1><Subseq_1><Mutation1_len><Mutation1>`
`<Version_2><Subseq_2><Mutation2_len><Mutation2>`
`…`
`<Padding>`
```
Note that big endianness for the version is required, as `0xFF` is used as the padding to indicate the block end. A little-endian number can easily be mistaken for the end. In contrast, a big-endian version almost guarantees that the first byte is not `0xFF` (it should always be `0x00`).
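To make the block layout concrete, here is a minimal hypothetical sketch (not the FDB decoder) that walks one data block, reading the big-endian `Version`, subsequence, and length fields and stopping at the `0xFF` padding:

```c++
#include <cstddef>
#include <cstdint>
#include <vector>

static uint64_t readBE(const uint8_t* p, int n) {
	uint64_t v = 0;
	for (int i = 0; i < n; ++i) v = (v << 8) | p[i];
	return v;
}

struct LogEntry {
	int64_t version;               // commit version (big endian on disk)
	int32_t subsequence;           // order within the version
	std::vector<uint8_t> mutation; // serialized type|kLen|vLen|Key|Value
};

// Iterate over one block: <Version(8B)><Subseq(4B)><MutationLen(4B)><Mutation>...
// headerSize is the size of <BlockHeader>, which this sketch does not interpret.
std::vector<LogEntry> decodeBlock(const uint8_t* block, size_t blockSize, size_t headerSize) {
	std::vector<LogEntry> entries;
	size_t pos = headerSize;
	// A big-endian version starts with 0x00, so a 0xFF byte marks the padding.
	while (pos + 16 <= blockSize && block[pos] != 0xFF) {
		LogEntry e;
		e.version = static_cast<int64_t>(readBE(block + pos, 8));
		e.subsequence = static_cast<int32_t>(readBE(block + pos + 8, 4));
		uint32_t len = static_cast<uint32_t>(readBE(block + pos + 12, 4));
		if (pos + 16 + len > blockSize) break; // truncated or malformed entry
		e.mutation.assign(block + pos + 16, block + pos + 16 + len);
		entries.push_back(std::move(e));
		pos += 16 + len;
	}
	return entries;
}
```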
## Performance optimization
### Future Optimizations
Add a metadata file describing the backup files:
* The number of mutations;
* The number of atomic operations;
* key range and version range of mutations in each backup file;
The information can be used to optimize the restore process. For instance, the number of mutations can be used to make better load balancing decisions; if there are no atomic operations, the restore can apply mutations in a backward fashion -- skipping mutations with earlier versions.
## Fault Tolerance
Failures of a backup worker will trigger a master recovery. After the recovery, the new master recruits a new set of backup workers. Among them, a new backup worker shall continue the work of the failed backup worker from the previous epoch.
The interesting part is the handling of old epochs, since the backup workers for the old epoch are in the “displaced” state and should exit. So the basic idea is that we need a set of backup workers for the data left in the old epochs. To figure out the set of data not yet backed up, the master first loads the saved backup progress data `<Worker_UID, LogEpoch, SavedVersion, Tag, TotalTags>` from the database, and then computes, for each epoch, what version ranges have not been backed up. For each such version range and tag, the master recruits a worker to resume the backup for that version range and tag. Note that this worker has a different worker UID from the worker in the original epoch. As a result, for a given epoch and a tag, there might be multiple progress statuses, as these workers are recruited at different epochs.
## KPI's and Metrics
The backup system emits the following metrics:
* How much backup data has been processed: the backup command line tool `fdbbackup` can show the status of a backup, including the size of mutation logs (`LogBytes written`) and snapshots (`RangeBytes written`). By taking two consecutive backup statuses, the backup speed can be estimated as `(2nd_LogBytes - 1st_LogBytes) / interval`.
* An estimation of backup delay: each backup worker emits `BackupWorkerMetrics` trace events every 5 seconds, which include `SavedVersion`, `MinKnownCommittedVersion`, and `MsgQ`. The backup delay can be estimated as `(MinKnownCommittedVersion - SavedVersion) / 1,000,000` seconds, which is the difference between a worker's saved version and the current committed version, divided by 1M versions per second. `MsgQ` is the queue size of the backup worker's memory buffer.
## Controlling Properties
System operator can control the following backup properties:
* **Backup key ranges**: The non-overlapped key ranges that will be backed up to the blob storage.
* **Blob url**: The root path in the blob store that hosts all backup files.
* **Performance knobs**: The knobs that control the performance:
* The backup interval (knob `BACKUP_UPLOAD_DELAY`) for saving mutation logs to blob storage;
## Testing
The feature will be tested both in simulation and in real clusters:
* New test cases are added into the test folder in FDB. The nightly correctness (i.e., simulation) tests will test the correctness of both backup and restore.
* Tests will be added to constantly back up a cluster with the new backup system and restore the backup to ensure the restore works on real clusters. During the time period of active backup, the cluster should have better write performance than when using the old backup system.
* Tests should also be conducted with production data. This ensures backup data is restorable and catches potential bugs in backup and restore. This test is preferably conducted regularly, e.g., weekly per cluster.
Before the restore system is available, the testing strategy for backup files is to keep the old backup system running. Thus, both new backup files and old backup files are generated. Then both types of log files are decoded and compared against each other. The new backup files are considered correct if their content matches the content of the old log files.

View File

@ -0,0 +1,81 @@
# Special-Key-Space
This document discusses why we need the proposed special-key-space framework, what problems the framework aims to solve, and in what scenarios a developer should use it.
## Motivation
Currently, there are several client functions implemented as FDB calls by passing through special keys (prefixed with `\xff\xff`). Below are all existing features:
- **status/json**: `get("\xff\xff/status/json")`
- **cluster_file_path**: `get("\xff\xff/cluster_file_path")`
- **connection_string**: `get("\xff\xff/connection_string")`
- **worker_interfaces**: `getRange("\xff\xff/worker_interfaces", <any_key>)`
- **conflicting-keys**: `getRange("\xff\xff/transaction/conflicting_keys/", "\xff\xff/transaction/conflicting_keys/\xff")`
At present, implementations are hard-coded and the pain points are obvious:
- **Maintainability**: As more features are added, the hard-coded snippets are hard to maintain
- **Granularity**: It is impossible to scale up and down. For example, you may want a cheap call like `get("\xff\xff/status/json/<certain_field>")` instead of calling `status/json` and parsing the results. On the contrary, sometimes you want to aggregate results from several similar features like `getRange("\xff\xff/transaction/, \xff\xff/transaction/\xff")` to get all transaction-related info. Neither is achievable at present.
- **Consistency**: While using FDB calls like `get` or `getRange`, the behavior that the result of `get("\xff\xff/B")` is not included in `getRange("\xff\xff/A", "\xff\xff/C")` is inconsistent with general FDB calls.
Consequently, the special-key-space framework aims to integrate all client functions using special keys (prefixed with `\xff\xff`) and solve the pain points listed above.
## When
If your feature is exposing information to clients and the results are easily formatted as key-value pairs, then you can use special-key-space to implement your client function.
## How
If you choose to use it, you need to implement a function class that inherits from `SpecialKeyRangeBaseImpl`, which has an abstract method `Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw, KeyRangeRef kr)`.
This method can be treated as a callback, whose implementation details are determined by the developer.
Once you fill out the method, register the function class to the corresponding key range.
Below is a detailed example.
```c++
// Implement the function class,
// the corresponding key range is [\xff\xff/example/, \xff\xff/example/\xff)
class SKRExampleImpl : public SpecialKeyRangeBaseImpl {
public:
explicit SKRExampleImpl(KeyRangeRef kr): SpecialKeyRangeBaseImpl(kr) {
// Our implementation is quite simple here, the key-value pairs are formatted as:
// \xff\xff/example/<country_name> : <capital_city_name>
CountryToCapitalCity[LiteralStringRef("USA")] = LiteralStringRef("Washington, D.C.");
CountryToCapitalCity[LiteralStringRef("UK")] = LiteralStringRef("London");
CountryToCapitalCity[LiteralStringRef("Japan")] = LiteralStringRef("Tokyo");
CountryToCapitalCity[LiteralStringRef("China")] = LiteralStringRef("Beijing");
}
// Implement the getRange interface
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override {
Standalone<RangeResultRef> result;
for (auto const& country : CountryToCapitalCity) {
// the registered range here: [\xff\xff/example/, \xff\xff/example/\xff]
Key keyWithPrefix = country.first.withPrefix(range.begin);
// check if any valid keys are given in the range
if (kr.contains(keyWithPrefix)) {
result.push_back(result.arena(), KeyValueRef(keyWithPrefix, country.second));
result.arena().dependsOn(keyWithPrefix.arena());
}
}
return result;
}
private:
std::map<Key, Value> CountryToCapitalCity;
};
// Instantiate the function object
// In development, you should have a function object pointer in DatabaseContext(DatabaseContext.h) and initialize in DatabaseContext's constructor(NativeAPI.actor.cpp)
const KeyRangeRef exampleRange(LiteralStringRef("\xff\xff/example/"), LiteralStringRef("\xff\xff/example/\xff"));
SKRExampleImpl exampleImpl(exampleRange);
// Assuming the database handler is `cx`, register to special-key-space
// In development, you should register all function objects in the constructor of DatabaseContext(NativeAPI.actor.cpp)
cx->specialKeySpace->registerKeyRange(exampleRange, &exampleImpl);
// Now any ReadYourWritesTransaction associated with `cx` is able to query the info
state ReadYourWritesTransaction tr(cx);
// get
Optional<Value> res1 = wait(tr.get("\xff\xff/example/Japan"));
ASSERT(res1.present() && res1.getValue() == LiteralStringRef("Tokyo"));
// getRange
// Note: for getRange(key1, key2), both key1 and key2 should prefixed with \xff\xff
// something like getRange("normal_key", "\xff\xff/...") is not supported yet
Standalone<RangeResultRef> res2 = wait(tr.getRange(LiteralStringRef("\xff\xff/example/U"), LiteralStringRef("\xff\xff/example/U\xff")));
// res2 should contain USA and UK
ASSERT(
res2.size() == 2 &&
res2[0].value == LiteralStringRef("London") &&
res2[1].value == LiteralStringRef("Washington, D.C.")
);
```

View File

@ -176,6 +176,9 @@
.. |transaction-get-committed-version-blurb| replace::
Gets the version number at which a successful commit modified the database. This must be called only after the successful (non-error) completion of a call to |commit-func| on this Transaction, or the behavior is undefined. Read-only transactions do not modify the database when committed and will have a committed version of -1. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.
.. |transaction-get-approximate-size-blurb| replace::
Gets the approximate transaction size so far, which is the summation of the estimated sizes of mutations, read conflict ranges, and write conflict ranges.
.. |transaction-get-versionstamp-blurb| replace::
Returns a future which will contain the versionstamp which was used by any versionstamp operations in this transaction. This function must be called before a call to |commit-func| on this Transaction. The future will be ready only after the successful completion of a call to |commit-func| on this Transaction. Read-only transactions do not modify the database when committed and will result in the future completing with an error. Keep in mind that a transaction which reads keys and then sets them to their current values may be optimized to a read-only transaction.

View File

@ -805,6 +805,13 @@ Transaction misc functions
.. _api-python-transaction-options:
Transaction misc functions
--------------------------
.. method:: Transaction.get_approximate_size()
|transaction-get-approximate-size-blurb|. Returns a :class:`FutureInt64`.
Transaction options
-------------------

View File

@ -736,7 +736,7 @@ Most applications should use the read version that FoundationDB determines autom
|infrequent| |transaction-get-committed-version-blurb|
.. method:: Transaction.get_verionstamp() -> String
.. method:: Transaction.get_versionstamp() -> String
|infrequent| |transaction-get-versionstamp-blurb|
@ -747,6 +747,10 @@ Transaction misc functions
Get the estimated byte size of the given key range. Returns a :class:`Int64Future`.
.. method:: Transaction.get_approximate_size() -> Int64Future
|transaction-get-approximate-size-blurb|. Returns a :class:`Int64Future`.
Transaction options
-------------------

View File

@ -167,6 +167,11 @@ getversion
The ``getversion`` command fetches the current read version of the cluster or currently running transaction.
advanceversion
--------------
Forces the cluster to recover at the specified version. If the specified version is larger than the current version of the cluster, the cluster version is advanced to the specified version via a forced recovery.
help
----

View File

@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.2.19.pkg <https://www.foundationdb.org/downloads/6.2.19/macOS/installers/FoundationDB-6.2.19.pkg>`_
* `FoundationDB-6.2.20.pkg <https://www.foundationdb.org/downloads/6.2.20/macOS/installers/FoundationDB-6.2.20.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.2.19-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.19/ubuntu/installers/foundationdb-clients_6.2.19-1_amd64.deb>`_
* `foundationdb-server-6.2.19-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.19/ubuntu/installers/foundationdb-server_6.2.19-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.2.20-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.20/ubuntu/installers/foundationdb-clients_6.2.20-1_amd64.deb>`_
* `foundationdb-server-6.2.20-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.20/ubuntu/installers/foundationdb-server_6.2.20-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.2.19-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel6/installers/foundationdb-clients-6.2.19-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.19-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel6/installers/foundationdb-server-6.2.19-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.20-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel6/installers/foundationdb-clients-6.2.20-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.20-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel6/installers/foundationdb-server-6.2.20-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.2.19-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel7/installers/foundationdb-clients-6.2.19-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.19-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.19/rhel7/installers/foundationdb-server-6.2.19-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.20-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel7/installers/foundationdb-clients-6.2.20-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.20-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.20/rhel7/installers/foundationdb-server-6.2.20-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.2.19-x64.msi <https://www.foundationdb.org/downloads/6.2.19/windows/installers/foundationdb-6.2.19-x64.msi>`_
* `foundationdb-6.2.20-x64.msi <https://www.foundationdb.org/downloads/6.2.20/windows/installers/foundationdb-6.2.20-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, use the Python package manager ``pip`` (``pip install foundationdb``) or download the Python package:
* `foundationdb-6.2.19.tar.gz <https://www.foundationdb.org/downloads/6.2.19/bindings/python/foundationdb-6.2.19.tar.gz>`_
* `foundationdb-6.2.20.tar.gz <https://www.foundationdb.org/downloads/6.2.20/bindings/python/foundationdb-6.2.20.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.2.19.gem <https://www.foundationdb.org/downloads/6.2.19/bindings/ruby/fdb-6.2.19.gem>`_
* `fdb-6.2.20.gem <https://www.foundationdb.org/downloads/6.2.20/bindings/ruby/fdb-6.2.20.gem>`_
Java 8+
-------
* `fdb-java-6.2.19.jar <https://www.foundationdb.org/downloads/6.2.19/bindings/java/fdb-java-6.2.19.jar>`_
* `fdb-java-6.2.19-javadoc.jar <https://www.foundationdb.org/downloads/6.2.19/bindings/java/fdb-java-6.2.19-javadoc.jar>`_
* `fdb-java-6.2.20.jar <https://www.foundationdb.org/downloads/6.2.20/bindings/java/fdb-java-6.2.20.jar>`_
* `fdb-java-6.2.20-javadoc.jar <https://www.foundationdb.org/downloads/6.2.20/bindings/java/fdb-java-6.2.20-javadoc.jar>`_
Go 1.11+
--------

View File

@ -2,7 +2,7 @@
Release Notes
#############
7.0.0
6.3.0
=====
Features

View File

@ -45,11 +45,11 @@ if(NOT OPEN_FOR_IDE)
symlink_files(
LOCATION packages/bin
SOURCE fdbbackup
TARGETS fdbdr dr_agent backup_agent fdbrestore)
TARGETS fdbdr dr_agent backup_agent fdbrestore fastrestore_agent)
symlink_files(
LOCATION bin
SOURCE fdbbackup
TARGETS fdbdr dr_agent backup_agent fdbrestore)
TARGETS fdbdr dr_agent backup_agent fdbrestore fastrestore_agent)
endif()
if (GPERFTOOLS_FOUND)

View File

@ -373,17 +373,6 @@ struct LogFileWriter {
return wr.toValue();
}
// Return a block of contiguous padding bytes, growing if needed.
static Value makePadding(int size) {
static Value pad;
if (pad.size() < size) {
pad = makeString(size);
memset(mutateString(pad), '\xff', pad.size());
}
return pad.substr(0, size);
}
// Start a new block if needed, then write the key and value
ACTOR static Future<Void> writeKV_impl(LogFileWriter* self, Key k, Value v) {
// If key and value do not fit in this block, end it and start a new one
@ -392,7 +381,7 @@ struct LogFileWriter {
// Write padding if needed
int bytesLeft = self->blockEnd - self->file->size();
if (bytesLeft > 0) {
state Value paddingFFs = makePadding(bytesLeft);
state Value paddingFFs = fileBackup::makePadding(bytesLeft);
wait(self->file->append(paddingFFs.begin(), bytesLeft));
}

View File

@ -2192,20 +2192,21 @@ ACTOR Future<Void> runRestore(Database db, std::string originalClusterFile, std:
// Fast restore agent that kicks off the restore: send restore requests to restore workers.
ACTOR Future<Void> runFastRestoreAgent(Database db, std::string tagName, std::string container,
Standalone<VectorRef<KeyRangeRef>> ranges, Version dbVersion,
bool performRestore, bool verbose, bool waitForDone, std::string addPrefix,
std::string removePrefix) {
bool performRestore, bool verbose, bool waitForDone) {
try {
state FileBackupAgent backupAgent;
state Version restoreVersion = invalidVersion;
if (ranges.size() > 1) {
fprintf(stderr, "Currently only a single restore range is supported!\n");
throw restore_error();
fprintf(stdout, "[WARNING] Currently only a single restore range is tested!\n");
}
state KeyRange range = (ranges.size() == 0) ? normalKeys : ranges.front();
if (ranges.size() == 0) {
ranges.push_back(ranges.arena(), normalKeys);
}
printf("[INFO] runFastRestoreAgent: num_ranges:%d restore_range:%s\n", ranges.size(), range.toString().c_str());
printf("[INFO] runFastRestoreAgent: restore_ranges:%d first range:%s\n", ranges.size(),
ranges.front().toString().c_str());
if (performRestore) {
if (dbVersion == invalidVersion) {
@ -2219,9 +2220,26 @@ ACTOR Future<Void> runFastRestoreAgent(Database db, std::string tagName, std::st
dbVersion = desc.maxRestorableVersion.get();
TraceEvent("FastRestoreAgent").detail("TargetRestoreVersion", dbVersion);
}
Version _restoreVersion = wait(fastRestore(db, KeyRef(tagName), KeyRef(container), waitForDone, dbVersion,
verbose, range, KeyRef(addPrefix), KeyRef(removePrefix)));
restoreVersion = _restoreVersion;
state UID randomUID = deterministicRandom()->randomUniqueID();
TraceEvent("FastRestoreAgent")
.detail("SubmitRestoreRequests", ranges.size())
.detail("RestoreUID", randomUID);
wait(backupAgent.submitParallelRestore(db, KeyRef(tagName), ranges, KeyRef(container), dbVersion, true,
randomUID));
if (waitForDone) {
// Wait for parallel restore to finish and unlock DB after that
TraceEvent("FastRestoreAgent").detail("BackupAndParallelRestore", "WaitForRestoreToFinish");
wait(backupAgent.parallelRestoreFinish(db, randomUID));
TraceEvent("FastRestoreAgent").detail("BackupAndParallelRestore", "RestoreFinished");
} else {
TraceEvent("FastRestoreAgent")
.detail("RestoreUID", randomUID)
.detail("OperationGuide", "Manually unlock DB when restore finishes");
printf("WARNING: DB will be in locked state after restore. Need UID:%s to unlock DB\n",
randomUID.toString().c_str());
}
restoreVersion = dbVersion;
} else {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(container);
state BackupDescription description = wait(bc->describeBackup());
@ -3740,7 +3758,7 @@ int main(int argc, char* argv[]) {
switch (restoreType) {
case RESTORE_START:
f = stopAfter(runFastRestoreAgent(db, tagName, restoreContainer, backupKeys, restoreVersion, !dryRun,
!quietDisplay, waitForDone, addPrefix, removePrefix));
!quietDisplay, waitForDone));
break;
case RESTORE_WAIT:
printf("[TODO][ERROR] FastRestore does not support RESTORE_WAIT yet!\n");
@ -3887,102 +3905,3 @@ int main(int argc, char* argv[]) {
flushAndExit(status);
}
//------Restore Agent: Kick off the restore by sending the restore requests
ACTOR static Future<FileBackupAgent::ERestoreState> waitFastRestore(Database cx, Key tagName, bool verbose) {
// We should wait for all restores to finish before proceeding
TraceEvent("FastRestore").detail("Progress", "WaitForRestoreToFinish");
state ReadYourWritesTransaction tr(cx);
state Future<Void> fRestoreRequestDone;
state bool restoreRequestDone = false;
loop {
try {
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
// In case restoreRequestDoneKey is already set before we set watch on it
Optional<Value> restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
if (restoreRequestDoneKeyValue.present()) {
restoreRequestDone = true;
tr.clear(restoreRequestDoneKey);
wait(tr.commit());
break;
} else if (!restoreRequestDone) {
fRestoreRequestDone = tr.watch(restoreRequestDoneKey);
wait(tr.commit());
wait(fRestoreRequestDone);
} else {
break;
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
TraceEvent("FastRestore").detail("Progress", "RestoreFinished");
return FileBackupAgent::ERestoreState::COMPLETED;
}
ACTOR static Future<Version> _fastRestore(Database cx, Key tagName, Key url, bool waitForComplete,
Version targetVersion, bool verbose, KeyRange range, Key addPrefix,
Key removePrefix) {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString());
state BackupDescription desc = wait(bc->describeBackup());
wait(desc.resolveVersionTimes(cx));
if (targetVersion == invalidVersion && desc.maxRestorableVersion.present())
targetVersion = desc.maxRestorableVersion.get();
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));
TraceEvent("FastRestore").detail("BackupDesc", desc.toString()).detail("TargetVersion", targetVersion);
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
.detail("BackupContainer", bc->getURL())
.detail("TargetVersion", targetVersion);
throw restore_invalid_version();
}
// NOTE: The restore agent makes sure we only support 1 restore range for each restore request for now!
// The simulation test did test restoring multiple restore ranges in one restore request though.
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state int restoreIndex = 0;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Standalone<StringRef> restoreTag(tagName.toString() + "_" + std::to_string(restoreIndex));
bool locked = true;
struct RestoreRequest restoreRequest(restoreIndex, restoreTag, KeyRef(bc->getURL()), true, targetVersion,
true, range, Key(), Key(), locked,
deterministicRandom()->randomUniqueID());
tr->set(restoreRequestKeyFor(restoreRequest.index), restoreRequestValue(restoreRequest));
// backupRanges.size = 1 because we only support restoring 1 range in real mode for now
tr->set(restoreRequestTriggerKey, restoreRequestTriggerValue(deterministicRandom()->randomUniqueID(),1));
wait(tr->commit()); // Trigger fast restore
break;
} catch (Error& e) {
if (e.code() != error_code_restore_duplicate_tag) {
wait(tr->onError(e));
}
}
}
if (waitForComplete) {
FileBackupAgent::ERestoreState finalState = wait(waitFastRestore(cx, tagName, verbose));
if (finalState != FileBackupAgent::ERestoreState::COMPLETED) throw restore_error();
}
return targetVersion;
}
ACTOR Future<Version> fastRestore(Database cx, Standalone<StringRef> tagName, Standalone<StringRef> url,
bool waitForComplete, long targetVersion, bool verbose, Standalone<KeyRangeRef> range,
Standalone<StringRef> addPrefix, Standalone<StringRef> removePrefix) {
Version result =
wait(_fastRestore(cx, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix));
return result;
}

View File

@ -525,6 +525,11 @@ void initHelp() {
helpMap["getversion"] =
CommandHelp("getversion", "Fetch the current read version",
"Displays the current read version of the database or currently running transaction.");
helpMap["advanceversion"] = CommandHelp(
"advanceversion <VERSION>", "Force the cluster to recover at the specified version",
"Forces the cluster to recover at the specified version. If the specified version is larger than the current "
"version of the cluster, the cluster version is advanced "
"to the specified version via a forced recovery.");
helpMap["reset"] = CommandHelp(
"reset",
"reset the current transaction",
@ -3217,6 +3222,23 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "advanceversion")) {
if (tokens.size() != 2) {
printUsage(tokens[0]);
is_error = true;
} else {
Version v;
int n = 0;
if (sscanf(tokens[1].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[1].size()) {
printUsage(tokens[0]);
is_error = true;
} else {
wait(makeInterruptable(advanceVersion(db, v)));
}
}
continue;
}
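
The argument check above relies on the classic `sscanf` + `%n` idiom: the token is accepted only if exactly one integer was converted and `%n` reports that every character of the token was consumed. A stand-alone restatement of that check (illustrative only; the fdbcli code parses into a 64-bit `Version` with the same format string):

```cpp
#include <cstdio>
#include <string>

// Accept `tok` only if it is entirely a signed decimal integer:
// sscanf must report exactly one conversion AND %n must show that the
// whole token was consumed (this rejects trailing garbage like "123x").
static bool parseWholeLong(const std::string& tok, long& out) {
    int consumed = 0;
    return std::sscanf(tok.c_str(), "%ld%n", &out, &consumed) == 1 &&
           consumed == static_cast<int>(tok.size());
}
```
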
if (tokencmp(tokens[0], "kill")) {
getTransaction(db, tr, options, intrans);
if (tokens.size() == 1) {

View File

@ -278,7 +278,7 @@ public:
// parallel restore
Future<Void> parallelRestoreFinish(Database cx, UID randomUID);
Future<Void> submitParallelRestore(Database cx, Key backupTag, Standalone<VectorRef<KeyRangeRef>> backupRanges,
KeyRef bcUrl, Version targetVersion, bool lockDB, UID randomUID);
Key bcUrl, Version targetVersion, bool lockDB, UID randomUID);
Future<Void> atomicParallelRestore(Database cx, Key tagName, Standalone<VectorRef<KeyRangeRef>> ranges,
Key addPrefix, Key removePrefix);
@ -505,8 +505,6 @@ Standalone<VectorRef<KeyRangeRef>> getApplyRanges(Version beginVersion, Version
Future<Void> eraseLogData(Reference<ReadYourWritesTransaction> tr, Key logUidValue, Key destUidValue, Optional<Version> endVersion = Optional<Version>(), bool checkBackupUid = false, Version backupUid = 0);
Key getApplyKey( Version version, Key backupUid );
std::pair<Version, uint32_t> decodeBKMutationLogKey(Key key);
Standalone<VectorRef<MutationRef>> decodeBackupLogValue(StringRef value);
void decodeBackupLogValue(Arena& arena, VectorRef<MutationRef>& result, int64_t& mutationSize, StringRef value, StringRef addPrefix = StringRef(), StringRef removePrefix = StringRef());
Future<Void> logError(Database cx, Key keyErrors, const std::string& message);
Future<Void> logError(Reference<ReadYourWritesTransaction> tr, Key keyErrors, const std::string& message);
Future<Void> checkVersion(Reference<ReadYourWritesTransaction> const& tr);
@ -893,10 +891,6 @@ public:
}
};
ACTOR Future<Version> fastRestore(Database cx, Standalone<StringRef> tagName, Standalone<StringRef> url,
bool waitForComplete, long targetVersion, bool verbose, Standalone<KeyRangeRef> range,
Standalone<StringRef> addPrefix, Standalone<StringRef> removePrefix);
// Helper class for reading restore data from a buffer and throwing the right errors.
struct StringRefReader {
StringRefReader(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e) {}
@ -934,5 +928,13 @@ struct StringRefReader {
Error failure_error;
};
namespace fileBackup {
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file, int64_t offset,
int len);
// Return a block of contiguous padding bytes ('\xff') for backup files, growing if needed.
Value makePadding(int size);
}
#include "flow/unactorcompiler.h"
#endif

View File

@ -209,69 +209,6 @@ std::pair<Version, uint32_t> decodeBKMutationLogKey(Key key) {
bigEndian32(*(int32_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t) + sizeof(int64_t))));
}
// value is an iterable representing all of the transaction log data for
// a given version. Returns an iterable (generator) yielding a tuple for
// each mutation in the log. At present, all mutations are represented as
// (type, param1, param2) where type is an integer and param1 and param2 are byte strings
Standalone<VectorRef<MutationRef>> decodeBackupLogValue(StringRef value) {
try {
uint64_t offset(0);
uint64_t protocolVersion = 0;
memcpy(&protocolVersion, value.begin(), sizeof(uint64_t));
offset += sizeof(uint64_t);
if (protocolVersion <= 0x0FDB00A200090001){
TraceEvent(SevError, "DecodeBackupLogValue").detail("IncompatibleProtocolVersion", protocolVersion)
.detail("ValueSize", value.size()).detail("Value", value);
throw incompatible_protocol_version();
}
Standalone<VectorRef<MutationRef>> result;
uint32_t totalBytes = 0;
memcpy(&totalBytes, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t consumed = 0;
if(totalBytes + offset > value.size())
throw restore_missing_data();
int originalOffset = offset;
while (consumed < totalBytes){
uint32_t type = 0;
memcpy(&type, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t len1 = 0;
memcpy(&len1, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t len2 = 0;
memcpy(&len2, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
MutationRef logValue;
logValue.type = type;
logValue.param1 = value.substr(offset, len1);
offset += len1;
logValue.param2 = value.substr(offset, len2);
offset += len2;
result.push_back_deep(result.arena(), logValue);
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
}
ASSERT(consumed == totalBytes);
if (value.size() != offset) {
TraceEvent(SevError, "BA_DecodeBackupLogValue").detail("UnexpectedExtraDataSize", value.size()).detail("Offset", offset).detail("TotalBytes", totalBytes).detail("Consumed", consumed).detail("OriginalOffset", originalOffset);
throw restore_corrupted_data();
}
return result;
}
catch (Error& e) {
TraceEvent(e.code() == error_code_restore_missing_data ? SevWarn : SevError, "BA_DecodeBackupLogValue").error(e).GetLastError().detail("ValueSize", value.size()).detail("Value", value);
throw;
}
}
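
Both this removed overload and the `Arena`-based overload that remains below parse the same mutation-log value layout: an 8-byte protocol version, a 4-byte total length, then a sequence of records of the form `[uint32 type][uint32 len1][uint32 len2][param1 bytes][param2 bytes]`, where the 12-byte record header corresponds to `BackupAgentBase::logHeaderSize` in the original. A minimal stand-alone sketch of that layout using plain STL types instead of `StringRef`/`VectorRef` (illustrative only, not the FDB code):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

struct DecodedMutation {
    uint32_t type;
    std::string param1, param2;
};

// Decode one backup-log value: [uint64 protocolVersion][uint32 totalBytes]
// followed by `totalBytes` worth of (type, len1, len2, param1, param2) records.
static std::vector<DecodedMutation> decodeLogValue(const std::vector<uint8_t>& value) {
    if (value.size() < sizeof(uint64_t) + sizeof(uint32_t)) throw std::runtime_error("short value");
    size_t off = 0;
    auto rd32 = [&]() {
        uint32_t v;
        std::memcpy(&v, value.data() + off, sizeof(v));
        off += sizeof(v);
        return v;
    };
    uint64_t protocolVersion = 0;
    std::memcpy(&protocolVersion, value.data() + off, sizeof(protocolVersion));
    off += sizeof(protocolVersion);
    if (protocolVersion <= 0x0FDB00A200090001ULL) throw std::runtime_error("incompatible protocol version");

    uint32_t totalBytes = rd32();
    if (off + totalBytes > value.size()) throw std::runtime_error("missing data");

    std::vector<DecodedMutation> out;
    uint32_t consumed = 0;
    while (consumed < totalBytes) {
        DecodedMutation m;
        m.type = rd32();
        uint32_t len1 = rd32();
        uint32_t len2 = rd32();
        m.param1.assign(reinterpret_cast<const char*>(value.data() + off), len1);
        off += len1;
        m.param2.assign(reinterpret_cast<const char*>(value.data() + off), len2);
        off += len2;
        out.push_back(std::move(m));
        consumed += 12 + len1 + len2; // 12 == the three uint32 header fields per record
    }
    return out;
}
```
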
void decodeBackupLogValue(Arena& arena, VectorRef<MutationRef>& result, int& mutationSize, StringRef value, StringRef addPrefix, StringRef removePrefix, Version version, Reference<KeyRangeMap<Version>> key_version) {
try {
uint64_t offset(0);

View File

@ -20,6 +20,7 @@
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/JsonBuilder.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
@ -424,9 +425,11 @@ public:
}
// TODO: Do this more efficiently, as the range file list for a snapshot could potentially be hundreds of megabytes.
ACTOR static Future<std::vector<RangeFile>> readKeyspaceSnapshot_impl(Reference<BackupContainerFileSystem> bc, KeyspaceSnapshotFile snapshot) {
ACTOR static Future<std::pair<std::vector<RangeFile>, std::map<std::string, KeyRange>>> readKeyspaceSnapshot_impl(
Reference<BackupContainerFileSystem> bc, KeyspaceSnapshotFile snapshot) {
// Read the range file list for the specified version range, and then index them by fileName.
// This is so we can verify that each of the files listed in the manifest file are also in the container at this time.
// This is so we can verify that each of the files listed in the manifest file are also in the container at this
// time.
std::vector<RangeFile> files = wait(bc->listRangeFiles(snapshot.beginVersion, snapshot.endVersion));
state std::map<std::string, RangeFile> rangeIndex;
for(auto &f : files)
@ -482,15 +485,38 @@ public:
throw restore_missing_data();
}
return results;
// Check key ranges for files
std::map<std::string, KeyRange> fileKeyRanges;
JSONDoc ranges = doc.subDoc("keyRanges"); // Creates an empty sub-document if one does not exist yet
for (auto i : ranges.obj()) {
const std::string& filename = i.first;
JSONDoc fields(i.second);
std::string begin, end;
if (fields.tryGet("beginKey", begin) && fields.tryGet("endKey", end)) {
TraceEvent("ManifestFields")
.detail("File", filename)
.detail("Begin", printable(StringRef(begin)))
.detail("End", printable(StringRef(end)));
fileKeyRanges.emplace(filename, KeyRange(KeyRangeRef(StringRef(begin), StringRef(end))));
} else {
TraceEvent("MalFormattedManifest").detail("Key", filename);
throw restore_corrupted_data();
}
}
return std::make_pair(results, fileKeyRanges);
}
Future<std::vector<RangeFile>> readKeyspaceSnapshot(KeyspaceSnapshotFile snapshot) {
Future<std::pair<std::vector<RangeFile>, std::map<std::string, KeyRange>>> readKeyspaceSnapshot(
KeyspaceSnapshotFile snapshot) {
return readKeyspaceSnapshot_impl(Reference<BackupContainerFileSystem>::addRef(this), snapshot);
}
ACTOR static Future<Void> writeKeyspaceSnapshotFile_impl(Reference<BackupContainerFileSystem> bc, std::vector<std::string> fileNames, int64_t totalBytes) {
ASSERT(!fileNames.empty());
ACTOR static Future<Void> writeKeyspaceSnapshotFile_impl(Reference<BackupContainerFileSystem> bc,
std::vector<std::string> fileNames,
std::vector<std::pair<Key, Key>> beginEndKeys,
int64_t totalBytes) {
ASSERT(!fileNames.empty() && fileNames.size() == beginEndKeys.size());
state Version minVer = std::numeric_limits<Version>::max();
state Version maxVer = 0;
@ -521,6 +547,13 @@ public:
doc.create("beginVersion") = minVer;
doc.create("endVersion") = maxVer;
auto ranges = doc.subDoc("keyRanges");
for (int i = 0; i < beginEndKeys.size(); i++) {
auto fileDoc = ranges.subDoc(fileNames[i], /*split=*/false);
fileDoc.create("beginKey") = beginEndKeys[i].first.toString();
fileDoc.create("endKey") = beginEndKeys[i].second.toString();
}
wait(yield());
state std::string docString = json_spirit::write_string(json);
@ -531,8 +564,11 @@ public:
return Void();
}
Future<Void> writeKeyspaceSnapshotFile(std::vector<std::string> fileNames, int64_t totalBytes) final {
return writeKeyspaceSnapshotFile_impl(Reference<BackupContainerFileSystem>::addRef(this), fileNames, totalBytes);
Future<Void> writeKeyspaceSnapshotFile(const std::vector<std::string>& fileNames,
const std::vector<std::pair<Key, Key>>& beginEndKeys,
int64_t totalBytes) final {
return writeKeyspaceSnapshotFile_impl(Reference<BackupContainerFileSystem>::addRef(this), fileNames,
beginEndKeys, totalBytes);
};
// List log files, unsorted, which contain data at any version >= beginVersion and <= targetVersion.
@ -1193,7 +1229,10 @@ public:
std::vector<LogFile> filtered;
int i = 0;
for (int j = 1; j < logs.size(); j++) {
if (logs[j].isSubset(logs[i])) continue;
if (logs[j].isSubset(logs[i])) {
ASSERT(logs[j].fileSize <= logs[i].fileSize);
continue;
}
if (!logs[i].isSubset(logs[j])) {
filtered.push_back(logs[i]);
@ -1249,6 +1288,7 @@ public:
// filter out if indices.back() is subset of files[i] or vice versa
if (!indices.empty()) {
if (logs[indices.back()].isSubset(logs[i])) {
ASSERT(logs[indices.back()].fileSize <= logs[i].fileSize);
indices.back() = i;
} else if (!logs[i].isSubset(logs[indices.back()])) {
indices.push_back(i);
@ -1291,6 +1331,30 @@ public:
return end;
}
ACTOR static Future<KeyRange> getSnapshotFileKeyRange_impl(Reference<BackupContainerFileSystem> bc,
RangeFile file) {
state Reference<IAsyncFile> inFile = wait(bc->readFile(file.fileName));
state bool beginKeySet = false;
state Key beginKey;
state Key endKey;
state int64_t j = 0;
for (; j < file.fileSize; j += file.blockSize) {
int64_t len = std::min<int64_t>(file.blockSize, file.fileSize - j);
Standalone<VectorRef<KeyValueRef>> blockData = wait(fileBackup::decodeRangeFileBlock(inFile, j, len));
if (!beginKeySet) {
beginKey = blockData.front().key;
beginKeySet = true;
}
endKey = blockData.back().key;
}
return KeyRange(KeyRangeRef(beginKey, endKey));
}
Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file) final {
ASSERT(g_network->isSimulated());
return getSnapshotFileKeyRange_impl(Reference<BackupContainerFileSystem>::addRef(this), file);
}
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet_impl(Reference<BackupContainerFileSystem> bc, Version targetVersion) {
// Find the most recent keyrange snapshot to end at or before targetVersion
state Optional<KeyspaceSnapshotFile> snapshot;
@ -1305,12 +1369,27 @@ public:
restorable.snapshot = snapshot.get();
restorable.targetVersion = targetVersion;
std::vector<RangeFile> ranges = wait(bc->readKeyspaceSnapshot(snapshot.get()));
restorable.ranges = ranges;
std::pair<std::vector<RangeFile>, std::map<std::string, KeyRange>> results =
wait(bc->readKeyspaceSnapshot(snapshot.get()));
restorable.ranges = std::move(results.first);
restorable.keyRanges = std::move(results.second);
// TODO: Reenable the sanity check after TooManyFiles error is resolved
if (false && g_network->isSimulated()) {
// Sanity check key ranges
state std::map<std::string, KeyRange>::iterator rit;
for (rit = restorable.keyRanges.begin(); rit != restorable.keyRanges.end(); rit++) {
auto it = std::find_if(restorable.ranges.begin(), restorable.ranges.end(),
[file = rit->first](const RangeFile f) { return f.fileName == file; });
ASSERT(it != restorable.ranges.end());
KeyRange result = wait(bc->getSnapshotFileKeyRange(*it));
ASSERT(rit->second.begin <= result.begin && rit->second.end >= result.end);
}
}
// No logs needed if there is a complete key space snapshot at the target version.
if (snapshot.get().beginVersion == snapshot.get().endVersion &&
snapshot.get().endVersion == targetVersion) {
restorable.continuousBeginVersion = restorable.continuousEndVersion = invalidVersion;
return Optional<RestorableFileSet>(restorable);
}
@ -1335,6 +1414,8 @@ public:
// sort by version order again for continuous analysis
std::sort(restorable.logs.begin(), restorable.logs.end());
if (isPartitionedLogsContinuous(restorable.logs, snapshot.get().beginVersion, targetVersion)) {
restorable.continuousBeginVersion = snapshot.get().beginVersion;
restorable.continuousEndVersion = targetVersion + 1; // not inclusive
return Optional<RestorableFileSet>(restorable);
}
return Optional<RestorableFileSet>();
@ -1348,6 +1429,8 @@ public:
Version end = logs.begin()->endVersion;
computeRestoreEndVersion(logs, &restorable.logs, &end, targetVersion);
if (end >= targetVersion) {
restorable.continuousBeginVersion = logs.begin()->beginVersion;
restorable.continuousEndVersion = end;
return Optional<RestorableFileSet>(restorable);
}
}
@ -1694,7 +1777,6 @@ public:
virtual ~BackupContainerBlobStore() {}
Future<Reference<IAsyncFile>> readFile(std::string path) final {
ASSERT(m_bstore->knobs.read_ahead_blocks > 0);
return Reference<IAsyncFile>(
new AsyncFileReadAheadCache(
Reference<IAsyncFile>(new AsyncFileBlobStoreRead(m_bstore, m_bucket, dataPath(path))),
@ -2015,6 +2097,8 @@ ACTOR Future<Optional<int64_t>> timeKeeperEpochsFromVersion(Version v, Reference
return found.first + (v - found.second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND;
}
namespace backup_test {
int chooseFileSize(std::vector<int> &sizes) {
int size = 1000;
if(!sizes.empty()) {
@ -2052,7 +2136,30 @@ Version nextVersion(Version v) {
return v + increment;
}
ACTOR Future<Void> testBackupContainer(std::string url) {
// Write a snapshot file with only begin & end key
ACTOR static Future<Void> testWriteSnapshotFile(Reference<IBackupFile> file, Key begin, Key end, uint32_t blockSize) {
ASSERT(blockSize > 3 * sizeof(uint32_t) + begin.size() + end.size());
uint32_t fileVersion = BACKUP_AGENT_SNAPSHOT_FILE_VERSION;
// write Header
wait(file->append((uint8_t*)&fileVersion, sizeof(fileVersion)));
// write begin key length and key
wait(file->appendStringRefWithLen(begin));
// write end key length and key
wait(file->appendStringRefWithLen(end));
int bytesLeft = blockSize - file->size();
if (bytesLeft > 0) {
Value paddings = fileBackup::makePadding(bytesLeft);
wait(file->append(paddings.begin(), bytesLeft));
}
wait(file->finish());
return Void();
}
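
The test block written above is the smallest valid snapshot block: a 32-bit file version (`BACKUP_AGENT_SNAPSHOT_FILE_VERSION`, 1001), the begin key and end key each written by `appendStringRefWithLen` (which, as the block-size assertion implies, is a 32-bit length followed by the raw bytes), then `\xff` padding out to the block boundary. A stand-alone sketch that decodes just that header (illustrative only; the real reader is `fileBackup::decodeRangeFileBlock`):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Read a POD of type T from `buf` at `off` (host byte order, matching how the
// writer appended it), advancing `off`.
template <class T>
static T consume(const std::vector<uint8_t>& buf, size_t& off) {
    if (off + sizeof(T) > buf.size()) throw std::runtime_error("short block");
    T v;
    std::memcpy(&v, buf.data() + off, sizeof(T));
    off += sizeof(T);
    return v;
}

// Read `len` raw bytes as a string, advancing `off`.
static std::string consumeBytes(const std::vector<uint8_t>& buf, size_t& off, uint32_t len) {
    if (off + len > buf.size()) throw std::runtime_error("short block");
    std::string s(reinterpret_cast<const char*>(buf.data() + off), len);
    off += len;
    return s;
}

// Decode the header laid out by testWriteSnapshotFile:
// [uint32 version = 1001][uint32 beginLen][begin][uint32 endLen][end][0xff padding ...]
static std::pair<std::string, std::string> decodeSnapshotBlockHeader(const std::vector<uint8_t>& block) {
    size_t off = 0;
    if (consume<uint32_t>(block, off) != 1001) throw std::runtime_error("unsupported snapshot file version");
    uint32_t beginLen = consume<uint32_t>(block, off);
    std::string begin = consumeBytes(block, off, beginLen);
    uint32_t endLen = consume<uint32_t>(block, off);
    std::string end = consumeBytes(block, off, endLen);
    return { begin, end };
}
```
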
ACTOR static Future<Void> testBackupContainer(std::string url) {
printf("BackupContainerTest URL %s\n", url.c_str());
state Reference<IBackupContainer> c = IBackupContainer::openContainer(url);
@ -2070,6 +2177,7 @@ ACTOR Future<Void> testBackupContainer(std::string url) {
state std::vector<Future<Void>> writes;
state std::map<Version, std::vector<std::string>> snapshots;
state std::map<Version, int64_t> snapshotSizes;
state std::map<Version, std::vector<std::pair<Key, Key>>> snapshotBeginEndKeys;
state int nRangeFiles = 0;
state std::map<Version, std::string> logs;
state Version v = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max() / 2);
@ -2080,27 +2188,36 @@ ACTOR Future<Void> testBackupContainer(std::string url) {
loop {
state Version logStart = v;
state int kvfiles = deterministicRandom()->randomInt(0, 3);
state Key begin = LiteralStringRef("");
state Key end = LiteralStringRef("");
state int blockSize = 3 * sizeof(uint32_t) + begin.size() + end.size() + 8;
while(kvfiles > 0) {
if(snapshots.empty()) {
snapshots[v] = {};
snapshotBeginEndKeys[v] = {};
snapshotSizes[v] = 0;
if(deterministicRandom()->coinflip()) {
v = nextVersion(v);
}
}
Reference<IBackupFile> range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, 10));
Reference<IBackupFile> range = wait(c->writeRangeFile(snapshots.rbegin()->first, 0, v, blockSize));
++nRangeFiles;
v = nextVersion(v);
snapshots.rbegin()->second.push_back(range->getFileName());
snapshotBeginEndKeys.rbegin()->second.emplace_back(begin, end);
int size = chooseFileSize(fileSizes);
snapshotSizes.rbegin()->second += size;
writes.push_back(writeAndVerifyFile(c, range, size));
// Write in actual range file format, instead of random data.
// writes.push_back(writeAndVerifyFile(c, range, size));
wait(testWriteSnapshotFile(range, begin, end, blockSize));
if(deterministicRandom()->random01() < .2) {
writes.push_back(c->writeKeyspaceSnapshotFile(snapshots.rbegin()->second, snapshotSizes.rbegin()->second));
writes.push_back(c->writeKeyspaceSnapshotFile(
snapshots.rbegin()->second, snapshotBeginEndKeys.rbegin()->second, snapshotSizes.rbegin()->second));
snapshots[v] = {};
snapshotBeginEndKeys[v] = {};
snapshotSizes[v] = 0;
break;
}
@ -2290,4 +2407,6 @@ TEST_CASE("/backup/continuous") {
ASSERT(BackupContainerFileSystem::getPartitionedLogsContinuousEndVersion(files, 250) == 399);
return Void();
}
}
} // namespace backup_test

View File

@ -68,6 +68,9 @@ static const uint32_t BACKUP_AGENT_MLOG_VERSION = 2001;
// Mutation log version written by BackupWorker
static const uint32_t PARTITIONED_MLOG_VERSION = 4110;
// Snapshot file version written by FileBackupAgent
static const uint32_t BACKUP_AGENT_SNAPSHOT_FILE_VERSION = 1001;
struct LogFile {
Version beginVersion;
Version endVersion;
@ -193,6 +196,14 @@ struct RestorableFileSet {
Version targetVersion;
std::vector<LogFile> logs;
std::vector<RangeFile> ranges;
// Range file's key ranges. Can be empty for backups generated before 6.3.
std::map<std::string, KeyRange> keyRanges;
// Mutation logs continuous range [begin, end). Both can be invalidVersion
// when the entire key space snapshot is at the target version.
Version continuousBeginVersion, continuousEndVersion;
KeyspaceSnapshotFile snapshot; // Info. for debug purposes
};
@ -231,11 +242,17 @@ public:
// Write a KeyspaceSnapshotFile of range file names representing a full non overlapping
// snapshot of the key ranges this backup is targeting.
virtual Future<Void> writeKeyspaceSnapshotFile(std::vector<std::string> fileNames, int64_t totalBytes) = 0;
virtual Future<Void> writeKeyspaceSnapshotFile(const std::vector<std::string>& fileNames,
const std::vector<std::pair<Key, Key>>& beginEndKeys,
int64_t totalBytes) = 0;
// Open a file for read by name
virtual Future<Reference<IAsyncFile>> readFile(std::string name) = 0;
// Returns the key ranges in the snapshot file. This is an expensive function
// and should only be used in simulation for sanity checks.
virtual Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file) = 0;
struct ExpireProgress {
std::string step;
int total;

View File

@ -44,6 +44,8 @@ set(FDBCLIENT_SRCS
NativeAPI.actor.cpp
NativeAPI.actor.h
Notified.h
SpecialKeySpace.actor.cpp
SpecialKeySpace.actor.h
ReadYourWrites.actor.cpp
ReadYourWrites.h
RestoreWorkerInterface.actor.h

View File

@ -98,19 +98,26 @@ struct MutationRef {
}
std::string toString() const {
if (type < MutationRef::MAX_ATOMIC_OP) {
return format("code: %s param1: %s param2: %s", typeString[type], printable(param1).c_str(), printable(param2).c_str());
}
else {
return format("code: Invalid param1: %s param2: %s", printable(param1).c_str(), printable(param2).c_str());
}
return format("code: %s param1: %s param2: %s",
type < MutationRef::MAX_ATOMIC_OP ? typeString[(int)type] : "Unset", printable(param1).c_str(),
printable(param2).c_str());
}
bool isAtomicOp() const { return (ATOMIC_MASK & (1 << type)) != 0; }
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, type, param1, param2);
if (!ar.isDeserializing && type == ClearRange && equalsKeyAfter(param1, param2)) {
StringRef empty;
serializer(ar, type, param2, empty);
} else {
serializer(ar, type, param1, param2);
}
if (ar.isDeserializing && type == ClearRange && param2 == StringRef() && param1 != StringRef()) {
ASSERT(param1[param1.size()-1] == '\x00');
param2 = param1;
param1 = param2.substr(0, param2.size()-1);
}
}
// These masks define which mutation types have particular properties (they are used to implement isSingleKeyMutation() etc)
@ -129,6 +136,10 @@ static inline std::string getTypeString(MutationRef::Type type) {
return type < MutationRef::MAX_ATOMIC_OP ? typeString[(int)type] : "Unset";
}
static inline std::string getTypeString(uint8_t type) {
return type < MutationRef::MAX_ATOMIC_OP ? typeString[type] : "Unset";
}
// A 'single key mutation' is one which affects exactly the value of the key specified by its param1
static inline bool isSingleKeyMutation(MutationRef::Type type) {
return (MutationRef::SINGLE_KEY_MASK & (1<<type)) != 0;
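
The `serialize()` change above (and the matching change to `KeyRangeRef::serialize` later in this diff) compresses the very common single-key clear, where `param2` is exactly `param1` with a trailing `\x00` byte appended (`equalsKeyAfter`): the writer sends `(param2, empty)` and the reader rebuilds `param1` by dropping that trailing byte. A stand-alone sketch of the same round trip using `std::string` (illustrative only, not the FDB serializer):

```cpp
#include <cassert>
#include <string>
#include <utility>

// keyAfter(k) in FDB terms: the smallest key strictly greater than k.
static std::string keyAfter(const std::string& k) { return k + '\x00'; }

// On serialization: if the range is exactly [k, keyAfter(k)) we can drop the
// begin key and send an empty end instead.
static std::pair<std::string, std::string> compressClear(std::string begin, std::string end) {
    if (end == keyAfter(begin)) return { std::move(end), std::string() };
    return { std::move(begin), std::move(end) };
}

// On deserialization: an empty end with a non-empty begin marks the compressed
// form; the real begin is the sent key minus its trailing '\x00'.
static std::pair<std::string, std::string> expandClear(std::string begin, std::string end) {
    if (end.empty() && !begin.empty()) {
        assert(begin.back() == '\x00');
        end = begin;
        begin = end.substr(0, end.size() - 1);
    }
    return { begin, end };
}

int main() {
    auto wire = compressClear("apple", keyAfter("apple"));
    auto back = expandClear(wire.first, wire.second);
    assert(back.first == "apple" && back.second == keyAfter("apple"));
    return 0;
}
```

The payoff is that a single-key clear travels as one key plus an empty field instead of two nearly identical keys.
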

View File

@ -1490,6 +1490,12 @@ namespace dbBackup {
Version bVersion = wait(srcTr->getReadVersion());
beginVersionKey = BinaryWriter::toValue(bVersion, Unversioned());
state Key versionKey = logUidValue.withPrefix(destUidValue).withPrefix(backupLatestVersionsPrefix);
Optional<Key> versionRecord = wait( srcTr->get(versionKey) );
if(!versionRecord.present()) {
srcTr->set(versionKey, beginVersionKey);
}
task->params[BackupAgentBase::destUid] = destUidValue;
wait(srcTr->commit());
@ -1539,9 +1545,6 @@ namespace dbBackup {
if(v.present() && BinaryReader::fromStringRef<Version>(v.get(), Unversioned()) >= BinaryReader::fromStringRef<Version>(task->params[DatabaseBackupAgent::keyFolderId], Unversioned()))
return Void();
Key versionKey = logUidValue.withPrefix(destUidValue).withPrefix(backupLatestVersionsPrefix);
srcTr2->set(versionKey, beginVersionKey);
srcTr2->set( Subspace(databaseBackupPrefixRange.begin).get(BackupAgentBase::keySourceTagName).pack(task->params[BackupAgentBase::keyTagName]), logUidValue );
srcTr2->set( sourceStates.pack(DatabaseBackupAgent::keyFolderId), task->params[DatabaseBackupAgent::keyFolderId] );
srcTr2->set( sourceStates.pack(DatabaseBackupAgent::keyStateStatus), StringRef(BackupAgentBase::getStateText(BackupAgentBase::STATE_RUNNING)));

View File

@ -494,11 +494,16 @@ Optional<ValueRef> DatabaseConfiguration::get( KeyRef key ) const {
}
}
bool DatabaseConfiguration::isExcludedServer( NetworkAddress a ) const {
return get( encodeExcludedServersKey( AddressExclusion(a.ip, a.port) ) ).present() ||
get( encodeExcludedServersKey( AddressExclusion(a.ip) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.ip, a.port) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.ip) ) ).present();
bool DatabaseConfiguration::isExcludedServer( NetworkAddressList a ) const {
return get( encodeExcludedServersKey( AddressExclusion(a.address.ip, a.address.port) ) ).present() ||
get( encodeExcludedServersKey( AddressExclusion(a.address.ip) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.address.ip, a.address.port) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.address.ip) ) ).present() ||
( a.secondaryAddress.present() && (
get( encodeExcludedServersKey( AddressExclusion(a.secondaryAddress.get().ip, a.secondaryAddress.get().port) ) ).present() ||
get( encodeExcludedServersKey( AddressExclusion(a.secondaryAddress.get().ip) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.secondaryAddress.get().ip, a.secondaryAddress.get().port) ) ).present() ||
get( encodeFailedServersKey( AddressExclusion(a.secondaryAddress.get().ip) ) ).present() ) );
}
std::set<AddressExclusion> DatabaseConfiguration::getExcludedServers() const {
const_cast<DatabaseConfiguration*>(this)->makeConfigurationImmutable();

View File

@ -187,7 +187,7 @@ struct DatabaseConfiguration {
std::vector<RegionInfo> regions;
// Excluded servers (no state should be here)
bool isExcludedServer( NetworkAddress ) const;
bool isExcludedServer( NetworkAddressList ) const;
std::set<AddressExclusion> getExcludedServers() const;
int32_t getDesiredProxies() const { if(masterProxyCount == -1) return autoMasterProxyCount; return masterProxyCount; }

View File

@ -30,6 +30,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/MasterProxyInterface.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbrpc/QueueModel.h"
#include "fdbrpc/MultiInterface.h"
#include "flow/TDMetric.actor.h"
@ -66,7 +67,7 @@ struct LocationInfo : MultiInterface<ReferencedInterface<StorageServerInterface>
}
};
typedef MultiInterface<MasterProxyInterface> ProxyInfo;
typedef ModelInterface<MasterProxyInterface> ProxyInfo;
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
@ -253,6 +254,8 @@ public:
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaults;
Future<Void> cacheListMonitor;
AsyncTrigger updateCache;
std::shared_ptr<SpecialKeySpace> specialKeySpace;
std::shared_ptr<ConflictingKeysImpl> cKImpl;
};
#endif

View File

@ -282,8 +282,20 @@ struct KeyRangeRef {
template <class Ar>
force_inline void serialize(Ar& ar) {
serializer(ar, const_cast<KeyRef&>(begin), const_cast<KeyRef&>(end));
if (!ar.isDeserializing && equalsKeyAfter(begin, end)) {
StringRef empty;
serializer(ar, const_cast<KeyRef&>(end), empty);
} else {
serializer(ar, const_cast<KeyRef&>(begin), const_cast<KeyRef&>(end));
}
if (ar.isDeserializing && end == StringRef() && begin != StringRef()) {
ASSERT(begin[begin.size()-1] == '\x00');
const_cast<KeyRef&>(end) = begin;
const_cast<KeyRef&>(begin) = end.substr(0, end.size()-1);
}
if( begin > end ) {
TraceEvent("InvertedRange").detail("Begin", begin).detail("End", end);
throw inverted_range();
};
}
@ -416,7 +428,7 @@ typedef Standalone<KeyRangeRef> KeyRange;
typedef Standalone<KeyValueRef> KeyValue;
typedef Standalone<struct KeySelectorRef> KeySelector;
enum { invalidVersion = -1, latestVersion = -2 };
enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits<int64_t>::max() };
inline Key keyAfter( const KeyRef& key ) {
if(key == LiteralStringRef("\xff\xff"))

View File

@ -461,7 +461,8 @@ namespace fileBackup {
// then the space after the final key to the next 1MB boundary would
// just be padding anyway.
struct RangeFileWriter {
RangeFileWriter(Reference<IBackupFile> file = Reference<IBackupFile>(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(1001) {}
RangeFileWriter(Reference<IBackupFile> file = Reference<IBackupFile>(), int blockSize = 0)
: file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {}
// Handles the first block and internal blocks. Ends current block if needed.
// The final flag is used in simulation to pad the file's final block to a whole block size
@ -557,8 +558,8 @@ namespace fileBackup {
state StringRefReader reader(buf, restore_corrupted_data());
try {
// Read header, currently only decoding version 1001
if(reader.consume<int32_t>() != 1001)
// Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION
if(reader.consume<int32_t>() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION)
throw restore_unsupported_file_version();
// Read begin key, if this fails then block was invalid.
@ -2257,6 +2258,7 @@ namespace fileBackup {
}
std::vector<std::string> files;
std::vector<std::pair<Key, Key>> beginEndKeys;
state Version maxVer = 0;
state Version minVer = std::numeric_limits<Version>::max();
state int64_t totalBytes = 0;
@ -2272,6 +2274,9 @@ namespace fileBackup {
// Add file to final file list
files.push_back(r.fileName);
// Add (beginKey, endKey) pairs to the list
beginEndKeys.emplace_back(i->second.begin, i->first);
// Update version range seen
if(r.version < minVer)
minVer = r.version;
@ -2293,7 +2298,7 @@ namespace fileBackup {
}
Params.endVersion().set(task, maxVer);
wait(bc->writeKeyspaceSnapshotFile(files, totalBytes));
wait(bc->writeKeyspaceSnapshotFile(files, beginEndKeys, totalBytes));
TraceEvent(SevInfo, "FileBackupWroteSnapshotManifest")
.detail("BackupUID", config.getUid())
@ -2402,6 +2407,7 @@ namespace fileBackup {
state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled;
if (!backupWorkerEnabled) {
wait(success(changeConfig(cx, "backup_worker_enabled:=1", true)));
backupWorkerEnabled = true;
}
// Set the "backupStartedKey" and wait for all backup worker started
@ -3622,8 +3628,32 @@ public:
}
ACTOR static Future<Void> submitParallelRestore(Database cx, Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges, KeyRef bcUrl,
Standalone<VectorRef<KeyRangeRef>> backupRanges, Key bcUrl,
Version targetVersion, bool lockDB, UID randomUID) {
// Sanity check backup is valid
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(bcUrl.toString());
state BackupDescription desc = wait(bc->describeBackup());
wait(desc.resolveVersionTimes(cx));
if (targetVersion == invalidVersion && desc.maxRestorableVersion.present()) {
targetVersion = desc.maxRestorableVersion.get();
TraceEvent(SevWarn, "FastRestoreSubmitRestoreRequestWithInvalidTargetVersion")
.detail("OverrideTargetVersion", targetVersion);
}
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
.detail("BackupContainer", bc->getURL())
.detail("TargetVersion", targetVersion);
throw restore_invalid_version();
}
TraceEvent("FastRestoreSubmitRestoreRequest")
.detail("BackupDesc", desc.toString())
.detail("TargetVersion", targetVersion);
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state int restoreIndex = 0;
state int numTries = 0;
@ -4602,7 +4632,7 @@ Future<Void> FileBackupAgent::parallelRestoreFinish(Database cx, UID randomUID)
}
Future<Void> FileBackupAgent::submitParallelRestore(Database cx, Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges, KeyRef bcUrl,
Standalone<VectorRef<KeyRangeRef>> backupRanges, Key bcUrl,
Version targetVersion, bool lockDB, UID randomUID) {
return FileBackupAgentImpl::submitParallelRestore(cx, backupTag, backupRanges, bcUrl, targetVersion, lockDB,
randomUID);

View File

@ -193,7 +193,7 @@ struct JSONDoc {
return v.get_value<T>();
}
// Ensures that a an Object exists at path and returns a JSONDoc that writes to it.
// Ensures that an Object exists at path and returns a JSONDoc that writes to it.
JSONDoc subDoc(std::string path, bool split=true) {
json_spirit::mValue &v = create(path, split);
if(v.type() != json_spirit::obj_type)

View File

@ -1808,6 +1808,26 @@ ACTOR Future<Void> checkDatabaseLock( Reference<ReadYourWritesTransaction> tr, U
return Void();
}
ACTOR Future<Void> advanceVersion(Database cx, Version v) {
state Transaction tr(cx);
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
Version rv = wait(tr.getReadVersion());
if (rv <= v) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned()));
wait(tr.commit());
} else {
printf("Current read version is %ld\n", rv);
return Void();
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> forceRecovery( Reference<ClusterConnectionFile> clusterFile, Key dcId ) {
state Reference<AsyncVar<Optional<ClusterInterface>>> clusterInterface(new AsyncVar<Optional<ClusterInterface>>);
state Future<Void> leaderMon = monitorLeader<ClusterInterface>(clusterFile, clusterInterface);
@ -1851,7 +1871,6 @@ ACTOR Future<Void> changeCachedRange(Database cx, KeyRangeRef range, bool add) {
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
loop {
//TraceEvent(SevDebug, "ChangeCachedRangeInLoop").detail("BeginKey", range.begin.toString()).detail("EndKey", range.end.toString());
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
@ -1870,12 +1889,10 @@ ACTOR Future<Void> changeCachedRange(Database cx, KeyRangeRef range, bool add) {
// we need to uncache from here
tr.set(sysRange.begin, falseValue);
tr.set(privateRange.begin, serverKeysFalse);
//TraceEvent(SevDebug, "ChangeCachedRangeSetBegin1").detail("BeginKey", sysRange.begin.toString());
} else if (!prevIsCached && add) {
// we need to cache, starting from here
tr.set(sysRange.begin, trueValue);
tr.set(privateRange.begin, serverKeysTrue);
//TraceEvent(SevDebug, "ChangeCachedRangeSetBegin2").detail("BeginKey", sysRange.begin.toString());
}
Standalone<RangeResultRef> after =
wait(tr.getRange(KeyRangeRef(sysRange.end, storageCacheKeys.end), 1, false));
@ -1888,14 +1905,11 @@ ACTOR Future<Void> changeCachedRange(Database cx, KeyRangeRef range, bool add) {
if (afterIsCached && !add) {
tr.set(sysRange.end, trueValue);
tr.set(privateRange.end, serverKeysTrue);
//TraceEvent(SevDebug, "ChangeCachedRangeSetEnd1").detail("EndKey", sysRange.end.toString());
} else if (!afterIsCached && add) {
tr.set(sysRange.end, falseValue);
tr.set(privateRange.end, serverKeysFalse);
//TraceEvent(SevDebug, "ChangeCachedRangeSetEnd2").detail("EndKey", sysRange.end.toString());
}
wait(tr.commit());
//TraceEvent(SevDebug, "ChangeCachedRangeReturn");
return Void();
} catch (Error& e) {
state Error err = e;

View File

@ -178,6 +178,8 @@ ACTOR Future<Void> unlockDatabase( Database cx, UID id );
ACTOR Future<Void> checkDatabaseLock( Transaction* tr, UID id );
ACTOR Future<Void> checkDatabaseLock( Reference<ReadYourWritesTransaction> tr, UID id );
ACTOR Future<Void> advanceVersion(Database cx, Version v);
ACTOR Future<int> setDDMode( Database cx, int mode );
ACTOR Future<Void> forceRecovery( Reference<ClusterConnectionFile> clusterFile, Standalone<StringRef> dcId );

View File

@ -156,7 +156,7 @@ static inline int getBytes( CommitTransactionRequest const& r ) {
return total;
}
struct GetReadVersionReply {
struct GetReadVersionReply : public BasicLoadBalancedReply {
constexpr static FileIdentifier file_identifier = 15709388;
Version version;
bool locked;
@ -166,7 +166,7 @@ struct GetReadVersionReply {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, locked, metadataVersion);
serializer(ar, BasicLoadBalancedReply::recentRequests, version, locked, metadataVersion);
}
};
@ -278,11 +278,12 @@ struct TxnStateRequest {
VectorRef<KeyValueRef> data;
Sequence sequence;
bool last;
std::vector<Endpoint> broadcastInfo;
ReplyPromise<Void> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, data, sequence, last, reply, arena);
serializer(ar, data, sequence, last, broadcastInfo, reply, arena);
}
};

View File

@ -43,6 +43,7 @@
#include "fdbclient/MonitorLeader.h"
#include "fdbclient/MutationList.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbrpc/LoadBalance.h"
@ -102,7 +103,6 @@ Future<REPLY_TYPE(Request)> loadBalance(
return runAfter(loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model),
[ctx](auto res) {
if (res.cached) {
//TraceEvent(SevDebug, "NativeCientReqCached");
ctx->updateCache.trigger();
}
return res;
@ -117,16 +117,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT);
NetworkOptions::NetworkOptions()
: localAddress(""), clusterFile(""), traceDirectory(Optional<std::string>()),
traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false) {
Standalone<VectorRef<ClientVersionRef>> defaultSupportedVersions;
StringRef sourceVersion = StringRef((const uint8_t*)getSourceVersion(), strlen(getSourceVersion()));
std::string protocolVersionString = format("%llx", currentProtocolVersion.version());
defaultSupportedVersions.push_back_deep(defaultSupportedVersions.arena(), ClientVersionRef(LiteralStringRef(FDB_VT_VERSION), sourceVersion, protocolVersionString));
supportedVersions = ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>::from(defaultSupportedVersions);
}
traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false), supportedVersions(new ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>()) {}
static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/");
static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/");
@ -593,18 +584,15 @@ ACTOR Future<Void> updateCachedRanges(DatabaseContext* self, std::map<UID, Stora
auto iter = self->locationCache.rangeContaining(begin);
if (iter->value() && !iter->value()->hasCaches) {
if (end>=iter->range().end) {
TraceEvent(SevDebug, "UpdateCachedRangeInsert1").detail("Begin",begin).detail("End",iter->range().end);
self->locationCache.insert(KeyRangeRef{ begin, iter->range().end },
addCaches(iter->value(), cacheInterfaces));
} else {
TraceEvent(SevDebug, "UpdateCachedRangeInsert2").detail("Begin",begin).detail("End",end);
self->locationCache.insert(KeyRangeRef{ begin, end },
addCaches(iter->value(), cacheInterfaces));
}
}
iter = self->locationCache.rangeContainingKeyBefore(end);
if (iter->value() && !iter->value()->hasCaches) {
TraceEvent(SevDebug, "UpdateCachedRangeInsert2").detail("Begin",iter->range().begin).detail("End", end);
self->locationCache.insert(KeyRangeRef{iter->range().begin, end}, addCaches(iter->value(), cacheInterfaces));
}
}
@ -654,8 +642,6 @@ ACTOR Future<Void> monitorCacheList(DatabaseContext* self) {
std::insert_iterator<std::map<UID, StorageServerInterface>>(
deletedCacheServers, deletedCacheServers.begin()));
hasChanges = !(newCacheServers.empty() && deletedCacheServers.empty());
//TraceEvent(SevDebug, "MonitorCacheList").detail("AllCacheServers",allCacheServers.size())
//.detail("NewCacheServers",newCacheServers.size()).detail("OldCacheServers",cacheServerMap.size());
if (hasChanges) {
updateLocationCacheWithCaches(self, deletedCacheServers, newCacheServers);
}
@ -688,7 +674,7 @@ ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext *cx, bo
choose {
when(wait(cx->onMasterProxiesChanged())) {}
when(GetHealthMetricsReply rep =
wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::getHealthMetrics,
wait(basicLoadBalance(cx->getMasterProxies(false), &MasterProxyInterface::getHealthMetrics,
GetHealthMetricsRequest(sendDetailedRequest)))) {
cx->healthMetrics.update(rep.healthMetrics, detailed, true);
if (detailed) {
@ -710,25 +696,42 @@ ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext *cx, bo
Future<HealthMetrics> DatabaseContext::getHealthMetrics(bool detailed = false) {
return getHealthMetricsActor(this, detailed);
}
DatabaseContext::DatabaseContext(
Reference<AsyncVar<Reference<ClusterConnectionFile>>> connectionFile, Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor,
TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, bool internal, int apiVersion, bool switchable )
: connectionFile(connectionFile),clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance),
lockAware(lockAware), apiVersion(apiVersion), switchable(switchable), provisional(false), cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc),
transactionReadVersionsCompleted("ReadVersionsCompleted", cc), transactionReadVersionBatches("ReadVersionBatches", cc), transactionBatchReadVersions("BatchPriorityReadVersions", cc),
transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc),
transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc),
transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc),
transactionGetRangeRequests("GetRangeRequests", cc), transactionWatchRequests("WatchRequests", cc), transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc),
transactionBytesRead("BytesRead", cc), transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), transactionCommittedMutations("CommittedMutations", cc),
transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), transactionClearMutations("ClearMutations", cc),
transactionAtomicMutations("AtomicMutations", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc),
transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), transactionsTooOld("TooOld", cc),
transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc),
transactionsResourceConstrained("ResourceConstrained", cc), transactionsThrottled("Throttled", cc), transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal)
{
DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionFile>>> connectionFile,
Reference<AsyncVar<ClientDBInfo>> clientInfo, Future<Void> clientInfoMonitor,
TaskPriority taskID, LocalityData const& clientLocality,
bool enableLocalityLoadBalance, bool lockAware, bool internal, int apiVersion,
bool switchable)
: connectionFile(connectionFile), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), taskID(taskID),
clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), lockAware(lockAware),
apiVersion(apiVersion), switchable(switchable), provisional(false), cc("TransactionMetrics"),
transactionReadVersions("ReadVersions", cc), transactionReadVersionsCompleted("ReadVersionsCompleted", cc),
transactionReadVersionBatches("ReadVersionBatches", cc),
transactionBatchReadVersions("BatchPriorityReadVersions", cc),
transactionDefaultReadVersions("DefaultPriorityReadVersions", cc),
transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc),
transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc),
transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc),
transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc),
transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc),
transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc),
transactionGetRangeRequests("GetRangeRequests", cc), transactionWatchRequests("WatchRequests", cc),
transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc),
transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc),
transactionCommittedMutations("CommittedMutations", cc),
transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc),
transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc),
transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc),
transactionKeyServerLocationRequests("KeyServerLocationRequests", cc),
transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc),
transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc),
transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc),
transactionsResourceConstrained("ResourceConstrained", cc), transactionsThrottled("Throttled", cc),
transactionsProcessBehind("ProcessBehind", cc), outstandingWatches(0), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal),
specialKeySpace(std::make_shared<SpecialKeySpace>(normalKeys.begin, specialKeys.end)),
cKImpl(std::make_shared<ConflictingKeysImpl>(conflictingKeysRange)) {
dbId = deterministicRandom()->randomUniqueID();
connected = clientInfo->get().proxies.size() ? Void() : clientInfo->onChange();
@ -748,6 +751,7 @@ DatabaseContext::DatabaseContext(
monitorMasterProxiesInfoChange = monitorMasterProxiesChange(clientInfo, &masterProxiesChangeTrigger);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
specialKeySpace->registerKeyRange(conflictingKeysRange, cKImpl.get());
}
DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc),
@ -893,7 +897,7 @@ void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional<Str
case FDBDatabaseOptions::MACHINE_ID:
clientLocality = LocalityData( clientLocality.processId(), value.present() ? Standalone<StringRef>(value.get()) : Optional<Standalone<StringRef>>(), clientLocality.machineId(), clientLocality.dcId() );
if( clientInfo->get().proxies.size() )
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ) );
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies ) );
server_interf.clear();
locationCache.insert( allKeys, Reference<LocationInfo>() );
break;
@ -903,7 +907,7 @@ void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional<Str
case FDBDatabaseOptions::DATACENTER_ID:
clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? Standalone<StringRef>(value.get()) : Optional<Standalone<StringRef>>());
if( clientInfo->get().proxies.size() )
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ));
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies ));
server_interf.clear();
locationCache.insert( allKeys, Reference<LocationInfo>() );
break;
@ -1236,7 +1240,10 @@ void setupNetwork(uint64_t transportId, bool useMetrics) {
if (!networkOptions.logClientInfo.present())
networkOptions.logClientInfo = true;
TLS::DisableOpenSSLAtExitHandler();
g_network = newNet2(tlsConfig, false, useMetrics || networkOptions.traceDirectory.present());
g_network->addStopCallback( Net2FileSystem::stop );
g_network->addStopCallback( TLS::DestroyOpenSSLGlobalState );
FlowTransport::createInstance(true, transportId);
Net2FileSystem::newFileSystem();
}
@ -1268,7 +1275,7 @@ Reference<ProxyInfo> DatabaseContext::getMasterProxies(bool useProvisionalProxie
masterProxiesLastChange = clientInfo->get().id;
masterProxies.clear();
if( clientInfo->get().proxies.size() ) {
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies, clientLocality ));
masterProxies = Reference<ProxyInfo>( new ProxyInfo( clientInfo->get().proxies ));
provisional = clientInfo->get().proxies[0].provisional;
}
}
@ -1414,7 +1421,7 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
++cx->transactionKeyServerLocationRequests;
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional<KeyRef>(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
when ( GetKeyServerLocationsReply rep = wait( basicLoadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional<KeyRef>(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
++cx->transactionKeyServerLocationRequestsCompleted;
if( info.debugID.present() )
g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After");
@ -1457,7 +1464,7 @@ ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLoca
++cx->transactionKeyServerLocationRequests;
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
when ( GetKeyServerLocationsReply _rep = wait( basicLoadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
++cx->transactionKeyServerLocationRequestsCompleted;
state GetKeyServerLocationsReply rep = _rep;
if( info.debugID.present() )
@ -1701,7 +1708,7 @@ ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version ) {
loop {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE ), cx->taskID ) ) ) {
when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE ), cx->taskID ) ) ) {
cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);
if (v.version >= version)
@ -1721,7 +1728,7 @@ ACTOR Future<Version> getRawVersion( Database cx ) {
loop {
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE ), cx->taskID ) ) ) {
when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE ), cx->taskID ) ) ) {
return v.version;
}
}
@ -1831,7 +1838,6 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
req.begin = firstGreaterOrEqual( range.begin );
req.end = firstGreaterOrEqual( range.end );
TraceEvent(SevDebug, "GetExactRange").detail("Begin", req.begin.getKey()).detail("End", req.end.getKey());
transformRangeLimits(limits, reverse, req);
ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@ -2952,7 +2958,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
reply = proxies.size() ? throwErrorOr ( brokenPromiseToMaybeDelivered ( proxies[0].commit.tryGetReply(req) ) ) : Never();
}
} else {
reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, true );
reply = basicLoadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, true );
}
choose {
@ -2994,42 +3000,21 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
return Void();
} else {
// clear the RYW transaction which contains previous conflicting keys
tr->info.conflictingKeysRYW.reset();
tr->info.conflictingKeys.reset();
if (ci.conflictingKRIndices.present()) {
// In general, if we want to use getRange to expose conflicting keys,
// we need to support all the parameters getRange provides.
// It is difficult to take care of all corner cases of what getRange does.
// Consequently, we use a hack way here to achieve it.
// We create an empty RYWTransaction and write all conflicting key/values to it.
// Since it is RYWTr, we can call getRange on it with same parameters given to the original
// getRange.
tr->info.conflictingKeysRYW = std::make_shared<ReadYourWritesTransaction>(tr->getDatabase());
state Reference<ReadYourWritesTransaction> hackTr =
Reference<ReadYourWritesTransaction>(tr->info.conflictingKeysRYW.get());
try {
state Standalone<VectorRef<int>> conflictingKRIndices = ci.conflictingKRIndices.get();
// To make the getRange call local, we need to explicitly set the read version here.
// This version number 100 set here does nothing but prevent getting read version from the
// proxy
tr->info.conflictingKeysRYW->setVersion(100);
// Clear the whole key space, thus, RYWTr knows to only read keys locally
tr->info.conflictingKeysRYW->clear(normalKeys);
// initialize value
tr->info.conflictingKeysRYW->set(conflictingKeysPrefix, conflictingKeysFalse);
// drop duplicate indices and merge overlapped ranges
// Note: addReadConflictRange in native transaction object does not merge overlapped ranges
state std::unordered_set<int> mergedIds(conflictingKRIndices.begin(),
conflictingKRIndices.end());
for (auto const& rCRIndex : mergedIds) {
const KeyRange kr = req.transaction.read_conflict_ranges[rCRIndex];
wait(krmSetRangeCoalescing(hackTr, conflictingKeysPrefix, kr, allKeys,
conflictingKeysTrue));
}
} catch (Error& e) {
hackTr.extractPtr(); // Make sure the RYW is not freed twice in case exception thrown
throw;
tr->info.conflictingKeys =
std::make_shared<CoalescedKeyRangeMap<Value>>(conflictingKeysFalse, specialKeys.end);
state Standalone<VectorRef<int>> conflictingKRIndices = ci.conflictingKRIndices.get();
// drop duplicate indices and merge overlapped ranges
// Note: addReadConflictRange in native transaction object does not merge overlapped ranges
state std::unordered_set<int> mergedIds(conflictingKRIndices.begin(),
conflictingKRIndices.end());
for (auto const& rCRIndex : mergedIds) {
const KeyRangeRef kr = req.transaction.read_conflict_ranges[rCRIndex];
const KeyRange krWithPrefix = KeyRangeRef(kr.begin.withPrefix(conflictingKeysRange.begin),
kr.end.withPrefix(conflictingKeysRange.begin));
tr->info.conflictingKeys->insert(krWithPrefix, conflictingKeysTrue);
}
hackTr.extractPtr(); // Avoid the Reference to destroy the RYW object
}
if (info.debugID.present())
@ -3373,7 +3358,7 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion( DatabaseContext *cx,
state GetReadVersionRequest req( transactionCount, flags, debugID );
choose {
when ( wait( cx->onMasterProxiesChanged() ) ) {}
when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) {
when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) {
if( debugID.present() )
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After");
ASSERT( v.version > 0 );
@ -3834,7 +3819,7 @@ ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID sn
loop {
choose {
when(wait(cx->onMasterProxiesChanged())) {}
when(wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::proxySnapReq, ProxySnapRequest(snapCmd, snapUID, snapUID), cx->taskID, true /*atmostOnce*/ ))) {
when(wait(basicLoadBalance(cx->getMasterProxies(false), &MasterProxyInterface::proxySnapReq, ProxySnapRequest(snapCmd, snapUID, snapUID), cx->taskID, true /*atmostOnce*/ ))) {
TraceEvent("SnapCreateExit")
.detail("SnapCmd", snapCmd.toString())
.detail("UID", snapUID);
@ -3862,7 +3847,7 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exc
choose {
when(wait(cx->onMasterProxiesChanged())) {}
when(ExclusionSafetyCheckReply _ddCheck =
wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::exclusionSafetyCheckReq,
wait(basicLoadBalance(cx->getMasterProxies(false), &MasterProxyInterface::exclusionSafetyCheckReq,
req, cx->taskID))) {
ddCheck = _ddCheck.safe;
break;

View File

@ -33,6 +33,7 @@
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/ClientLogEvents.h"
#include "fdbclient/KeyRangeMap.h"
#include "flow/actorcompiler.h" // has to be last include
// CLIENT_BUGGIFY should be used to randomly introduce failures at run time (like BUGGIFY but for client side testing)
@ -143,8 +144,9 @@ struct TransactionInfo {
TaskPriority taskID;
bool useProvisionalProxies;
// Used to save conflicting keys if FDBTransactionOptions::REPORT_CONFLICTING_KEYS is enabled
// shared_ptr used here since TransactionInfo is sometimes copied as function parameters.
std::shared_ptr<ReadYourWritesTransaction> conflictingKeysRYW;
// prefix/<key1> : '1' - any keys equal or larger than this key are (probably) conflicting keys
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
explicit TransactionInfo( TaskPriority taskID ) : taskID(taskID), useProvisionalProxies(false) {}
};

View File

@ -21,6 +21,7 @@
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StatusClient.h"
#include "fdbclient/MonitorLeader.h"
#include "flow/Util.h"
@ -1018,7 +1019,12 @@ public:
return Void();
}
watchFuture = ryw->tr.watch(watch); // throws if there are too many outstanding watches
try {
watchFuture = ryw->tr.watch(watch); // throws if there are too many outstanding watches
} catch( Error &e ) {
done.send(Void());
throw;
}
done.send(Void());
wait(watchFuture);
@ -1129,7 +1135,6 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx)
options(tr), deferredError(cx->deferredError) {
std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(),
std::back_inserter(persistentOptions));
//debugTransaction( deterministicRandom()->randomUniqueID() );
applyPersistentOptions();
}
@ -1229,6 +1234,10 @@ Future< Optional<Value> > ReadYourWritesTransaction::get( const Key& key, bool s
return Optional<Value>();
}
// the special key space can only be queried when the key lies in [\xff\xff, \xff\xff\xff)
if (specialKeys.contains(key))
return getDatabase()->specialKeySpace->get(Reference<ReadYourWritesTransaction>::addRef(this), key);
if(checkUsedDuringCommit()) {
return used_during_commit();
}
@ -1280,41 +1289,10 @@ Future< Standalone<RangeResultRef> > ReadYourWritesTransaction::getRange(
}
}
// Use special key prefix "\xff\xff/transaction/conflicting_keys/<some_key>",
// to retrieve keys which caused latest not_committed(conflicting with another transaction) error.
// The returned key value pairs are interpretted as :
// prefix/<key1> : '1' - any keys equal or larger than this key are (probably) conflicting keys
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
// Currently, the conflicting keyranges returned are original read_conflict_ranges or union of them.
// TODO : This interface needs to be integrated into the framework that handles special keys' calls in the future
if (begin.getKey().startsWith(conflictingKeysAbsolutePrefix) &&
end.getKey().startsWith(conflictingKeysAbsolutePrefix)) {
// Remove the special key prefix "\xff\xff"
KeyRef beginConflictingKey = begin.getKey().removePrefix(specialKeys.begin);
KeyRef endConflictingKey = end.getKey().removePrefix(specialKeys.begin);
// Check if the conflicting key range to be read is valid
KeyRef maxKey = getMaxReadKey();
if (beginConflictingKey > maxKey || endConflictingKey > maxKey) return key_outside_legal_range();
begin.setKey(beginConflictingKey);
end.setKey(endConflictingKey);
if (tr.info.conflictingKeysRYW) {
Future<Standalone<RangeResultRef>> resultWithoutPrefixFuture =
tr.info.conflictingKeysRYW->getRange(begin, end, limits, snapshot, reverse);
// Make sure it happens locally
ASSERT(resultWithoutPrefixFuture.isReady());
Standalone<RangeResultRef> resultWithoutPrefix = resultWithoutPrefixFuture.get();
// Add prefix to results, making keys consistent with the getRange query
Standalone<RangeResultRef> resultWithPrefix;
resultWithPrefix.reserve(resultWithPrefix.arena(), resultWithoutPrefix.size());
for (auto const& kv : resultWithoutPrefix) {
KeyValueRef kvWithPrefix(kv.key.withPrefix(specialKeys.begin, resultWithPrefix.arena()), kv.value);
resultWithPrefix.push_back(resultWithPrefix.arena(), kvWithPrefix);
}
return resultWithPrefix;
} else {
return Standalone<RangeResultRef>();
}
}
// the special key space can only be queried when both begin and end lie in [\xff\xff, \xff\xff\xff)
if (specialKeys.contains(begin.getKey()) && specialKeys.contains(end.getKey()))
return getDatabase()->specialKeySpace->getRange(Reference<ReadYourWritesTransaction>::addRef(this), begin, end,
limits, reverse);
if(checkUsedDuringCommit()) {
return used_during_commit();
@ -1612,6 +1590,7 @@ void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& ope
}
if(operationType == MutationRef::SetVersionstampedKey) {
TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled
// this does validation of the key and needs to be performed before the readYourWritesDisabled path
KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
if(!options.readYourWritesDisabled) {

View File

@ -131,6 +131,10 @@ public:
Database getDatabase() const {
return tr.getDatabase();
}
const TransactionInfo& getTransactionInfo() const {
return tr.info;
}
private:
friend class RYWImpl;

View File

@ -362,20 +362,24 @@ struct RestoreSysInfoRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 75960741;
RestoreSysInfo sysInfo;
Standalone<VectorRef<std::pair<KeyRangeRef, Version>>> rangeVersions;
ReplyPromise<RestoreCommonReply> reply;
RestoreSysInfoRequest() = default;
explicit RestoreSysInfoRequest(RestoreSysInfo sysInfo) : sysInfo(sysInfo) {}
explicit RestoreSysInfoRequest(RestoreSysInfo sysInfo,
Standalone<VectorRef<std::pair<KeyRangeRef, Version>>> rangeVersions)
: sysInfo(sysInfo), rangeVersions(rangeVersions) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, sysInfo, reply);
serializer(ar, sysInfo, rangeVersions, reply);
}
std::string toString() {
std::stringstream ss;
ss << "RestoreSysInfoRequest";
ss << "RestoreSysInfoRequest "
<< "rangeVersions.size:" << rangeVersions.size();
return ss.str();
}
};
@ -462,29 +466,26 @@ struct RestoreSendVersionedMutationsRequest : TimedRequest {
Version msgIndex; // Monotonically increasing index of mutation messages
bool isRangeFile;
MutationsVec mutations; // Mutations that may be at different versions parsed by one loader
LogMessageVersionVec mVersions; // (version, subversion) of each mutation in mutations field
VersionedMutationsVec versionedMutations; // Versioned mutations may be at different versions parsed by one loader
ReplyPromise<RestoreCommonReply> reply;
RestoreSendVersionedMutationsRequest() = default;
explicit RestoreSendVersionedMutationsRequest(int batchIndex, const RestoreAsset& asset, Version msgIndex,
bool isRangeFile, MutationsVec mutations,
LogMessageVersionVec mVersions)
: batchIndex(batchIndex), asset(asset), msgIndex(msgIndex), isRangeFile(isRangeFile), mutations(mutations),
mVersions(mVersions) {}
bool isRangeFile, VersionedMutationsVec versionedMutations)
: batchIndex(batchIndex), asset(asset), msgIndex(msgIndex), isRangeFile(isRangeFile),
versionedMutations(versionedMutations) {}
std::string toString() {
std::stringstream ss;
ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString() << " msgIndex:" << msgIndex
<< " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size()
<< " mVersions.size:" << mVersions.size();
<< " isRangeFile:" << isRangeFile << " versionedMutations.size:" << versionedMutations.size();
return ss.str();
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, batchIndex, asset, msgIndex, isRangeFile, mutations, mVersions, reply);
serializer(ar, batchIndex, asset, msgIndex, isRangeFile, versionedMutations, reply);
}
};

View File

@ -0,0 +1,377 @@
/*
* SpecialKeySpace.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/SpecialKeySpace.actor.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// This function will normalize the given KeySelector to a standard KeySelector:
// orEqual == false && offset == 1 (Standard form)
// If the corresponding key is not in this special key range, it will move as far as possible to adjust the offset to 1
// It does have overhead here since we query all keys twice in the worst case.
// However, moving the KeySelector while handling other parameters like limits makes the code much more complex and
// hard to maintain. Separating each part keeps the code easy to understand and more compact.
ACTOR Future<Void> SpecialKeyRangeBaseImpl::normalizeKeySelectorActor(const SpecialKeyRangeBaseImpl* pkrImpl,
Reference<ReadYourWritesTransaction> ryw,
KeySelector* ks) {
ASSERT(!ks->orEqual); // should be removed before calling
ASSERT(ks->offset != 1); // this is never called if the KeySelector is already normalized
state Key startKey(pkrImpl->range.begin);
state Key endKey(pkrImpl->range.end);
if (ks->offset < 1) {
// less than the given key
if (pkrImpl->range.contains(ks->getKey())) endKey = keyAfter(ks->getKey());
} else {
// greater than the given key
if (pkrImpl->range.contains(ks->getKey())) startKey = ks->getKey();
}
TraceEvent(SevDebug, "NormalizeKeySelector")
.detail("OriginalKey", ks->getKey())
.detail("OriginalOffset", ks->offset)
.detail("SpecialKeyRangeStart", pkrImpl->range.begin)
.detail("SpecialKeyRangeEnd", pkrImpl->range.end);
Standalone<RangeResultRef> result = wait(pkrImpl->getRange(ryw, KeyRangeRef(startKey, endKey)));
if (result.size() == 0) {
TraceEvent("ZeroElementsIntheRange").detail("Start", startKey).detail("End", endKey);
return Void();
}
// Note : KeySelector::setKey has byte limit according to the knobs, customize it if needed
if (ks->offset < 1) {
if (result.size() >= 1 - ks->offset) {
ks->setKey(KeyRef(ks->arena(), result[result.size() - (1 - ks->offset)].key));
ks->offset = 1;
} else {
ks->setKey(KeyRef(ks->arena(), result[0].key));
ks->offset += result.size();
}
} else {
if (result.size() >= ks->offset) {
ks->setKey(KeyRef(ks->arena(), result[ks->offset - 1].key));
ks->offset = 1;
} else {
ks->setKey(KeyRef(ks->arena(), keyAfter(result[result.size() - 1].key)));
ks->offset -= result.size();
}
}
TraceEvent(SevDebug, "NormalizeKeySelector")
.detail("NormalizedKey", ks->getKey())
.detail("NormalizedOffset", ks->offset)
.detail("SpecialKeyRangeStart", pkrImpl->range.begin)
.detail("SpecialKeyRangeEnd", pkrImpl->range.end);
return Void();
}
ACTOR Future<Standalone<RangeResultRef>> SpecialKeySpace::getRangeAggregationActor(
SpecialKeySpace* pks, Reference<ReadYourWritesTransaction> ryw, KeySelector begin, KeySelector end,
GetRangeLimits limits, bool reverse) {
// This function handles ranges which cover more than one keyrange and aggregates all results
// KeySelector, GetRangeLimits and reverse are all handled here
state Standalone<RangeResultRef> result;
state RangeMap<Key, SpecialKeyRangeBaseImpl*, KeyRangeRef>::Iterator iter;
state int actualBeginOffset;
state int actualEndOffset;
// make sure offset == 1
state RangeMap<Key, SpecialKeyRangeBaseImpl*, KeyRangeRef>::Iterator beginIter =
pks->impls.rangeContaining(begin.getKey());
while ((begin.offset < 1 && beginIter != pks->impls.ranges().begin()) ||
(begin.offset > 1 && beginIter != pks->impls.ranges().end())) {
if (beginIter->value() != nullptr)
wait(beginIter->value()->normalizeKeySelectorActor(beginIter->value(), ryw, &begin));
begin.offset < 1 ? --beginIter : ++beginIter;
}
actualBeginOffset = begin.offset;
if (beginIter == pks->impls.ranges().begin())
begin.setKey(pks->range.begin);
else if (beginIter == pks->impls.ranges().end())
begin.setKey(pks->range.end);
if (!begin.isFirstGreaterOrEqual()) {
// The KeySelector points to a key outside the whole special key space
TraceEvent(SevInfo, "BeginKeySelectorPointsOutside")
.detail("TerminateKey", begin.getKey())
.detail("TerminateOffset", begin.offset);
if (begin.offset < 1 && beginIter == pks->impls.ranges().begin())
result.readToBegin = true;
else
result.readThroughEnd = true;
begin.offset = 1;
}
state RangeMap<Key, SpecialKeyRangeBaseImpl*, KeyRangeRef>::Iterator endIter =
pks->impls.rangeContaining(end.getKey());
while ((end.offset < 1 && endIter != pks->impls.ranges().begin()) ||
(end.offset > 1 && endIter != pks->impls.ranges().end())) {
if (endIter->value() != nullptr) wait(endIter->value()->normalizeKeySelectorActor(endIter->value(), ryw, &end));
end.offset < 1 ? --endIter : ++endIter;
}
actualEndOffset = end.offset;
if (endIter == pks->impls.ranges().begin())
end.setKey(pks->range.begin);
else if (endIter == pks->impls.ranges().end())
end.setKey(pks->range.end);
if (!end.isFirstGreaterOrEqual()) {
// The KeySelector points to a key outside the whole special key space
TraceEvent(SevInfo, "EndKeySelectorPointsOutside")
.detail("TerminateKey", end.getKey())
.detail("TerminateOffset", end.offset);
if (end.offset < 1 && endIter == pks->impls.ranges().begin())
result.readToBegin = true;
else
result.readThroughEnd = true;
end.offset = 1;
}
// Handle all corner cases like what RYW does
// return if range inverted
if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
TEST(true);
return RangeResultRef(false, false);
}
// If touches begin or end, return with readToBegin and readThroughEnd flags
if (beginIter == pks->impls.ranges().end() || endIter == pks->impls.ranges().begin()) {
TEST(true);
return result;
}
state RangeMap<Key, SpecialKeyRangeBaseImpl*, KeyRangeRef>::Ranges ranges =
pks->impls.intersectingRanges(KeyRangeRef(begin.getKey(), end.getKey()));
// TODO : workaround to write these two loops together to keep the code compact
// The issue here is that boost::iterator_range<> does not provide rbegin(), rend()
iter = reverse ? ranges.end() : ranges.begin();
if (reverse) {
while (iter != ranges.begin()) {
--iter;
if (iter->value() == nullptr) continue;
KeyRangeRef kr = iter->range();
KeyRef keyStart = kr.contains(begin.getKey()) ? begin.getKey() : kr.begin;
KeyRef keyEnd = kr.contains(end.getKey()) ? end.getKey() : kr.end;
Standalone<RangeResultRef> pairs = wait(iter->value()->getRange(ryw, KeyRangeRef(keyStart, keyEnd)));
result.arena().dependsOn(pairs.arena());
// limits handler
for (int i = pairs.size() - 1; i >= 0; --i) {
result.push_back(result.arena(), pairs[i]);
// Note : even if the last k-v pair pushes the total bytes over the limit, it is still
// returned. In other words, the total size of the returned result (less the last entry) will be less
// than byteLimit
limits.decrement(pairs[i]);
if (limits.isReached()) {
result.more = true;
result.readToBegin = false;
return result;
};
}
}
} else {
for (iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (iter->value() == nullptr) continue;
KeyRangeRef kr = iter->range();
KeyRef keyStart = kr.contains(begin.getKey()) ? begin.getKey() : kr.begin;
KeyRef keyEnd = kr.contains(end.getKey()) ? end.getKey() : kr.end;
Standalone<RangeResultRef> pairs = wait(iter->value()->getRange(ryw, KeyRangeRef(keyStart, keyEnd)));
result.arena().dependsOn(pairs.arena());
// limits handler
for (int i = 0; i < pairs.size(); ++i) {
result.push_back(result.arena(), pairs[i]);
// Note : even if the last k-v pair pushes the total bytes over the limit, it is still
// returned. In other words, the total size of the returned result (less the last entry) will be less
// than byteLimit
limits.decrement(pairs[i]);
if (limits.isReached()) {
result.more = true;
result.readThroughEnd = false;
return result;
};
}
}
}
return result;
}
Future<Standalone<RangeResultRef>> SpecialKeySpace::getRange(Reference<ReadYourWritesTransaction> ryw,
KeySelector begin, KeySelector end, GetRangeLimits limits,
bool reverse) {
// validate limits here
if (!limits.isValid()) return range_limits_invalid();
if (limits.isReached()) {
TEST(true); // read limit 0
return Standalone<RangeResultRef>();
}
// make sure orEqual == false
begin.removeOrEqual(begin.arena());
end.removeOrEqual(end.arena());
return getRangeAggregationActor(this, ryw, begin, end, limits, reverse);
}
ACTOR Future<Optional<Value>> SpecialKeySpace::getActor(SpecialKeySpace* pks, Reference<ReadYourWritesTransaction> ryw,
KeyRef key) {
// implemented via getRange, since each SpecialKeyRangeBaseImpl only exposes getRange
Standalone<RangeResultRef> result =
wait(pks->getRange(ryw, KeySelector(firstGreaterOrEqual(key)), KeySelector(firstGreaterOrEqual(keyAfter(key))),
GetRangeLimits(CLIENT_KNOBS->TOO_MANY), false));
ASSERT(result.size() <= 1);
if (result.size()) {
return Optional<Value>(result[0].value);
} else {
return Optional<Value>();
}
}
Future<Optional<Value>> SpecialKeySpace::get(Reference<ReadYourWritesTransaction> ryw, const Key& key) {
return getActor(this, ryw, key);
}
ConflictingKeysImpl::ConflictingKeysImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
Future<Standalone<RangeResultRef>> ConflictingKeysImpl::getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const {
Standalone<RangeResultRef> result;
if (ryw->getTransactionInfo().conflictingKeys) {
auto krMapPtr = ryw->getTransactionInfo().conflictingKeys.get();
auto beginIter = krMapPtr->rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin) ++beginIter;
auto endIter = krMapPtr->rangeContaining(kr.end);
for (auto it = beginIter; it != endIter; ++it) {
// it->begin() is stored in the CoalescedKeyRangeMap in TransactionInfo
// it->value() is always one of the constants defined in SystemData.cpp
// Thus, push_back() can be used instead of push_back_deep()
result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
}
if (endIter->begin() != kr.end)
result.push_back(result.arena(), KeyValueRef(endIter->begin(), endIter->value()));
}
return result;
}
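
A minimal sketch (not part of this commit) of the boundary encoding that the getRange above walks. It only uses CoalescedKeyRangeMap calls already present elsewhere in this diff (the two-argument constructor, insert and rangeContaining); the keys are made up for illustration.

```cpp
// Hypothetical illustration: one conflicting range inserted into a map that defaults to '0'
// yields exactly two boundaries: '1' where the conflicting range starts, '0' where it ends.
void conflictingKeysBoundaryExample() {
    CoalescedKeyRangeMap<Value> m(conflictingKeysFalse, specialKeys.end);
    Key b = Key(LiteralStringRef("apple")).withPrefix(conflictingKeysRange.begin);
    Key e = Key(LiteralStringRef("banana")).withPrefix(conflictingKeysRange.begin);
    m.insert(KeyRangeRef(b, e), conflictingKeysTrue);
    auto r = m.rangeContaining(b);
    ASSERT(r->value() == conflictingKeysTrue); // [.../apple, .../banana) is (probably) conflicting
    ++r;
    ASSERT(r->begin() == e && r->value() == conflictingKeysFalse); // everything at or after .../banana is not
}
```
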
class SpecialKeyRangeTestImpl : public SpecialKeyRangeBaseImpl {
public:
explicit SpecialKeyRangeTestImpl(KeyRangeRef kr, const std::string& prefix, int size)
: SpecialKeyRangeBaseImpl(kr), prefix(prefix), size(size) {
ASSERT(size > 0);
for (int i = 0; i < size; ++i) {
kvs.push_back_deep(kvs.arena(),
KeyValueRef(getKeyForIndex(i), deterministicRandom()->randomAlphaNumeric(16)));
}
}
KeyValueRef getKeyValueForIndex(int idx) { return kvs[idx]; }
Key getKeyForIndex(int idx) { return Key(prefix + format("%010d", idx)).withPrefix(range.begin); }
int getSize() { return size; }
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override {
int startIndex = 0, endIndex = size;
while (startIndex < size && kvs[startIndex].key < kr.begin) ++startIndex;
while (endIndex > startIndex && kvs[endIndex - 1].key >= kr.end) --endIndex;
if (startIndex == endIndex)
return Standalone<RangeResultRef>();
else
return Standalone<RangeResultRef>(RangeResultRef(kvs.slice(startIndex, endIndex), false));
}
private:
Standalone<VectorRef<KeyValueRef>> kvs;
std::string prefix;
int size;
};
TEST_CASE("/fdbclient/SpecialKeySpace/Unittest") {
SpecialKeySpace pks(normalKeys.begin, normalKeys.end);
SpecialKeyRangeTestImpl pkr1(KeyRangeRef(LiteralStringRef("/cat/"), LiteralStringRef("/cat/\xff")), "small", 10);
SpecialKeyRangeTestImpl pkr2(KeyRangeRef(LiteralStringRef("/dog/"), LiteralStringRef("/dog/\xff")), "medium", 100);
SpecialKeyRangeTestImpl pkr3(KeyRangeRef(LiteralStringRef("/pig/"), LiteralStringRef("/pig/\xff")), "large", 1000);
pks.registerKeyRange(pkr1.getKeyRange(), &pkr1);
pks.registerKeyRange(pkr2.getKeyRange(), &pkr2);
pks.registerKeyRange(pkr3.getKeyRange(), &pkr3);
auto nullRef = Reference<ReadYourWritesTransaction>();
// get
{
auto resultFuture = pks.get(nullRef, LiteralStringRef("/cat/small0000000009"));
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue().get();
ASSERT(result == pkr1.getKeyValueForIndex(9).value);
auto emptyFuture = pks.get(nullRef, LiteralStringRef("/cat/small0000000010"));
ASSERT(emptyFuture.isReady());
auto emptyResult = emptyFuture.getValue();
ASSERT(!emptyResult.present());
}
// general getRange
{
KeySelector start = KeySelectorRef(LiteralStringRef("/elepant"), false, -9);
KeySelector end = KeySelectorRef(LiteralStringRef("/frog"), false, +11);
auto resultFuture = pks.getRange(nullRef, start, end, GetRangeLimits());
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue();
ASSERT(result.size() == 20);
ASSERT(result[0].key == pkr2.getKeyForIndex(90));
ASSERT(result[result.size() - 1].key == pkr3.getKeyForIndex(9));
}
// KeySelector points outside
{
KeySelector start = KeySelectorRef(pkr3.getKeyForIndex(999), true, -1110);
KeySelector end = KeySelectorRef(pkr1.getKeyForIndex(0), false, +1112);
auto resultFuture = pks.getRange(nullRef, start, end, GetRangeLimits());
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue();
ASSERT(result.size() == 1110);
ASSERT(result[0].key == pkr1.getKeyForIndex(0));
ASSERT(result[result.size() - 1].key == pkr3.getKeyForIndex(999));
}
// GetRangeLimits with row limit
{
KeySelector start = KeySelectorRef(pkr2.getKeyForIndex(0), true, 0);
KeySelector end = KeySelectorRef(pkr3.getKeyForIndex(0), false, 0);
auto resultFuture = pks.getRange(nullRef, start, end, GetRangeLimits(2));
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue();
ASSERT(result.size() == 2);
ASSERT(result[0].key == pkr2.getKeyForIndex(0));
ASSERT(result[1].key == pkr2.getKeyForIndex(1));
}
// GetRangeLimits with byte limit
{
KeySelector start = KeySelectorRef(pkr2.getKeyForIndex(0), true, 0);
KeySelector end = KeySelectorRef(pkr3.getKeyForIndex(0), false, 0);
auto resultFuture = pks.getRange(nullRef, start, end, GetRangeLimits(10, 100));
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue();
int bytes = 0;
for (int i = 0; i < result.size() - 1; ++i) bytes += 8 + pkr2.getKeyValueForIndex(i).expectedSize();
ASSERT(bytes < 100);
ASSERT(bytes + 8 + pkr2.getKeyValueForIndex(result.size()).expectedSize() >= 100);
}
// reverse test with overlapping key range
{
KeySelector start = KeySelectorRef(pkr2.getKeyForIndex(0), true, 0);
KeySelector end = KeySelectorRef(pkr3.getKeyForIndex(999), true, +1);
auto resultFuture = pks.getRange(nullRef, start, end, GetRangeLimits(1100), true);
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue();
for (int i = 0; i < pkr3.getSize(); ++i) ASSERT(result[i] == pkr3.getKeyValueForIndex(pkr3.getSize() - 1 - i));
for (int i = 0; i < pkr2.getSize(); ++i)
ASSERT(result[i + pkr3.getSize()] == pkr2.getKeyValueForIndex(pkr2.getSize() - 1 - i));
}
return Void();
}

View File

@ -0,0 +1,99 @@
/*
* SpecialKeySpace.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_SPECIALKEYSPACE_ACTOR_G_H)
#define FDBCLIENT_SPECIALKEYSPACE_ACTOR_G_H
#include "fdbclient/SpecialKeySpace.actor.g.h"
#elif !defined(FDBCLIENT_SPECIALKEYSPACE_ACTOR_H)
#define FDBCLIENT_SPECIALKEYSPACE_ACTOR_H
#include "flow/flow.h"
#include "flow/Arena.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/ReadYourWrites.h"
#include "flow/actorcompiler.h" // This must be the last #include.
class SpecialKeyRangeBaseImpl {
public:
// Each derived class only needs to implement this simple version of getRange
virtual Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const = 0;
explicit SpecialKeyRangeBaseImpl(KeyRangeRef kr) : range(kr) {}
KeyRangeRef getKeyRange() const { return range; }
ACTOR Future<Void> normalizeKeySelectorActor(const SpecialKeyRangeBaseImpl* pkrImpl,
Reference<ReadYourWritesTransaction> ryw, KeySelector* ks);
protected:
KeyRange range; // underlying key range for this function
};
class SpecialKeySpace {
public:
Future<Optional<Value>> get(Reference<ReadYourWritesTransaction> ryw, const Key& key);
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw, KeySelector begin,
KeySelector end, GetRangeLimits limits, bool reverse = false);
SpecialKeySpace(KeyRef spaceStartKey = Key(), KeyRef spaceEndKey = normalKeys.end) {
// Default value is nullptr; the underlying KeyRangeMap always begins at Key()
impls = KeyRangeMap<SpecialKeyRangeBaseImpl*>(nullptr, spaceEndKey);
range = KeyRangeRef(spaceStartKey, spaceEndKey);
}
void registerKeyRange(const KeyRangeRef& kr, SpecialKeyRangeBaseImpl* impl) {
// range check
// TODO: add a check so that an existing range cannot be replaced by an overlapping one
ASSERT(kr.begin >= range.begin && kr.end <= range.end);
// make sure the registered range is not overlapping with existing ones
// Note: kr.end should not be the same as another range's begin, although it should work even if they are the same
ASSERT(impls.rangeContaining(kr.begin) == impls.rangeContaining(kr.end) && impls[kr.begin] == nullptr);
impls.insert(kr, impl);
}
private:
ACTOR Future<Optional<Value>> getActor(SpecialKeySpace* pks, Reference<ReadYourWritesTransaction> ryw, KeyRef key);
ACTOR Future<Standalone<RangeResultRef>> getRangeAggregationActor(SpecialKeySpace* pks,
Reference<ReadYourWritesTransaction> ryw,
KeySelector begin, KeySelector end,
GetRangeLimits limits, bool reverse);
KeyRangeMap<SpecialKeyRangeBaseImpl*> impls;
KeyRange range;
};
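
As a hedged illustration of the framework above (the HelloWorldImpl class and the \xff\xff/hello/ range are invented for this sketch and are not part of the commit), a module only has to override the simple getRange and register the key range it owns; this mirrors the SpecialKeyRangeTestImpl in SpecialKeySpace.actor.cpp above.

```cpp
// Sketch of a custom special-key module returning a single key/value pair.
class HelloWorldImpl : public SpecialKeyRangeBaseImpl {
public:
    explicit HelloWorldImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
    Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
                                                KeyRangeRef kr) const override {
        Standalone<RangeResultRef> result;
        Key k = Key(LiteralStringRef("hello")).withPrefix(range.begin);
        if (kr.contains(k))
            result.push_back_deep(result.arena(), KeyValueRef(k, LiteralStringRef("world")));
        return result;
    }
};

// Registration, e.g. next to the conflictingKeysRange registration in DatabaseContext
// (the impl object must outlive the SpecialKeySpace, as cKImpl does there):
//   HelloWorldImpl hello(KeyRangeRef(LiteralStringRef("\xff\xff/hello/"), LiteralStringRef("\xff\xff/hello0")));
//   specialKeySpace->registerKeyRange(hello.getKeyRange(), &hello);
```
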
// Use special key prefix "\xff\xff/transaction/conflicting_keys/<some_key>",
// to retrieve keys which caused latest not_committed(conflicting with another transaction) error.
// The returned key value pairs are interpreted as :
// prefix/<key1> : '1' - any keys equal or larger than this key are (probably) conflicting keys
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
// Currently, the conflicting keyranges returned are original read_conflict_ranges or union of them.
class ConflictingKeysImpl : public SpecialKeyRangeBaseImpl {
public:
explicit ConflictingKeysImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override;
};
#include "flow/unactorcompiler.h"
#endif
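
A hedged client-side sketch (not part of this commit) of how the conflicting-keys range might be consumed. It assumes the usual flow actor compiler, the FDBTransactionOptions::REPORT_CONFLICTING_KEYS option mentioned in NativeAPI.actor.h above, and the standard ReadYourWritesTransaction retry loop; the actor name and trace event are illustrative only.

```cpp
// Sketch: after a not_committed error, the boundaries of (probably) conflicting ranges can be
// read back through the special key space served by ConflictingKeysImpl.
ACTOR Future<Void> commitAndReportConflicts(Database db) {
    state ReadYourWritesTransaction tr(db);
    loop {
        try {
            tr.setOption(FDBTransactionOptions::REPORT_CONFLICTING_KEYS);
            // ... reads and writes that may conflict go here ...
            wait(tr.commit());
            return Void();
        } catch (Error& e) {
            state Error err = e;
            if (err.code() == error_code_not_committed) {
                // '1' marks the start of a (probably) conflicting range, '0' marks where it ends.
                Standalone<RangeResultRef> crs = wait(tr.getRange(conflictingKeysRange, CLIENT_KNOBS->TOO_MANY));
                for (const auto& kv : crs)
                    TraceEvent("ConflictingKeyBoundary").detail("Key", kv.key).detail("Value", kv.value);
            }
            wait(tr.onError(err));
        }
    }
}
```
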

View File

@ -74,6 +74,7 @@ struct StorageServerInterface {
explicit StorageServerInterface(UID uid) : uniqueID( uid ), isCacheServer(false) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ), isCacheServer(false) {}
NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; }
UID id() const { return uniqueID; }
std::string toString() const { return id().shortString(); }
@ -398,12 +399,14 @@ struct GetStorageMetricsReply {
StorageMetrics available;
StorageMetrics capacity;
double bytesInputRate;
int64_t versionLag;
double lastUpdate;
GetStorageMetricsReply() : bytesInputRate(0) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, load, available, capacity, bytesInputRate);
serializer(ar, load, available, capacity, bytesInputRate, versionLag, lastUpdate);
}
};

View File

@ -32,7 +32,7 @@ const KeyRangeRef systemKeys(systemKeysPrefix, LiteralStringRef("\xff\xff") );
const KeyRangeRef nonMetadataSystemKeys(LiteralStringRef("\xff\x02"), LiteralStringRef("\xff\x03"));
const KeyRangeRef allKeys = KeyRangeRef(normalKeys.begin, systemKeys.end);
const KeyRef afterAllKeys = LiteralStringRef("\xff\xff\x00");
const KeyRangeRef specialKeys = KeyRangeRef(LiteralStringRef("\xff\xff"), LiteralStringRef("\xff\xff\xff\xff"));
const KeyRangeRef specialKeys = KeyRangeRef(LiteralStringRef("\xff\xff"), LiteralStringRef("\xff\xff\xff"));
// keyServersKeys.contains(k) iff k.startsWith(keyServersPrefix)
const KeyRangeRef keyServersKeys( LiteralStringRef("\xff/keyServers/"), LiteralStringRef("\xff/keyServers0") );
@ -110,8 +110,8 @@ void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& v
std::sort(dest.begin(), dest.end());
}
const KeyRef conflictingKeysPrefix = LiteralStringRef("/transaction/conflicting_keys/");
const Key conflictingKeysAbsolutePrefix = conflictingKeysPrefix.withPrefix(specialKeys.begin);
const KeyRangeRef conflictingKeysRange = KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"),
LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff"));
const ValueRef conflictingKeysTrue = LiteralStringRef("1");
const ValueRef conflictingKeysFalse = LiteralStringRef("0");

View File

@ -40,7 +40,8 @@ extern const KeyRangeRef normalKeys; // '' to systemKeys.begin
extern const KeyRangeRef systemKeys; // [FF] to [FF][FF]
extern const KeyRangeRef nonMetadataSystemKeys; // [FF][00] to [FF][01]
extern const KeyRangeRef allKeys; // '' to systemKeys.end
extern const KeyRangeRef specialKeys; // [FF][FF] to [FF][FF][FF][FF]
extern const KeyRangeRef specialKeys; // [FF][FF] to [FF][FF][FF], some client functions are exposed through FDB calls
// using these special keys, see pr#2662
extern const KeyRef afterAllKeys;
// "\xff/keyServers/[[begin]]" := "[[vector<serverID>, vector<serverID>]|[vector<Tag>, vector<Tag>]]"
@ -80,8 +81,7 @@ const Key serverKeysPrefixFor( UID serverID );
UID serverKeysDecodeServer( const KeyRef& key );
bool serverHasKey( ValueRef storedValue );
extern const KeyRef conflictingKeysPrefix;
extern const Key conflictingKeysAbsolutePrefix;
extern const KeyRangeRef conflictingKeysRange;
extern const ValueRef conflictingKeysTrue, conflictingKeysFalse;
extern const KeyRef cacheKeysPrefix;

View File

@ -37,6 +37,11 @@
#include <linux/limits.h>
#endif
#ifdef __FreeBSD__
#include <sys/event.h>
#define O_EVTONLY O_RDONLY
#endif
#ifdef __APPLE__
#include <sys/event.h>
#include <mach/mach.h>
@ -78,7 +83,7 @@
#ifdef __linux__
typedef fd_set* fdb_fd_set;
#elif defined __APPLE__
#elif defined(__APPLE__) || defined(__FreeBSD__)
typedef int fdb_fd_set;
#endif
@ -89,7 +94,7 @@ void monitor_fd( fdb_fd_set list, int fd, int* maxfd, void* cmd ) {
FD_SET( fd, list );
if ( fd > *maxfd )
*maxfd = fd;
#elif defined __APPLE__
#elif defined(__APPLE__) || defined(__FreeBSD__)
/* ignore maxfd */
struct kevent ev;
EV_SET( &ev, fd, EVFILT_READ, EV_ADD, 0, 0, cmd );
@ -100,7 +105,7 @@ void monitor_fd( fdb_fd_set list, int fd, int* maxfd, void* cmd ) {
void unmonitor_fd( fdb_fd_set list, int fd ) {
#ifdef __linux__
FD_CLR( fd, list );
#elif defined __APPLE__
#elif defined(__APPLE__) || defined(__FreeBSD__)
struct kevent ev;
EV_SET( &ev, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL );
kevent( list, &ev, 1, NULL, 0, NULL ); // FIXME: check?
@ -194,7 +199,7 @@ const char* get_value_multi(const CSimpleIni& ini, const char* key, ...) {
}
double timer() {
#if defined(__linux__)
#if defined(__linux__) || defined(__FreeBSD__)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return double(ts.tv_sec) + (ts.tv_nsec * 1e-9);
@ -913,7 +918,7 @@ void read_child_output( Command* cmd, int pipe_idx, fdb_fd_set fds ) {
}
}
#ifdef __APPLE__
#if defined(__APPLE__) || defined(__FreeBSD__)
void watch_conf_dir( int kq, int* confd_fd, std::string confdir ) {
struct kevent ev;
std::string original = confdir;
@ -1171,7 +1176,11 @@ int main(int argc, char** argv) {
// testPathOps(); return -1;
std::string lockfile = "/var/run/fdbmonitor.pid";
#ifdef __FreeBSD__
std::string _confpath = "/usr/local/etc/foundationdb/foundationdb.conf";
#else
std::string _confpath = "/etc/foundationdb/foundationdb.conf";
#endif
std::vector<const char *> additional_watch_paths;
@ -1266,12 +1275,12 @@ int main(int argc, char** argv) {
#endif
if (daemonize) {
#ifdef __APPLE__
#if defined(__APPLE__) || defined(__FreeBSD__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
if (daemon(0, 0)) {
#ifdef __APPLE__
#if defined(__APPLE__) || defined(__FreeBSD__)
#pragma GCC diagnostic pop
#endif
log_err("daemon", errno, "Unable to daemonize");
@ -1330,7 +1339,7 @@ int main(int argc, char** argv) {
signal(SIGHUP, signal_handler);
signal(SIGINT, signal_handler);
signal(SIGTERM, signal_handler);
#elif defined(__APPLE__)
#elif defined(__APPLE__) || defined(__FreeBSD__)
int kq = kqueue();
if ( kq < 0 ) {
log_err( "kqueue", errno, "Unable to create kqueue" );
@ -1375,11 +1384,11 @@ int main(int argc, char** argv) {
/* normal will be restored in our main loop in the call to
pselect, but none blocks all signals while processing events */
sigprocmask(SIG_SETMASK, &full_mask, &normal_mask);
#elif defined(__APPLE__)
#elif defined(__APPLE__) || defined(__FreeBSD__)
sigprocmask(0, NULL, &normal_mask);
#endif
#ifdef __APPLE__
#if defined(__APPLE__) || defined(__FreeBSD__)
struct stat st_buf;
struct timespec mtimespec;
@ -1438,7 +1447,7 @@ int main(int argc, char** argv) {
load_conf(confpath.c_str(), uid, gid, &normal_mask, &rfds, &maxfd);
reload_additional_watches = false;
#elif defined(__APPLE__)
#elif defined(__APPLE__) || defined(__FreeBSD__)
load_conf( confpath.c_str(), uid, gid, &normal_mask, watched_fds, &maxfd );
watch_conf_file( kq, &conff_fd, confpath.c_str() );
watch_conf_dir( kq, &confd_fd, confdir );
@ -1476,7 +1485,7 @@ int main(int argc, char** argv) {
if(nfds == 0) {
reload = true;
}
#elif defined(__APPLE__)
#elif defined(__APPLE__) || defined(__FreeBSD__)
int nev = 0;
if(timeout < 0) {
nev = kevent( kq, NULL, 0, &ev, 1, NULL );

View File

@ -52,6 +52,10 @@ public:
}
}
static void stop() {
eio_set_max_parallel(0);
}
static bool should_poll() { return want_poll; }
static bool lock_fd( int fd ) {

View File

@ -97,6 +97,7 @@ public:
#endif
static Future<Reference<IAsyncFile>> open( std::string filename, int flags, int mode, void* ignore ) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
ASSERT( flags & OPEN_UNBUFFERED );
if (flags & OPEN_LOCK)
@ -153,6 +154,7 @@ public:
}
static void init( Reference<IEventFD> ev, double ioTimeout ) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
if( !g_network->isSimulated() ) {
ctx.countAIOSubmit.init(LiteralStringRef("AsyncFile.CountAIOSubmit"));
ctx.countAIOCollect.init(LiteralStringRef("AsyncFile.CountAIOCollect"));
@ -578,7 +580,7 @@ private:
static Context ctx;
explicit AsyncFileKAIO(int fd, int flags, std::string const& filename) : fd(fd), flags(flags), filename(filename), failed(false) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
if( !g_network->isSimulated() ) {
countFileLogicalWrites.init(LiteralStringRef("AsyncFile.CountFileLogicalWrites"), filename);
countFileLogicalReads.init( LiteralStringRef("AsyncFile.CountFileLogicalReads"), filename);

View File

@ -39,6 +39,8 @@ class AsyncFileWinASIO : public IAsyncFile, public ReferenceCounted<AsyncFileWin
public:
static void init() {}
static void stop() {}
static bool should_poll() { return false; }
// FIXME: This implementation isn't actually asynchronous - it just does operations synchronously!

View File

@ -13,6 +13,7 @@ set(FDBRPC_SRCS
FlowTransport.actor.cpp
genericactors.actor.h
genericactors.actor.cpp
HealthMonitor.actor.cpp
IAsyncFile.actor.cpp
LoadBalance.actor.h
Locality.cpp
@ -48,7 +49,16 @@ if(APPLE)
list(APPEND FDBRPC_THIRD_PARTY_SRCS libcoroutine/asm.S)
endif()
if(NOT WIN32)
list(APPEND FDBRPC_THIRD_PARTY_SRCS libcoroutine/context.c libeio/eio.c)
if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
find_library(EIO eio)
if(EIO)
list(APPEND FDBRPC_THIRD_PARTY_SRCS libcoroutine/context.c)
else()
list(APPEND FDBRPC_THIRD_PARTY_SRCS libcoroutine/context.c libeio/eio.c)
endif()
else()
list(APPEND FDBRPC_THIRD_PARTY_SRCS libcoroutine/context.c libeio/eio.c)
endif()
endif()
add_library(thirdparty STATIC ${FDBRPC_THIRD_PARTY_SRCS})

View File

@ -19,55 +19,57 @@
*/
#include "fdbrpc/FailureMonitor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Void> waitForStateEqual( IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status ) {
ACTOR Future<Void> waitForStateEqual(IFailureMonitor* monitor, Endpoint endpoint, FailureStatus status) {
loop {
Future<Void> change = monitor->onStateChanged(endpoint);
if (monitor->getState(endpoint) == status)
return Void();
wait( change );
if (monitor->getState(endpoint) == status) return Void();
wait(change);
}
}
ACTOR Future<Void> waitForContinuousFailure( IFailureMonitor* monitor, Endpoint endpoint, double sustainedFailureDuration, double slope ) {
ACTOR Future<Void> waitForContinuousFailure(IFailureMonitor* monitor, Endpoint endpoint,
double sustainedFailureDuration, double slope) {
state double startT = now();
loop {
wait( monitor->onFailed( endpoint ) );
if(monitor->permanentlyFailed(endpoint))
return Void();
wait(monitor->onFailed(endpoint));
if (monitor->permanentlyFailed(endpoint)) return Void();
// X == sustainedFailureDuration + slope * (now()-startT+X)
double waitDelay = (sustainedFailureDuration + slope * (now()-startT)) / (1-slope);
double waitDelay = (sustainedFailureDuration + slope * (now() - startT)) / (1 - slope);
//SOMEDAY: if we know that this process is a server or client we can tune this optimization better
if(waitDelay < std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL, FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) //We will not get a failure monitoring update in this amount of time, so there is no point in waiting for changes
// SOMEDAY: if we know that this process is a server or client we can tune this optimization better
if (waitDelay <
std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL,
FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) // We will not get a failure monitoring update in this amount
// of time, so there is no point in waiting for changes
waitDelay = 0;
choose {
when (wait( monitor->onStateEqual( endpoint, FailureStatus(false) ) )) {} // SOMEDAY: Use onStateChanged() for efficiency
when (wait( delay(waitDelay) )) {
return Void();
}
when(wait(monitor->onStateEqual(endpoint, FailureStatus(false)))) {
} // SOMEDAY: Use onStateChanged() for efficiency
when(wait(delay(waitDelay))) { return Void(); }
}
}
}
Future<Void> IFailureMonitor::onStateEqual( Endpoint const& endpoint, FailureStatus status ) {
if ( status == getState(endpoint) ) return Void();
Future<Void> IFailureMonitor::onStateEqual(Endpoint const& endpoint, FailureStatus status) {
if (status == getState(endpoint)) return Void();
return waitForStateEqual(this, endpoint, status);
}
Future<Void> IFailureMonitor::onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double slope ) {
ASSERT( slope < 1.0 );
return waitForContinuousFailure( this, endpoint, sustainedFailureDuration, slope );
Future<Void> IFailureMonitor::onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration, double slope) {
ASSERT(slope < 1.0);
return waitForContinuousFailure(this, endpoint, sustainedFailureDuration, slope);
}
void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStatus const& status ) {
void SimpleFailureMonitor::setStatus(NetworkAddress const& address, FailureStatus const& status) {
//if (status.failed)
// printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(), address.toString().c_str());
//printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(), address.toString(), status.failed ? "FAILED" : "OK", this);
//addressStatus.set( address, status );
// if (status.failed)
// printf("On machine '%s': Machine '%s' is failed\n", g_network->getLocalAddress().toString().c_str(),
// address.toString().c_str()); printf("%s.setState(%s, %s) %p\n", g_network->getLocalAddress().toString(),
// address.toString(), status.failed ? "FAILED" : "OK", this); addressStatus.set( address, status );
// onStateChanged() will be waiting on endpointKnownFailed only where it is false, so if the address status
// for an endpoint that is waited on changes, the waiter sees its failure status change
@ -96,22 +98,29 @@ void SimpleFailureMonitor::setStatus( NetworkAddress const& address, FailureStat
}
}
void SimpleFailureMonitor::endpointNotFound( Endpoint const& endpoint ) {
void SimpleFailureMonitor::endpointNotFound(Endpoint const& endpoint) {
// SOMEDAY: Expiration (this "leaks" memory)
if(endpoint.token.first() == -1) {
TraceEvent("WellKnownEndpointNotFound").suppressFor(1.0).detail("Address", endpoint.getPrimaryAddress()).detail("TokenFirst", endpoint.token.first()).detail("TokenSecond", endpoint.token.second());
if (endpoint.token.first() == -1) {
TraceEvent("WellKnownEndpointNotFound")
.suppressFor(1.0)
.detail("Address", endpoint.getPrimaryAddress())
.detail("TokenFirst", endpoint.token.first())
.detail("TokenSecond", endpoint.token.second());
return;
}
TraceEvent("EndpointNotFound").suppressFor(1.0).detail("Address", endpoint.getPrimaryAddress()).detail("Token", endpoint.token);
endpointKnownFailed.set( endpoint, true );
TraceEvent("EndpointNotFound")
.suppressFor(1.0)
.detail("Address", endpoint.getPrimaryAddress())
.detail("Token", endpoint.token);
endpointKnownFailed.set(endpoint, true);
}
void SimpleFailureMonitor::notifyDisconnect( NetworkAddress const& address ) {
void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) {
//TraceEvent("NotifyDisconnect").detail("Address", address);
endpointKnownFailed.triggerRange( Endpoint({address}, UID()), Endpoint({address}, UID(-1,-1)) );
endpointKnownFailed.triggerRange(Endpoint({ address }, UID()), Endpoint({ address }, UID(-1, -1)));
}
Future<Void> SimpleFailureMonitor::onDisconnectOrFailure( Endpoint const& endpoint ) {
Future<Void> SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoint) {
// If the endpoint or address is already failed, return right away
auto i = addressStatus.find(endpoint.getPrimaryAddress());
if (i == addressStatus.end() || i->second.isFailed() || endpointKnownFailed.get(endpoint)) {
@ -120,12 +129,12 @@ Future<Void> SimpleFailureMonitor::onDisconnectOrFailure( Endpoint const& endpoi
}
// Return when the endpoint is triggered, which means that either the endpoint has become known failed, or the
// address has changed state (and since it was previously not failed, it must now be failed), or notifyDisconnect()
// has been called.
// address has changed state (and since it was previously not failed, it must now be failed), or
// notifyDisconnect() has been called.
return endpointKnownFailed.onChange(endpoint);
}
Future<Void> SimpleFailureMonitor::onStateChanged( Endpoint const& endpoint ) {
Future<Void> SimpleFailureMonitor::onStateChanged(Endpoint const& endpoint) {
// Wait on endpointKnownFailed if it is false, to pick up both endpointNotFound errors (which set it to true)
// and changes to addressStatus (which trigger a range). Don't wait on endpointKnownFailed if it is true, because
// failure status for that endpoint can never change (and we could be spuriously triggered by setStatus)
@ -137,36 +146,42 @@ Future<Void> SimpleFailureMonitor::onStateChanged( Endpoint const& endpoint ) {
return endpointKnownFailed.onChange(endpoint);
}
FailureStatus SimpleFailureMonitor::getState( Endpoint const& endpoint ) {
FailureStatus SimpleFailureMonitor::getState(Endpoint const& endpoint) {
if (endpointKnownFailed.get(endpoint))
return FailureStatus(true);
else {
auto a = addressStatus.find(endpoint.getPrimaryAddress());
if (a == addressStatus.end()) return FailureStatus();
else return a->second;
//printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(), a.failed ? "FAILED" : "OK", this);
if (a == addressStatus.end())
return FailureStatus();
else
return a->second;
// printf("%s.getState(%s) = %s %p\n", g_network->getLocalAddress().toString(), endpoint.address.toString(),
// a.failed ? "FAILED" : "OK", this);
}
}
FailureStatus SimpleFailureMonitor::getState( NetworkAddress const& address ) {
FailureStatus SimpleFailureMonitor::getState(NetworkAddress const& address) {
auto a = addressStatus.find(address);
if (a == addressStatus.end()) return FailureStatus();
else return a->second;
if (a == addressStatus.end())
return FailureStatus();
else
return a->second;
}
bool SimpleFailureMonitor::onlyEndpointFailed( Endpoint const& endpoint ) {
if(!endpointKnownFailed.get(endpoint))
return false;
bool SimpleFailureMonitor::onlyEndpointFailed(Endpoint const& endpoint) {
if (!endpointKnownFailed.get(endpoint)) return false;
auto a = addressStatus.find(endpoint.getPrimaryAddress());
if (a == addressStatus.end()) return true;
else return !a->second.failed;
if (a == addressStatus.end())
return true;
else
return !a->second.failed;
}
bool SimpleFailureMonitor::permanentlyFailed( Endpoint const& endpoint ) {
bool SimpleFailureMonitor::permanentlyFailed(Endpoint const& endpoint) {
return endpointKnownFailed.get(endpoint);
}
void SimpleFailureMonitor::reset() {
addressStatus = std::unordered_map< NetworkAddress, FailureStatus >();
addressStatus = std::unordered_map<NetworkAddress, FailureStatus>();
endpointKnownFailed.resetNoWaiting();
}

View File

@ -76,8 +76,8 @@ struct FailureStatus {
bool isFailed() const { return failed; }
bool isAvailable() const { return !failed; }
bool operator == (FailureStatus const& r) const { return failed == r.failed; }
bool operator != (FailureStatus const& r) const { return failed != r.failed; }
bool operator==(FailureStatus const& r) const { return failed == r.failed; }
bool operator!=(FailureStatus const& r) const { return failed != r.failed; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, failed);
@ -87,43 +87,43 @@ struct FailureStatus {
class IFailureMonitor {
public:
// Returns the currently known status for the endpoint
virtual FailureStatus getState( Endpoint const& endpoint ) = 0;
virtual FailureStatus getState(Endpoint const& endpoint) = 0;
// Returns the currently known status for the address
virtual FailureStatus getState( NetworkAddress const& address ) = 0;
virtual FailureStatus getState(NetworkAddress const& address) = 0;
// Only use this function when the endpoint is known to be failed
virtual void endpointNotFound( Endpoint const& ) = 0;
virtual void endpointNotFound(Endpoint const&) = 0;
// The next time the known status for the endpoint changes, returns the new status.
virtual Future<Void> onStateChanged( Endpoint const& endpoint ) = 0;
virtual Future<Void> onStateChanged(Endpoint const& endpoint) = 0;
// Returns when onFailed(endpoint) || transport().onDisconnect( endpoint.getPrimaryAddress() ), but more efficiently
virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint ) = 0;
virtual Future<Void> onDisconnectOrFailure(Endpoint const& endpoint) = 0;
// Returns true if the endpoint is failed but the address of the endpoint is not failed.
virtual bool onlyEndpointFailed( Endpoint const& endpoint ) = 0;
virtual bool onlyEndpointFailed(Endpoint const& endpoint) = 0;
// Returns true if the endpoint will never become available.
virtual bool permanentlyFailed( Endpoint const& endpoint ) = 0;
virtual bool permanentlyFailed(Endpoint const& endpoint) = 0;
// Called by FlowTransport when a connection closes and a prior request or reply might be lost
virtual void notifyDisconnect( NetworkAddress const& ) = 0;
virtual void notifyDisconnect(NetworkAddress const&) = 0;
// Called to update the failure status of network address directly when running client.
virtual void setStatus(NetworkAddress const& address, FailureStatus const& status) = 0;
// Returns when the known status of endpoint is next equal to status. Returns immediately
// if appropriate.
Future<Void> onStateEqual( Endpoint const& endpoint, FailureStatus status );
Future<Void> onStateEqual(Endpoint const& endpoint, FailureStatus status);
// Returns when the status of the given endpoint is next considered "failed"
Future<Void> onFailed( Endpoint const& endpoint ) {
return onStateEqual( endpoint, FailureStatus() );
}
Future<Void> onFailed(Endpoint const& endpoint) { return onStateEqual(endpoint, FailureStatus()); }
// Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration + (elapsedTime*sustainedFailureSlope)
Future<Void> onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double sustainedFailureSlope = 0.0 );
// Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration +
// (elapsedTime*sustainedFailureSlope)
Future<Void> onFailedFor(Endpoint const& endpoint, double sustainedFailureDuration,
double sustainedFailureSlope = 0.0);
// Returns the failure monitor that the calling machine should use
static IFailureMonitor& failureMonitor() {
@ -137,22 +137,23 @@ public:
class SimpleFailureMonitor : public IFailureMonitor {
public:
SimpleFailureMonitor() : endpointKnownFailed() { }
void setStatus( NetworkAddress const& address, FailureStatus const& status );
void endpointNotFound( Endpoint const& );
virtual void notifyDisconnect( NetworkAddress const& );
SimpleFailureMonitor() : endpointKnownFailed() {}
void setStatus(NetworkAddress const& address, FailureStatus const& status);
void endpointNotFound(Endpoint const&);
virtual void notifyDisconnect(NetworkAddress const&);
virtual Future<Void> onStateChanged( Endpoint const& endpoint );
virtual FailureStatus getState( Endpoint const& endpoint );
virtual FailureStatus getState( NetworkAddress const& address );
virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint );
virtual bool onlyEndpointFailed( Endpoint const& endpoint );
virtual bool permanentlyFailed( Endpoint const& endpoint );
virtual Future<Void> onStateChanged(Endpoint const& endpoint);
virtual FailureStatus getState(Endpoint const& endpoint);
virtual FailureStatus getState(NetworkAddress const& address);
virtual Future<Void> onDisconnectOrFailure(Endpoint const& endpoint);
virtual bool onlyEndpointFailed(Endpoint const& endpoint);
virtual bool permanentlyFailed(Endpoint const& endpoint);
void reset();
private:
std::unordered_map< NetworkAddress, FailureStatus > addressStatus;
YieldedAsyncMap< Endpoint, bool > endpointKnownFailed;
std::unordered_map<NetworkAddress, FailureStatus> addressStatus;
YieldedAsyncMap<Endpoint, bool> endpointKnownFailed;
friend class OnStateChangedActorActor;
};

View File

@ -236,6 +236,7 @@ struct YieldMockNetwork : INetwork, ReferenceCounted<YieldMockNetwork> {
virtual double now() { return baseNetwork->now(); }
virtual double timer() { return baseNetwork->timer(); }
virtual void stop() { return baseNetwork->stop(); }
virtual void addStopCallback( std::function<void()> fn ) { ASSERT(false); return; }
virtual bool isSimulated() const { return baseNetwork->isSimulated(); }
virtual void onMainThread(Promise<Void>&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); }
bool isOnMainThread() const override { return baseNetwork->isOnMainThread(); }

View File

@ -28,6 +28,7 @@
#include "flow/crc32c.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/HealthMonitor.h"
#include "fdbrpc/genericactors.actor.h"
#include "fdbrpc/simulator.h"
#include "flow/ActorCollection.h"
@ -189,6 +190,7 @@ public:
std::vector<Future<Void>> listeners;
std::unordered_map<NetworkAddress, Reference<struct Peer>> peers;
std::unordered_map<NetworkAddress, std::pair<double, double>> closedPeers;
HealthMonitor healthMonitor;
Reference<AsyncVar<bool>> degraded;
bool warnAlwaysForLargePacket;
@ -206,6 +208,7 @@ public:
Int64MetricHandle countConnClosedWithoutError;
std::map<NetworkAddress, std::pair<uint64_t, double>> incompatiblePeers;
AsyncTrigger incompatiblePeersChanged;
uint32_t numIncompatibleConnections;
std::map<uint64_t, double> multiVersionConnections;
double lastIncompatibleMessage;
@ -295,7 +298,7 @@ static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, IS
ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) {
state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET);
loop {
if (!FlowTransport::transport().isClient() && !peer->destination.isPublic() && peer->compatible) {
if (!FlowTransport::isClient() && !peer->destination.isPublic() && peer->compatible) {
// Don't send ping messages to clients unless necessary. Instead monitor incoming client pings.
// We ignore this block for incompatible clients because pings from server would trigger the
// peer->resetPing and prevent 'connection_failed' due to ping timeout.
@ -324,7 +327,7 @@ ACTOR Future<Void> connectionMonitor( Reference<Peer> peer ) {
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) {
// TODO: What about when peerReference == -1?
throw connection_unreferenced();
} else if (FlowTransport::transport().isClient() && peer->compatible && peer->destination.isPublic() &&
} else if (FlowTransport::isClient() && peer->compatible && peer->destination.isPublic() &&
(peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) &&
(peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) {
// First condition is necessary because we may get here if we are server.
@ -396,80 +399,133 @@ ACTOR Future<Void> connectionWriter( Reference<Peer> self, Reference<IConnection
}
}
ACTOR Future<Void> delayedHealthUpdate(NetworkAddress address) {
state double start = now();
state bool delayed = false;
loop {
if (FLOW_KNOBS->HEALTH_MONITOR_MARK_FAILED_UNSTABLE_CONNECTIONS &&
FlowTransport::transport().healthMonitor()->tooManyConnectionsClosed(address) && address.isPublic()) {
if (!delayed) {
TraceEvent("TooManyConnectionsClosedMarkFailed")
.detail("Dest", address)
.detail("StartTime", start)
.detail("ClosedCount", FlowTransport::transport().healthMonitor()->closedConnectionsCount(address));
IFailureMonitor::failureMonitor().setStatus(address, FailureStatus(true));
}
delayed = true;
wait(delayJittered(FLOW_KNOBS->MAX_RECONNECTION_TIME * 2.0));
} else {
if (delayed) {
TraceEvent("TooManyConnectionsClosedMarkAvailable")
.detail("Dest", address)
.detail("StartTime", start)
.detail("TimeElapsed", now() - start)
.detail("ClosedCount", FlowTransport::transport().healthMonitor()->closedConnectionsCount(address));
}
IFailureMonitor::failureMonitor().setStatus(address, FailureStatus(false));
break;
}
}
return Void();
}
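// In short: delayedHealthUpdate keeps a peer marked failed for as long as the
// HealthMonitor still reports too many recently closed connections, re-checking
// roughly every 2 * FLOW_KNOBS->MAX_RECONNECTION_TIME (jittered). On the first
// iteration where the closed-connection window has drained (or if the threshold
// was never exceeded), it marks the address available again and returns.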
ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
Reference<IConnection> conn = Reference<IConnection>(),
Future<Void> reader = Void()) {
TraceEvent(SevDebug, "ConnectionKeeper", conn ? conn->getDebugID() : UID())
.detail("PeerAddr", self->destination)
.detail("ConnSet", (bool)conn);
ASSERT_WE_THINK(FlowTransport::transport().getLocalAddress() != self->destination);
state Optional<double> firstConnFailedTime = Optional<double>();
state int retryConnect = false;
loop {
try {
state Future<Void> delayedHealthUpdateF = Future<Void>();
if (!conn) { // Always, except for the first loop with an incoming connection
self->outgoingConnectionIdle = true;
// Wait until there is something to send.
while (self->unsent.empty()) {
if (self->destination.isPublic() &&
IFailureMonitor::failureMonitor().getState(self->destination).isFailed()) {
break;
// Override waiting if we are in the failed state, so that the failure monitoring status gets updated.
Future<Void> retryConnectF = Never();
if (retryConnect) {
retryConnectF = IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()
? delay(FLOW_KNOBS->FAILURE_DETECTION_DELAY)
: delay(FLOW_KNOBS->SERVER_REQUEST_INTERVAL);
}
wait (self->dataToSend.onTrigger());
choose {
when(wait(self->dataToSend.onTrigger())) {}
when(wait(retryConnectF)) { break; }
}
}
ASSERT( self->destination.isPublic() );
ASSERT(self->destination.isPublic());
self->outgoingConnectionIdle = false;
wait(delayJittered(
std::max(0.0, self->lastConnectTime + self->reconnectionDelay -
now()))); // Don't connect() to the same peer more than once per 2 sec
self->lastConnectTime = now();
TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID()).suppressFor(1.0).detail("PeerAddr", self->destination);
TraceEvent("ConnectingTo", conn ? conn->getDebugID() : UID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination)
.detail("PeerReferences", self->peerReferences)
.detail("FailureStatus", IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()
? "OK"
: "FAILED");
try {
choose {
when( Reference<IConnection> _conn = wait( INetworkConnections::net()->connect(self->destination) ) ) {
when(Reference<IConnection> _conn =
wait(INetworkConnections::net()->connect(self->destination))) {
conn = _conn;
wait(conn->connectHandshake());
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false));
if (self->unsent.empty()) {
conn->close();
conn = Reference<IConnection>();
continue;
} else {
TraceEvent("ConnectionExchangingConnectPacket", conn->getDebugID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
self->prependConnectPacket();
delayedHealthUpdateF = delayedHealthUpdate(self->destination);
choose {
when(wait(delayedHealthUpdateF)) {
conn->close();
conn = Reference<IConnection>();
retryConnect = false;
continue;
}
when(wait(self->dataToSend.onTrigger())) {}
}
}
reader = connectionReader( self->transport, conn, self, Promise<Reference<Peer>>());
TraceEvent("ConnectionExchangingConnectPacket", conn->getDebugID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
self->prependConnectPacket();
reader = connectionReader(self->transport, conn, self, Promise<Reference<Peer>>());
}
when( wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) ) ) {
throw connection_failed();
}
}
} catch( Error &e ) {
if(e.code() != error_code_connection_failed) {
} catch (Error& e) {
if (e.code() != error_code_connection_failed) {
throw;
}
TraceEvent("ConnectionTimedOut", conn ? conn->getDebugID() : UID())
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
IFailureMonitor::failureMonitor().setStatus(
self->destination, FailureStatus(e.code() == error_code_connection_failed));
throw;
}
} else {
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(false));
self->outgoingConnectionIdle = false;
}
firstConnFailedTime.reset();
try {
self->transport->countConnEstablished++;
wait( connectionWriter( self, conn ) || reader || connectionMonitor(self) );
if (!delayedHealthUpdateF.isValid())
delayedHealthUpdateF = delayedHealthUpdate(self->destination);
wait(connectionWriter(self, conn) || reader || connectionMonitor(self));
} catch (Error& e) {
if (e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled ||
e.code() == error_code_connection_unreferenced ||
@ -483,6 +539,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
ASSERT( false );
} catch (Error& e) {
delayedHealthUpdateF.cancel();
if(now() - self->lastConnectTime > FLOW_KNOBS->RECONNECTION_RESET_TIME) {
self->reconnectionDelay = FLOW_KNOBS->INITIAL_RECONNECTION_TIME;
} else {
@ -499,6 +556,18 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
firstConnFailedTime = now();
}
// Don't immediately mark the connection as failed. To stay close to the earlier behaviour of centralized
// failure monitoring, wait until the connection has stayed failed for the FLOW_KNOBS->FAILURE_DETECTION_DELAY timeout.
retryConnect = self->destination.isPublic() && e.code() == error_code_connection_failed;
if (e.code() == error_code_connection_failed) {
if (!self->destination.isPublic()) {
// Can't connect back to non-public addresses.
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(true));
} else if (now() - firstConnFailedTime.get() > FLOW_KNOBS->FAILURE_DETECTION_DELAY) {
IFailureMonitor::failureMonitor().setStatus(self->destination, FailureStatus(true));
}
}
self->discardUnreliablePackets();
reader = Future<Void>();
bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled ||
@ -521,7 +590,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
if(self->destination.isPublic()
&& IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()
&& !FlowTransport::transport().isClient())
&& !FlowTransport::isClient())
{
auto& it = self->transport->closedPeers[self->destination];
if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) {
@ -536,6 +605,10 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
}
if (conn) {
if (self->destination.isPublic() && e.code() == error_code_connection_failed) {
FlowTransport::transport().healthMonitor()->reportPeerClosed(self->destination);
}
conn->close();
conn = Reference<IConnection>();
}
@ -556,6 +629,14 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
}
}
Peer::Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {
IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false));
}
void Peer::send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) {
unsent.setWriteBuffer(pb);
if (rp) reliable.insert(rp);
@ -662,6 +743,9 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader
} catch (Error& e) {
g_currentDeliveryPeerAddress = {NetworkAddress()};
TraceEvent(SevError, "ReceiverError").error(e).detail("Token", destination.token.toString()).detail("Peer", destination.getPrimaryAddress());
if(!FlowTransport::isClient()) {
flushAndExit(FDB_EXIT_ERROR);
}
throw;
}
} else if (destination.token.first() & TOKEN_STREAM_FLAG) {
@ -1023,7 +1107,7 @@ Reference<Peer> TransportData::getOrOpenPeer( NetworkAddress const& address, boo
auto peer = getPeer(address);
if(!peer) {
peer = Reference<Peer>( new Peer(this, address) );
if(startConnectionKeeper) {
if(startConnectionKeeper && !isLocalAddress(address)) {
peer->connect = connectionKeeper(peer);
}
peers[address] = peer;
@ -1039,10 +1123,14 @@ bool TransportData::isLocalAddress(const NetworkAddress& address) const {
ACTOR static Future<Void> multiVersionCleanupWorker( TransportData* self ) {
loop {
wait(delay(FLOW_KNOBS->CONNECTION_CLEANUP_DELAY));
bool foundIncompatible = false;
for(auto it = self->incompatiblePeers.begin(); it != self->incompatiblePeers.end();) {
if( self->multiVersionConnections.count(it->second.first) ) {
it = self->incompatiblePeers.erase(it);
} else {
if( now() - it->second.second > FLOW_KNOBS->INCOMPATIBLE_PEER_DELAY_BEFORE_LOGGING ) {
foundIncompatible = true;
}
it++;
}
}
@ -1054,6 +1142,10 @@ ACTOR static Future<Void> multiVersionCleanupWorker( TransportData* self ) {
it++;
}
}
if(foundIncompatible) {
self->incompatiblePeersChanged.trigger();
}
}
}
@ -1084,6 +1176,10 @@ std::map<NetworkAddress, std::pair<uint64_t, double>>* FlowTransport::getIncompa
return &self->incompatiblePeers;
}
Future<Void> FlowTransport::onIncompatibleChanged() {
return self->incompatiblePeersChanged.onTrigger();
}
Future<Void> FlowTransport::bind( NetworkAddress publicAddress, NetworkAddress listenAddress ) {
ASSERT( publicAddress.isPublic() );
if(self->localAddresses.address == NetworkAddress()) {
@ -1107,9 +1203,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
return;
Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());
if(peer->peerReferences == -1) {
IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false));
if (peer->peerReferences == -1) {
peer->peerReferences = 1;
} else {
peer->peerReferences++;
@ -1173,7 +1267,8 @@ static void sendLocal( TransportData* self, ISerializeSource const& what, const
deliver(self, destination, ArenaReader(copy.arena(), copy, AssumeVersion(currentProtocolVersion)), false);
}
static ReliablePacket* sendPacket( TransportData* self, Reference<Peer> peer, ISerializeSource const& what, const Endpoint& destination, bool reliable ) {
static ReliablePacket* sendPacket(TransportData* self, Reference<Peer> peer, ISerializeSource const& what,
const Endpoint& destination, bool reliable) {
const bool checksumEnabled = !destination.getPrimaryAddress().isTLS();
++self->countPacketsGenerated;
@ -1315,4 +1410,15 @@ void FlowTransport::createInstance(bool isClient, uint64_t transportId) {
g_network->setGlobal(INetwork::enFlowTransport, (flowGlobalType) new FlowTransport(transportId));
g_network->setGlobal(INetwork::enNetworkAddressFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddress);
g_network->setGlobal(INetwork::enNetworkAddressesFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddresses);
// Mark ourselves as available in FailureMonitor
const auto& localAddresses = FlowTransport::transport().getLocalAddresses();
IFailureMonitor::failureMonitor().setStatus(localAddresses.address, FailureStatus(false));
if (localAddresses.secondaryAddress.present()) {
IFailureMonitor::failureMonitor().setStatus(localAddresses.secondaryAddress.get(), FailureStatus(false));
}
}
HealthMonitor* FlowTransport::healthMonitor() {
return &self->healthMonitor;
}

View File

@ -23,6 +23,7 @@
#pragma once
#include <algorithm>
#include "fdbrpc/HealthMonitor.h"
#include "flow/genericactors.actor.h"
#include "flow/network.h"
#include "flow/FileIdentifier.h"
@ -44,7 +45,9 @@ public:
}
void choosePrimaryAddress() {
if(addresses.secondaryAddress.present() && !g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) {
if(addresses.secondaryAddress.present() &&
((!g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) ||
(g_network->getLocalAddresses().secondaryAddress.present() && !addresses.address.isTLS()))) {
std::swap(addresses.address, addresses.secondaryAddress.get());
}
}
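// In effect (assuming a dual-address peer advertises one TLS and one non-TLS address):
// if this process itself listens on both, prefer the peer's TLS address as primary;
// if it listens on only one, prefer the peer address whose TLS-ness matches the local
// listening address.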
@ -58,6 +61,10 @@ public:
return addresses.address;
}
NetworkAddress getStableAddress() const {
return addresses.getTLSAddress();
}
bool operator == (Endpoint const& r) const {
return getPrimaryAddress() == r.getPrimaryAddress() && token == r.token;
}
@ -123,10 +130,7 @@ struct Peer : public ReferenceCounted<Peer> {
double lastDataPacketSentTime;
int outstandingReplies;
explicit Peer(TransportData* transport, NetworkAddress const& destination)
: transport(transport), destination(destination), outgoingConnectionIdle(true), lastConnectTime(0.0),
reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), outstandingReplies(0),
incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) {}
explicit Peer(TransportData* transport, NetworkAddress const& destination);
void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent);
@ -164,6 +168,9 @@ public:
std::map<NetworkAddress, std::pair<uint64_t, double>>* getIncompatiblePeers();
// Returns the set of all peers that have attempted to connect, but have incompatible protocol versions
Future<Void> onIncompatibleChanged();
// Returns when getIncompatiblePeers has at least one peer which is incompatible.
void addPeerReference(const Endpoint&, bool isStream);
// Signal that a peer connection is being used, even if no messages are currently being sent to the peer
@ -205,6 +212,8 @@ public:
Endpoint loadedEndpoint(const UID& token);
HealthMonitor* healthMonitor();
private:
class TransportData* self;
};

View File

@ -0,0 +1,51 @@
/*
* HealthMonitor.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/FlowTransport.h"
#include "fdbrpc/HealthMonitor.h"
void HealthMonitor::reportPeerClosed(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
peerClosedHistory.push_back(std::make_pair(now(), peerAddress));
peerClosedNum[peerAddress] += 1;
}
void HealthMonitor::purgeOutdatedHistory() {
// Pop history entries (oldest first) that have fallen out of the monitoring window.
// Use an explicit loop rather than a range-for, since pop_front() invalidates the
// iterator to the element being visited.
while (!peerClosedHistory.empty()) {
const auto& entry = peerClosedHistory.front();
if (entry.first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) {
peerClosedNum[entry.second] -= 1;
ASSERT(peerClosedNum[entry.second] >= 0);
peerClosedHistory.pop_front();
} else {
break;
}
}
}
bool HealthMonitor::tooManyConnectionsClosed(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
return peerClosedNum[peerAddress] > FLOW_KNOBS->HEALTH_MONITOR_CONNECTION_MAX_CLOSED;
}
int HealthMonitor::closedConnectionsCount(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
return peerClosedNum[peerAddress];
}
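// A minimal usage sketch of the sliding-window accounting above, assuming
// hypothetical knob values; NetworkAddress::parse and ASSERT come from flow.
void healthMonitorUsageSketch() {
// Suppose HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS = 30 (window length, seconds)
// and HEALTH_MONITOR_CONNECTION_MAX_CLOSED = 5 (threshold).
HealthMonitor hm;
NetworkAddress peer = NetworkAddress::parse("10.0.0.1:4500");
for (int i = 0; i < 6; i++) {
hm.reportPeerClosed(peer); // each closed connection is timestamped with now()
}
ASSERT(hm.closedConnectionsCount(peer) == 6);
ASSERT(hm.tooManyConnectionsClosed(peer)); // 6 > 5 within the 30-second window
// Entries older than the window are purged on the next report or query, so the
// predicate becomes false again once closes stop for long enough.
}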

fdbrpc/HealthMonitor.h Normal file
View File

@ -0,0 +1,41 @@
/*
* HealthMonitor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBRPC_HEALTH_MONITOR_H
#define FDBRPC_HEALTH_MONITOR_H
#include <deque>
#include <unordered_map>
#include <flow/flow.h>
class HealthMonitor {
public:
void reportPeerClosed(const NetworkAddress& peerAddress);
bool tooManyConnectionsClosed(const NetworkAddress& peerAddress);
int closedConnectionsCount(const NetworkAddress& peerAddress);
private:
void purgeOutdatedHistory();
std::deque<std::pair<double, NetworkAddress>> peerClosedHistory;
std::unordered_map<NetworkAddress, int> peerClosedNum;
};
#endif // FDBRPC_HEALTH_MONITOR_H

View File

@ -73,8 +73,8 @@ struct LoadBalancedReply {
LoadBalancedReply() : penalty(1.0) {}
};
Optional<LoadBalancedReply> getLoadBalancedReply(LoadBalancedReply *reply);
Optional<LoadBalancedReply> getLoadBalancedReply(void*);
Optional<LoadBalancedReply> getLoadBalancedReply(const LoadBalancedReply *reply);
Optional<LoadBalancedReply> getLoadBalancedReply(const void*);
// Returns true if we got a value for our request
// Throws an error if the request returned an error that should bubble out
@ -455,6 +455,103 @@ Future< REPLY_TYPE(Request) > loadBalance(
}
}
// Subclasses must initialize all members in their default constructors
// Subclasses must serialize all members
struct BasicLoadBalancedReply {
int recentRequests;
BasicLoadBalancedReply() : recentRequests(0) {}
};
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const BasicLoadBalancedReply *reply);
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const void*);
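// A short sketch of the overload dispatch above, using illustrative reply types
// that are not part of this change: a reply type deriving from
// BasicLoadBalancedReply binds to the typed overload (derived-to-base conversion
// is preferred over conversion to const void*), while any other reply type falls
// through to the void* overload and yields an empty Optional.
struct ExampleBalancedReply : BasicLoadBalancedReply { int value = 0; };
struct ExamplePlainReply { int value = 0; };
inline void basicLoadBalancedReplyDispatchSketch() {
ExampleBalancedReply a;
ExamplePlainReply b;
ASSERT(getBasicLoadBalancedReply(&a).present()); // typed overload selected
ASSERT(!getBasicLoadBalancedReply(&b).present()); // const void* overload selected
}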
// A simpler version of LoadBalance that does not send second requests, for use where the list of servers is always fresh
ACTOR template <class Interface, class Request, class Multi>
Future< REPLY_TYPE(Request) > basicLoadBalance(
Reference<ModelInterface<Multi>> alternatives,
RequestStream<Request> Interface::* channel,
Request request = Request(),
TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
bool atMostOnce = false)
{
setReplyPriority(request, taskID);
if (!alternatives)
return Never();
ASSERT( alternatives->size() && alternatives->alwaysFresh() );
state int bestAlt = alternatives->getBest();
state int nextAlt = deterministicRandom()->randomInt(0, std::max(alternatives->size() - 1,1));
if( nextAlt >= bestAlt )
nextAlt++;
state int startAlt = nextAlt;
state int startDistance = (bestAlt+alternatives->size()-startAlt) % alternatives->size();
state int numAttempts = 0;
state double backoff = 0;
state int useAlt;
loop {
// Find an alternative, if any, that is not failed, starting with nextAlt
state RequestStream<Request> const* stream = NULL;
for(int alternativeNum=0; alternativeNum<alternatives->size(); alternativeNum++) {
useAlt = nextAlt;
if( nextAlt == startAlt )
useAlt = bestAlt;
else if( (nextAlt+alternatives->size()-startAlt) % alternatives->size() <= startDistance )
useAlt = (nextAlt+alternatives->size()-1) % alternatives->size();
stream = &alternatives->get( useAlt, channel );
if (!IFailureMonitor::failureMonitor().getState( stream->getEndpoint() ).failed)
break;
nextAlt = (nextAlt+1) % alternatives->size();
stream=NULL;
}
if(!stream) {
// Everything is down! Wait for someone to be up.
vector<Future<Void>> ok( alternatives->size() );
for(int i=0; i<ok.size(); i++) {
ok[i] = IFailureMonitor::failureMonitor().onStateEqual( alternatives->get(i, channel).getEndpoint(), FailureStatus(false) );
}
wait( quorum( ok, 1 ) );
numAttempts = 0; // now that we've got a server back, reset the backoff
} else {
if(backoff > 0.0) {
wait(delay(backoff));
}
ErrorOr<REPLY_TYPE(Request)> result = wait(stream->tryGetReply(request));
if(result.present()) {
Optional<BasicLoadBalancedReply> loadBalancedReply = getBasicLoadBalancedReply(&result.get());
if(loadBalancedReply.present()) {
alternatives->updateRecent( useAlt, loadBalancedReply.get().recentRequests );
}
return result.get();
}
if(result.getError().code() != error_code_broken_promise && result.getError().code() != error_code_request_maybe_delivered) {
throw result.getError();
}
if(atMostOnce) {
throw request_maybe_delivered();
}
if(++numAttempts >= alternatives->size()) {
backoff = std::min(FLOW_KNOBS->LOAD_BALANCE_MAX_BACKOFF, std::max(FLOW_KNOBS->LOAD_BALANCE_START_BACKOFF, backoff * FLOW_KNOBS->LOAD_BALANCE_BACKOFF_RATE));
}
}
nextAlt = (nextAlt+1) % alternatives->size();
resetReply(request, taskID);
}
}
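// Backoff behaviour of basicLoadBalance, with hypothetical knob values for
// illustration: the first pass over the alternatives retries immediately; once
// numAttempts reaches alternatives->size(), each further failed attempt waits
// `backoff` and then grows it as
// backoff = min(LOAD_BALANCE_MAX_BACKOFF, max(LOAD_BALANCE_START_BACKOFF, backoff * LOAD_BALANCE_BACKOFF_RATE));
// e.g. with START_BACKOFF = 0.01, BACKOFF_RATE = 5 and MAX_BACKOFF = 1.0 the
// successive waits are 0.01, 0.05, 0.25, 1.0, 1.0, ... seconds.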
#include "flow/unactorcompiler.h"
#endif

View File

@ -63,50 +63,111 @@ struct ReferencedInterface : public ReferenceCounted<ReferencedInterface<T>> {
};
template <class T>
class MultiInterface : public ReferenceCounted<MultiInterface<T>> {
struct AlternativeInfo {
T interf;
double probability;
double cumulativeProbability;
int recentRequests;
double lastUpdate;
AlternativeInfo(T const& interf, double probability, double cumulativeProbability)
  : interf(interf), probability(probability), cumulativeProbability(cumulativeProbability), recentRequests(-1),
    lastUpdate(0) {}
bool operator < (double const& r) const {
return cumulativeProbability < r;
}
bool operator <= (double const& r) const {
return cumulativeProbability <= r;
}
bool operator == (double const& r) const {
return cumulativeProbability == r;
}
};
template <class T>
class ModelInterface : public ReferenceCounted<ModelInterface<T>> {
public:
MultiInterface( const std::vector<T>& v, LocalityData const& locality = LocalityData() ) : bestCount(0) {
for(int i=0; i<v.size(); i++)
alternatives.push_back(KVPair<int,T>(LBDistance::DISTANT,v[i]));
deterministicRandom()->randomShuffle(alternatives);
if ( LBLocalityData<T>::Present ) {
for(int a=0; a<alternatives.size(); a++)
alternatives[a].k = loadBalanceDistance( locality, LBLocalityData<T>::getLocality( alternatives[a].v ), LBLocalityData<T>::getAddress( alternatives[a].v ) );
std::stable_sort( alternatives.begin(), alternatives.end() );
ModelInterface( const vector<T>& v ) {
for(int i = 0; i < v.size(); i++) {
alternatives.push_back(AlternativeInfo(v[i], 1.0/v.size(), (i+1.0)/v.size()));
}
if(v.size()) {
updater = recurring([this](){ updateProbabilities(); }, FLOW_KNOBS->BASIC_LOAD_BALANCE_UPDATE_RATE);
}
if(size())
bestCount = std::lower_bound( alternatives.begin()+1, alternatives.end(), alternatives[0].k+1 ) - alternatives.begin();
}
int size() const { return alternatives.size(); }
int countBest() const {
return bestCount;
}
LBDistance::Type bestDistance() const {
if( !size() )
return LBDistance::DISTANT;
return (LBDistance::Type) alternatives[0].k;
}
bool alwaysFresh() const {
return LBLocalityData<T>::alwaysFresh();
}
template <class F>
F const& get( int index, F T::*member ) const {
return alternatives[index].v.*member;
int getBest() const {
return std::lower_bound( alternatives.begin(), alternatives.end(), deterministicRandom()->random01() ) - alternatives.begin();
}
T const& getInterface(int index) { return alternatives[index].v; }
UID getId( int index ) const { return alternatives[index].v.id(); }
void updateRecent( int index, int recentRequests ) {
alternatives[index].recentRequests = recentRequests;
alternatives[index].lastUpdate = now();
}
virtual ~MultiInterface() {}
void updateProbabilities() {
double totalRequests = 0;
for(auto& it : alternatives) {
totalRequests += it.recentRequests;
if(now() - it.lastUpdate > FLOW_KNOBS->BASIC_LOAD_BALANCE_UPDATE_RATE/2.0) {
return;
}
}
if(totalRequests < 1000) {
return;
}
double totalProbability = 0;
for(auto& it : alternatives) {
it.probability += (1.0/alternatives.size()-(it.recentRequests/totalRequests))*FLOW_KNOBS->BASIC_LOAD_BALANCE_MAX_CHANGE;
it.probability = std::max(it.probability, 1/(FLOW_KNOBS->BASIC_LOAD_BALANCE_MAX_PROB*alternatives.size()));
it.probability = std::min(it.probability, FLOW_KNOBS->BASIC_LOAD_BALANCE_MAX_PROB/alternatives.size());
totalProbability += it.probability;
}
for(auto& it : alternatives) {
it.probability = it.probability/totalProbability;
}
totalProbability = 0;
for(auto& it : alternatives) {
totalProbability += it.probability;
it.cumulativeProbability = totalProbability;
}
alternatives.back().cumulativeProbability = 1.0;
}
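// Worked example with illustrative numbers (the real knob values may differ):
// two alternatives, recentRequests = {750, 250}, totalRequests = 1000, and
// BASIC_LOAD_BALANCE_MAX_CHANGE = 0.10:
// alt0: p += (0.5 - 0.75) * 0.10 = -0.025 -> 0.500 becomes 0.475
// alt1: p += (0.5 - 0.25) * 0.10 = +0.025 -> 0.500 becomes 0.525
// After clamping and renormalization, cumulativeProbability = {0.475, 1.0}, so
// getBest() (a lower_bound over cumulativeProbability with random01()) now picks
// the busier server slightly less often.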
template <class F>
F const& get( int index, F T::*member ) const {
return alternatives[index].interf.*member;
}
T const& getInterface(int index) { return alternatives[index].interf; }
UID getId( int index ) const { return alternatives[index].interf.id(); }
virtual ~ModelInterface() {}
std::string description() {
return describe( alternatives );
}
private:
std::vector<KVPair<int,T>> alternatives;
int16_t bestCount;
vector<AlternativeInfo<T>> alternatives;
Future<Void> updater;
};
template <class T>
class MultiInterface : public ReferenceCounted<MultiInterface<T>> {
MultiInterface( const vector<T>& v, LocalityData const& locality = LocalityData() ) {
// This version of MultiInterface is no longer used, but was kept around because of templating
ASSERT(false);
}
virtual ~MultiInterface() {}
};
template <class T>
@ -172,5 +233,6 @@ private:
};
template <class Ar, class T> void load(Ar& ar, Reference<MultiInterface<T>>&) { ASSERT(false); } //< required for Future<T>
template <class Ar, class T> void load(Ar& ar, Reference<ModelInterface<T>>&) { ASSERT(false); } //< required for Future<T>
#endif

View File

@ -93,7 +93,8 @@ Net2FileSystem::Net2FileSystem(double ioTimeout, std::string fileSystemPath)
{
Net2AsyncFile::init();
#ifdef __linux__
AsyncFileKAIO::init( Reference<IEventFD>(N2::ASIOReactor::getEventFD()), ioTimeout );
if (!FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO)
AsyncFileKAIO::init( Reference<IEventFD>(N2::ASIOReactor::getEventFD()), ioTimeout );
if (fileSystemPath.empty()) {
checkFileSystem = false;
@ -114,3 +115,7 @@ Net2FileSystem::Net2FileSystem(double ioTimeout, std::string fileSystemPath)
}
#endif
}
void Net2FileSystem::stop() {
Net2AsyncFile::stop();
}

View File

@ -36,6 +36,7 @@ public:
virtual Future< std::time_t > lastWriteTime( std::string filename );
//void init();
static void stop();
Net2FileSystem(double ioTimeout=0.0, std::string fileSystemPath = "");

View File

@ -57,14 +57,22 @@ double QueueModel::addRequest( uint64_t id ) {
return d.penalty;
}
Optional<LoadBalancedReply> getLoadBalancedReply(LoadBalancedReply *reply) {
Optional<LoadBalancedReply> getLoadBalancedReply(const LoadBalancedReply *reply) {
return *reply;
}
Optional<LoadBalancedReply> getLoadBalancedReply(void*) {
Optional<LoadBalancedReply> getLoadBalancedReply(const void*) {
return Optional<LoadBalancedReply>();
}
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const BasicLoadBalancedReply *reply) {
return *reply;
}
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const void*) {
return Optional<BasicLoadBalancedReply>();
}
/*
void QueueModel::addMeasurement( uint64_t id, QueueDetails qd ){
if (data[new_index].count(id))

View File

@ -112,6 +112,7 @@ public:
Val const& operator[]( const Key& k ) { return rangeContaining(k).value(); }
Ranges ranges() { return Ranges( Iterator(map.begin()), Iterator(map.lastItem()) ); }
// intersectingRanges returns [begin, end] where begin <= r.begin and end >= r.end
Ranges intersectingRanges( const Range& r ) { return Ranges(rangeContaining(r.begin), Iterator(map.lower_bound(r.end))); }
// containedRanges() will return all ranges that are fully contained by the passed range (note that a range fully contains itself)
Ranges containedRanges( const Range& r ) {

fdbrpc/libeio/config.h.FreeBSD Executable file
View File

@ -0,0 +1,142 @@
/* config.h. Generated from config.h.in by configure. */
/* config.h.in. Generated from configure.ac by autoheader. */
/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1
/* fdatasync(2) is available */
#define HAVE_FDATASYNC 1
/* futimes(2) is available */
#define HAVE_FUTIMES 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* fallocate(2) is available */
/* #undef HAVE_LINUX_FALLOCATE */
/* Define to 1 if you have the <linux/fiemap.h> header file. */
/* #undef HAVE_LINUX_FIEMAP_H */
/* Define to 1 if you have the <linux/fs.h> header file. */
/* #undef HAVE_LINUX_FS_H */
/* splice/vmsplice/tee(2) are available */
/* #undef HAVE_LINUX_SPLICE */
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
/* posix_fadvise(2) is available */
#define HAVE_POSIX_FADVISE 1
/* posix_madvise(2) is available */
#define HAVE_POSIX_MADVISE 1
/* prctl(PR_SET_NAME) is available */
/* #undef HAVE_PRCTL_SET_NAME */
/* readahead(2) is available (linux) */
/* #undef HAVE_READAHEAD */
/* sendfile(2) is available and supported */
#define HAVE_SENDFILE 1
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1
/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1
/* sync_file_range(2) is available */
/* #undef HAVE_SYNC_FILE_RANGE */
/* Define to 1 if you have the <sys/prctl.h> header file. */
/* #undef HAVE_SYS_PRCTL_H */
/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1
/* syscall(__NR_syncfs) is available */
/* #undef HAVE_SYS_SYNCFS */
/* Define to 1 if you have the <sys/syscall.h> header file. */
#define HAVE_SYS_SYSCALL_H 1
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1
/* utimes(2) is available */
#define HAVE_UTIMES 1
/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"
/* Name of package */
#define PACKAGE "libeio"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT ""
/* Define to the full name of this package. */
#define PACKAGE_NAME ""
/* Define to the full name and version of this package. */
#define PACKAGE_STRING ""
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME ""
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION ""
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
/* Enable extensions on AIX 3, Interix. */
#ifndef _ALL_SOURCE
# define _ALL_SOURCE 1
#endif
/* Enable GNU extensions on systems that have them. */
#ifndef _GNU_SOURCE
# define _GNU_SOURCE 1
#endif
/* Enable threading extensions on Solaris. */
#ifndef _POSIX_PTHREAD_SEMANTICS
# define _POSIX_PTHREAD_SEMANTICS 1
#endif
/* Enable extensions on HP NonStop. */
#ifndef _TANDEM_SOURCE
# define _TANDEM_SOURCE 1
#endif
/* Enable general extensions on Solaris. */
#ifndef __EXTENSIONS__
# define __EXTENSIONS__ 1
#endif
/* Version number of package */
#define VERSION "1.0"
/* Define to 1 if on MINIX. */
/* #undef _MINIX */
/* Define to 2 if the system does not provide POSIX.1 features except with
this defined. */
/* #undef _POSIX_1_SOURCE */
/* Define to 1 if you need to in order for `stat' and other things to work. */
/* #undef _POSIX_SOURCE */

View File

@ -39,6 +39,8 @@
#ifdef __linux__
#include "config.h.linux"
#elif defined(__FreeBSD__)
#include "config.h.FreeBSD"
#elif defined(__APPLE__)
#include "config.h.osx"
#endif

View File

@ -871,7 +871,12 @@ public:
return emptyConfig;
}
virtual void stop() { isStopped = true; }
virtual void stop() {
isStopped = true;
}
virtual void addStopCallback( std::function<void()> fn ) {
stopCallbacks.emplace_back(std::move(fn));
}
virtual bool isSimulated() const { return true; }
struct SimThreadArgs {
@ -995,6 +1000,9 @@ public:
}
self->currentProcess = callingMachine;
self->net2->stop();
for ( auto& fn : self->stopCallbacks ) {
fn();
}
return Void();
}
@ -1615,6 +1623,7 @@ public:
// Not letting currentProcess be NULL eliminates some annoying special cases
currentProcess = new ProcessInfo("NoMachine", LocalityData(Optional<Standalone<StringRef>>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", "");
g_network = net2 = newNet2(TLSConfig(), false, true);
g_network->addStopCallback( Net2FileSystem::stop );
Net2FileSystem::newFileSystem();
check_yield(TaskPriority::Zero);
}
@ -1713,6 +1722,8 @@ public:
//tasks is guarded by ISimulator::mutex
std::priority_queue<Task, std::vector<Task>> tasks;
std::vector<std::function<void()>> stopCallbacks;
//Sim2Net network;
INetwork *net2;

View File

@ -144,7 +144,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
{
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
cachedRangeInfo[k] = privatized;
}
if(k != allKeys.end) {
@ -161,7 +161,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
if(toCommit) {
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
toCommit->addTag( cacheTag );
toCommit->addTypedMessage(privatized);
}
@ -499,7 +499,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
keyEnd = itr->first;
mutationEnd = itr->second;
} else {
TraceEvent(SevDebug, "EndKeyNotFound", dbgid).detail("KeyBegin", keyBegin.toString());
//TraceEvent(SevDebug, "EndKeyNotFound", dbgid).detail("KeyBegin", keyBegin.toString());
break;
}
} else {
@ -510,7 +510,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
keyBegin = itr->first;
mutationBegin = itr->second;
} else {
TraceEvent(SevDebug, "BeginKeyNotFound", dbgid).detail("KeyEnd", keyEnd.toString());
//TraceEvent(SevDebug, "BeginKeyNotFound", dbgid).detail("KeyEnd", keyEnd.toString());
break;
}
}

View File

@ -83,6 +83,15 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
auto progressIt = progress.lower_bound(epoch);
if (progressIt != progress.end() && progressIt->first == epoch) {
if (progressIt != progress.begin()) {
// Previous epoch is gone, consolidate the progress.
auto prev = std::prev(progressIt);
for (auto [tag, version] : prev->second) {
if (tags.count(tag) > 0) {
progressIt->second[tag] = std::max(version, progressIt->second[tag]);
}
}
}
updateTagVersions(&tagVersions, &tags, progressIt->second, info.epochEnd, adjustedBeginVersion, epoch);
} else {
auto rit = std::find_if(

View File

@ -68,13 +68,14 @@ struct BackupData {
const UID myId;
const Tag tag; // LogRouter tag for this worker, i.e., (-2, i)
const int totalTags; // Total log router tags
const Version startVersion;
const Version startVersion; // This worker's start version
const Optional<Version> endVersion; // old epoch's end version (inclusive), or empty for current epoch
const LogEpoch recruitedEpoch; // current epoch whose tLogs are receiving mutations
const LogEpoch backupEpoch; // the epoch workers should pull mutations
LogEpoch oldestBackupEpoch = 0; // oldest epoch that still has data on tLogs for backup to pull
Version minKnownCommittedVersion;
Version savedVersion;
Version savedVersion; // Largest version saved to blob storage
Version popVersion; // Largest version popped in NOOP mode, can be larger than savedVersion.
AsyncVar<Reference<ILogSystem>> logSystem;
Database cx;
std::vector<VersionedMessage> messages;
@ -207,8 +208,12 @@ struct BackupData {
}
BackupData* self = nullptr;
// Backup request's commit version. Mutations are logged at some version after this.
Version startVersion = invalidVersion;
// The last mutation log's saved version (not inclusive), i.e., next log's begin version.
Version lastSavedVersion = invalidVersion;
Future<Optional<Reference<IBackupContainer>>> container;
Future<Optional<std::vector<KeyRange>>> ranges; // Key ranges of this backup
Future<Void> updateWorker;
@ -225,7 +230,7 @@ struct BackupData {
explicit BackupData(UID id, Reference<AsyncVar<ServerDBInfo>> db, const InitializeBackupRequest& req)
: myId(id), tag(req.routerTag), totalTags(req.totalTags), startVersion(req.startVersion),
endVersion(req.endVersion), recruitedEpoch(req.recruitedEpoch), backupEpoch(req.backupEpoch),
minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1),
minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1), popVersion(req.startVersion - 1),
cc("BackupWorker", myId.toString()), pulledVersion(0), paused(false) {
cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true);
@ -291,7 +296,7 @@ struct BackupData {
}
ASSERT_WE_THINK(backupEpoch == oldestBackupEpoch);
const Tag popTag = logSystem.get()->getPseudoPopTag(tag, ProcessClass::BackupClass);
logSystem.get()->pop(savedVersion, popTag);
logSystem.get()->pop(std::max(popVersion, savedVersion), popTag);
}
void stop() {
@ -326,11 +331,15 @@ struct BackupData {
}
bool modified = false;
bool minVersionChanged = false;
Version minVersion = std::numeric_limits<Version>::max();
for (const auto [uid, version] : uidVersions) {
auto it = backups.find(uid);
if (it == backups.end()) {
modified = true;
backups.emplace(uid, BackupData::PerBackupInfo(this, uid, version));
minVersion = std::min(minVersion, version);
minVersionChanged = true;
} else {
stopList.erase(uid);
}
@ -342,6 +351,14 @@ struct BackupData {
it->second.stop();
modified = true;
}
if (minVersionChanged && backupEpoch < recruitedEpoch && savedVersion + 1 == startVersion) {
// Advance savedVersion to minimize version ranges in case backupEpoch's
// progress is not saved. Master may set a very low startVersion that
// is already popped. Advancing the version is safe because these
// versions are not popped -- if they were popped, their progress would
// already be recorded and Master would use a higher version than minVersion.
savedVersion = std::max(minVersion, savedVersion);
}
if (modified) changedTrigger.trigger();
}
@ -378,7 +395,7 @@ struct BackupData {
GetReadVersionRequest::FLAG_USE_MIN_KNOWN_COMMITTED_VERSION);
choose {
when(wait(self->cx->onMasterProxiesChanged())) {}
when(GetReadVersionReply reply = wait(loadBalance(self->cx->getMasterProxies(false),
when(GetReadVersionReply reply = wait(basicLoadBalance(self->cx->getMasterProxies(false),
&MasterProxyInterface::getConsistentReadVersion,
request, self->cx->taskID))) {
return reply.version;
@ -390,10 +407,10 @@ struct BackupData {
Future<Version> getMinKnownCommittedVersion() { return _getMinKnownCommittedVersion(this); }
};
// Monitors "backupStartedKey". If "started" is true, wait until the key is set;
// Monitors "backupStartedKey". If "present" is true, wait until the key is set;
// otherwise, wait until the key is cleared. If "watch" is false, do not perform
// the wait for key set/clear events. Returns if key present.
ACTOR Future<bool> monitorBackupStartedKeyChanges(BackupData* self, bool started, bool watch) {
ACTOR Future<bool> monitorBackupStartedKeyChanges(BackupData* self, bool present, bool watch) {
loop {
state ReadYourWritesTransaction tr(self->cx);
@ -418,13 +435,13 @@ ACTOR Future<bool> monitorBackupStartedKeyChanges(BackupData* self, bool started
}
self->exitEarly = shouldExit;
self->onBackupChanges(uidVersions);
if (started || !watch) return true;
if (present || !watch) return true;
} else {
TraceEvent("BackupWorkerEmptyStartKey", self->myId);
self->onBackupChanges(uidVersions);
self->exitEarly = shouldExit;
if (!started || !watch) {
if (!present || !watch) {
return false;
}
}
@ -554,17 +571,6 @@ ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
}
}
// Return a block of contiguous padding bytes, growing if needed.
static Value makePadding(int size) {
static Value pad;
if (pad.size() < size) {
pad = makeString(size);
memset(mutateString(pad), '\xff', pad.size());
}
return pad.substr(0, size);
}
// Write a mutation to a log file. Note the mutation can be different from
// message.message for clear mutations.
ACTOR Future<Void> addMutation(Reference<IBackupFile> logFile, VersionedMessage message, StringRef mutation,
@ -585,7 +591,7 @@ ACTOR Future<Void> addMutation(Reference<IBackupFile> logFile, VersionedMessage
// Write padding if needed
const int bytesLeft = *blockEnd - logFile->size();
if (bytesLeft > 0) {
state Value paddingFFs = makePadding(bytesLeft);
state Value paddingFFs = fileBackup::makePadding(bytesLeft);
wait(logFile->append(paddingFFs.begin(), bytesLeft));
}
@ -650,8 +656,13 @@ ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int
activeUids.push_back(it->first);
self->insertRanges(keyRangeMap, it->second.ranges.get(), index);
if (it->second.lastSavedVersion == invalidVersion) {
it->second.lastSavedVersion =
self->savedVersion > self->startVersion ? self->savedVersion : self->startVersion;
if (it->second.startVersion > self->startVersion && !self->messages.empty()) {
// True-up first mutation log's begin version
it->second.lastSavedVersion = self->messages[0].getVersion();
} else {
it->second.lastSavedVersion =
std::max(self->popVersion, std::max(self->savedVersion, self->startVersion));
}
}
logFileFutures.push_back(it->second.container.get().get()->writeTaggedLogFile(
it->second.lastSavedVersion, popVersion + 1, blockSize, self->tag.id, self->totalTags));
@ -743,6 +754,10 @@ ACTOR Future<Void> uploadData(BackupData* self) {
state int numMsg = 0;
Version lastPopVersion = popVersion;
// index of last version's end position in self->messages
int lastVersionIndex = 0;
Version lastVersion = invalidVersion;
if (self->messages.empty()) {
// Even though messages is empty, we still want to advance popVersion.
if (!self->endVersion.present()) {
@ -751,17 +766,30 @@ ACTOR Future<Void> uploadData(BackupData* self) {
} else {
for (const auto& message : self->messages) {
// messages may be prefetched in peek; uncommitted messages should not be uploaded.
if (message.getVersion() > self->maxPopVersion()) break;
popVersion = std::max(popVersion, message.getVersion());
const Version version = message.getVersion();
if (version > self->maxPopVersion()) break;
if (version > popVersion) {
lastVersionIndex = numMsg;
lastVersion = popVersion;
popVersion = version;
}
numMsg++;
}
}
if (self->pullFinished()) {
popVersion = self->endVersion.get();
} else {
// make sure file is saved on version boundary
popVersion = lastVersion;
numMsg = lastVersionIndex;
}
if (((numMsg > 0 || popVersion > lastPopVersion) && self->pulling) || self->pullFinished()) {
TraceEvent("BackupWorkerSave", self->myId)
.detail("Version", popVersion)
.detail("LastPopVersion", lastPopVersion)
.detail("Pulling", self->pulling)
.detail("SavedVersion", self->savedVersion)
.detail("NumMsg", numMsg)
.detail("MsgQ", self->messages.size());
// save an empty file for old epochs so that log file versions are continuous
wait(saveMutationsToFile(self, popVersion, numMsg));
@ -769,9 +797,11 @@ ACTOR Future<Void> uploadData(BackupData* self) {
}
// If transitioning into NOOP mode, clear the messages
if (!self->pulling) self->messages.clear();
if (!self->pulling) {
self->messages.clear();
}
if (popVersion > self->savedVersion) {
if (popVersion > self->savedVersion && popVersion > self->popVersion) {
wait(saveProgress(self, popVersion));
TraceEvent("BackupWorkerSavedProgress", self->myId)
.detail("Tag", self->tag.toString())
@ -872,10 +902,13 @@ ACTOR Future<Void> monitorBackupKeyOrPullData(BackupData* self, bool keyPresent)
when(wait(success(present))) { break; }
when(wait(success(committedVersion) || delay(SERVER_KNOBS->BACKUP_NOOP_POP_DELAY, self->cx->taskID))) {
if (committedVersion.isReady()) {
self->savedVersion = std::max(committedVersion.get(), self->savedVersion);
self->popVersion =
std::max(self->popVersion, std::max(committedVersion.get(), self->savedVersion));
self->minKnownCommittedVersion =
std::max(committedVersion.get(), self->minKnownCommittedVersion);
TraceEvent("BackupWorkerNoopPop", self->myId).detail("SavedVersion", self->savedVersion);
TraceEvent("BackupWorkerNoopPop", self->myId)
.detail("SavedVersion", self->savedVersion)
.detail("PopVersion", self->popVersion);
self->pop(); // Pop while the worker is in this NOOP state.
committedVersion = Never();
} else {
@ -884,6 +917,7 @@ ACTOR Future<Void> monitorBackupKeyOrPullData(BackupData* self, bool keyPresent)
}
}
}
ASSERT(!keyPresent == present.get());
keyPresent = !keyPresent;
}
}

View File

@ -6,7 +6,6 @@ set(FDBSERVER_SRCS
BackupProgress.actor.h
BackupWorker.actor.cpp
ClusterController.actor.cpp
ClusterRecruitmentInterface.h
ConflictSet.h
CoordinatedState.actor.cpp
CoordinatedState.h
@ -179,6 +178,7 @@ set(FDBSERVER_SRCS
workloads/Sideband.actor.cpp
workloads/SlowTaskWorkload.actor.cpp
workloads/SnapTest.actor.cpp
workloads/SpecialKeySpaceCorrectness.actor.cpp
workloads/StatusWorkload.actor.cpp
workloads/Storefront.actor.cpp
workloads/StreamingRead.actor.cpp

View File

@ -36,7 +36,6 @@
#include "fdbserver/LeaderElection.h"
#include "fdbserver/LogSystemConfig.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/Status.h"
@ -62,14 +61,15 @@ struct WorkerInfo : NonCopyable {
WorkerDetails details;
Future<Void> haltRatekeeper;
Future<Void> haltDistributor;
Standalone<VectorRef<StringRef>> issues;
WorkerInfo() : gen(-1), reboots(0), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) :
watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {}
WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded, Standalone<VectorRef<StringRef>> issues ) :
watcher(watcher), reply(reply), gen(gen), reboots(0), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded), issues(issues) {}
WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
reboots(r.reboots), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor) {}
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), issues(r.issues) {}
void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT {
watcher = std::move(r.watcher);
reply = std::move(r.reply);
@ -80,6 +80,7 @@ struct WorkerInfo : NonCopyable {
details = std::move(r.details);
haltRatekeeper = r.haltRatekeeper;
haltDistributor = r.haltDistributor;
issues = r.issues;
}
};
@ -96,13 +97,11 @@ class ClusterControllerData {
public:
struct DBInfo {
Reference<AsyncVar<ClientDBInfo>> clientInfo;
Reference<AsyncVar<CachedSerialization<ServerDBInfo>>> serverInfo;
CachedSerialization<ServerDBInfo> serverInfoMasterOnly;
std::set<NetworkAddress> requiredAddresses;
ProcessIssuesMap workersWithIssues;
Reference<AsyncVar<ServerDBInfo>> serverInfo;
std::map<NetworkAddress, double> incompatibleConnections;
AsyncTrigger forceMasterFailure;
int64_t masterRegistrationCount;
int64_t dbInfoCount;
bool recoveryStalled;
bool forceRecovery;
DatabaseConfiguration config; // Asynchronously updated via master registration
@ -114,44 +113,38 @@ public:
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>> clientStatus;
DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false),
clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ),
serverInfo( new AsyncVar<CachedSerialization<ServerDBInfo>>( CachedSerialization<ServerDBInfo>() ) ),
clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ), dbInfoCount(0),
serverInfo( new AsyncVar<ServerDBInfo>( ServerDBInfo() ) ),
db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality!
{
}
void addRequiredAddresses(const std::vector<WorkerInterface>& interfaces) {
for(auto& it : interfaces) {
requiredAddresses.insert(it.address());
}
}
void setDistributor(const DataDistributorInterface& interf) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.distributor = interf;
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
void setRatekeeper(const RatekeeperInterface& interf) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.ratekeeper = interf;
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
void clearInterf(ProcessClass::ClassType t) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
if (t == ProcessClass::DataDistributorClass) {
newInfo.distributor = Optional<DataDistributorInterface>();
} else if (t == ProcessClass::RatekeeperClass) {
newInfo.ratekeeper = Optional<RatekeeperInterface>();
}
serverInfo->set( newInfoCache );
serverInfo->set( newInfo );
}
};
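
Editor's note: this hunk drops the `CachedSerialization<ServerDBInfo>` wrapper and instead publishes a plain `ServerDBInfo` copy stamped with a fresh id and a monotonically increasing `infoGeneration` on every mutation. A minimal standalone sketch of that publish-by-copy pattern, assuming illustrative stand-in types (`PublishedInfo`, `Info` are not FDB names):

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Illustrative stand-in for ServerDBInfo: a value type that is always
// replaced wholesale, never mutated in place by readers.
struct Info {
    uint64_t id = 0;         // changes on every publication
    int64_t generation = 0;  // strictly increasing
    std::string distributor; // e.g. an interface id, empty if none
};

// Holds the latest published copy; setters copy, mutate, bump, republish.
class PublishedInfo {
public:
    Info get() const { return current_; }

    void setDistributor(const std::string& interf) {
        Info next = current_;            // copy the last published value
        next.id = ++idSource_;           // new id per publication
        next.generation = ++generation_; // monotonically increasing
        next.distributor = interf;
        current_ = next;                 // publish the whole new value
    }

    void clearDistributor() {
        Info next = current_;
        next.id = ++idSource_;
        next.generation = ++generation_;
        next.distributor.clear();
        current_ = next;
    }

private:
    Info current_;
    uint64_t idSource_ = 0;
    int64_t generation_ = 0;
};

int main() {
    PublishedInfo db;
    db.setDistributor("dd-1");
    db.clearDistributor();
    std::cout << "generation=" << db.get().generation << "\n"; // prints 2
}
```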
@ -219,8 +212,8 @@ public:
}
bool isLongLivedStateless( Optional<Key> const& processId ) {
return (db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == processId) ||
(db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == processId);
return (db.serverInfo->get().distributor.present() && db.serverInfo->get().distributor.get().locality.processId() == processId) ||
(db.serverInfo->get().ratekeeper.present() && db.serverInfo->get().ratekeeper.get().locality.processId() == processId);
}
WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) {
@ -233,6 +226,7 @@ public:
!excludedMachines.count(it.second.details.interf.locality.zoneId()) &&
( includeDCs.size() == 0 || includeDCs.count(it.second.details.interf.locality.dcId()) ) &&
!addressExcluded(excludedAddresses, it.second.details.interf.address()) &&
( !it.second.details.interf.secondaryAddress().present() || !addressExcluded(excludedAddresses, it.second.details.interf.secondaryAddress().get()) ) &&
it.second.details.processClass.machineClassFitness( ProcessClass::Storage ) <= ProcessClass::UnsetFit ) {
return it.second.details;
}
@ -269,7 +263,7 @@ public:
for( auto& it : id_worker ) {
auto fitness = it.second.details.processClass.machineClassFitness( ProcessClass::Storage );
if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) {
if( workerAvailable(it.second, false) && !conf.isExcludedServer(it.second.details.interf.addresses()) && fitness != ProcessClass::NeverAssign && ( !dcId.present() || it.second.details.interf.locality.dcId()==dcId.get() ) ) {
fitness_workers[ fitness ].push_back(it.second.details);
}
}
@ -314,7 +308,7 @@ public:
for( auto& it : id_worker ) {
if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), it.second.details.interf.id()) == exclusionWorkerIds.end()) {
auto fitness = it.second.details.processClass.machineClassFitness(ProcessClass::TLog);
if (workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) {
if (workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.addresses()) && fitness != ProcessClass::NeverAssign && (!dcIds.size() || dcIds.count(it.second.details.interf.locality.dcId()))) {
fitness_workers[std::make_pair(fitness, it.second.details.degraded)].push_back(it.second.details);
}
else {
@ -470,7 +464,7 @@ public:
for( auto& it : id_worker ) {
auto fitness = it.second.details.processClass.machineClassFitness( role );
if(conf.isExcludedServer(it.second.details.interf.address())) {
if(conf.isExcludedServer(it.second.details.interf.addresses())) {
fitness = std::max(fitness, ProcessClass::ExcludeFit);
}
if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) {
@ -508,7 +502,7 @@ public:
for( auto& it : id_worker ) {
auto fitness = it.second.details.processClass.machineClassFitness( role );
if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId &&
if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.addresses()) && it.second.details.interf.locality.dcId() == dcId &&
( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) {
if (isLongLivedStateless(it.first)) {
fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details);
@ -627,7 +621,7 @@ public:
std::set<Optional<Standalone<StringRef>>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) {
std::set<Optional<Standalone<StringRef>>> result;
for( auto& it : id_worker )
if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.address() ) )
if( workerAvailable( it.second, checkStable ) && !conf.isExcludedServer( it.second.details.interf.addresses() ) )
result.insert(it.second.details.interf.locality.dcId());
return result;
}
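
Editor's note: throughout this file the exclusion checks switch from `interf.address()` to `interf.addresses()`, so a worker counts as excluded if any of its listening addresses (for example a TLS and a non-TLS endpoint) is excluded. A small sketch of that "any address excluded" test, with made-up `Address`/`ExclusionSet` types standing in for `NetworkAddress`/`DatabaseConfiguration`:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

using Address = std::string; // stand-in for NetworkAddress ("ip:port")

struct ExclusionSet {
    std::set<Address> excluded;
    // Excluded if any of the worker's addresses matches.
    bool isExcludedServer(const std::vector<Address>& addrs) const {
        for (const auto& a : addrs)
            if (excluded.count(a)) return true;
        return false;
    }
};

int main() {
    ExclusionSet conf{{"10.0.0.5:4500"}};
    // A dual-endpoint worker: primary (TLS) plus secondary (plain) address.
    std::vector<Address> worker = {"10.0.0.5:4600", "10.0.0.5:4500"};
    std::cout << std::boolalpha << conf.isExcludedServer(worker) << "\n"; // true
}
```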
@ -947,7 +941,7 @@ public:
}
void checkRecoveryStalled() {
if( (db.serverInfo->get().read().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().read().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().read().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) {
if( (db.serverInfo->get().recoveryState == RecoveryState::RECRUITING || db.serverInfo->get().recoveryState == RecoveryState::ACCEPTING_COMMITS || db.serverInfo->get().recoveryState == RecoveryState::ALL_LOGS_RECRUITED) && db.recoveryStalled ) {
if (db.config.regions.size() > 1) {
auto regions = db.config.regions;
if(clusterControllerDcId.get() == regions[0].dcId) {
@ -961,7 +955,7 @@ public:
//FIXME: determine when to fail the cluster controller when a primaryDC has not been set
bool betterMasterExists() {
const ServerDBInfo dbi = db.serverInfo->get().read();
const ServerDBInfo dbi = db.serverInfo->get();
if(dbi.recoveryState < RecoveryState::ACCEPTING_COMMITS) {
return false;
@ -1057,7 +1051,7 @@ public:
// Check master fitness. Don't return false if master is excluded in case all the processes are excluded, we still need master for recovery.
ProcessClass::Fitness oldMasterFit = masterWorker->second.details.processClass.machineClassFitness( ProcessClass::Master );
if(db.config.isExcludedServer(dbi.master.address())) {
if(db.config.isExcludedServer(dbi.master.addresses())) {
oldMasterFit = std::max(oldMasterFit, ProcessClass::ExcludeFit);
}
@ -1065,7 +1059,7 @@ public:
id_used[clusterControllerProcessId]++;
WorkerFitnessInfo mworker = getWorkerForRoleInDatacenter(clusterControllerDcId, ProcessClass::Master, ProcessClass::NeverAssign, db.config, id_used, true);
auto newMasterFit = mworker.worker.processClass.machineClassFitness( ProcessClass::Master );
if(db.config.isExcludedServer(mworker.worker.interf.address())) {
if(db.config.isExcludedServer(mworker.worker.interf.addresses())) {
newMasterFit = std::max(newMasterFit, ProcessClass::ExcludeFit);
}
@ -1226,7 +1220,7 @@ public:
ASSERT(masterProcessId.present());
if (processId == masterProcessId) return false;
auto& dbInfo = db.serverInfo->get().read();
auto& dbInfo = db.serverInfo->get();
for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) {
for (const auto& tlog: tlogset.tLogs) {
if (tlog.present() && tlog.interf().locality.processId() == processId) return true;
@ -1256,7 +1250,7 @@ public:
std::map<Optional<Standalone<StringRef>>, int> idUsed;
updateKnownIds(&idUsed);
auto& dbInfo = db.serverInfo->get().read();
auto& dbInfo = db.serverInfo->get();
for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) {
for (const auto& tlog: tlogset.tLogs) {
if (tlog.present()) {
@ -1294,6 +1288,9 @@ public:
UpdateWorkerList updateWorkerList;
Future<Void> outstandingRequestChecker;
Future<Void> outstandingRemoteRequestChecker;
AsyncTrigger updateDBInfo;
std::set<Endpoint> updateDBInfoEndpoints;
std::set<Endpoint> removedDBInfoEndpoints;
DBInfo db;
Database cx;
@ -1314,7 +1311,6 @@ public:
Counter getWorkersRequests;
Counter getClientWorkersRequests;
Counter registerMasterRequests;
Counter getServerDBInfoRequests;
Counter statusRequests;
Counter failureMonitoringRequests;
@ -1333,18 +1329,18 @@ public:
getWorkersRequests("GetWorkersRequests", clusterControllerMetrics),
getClientWorkersRequests("GetClientWorkersRequests", clusterControllerMetrics),
registerMasterRequests("RegisterMasterRequests", clusterControllerMetrics),
getServerDBInfoRequests("GetServerDBInfoRequests", clusterControllerMetrics),
statusRequests("StatusRequests", clusterControllerMetrics),
failureMonitoringRequests("FailureMonitoringRequests", clusterControllerMetrics),
serversFailed("ServersFailed", clusterControllerMetrics),
serversUnfailed("ServersUnfailed", clusterControllerMetrics)
{
auto& serverInfo = db.serverInfoMasterOnly.mutate();
auto serverInfo = ServerDBInfo();
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.infoGeneration = ++db.dbInfoCount;
serverInfo.masterLifetime.ccID = id;
serverInfo.clusterInterface = ccInterface;
serverInfo.myLocality = locality;
db.serverInfo->set( db.serverInfoMasterOnly );
db.serverInfo->set( serverInfo );
cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true);
}
@ -1379,7 +1375,7 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
continue;
}
RecruitMasterRequest rmq;
rmq.lifetime = db->serverInfo->get().read().masterLifetime;
rmq.lifetime = db->serverInfo->get().masterLifetime;
rmq.forceRecovery = db->forceRecovery;
cluster->masterProcessId = masterWorker.worker.interf.locality.processId();
@ -1399,21 +1395,19 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
db->masterRegistrationCount = 0;
db->recoveryStalled = false;
db->serverInfoMasterOnly = CachedSerialization<ServerDBInfo>();
auto& dbInfo = db->serverInfoMasterOnly.mutate();
auto dbInfo = ServerDBInfo();
dbInfo.master = iMaster;
dbInfo.id = deterministicRandom()->randomUniqueID();
dbInfo.masterLifetime = db->serverInfo->get().read().masterLifetime;
dbInfo.infoGeneration = ++db->dbInfoCount;
dbInfo.masterLifetime = db->serverInfo->get().masterLifetime;
++dbInfo.masterLifetime;
dbInfo.clusterInterface = db->serverInfo->get().read().clusterInterface;
dbInfo.distributor = db->serverInfo->get().read().distributor;
dbInfo.ratekeeper = db->serverInfo->get().read().ratekeeper;
dbInfo.latencyBandConfig = db->serverInfo->get().read().latencyBandConfig;
dbInfo.clusterInterface = db->serverInfo->get().clusterInterface;
dbInfo.distributor = db->serverInfo->get().distributor;
dbInfo.ratekeeper = db->serverInfo->get().ratekeeper;
dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig;
TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
db->requiredAddresses.clear();
db->serverInfo->set( db->serverInfoMasterOnly );
db->serverInfo->set( dbInfo );
state Future<Void> spinDelay = delay(SERVER_KNOBS->MASTER_SPIN_DELAY); // Don't retry master recovery more than once per second, but don't delay the "first" recovery after more than a second of normal operation
@ -1448,30 +1442,14 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
}
ACTOR Future<Void> clusterGetServerInfo(ClusterControllerData::DBInfo* db, UID knownServerInfoID,
Standalone<VectorRef<StringRef>> issues,
std::vector<NetworkAddress> incompatiblePeers,
ReplyPromise<CachedSerialization<ServerDBInfo>> reply) {
state Optional<UID> issueID;
state bool useMasterOnly = false;
setIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issues, issueID);
for(auto it : incompatiblePeers) {
db->incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
}
loop {
useMasterOnly = db->serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS && !db->requiredAddresses.count(reply.getEndpoint().getPrimaryAddress());
if((useMasterOnly ? db->serverInfoMasterOnly.read().id : db->serverInfo->get().read().id) != knownServerInfoID) {
break;
}
ReplyPromise<ServerDBInfo> reply) {
while(db->serverInfo->get().id == knownServerInfoID) {
choose {
when (wait( yieldedFuture(db->serverInfo->onChange()) )) {}
when (wait( delayJittered( 300 ) )) { break; } // The server might be long gone!
}
}
removeIssues(db->workersWithIssues, reply.getEndpoint().getPrimaryAddress(), issueID);
reply.send( useMasterOnly ? db->serverInfoMasterOnly : db->serverInfo->get() );
reply.send( db->serverInfo->get() );
return Void();
}
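
Editor's note: the rewritten `clusterGetServerInfo` is a plain long-poll: block while the published info still carries the id the client already knows, wake on change or after a jittered timeout (~300s in the hunk above), then reply with whatever is current. A threaded sketch of the same long-poll shape; `PublishedValue`, the std::thread plumbing, and the millisecond timeout are illustrative, since FDB uses Flow actor primitives instead:

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>
#include <utility>

// A value whose consumers long-poll for "anything newer than id X".
class PublishedValue {
public:
    void set(uint64_t id, int v) {
        { std::lock_guard<std::mutex> g(m_); id_ = id; value_ = v; }
        cv_.notify_all();
    }
    // Block while the published id equals knownId, or until the timeout
    // expires (so a long-gone client eventually gets unblocked anyway).
    std::pair<uint64_t, int> getWhenChanged(uint64_t knownId,
                                            std::chrono::milliseconds timeout) {
        std::unique_lock<std::mutex> g(m_);
        cv_.wait_for(g, timeout, [&] { return id_ != knownId; });
        return {id_, value_}; // reply with the current value either way
    }
private:
    std::mutex m_;
    std::condition_variable cv_;
    uint64_t id_ = 0;
    int value_ = 0;
};

int main() {
    PublishedValue db;
    std::thread writer([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
        db.set(1, 42); // publication wakes the long-poller
    });
    auto [id, v] = db.getWhenChanged(0, std::chrono::milliseconds(300));
    std::cout << "id=" << id << " value=" << v << "\n"; // id=1 value=42
    writer.join();
}
```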
@ -1497,12 +1475,6 @@ void checkOutstandingRecruitmentRequests( ClusterControllerData* self ) {
RecruitFromConfigurationRequest& req = self->outstandingRecruitmentRequests[i];
try {
RecruitFromConfigurationReply rep = self->findWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.oldLogRouters);
self->db.addRequiredAddresses(rep.proxies);
self->db.addRequiredAddresses(rep.resolvers);
self->db.addRequiredAddresses(rep.satelliteTLogs);
self->db.addRequiredAddresses(rep.tLogs);
self->db.serverInfo->trigger();
req.reply.send( rep );
swapAndPop( &self->outstandingRecruitmentRequests, i-- );
} catch (Error& e) {
@ -1521,9 +1493,6 @@ void checkOutstandingRemoteRecruitmentRequests( ClusterControllerData* self ) {
RecruitRemoteFromConfigurationRequest& req = self->outstandingRemoteRecruitmentRequests[i];
try {
RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.remoteTLogs);
self->db.addRequiredAddresses(rep.logRouters);
self->db.serverInfo->trigger();
req.reply.send( rep );
swapAndPop( &self->outstandingRemoteRecruitmentRequests, i-- );
} catch (Error& e) {
@ -1571,7 +1540,7 @@ void checkOutstandingStorageRequests( ClusterControllerData* self ) {
}
void checkBetterDDOrRK(ClusterControllerData* self) {
if (!self->masterProcessId.present() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
if (!self->masterProcessId.present() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
return;
}
@ -1590,11 +1559,11 @@ void checkBetterDDOrRK(ClusterControllerData* self) {
newDDWorker = self->id_worker[self->masterProcessId.get()].details;
}
auto bestFitnessForRK = newRKWorker.processClass.machineClassFitness(ProcessClass::Ratekeeper);
if(self->db.config.isExcludedServer(newRKWorker.interf.address())) {
if(self->db.config.isExcludedServer(newRKWorker.interf.addresses())) {
bestFitnessForRK = std::max(bestFitnessForRK, ProcessClass::ExcludeFit);
}
auto bestFitnessForDD = newDDWorker.processClass.machineClassFitness(ProcessClass::DataDistributor);
if(self->db.config.isExcludedServer(newDDWorker.interf.address())) {
if(self->db.config.isExcludedServer(newDDWorker.interf.addresses())) {
bestFitnessForDD = std::max(bestFitnessForDD, ProcessClass::ExcludeFit);
}
//TraceEvent("CheckBetterDDorRKNewRecruits", self->id).detail("MasterProcessId", self->masterProcessId)
@ -1602,8 +1571,8 @@ void checkBetterDDOrRK(ClusterControllerData* self) {
Optional<Standalone<StringRef>> currentRKProcessId;
Optional<Standalone<StringRef>> currentDDProcessId;
auto& db = self->db.serverInfo->get().read();
auto& db = self->db.serverInfo->get();
bool ratekeeperHealthy = false;
if (db.ratekeeper.present() && self->id_worker.count(db.ratekeeper.get().locality.processId()) &&
(!self->recruitingRatekeeperID.present() || (self->recruitingRatekeeperID.get() == db.ratekeeper.get().id()))) {
@ -1662,7 +1631,7 @@ ACTOR Future<Void> doCheckOutstandingRequests( ClusterControllerData* self ) {
self->checkRecoveryStalled();
if (self->betterMasterExists()) {
self->db.forceMasterFailure.trigger();
TraceEvent("MasterRegistrationKill", self->id).detail("MasterId", self->db.serverInfo->get().read().master.id());
TraceEvent("MasterRegistrationKill", self->id).detail("MasterId", self->db.serverInfo->get().master.id());
}
} catch( Error &e ) {
if(e.code() != error_code_no_more_servers) {
@ -1719,12 +1688,14 @@ ACTOR Future<Void> rebootAndCheck( ClusterControllerData* cluster, Optional<Stan
return Void();
}
ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster ) {
ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster) {
state Future<Void> failed =
(worker.address() == g_network->getLocalAddress() || startingClass.classType() == ProcessClass::TesterClass)
? Never()
: waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME);
cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.address()) );
cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.stableAddress()) );
cluster->updateDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->updateDBInfo.trigger();
// This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch fails for the worker.
wait(delay(0));
@ -1745,6 +1716,7 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
if (worker.locality.processId() == cluster->masterProcessId) {
cluster->masterProcessId = Optional<Key>();
}
cluster->removedDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->id_worker.erase( worker.locality.processId() );
cluster->updateWorkerList.set( worker.locality.processId(), Optional<ProcessData>() );
return Void();
@ -1940,12 +1912,6 @@ ACTOR Future<Void> clusterRecruitFromConfiguration( ClusterControllerData* self,
loop {
try {
auto rep = self->findWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.oldLogRouters);
self->db.addRequiredAddresses(rep.proxies);
self->db.addRequiredAddresses(rep.resolvers);
self->db.addRequiredAddresses(rep.satelliteTLogs);
self->db.addRequiredAddresses(rep.tLogs);
self->db.serverInfo->trigger();
req.reply.send( rep );
return Void();
} catch (Error& e) {
@ -1971,9 +1937,6 @@ ACTOR Future<Void> clusterRecruitRemoteFromConfiguration( ClusterControllerData*
loop {
try {
RecruitRemoteFromConfigurationReply rep = self->findRemoteWorkersForConfiguration( req );
self->db.addRequiredAddresses(rep.remoteTLogs);
self->db.addRequiredAddresses(rep.logRouters);
self->db.serverInfo->trigger();
req.reply.send( rep );
return Void();
} catch (Error& e) {
@ -2010,8 +1973,8 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
//make sure the request comes from an active database
auto db = &self->db;
if ( db->serverInfo->get().read().master.id() != req.id || req.registrationCount <= db->masterRegistrationCount ) {
TraceEvent("MasterRegistrationNotFound", self->id).detail("MasterId", req.id).detail("ExistingId", db->serverInfo->get().read().master.id()).detail("RegCount", req.registrationCount).detail("ExistingRegCount", db->masterRegistrationCount);
if ( db->serverInfo->get().master.id() != req.id || req.registrationCount <= db->masterRegistrationCount ) {
TraceEvent("MasterRegistrationNotFound", self->id).detail("MasterId", req.id).detail("ExistingId", db->serverInfo->get().master.id()).detail("RegCount", req.registrationCount).detail("ExistingRegCount", db->masterRegistrationCount);
return;
}
@ -2032,7 +1995,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
self->gotFullyRecoveredConfig = true;
db->fullyRecoveredConfig = req.configuration.get();
for ( auto& it : self->id_worker ) {
bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.address());
bool isExcludedFromConfig = db->fullyRecoveredConfig.isExcludedServer(it.second.details.interf.addresses());
if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) {
it.second.priorityInfo.isExcluded = isExcludedFromConfig;
if( !it.second.reply.isSet() ) {
@ -2044,8 +2007,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
}
bool isChanged = false;
auto cachedInfo = self->db.serverInfo->get();
auto& dbInfo = cachedInfo.mutate();
auto dbInfo = self->db.serverInfo->get();
if (dbInfo.recoveryState != req.recoveryState) {
dbInfo.recoveryState = req.recoveryState;
@ -2086,7 +2048,8 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
if( isChanged ) {
dbInfo.id = deterministicRandom()->randomUniqueID();
self->db.serverInfo->set( cachedInfo );
dbInfo.infoGeneration = ++self->db.dbInfoCount;
self->db.serverInfo->set( dbInfo );
}
checkOutstandingRequests(self);
@ -2099,6 +2062,11 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo;
newPriorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
for(auto it : req.incompatiblePeers) {
self->db.incompatibleConnections[it] = now() + SERVER_KNOBS->INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
}
self->removedDBInfoEndpoints.erase(w.updateServerDBInfo.getEndpoint());
if(info == self->id_worker.end()) {
TraceEvent("ClusterControllerActualWorkers", self->id).detail("WorkerId",w.id()).detail("ProcessId", w.locality.processId()).detail("ZoneId", w.locality.zoneId()).detail("DataHall", w.locality.dataHallId()).detail("PClass", req.processClass.toString()).detail("Workers", self->id_worker.size());
self->goodRecruitmentTime = lowPriorityDelay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY);
@ -2138,13 +2106,13 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
}
if ( self->gotFullyRecoveredConfig ) {
newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.address());
newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.addresses());
}
}
if( info == self->id_worker.end() ) {
self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded );
if (!self->masterProcessId.present() && w.locality.processId() == self->db.serverInfo->get().read().master.locality.processId()) {
self->id_worker[w.locality.processId()] = WorkerInfo( workerAvailabilityWatch( w, newProcessClass, self ), req.reply, req.generation, w, req.initialClass, newProcessClass, newPriorityInfo, req.degraded, req.issues );
if (!self->masterProcessId.present() && w.locality.processId() == self->db.serverInfo->get().master.locality.processId()) {
self->masterProcessId = w.locality.processId();
}
checkOutstandingRequests( self );
@ -2158,8 +2126,10 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
info->second.initialClass = req.initialClass;
info->second.details.degraded = req.degraded;
info->second.gen = req.generation;
info->second.issues = req.issues;
if(info->second.details.interf.id() != w.id()) {
self->removedDBInfoEndpoints.insert(info->second.details.interf.updateServerDBInfo.getEndpoint());
info->second.details.interf = w;
info->second.watcher = workerAvailabilityWatch( w, newProcessClass, self );
}
@ -2168,7 +2138,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
TEST(true); // Received an old worker registration request.
}
if (req.distributorInterf.present() && !self->db.serverInfo->get().read().distributor.present() &&
if (req.distributorInterf.present() && !self->db.serverInfo->get().distributor.present() &&
self->clusterControllerDcId == req.distributorInterf.get().locality.dcId() &&
!self->recruitingDistributor) {
const DataDistributorInterface& di = req.distributorInterf.get();
@ -2188,7 +2158,7 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
req.ratekeeperInterf.get().haltRatekeeper.getReply(HaltRatekeeperRequest(self->id)));
} else if (!self->recruitingRatekeeperID.present()) {
const RatekeeperInterface& rki = req.ratekeeperInterf.get();
const auto& ratekeeper = self->db.serverInfo->get().read().ratekeeper;
const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
TraceEvent("CCRegisterRatekeeper", self->id).detail("RKID", rki.id());
if (ratekeeper.present() && ratekeeper.get().id() != rki.id() && self->id_worker.count(ratekeeper.get().locality.processId())) {
TraceEvent("CCHaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id())
@ -2323,8 +2293,14 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
// Get status but trap errors to send back to client.
vector<WorkerDetails> workers;
for(auto& it : self->id_worker)
std::vector<ProcessIssues> workerIssues;
for(auto& it : self->id_worker) {
workers.push_back(it.second.details);
if(it.second.issues.size()) {
workerIssues.push_back(ProcessIssues(it.second.details.interf.address(), it.second.issues));
}
}
std::vector<NetworkAddress> incompatibleConnections;
for(auto it = self->db.incompatibleConnections.begin(); it != self->db.incompatibleConnections.end();) {
@ -2336,7 +2312,7 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
}
}
state ErrorOr<StatusReply> result = wait(errorOr(clusterGetStatus(self->db.serverInfo, self->cx, workers, self->db.workersWithIssues, &self->db.clientStatus, coordinators, incompatibleConnections, self->datacenterVersionDifference)));
state ErrorOr<StatusReply> result = wait(errorOr(clusterGetStatus(self->db.serverInfo, self->cx, workers, workerIssues, &self->db.clientStatus, coordinators, incompatibleConnections, self->datacenterVersionDifference)));
if (result.isError() && result.getError().code() == error_code_actor_cancelled)
throw result.getError();
@ -2463,13 +2439,13 @@ ACTOR Future<Void> monitorServerInfoConfig(ClusterControllerData::DBInfo* db) {
config = LatencyBandConfig::parse(configVal.get());
}
auto cachedInfo = db->serverInfo->get();
auto& serverInfo = cachedInfo.mutate();
auto serverInfo = db->serverInfo->get();
if(config != serverInfo.latencyBandConfig) {
TraceEvent("LatencyBandConfigChanged").detail("Present", config.present());
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.infoGeneration = ++db->dbInfoCount;
serverInfo.latencyBandConfig = config;
db->serverInfo->set(cachedInfo);
db->serverInfo->set(serverInfo);
}
state Future<Void> configChangeFuture = tr.watch(latencyBandConfigKey);
@ -2623,7 +2599,7 @@ ACTOR Future<Void> updateDatacenterVersionDifference( ClusterControllerData *sel
state double lastLogTime = 0;
loop {
self->versionDifferenceUpdated = false;
if(self->db.serverInfo->get().read().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) {
if(self->db.serverInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS && self->db.config.usableRegions == 1) {
bool oldDifferenceTooLarge = !self->versionDifferenceUpdated || self->datacenterVersionDifference >= SERVER_KNOBS->MAX_VERSION_DIFFERENCE;
self->versionDifferenceUpdated = true;
self->datacenterVersionDifference = 0;
@ -2638,8 +2614,8 @@ ACTOR Future<Void> updateDatacenterVersionDifference( ClusterControllerData *sel
state Optional<TLogInterface> primaryLog;
state Optional<TLogInterface> remoteLog;
if(self->db.serverInfo->get().read().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) {
for(auto& logSet : self->db.serverInfo->get().read().logSystemConfig.tLogs) {
if(self->db.serverInfo->get().recoveryState >= RecoveryState::ALL_LOGS_RECRUITED) {
for(auto& logSet : self->db.serverInfo->get().logSystemConfig.tLogs) {
if(logSet.isLocal && logSet.locality != tagLocalitySatellite) {
for(auto& tLog : logSet.tLogs) {
if(tLog.present()) {
@ -2740,12 +2716,12 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
TraceEvent("CCStartDataDistributor", self->id);
loop {
try {
state bool no_distributor = !self->db.serverInfo->get().read().distributor.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().read().master.locality.processId() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
state bool no_distributor = !self->db.serverInfo->get().distributor.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().master.locality.processId() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
if (no_distributor && self->db.serverInfo->get().read().distributor.present()) {
return self->db.serverInfo->get().read().distributor.get();
if (no_distributor && self->db.serverInfo->get().distributor.present()) {
return self->db.serverInfo->get().distributor.get();
}
std::map<Optional<Standalone<StringRef>>, int> id_used = self->getUsedIds();
@ -2775,15 +2751,15 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
}
ACTOR Future<Void> monitorDataDistributor(ClusterControllerData *self) {
while(self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
while(self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
loop {
if ( self->db.serverInfo->get().read().distributor.present() ) {
wait( waitFailureClient( self->db.serverInfo->get().read().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) );
if ( self->db.serverInfo->get().distributor.present() ) {
wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) );
TraceEvent("CCDataDistributorDied", self->id)
.detail("DistributorId", self->db.serverInfo->get().read().distributor.get().id());
.detail("DistributorId", self->db.serverInfo->get().distributor.get().id());
self->db.clearInterf(ProcessClass::DataDistributorClass);
} else {
self->recruitingDistributor = true;
@ -2800,11 +2776,11 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
TraceEvent("CCStartRatekeeper", self->id);
loop {
try {
state bool no_ratekeeper = !self->db.serverInfo->get().read().ratekeeper.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().read().master.locality.processId() || self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
state bool no_ratekeeper = !self->db.serverInfo->get().ratekeeper.present();
while (!self->masterProcessId.present() || self->masterProcessId != self->db.serverInfo->get().master.locality.processId() || self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
if (no_ratekeeper && self->db.serverInfo->get().read().ratekeeper.present()) {
if (no_ratekeeper && self->db.serverInfo->get().ratekeeper.present()) {
// Existing ratekeeper registers while waiting, so skip.
return Void();
}
@ -2824,7 +2800,7 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
if (interf.present()) {
self->recruitRatekeeper.set(false);
self->recruitingRatekeeperID = interf.get().id();
const auto& ratekeeper = self->db.serverInfo->get().read().ratekeeper;
const auto& ratekeeper = self->db.serverInfo->get().ratekeeper;
TraceEvent("CCRatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id());
if (ratekeeper.present() && ratekeeper.get().id() != interf.get().id() && self->id_worker.count(ratekeeper.get().locality.processId())) {
TraceEvent("CCHaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id())
@ -2849,16 +2825,16 @@ ACTOR Future<Void> startRatekeeper(ClusterControllerData *self) {
}
ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) {
while(self->db.serverInfo->get().read().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
while(self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
loop {
if ( self->db.serverInfo->get().read().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
if ( self->db.serverInfo->get().ratekeeper.present() && !self->recruitRatekeeper.get() ) {
choose {
when(wait(waitFailureClient( self->db.serverInfo->get().read().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
when(wait(waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) {
TraceEvent("CCRatekeeperDied", self->id)
.detail("RKID", self->db.serverInfo->get().read().ratekeeper.get().id());
.detail("RKID", self->db.serverInfo->get().ratekeeper.get().id());
self->db.clearInterf(ProcessClass::RatekeeperClass);
}
when(wait(self->recruitRatekeeper.onChange())) {}
@ -2869,6 +2845,54 @@ ACTOR Future<Void> monitorRatekeeper(ClusterControllerData *self) {
}
}
ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
state Future<Void> dbInfoChange = self->db.serverInfo->onChange();
state Future<Void> updateDBInfo = self->updateDBInfo.onTrigger();
loop {
choose {
when(wait(updateDBInfo)) {
wait(delay(SERVER_KNOBS->DBINFO_BATCH_DELAY) || dbInfoChange);
}
when(wait(dbInfoChange)) {}
}
UpdateServerDBInfoRequest req;
if(dbInfoChange.isReady()) {
for(auto &it : self->id_worker) {
req.broadcastInfo.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
}
} else {
for(auto it : self->removedDBInfoEndpoints) {
self->updateDBInfoEndpoints.erase(it);
}
req.broadcastInfo = std::vector<Endpoint>(self->updateDBInfoEndpoints.begin(), self->updateDBInfoEndpoints.end());
}
self->updateDBInfoEndpoints.clear();
self->removedDBInfoEndpoints.clear();
dbInfoChange = self->db.serverInfo->onChange();
updateDBInfo = self->updateDBInfo.onTrigger();
req.serializedDbInfo = BinaryWriter::toValue(self->db.serverInfo->get(), AssumeVersion(currentProtocolVersion));
TraceEvent("DBInfoStartBroadcast", self->id);
choose {
when(std::vector<Endpoint> notUpdated = wait( broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, Optional<Endpoint>(), false) )) {
TraceEvent("DBInfoFinishBroadcast", self->id);
for(auto &it : notUpdated) {
TraceEvent("DBInfoNotUpdated", self->id).detail("Addr", it.getPrimaryAddress());
}
if(notUpdated.size()) {
self->updateDBInfoEndpoints.insert(notUpdated.begin(), notUpdated.end());
self->updateDBInfo.trigger();
}
}
when(wait(dbInfoChange)) {}
}
}
}
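
Editor's note: the new `dbInfoUpdater` actor replaces per-worker `GetServerDBInfoRequest` polling with a push model: batch triggers briefly, serialize the current `ServerDBInfo` once per round, broadcast it to every registered worker endpoint, and re-queue any endpoint that did not acknowledge so the next round retries it. A condensed sketch of that broadcast-and-requeue loop, assuming a toy `broadcast` stub in place of `broadcastDBInfoRequest`:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

using Endpoint = std::string;

// Toy stand-in for broadcastDBInfoRequest: pretend endpoints containing
// "down" fail to acknowledge; everything else succeeds.
std::vector<Endpoint> broadcast(const std::string& serializedInfo,
                                const std::vector<Endpoint>& targets) {
    std::vector<Endpoint> notUpdated;
    for (const auto& ep : targets)
        if (ep.find("down") != std::string::npos) notUpdated.push_back(ep);
    (void)serializedInfo;
    return notUpdated;
}

int main() {
    std::set<Endpoint> pending = {"worker-a", "worker-down-b", "worker-c"};
    std::set<Endpoint> removed = {"worker-c"};    // this worker went away
    std::string serialized = "ServerDBInfo@gen7"; // serialized once per round

    for (int round = 0; round < 3 && !pending.empty(); ++round) {
        for (const auto& ep : removed) pending.erase(ep); // drop dead endpoints
        std::vector<Endpoint> targets(pending.begin(), pending.end());
        pending.clear();
        removed.clear();

        // Any endpoint that missed the update is re-queued for the next round.
        for (const auto& ep : broadcast(serialized, targets)) pending.insert(ep);
        std::cout << "round " << round << ": " << pending.size()
                  << " endpoint(s) still pending\n";
    }
}
```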
ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf, Future<Void> leaderFail, ServerCoordinators coordinators, LocalityData locality ) {
state ClusterControllerData self( interf, locality );
state Future<Void> coordinationPingDelay = delay( SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY );
@ -2889,6 +2913,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
self.addActor.send( handleForcedRecoveries(&self, interf) );
self.addActor.send( monitorDataDistributor(&self) );
self.addActor.send( monitorRatekeeper(&self) );
self.addActor.send( dbInfoUpdater(&self) );
self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
//printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
@ -2926,7 +2951,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
vector<WorkerDetails> workers;
for(auto& it : self.id_worker) {
if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.address()) ) {
if ( (req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && self.db.config.isExcludedServer(it.second.details.interf.addresses()) ) {
continue;
}
@ -2961,9 +2986,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
clusterRegisterMaster( &self, req );
}
when( GetServerDBInfoRequest req = waitNext( interf.getServerDBInfo.getFuture() ) ) {
++self.getServerDBInfoRequests;
self.addActor.send(
clusterGetServerInfo(&self.db, req.knownServerInfoID, req.issues, req.incompatiblePeers, req.reply));
self.addActor.send( clusterGetServerInfo(&self.db, req.knownServerInfoID, req.reply) );
}
when( wait( leaderFail ) ) {
// We are no longer the leader if this has changed.

View File

@ -1,261 +0,0 @@
/*
* ClusterRecruitmentInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_CLUSTERRECRUITMENTINTERFACE_H
#define FDBSERVER_CLUSTERRECRUITMENTINTERFACE_H
#pragma once
#include <vector>
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/MasterProxyInterface.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbserver/BackupInterface.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/Knobs.h"
// This interface and its serialization depend on slicing, since the client will deserialize only the first part of this structure
struct ClusterControllerFullInterface {
constexpr static FileIdentifier file_identifier =
ClusterControllerClientInterface::file_identifier;
ClusterInterface clientInterface;
RequestStream< struct RecruitFromConfigurationRequest > recruitFromConfiguration;
RequestStream< struct RecruitRemoteFromConfigurationRequest > recruitRemoteFromConfiguration;
RequestStream< struct RecruitStorageRequest > recruitStorage;
RequestStream< struct RegisterWorkerRequest > registerWorker;
RequestStream< struct GetWorkersRequest > getWorkers;
RequestStream< struct RegisterMasterRequest > registerMaster;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo;
UID id() const { return clientInterface.id(); }
bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); }
bool operator != (ClusterControllerFullInterface const& r) const { return id() != r.id(); }
bool hasMessage() {
return clientInterface.hasMessage() ||
recruitFromConfiguration.getFuture().isReady() ||
recruitRemoteFromConfiguration.getFuture().isReady() ||
recruitStorage.getFuture().isReady() ||
registerWorker.getFuture().isReady() ||
getWorkers.getFuture().isReady() ||
registerMaster.getFuture().isReady() ||
getServerDBInfo.getFuture().isReady();
}
void initEndpoints() {
clientInterface.initEndpoints();
recruitFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterControllerRecruit );
recruitStorage.getEndpoint( TaskPriority::ClusterController );
registerWorker.getEndpoint( TaskPriority::ClusterControllerWorker );
getWorkers.getEndpoint( TaskPriority::ClusterController );
registerMaster.getEndpoint( TaskPriority::ClusterControllerRegister );
getServerDBInfo.getEndpoint( TaskPriority::ClusterController );
}
template <class Ar>
void serialize(Ar& ar) {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, clientInterface, recruitFromConfiguration, recruitRemoteFromConfiguration, recruitStorage,
registerWorker, getWorkers, registerMaster, getServerDBInfo);
}
};
struct RecruitFromConfigurationReply {
constexpr static FileIdentifier file_identifier = 2224085;
std::vector<WorkerInterface> backupWorkers;
std::vector<WorkerInterface> tLogs;
std::vector<WorkerInterface> satelliteTLogs;
std::vector<WorkerInterface> proxies;
std::vector<WorkerInterface> resolvers;
std::vector<WorkerInterface> storageServers;
std::vector<WorkerInterface> oldLogRouters;
Optional<Key> dcId;
bool satelliteFallback;
RecruitFromConfigurationReply() : satelliteFallback(false) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, tLogs, satelliteTLogs, proxies, resolvers, storageServers, oldLogRouters, dcId,
satelliteFallback, backupWorkers);
}
};
struct RecruitFromConfigurationRequest {
constexpr static FileIdentifier file_identifier = 2023046;
DatabaseConfiguration configuration;
bool recruitSeedServers;
int maxOldLogRouters;
ReplyPromise< struct RecruitFromConfigurationReply > reply;
RecruitFromConfigurationRequest() {}
explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers, int maxOldLogRouters)
: configuration(configuration), recruitSeedServers(recruitSeedServers), maxOldLogRouters(maxOldLogRouters) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, configuration, recruitSeedServers, maxOldLogRouters, reply);
}
};
struct RecruitRemoteFromConfigurationReply {
constexpr static FileIdentifier file_identifier = 9091392;
std::vector<WorkerInterface> remoteTLogs;
std::vector<WorkerInterface> logRouters;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, remoteTLogs, logRouters);
}
};
struct RecruitRemoteFromConfigurationRequest {
constexpr static FileIdentifier file_identifier = 3235995;
DatabaseConfiguration configuration;
Optional<Key> dcId;
int logRouterCount;
std::vector<UID> exclusionWorkerIds;
ReplyPromise< struct RecruitRemoteFromConfigurationReply > reply;
RecruitRemoteFromConfigurationRequest() {}
RecruitRemoteFromConfigurationRequest(DatabaseConfiguration const& configuration, Optional<Key> const& dcId, int logRouterCount, const std::vector<UID> &exclusionWorkerIds) : configuration(configuration), dcId(dcId), logRouterCount(logRouterCount), exclusionWorkerIds(exclusionWorkerIds){}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, configuration, dcId, logRouterCount, exclusionWorkerIds, reply);
}
};
struct RecruitStorageReply {
constexpr static FileIdentifier file_identifier = 15877089;
WorkerInterface worker;
ProcessClass processClass;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, worker, processClass);
}
};
struct RecruitStorageRequest {
constexpr static FileIdentifier file_identifier = 905920;
std::vector<Optional<Standalone<StringRef>>> excludeMachines; //< Don't recruit any of these machines
std::vector<AddressExclusion> excludeAddresses; //< Don't recruit any of these addresses
std::vector<Optional<Standalone<StringRef>>> includeDCs;
bool criticalRecruitment; //< True if machine classes are to be ignored
ReplyPromise< RecruitStorageReply > reply;
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, excludeMachines, excludeAddresses, includeDCs, criticalRecruitment, reply);
}
};
struct RegisterWorkerReply {
constexpr static FileIdentifier file_identifier = 16475696;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo) : processClass(processClass), priorityInfo(priorityInfo) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, processClass, priorityInfo);
}
};
struct RegisterWorkerRequest {
constexpr static FileIdentifier file_identifier = 14332605;
WorkerInterface wi;
ProcessClass initialClass;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Generation generation;
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
ReplyPromise<RegisterWorkerReply> reply;
bool degraded;
RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), degraded(degraded) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, reply, degraded);
}
};
struct GetWorkersRequest {
constexpr static FileIdentifier file_identifier = 1254174;
enum { TESTER_CLASS_ONLY = 0x1, NON_EXCLUDED_PROCESSES_ONLY = 0x2 };
int flags;
ReplyPromise<vector<WorkerDetails>> reply;
GetWorkersRequest() : flags(0) {}
explicit GetWorkersRequest(int fl) : flags(fl) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, flags, reply);
}
};
struct RegisterMasterRequest {
constexpr static FileIdentifier file_identifier = 10773445;
UID id;
LocalityData mi;
LogSystemConfig logSystemConfig;
std::vector<MasterProxyInterface> proxies;
std::vector<ResolverInterface> resolvers;
DBRecoveryCount recoveryCount;
int64_t registrationCount;
Optional<DatabaseConfiguration> configuration;
std::vector<UID> priorCommittedLogServers;
RecoveryState recoveryState;
bool recoveryStalled;
ReplyPromise<Void> reply;
RegisterMasterRequest() : logSystemConfig(0) {}
template <class Ar>
void serialize(Ar& ar) {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, id, mi, logSystemConfig, proxies, resolvers, recoveryCount, registrationCount, configuration,
priorCommittedLogServers, recoveryState, recoveryStalled, reply);
}
};
#include "fdbserver/ServerDBInfo.h" // include order hack
#endif
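
Editor's note: every request/reply struct in the file deleted above follows the same shape: a `file_identifier` constant plus a templated `serialize(Ar&)` that lists every member through `serializer(ar, ...)`. A toy sketch of that variadic-visitor shape, assuming a made-up `TextWriter` archive; FDB's real serialization framework additionally handles reads, protocol versioning, and FlatBuffers identifiers:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Toy archive: writes each field as text, space-separated.
struct TextWriter {
    std::ostringstream out;
    template <class T> void field(const T& v) { out << v << ' '; }
};

// Variadic helper mirroring the shape of serializer(ar, f1, f2, ...):
// visits every field with the same archive, in declaration order.
template <class Ar, class... Fields>
void serializer(Ar& ar, const Fields&... fields) {
    (ar.field(fields), ...); // C++17 fold expression
}

// A request struct in the same style as the ones removed above.
struct RecruitStorageRequestToy {
    static constexpr int file_identifier = 905920; // from the original struct
    std::string excludeMachine;
    bool criticalRecruitment = false;

    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, excludeMachine, criticalRecruitment);
    }
};

int main() {
    RecruitStorageRequestToy req{"machine-1", true};
    TextWriter w;
    req.serialize(w);
    std::cout << w.out.str() << "\n"; // "machine-1 1"
}
```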

View File

@ -216,7 +216,9 @@ ACTOR Future<Void> openDatabase(ClientData* db, int* clientCount, Reference<Asyn
++(*clientCount);
hasConnectedClients->set(true);
db->clientStatusInfoMap[req.reply.getEndpoint().getPrimaryAddress()] = ClientStatusInfo(req.traceLogGroup, req.supportedVersions, req.issues);
if(req.supportedVersions.size() > 0) {
db->clientStatusInfoMap[req.reply.getEndpoint().getPrimaryAddress()] = ClientStatusInfo(req.traceLogGroup, req.supportedVersions, req.issues);
}
while (db->clientInfo->get().read().id == req.knownClientInfoID && !db->clientInfo->get().read().forward.present()) {
choose {
@ -225,7 +227,9 @@ ACTOR Future<Void> openDatabase(ClientData* db, int* clientCount, Reference<Asyn
}
}
db->clientStatusInfoMap.erase(req.reply.getEndpoint().getPrimaryAddress());
if(req.supportedVersions.size() > 0) {
db->clientStatusInfoMap.erase(req.reply.getEndpoint().getPrimaryAddress());
}
req.reply.send( db->clientInfo->get() );

View File

@ -23,6 +23,7 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbrpc/Replication.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/FDBExecHelper.actor.h"
@ -45,8 +46,10 @@ class TCMachineTeamInfo;
ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self);
ACTOR Future<Void> removeWrongStoreType(DDTeamCollection* self);
struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
UID id;
DDTeamCollection* collection;
StorageServerInterface lastKnownInterface;
ProcessClass lastKnownClass;
vector<Reference<TCTeamInfo>> teams;
@ -63,13 +66,14 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
LocalityEntry localityEntry;
Promise<Void> updated;
AsyncVar<bool> wrongStoreTypeToRemove;
AsyncVar<bool> ssVersionTooFarBehind;
// A storage server's StoreType does not change.
// To change storeType for an ip:port, we destroy the old one and create a new one.
KeyValueStoreType storeType; // Storage engine type
TCServerInfo(StorageServerInterface ssi, ProcessClass processClass, bool inDesiredDC,
TCServerInfo(StorageServerInterface ssi, DDTeamCollection* collection, ProcessClass processClass, bool inDesiredDC,
Reference<LocalitySet> storageServerSet)
: id(ssi.id()), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0),
: id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0),
onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), inDesiredDC(inDesiredDC),
storeType(KeyValueStoreType::END) {
localityEntry = ((LocalityMap<UID>*) storageServerSet.getPtr())->add(ssi.locality, &id);
@ -80,6 +84,7 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
// If a storage server does not reply its storeType, it will be tracked by failure monitor and removed.
return (storeType == configStoreType || storeType == KeyValueStoreType::END);
}
~TCServerInfo();
};
struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> {
@ -109,51 +114,7 @@ struct TCMachineInfo : public ReferenceCounted<TCMachineInfo> {
}
};
ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
state StorageServerInterface ssi = server->lastKnownInterface;
state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
state Future<Void> resetRequest = Never();
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
state Future<Void> serverRemoved( server->onRemoved );
loop {
choose {
when( ErrorOr<GetStorageMetricsReply> rep = wait( metricsRequest ) ) {
if( rep.present() ) {
server->serverMetrics = rep;
if(server->updated.canBeSet()) {
server->updated.send(Void());
}
return Void();
}
metricsRequest = Never();
resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch );
}
when( std::pair<StorageServerInterface,ProcessClass> _ssi = wait( interfaceChanged ) ) {
ssi = _ssi.first;
interfaceChanged = server->onInterfaceChanged;
resetRequest = Void();
}
when( wait( serverRemoved ) ) {
return Void();
}
when( wait( resetRequest ) ) { //To prevent a tight spin loop
if(IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) {
resetRequest = IFailureMonitor::failureMonitor().onStateEqual(ssi.getStorageMetrics.getEndpoint(), FailureStatus(false));
}
else {
resetRequest = Never();
metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
}
}
}
}
}
ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server ) {
wait( updateServerMetrics( server.getPtr() ) );
return Void();
}
ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server);
// TeamCollection's machine team information
class TCMachineTeamInfo : public ReferenceCounted<TCMachineTeamInfo> {
@ -596,6 +557,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int64_t unhealthyServers;
std::map<int,int> priority_teams;
std::map<UID, Reference<TCServerInfo>> server_info;
std::map<Key, int> lagging_zones; // zone to number of storage servers lagging
AsyncVar<bool> disableFailingLaggingServers;
// machine_info has all machines info; key must be unique across processes on the same machine
std::map<Standalone<StringRef>, Reference<TCMachineInfo>> machine_info;
@ -721,6 +684,23 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
teamBuilder.cancel();
}
void addLaggingStorageServer(Key zoneId) {
lagging_zones[zoneId]++;
if (lagging_zones.size() > std::max(1, configuration.storageTeamSize - 1) && !disableFailingLaggingServers.get())
disableFailingLaggingServers.set(true);
}
void removeLaggingStorageServer(Key zoneId) {
auto iter = lagging_zones.find(zoneId);
ASSERT(iter != lagging_zones.end());
iter->second--;
ASSERT(iter->second >= 0);
if (iter->second == 0)
lagging_zones.erase(iter);
if (lagging_zones.size() <= std::max(1, configuration.storageTeamSize - 1) && disableFailingLaggingServers.get())
disableFailingLaggingServers.set(false);
}
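
Editor's note: `addLaggingStorageServer`/`removeLaggingStorageServer` keep a per-zone count of lagging storage servers and flip `disableFailingLaggingServers` once the number of affected zones exceeds `max(1, storageTeamSize - 1)`, i.e. once failing them all could drop a team below its replication target. A standalone sketch of that counter-plus-threshold logic (class and member names here are illustrative):

```cpp
#include <algorithm>
#include <cassert>
#include <iostream>
#include <map>
#include <string>

class LaggingZoneTracker {
public:
    explicit LaggingZoneTracker(int storageTeamSize)
        : threshold_(std::max(1, storageTeamSize - 1)) {}

    void add(const std::string& zoneId) {
        laggingPerZone_[zoneId]++;
        if ((int)laggingPerZone_.size() > threshold_) disableFailing_ = true;
    }

    void remove(const std::string& zoneId) {
        auto it = laggingPerZone_.find(zoneId);
        assert(it != laggingPerZone_.end() && it->second > 0);
        if (--it->second == 0) laggingPerZone_.erase(it);
        if ((int)laggingPerZone_.size() <= threshold_) disableFailing_ = false;
    }

    bool disableFailingLaggingServers() const { return disableFailing_; }

private:
    std::map<std::string, int> laggingPerZone_; // zone -> lagging server count
    int threshold_;
    bool disableFailing_ = false;
};

int main() {
    LaggingZoneTracker t(/*storageTeamSize=*/3); // threshold = 2 zones
    t.add("zone-a");
    t.add("zone-b");
    std::cout << t.disableFailingLaggingServers() << "\n"; // 0: 2 zones <= 2
    t.add("zone-c");
    std::cout << t.disableFailingLaggingServers() << "\n"; // 1: 3 zones > 2
    t.remove("zone-c");
    std::cout << t.disableFailingLaggingServers() << "\n"; // 0 again
}
```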
ACTOR static Future<Void> logOnCompletion( Future<Void> signal, DDTeamCollection* self ) {
wait(signal);
wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution));
@ -1040,7 +1020,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
TraceEvent(SevWarnAlways, "MissingLocality")
.detail("Server", i->first.uniqueID)
.detail("Locality", i->first.locality.toString());
auto addr = i->first.address();
auto addr = i->first.stableAddress();
self->invalidLocalityAddr.insert(AddressExclusion(addr.ip, addr.port));
if (self->checkInvalidLocalities.isReady()) {
self->checkInvalidLocalities = checkAndRemoveInvalidLocalityAddr(self);
@ -2255,6 +2235,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DoBuildTeams", self->doBuildTeams)
.trackLatest("TeamCollectionInfo");
}
} else {
self->lastBuildTeamsFailed = true;
}
self->evaluateTeamQuality();
@ -2297,7 +2279,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
allServers.push_back( newServer.id() );
TraceEvent("AddedStorageServer", distributorId).detail("ServerID", newServer.id()).detail("ProcessClass", processClass.toString()).detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token).detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress());
auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) );
auto &r = server_info[newServer.id()] = Reference<TCServerInfo>( new TCServerInfo( newServer, this, processClass, includedDCs.empty() || std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet ) );
// Establish the relation between server and machine
checkAndCreateMachine(r);
@ -2586,6 +2568,80 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}
};
TCServerInfo::~TCServerInfo() {
if (ssVersionTooFarBehind.get()) {
collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get());
}
}
ACTOR Future<Void> updateServerMetrics( TCServerInfo *server ) {
state StorageServerInterface ssi = server->lastKnownInterface;
state Future<ErrorOr<GetStorageMetricsReply>> metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
state Future<Void> resetRequest = Never();
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged( server->onInterfaceChanged );
state Future<Void> serverRemoved( server->onRemoved );
loop {
choose {
when( ErrorOr<GetStorageMetricsReply> rep = wait( metricsRequest ) ) {
if( rep.present() ) {
server->serverMetrics = rep;
if(server->updated.canBeSet()) {
server->updated.send(Void());
}
break;
}
metricsRequest = Never();
resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch );
}
when( std::pair<StorageServerInterface,ProcessClass> _ssi = wait( interfaceChanged ) ) {
ssi = _ssi.first;
interfaceChanged = server->onInterfaceChanged;
resetRequest = Void();
}
when( wait( serverRemoved ) ) {
return Void();
}
			when( wait( resetRequest ) ) { // To prevent a tight spin loop
if(IFailureMonitor::failureMonitor().getState(ssi.getStorageMetrics.getEndpoint()).isFailed()) {
resetRequest = IFailureMonitor::failureMonitor().onStateEqual(ssi.getStorageMetrics.getEndpoint(), FailureStatus(false));
}
else {
resetRequest = Never();
metricsRequest = ssi.getStorageMetrics.tryGetReply( GetStorageMetricsRequest(), TaskPriority::DataDistributionLaunch );
}
}
}
}
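	// Classify the server: mark it as too far behind if it has not reported for DD_SS_STUCK_TIME_LIMIT
	// seconds or its version lag exceeds DD_SS_FAILURE_VERSIONLAG; clear the flag only once the lag drops
	// below DD_SS_ALLOWED_VERSIONLAG, so the classification has hysteresis.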
if ( server->serverMetrics.get().lastUpdate < now() - SERVER_KNOBS->DD_SS_STUCK_TIME_LIMIT ) {
if (server->ssVersionTooFarBehind.get() == false) {
TraceEvent("StorageServerStuck", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("LastUpdate", server->serverMetrics.get().lastUpdate);
server->ssVersionTooFarBehind.set(true);
server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
} else if ( server->serverMetrics.get().versionLag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG ) {
if (server->ssVersionTooFarBehind.get() == false) {
TraceEvent("SSVersionDiffLarge", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("VersionLag", server->serverMetrics.get().versionLag);
server->ssVersionTooFarBehind.set(true);
server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
} else if ( server->serverMetrics.get().versionLag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG ) {
if (server->ssVersionTooFarBehind.get() == true) {
TraceEvent("SSVersionDiffNormal", server->collection->distributorId).detail("ServerId", server->id.toString()).detail("VersionLag", server->serverMetrics.get().versionLag);
server->ssVersionTooFarBehind.set(false);
server->collection->removeLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get());
}
}
return Void();
}
ACTOR Future<Void> updateServerMetrics( Reference<TCServerInfo> server) {
wait( updateServerMetrics( server.getPtr() ) );
return Void();
}
ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self, double extraDelay = 0) {
state int waitCount = 0;
loop {
@ -2858,6 +2914,14 @@ bool teamContainsFailedServer(DDTeamCollection* self, Reference<TCTeamInfo> team
self->excludedServers.get(ipaddr) == DDTeamCollection::Status::FAILED) {
return true;
}
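		// Also treat the server as failed if an exclusion matches its secondary address (a process may
		// listen on more than one address, e.g. with and without TLS).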
if(ssi.secondaryAddress().present()) {
AddressExclusion saddr(ssi.secondaryAddress().get().ip, ssi.secondaryAddress().get().port);
AddressExclusion sipaddr(ssi.secondaryAddress().get().ip);
if (self->excludedServers.get(saddr) == DDTeamCollection::Status::FAILED ||
self->excludedServers.get(sipaddr) == DDTeamCollection::Status::FAILED) {
return true;
}
}
}
return false;
}
@ -3332,7 +3396,7 @@ ACTOR Future<Void> waitHealthyZoneChange( DDTeamCollection* self ) {
}
}
ACTOR Future<Void> serverMetricsPolling( TCServerInfo *server) {
ACTOR Future<Void> serverMetricsPolling( TCServerInfo *server ) {
state double lastUpdate = now();
loop {
wait( updateServerMetrics( server ) );
@ -3479,6 +3543,7 @@ ACTOR Future<Void> storageServerTracker(
state ServerStatus status( false, false, server->lastKnownInterface.locality );
state bool lastIsUnhealthy = false;
state Future<Void> metricsTracker = serverMetricsPolling( server );
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged = server->onInterfaceChanged;
state Future<Void> storeTypeTracker = keyValueStoreTypeTracker(self, server);
@ -3489,7 +3554,7 @@ ACTOR Future<Void> storageServerTracker(
try {
loop {
status.isUndesired = false;
status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get();
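			// A server whose version is too far behind is undesired, unless failing of lagging servers has
			// been disabled because too many zones are lagging (disableFailingLaggingServers).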
status.isWrongConfiguration = false;
hasWrongDC = !isCorrectDC(self, server);
hasInvalidLocality =
@ -3569,29 +3634,41 @@ ACTOR Future<Void> storageServerTracker(
// If the storage server is in the excluded servers list, it is undesired
NetworkAddress a = server->lastKnownInterface.address();
state AddressExclusion addr( a.ip, a.port );
state AddressExclusion ipaddr( a.ip );
state DDTeamCollection::Status addrStatus = self->excludedServers.get(addr);
state DDTeamCollection::Status ipaddrStatus = self->excludedServers.get(ipaddr);
if (addrStatus != DDTeamCollection::Status::NONE || ipaddrStatus != DDTeamCollection::Status::NONE) {
AddressExclusion worstAddr( a.ip, a.port );
DDTeamCollection::Status worstStatus = self->excludedServers.get( worstAddr );
otherChanges.push_back( self->excludedServers.onChange( worstAddr ) );
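					// Consider every exclusion entry that could cover this server: the ip-only form of the
					// primary address and, when a secondary address exists, its ip:port and ip-only forms;
					// keep the most severe status seen.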
for(int i = 0; i < 3; i++) {
if(i > 0 && !server->lastKnownInterface.secondaryAddress().present()) {
break;
}
AddressExclusion testAddr;
if(i == 0) testAddr = AddressExclusion(a.ip);
else if(i == 1) testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip, server->lastKnownInterface.secondaryAddress().get().port);
else if(i == 2) testAddr = AddressExclusion(server->lastKnownInterface.secondaryAddress().get().ip);
DDTeamCollection::Status testStatus = self->excludedServers.get(testAddr);
if(testStatus > worstStatus) {
worstStatus = testStatus;
worstAddr = testAddr;
}
otherChanges.push_back( self->excludedServers.onChange( testAddr ) );
}
if (worstStatus != DDTeamCollection::Status::NONE) {
TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
.detail("Server", server->id)
.detail("Excluded",
ipaddrStatus == DDTeamCollection::Status::NONE ? addr.toString() : ipaddr.toString());
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
status.isUndesired = true;
status.isWrongConfiguration = true;
if (addrStatus == DDTeamCollection::Status::FAILED ||
ipaddrStatus == DDTeamCollection::Status::FAILED) {
if (worstStatus == DDTeamCollection::Status::FAILED) {
TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId)
.detail("Address", addr.toString())
.detail("ServerID", server->id);
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
wait(removeKeysFromFailedServer(cx, server->id, self->lock));
if (BUGGIFY) wait(delay(5.0));
self->shardsAffectedByTeamFailure->eraseServer(server->id);
}
}
otherChanges.push_back( self->excludedServers.onChange( addr ) );
otherChanges.push_back( self->excludedServers.onChange( ipaddr ) );
failureTracker = storageServerFailureTracker(self, server, cx, &status, addedVersion);
		// We need to recruit new storage servers if the key-value store type has changed
@ -3599,6 +3676,7 @@ ACTOR Future<Void> storageServerTracker(
self->restartRecruiting.trigger();
}
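			// When a previously unhealthy server becomes healthy again, rebuild teams if it is in fewer
			// teams than the target or the last team-building attempt failed.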
if (lastIsUnhealthy && !status.isUnhealthy() &&
( server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) {
self->doBuildTeams = true;
@ -3753,6 +3831,8 @@ ACTOR Future<Void> storageServerTracker(
server->wakeUpTracker = Promise<Void>();
}
when(wait(storeTypeTracker)) {}
when(wait(server->ssVersionTooFarBehind.onChange())) { }
when(wait(self->disableFailingLaggingServers.onChange())) { }
}
if (recordTeamCollectionInfo) {
@ -3861,7 +3941,7 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) {
int numExistingSS = 0;
for (auto& server : self->server_info) {
const NetworkAddress& netAddr = server.second->lastKnownInterface.address();
const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress();
AddressExclusion usedAddr(netAddr.ip, netAddr.port);
if (usedAddr == addr) {
++numExistingSS;
@ -3875,10 +3955,10 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self, RecruitStorageReply
// SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes
self->recruitingStream.set(self->recruitingStream.get() + 1);
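	// Both numExistingSSOnAddr and recruitingLocalities are keyed by the worker's stable address, so a
	// process that exposes more than one address is tracked under a single entry.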
const NetworkAddress& netAddr = candidateWorker.worker.address();
const NetworkAddress& netAddr = candidateWorker.worker.stableAddress();
AddressExclusion workerAddr(netAddr.ip, netAddr.port);
if (numExistingSSOnAddr(self, workerAddr) <= 2 &&
self->recruitingLocalities.find(candidateWorker.worker.address()) == self->recruitingLocalities.end()) {
self->recruitingLocalities.find(candidateWorker.worker.stableAddress()) == self->recruitingLocalities.end()) {
		// Only allow at most 2 storage servers on an address, because
		// too many storage servers on the same address (i.e., process) can cause OOM.
		// Ask the candidateWorker to initialize an SS only if the worker does not have a pending request
@ -3899,7 +3979,7 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self, RecruitStorageReply
.detail("RecruitingStream", self->recruitingStream.get());
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.address());
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
state ErrorOr<InitializeStorageReply> newServer =
wait(candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution));
if (newServer.isError()) {
@ -3910,7 +3990,7 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self, RecruitStorageReply
wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution));
}
self->recruitingIds.erase(interfaceId);
self->recruitingLocalities.erase(candidateWorker.worker.address());
self->recruitingLocalities.erase(candidateWorker.worker.stableAddress());
TraceEvent("DDRecruiting")
.detail("Primary", self->primary)
@ -3956,7 +4036,7 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection* self, Reference<AsyncVar<
TraceEvent(SevDebug, "DDRecruitExcl1")
.detail("Primary", self->primary)
.detail("Excluding", s->second->lastKnownInterface.address());
auto addr = s->second->lastKnownInterface.address();
auto addr = s->second->lastKnownInterface.stableAddress();
AddressExclusion addrExcl(addr.ip, addr.port);
exclusions.insert(addrExcl);
numSSPerAddr[addrExcl]++; // increase from 0
@ -4007,8 +4087,8 @@ ACTOR Future<Void> storageRecruiter( DDTeamCollection* self, Reference<AsyncVar<
choose {
when( RecruitStorageReply candidateWorker = wait( fCandidateWorker ) ) {
AddressExclusion candidateSSAddr(candidateWorker.worker.address().ip,
candidateWorker.worker.address().port);
AddressExclusion candidateSSAddr(candidateWorker.worker.stableAddress().ip,
candidateWorker.worker.stableAddress().port);
int numExistingSS = numSSPerAddr[candidateSSAddr];
if (numExistingSS >= 2) {
TraceEvent(SevWarnAlways, "StorageRecruiterTooManySSOnSameAddr", self->distributorId)
@ -4334,12 +4414,12 @@ ACTOR Future<Void> monitorBatchLimitedTime(Reference<AsyncVar<ServerDBInfo>> db,
loop {
wait( delay(SERVER_KNOBS->METRIC_UPDATE_RATE) );
state Reference<ProxyInfo> proxies(new ProxyInfo(db->get().client.proxies, db->get().myLocality));
state Reference<ProxyInfo> proxies(new ProxyInfo(db->get().client.proxies));
choose {
when (wait(db->onChange())) {}
when (GetHealthMetricsReply reply = wait(proxies->size() ?
loadBalance(proxies, &MasterProxyInterface::getHealthMetrics, GetHealthMetricsRequest(false))
basicLoadBalance(proxies, &MasterProxyInterface::getHealthMetrics, GetHealthMetricsRequest(false))
: Never())) {
if (reply.healthMetrics.batchLimited) {
*lastLimited = now();
@ -4742,7 +4822,7 @@ ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest
// Go through storage server interfaces and translate Address -> server ID (UID)
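	// An exclusion covers a storage server if it matches either the primary address or, when present,
	// the secondary address.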
for (const AddressExclusion& excl : req.exclusions) {
for (const auto& ssi : ssis) {
if (excl.excludes(ssi.address())) {
if (excl.excludes(ssi.address()) || (ssi.secondaryAddress().present() && excl.excludes(ssi.secondaryAddress().get()))) {
excludeServerIDs.push_back(ssi.id());
}
}
@ -4844,7 +4924,7 @@ DDTeamCollection* testTeamCollection(int teamSize, Reference<IReplicationPolicy>
interface.locality.set(LiteralStringRef("machineid"), Standalone<StringRef>(std::to_string(id)));
interface.locality.set(LiteralStringRef("zoneid"), Standalone<StringRef>(std::to_string(id % 5)));
interface.locality.set(LiteralStringRef("data_hall"), Standalone<StringRef>(std::to_string(id % 3)));
collection->server_info[uid] = Reference<TCServerInfo>(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet));
collection->server_info[uid] = Reference<TCServerInfo>(new TCServerInfo(interface, collection, ProcessClass(), true, collection->storageServerSet));
collection->server_status.set(uid, ServerStatus(false, false, interface.locality));
collection->checkAndCreateMachine(collection->server_info[uid]);
}
@ -4885,7 +4965,7 @@ DDTeamCollection* testMachineTeamCollection(int teamSize, Reference<IReplication
interface.locality.set(LiteralStringRef("data_hall"), Standalone<StringRef>(std::to_string(data_hall_id)));
interface.locality.set(LiteralStringRef("dcid"), Standalone<StringRef>(std::to_string(dc_id)));
collection->server_info[uid] =
Reference<TCServerInfo>(new TCServerInfo(interface, ProcessClass(), true, collection->storageServerSet));
Reference<TCServerInfo>(new TCServerInfo(interface, collection, ProcessClass(), true, collection->storageServerSet));
collection->server_status.set(uid, ServerStatus(false, false, interface.locality));
}
