Merge remote-tracking branch 'origin' into atomically-update-dependent-knobs

2020-04-08 12:26:21 -07:00 · 2020-04-08 12:26:21 -07:00 · 52860043c9
parent 92b9fdf4c4 f500353368
commit 52860043c9
143 changed files with 3648 additions and 228 deletions
--- a/26
+++ b/26
@ -479,3 +479,29 @@ SHIBUKAWA Yoshiki (sphinxcontrib-rubydomain)
 	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Armon Dadgar (ART)
+    Copyright (c) 2012, Armon Dadgar
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the organization nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL ARMON DADGAR BE LIABLE FOR ANY
+    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -18,7 +18,7 @@
 # limitations under the License.
 cmake_minimum_required(VERSION 3.13)
 project(foundationdb
-  VERSION 7.0.0
+  VERSION 6.3.0
  DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions."
  HOMEPAGE_URL "http://www.foundationdb.org/"
  LANGUAGES C CXX ASM)
--- a/2
+++ b/2
@ -35,7 +35,7 @@ ifeq ($(NIGHTLY),true)
 	CFLAGS += -DFDB_CLEAN_BUILD
 endif

-BOOST_BASENAME ?= boost_1_67_0
+BOOST_BASENAME ?= boost_1_72_0
 ifeq ($(PLATFORM),Linux)
  PLATFORM := linux

--- a/README.md
+++ b/README.md
@ -33,8 +33,8 @@ CMake-based build system. Both of them should currently work for most users,
 and CMake should be the preferred choice as it will eventually become the only
 build system available.

-If compiling for local development, please set -DUSE_WERROR=ON in
-cmake. Our CI compiles with -Werror on, so this way you'll find out about
+If compiling for local development, please set `-DUSE_WERROR=ON` in
+cmake. Our CI compiles with `-Werror` on, so this way you'll find out about
 compiler warnings that break the build earlier.

 ## CMake
@ -51,8 +51,8 @@ Mac OS - for Windows see below):
 1. Create a build directory (you can have the build directory anywhere you
   like): `mkdir build`
 1. `cd build`
-1. `cmake -DBOOST_ROOT=<PATH_TO_BOOST> <PATH_TO_FOUNDATIONDB_DIRECTORY>`
-1. `make`
+1. `cmake -GNinja -DBOOST_ROOT=<PATH_TO_BOOST> <PATH_TO_FOUNDATIONDB_DIRECTORY>`
+1. `ninja`

 CMake will try to find its dependencies. However, for LibreSSL this can be often
 problematic (especially if OpenSSL is installed as well). For that we recommend
@ -61,7 +61,7 @@ LibreSSL is installed under `/usr/local/libressl-2.8.3`, you should call cmake l
 this:

 ```
-cmake -DLibreSSL_ROOT=/usr/local/libressl-2.8.3/ ../foundationdb
+cmake -GNinja -DLibreSSL_ROOT=/usr/local/libressl-2.8.3/ ../foundationdb
 ```

 FoundationDB will build just fine without LibreSSL, however, the resulting
@ -133,8 +133,8 @@ If you want to create a package you have to tell cmake what platform it is for.
 And then you can build by simply calling `cpack`. So for debian, call:

 ```
-cmake <FDB_SOURCE_DIR>
-make
+cmake -GNinja <FDB_SOURCE_DIR>
+ninja
 cpack -G DEB
 ```

@ -142,21 +142,21 @@ For RPM simply replace `DEB` with `RPM`.

 ### MacOS

-The build under MacOS will work the same way as on Linux. To get LibreSSL and boost you
-can use [Homebrew](https://brew.sh/). LibreSSL will not be installed in
-`/usr/local` instead it will stay in `/usr/local/Cellar`. So the cmake command
-will look something like this:
+The build under MacOS will work the same way as on Linux. To get LibreSSL,
+boost, and ninja you can use [Homebrew](https://brew.sh/). LibreSSL will not be
+installed in `/usr/local`; instead, it will stay in `/usr/local/Cellar`. So the
+cmake command will look something like this:

 ```sh
-cmake -DLibreSSL_ROOT=/usr/local/Cellar/libressl/2.8.3 <PATH_TO_FOUNDATIONDB_SOURCE>
+cmake -GNinja -DLibreSSL_ROOT=/usr/local/Cellar/libressl/2.8.3 <PATH_TO_FOUNDATIONDB_SOURCE>
 ```

 To generate a installable package, you have to call CMake with the corresponding
 arguments and then use cpack to generate the package:

 ```sh
-cmake <FDB_SOURCE_DIR>
-make
+cmake -GNinja <FDB_SOURCE_DIR>
+ninja
 cpack -G productbuild
 ```

--- a/bindings/bindingtester/init.py
+++ b/bindings/bindingtester/init.py
@ -26,7 +26,7 @@ sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..', '..', 'bindings',

 import util

-FDB_API_VERSION = 700
+FDB_API_VERSION = 630

 LOGGING = {
    'version': 1,
--- a/bindings/bindingtester/bindingtester.py
+++ b/bindings/bindingtester/bindingtester.py
@ -157,7 +157,7 @@ def choose_api_version(selected_api_version, tester_min_version, tester_max_vers
            api_version = min_version
        elif random.random() < 0.9:
            api_version = random.choice([v for v in [13, 14, 16, 21, 22, 23, 100, 200, 300, 400, 410, 420, 430,
-                                                     440, 450, 460, 500, 510, 520, 600, 610, 620, 700] if v >= min_version and v <= max_version])
+                                                     440, 450, 460, 500, 510, 520, 600, 610, 620, 630] if v >= min_version and v <= max_version])
        else:
            api_version = random.randint(min_version, max_version)

--- a/bindings/bindingtester/known_testers.py
+++ b/bindings/bindingtester/known_testers.py
@ -20,7 +20,7 @@

 import os

-MAX_API_VERSION = 700
+MAX_API_VERSION = 630
 COMMON_TYPES = ['null', 'bytes', 'string', 'int', 'uuid', 'bool', 'float', 'double', 'tuple']
 ALL_TYPES = COMMON_TYPES + ['versionstamp']

--- a/bindings/bindingtester/tests/scripted.py
+++ b/bindings/bindingtester/tests/scripted.py
@ -34,7 +34,7 @@ fdb.api_version(FDB_API_VERSION)


 class ScriptedTest(Test):
-    TEST_API_VERSION = 700
+    TEST_API_VERSION = 630

    def __init__(self, subspace):
        super(ScriptedTest, self).__init__(subspace, ScriptedTest.TEST_API_VERSION, ScriptedTest.TEST_API_VERSION)
--- a/bindings/c/fdb_c.cpp
+++ b/bindings/c/fdb_c.cpp
@ -18,7 +18,7 @@
 * limitations under the License.
 */

-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #define FDB_INCLUDE_LEGACY_TYPES

 #include "fdbclient/MultiVersionTransaction.h"
--- a/bindings/c/fdb_c.vcxproj
+++ b/bindings/c/fdb_c.vcxproj
@ -67,14 +67,14 @@ FOR /F "tokens=1" %%i in ('hg.exe id') do copy /Y "$(TargetPath)" "$(TargetPath)
    </PostBuildEvent>
  </ItemDefinitionGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <IncludePath>..\..\;C:\Program Files\boost_1_67_0;$(IncludePath)</IncludePath>
+    <IncludePath>..\..\;C:\Program Files\boost_1_72_0;$(IncludePath)</IncludePath>
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
-    <IncludePath>..\..\;C:\Program Files\boost_1_67_0;$(IncludePath)</IncludePath>
+    <IncludePath>..\..\;C:\Program Files\boost_1_72_0;$(IncludePath)</IncludePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
--- a/bindings/c/foundationdb/fdb_c.h
+++ b/bindings/c/foundationdb/fdb_c.h
@ -28,10 +28,10 @@
 #endif

 #if !defined(FDB_API_VERSION)
-#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 700)
+#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 630)
 #elif FDB_API_VERSION < 13
 #error API version no longer supported (upgrade to 13)
-#elif FDB_API_VERSION > 700
+#elif FDB_API_VERSION > 630
 #error Requested API version requires a newer version of this header
 #endif

@ -91,7 +91,7 @@ extern "C" {
    DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_add_network_thread_completion_hook(void (*hook)(void*), void *hook_parameter);

 #pragma pack(push, 4)
-#if FDB_API_VERSION >= 700
+#if FDB_API_VERSION >= 630
    typedef struct keyvalue {
        const uint8_t* key;
        int key_length;
--- a/bindings/c/test/mako/mako.h
+++ b/bindings/c/test/mako/mako.h
@ -3,7 +3,7 @@
 #pragma once

 #ifndef FDB_API_VERSION
-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #endif

 #include <foundationdb/fdb_c.h>
--- a/bindings/c/test/performance_test.c
+++ b/bindings/c/test/performance_test.c
@ -603,7 +603,7 @@ void runTests(struct ResultSet *rs) {
 int main(int argc, char **argv) {
 	srand(time(NULL));
 	struct ResultSet *rs = newResultSet();
-	checkError(fdb_select_api_version(700), "select API version", rs);
+	checkError(fdb_select_api_version(630), "select API version", rs);
 	printf("Running performance test at client version: %s\n", fdb_get_client_version());

 	valueStr = (uint8_t*)malloc((sizeof(uint8_t))*valueSize);
--- a/bindings/c/test/ryw_benchmark.c
+++ b/bindings/c/test/ryw_benchmark.c
@ -244,7 +244,7 @@ void runTests(struct ResultSet *rs) {
 int main(int argc, char **argv) {
 	srand(time(NULL));
 	struct ResultSet *rs = newResultSet();
-	checkError(fdb_select_api_version(700), "select API version", rs);
+	checkError(fdb_select_api_version(630), "select API version", rs);
 	printf("Running RYW Benchmark test at client version: %s\n", fdb_get_client_version());

 	keys = generateKeys(numKeys, keySize);
--- a/bindings/c/test/test.h
+++ b/bindings/c/test/test.h
@ -29,7 +29,7 @@
 #include <inttypes.h>

 #ifndef FDB_API_VERSION
-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #endif

 #include <foundationdb/fdb_c.h>
--- a/bindings/c/test/txn_size_test.c
+++ b/bindings/c/test/txn_size_test.c
@ -97,7 +97,7 @@ void runTests(struct ResultSet *rs) {
 int main(int argc, char **argv) {
 	srand(time(NULL));
 	struct ResultSet *rs = newResultSet();
-	checkError(fdb_select_api_version(700), "select API version", rs);
+	checkError(fdb_select_api_version(630), "select API version", rs);
 	printf("Running performance test at client version: %s\n", fdb_get_client_version());

 	keys = generateKeys(numKeys, KEY_SIZE);
--- a/bindings/c/test/workloads/SimpleWorkload.cpp
+++ b/bindings/c/test/workloads/SimpleWorkload.cpp
@ -18,7 +18,7 @@
 * limitations under the License.
 */

-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #include "foundationdb/fdb_c.h"
 #undef DLLEXPORT
 #include "workloads.h"
@ -258,7 +258,7 @@ struct SimpleWorkload : FDBWorkload {
 		insertsPerTx = context->getOption("insertsPerTx", 100ul);
 		opsPerTx = context->getOption("opsPerTx", 100ul);
 		runFor = context->getOption("runFor", 10.0);
-		auto err = fdb_select_api_version(700);
+		auto err = fdb_select_api_version(630);
 		if (err) {
 			context->trace(FDBSeverity::Info, "SelectAPIVersionFailed",
 			               { { "Error", std::string(fdb_get_error(err)) } });
--- a/bindings/flow/fdb_flow.actor.cpp
+++ b/bindings/flow/fdb_flow.actor.cpp
@ -36,7 +36,7 @@ THREAD_FUNC networkThread(void* fdb) {
 }

 ACTOR Future<Void> _test() {
-	API *fdb = FDB::API::selectAPIVersion(700);
+	API *fdb = FDB::API::selectAPIVersion(630);
 	auto db = fdb->createDatabase();
 	state Reference<Transaction> tr = db->createTransaction();

@ -79,7 +79,7 @@ ACTOR Future<Void> _test() {
 }

 void fdb_flow_test() {
-	API *fdb = FDB::API::selectAPIVersion(700);
+	API *fdb = FDB::API::selectAPIVersion(630);
 	fdb->setupNetwork();
 	startThread(networkThread, fdb);

--- a/bindings/flow/fdb_flow.h
+++ b/bindings/flow/fdb_flow.h
@ -23,7 +23,7 @@

 #include <flow/flow.h>

-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #include <bindings/c/foundationdb/fdb_c.h>
 #undef DLLEXPORT

--- a/bindings/flow/fdb_flow.vcxproj
+++ b/bindings/flow/fdb_flow.vcxproj
@ -79,11 +79,11 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>..\..\;C:\Program Files\boost_1_67_0;$(IncludePath)</IncludePath>
+    <IncludePath>..\..\;C:\Program Files\boost_1_72_0;$(IncludePath)</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>..\..\;C:\Program Files\boost_1_67_0;$(IncludePath)</IncludePath>
+    <IncludePath>..\..\;C:\Program Files\boost_1_72_0;$(IncludePath)</IncludePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
    <ClCompile>
--- a/bindings/flow/tester/Tester.actor.cpp
+++ b/bindings/flow/tester/Tester.actor.cpp
@ -1817,7 +1817,7 @@ ACTOR void _test_versionstamp() {
 	try {
 		g_network = newNet2(TLSConfig());

-		API *fdb = FDB::API::selectAPIVersion(700);
+		API *fdb = FDB::API::selectAPIVersion(630);

 		fdb->setupNetwork();
 		startThread(networkThread, fdb);
--- a/bindings/flow/tester/fdb_flow_tester.vcxproj
+++ b/bindings/flow/tester/fdb_flow_tester.vcxproj
@ -58,13 +58,13 @@
    <LinkIncremental>true</LinkIncremental>
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
-    <IncludePath>$(IncludePath);../../../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../../../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
    <LinkIncremental>false</LinkIncremental>
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <ItemDefinitionGroup>
    <ClCompile>
--- a/bindings/go/README.md
+++ b/bindings/go/README.md
@ -9,7 +9,7 @@ This package requires:
 - [Mono](http://www.mono-project.com/) (macOS or Linux) or [Visual Studio](https://www.visualstudio.com/) (Windows)  (build-time only)
 - FoundationDB C API 2.0.x-6.1.x (part of the [FoundationDB client packages](https://apple.github.io/foundationdb/downloads.html#c))

-Use of this package requires the selection of a FoundationDB API version at runtime. This package currently supports FoundationDB API versions 200-700.
+Use of this package requires the selection of a FoundationDB API version at runtime. This package currently supports FoundationDB API versions 200-630.

 To install this package, you can run the "fdb-go-install.sh" script (for versions 5.0.x and greater):

--- a/bindings/go/src/fdb/cluster.go
+++ b/bindings/go/src/fdb/cluster.go
@ -22,7 +22,7 @@

 package fdb

-// #define FDB_API_VERSION 700
+// #define FDB_API_VERSION 630
 // #include <foundationdb/fdb_c.h>
 import "C"

--- a/bindings/go/src/fdb/database.go
+++ b/bindings/go/src/fdb/database.go
@ -22,7 +22,7 @@

 package fdb

-// #define FDB_API_VERSION 700
+// #define FDB_API_VERSION 630
 // #include <foundationdb/fdb_c.h>
 import "C"

--- a/bindings/go/src/fdb/doc.go
+++ b/bindings/go/src/fdb/doc.go
@ -46,7 +46,7 @@ A basic interaction with the FoundationDB API is demonstrated below:

    func main() {
        // Different API versions may expose different runtime behaviors.
-        fdb.MustAPIVersion(700)
+        fdb.MustAPIVersion(630)

        // Open the default database from the system cluster
        db := fdb.MustOpenDefault()
--- a/bindings/go/src/fdb/errors.go
+++ b/bindings/go/src/fdb/errors.go
@ -22,7 +22,7 @@

 package fdb

-// #define FDB_API_VERSION 700
+// #define FDB_API_VERSION 630
 // #include <foundationdb/fdb_c.h>
 import "C"

--- a/bindings/go/src/fdb/fdb.go
+++ b/bindings/go/src/fdb/fdb.go
@ -22,7 +22,7 @@

 package fdb

-// #define FDB_API_VERSION 700
+// #define FDB_API_VERSION 630
 // #include <foundationdb/fdb_c.h>
 // #include <stdlib.h>
 import "C"
@ -108,7 +108,7 @@ func (opt NetworkOptions) setOpt(code int, param []byte) error {
 // library, an error will be returned. APIVersion must be called prior to any
 // other functions in the fdb package.
 //
-// Currently, this package supports API versions 200 through 700.
+// Currently, this package supports API versions 200 through 630.
 //
 // Warning: When using the multi-version client API, setting an API version that
 // is not supported by a particular client library will prevent that client from
@ -116,7 +116,7 @@ func (opt NetworkOptions) setOpt(code int, param []byte) error {
 // the API version of your application after upgrading your client until the
 // cluster has also been upgraded.
 func APIVersion(version int) error {
-	headerVersion := 700
+	headerVersion := 630

 	networkMutex.Lock()
 	defer networkMutex.Unlock()
@ -128,7 +128,7 @@ func APIVersion(version int) error {
 		return errAPIVersionAlreadySet
 	}

-	if version < 200 || version > 700 {
+	if version < 200 || version > 630 {
 		return errAPIVersionNotSupported
 	}

--- a/bindings/go/src/fdb/futures.go
+++ b/bindings/go/src/fdb/futures.go
@ -23,7 +23,7 @@
 package fdb

 //  #cgo LDFLAGS: -lfdb_c -lm
-//  #define FDB_API_VERSION 700
+//  #define FDB_API_VERSION 630
 //  #include <foundationdb/fdb_c.h>
 //  #include <string.h>
 //
--- a/bindings/go/src/fdb/range.go
+++ b/bindings/go/src/fdb/range.go
@ -22,7 +22,7 @@

 package fdb

-// #define FDB_API_VERSION 700
+// #define FDB_API_VERSION 630
 // #include <foundationdb/fdb_c.h>
 import "C"

--- a/bindings/go/src/fdb/transaction.go
+++ b/bindings/go/src/fdb/transaction.go
@ -22,7 +22,7 @@

 package fdb

-// #define FDB_API_VERSION 700
+// #define FDB_API_VERSION 630
 // #include <foundationdb/fdb_c.h>
 import "C"

--- a/bindings/java/JavaWorkload.cpp
+++ b/bindings/java/JavaWorkload.cpp
@ -19,7 +19,7 @@
 */

 #include <foundationdb/ClientWorkload.h>
-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #include <foundationdb/fdb_c.h>

 #include <jni.h>
@ -370,7 +370,7 @@ struct JVM {
 		jmethodID selectMethod =
 		    env->GetStaticMethodID(fdbClass, "selectAPIVersion", "(I)Lcom/apple/foundationdb/FDB;");
 		checkException();
-		auto fdbInstance = env->CallStaticObjectMethod(fdbClass, selectMethod, jint(700));
+		auto fdbInstance = env->CallStaticObjectMethod(fdbClass, selectMethod, jint(630));
 		checkException();
 		env->CallObjectMethod(fdbInstance, getMethod(fdbClass, "disableShutdownHook", "()V"));
 		checkException();
--- a/bindings/java/fdbJNI.cpp
+++ b/bindings/java/fdbJNI.cpp
@ -21,7 +21,7 @@
 #include <jni.h>
 #include <string.h>

-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630

 #include <foundationdb/fdb_c.h>

--- a/bindings/java/fdb_java.vcxproj
+++ b/bindings/java/fdb_java.vcxproj
@ -45,7 +45,7 @@
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup>
-    <IncludePath>..\..\;C:\Program Files\Java\jdk6\include\win32;C:\Program Files\Java\jdk6\include;C:\Program Files\boost_1_67_0;$(IncludePath)</IncludePath>
+    <IncludePath>..\..\;C:\Program Files\Java\jdk6\include\win32;C:\Program Files\Java\jdk6\include;C:\Program Files\boost_1_72_0;$(IncludePath)</IncludePath>
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
  </PropertyGroup>
--- a/bindings/java/src/main/com/apple/foundationdb/FDB.java
+++ b/bindings/java/src/main/com/apple/foundationdb/FDB.java
@ -35,7 +35,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 *   This call is required before using any other part of the API. The call allows
 *   an error to be thrown at this point to prevent client code from accessing a later library
 *   with incorrect assumptions from the current version. The API version documented here is version
- *   {@code 700}.<br><br>
+ *   {@code 630}.<br><br>
 *  FoundationDB encapsulates multiple versions of its interface by requiring
 *   the client to explicitly specify the version of the API it uses. The purpose
 *   of this design is to allow you to upgrade the server, client libraries, or
@ -181,8 +181,8 @@ public class FDB {
 		}
 		if(version < 510)
 			throw new IllegalArgumentException("API version not supported (minimum 510)");
-		if(version > 700)
-			throw new IllegalArgumentException("API version not supported (maximum 700)");
+		if(version > 630)
+			throw new IllegalArgumentException("API version not supported (maximum 630)");

 		Select_API_version(version);
 		singleton = new FDB(version);
--- a/bindings/java/src/main/overview.html.in
+++ b/bindings/java/src/main/overview.html.in
@ -13,7 +13,7 @@ and then added to your classpath.<br>
 <h1>Getting started</h1>
 To start using FoundationDB from Java, create an instance of the 
 {@link com.apple.foundationdb.FDB FoundationDB API interface} with the version of the
-API that you want to use (this release of the FoundationDB Java API supports versions between {@code 510} and {@code 700}).
+API that you want to use (this release of the FoundationDB Java API supports versions between {@code 510} and {@code 630}).
 With this API object you can then open {@link com.apple.foundationdb.Cluster Cluster}s and
 {@link com.apple.foundationdb.Database Database}s and start using
 {@link com.apple.foundationdb.Transaction Transaction}s.
@ -29,7 +29,7 @@ import com.apple.foundationdb.tuple.Tuple;

 public class Example {
  public static void main(String[] args) {
-    FDB fdb = FDB.selectAPIVersion(700);
+    FDB fdb = FDB.selectAPIVersion(630);

    try(Database db = fdb.open()) {
      // Run an operation on the database
--- a/bindings/java/src/test/com/apple/foundationdb/test/AbstractTester.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/AbstractTester.java
@ -27,7 +27,7 @@ import com.apple.foundationdb.Database;
 import com.apple.foundationdb.FDB;

 public abstract class AbstractTester {
-	public static final int API_VERSION = 700;
+	public static final int API_VERSION = 630;
 	protected static final int NUM_RUNS = 25;
 	protected static final Charset ASCII = Charset.forName("ASCII");

--- a/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java
@ -33,7 +33,7 @@ public class BlockingBenchmark {
 	private static final int PARALLEL = 100;

 	public static void main(String[] args) throws InterruptedException {
-		FDB fdb = FDB.selectAPIVersion(700);
+		FDB fdb = FDB.selectAPIVersion(630);

 		// The cluster file DOES NOT need to be valid, although it must exist.
 		//  This is because the database is never really contacted in this test.
--- a/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java
@ -48,7 +48,7 @@ public class ConcurrentGetSetGet {
 	}

 	public static void main(String[] args) {
-		try(Database database = FDB.selectAPIVersion(700).open()) {
+		try(Database database = FDB.selectAPIVersion(630).open()) {
 			new ConcurrentGetSetGet().apply(database);
 		}
 	}
--- a/bindings/java/src/test/com/apple/foundationdb/test/DirectoryTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/DirectoryTest.java
@ -33,7 +33,7 @@ import com.apple.foundationdb.directory.DirectorySubspace;
 public class DirectoryTest {
 	public static void main(String[] args) throws Exception {
 		try {
-			FDB fdb = FDB.selectAPIVersion(700);
+			FDB fdb = FDB.selectAPIVersion(630);
 			try(Database db = fdb.open()) {
 				runTests(db);
 			}
--- a/bindings/java/src/test/com/apple/foundationdb/test/Example.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/Example.java
@ -26,7 +26,7 @@ import com.apple.foundationdb.tuple.Tuple;

 public class Example {
 	public static void main(String[] args) {
-		FDB fdb = FDB.selectAPIVersion(700);
+		FDB fdb = FDB.selectAPIVersion(630);

 		try(Database db = fdb.open()) {
 			// Run an operation on the database
--- a/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java
@ -31,7 +31,7 @@ public class IterableTest {
 	public static void main(String[] args) throws InterruptedException {
 		final int reps = 1000;
 		try {
-			FDB fdb = FDB.selectAPIVersion(700);
+			FDB fdb = FDB.selectAPIVersion(630);
 			try(Database db = fdb.open()) {
 				runTests(reps, db);
 			}
--- a/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java
@ -34,7 +34,7 @@ import com.apple.foundationdb.tuple.ByteArrayUtil;
 public class LocalityTests {

 	public static void main(String[] args) {
-		FDB fdb = FDB.selectAPIVersion(700);
+		FDB fdb = FDB.selectAPIVersion(630);
 		try(Database database = fdb.open(args[0])) {
 			try(Transaction tr = database.createTransaction()) {
 				String[] keyAddresses = LocalityUtil.getAddressesForKey(tr, "a".getBytes()).join();
--- a/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java
@ -43,7 +43,7 @@ public class ParallelRandomScan {
 	private static final int PARALLELISM_STEP = 5;

 	public static void main(String[] args) throws InterruptedException {
-		FDB api = FDB.selectAPIVersion(700);
+		FDB api = FDB.selectAPIVersion(630);
 		try(Database database = api.open(args[0])) {
 			for(int i = PARALLELISM_MIN; i <= PARALLELISM_MAX; i += PARALLELISM_STEP) {
 				runTest(database, i, ROWS, DURATION_MS);
--- a/bindings/java/src/test/com/apple/foundationdb/test/RangeTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/RangeTest.java
@ -34,7 +34,7 @@ import com.apple.foundationdb.Transaction;
 import com.apple.foundationdb.async.AsyncIterable;

 public class RangeTest {
-	private static final int API_VERSION = 700;
+	private static final int API_VERSION = 630;

 	public static void main(String[] args) {
 		System.out.println("About to use version " + API_VERSION);
--- a/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java
@ -34,7 +34,7 @@ public class SerialInsertion {
 	private static final int NODES = 1000000;

 	public static void main(String[] args) {
-		FDB api = FDB.selectAPIVersion(700);
+		FDB api = FDB.selectAPIVersion(630);
 		try(Database database = api.open()) {
 			long start = System.currentTimeMillis();

--- a/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java
@ -39,7 +39,7 @@ public class SerialIteration {
 	private static final int THREAD_COUNT = 1;

 	public static void main(String[] args) throws InterruptedException {
-		FDB api = FDB.selectAPIVersion(700);
+		FDB api = FDB.selectAPIVersion(630);
 		try(Database database = api.open(args[0])) {
 			for(int i = 1; i <= THREAD_COUNT; i++) {
 				runThreadedTest(database, i);
--- a/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java
@ -30,7 +30,7 @@ public class SerialTest {
 	public static void main(String[] args) throws InterruptedException {
 		final int reps = 1000;
 		try {
-			FDB fdb = FDB.selectAPIVersion(700);
+			FDB fdb = FDB.selectAPIVersion(630);
 			try(Database db = fdb.open()) {
 				runTests(reps, db);
 			}
--- a/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java
@ -39,7 +39,7 @@ public class SnapshotTransactionTest {
 	private static final Subspace SUBSPACE = new Subspace(Tuple.from("test", "conflict_ranges"));

 	public static void main(String[] args) {
-		FDB fdb = FDB.selectAPIVersion(700);
+		FDB fdb = FDB.selectAPIVersion(630);
 		try(Database db = fdb.open()) {
 			snapshotReadShouldNotConflict(db);
 			snapshotShouldNotAddConflictRange(db);
--- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java
@ -50,7 +50,7 @@ public class TupleTest {
 	public static void main(String[] args) throws NoSuchFieldException {
 		final int reps = 1000;
 		try {
-			FDB fdb = FDB.selectAPIVersion(700);
+			FDB fdb = FDB.selectAPIVersion(630);
 			addMethods();
 			comparisons();
 			emptyTuple();
--- a/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java
@ -32,7 +32,7 @@ import com.apple.foundationdb.tuple.Versionstamp;

 public class VersionstampSmokeTest {
 	public static void main(String[] args) {
-		FDB fdb = FDB.selectAPIVersion(700);
+		FDB fdb = FDB.selectAPIVersion(630);
 		try(Database db = fdb.open()) {
 			db.run(tr -> {
 				tr.clear(Tuple.from("prefix").range());
--- a/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java
+++ b/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java
@ -34,7 +34,7 @@ import com.apple.foundationdb.Transaction;
 public class WatchTest {

 	public static void main(String[] args) {
-		FDB fdb = FDB.selectAPIVersion(700);
+		FDB fdb = FDB.selectAPIVersion(630);
 		try(Database database = fdb.open(args[0])) {
 			database.options().setLocationCacheSize(42);
 			try(Transaction tr = database.createTransaction()) {
--- a/bindings/python/fdb/init.py
+++ b/bindings/python/fdb/init.py
@ -52,7 +52,7 @@ def get_api_version():


 def api_version(ver):
-    header_version = 700
+    header_version = 630

    if '_version' in globals():
        if globals()['_version'] != ver:
--- a/bindings/python/tests/size_limit_tests.py
+++ b/bindings/python/tests/size_limit_tests.py
@ -22,7 +22,7 @@ import fdb
 import sys

 if __name__ == '__main__':
-    fdb.api_version(700)
+    fdb.api_version(630)

@fdb.transactional
 def setValue(tr, key, value):
--- a/bindings/ruby/lib/fdb.rb
+++ b/bindings/ruby/lib/fdb.rb
@ -36,7 +36,7 @@ module FDB
    end
  end
  def self.api_version(version)
-    header_version = 700
+    header_version = 630
    if self.is_api_version_selected?()
      if @@chosen_version != version
        raise "FDB API already loaded at version #{@@chosen_version}."
--- a/build/cmake/Dockerfile
+++ b/build/cmake/Dockerfile
@ -13,10 +13,10 @@ RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.13.4/cmake-3.1
    cd /tmp && tar xf cmake.tar.gz && cp -r cmake-3.13.4-Linux-x86_64/* /usr/local/

 # install boost
-RUN curl -L https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.bz2 > /tmp/boost.tar.bz2 &&\
+RUN curl -L https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_72_0.tar.bz2 > /tmp/boost.tar.bz2 &&\
    cd /tmp && echo "2684c972994ee57fc5632e03bf044746f6eb45d4920c343937a465fd67a5adba  boost.tar.bz2" > boost-sha.txt &&\
-    sha256sum -c boost-sha.txt && tar xf boost.tar.bz2 && cp -r boost_1_67_0/boost /usr/local/include/ &&\
-    rm -rf boost.tar.bz2 boost_1_67_0
+    sha256sum -c boost-sha.txt && tar xf boost.tar.bz2 && cp -r boost_1_72_0/boost /usr/local/include/ &&\
+    rm -rf boost.tar.bz2 boost_1_72_0

 # install mono (for actorcompiler)
 RUN yum install -y epel-release
--- a/build/cmake/package_tester/fdb_c_app/app.c
+++ b/build/cmake/package_tester/fdb_c_app/app.c
@ -1,7 +1,7 @@
-#define FDB_API_VERSION 700
+#define FDB_API_VERSION 630
 #include <foundationdb/fdb_c.h>

 int main(int argc, char* argv[]) {
-	fdb_select_api_version(700);
+	fdb_select_api_version(630);
 	return 0;
 }
--- a/build/cmake/package_tester/modules/tests.sh
+++ b/build/cmake/package_tester/modules/tests.sh
@ -65,7 +65,7 @@ then
       python setup.py install
       successOr "Installing python bindings failed"
       popd
-       python -c 'import fdb; fdb.api_version(700)'
+       python -c 'import fdb; fdb.api_version(630)'
       successOr "Loading python bindings failed"

       # Test cmake and pkg-config integration: https://github.com/apple/foundationdb/issues/1483
--- a/cmake/CompileBoost.cmake
+++ b/cmake/CompileBoost.cmake
@ -1,4 +1,4 @@
-find_package(Boost 1.67)
+find_package(Boost 1.72)

 if(Boost_FOUND)
  add_library(boost_target INTERFACE)
@ -6,8 +6,8 @@ if(Boost_FOUND)
 else()
  include(ExternalProject)
  ExternalProject_add(boostProject
-    URL "https://dl.bintray.com/boostorg/release/1.67.0/source/boost_1_67_0.tar.bz2"
-    URL_HASH SHA256=2684c972994ee57fc5632e03bf044746f6eb45d4920c343937a465fd67a5adba
+    URL "https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.bz2"
+    URL_HASH SHA256=59c9b274bc451cf91a9ba1dd2c7fdcaf5d60b1b3aa83f2c9fa143417cc660722
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ""
    BUILD_IN_SOURCE ON
--- a/design/tlog-spilling.md.html
+++ b/design/tlog-spilling.md.html
@ -0,0 +1,680 @@
+<meta charset="utf-8">
+
+# TLog Spill-By-Reference Design
+
+## Background
+
+(This assumes a basic familiarity with [FoundationDB's architecture](https://www.youtu.be/EMwhsGsxfPU).)
+
+Transaction logs are a distributed Write-Ahead-Log for FoundationDB.  They
+receive commits from proxies, and are responsible for durably storing those
+commits, and making them available to storage servers for reading.
+
+Clients send *mutations*, the list of their sets, clears, atomic operations,
+etc., to proxies.  Proxies collect mutations into a *batch*, which is the list
+of all changes that need to be applied to the database to bring it from version
+`N-1` to `N`.  Proxies then walk through their in-memory mapping of shard
+boundaries to associate one or more *tags*, a small integer uniquely
+identifying a destination storage server, with each mutation.  They then send a
+*commit*, the full list of `(tags, mutation)` for each mutation in a batch, to
+the transaction logs.
+
+The transaction log has two responsibilities: it must persist the commits to
+disk and notify the proxy when a commit is durably stored, and it must make the
+commit available for consumption by the storage server.  Each storage server
+*peeks* its own tag, which requests all mutations from the transaction log with
+the given tag at a given version or above.  After a storage server durably
+applies the mutations to disk, it *pops* the transaction logs with the same tag
+and its new durable version, notifying the transaction logs that they may
+discard mutations with the given tag and a lesser version.
+
+To persist commits, a transaction log appends commits to a growable on-disk
+ring buffer, called a *disk queue*, in version order.  Commit data is *pushed*
+onto the disk queue, and when all mutations in the oldest commit persisted are
+no longer needed, the disk queue is *popped* to trim its tail.
+
+To make commits available to storage servers efficiently, a transaction log
+maintains a copy of the commit in-memory, and maintains one queue per tag that
+indexes the location of each mutation in each commit with the specific tag,
+sequentially.  This way, responding to a peek from a storage server only
+requires sequentially walking through the queue, and copying each mutation
+referenced into the response buffer.
+
+Transaction logs internally handle commits via performing two operations
+concurrently.  First, they walk through each mutation in the commit, and push
+the mutation onto an in-memory queue of mutations destined for that tag.
+Second, they include the data in the next batch of pages to durably persist to
+disk.  These in-memory queues are popped from when the corresponding storage
+server has persisted the data to its own disk.  The disk queue only exists to
+allow the in-memory queues to be rebuilt if the transaction log crashes, is
+never read from except during a transaction log recovering post-crash, and is
+popped when the oldest version it contains is no longer needed in memory.
+
+TLogs will need to hold the last 5-7 seconds of mutations.  In normal
+operation, the default 1.5GB of memory is enough such that the last 5-7 seconds
+of commits should almost always fit in memory.  However, in the presence of
+failures, the transaction log can be required to buffer significantly more
+data.  Most notably, when a storage server fails, its tag isn't popped until
+data distribution is able to re-replicate all of the shards that storage server
+was responsible for to other storage servers.  Before that happens, mutations
+will accumulate on the TLog destined for the failed storage server, in case it
+comes back and is able to rejoin the cluster.
+
+When this accumulation causes the memory required to hold all the unpopped data
+to exceed `TLOG_SPILL_THRESHOLD` bytes, the transaction log offloads the
+oldest data to disk.  This writing of data to disk to reduce TLog memory
+pressure is referred to as *spilling*.
+
+**************************************************************
+*                      Transaction Log                       *
+*                                                            *
+*                                                            *
+*  +------------------+   pushes   +------------+            *
+*  | Incoming Commits |----------->| Disk Queue |  +------+  *
+*  +------------------+            +------------+  |SQLite|  *
+*          |                                ^      +------+  *
+*          |                                |        ^       *
+*          |                               pops      |       *
+*          +------+-------+------+          |       writes   *
+*                 |       |      |          |        |       *
+*                 v       v      v         +----------+      *
+*    in-memory  +---+   +---+  +---+       |Spill Loop|      *
+*    queues     | 1 |   | 2 |  | 3 |       +----------+      *
+*    per-tag    |   |   |   |  |   |            ^            *
+*               |...|   |...|  |...|            |            *
+*                 |       |      |              |            *
+*                 v       v      v              |            *
+*                 +-------+------+--------------+            *
+*                    queues spilled on overflow              *
+*                                                            *
+**************************************************************
+
+## Overview
+
+Previously, spilling would work by writing the data to a SQLite B-tree.  The
+key would be `(tag, version)`, and the value would be all the mutations
+destined for the given tag at the given version.  Peek requests have a start
+version, which is the latest version that the storage server knows about,
+and the TLog responds by range-reading the B-tree from the start version.  Pop
+requests allow the TLog to forget all mutations for a tag until a specific
+version, and the TLog thus issues a range clear from `(tag, 0)` to
+`(tag, pop_version)`.  After spilling, the durably written data in the disk
+queue would be trimmed to only include from the spilled version on, as any
+required data is now entirely, durably held in the B-tree.  As the entire value
+is copied into the B-tree, this method of spilling will be referred to as
+*spill-by-value* in the rest of this document.
+
+Unfortunately, it turned out that spilling in this fashion greatly impacts TLog
+performance.  A write bandwidth saturation test was run against a cluster, with
+a modification to the transaction logs to have them act as if there was one
+storage server that was permanently failed; it never sent pop requests to allow
+the TLog to remove data from memory.  After 15min, the write bandwidth had
+reduced to 30% of its baseline.  After 30min, that became 10%.  After 60min,
+that became 5%.  Writing entire values gives an immediate 3x additional write
+amplification, and the actual write amplification increases as the B-tree gets
+deeper.  (This is an intentional illustration of the worst case, due to the
+workload being a saturating write load.)
+
+With the recent multi-DC/multi-region work, a failure of a remote data center
+would cause transaction logs to need to buffer all commits, as every commit is
+tagged as destined for the remote datacenter.  This would rapidly push
+transaction logs into a spilling regime, and thus write bandwidth would begin
+to rapidly degrade.  It is unacceptable for a remote datacenter failure to so
+drastically affect the primary datacenter's performance in the case of a
+failure, so a more performant way of spilling data is required.
+
+Whereas spill-by-value copied the entire mutation into the B-tree and removes
+it from the disk queue, spill-by-reference leaves the mutations in the disk
+queue and writes a pointer to it into the B-tree. Performance experiments
+revealed that the TLog's performance while spilling was dictated more by the
+number of writes done to the SQLite B-tree, than by the size of those writes.
+Thus, "spill-by-reference" being able to do a significantly better batching
+with its writes to the B-tree is more important than that it writes less data
+in aggregate.  Spill-by-reference significantly reduces the volume of data
+written to the B-tree, and the less data that we write, the more we can batch
+versions to be written together.
+
+************************************************************************
+*                              DiskQueue                               *
+*                                                                      *
+*    ------- Index in B-tree -------     ---- Index in memory ----     *
+*   /                               \   /                         \    *
+* +-----------------------------------+-----------------------------+  *
+* |            Spilled Data           |       Most Recent Data      |  *
+* +-----------------------------------+-----------------------------+  *
+* lowest version                                      highest version  *
+*                                                                      *
+************************************************************************
+
+Spill-by-reference works by taking a larger range of versions, and building a
+single key-value pair per tag that describes where in the disk queue is every
+relevant commit for that tag.  Concretely, this takes the form
+`(tag, last_version) -> [(version, start, end, mutation_bytes), ...]`, where:
+
+ * `tag` is the small integer representing the storage server this mutation batch is destined for.
+ * `last_version` is the last/maximum version contained in the value's batch.
+ * `version` is the version of the commit that this index entry points to.
+ * `start` is an index into the disk queue of where to find the beginning of the commit.
+ * `end` is an index into the disk queue of where the end of the commit is.
+ * `mutation_bytes` is the number of bytes in the commit that are relevant for this tag.
+
+And then writing only once per tag spilled into the B-tree for each iteration
+through spilling.  This turns the number of writes into the B-Tree from
+`O(tags * versions)` to `O(tags)`.
+
+Note that each tuple in the list represents a commit, and not a mutation.  This
+means that peeking spilled commits will involve reading all mutations that were
+a part of the commit, and then filtering them to only the ones that have the
+tag of interest.  Alternatively, one could have each tuple represent a mutation
+within a commit, to prevent over-reading when peeking.  There exist
+pathological workloads for each strategy.  The purpose of this work is most
+importantly to support spilling of log router tags.  These exist on every
+mutation, so that it will get copied to other datacenters.  This is the exact
+pathological workload for recording each mutation individually, because it only
+increases the number of IO operations used to read the same amount of data.
+For a wider set of workloads, there's room to establish a heuristic as to when
+to record mutation(s) versus the entire commit, but performance testing hasn't
+surfaced this as important enough to include in the initial version of this
+work.
+
+Peeking spilled data now works by issuing a range read to the B-tree from
+`(tag, peek_begin)` to `(tag, infinity)`.  This is why the key contains the
+last version of the batch, rather than the beginning, so that a range read from
+the peek request's version will always return all relevant batches.  For each
+batched tuple, if the version is greater than our peek request's version, then
+we read the commit containing that mutation from disk, extract the relevant
+mutations, and append them to our response.  There is a target size of the
+response, 150KB by default.  As we iterate through the tuples, we sum
+`mutation_bytes`, which already informs us how many bytes of relevant mutations
+we'll get from a given commit.  This allows us to make sure we won't waste disk
+IOs on reads that will end up being discarded as unnecessary.
+
+Popping spilled data works similarly to before, but now requires recovering
+information from disk.  Previously, we would maintain a map from version to
+location in the disk queue for every version we hadn't yet spilled.  Once
+spilling has copied the value into the B-tree, knowing where the commit was in
+the disk queue is useless to us, and is removed.  In spill-by-reference, that
+information is still needed to know how to map "pop until version 7" to "pop
+until byte 87" in the disk queue.  Unfortunately, keeping this information in
+memory would result in TLogs slowly consuming more and more
+memory[^versionmap-memory] as more data is spilled.  Instead, we issue a range
+read of the B-tree from `(tag, pop_version)` to `(tag, infinity)` and look at
+the first commit we find with a version greater than our own.  We then use its
+starting disk queue location as the limit of what we could pop the disk queue
+until for this tag.
+
+[^versionmap-memory]: Pessimistic assumptions would suggest that a TLog spilling 1TB of data would require ~50GB of memory to hold this map, which isn't acceptable.
+
+## Detailed Implementation
+
+The rough outline of concrete changes proposed looks like:
+
+1. Allow a new TLog and old TLog to co-exist and be configurable, upgradeable, and recoverable
+1. Modify spilling in new TLogServer
+1. Modify peeking in new TLogServer
+1. Modify popping in new TLogServer
+1. Spill txsTag specially
+
+### Configuring and Upgrading
+
+Modifying how transaction logs spill data is a change to the on-disk files of
+transaction logs.  The work for enabling safe upgrades and rollbacks of
+persistent state changes to transaction logs was split off into a separate
+design document: "Forward Compatibility for Transaction Logs".
+
+That document describes a `log_version` configuration setting that controls the
+availability of new transaction log features.  A similar configuration setting,
+`log_spill`, was created: at `log_version>=3`, one may run `fdbcli>
+configure log_spill:=2` to enable spill-by-reference.  Only FDB 6.1 or newer
+will be able to recover transaction log files that were using
+spill-by-reference.  FDB 6.2 will use spill-by-reference by default.
+
+| FDB Version | Default | Configurable |
+|-------------|---------|--------------|
+| 6.0         | No      | No           |
+| 6.1         | No      | Yes          |
+| 6.2         | Yes     | Yes          |
+
+If running FDB 6.1, the full command to enable spill-by-reference is
+`fdbcli> configure log_version:=3 log_spill:=2`.
+
+The TLog implementing spill-by-value was moved to `OldTLogServer_6_0.actor.cpp`
+and namespaced similarly.  `tLogFnForOptions` takes a `TLogOptions`, which is
+the version and spillType, and returns the correct TLog implementation
+according to those settings.  We maintain a map of
+`(TLogVersion, StoreType, TLogSpillType)` to TLog instance, so that only
+one SharedTLog exists per configuration variant.
+
+### Generations
+
+As a background, each time FoundationDB goes through a recovery, it will
+recruit a new generation of transaction logs.  This new generation of
+transaction logs will often be recruited on the same worker that hosted the
+previous generation's transaction log.  The old generation of transaction logs
+will only shut down once all the data that they have has been fully popped.
+This means that there can be multiple instances of a transaction log in the
+same process.
+
+Naively, this would create resource issues.  Each instance would think that it
+is allowed its own 1.5GB buffer of in-memory mutations.  Instead, internally to
+the TLog implementation, the transaction log is split into two parts.  A
+`SharedTLog` is all the data that should be shared across multiple generations.
+A TLog is all the data that is private to one generation.  Most notably, the
+1.5GB mutation buffer and the on-disk files are owned by the `SharedTLog`.  The
+index for the data added to that buffer is maintained within each TLog.  In the
+code, a SharedTLog is `struct TLogData`, and a TLog is `struct LogData`.
+(I didn't choose these names.)
+
+This background is required, because one needs to keep in mind that we might be
+committing in one TLog instance, a different one might be spilling, and yet
+another might be the one popping data.
+
+*********************************************************
+*                   SharedTLog                          *
+*                                                       *
+* +--------+--------+--------+--------+--------+        *
+* | TLog 1 | TLog 2 | TLog 3 | TLog 4 | TLog 5 |        *
+* +--------+--------+--------+--------+--------+        *
+*   ^ popping         ^spilling         ^committing     * 
+*********************************************************
+
+Conceptually, this is because each TLog owns a separate part of the same Disk
+Queue file.  The earliest TLog instance needs to be the one that controls when
+the earliest part of the file can be discarded.  We spill in version order, and
+thus whatever TLog is responsible for the earliest unspilled version needs to
+be the one doing the spilling.  We always commit the newest version, so the
+newest TLog must be the one writing to the disk queue and inserting new data
+into the buffer of mutations.
+
+
+### Spilling
+
+`updatePersistentData()` is the core of the spilling loop, that takes a new
+persistent data version, writes the in-memory index for all commits less than
+that version to disk, and then removes them from memory.  By contract, once
+spilling commits an updated persistentDataVersion to the B-tree, then those
+bytes will not need to be recovered into memory after a crash, nor will the
+in-memory bytes be needed to serve a peek response.
+
+Our new method of spilling iterates through each tag, and builds up a
+`vector<SpilledData>` for each tag, where `SpilledData` is:
+
+``` CPP
+struct SpilledData {
+  Version version;
+  IDiskQueue::location start;
+  uint32_t length;
+  uint32_t mutationBytes;
+};
+```
+
+And then this vector is serialized, and written to the B-tree as
+`(logId, tag, max(SpilledData.version))` = `serialized(vector<SpilledData>)`
+
+As we iterate through each commit, we record the number of mutation bytes in
+this commit that have our tag of interest.  This is so that later, peeking can
+read exactly the number of commits that it needs from disk.
+
+Although the focus of this project is on the topic of spilling, the code
+implementing spilling itself saw the least amount of total change.
+
+### Peeking
+
+A `TLogPeekRequest` contains a `Tag` and a `Version`, and is a request for all
+commits with the specified tag with a commit version greater than or equal to
+the given version.  The goal is to return a 150KB block of mutations.
+
+When servicing a peek request, we will read up to 150KB of mutations from the
+in-memory index.  If the peek version is lower than the version that we've
+spilled to disk, then we consult the on-disk index for up to 150KB of
+mutations.  (If we tried to read from disk first, and then read from memory, we
+would then be racing with the spilling loop moving data from memory to disk.)
+
+**************************************************************************
+*                                                                        *
+* +---------+  Tag    +---------+           Tag    +--------+            *
+* |  Peek   |-------->| Spilled | ...------------->| Memory |            *
+* | Request | Version |  Index  |          Version | Index  |            *
+* +---------+         +---------+                  +--------+            *
+*                        |                                 |             *
+*      +-----------------+-----------------+               |             *
+*     / \  Start=100   _/ \_  Start=500    +  Start=900    +  Ptr=0xF00  *
+*    /   \ Length=50  /     \ Length=70   / \ Length=30   / \ Length=30  *
+*  +------------------------------------------------+------------------+ *
+*  |                   Disk Queue                   |  Also In Memory  | *
+*  +------------------------------------------------+------------------+ *
+*                                                                        *
+**************************************************************************
+
+Spill-by-value and memory storage engine only ever read from the DiskQueue when
+recovering, and read the entire file linearly.  Therefore, `IDiskQueue` had no
+API for random reads to the DiskQueue.  That ability is now required for
+peeking, and thus, `IDiskQueue`'s API has been enhanced correspondingly:
+
+``` CPP
+enum class CheckHashes { NO, YES };
+
+class IDiskQueue {
+    // ...
+    Future<Standalone<StringRef>> read(location start, location end, CheckHashes ch);
+    // ...
+};
+```
+
+Internally, the DiskQueue adds page headers every 4K, which are stripped out
+from the returned data.  Therefore, the length of the result will not be the
+same as `end-start`, intentionally.  For this reason, the API is `(start, end)`
+and not `(start, length)`.
+
+Spilled data, when using spill-by-value, was resistant to bitrot via data being
+checksummed internally within SQLite's B-tree.  Now that reads can be done
+directly, the responsibility for verifying data integrity falls upon the
+DiskQueue.  `CheckHashes::YES` will cause the DiskQueue to use the checksum in
+each DiskQueue page to verify data integrity.  If an externally maintained
+checksum exists to verify the returned data, then `CheckHashes::NO` can be
+used to elide the checksumming.  A page failing its checksum will cause the
+transaction log to die with an `io_error()`.
+
+What is read from disk is a `TLogQueueEntry`:
+
+``` CPP
+struct TLogQueueEntryRef {
+    UID id;
+    Version version;
+    Version knownCommittedVersion;
+    StringRef messages;
+}
+```
+
+Which provides the commit version and the logId of the TLog generation that
+produced this commit, in addition to all of the mutations for that version.
+(`knownCommittedVersion` is only used during FDB's recovery process.)
+
+### Popping
+
+As storage servers persist data, they send `pop(tag, version)` requests to the
+transaction log to notify it that it is allowed to discard data for `tag` up
+through `version`.  Once all the tags have been popped from the oldest commit
+in the DiskQueue, the tail of the DiskQueue can be discarded to reclaim space.
+
+If our popped version is in the range of what has been spilled, then we need to
+consult our on-disk index to see what is the next location in the disk queue
+that has data which is useful to us.  This act would race with the spilling
+loop changing what data is spilled, and thus disk queue popping
+(`popDiskQueue()`) was made to run serially after spilling completes.
+
+Also due to spilling and popping largely overlapping in state, the disk queue
+popping loop does not immediately react to a pop request from a storage server
+changing the popped version for a tag.  Spilling saves the popped version for
+each tag when the spill loop runs, and if that version changed, then
+`popDiskQueue()` refreshes its knowledge of what the minimum location in the
+disk queue is required for that tag.  We can pop the disk queue to the minimum
+of all minimum tag locations, or to the minimum location needed for an
+in-memory mutation if there is no spilled data.
+
+As a post implementation note, this ended up being a "here be dragons"
+experience, with a surprising number of edge cases in races between
+spilling/popping, various situations of having/not having/having inaccurate
+data for tags, or that tags can stop being pushed to when storage servers are
+removed but their corresponding `TagData` is never removed.
+
+### Transaction State Store
+
+For FDB to perform a recovery, there is information that it needs to know about
+the database, such as the configuration, worker exclusions, backup status, etc.
+These values are stored into the database in the `\xff` system keyspace.
+However, during a recovery, FDB can't read this data from the storage servers,
+because recovery hasn't completed, so it doesn't know who the storage servers
+are yet.  Thus, a copy of this data is held in-memory on every proxy in the
+*transaction state store*, and durably persisted as a part of commits on the
+transaction logs.  Being durably stored on the transaction logs means the list
+of transaction logs can be fetched from the coordinators, and then used to load
+the rest of the information about the database.
+
+The in-memory storage engine writes an equal amount of mutations and snapshot
+data to a queue, and when a full snapshot of the data has been written, deletes
+the preceding snapshot and begins writing a new one.  When backing an
+in-memory storage engine with the transaction logs, the
+`LogSystemDiskQueueAdapter` implements writing to a queue as committing
+mutations to the transaction logs with a special tag of `txsTag`, and deleting
+the preceding snapshot as popping the transaction logs for the tag of `txsTag`
+until the version where the last full snapshot began.
+
+This means that unlike every other commit that is tagged and stored on the
+transaction logs, `txsTag` signifies data that is:
+
+1. Committed to infrequently
+2. Only peeked on recovery
+3. Popped infrequently, and a large portion of the data is popped at once
+4. A small total volume of data
+
+The most problematic of these is the infrequent popping.  Unpopped data will be
+spilled after some time, and if `txsTag` data is spilled and not popped, it
+will prevent the DiskQueue from being popped as well.  This will cause the
+DiskQueue to grow continuously.  The infrequent commits and small data volume
+means that the benefits of spill-by-reference over spill-by-value don't apply
+for this tag.
+
+Thus, even when configured to spill-by-reference, `txsTag` is spilled by value.
+
+### Disk Queue Recovery
+
+If a transaction log dies and restarts, all commits that were in memory at the
+time of the crash must be loaded back into memory.  Recovery is blocked on this
+process, as there might have been a commit to the transaction state store
+immediately before crashing, and that data needs to be fully readable during a
+recovery.
+
+In spill-by-value, the DiskQueue only ever contained commits that were also
+held in memory, and thus recovery would need to read up to 1.5GB of data.  With
+spill-by-reference, the DiskQueue could theoretically contain terabytes of
+data. To keep recovery times boundedly low, FDB must still only read the
+commits that need to be loaded back into memory.
+
+This is done by persisting the location in the DiskQueue of the last spilled
+commit to the SQLite B-Tree.  This is done in the same transaction as the
+spilling of that commit.  This provides an always accurate pointer to where
+data that needs to be loaded into memory begins.  The pointer is to the
+beginning of the last commit rather than the end, to make sure that the pointer
+is always contained within the DiskQueue.  This provides extra sanity checking
+on the validity of the DiskQueue's contents at recovery, at the cost of
+potentially reading 10MB more than what would be required.
+
+## Testing
+
+Correctness bugs in spilling would manifest as data corruption, which is well covered by simulation.
+The only special testing code added was to enable changing `log_spill` in `ConfigureTest`.
+This covers switching between spilling methods in the presence of faults.
+
+An `ASSERT` was added to simulation that verifies that commits read from the
+DiskQueue on recovery are only the commits which have not been spilled.
+
+The rest of the testing is to take a physical cluster and try the extremes that
+can only happen at scale:
+
+* Verify that recovery times are not impacted when a large amount of data is spilled
+* Verify that long running tests hit a steady state of memory usage (and thus there are likely no leaks).
+* Plot how quickly (MB/s) a remote datacenter can catch up in old vs new spilling strategy
+* See what happens when there's 1 tlog and more than 100 storage servers.
+  * Verify that peek requests get limited
+  * See if tlog commits can get starved by excessive peeking
+
+# TLog Spill-By-Reference Operational Guide
+
+## Notable Behavior Changes
+
+TL;DR: Spilling involves less IOPS and is faster.  Peeking involves more IOPS and is slower.  Popping involves >0 IOPS.
+
+### Spilling
+
+The most notable effect of the spilling changes is that the Disk Queue files
+will now grow to potentially terabytes in size.
+
+ 1. Spilling will occur in larger batches, which will result in a more
+sawtooth-like `BytesInput - BytesDurable` value.  I'm not aware that this will have any meaningful impact.
+
+ * Disk queue files will grow when spilling is happening
+   * Alerting based on DQ file size is no longer appropriate
+
+As a curious aside, throughput decreases as spilled volume increases, which
+quite possibly worked as accidental backpressure.  As a feature, this no longer
+exists, but means write-heavy workloads can drown storage servers faster than
+before.
+
+### Peeking
+
+Peeking has seen tremendous changes.  It involves more IO operations and memory usage.
+
+The expected implications of this are:
+
+1.  A peek of spilled data will involve a burst of IO operations.
+
+		Theoretically, this burst can drown out queued write operations to disk,
+		thus slowing down TLog commits.  This hasn't been observed in testing.
+
+		Low IOPS devices, such as HDD or network attached storage, would struggle
+		more here than locally attached SSD.
+
+2.  Generating a peek response of 150KB could require reading 100MB of data, and allocating buffers to hold that 100MB.
+
+		OOMs were observed in early testing.  Code has been added to specifically
+		limit how much memory can be allocated for serving a single peek request
+		and all concurrent peek requests, with knobs to allow tuning this per
+		deployment configuration.
+
+### Popping
+
+Popping will transition from being an only in-memory operation to one that
+can involve reads from disk if the popped tag has spilled data.
+
+Due to a strange quirk, TLogs will allocate up to 2GB of memory as a read cache
+for SQLite's B-tree.  The expected maximum size of the B-tree has drastically
+reduced, so these reads should almost never actually hit disk.  The number of
+writes to disk will stay the same, so performance should stay unchanged.
+
+### Disk Queues
+
+This work should have a minimal impact on recovery times, which is why recovery
+hasn't been significantly mentioned in this document. However, there are two
+minor impacts on recovery times:
+
+1.  Larger disk queue file means more file to zero out in the case of recovery.
+
+    This should be negligible when fallocate `ZERO_RANGE` is available, because then it's only a metadata operation.
+
+2.  A larger file means more bisection iterations to find the first page.
+
+		If we say Disk Queue files are typically ~4GB now, and people are unlikely
+		to have more than 4TB drives, then this means in the worst case, another 8
+		sequential IOs will need to be done when first recovering a disk queue file
+		to find the most recent page with a binary search.
+
+		If this turns out to be an issue, it's trivial to address.  There's no
+		reason to do only a binary search when drives support parallel requests.  A
+		32-way search could reasonably be done, and would make a 4TB Disk
+		Queue file faster to recover than a 4GB one currently.
+
+3.  Disk queue files can now shrink.
+
+    The particular logic currently used is that:
+
+		If one file is significantly larger than the other file, then it will be
+		truncated to the size of the other file.  This resolves situations where a
+		particular storage server or remote DC being down causes one DiskQueue file
+		to be grown to a massive size, and then the data is rapidly popped.
+
+		Otherwise, if the files are of reasonably similar size, then we'll take
+		`pushLocation - popLocation` as the number of "active" bytes, and then
+		shrink the file by `TLOG_DISK_QUEUE_SHRINK_BYTES` bytes if the file is
+		larger than `active + TLOG_DISK_QUEUE_EXTENSION_BYTES + TLOG_DISK_QUEUE_SHRINK_BYTES`.
+
+## Knobs
+
+`REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT`
+: How many bytes of mutations should be spilled at once in a spill-by-reference TLog.<br>
+  Increasing it could increase throughput in spilling regimes.<br>
+  Decreasing it will decrease how sawtooth-like TLog memory usage is.<br>
+
+`UPDATE_STORAGE_BYTE_LIMIT`
+: How many bytes of mutations should be spilled at once in a spill-by-value TLog.<br>
+  This knob is pre-existing, and has only been "changed" to only apply to spill-by-value.<br>
+
+`TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK`
+: How many batches of spilled data indexes should be read from disk to serve one peek request.<br>
+  Increasing it will potentially increase the throughput of peek requests.<br>
+	Decreasing it will decrease the number of read IOs done per peek request.<br>
+
+`TLOG_SPILL_REFERENCE_MAX_BYTES_PER_BATCH`
+: How many bytes a batch of spilled data indexes can be.<br>
+  Increasing it will increase TLog throughput while spilling.<br>
+	Decreasing it will decrease the latency and increase the throughput of peek requests.<br>
+
+`TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES`
+: How many bytes of memory can be allocated to hold the results of reads from disk to respond to peek requests.<br>
+  Increasing it will increase the number of parallel peek requests a TLog can handle at once.<br>
+	Decreasing it will reduce TLog memory usage.<br>
+	If increased, `--max_memory` should be increased by the same amount.<br>
+
+`TLOG_DISK_QUEUE_EXTENSION_BYTES`
+: When a DiskQueue needs to extend a file, by how many bytes should it extend the file.<br>
+  Increasing it will reduce metadata operations done to the drive, and likely tail commit latency.<br>
+	Decreasing it will reduce allocated but unused space in the DiskQueue files.<br>
+  Note that this was previously hardcoded to 20MB, and is only being promoted to a knob.<br>
+
+`TLOG_DISK_QUEUE_SHRINK_BYTES`
+: If a DiskQueue file has extra space left when switching to the other file, by how many bytes should it be shrunk.<br>
+  Increasing this will cause disk space to be returned to the OS faster.<br>
+  Decreasing this will decrease TLog tail latency due to filesystem metadata updates.<br>
+
+## Observability
+
+With the new changes, we must ensure that sufficient information has been exposed such that:
+
+1. If something goes wrong in production, we can understand what and why from trace logs.
+2. We can understand if the TLog is performing suboptimally, and if so, which knob we should change and by how much.
+
+The following metrics were added to `TLogMetrics`:
+
+### Spilling
+
+### Peeking
+
+`PeekMemoryRequestsStalled`
+: The number of peek requests that are blocked on acquiring memory for reads.
+
+`PeekMemoryReserved`
+: The amount of memory currently reserved for serving peek requests.
+
+### Popping
+
+`QueuePoppedVersion`
+: The oldest version that's still useful.
+
+`MinPoppedTagLocality`
+: The locality of the tag that's preventing the DiskQueue from being further popped.
+
+`MinPoppedTagId`
+: The id of the tag that's preventing the DiskQueue from being further popped.
+
+## Monitoring and Alerting
+
+To answer questions like:
+
+1. What new graphs should exist?
+2. What old graphs might exist that would no longer be meaningful?
+3. What alerts might exist that need to be changed?
+4. What alerts should be created?
+
+The ones I'm aware of are:
+
+* Any current alerts on "Disk Queue files more than [constant size] GB" will need to be removed.
+* Any alerting or monitoring of `log*.sqlite` as an indication of spilling will no longer be effective.
+
+* A graph of `BytesInput - BytesPopped` will give an idea of the number of "active" bytes in the DiskQueue file.
+
+<!-- Force long-style table of contents -->
+<script>window.markdeepOptions={}; window.markdeepOptions.tocStyle="long";</script>
+<!-- When printed, top level section headers should force page breaks -->
+<style>.md h1, .md .nonumberh1 {page-break-before:always}</style>
+<!-- Markdeep: -->
+<style class="fallback">body{visibility:hidden;white-space:pre;font-family:monospace}</style><script src="markdeep.min.js" charset="utf-8"></script><script src="https://casual-effects.com/markdeep/latest/markdeep.min.js" charset="utf-8"></script><script>window.alreadyProcessedMarkdeep||(document.body.style.visibility="visible")</script>
--- a/documentation/sphinx/source/api-c.rst
+++ b/documentation/sphinx/source/api-c.rst
@ -133,7 +133,7 @@ API versioning

 Prior to including ``fdb_c.h``, you must define the ``FDB_API_VERSION`` macro. This, together with the :func:`fdb_select_api_version()` function, allows programs written against an older version of the API to compile and run with newer versions of the C library. The current version of the FoundationDB C API is |api-version|. ::

-  #define FDB_API_VERSION 700
+  #define FDB_API_VERSION 630
  #include <foundationdb/fdb_c.h>

 .. function:: fdb_error_t fdb_select_api_version(int version)
--- a/documentation/sphinx/source/api-common.rst.inc
+++ b/documentation/sphinx/source/api-common.rst.inc
@ -150,7 +150,7 @@
 .. |atomic-versionstamps-tuple-warning-value| replace::
    At this time, versionstamped values are not compatible with the Tuple layer except in Java, Python, and Go. Note that this implies versionstamped values may not be used with the Subspace and Directory layers except in those languages.

-.. |api-version| replace:: 700
+.. |api-version| replace:: 630

 .. |streaming-mode-blurb1| replace::
    When using |get-range-func| and similar interfaces, API clients can request large ranges of the database to iterate over.  Making such a request doesn't necessarily mean that the client will consume all of the data in the range - sometimes the client doesn't know how far it intends to iterate in advance.  FoundationDB tries to balance latency and bandwidth by requesting data for iteration in batches.
--- a/documentation/sphinx/source/api-python.rst
+++ b/documentation/sphinx/source/api-python.rst
@ -108,7 +108,7 @@ Opening a database
 After importing the ``fdb`` module and selecting an API version, you probably want to open a :class:`Database` using :func:`open`::

    import fdb
-    fdb.api_version(700)
+    fdb.api_version(630)
    db = fdb.open()

 .. function:: open( cluster_file=None, event_model=None )
--- a/documentation/sphinx/source/api-ruby.rst
+++ b/documentation/sphinx/source/api-ruby.rst
@ -93,7 +93,7 @@ Opening a database
 After requiring the ``FDB`` gem and selecting an API version, you probably want to open a :class:`Database` using :func:`open`::

    require 'fdb'
-    FDB.api_version 700
+    FDB.api_version 630
    db = FDB.open

 .. function:: open( cluster_file=nil ) -> Database
--- a/documentation/sphinx/source/api-version-upgrade-guide.rst
+++ b/documentation/sphinx/source/api-version-upgrade-guide.rst
@ -9,9 +9,9 @@ This document provides an overview of changes that an application developer may

 For more details about API versions, see :ref:`api-versions`.

-.. _api-version-upgrade-guide-700:
+.. _api-version-upgrade-guide-630:

-API version 700
+API version 630
 ===============

 C bindings
--- a/documentation/sphinx/source/class-scheduling-go.rst
+++ b/documentation/sphinx/source/class-scheduling-go.rst
@ -29,7 +29,7 @@ Before using the API, we need to specify the API version. This allows programs t

 .. code-block:: go

-  fdb.MustAPIVersion(700)
+  fdb.MustAPIVersion(630)

 Next, we open a FoundationDB database.  The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`.

@ -78,7 +78,7 @@ If this is all working, it looks like we are ready to start building a real appl

  func main() {
      // Different API versions may expose different runtime behaviors.
-      fdb.MustAPIVersion(700)
+      fdb.MustAPIVersion(630)

      // Open the default database from the system cluster
      db := fdb.MustOpenDefault()
@ -666,7 +666,7 @@ Here's the code for the scheduling tutorial:
  }

  func main() {
-    fdb.MustAPIVersion(700)
+    fdb.MustAPIVersion(630)
    db := fdb.MustOpenDefault()
    db.Options().SetTransactionTimeout(60000)  // 60,000 ms = 1 minute
    db.Options().SetTransactionRetryLimit(100)
--- a/documentation/sphinx/source/class-scheduling-java.rst
+++ b/documentation/sphinx/source/class-scheduling-java.rst
@ -30,7 +30,7 @@ Before using the API, we need to specify the API version. This allows programs t
  private static final Database db;

  static {
-    fdb = FDB.selectAPIVersion(700);
+    fdb = FDB.selectAPIVersion(630);
    db = fdb.open();
  }

@ -66,7 +66,7 @@ If this is all working, it looks like we are ready to start building a real appl
    private static final Database db;

    static {
-      fdb = FDB.selectAPIVersion(700);
+      fdb = FDB.selectAPIVersion(630);
      db = fdb.open();
    }

@ -441,7 +441,7 @@ Here's the code for the scheduling tutorial:
    private static final Database db;

    static {
-      fdb = FDB.selectAPIVersion(700);
+      fdb = FDB.selectAPIVersion(630);
      db = fdb.open();
      db.options().setTransactionTimeout(60000);  // 60,000 ms = 1 minute
      db.options().setTransactionRetryLimit(100);
--- a/documentation/sphinx/source/class-scheduling-ruby.rst
+++ b/documentation/sphinx/source/class-scheduling-ruby.rst
@ -23,7 +23,7 @@ Open a Ruby interactive interpreter and import the FoundationDB API module::

 Before using the API, we need to specify the API version. This allows programs to maintain compatibility even if the API is modified in future versions::

-    > FDB.api_version 700
+    > FDB.api_version 630
    => nil

 Next, we open a FoundationDB database.  The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`. ::
@ -46,7 +46,7 @@ If this is all working, it looks like we are ready to start building a real appl
 .. code-block:: ruby

    require 'fdb'
-    FDB.api_version 700
+    FDB.api_version 630
    @db = FDB.open
    @db['hello'] = 'world'
    print 'hello ', @db['hello']
@ -373,7 +373,7 @@ Here's the code for the scheduling tutorial:

    require 'fdb'

-    FDB.api_version 700
+    FDB.api_version 630

    ####################################
    ##        Initialization          ##
--- a/documentation/sphinx/source/class-scheduling.rst
+++ b/documentation/sphinx/source/class-scheduling.rst
@ -30,7 +30,7 @@ Open a Python interactive interpreter and import the FoundationDB API module::

 Before using the API, we need to specify the API version. This allows programs to maintain compatibility even if the API is modified in future versions::

-    >>> fdb.api_version(700)
+    >>> fdb.api_version(630)

 Next, we open a FoundationDB database.  The API will connect to the FoundationDB cluster indicated by the :ref:`default cluster file <default-cluster-file>`. ::

@ -48,7 +48,7 @@ When this command returns without exception, the modification is durably stored
 If this is all working, it looks like we are ready to start building a real application. For reference, here's the full code for "hello world"::

    import fdb
-    fdb.api_version(700)
+    fdb.api_version(630)
    db = fdb.open()
    db[b'hello'] = b'world'
    print 'hello', db[b'hello']
@ -91,7 +91,7 @@ FoundationDB includes a few tools that make it easy to model data using this app
 opening a :ref:`directory <developer-guide-directories>` in the database::

    import fdb
-    fdb.api_version(700)
+    fdb.api_version(630)

    db = fdb.open()
    scheduling = fdb.directory.create_or_open(db, ('scheduling',))
@ -337,7 +337,7 @@ Here's the code for the scheduling tutorial::
    import fdb
    import fdb.tuple

-    fdb.api_version(700)
+    fdb.api_version(630)


    ####################################
--- a/documentation/sphinx/source/hierarchical-documents-java.rst
+++ b/documentation/sphinx/source/hierarchical-documents-java.rst
@ -69,7 +69,7 @@ Here’s a basic implementation of the recipe.
        private static final long EMPTY_ARRAY = -1;

        static {
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            docSpace = new Subspace(Tuple.from("D"));
        }
--- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc
+++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc
@ -367,7 +367,9 @@
                  "layer_status_incomplete",
                  "database_availability_timeout",
                  "consistencycheck_suspendkey_fetch_timeout",
-                  "consistencycheck_disabled"
+                  "consistencycheck_disabled",
+                  "primary_dc_missing",
+                  "fetch_primary_dc_timeout"
               ]
            },
            "issues":[
--- a/documentation/sphinx/source/mr-status.rst
+++ b/documentation/sphinx/source/mr-status.rst
@ -88,6 +88,8 @@ cluster.messages                      unreachable_ratekeeper_worker         Unab
 cluster.messages                      unreachable_processes                 The cluster has some unreachable processes.
 cluster.messages                      unreadable_configuration              Unable to read database configuration.
 cluster.messages                      layer_status_incomplete               Some or all of the layers subdocument could not be read.
+cluster.messages                      primary_dc_missing                    Unable to determine primary datacenter.
+cluster.messages                      fetch_primary_dc_timeout              Fetching primary DC timed out.
 cluster.processes.<process>.messages  file_open_error                       Unable to open ‘<file>’ (<os_error>).
 cluster.processes.<process>.messages  incorrect_cluster_file_contents       Cluster file contents do not match current cluster connection string. Verify cluster file is writable and has not been overwritten externally.
 cluster.processes.<process>.messages  io_error                              <error> occured in <subsystem>
--- a/documentation/sphinx/source/multimaps-java.rst
+++ b/documentation/sphinx/source/multimaps-java.rst
@ -74,7 +74,7 @@ Here’s a simple implementation of multimaps with multisets as described:
        private static final int N = 100;

        static {
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            multi = new Subspace(Tuple.from("M"));
        }
--- a/documentation/sphinx/source/old-release-notes/release-notes-620.rst
+++ b/documentation/sphinx/source/old-release-notes/release-notes-620.rst
@ -261,7 +261,7 @@ Bindings
 * Go: The Go bindings now require Go version 1.11 or later.
 * Go: Finalizers could run too early leading to undefined behavior. `(PR #1451) <https://github.com/apple/foundationdb/pull/1451>`_.
 * Added a transaction option to control the field length of keys and values in debug transaction logging in order to avoid truncation. `(PR #1844) <https://github.com/apple/foundationdb/pull/1844>`_.
-* Added a transaction option to control the whether ``get_addresses_for_key`` includes a port in the address. This will be deprecated in api version 700, and addresses will include ports by default. [6.2.4] `(PR #2060) <https://github.com/apple/foundationdb/pull/2060>`_.
+* Added a transaction option to control whether ``get_addresses_for_key`` includes a port in the address. This will be deprecated in api version 630, and addresses will include ports by default. [6.2.4] `(PR #2060) <https://github.com/apple/foundationdb/pull/2060>`_.
 * Python: ``Versionstamp`` comparisons didn't work in Python 3. [6.2.4] `(PR #2089) <https://github.com/apple/foundationdb/pull/2089>`_.

 Features
--- a/documentation/sphinx/source/priority-queues-java.rst
+++ b/documentation/sphinx/source/priority-queues-java.rst
@ -74,7 +74,7 @@ Here's a basic implementation of the model:
        private static final Random randno;

        static{
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            pq = new Subspace(Tuple.from("P"));

--- a/documentation/sphinx/source/queues-java.rst
+++ b/documentation/sphinx/source/queues-java.rst
@ -73,7 +73,7 @@ The following is a simple implementation of the basic pattern:
        private static final Random randno;

        static{
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            queue = new Subspace(Tuple.from("Q"));
            randno = new Random();
--- a/documentation/sphinx/source/release-notes.rst
+++ b/documentation/sphinx/source/release-notes.rst
@ -17,7 +17,7 @@ Status

 Bindings
 --------
-* API version updated to 700. See the :ref:`API version upgrade guide <api-version-upgrade-guide-700>` for upgrade details.
+* API version updated to 630. See the :ref:`API version upgrade guide <api-version-upgrade-guide-630>` for upgrade details.
 * Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) <https://github.com/apple/foundationdb/pull/2458>`_
 * C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) <https://github.com/apple/foundationdb/pull/2622>`_

--- a/documentation/sphinx/source/simple-indexes-java.rst
+++ b/documentation/sphinx/source/simple-indexes-java.rst
@ -87,7 +87,7 @@ In this example, we’re storing user data based on user ID but sometimes need t
        private static final Subspace index;

        static {
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            main = new Subspace(Tuple.from("user"));
            index = new Subspace(Tuple.from("zipcode_index"));
--- a/documentation/sphinx/source/tables-java.rst
+++ b/documentation/sphinx/source/tables-java.rst
@ -62,7 +62,7 @@ Here’s a simple implementation of the basic table pattern:
        private static final Subspace colIndex;

        static {
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            table = new Subspace(Tuple.from("T"));
            rowIndex = table.subspace(Tuple.from("R"));
--- a/documentation/sphinx/source/vector-java.rst
+++ b/documentation/sphinx/source/vector-java.rst
@ -77,7 +77,7 @@ Here’s the basic pattern:
        private static final Subspace vector;

        static {
-            fdb = FDB.selectAPIVersion(700);
+            fdb = FDB.selectAPIVersion(630);
            db = fdb.open();
            vector = new Subspace(Tuple.from("V"));
        }
--- a/fdbbackup/backup.actor.cpp
+++ b/fdbbackup/backup.actor.cpp
@ -103,6 +103,7 @@ enum {
 	OPT_EXPIRE_RESTORABLE_AFTER_VERSION, OPT_EXPIRE_RESTORABLE_AFTER_DATETIME, OPT_EXPIRE_MIN_RESTORABLE_DAYS,
 	OPT_BASEURL, OPT_BLOB_CREDENTIALS, OPT_DESCRIBE_DEEP, OPT_DESCRIBE_TIMESTAMPS,
 	OPT_DUMP_BEGIN, OPT_DUMP_END, OPT_JSON, OPT_DELETE_DATA, OPT_MIN_CLEANUP_SECONDS,
+	OPT_USE_PARTITIONED_LOG,

 	// Backup and Restore constants
 	OPT_TAGNAME, OPT_BACKUPKEYS, OPT_WAITFORDONE,
@ -169,6 +170,8 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = {
 	{ OPT_NOSTOPWHENDONE,   "--no-stop-when-done",SO_NONE },
 	{ OPT_DESTCONTAINER,    "-d",               SO_REQ_SEP },
 	{ OPT_DESTCONTAINER,    "--destcontainer",  SO_REQ_SEP },
+	{ OPT_USE_PARTITIONED_LOG, "-p",                 SO_NONE },
+	{ OPT_USE_PARTITIONED_LOG, "--partitioned_log",  SO_NONE },
 	{ OPT_SNAPSHOTINTERVAL, "-s",                   SO_REQ_SEP },
 	{ OPT_SNAPSHOTINTERVAL, "--snapshot_interval",  SO_REQ_SEP },
 	{ OPT_TAGNAME,         "-t",               SO_REQ_SEP },
@ -953,6 +956,7 @@ static void printBackupUsage(bool devhelp) {
 	printf("  -e ERRORLIMIT  The maximum number of errors printed by status (default is 10).\n");
 	printf("  -k KEYS        List of key ranges to backup.\n"
 		   "                 If not specified, the entire database will be backed up.\n");
+	printf("  -p, --partitioned_log  Starts with new type of backup system using partitioned logs.\n");
 	printf("  -n, --dryrun   For backup start or restore start, performs a trial run with no actual changes made.\n");
 	printf("  --log          Enables trace file logging for the CLI session.\n"
 		   "  --logdir PATH  Specifes the output directory for trace files. If\n"
@ -1744,9 +1748,10 @@ ACTOR Future<Void> submitDBBackup(Database src, Database dest, Standalone<Vector
 	return Void();
 }

-ACTOR Future<Void> submitBackup(Database db, std::string url, int snapshotIntervalSeconds, Standalone<VectorRef<KeyRangeRef>> backupRanges, std::string tagName, bool dryRun, bool waitForCompletion, bool stopWhenDone) {
-	try
-	{
+ACTOR Future<Void> submitBackup(Database db, std::string url, int snapshotIntervalSeconds,
+                                Standalone<VectorRef<KeyRangeRef>> backupRanges, std::string tagName, bool dryRun,
+                                bool waitForCompletion, bool stopWhenDone, bool usePartitionedLog) {
+	try {
 		state FileBackupAgent backupAgent;

 		// Backup everything, if no ranges were specified
@ -1789,7 +1794,8 @@ ACTOR Future<Void> submitBackup(Database db, std::string url, int snapshotInterv
 		}

 		else {
-			wait(backupAgent.submitBackup(db, KeyRef(url), snapshotIntervalSeconds, tagName, backupRanges, stopWhenDone));
+			wait(backupAgent.submitBackup(db, KeyRef(url), snapshotIntervalSeconds, tagName, backupRanges, stopWhenDone,
+			                              usePartitionedLog));

 			// Wait for the backup to complete, if requested
 			if (waitForCompletion) {
@ -1811,8 +1817,7 @@ ACTOR Future<Void> submitBackup(Database db, std::string url, int snapshotInterv
 				}
 			}
 		}
-	}
-	catch (Error& e) {
+	} catch (Error& e) {
 		if(e.code() == error_code_actor_cancelled)
 			throw;
 		switch (e.code())
@ -2046,8 +2051,8 @@ ACTOR Future<Void> discontinueBackup(Database db, std::string tagName, bool wait

 ACTOR Future<Void> changeBackupResumed(Database db, bool pause) {
 	try {
-		state FileBackupAgent backupAgent;
-		wait(backupAgent.taskBucket->changePause(db, pause));
+		FileBackupAgent backupAgent;
+		wait(backupAgent.changePause(db, pause));
 		printf("All backup agents have been %s.\n", pause ? "paused" : "resumed");
 	}
 	catch (Error& e) {
@ -2908,6 +2913,7 @@ int main(int argc, char* argv[]) {
 		std::string restoreTimestamp;
 		bool waitForDone = false;
 		bool stopWhenDone = true;
+		bool usePartitionedLog = false; // Set to true to use new backup system
 		bool forceAction = false;
 		bool trace = false;
 		bool quietDisplay = false;
@ -3153,6 +3159,9 @@ int main(int argc, char* argv[]) {
 				case OPT_NOSTOPWHENDONE:
 					stopWhenDone = false;
 					break;
+				case OPT_USE_PARTITIONED_LOG:
+					usePartitionedLog = true;
+					break;
 				case OPT_RESTORECONTAINER:
 					restoreContainer = args->OptionArg();
 					// If the url starts with '/' then prepend "file://" for backwards compatibility
@ -3568,7 +3577,8 @@ int main(int argc, char* argv[]) {
 					return FDB_EXIT_ERROR;
 				// Test out the backup url to make sure it parses.  Doesn't test to make sure it's actually writeable.
 				openBackupContainer(argv[0], destinationContainer);
-				f = stopAfter( submitBackup(db, destinationContainer, snapshotIntervalSeconds, backupKeys, tagName, dryRun, waitForDone, stopWhenDone) );
+				f = stopAfter(submitBackup(db, destinationContainer, snapshotIntervalSeconds, backupKeys, tagName,
+				                           dryRun, waitForDone, stopWhenDone, usePartitionedLog));
 				break;
 			}

--- a/fdbbackup/fdbbackup.vcxproj
+++ b/fdbbackup/fdbbackup.vcxproj
@ -53,11 +53,11 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
    <CustomBuildBeforeTargets>PreBuildEvent</CustomBuildBeforeTargets>
  </PropertyGroup>
  <ItemDefinitionGroup>
--- a/fdbcli/fdbcli.actor.cpp
+++ b/fdbcli/fdbcli.actor.cpp
@ -936,7 +936,11 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,

 			StatusObjectReader statusObjConfig;
 			StatusArray excludedServersArr;
+			Optional<std::string> activePrimaryDC;

+			if (statusObjCluster.has("active_primary_dc")) {
+				activePrimaryDC = statusObjCluster["active_primary_dc"].get_str();
+			}
 			if (statusObjCluster.get("configuration", statusObjConfig)) {
 				if (statusObjConfig.has("excluded_servers"))
 					excludedServersArr = statusObjConfig.last().get_array();
@ -992,6 +996,73 @@ void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level,

 				if (statusObjConfig.get("log_routers", intVal))
 					outputString += format("\n  Desired Log Routers    - %d", intVal);
+
+				outputString += "\n  Usable Regions         - ";
+				if (statusObjConfig.get("usable_regions", intVal)) {
+					outputString += std::to_string(intVal);
+				} else {
+					outputString += "unknown";
+				}
+
+				StatusArray regions;
+				if (statusObjConfig.has("regions")) {
+					outputString += "\n  Regions: ";
+					regions = statusObjConfig["regions"].get_array();
+					bool isPrimary = false;
+					std::vector<std::string> regionSatelliteDCs;
+					std::string regionDC;
+					for (StatusObjectReader region : regions) {
+						for (StatusObjectReader dc : region["datacenters"].get_array()) {
+							if (!dc.has("satellite")) {
+								regionDC = dc["id"].get_str();
+								if (activePrimaryDC.present() && dc["id"].get_str() == activePrimaryDC.get()) {
+									isPrimary = true;
+								}
+							} else if (dc["satellite"].get_int() == 1) {
+								regionSatelliteDCs.push_back(dc["id"].get_str());
+							}
+						}
+						if (activePrimaryDC.present()) {
+							if (isPrimary) {
+								outputString += "\n    Primary -";
+							} else {
+								outputString += "\n    Remote -";
+							}
+						} else {
+							outputString += "\n    Region -";
+						}
+						outputString += format("\n        Datacenter                    - %s", regionDC.c_str());
+						if (regionSatelliteDCs.size() > 0) {
+							outputString += "\n        Satellite datacenters         - ";
+							for (int i = 0; i < regionSatelliteDCs.size(); i++) {
+								if (i != regionSatelliteDCs.size() - 1) {
+									outputString += format("%s, ", regionSatelliteDCs[i].c_str());
+								} else {
+									outputString += format("%s", regionSatelliteDCs[i].c_str());
+								}
+							}
+						}
+						isPrimary = false;
+						if (region.get("satellite_redundancy_mode", strVal)) {
+							outputString += format("\n        Satellite Redundancy Mode     - %s", strVal.c_str());
+						}
+						if (region.get("satellite_anti_quorum", intVal)) {
+							outputString += format("\n        Satellite Anti Quorum         - %d", intVal);
+						}
+						if (region.get("satellite_logs", intVal)) {
+							outputString += format("\n        Satellite Logs                - %d", intVal);
+						}
+						if (region.get("satellite_log_policy", strVal)) {
+							outputString += format("\n        Satellite Log Policy          - %s", strVal.c_str());
+						}
+						if (region.get("satellite_log_replicas", intVal)) {
+							outputString += format("\n        Satellite Log Replicas        - %d", intVal);
+						}
+						if (region.get("satellite_usable_dcs", intVal)) {
+							outputString += format("\n        Satellite Usable DCs          - %d", intVal);
+						}
+					}
+				}
 			}
 			catch (std::runtime_error& ) {
 				outputString = outputStringCache;
--- a/fdbcli/fdbcli.vcxproj
+++ b/fdbcli/fdbcli.vcxproj
@ -62,13 +62,13 @@
    <LinkIncremental>true</LinkIncremental>
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
    <LinkIncremental>false</LinkIncremental>
    <OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
    <IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <ItemDefinitionGroup>
    <ClCompile>
--- a/fdbclient/BackupAgent.actor.h
+++ b/fdbclient/BackupAgent.actor.h
@ -362,6 +362,9 @@ public:

 	Future<bool> checkActive(Database cx) { return taskBucket->checkActive(cx); }

+	// If "pause" is true, pause all backups; otherwise, resume all.
+	Future<Void> changePause(Database db, bool pause);
+
 	friend class FileBackupAgentImpl;
 	static const int dataFooterSize;

--- a/fdbclient/CommitTransaction.h
+++ b/fdbclient/CommitTransaction.h
@ -48,7 +48,8 @@ static const char* typeString[] = { "SetValue",
 	                                "ByteMax",
 	                                "MinV2",
 	                                "AndV2",
-	                                "CompareAndClear"};
+	                                "CompareAndClear",
+	                                "MAX_ATOMIC_OP" };

 struct MutationRef {
 	static const int OVERHEAD_BYTES = 12; //12 is the size of Header in MutationList entries
@ -124,6 +125,10 @@ struct MutationRef {
 	};
 };

+// Maps a mutation type to its entry in the typeString table; any value at or
+// beyond MutationRef::MAX_ATOMIC_OP is reported as "Unset".
+static inline std::string getTypeString(MutationRef::Type type) {
+	if (type < MutationRef::MAX_ATOMIC_OP) {
+		return typeString[static_cast<int>(type)];
+	}
+	return "Unset";
+}
+
 // A 'single key mutation' is one which affects exactly the value of the key specified by its param1
 static inline bool isSingleKeyMutation(MutationRef::Type type) {
 	return (MutationRef::SINGLE_KEY_MASK & (1<<type)) != 0;
--- a/fdbclient/FileBackupAgent.actor.cpp
+++ b/fdbclient/FileBackupAgent.actor.cpp
@ -4049,6 +4049,28 @@ public:
 		return Void();
 	}

+	ACTOR static Future<Void> changePause(FileBackupAgent* backupAgent, Database db, bool pause) { // pause == true pauses all backups; false resumes them
+		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(db));
+		state Future<Void> change = backupAgent->taskBucket->changePause(db, pause); // task-bucket pause request; awaited after the commit loop below
+
+		loop { // retry until the commit succeeds
+			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // the three options are set at the top of every attempt
+			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
+			tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+
+			try {
+				tr->set(backupPausedKey, pause ? LiteralStringRef("1") : LiteralStringRef("0")); // stored value is "1" when pausing, "0" when resuming
+				wait(tr->commit());
+				break;
+			} catch (Error& e) {
+				wait(tr->onError(e)); // wait on the transaction's error handler, then go around the loop again
+			}
+		}
+		wait(change); // the task-bucket change must also finish before the result is traced and returned
+		TraceEvent("FileBackupAgentChangePaused").detail("Action", pause ? "Paused" : "Resumed");
+		return Void();
+	}
+
 	struct TimestampedVersion {
 		Optional<Version> version;
 		Optional<int64_t> epochs;
@ -4652,3 +4674,7 @@ void FileBackupAgent::setLastRestorable(Reference<ReadYourWritesTransaction> tr,
 Future<int> FileBackupAgent::waitBackup(Database cx, std::string tagName, bool stopWhenDone, Reference<IBackupContainer> *pContainer, UID *pUID) {
 	return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone, pContainer, pUID);
 }
+
+Future<Void> FileBackupAgent::changePause(Database db, bool pause) {
+	return FileBackupAgentImpl::changePause(this, db, pause); // forwards to the FileBackupAgentImpl actor of the same name, passing this agent
+}
--- a/fdbclient/Knobs.cpp
+++ b/fdbclient/Knobs.cpp
@ -32,6 +32,7 @@ ClientKnobs::ClientKnobs() {
 }

 void ClientKnobs::initialize(bool randomize) {
+	// clang-format off
 	// FIXME: These are not knobs, get them out of ClientKnobs!
 	BYTE_LIMIT_UNLIMITED = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
 	ROW_LIMIT_UNLIMITED = GetRangeLimits::ROW_LIMIT_UNLIMITED;
@ -216,6 +217,10 @@ void ClientKnobs::initialize(bool randomize) {
 	//fdbcli		
 	init( CLI_CONNECT_PARALLELISM,                  400 );
 	init( CLI_CONNECT_TIMEOUT,                     10.0 );
+
+	// trace
+	init( TRACE_LOG_FILE_IDENTIFIER_MAX_LENGTH,      50 );
+	// clang-format on
 }

 TEST_CASE("/fdbclient/knobs/initialize") {
--- a/fdbclient/Knobs.h
+++ b/fdbclient/Knobs.h
@ -202,6 +202,9 @@ public:
 	int CLI_CONNECT_PARALLELISM;
 	double CLI_CONNECT_TIMEOUT;

+	// trace
+	int TRACE_LOG_FILE_IDENTIFIER_MAX_LENGTH;
+
 	ClientKnobs();
 	void initialize(bool randomize = false);
 };
--- a/fdbclient/MultiVersionTransaction.actor.cpp
+++ b/fdbclient/MultiVersionTransaction.actor.cpp
@ -321,7 +321,7 @@ void DLApi::init() {
 	loadClientFunction(&api->transactionReset, lib, fdbCPath, "fdb_transaction_reset");
 	loadClientFunction(&api->transactionCancel, lib, fdbCPath, "fdb_transaction_cancel");
 	loadClientFunction(&api->transactionAddConflictRange, lib, fdbCPath, "fdb_transaction_add_conflict_range");
-	loadClientFunction(&api->transactionGetEstimatedRangeSizeBytes, lib, fdbCPath, "fdb_transaction_get_estimated_range_size_bytes", headerVersion >= 700);
+	loadClientFunction(&api->transactionGetEstimatedRangeSizeBytes, lib, fdbCPath, "fdb_transaction_get_estimated_range_size_bytes", headerVersion >= 630);

 	loadClientFunction(&api->futureGetInt64, lib, fdbCPath, headerVersion >= 620 ? "fdb_future_get_int64" : "fdb_future_get_version");
 	loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error");
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@ -21,6 +21,7 @@
 #include "fdbclient/NativeAPI.actor.h"

 #include <iterator>
+#include <regex>
 #include <unordered_set>

 #include "fdbclient/Atomic.h"
@ -794,7 +795,9 @@ Database Database::createDatabase( Reference<ClusterConnectionFile> connFile, in
 			auto publicIP = determinePublicIPAutomatically( connFile->getConnectionString() );
 			selectTraceFormatter(networkOptions.traceFormat);
 			selectTraceClockSource(networkOptions.traceClockSource);
-			openTraceFile(NetworkAddress(publicIP, ::getpid()), networkOptions.traceRollSize, networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace", networkOptions.traceLogGroup);
+			openTraceFile(NetworkAddress(publicIP, ::getpid()), networkOptions.traceRollSize,
+			              networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace",
+			              networkOptions.traceLogGroup, networkOptions.traceFileIdentifier);

 			TraceEvent("ClientStart")
 				.detail("SourceVersion", getSourceVersion())
@ -843,8 +846,9 @@ const UniqueOrderedOptionList<FDBTransactionOptions>& Database::getTransactionDe
 }

 void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> value) {
+	std::regex identifierRegex("^[a-zA-Z0-9_]*$");
 	switch(option) {
-		// SOMEDAY: If the network is already started, should these four throw an error?
+		// SOMEDAY: If the network is already started, should these five throw an error?
 		case FDBNetworkOptions::TRACE_ENABLE:
 			networkOptions.traceDirectory = value.present() ? value.get().toString() : "";
 			break;
@ -864,6 +868,17 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> valu
 				throw invalid_option_value();
 			}
 			break;
+		case FDBNetworkOptions::TRACE_FILE_IDENTIFIER: {
+			// Stores the trace file identifier after checking that it is no longer than
+			// TRACE_LOG_FILE_IDENTIFIER_MAX_LENGTH and matches identifierRegex.
+			// Throws invalid_option_value() if either check fails.
+			validateOptionValue(value, true);
+			// Validate a local copy first so that a rejected value is never left behind in
+			// networkOptions once invalid_option_value() has been thrown.
+			const std::string identifier = value.get().toString();
+			if (identifier.length() > CLIENT_KNOBS->TRACE_LOG_FILE_IDENTIFIER_MAX_LENGTH) {
+				fprintf(stderr, "Trace file identifier provided is too long.\n");
+				throw invalid_option_value();
+			} else if (!std::regex_match(identifier, identifierRegex)) {
+				fprintf(stderr, "Trace file identifier should only contain alphanumerics and underscores.\n");
+				throw invalid_option_value();
+			}
+			networkOptions.traceFileIdentifier = identifier;
+			break;
+		}

 		case FDBNetworkOptions::TRACE_LOG_GROUP:
 			if(value.present()) {
@ -2497,7 +2512,7 @@ void TransactionOptions::reset(Database const& cx) {
 	maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF;
 	sizeLimit = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT;
 	lockAware = cx->lockAware;
-	if (cx->apiVersionAtLeast(700)) {
+	if (cx->apiVersionAtLeast(630)) {
 		includePort = true;
 	}
 }
@ -3037,6 +3052,12 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
 				trLogInfo = Reference<TransactionLogInfo>(new TransactionLogInfo(value.get().printable(), TransactionLogInfo::DONT_LOG));
 				trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength;
 			}
+			if (info.debugID.present()) {
+				TraceEvent(SevInfo, "TransactionBeingTraced")
+					.detail("DebugTransactionID", trLogInfo->identifier)
+					.detail("ServerTraceID", info.debugID.get().toString());
+
+			}
 			break;

 		case FDBTransactionOptions::LOG_TRANSACTION:
@ -3064,6 +3085,16 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
 			}
 			break;

+		case FDBTransactionOptions::SERVER_REQUEST_TRACING:
+			validateOptionValue(value, false);
+			debugTransaction(deterministicRandom()->randomUniqueID());
+			if (trLogInfo && !trLogInfo->identifier.empty()) {
+				TraceEvent(SevInfo, "TransactionBeingTraced")
+					.detail("DebugTransactionID", trLogInfo->identifier)
+					.detail("ServerTraceID", info.debugID.get().toString());
+			}
+			break;
+
 		case FDBTransactionOptions::MAX_RETRY_DELAY:
 			validateOptionValue(value, true);
 			options.maxBackoff = extractIntOption(value, 0, std::numeric_limits<int32_t>::max()) / 1000.0;
--- a/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/NativeAPI.actor.h
@ -58,6 +58,7 @@ struct NetworkOptions {
 	std::string traceLogGroup;
 	std::string traceFormat;
 	std::string traceClockSource;
+	std::string traceFileIdentifier;
 	Optional<bool> logClientInfo;
 	Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions;
 	bool slowTaskProfilingEnabled;
--- a/fdbclient/RestoreWorkerInterface.actor.h
+++ b/fdbclient/RestoreWorkerInterface.actor.h
@ -250,6 +250,16 @@ struct RestoreAsset {
 	bool isInVersionRange(Version commitVersion) const {
 		return commitVersion >= beginVersion && commitVersion < endVersion;
 	}
+
+	// Returns true when the mutation lies inside this RestoreAsset's key range.
+	// A range mutation needs range.begin <= param1 and param2 <= range.end (its right
+	// side is exclusive); any other mutation needs range.begin <= param1 < range.end.
+	bool isInKeyRange(MutationRef mutation) const {
+		const bool startsInside = mutation.param1 >= range.begin;
+		const bool endsInside = isRangeMutation(mutation) ? mutation.param2 <= range.end
+		                                                  : mutation.param1 < range.end;
+		return startsInside && endsInside;
+	}
 };

 struct LoadingParam {
--- a/fdbclient/Schemas.cpp
+++ b/fdbclient/Schemas.cpp
@ -398,7 +398,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
                  "consistencycheck_suspendkey_fetch_timeout",
                  "consistencycheck_disabled",
                  "duplicate_mutation_streams",
-                  "duplicate_mutation_fetch_timeout"
+                  "duplicate_mutation_fetch_timeout",
+                  "primary_dc_missing",
+                  "fetch_primary_dc_timeout"
               ]
            },
            "issues":[
@ -535,6 +537,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
      "data_distribution_disabled_for_ss_failures":true,
      "data_distribution_disabled_for_rebalance":true,
      "data_distribution_disabled":true,
+      "active_primary_dc":"pv",
      "configuration":{
         "log_anti_quorum":0,
         "log_replicas":2,
--- a/fdbclient/SystemData.cpp
+++ b/fdbclient/SystemData.cpp
@ -501,6 +501,7 @@ const KeyRangeRef backupProgressKeys(LiteralStringRef("\xff\x02/backupProgress/"
                                     LiteralStringRef("\xff\x02/backupProgress0"));
 const KeyRef backupProgressPrefix = backupProgressKeys.begin;
 const KeyRef backupStartedKey = LiteralStringRef("\xff\x02/backupStarted");
+// Key that signals backup workers to pause or resume (declared extern in SystemData.h).
+const KeyRef backupPausedKey = LiteralStringRef("\xff\x02/backupPaused");

 const Key backupProgressKeyFor(UID workerID) {
 	BinaryWriter wr(Unversioned());
--- a/fdbclient/SystemData.h
+++ b/fdbclient/SystemData.h
@ -179,7 +179,7 @@ const Value workerListValue( ProcessData const& );
 Key decodeWorkerListKey( KeyRef const& );
 ProcessData decodeWorkerListValue( ValueRef const& );

-//    "\xff/backupProgress/[[workerID]]" := "[[WorkerBackupStatus]]"
+//    "\xff\x02/backupProgress/[[workerID]]" := "[[WorkerBackupStatus]]"
 extern const KeyRangeRef backupProgressKeys;
 extern const KeyRef backupProgressPrefix;
 const Key backupProgressKeyFor(UID workerID);
@ -187,11 +187,16 @@ const Value backupProgressValue(const WorkerBackupStatus& status);
 UID decodeBackupProgressKey(const KeyRef& key);
 WorkerBackupStatus decodeBackupProgressValue(const ValueRef& value);

-//    "\xff/backupStarted" := "[[vector<UID,Version1>]]"
+// The key to signal backup workers a new backup job is submitted.
+//    "\xff\x02/backupStarted" := "[[vector<UID,Version1>]]"
 extern const KeyRef backupStartedKey;
 Value encodeBackupStartedValue(const std::vector<std::pair<UID, Version>>& ids);
 std::vector<std::pair<UID, Version>> decodeBackupStartedValue(const ValueRef& value);

+// The key to signal backup workers that they should pause or resume.
+//    "\xff\x02/backupPaused" := "[[0|1]]"
+extern const KeyRef backupPausedKey;
+
 extern const KeyRef coordinatorsKey;
 extern const KeyRef logsKey;
 extern const KeyRef minRequiredCommitVersionKey;
--- a/fdbclient/fdbclient.vcxproj
+++ b/fdbclient/fdbclient.vcxproj
@ -167,11 +167,11 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <ItemDefinitionGroup>
    <ClCompile>
--- a/fdbclient/vexillographer/fdb.options
+++ b/fdbclient/vexillographer/fdb.options
@ -54,6 +54,9 @@ description is not currently required but encouraged.
    <Option name="trace_clock_source" code="35"
            paramType="String" paramDescription="Trace clock source"
            description="Select clock source for trace files. now (the default) or realtime are supported." />
+    <Option name="trace_file_identifier" code="36"
+            paramType="String" paramDescription="The identifier that will be part of all trace file names"
+            description="Once provided, this string will be used to replace the port/PID in the log file names." />
    <Option name="knob" code="40"
            paramType="String" paramDescription="knob_name=knob_value"
            description="Set internal tuning or debugging knobs"/>
@ -175,7 +178,7 @@ description is not currently required but encouraged.
            description="The read version will be committed, and usually will be the latest committed, but might not be the latest committed in the event of a simultaneous fault and misbehaving clock."
            defaultFor="20"/>
    <Option name="transaction_include_port_in_address" code="505"
-            description="Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect."
+            description="Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
            defaultFor="23"/>
  </Scope>
  
@ -186,7 +189,7 @@ description is not currently required but encouraged.
            description="The read version will be committed, and usually will be the latest committed, but might not be the latest committed in the event of a simultaneous fault and misbehaving clock."/>
    <Option name="causal_read_disable" code="21" />
    <Option name="include_port_in_address" code="23"
-            description="Addresses returned by get_addresses_for_key include the port when enabled. As of api version 700, this option is enabled by default and setting this has no effect." />
+            description="Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect." />
    <Option name="next_write_no_write_conflict_range" code="30"
            description="The next write performed on this transaction will not generate a write conflict range. As a result, other transactions which read the key(s) being modified by the next write will not conflict with this transaction. Care needs to be taken when using this option on a transaction that is shared between multiple threads. When setting this option, write conflict ranges will be disabled on the next write operation, regardless of what thread it is on." />
    <Option name="commit_on_first_proxy" code="40"
@ -223,6 +226,8 @@ description is not currently required but encouraged.
            description="Enables tracing for this transaction and logs results to the client trace logs. The DEBUG_TRANSACTION_IDENTIFIER option must be set before using this option, and client trace logging must be enabled to get log output." />
    <Option name="transaction_logging_max_field_length" code="405" paramType="Int" paramDescription="Maximum length of escaped key and value fields."
            description="Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation." />
+    <Option name="server_request_tracing" code="406"
+	    description="Sets an identifier for server tracing of this transaction. When committed, this identifier triggers logging when each part of the transaction authority encounters it, which is helpful in diagnosing slowness in misbehaving clusters. The identifier is randomly generated. When there is also a debug_transaction_identifier, both IDs are logged together." />
    <Option name="timeout" code="500"
            paramType="Int" paramDescription="value in milliseconds of timeout"
            description="Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option."
--- a/fdbrpc/RangeMap.h
+++ b/fdbrpc/RangeMap.h
@ -78,16 +78,16 @@ public:

 		Range range() { return Range(begin(),end()); }

-		Val& value() { 
+		Val& value() {
 			//ASSERT( it->key != allKeys.end );
-			return it->value; 
+			return it->value;
 		}

 		void operator ++() { ++it; }
 		void operator --() { it.decrementNonEnd(); }
 		bool operator ==(Iterator const& r) const { return it == r.it; }
 		bool operator !=(Iterator const& r) const { return it != r.it; }
-		
+
 		// operator* and -> return this
 		Iterator& operator*() { return *this; }
 		Iterator* operator->() { return this; }
@ -131,10 +131,10 @@ public:
 		--i;
 		return i;
 	}
-	Iterator lastItem() { 
+	Iterator lastItem() {
 		auto i = map.lastItem();
 		i.decrementNonEnd();
-		return Iterator(i); 
+		return Iterator(i);
 	}
 	int size() const { return map.size() - 1; } // We always have one range bounded by two entries
 	Iterator randomRange() {
--- a/fdbrpc/fdbrpc.vcxproj
+++ b/fdbrpc/fdbrpc.vcxproj
@ -147,11 +147,11 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
+    <IncludePath>$(IncludePath);../;C:\Program Files\boost_1_72_0</IncludePath>
  </PropertyGroup>
  <ItemDefinitionGroup>
    <CustomBuildStep>
--- a/Show More
+++ b/Show More