Merge branch 'master' of https://github.com/apple/foundationdb into include-failed-ss

This commit is contained in:
Jon Fu 2019-11-25 11:05:35 -08:00
commit 92892d4168
135 changed files with 8234 additions and 4893 deletions

2
.gitignore vendored
View File

@ -31,8 +31,10 @@ bindings/ruby/lib/fdboptions.rb
bindings/ruby/fdb.gemspec
fdbclient/vexillographer/obj/
fdbrpc/hgVersion*.h
fdbrpc/SourceVersion*.h
fdbrpc/libeio/config.h
flow/hgVersion*.h
flow/SourceVersion*.h
generated.mk
versions.h
packaging/msi/FDBInstaller.wix*

View File

@ -29,18 +29,23 @@ if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}")
message(FATAL_ERROR "In-source builds are forbidden")
endif()
set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)")
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to 'Release' as none was specified")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
"MinSizeRel" "RelWithDebInfo")
if (OPEN_FOR_IDE)
message(STATUS "Defaulting build type to 'Debug' for OPEN_FOR_IDE")
set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build" FORCE)
else()
message(STATUS "Setting build type to 'Release' as none was specified")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
"MinSizeRel" "RelWithDebInfo")
endif()
endif()
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)")
################################################################################
# Packages used for bindings
################################################################################
@ -196,7 +201,7 @@ add_subdirectory(tests)
if(WITH_DOCUMENTATION)
add_subdirectory(documentation)
endif()
add_subdirectory(monitoring)
add_subdirectory(contrib/monitoring)
if(WIN32)
add_subdirectory(packaging/msi)

View File

@ -44,6 +44,8 @@ ifeq ($(PLATFORM),Linux)
ifneq '' '$(findstring clang++,$(CXX))'
CXXFLAGS += -Wno-undefined-var-template -Wno-unknown-warning-option -Wno-unused-command-line-argument -Wno-register -Wno-logical-op-parentheses
else
CXXFLAGS += -Wno-attributes
endif
CXXFLAGS += -std=c++17

File diff suppressed because it is too large Load Diff

View File

@ -17,8 +17,6 @@
#include <limits.h>
#endif
#define DEFAULT_RETRY_COUNT 3
#define VERBOSE_NONE 0
#define VERBOSE_DEFAULT 1
#define VERBOSE_ANNOYING 2
@ -29,9 +27,11 @@
#define MODE_BUILD 1
#define MODE_RUN 2
/* we set mako_txn_t and mako_args_t only once in the master process,
* and won't be touched by child processes.
*/
#define FDB_SUCCESS 0
#define FDB_ERROR_RETRY -1
#define FDB_ERROR_ABORT -2
#define FDB_ERROR_CONFLICT -3
/* transaction specification */
enum Operations {
@ -55,7 +55,7 @@ enum Operations {
#define OP_RANGE 1
#define OP_REVERSE 2
/* for arguments */
/* for long arguments */
enum Arguments {
ARG_KEYLEN,
ARG_VALLEN,
@ -82,6 +82,10 @@ enum TPSChangeTypes {
#define KEYPREFIX "mako"
#define KEYPREFIXLEN 4
/* we set mako_txnspec_t and mako_args_t only once in the master process,
* and won't be touched by child processes.
*/
typedef struct {
/* for each operation, it stores "count", "range" and "reverse" */
int ops[MAX_OP][3];
@ -91,6 +95,7 @@ typedef struct {
/* benchmark parameters */
typedef struct {
int api_version;
int json;
int num_processes;
int num_threads;

View File

@ -38,6 +38,9 @@ Arguments
| - ``build``: Populate data
| - ``run``: Run the benchmark
- | ``-a | --api_version <api_version>``
| FDB API version to use (Default: Latest)
- | ``-c | --cluster <cluster file>``
| FDB cluster file (Required)
@ -48,7 +51,7 @@ Arguments
| Number of threads per worker process (Default: 1)
- | ``-r | --rows <rows>``
| Number of rows populated (Default: 10000)
| Number of rows populated (Default: 100000)
- | ``-s | --seconds <seconds>``
| Test duration in seconds (Default: 30)
@ -58,12 +61,23 @@ Arguments
| Specify the number of operations to be executed.
| This option cannot be set with ``--seconds``.
- | ``--tps <tps>``
| Target total transaction-per-second (TPS) of all worker processes/threads
- | ``--tps|--tpsmax <tps>``
| Target total transaction-per-second (TPS) of all worker processes/threads.
| When --tpsmin is also specified, this defines the upper-bound TPS.
| (Default: Unset / Unthrottled)
- | ``--tpsmin <tps>``
| Target total lower-bound TPS of all worker processes/threads
| (Default: Unset / Unthrottled)
- | ``--tpsinterval <seconds>``
| Time period TPS oscillates between --tpsmax and --tpsmin (Default: 10)
- | ``--tpschange <sin|square|pulse>``
| Shape of the TPS change (Default: sin)
- | ``--keylen <num>``
| Key string length in bytes (Default and Minimum: 16)
| Key string length in bytes (Default and Minimum: 32)
- | ``--vallen <num>``
| Value string length in bytes (Default and Minimum: 16)
@ -75,22 +89,19 @@ Arguments
| Generate a skewed workload based on Zipf distribution (Default: Unset = Uniform)
- | ``--sampling <num>``
| Sampling rate (1 sample / <num> ops) for latency stats
| Sampling rate (1 sample / <num> ops) for latency stats (Default: 1000)
- | ``--trace``
| Enable tracing. The trace file will be created in the current directory.
| Enable tracing. The trace file will be created in the current directory. (Default: Unset)
- | ``--tracepath <path>``
| Enable tracing and set the trace file path.
- | ``--knobs <knobs>``
| Set client knobs
- | ``--flatbuffers``
| Enable flatbuffers
| Set client knobs (comma-separated)
- | ``--commitget``
| Force commit for read-only transactions
| Force commit for read-only transactions (Default: Unset)
- | ``-v | --verbose <level>``
| Set verbose level (Default: 1)

View File

@ -30,7 +30,7 @@ import java.util.concurrent.atomic.AtomicInteger;
/**
* The starting point for accessing FoundationDB.
* <br>
* <h3>Setting API version</h3>
* <h2>Setting API version</h2>
* The FoundationDB API is accessed with a call to {@link #selectAPIVersion(int)}.
* This call is required before using any other part of the API. The call allows
* an error to be thrown at this point to prevent client code from accessing a later library
@ -49,11 +49,11 @@ import java.util.concurrent.atomic.AtomicInteger;
* being used to connect to the cluster. In particular, you should not advance
* the API version of your application after upgrading your client until the
* cluster has also been upgraded.<br>
* <h3>Getting a database</h3>
* <h2>Getting a database</h2>
* Once the API version has been set, the easiest way to get a {@link Database} object to use is
* to call {@link #open}.
* <br>
* <h3>Client networking</h3>
* <h2>Client networking</h2>
* The network is started either implicitly with a call to a variant of {@link #open()}
* or started explicitly with a call to {@link #startNetwork()}.
* <br>

View File

@ -39,7 +39,7 @@ import com.apple.foundationdb.Range;
* the same order in which they would sort in FoundationDB. {@code Tuple}s sort
* first by the first element, then by the second, etc. This makes the tuple layer
* ideal for building a variety of higher-level data models.<br>
* <h3>Types</h3>
* <h2>Types</h2>
* A {@code Tuple} can
* contain byte arrays ({@code byte[]}), {@link String}s, {@link Number}s, {@link UUID}s,
* {@code boolean}s, {@link List}s, {@link Versionstamp}s, other {@code Tuple}s, and {@code null}.
@ -50,7 +50,7 @@ import com.apple.foundationdb.Range;
* a {@code long} integral value, so the range will be constrained to
* [{@code -2^63}, {@code 2^63-1}]. Note that for numbers outside this range the way that Java
* truncates integral values may yield unexpected results.<br>
* <h3>{@code null} values</h3>
* <h2>{@code null} values</h2>
* The FoundationDB tuple specification has a special type-code for {@code None}; {@code nil}; or,
* as Java would understand it, {@code null}.
* The behavior of the layer in the presence of {@code null} varies by type with the intention

View File

@ -2,7 +2,7 @@
<BODY>
This documents the client API for using FoundationDB from Java.<br>
<br>
<h3>Installation</h3>
<h1>Installation</h1>
FoundationDB's Java bindings rely on native libraries that are installed as part of the
FoundationDB client binaries installation (see
<a href="/foundationdb/api-general.html#installing-client-binaries" target="_blank">
@ -10,7 +10,7 @@ Installing FoundationDB client binaries</a>). The JAR can be downloaded from
<a href="https://www.foundationdb.org/download/">our website</a>
and then added to your classpath.<br>
<br>
<h3>Getting started</h3>
<h1>Getting started</h1>
To start using FoundationDB from Java, create an instance of the
{@link com.apple.foundationdb.FDB FoundationDB API interface} with the version of the
API that you want to use (this release of the FoundationDB Java API supports versions between {@code 510} and {@code 620}).
@ -50,7 +50,7 @@ public class Example {
}
}
</pre>
<h3>FoundationDB {@link com.apple.foundationdb.tuple Tuple API}</h3>
<h1>FoundationDB {@link com.apple.foundationdb.tuple Tuple API}</h1>
The {@link com.apple.foundationdb.tuple Tuple API} is provided with the core Java API for FoundationDB.
This layer is provided in some form in all official language bindings. It enables
cross-language support for storing and retrieving typed data from the
@ -60,7 +60,7 @@ binary data that FoundationDB supports. And, just as importantly, data packed in
and <a href="/foundationdb/data-modeling.html#data-modeling-tuples">general Tuple documentation</a>
for information about how Tuples sort and can be used to efficiently model data.
<br>
<h3>FoundationDB {@link com.apple.foundationdb.directory Directory API}</h3>
<h1>FoundationDB {@link com.apple.foundationdb.directory Directory API}</h1>
The {@link com.apple.foundationdb.directory Directory API} is provided with the core
Java API for FoundationDB. This layer is provided in some form in all official
language bindings. The FoundationDB API provides directories as a tool for

View File

@ -130,9 +130,69 @@ function(add_fdb_test)
${VALGRIND_OPTION}
${ADD_FDB_TEST_TEST_FILES}
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
get_filename_component(test_dir_full ${first_file} DIRECTORY)
if(NOT ${test_dir_full} STREQUAL "")
get_filename_component(test_dir ${test_dir_full} NAME)
set_tests_properties(${test_name} PROPERTIES TIMEOUT ${this_test_timeout} LABELS "${test_dir}")
endif()
get_filename_component(test_dir_full ${first_file} DIRECTORY)
if(NOT ${test_dir_full} STREQUAL "")
get_filename_component(test_dir ${test_dir_full} NAME)
set_tests_properties(${test_name} PROPERTIES TIMEOUT ${this_test_timeout} LABELS "${test_dir}")
endif()
# set variables used for generating test packages
set(TEST_NAMES ${TEST_NAMES} ${test_name} PARENT_SCOPE)
set(TEST_FILES_${test_name} ${ADD_FDB_TEST_TEST_FILES} PARENT_SCOPE)
set(TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE)
endfunction()
# Cache knobs controlling which tests end up in the correctness test package.
# Packaging is POSIX-only (create_test_package returns early on WIN32), so the
# options are not offered on Windows.
if(NOT WIN32)
  set(TEST_PACKAGE_INCLUDE ".*" CACHE STRING "A regex of all tests that should be included in the test package")
  set(TEST_PACKAGE_EXCLUDE ".^" CACHE STRING "A regex of all tests that shouldn't be added to the test package")
  set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. All files within each directory will be added to the test package")
endif()
# Builds a tar.gz "correctness" package containing every simulation test that
# matches TEST_PACKAGE_INCLUDE and not TEST_PACKAGE_EXCLUDE, plus all files
# from TEST_PACKAGE_ADD_DIRECTORIES and the stripped fdbserver binary.
# Relies on the TEST_NAMES / TEST_FILES_<name> / TEST_TYPE_<name> variables
# exported to this scope by add_fdb_test. No-op on Windows.
function(create_test_package)
  if(WIN32)
    return()
  endif()
  # Length of the tests/ source prefix; used to relativize test file paths.
  string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length)
  foreach(test IN LISTS TEST_NAMES)
    if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND
       (${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND
       (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE}))
      foreach(file IN LISTS TEST_FILES_${test})
        # Copy each test file into packages/tests/, preserving its path
        # relative to the tests/ directory.
        string(SUBSTRING ${file} ${base_length} -1 rel_out_file)
        set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file})
        list(APPEND out_files ${out_file})
        get_filename_component(test_dir ${out_file} DIRECTORY)
        file(MAKE_DIRECTORY packages/tests/${test_dir})
        add_custom_command(
          OUTPUT ${out_file}
          DEPENDS ${file}
          COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file})
      endforeach()
    endif()
  endforeach()
  # Extra user-supplied directories are copied wholesale into packages/.
  # NOTE(review): these are copied at configure time (file(COPY)), not via
  # add_custom_command, so changes to them require a re-configure.
  foreach(dir IN LISTS TEST_PACKAGE_ADD_DIRECTORIES)
    file(GLOB_RECURSE files ${dir}/*)
    string(LENGTH ${dir} dir_len)
    foreach(file IN LISTS files)
      get_filename_component(src_dir ${file} DIRECTORY)
      # We need to make sure that ${src_dir} is at least
      # as long as ${dir}. Otherwise the later call to
      # SUBSTRING will fail
      set(src_dir "${src_dir}/")
      string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir)
      string(SUBSTRING ${file} ${dir_len} -1 out_file)
      list(APPEND external_files ${CMAKE_BINARY_DIR}/packages/${out_file})
      file(COPY ${file} DESTINATION ${CMAKE_BINARY_DIR}/packages/${dest_dir})
    endforeach()
  endforeach()
  # Bundle everything (plus the stripped server binary) into one archive.
  set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness.tar.gz)
  add_custom_command(
    OUTPUT ${tar_file}
    DEPENDS ${out_files}
    COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
            ${out_files} ${external_files}
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
    COMMENT "Package correctness archive"
    )
  # strip_fdbserver produces packages/bin/fdbserver consumed by the tar command
  # above; add_dependencies only orders the builds.
  add_custom_target(package_tests DEPENDS ${tar_file})
  add_dependencies(package_tests strip_fdbserver)
endfunction()

View File

@ -217,7 +217,13 @@ else()
else()
add_compile_options(-Werror)
endif()
add_compile_options($<$<BOOL:${GCC}>:-Wno-pragmas>)
if (GCC)
add_compile_options(-Wno-pragmas)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)
endif()
add_compile_options(-Wno-error=format
-Wunused-variable
-Wno-deprecated
@ -235,9 +241,14 @@ else()
# Check whether we can use dtrace probes
include(CheckSymbolExists)
check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE)
check_symbol_exists(aligned_alloc stdlib.h HAS_ALIGNED_ALLOC)
message(STATUS "Has aligned_alloc: ${HAS_ALIGNED_ALLOC}")
if(SUPPORT_DTRACE)
add_compile_definitions(DTRACE_PROBES)
endif()
if(HAS_ALIGNED_ALLOC)
add_compile_definitions(HAS_ALIGNED_ALLOC)
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
set(USE_LTO OFF CACHE BOOL "Do link time optimization")

View File

@ -136,7 +136,6 @@ function(strip_debug_symbols target)
add_custom_command(OUTPUT "${out_file}.debug"
COMMAND objcopy --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug" &&
objcopy --add-gnu-debuglink="${out_file}.debug" ${out_file}
DEPENDS "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
list(APPEND out_files "${out_file}.debug")
endif()

View File

@ -0,0 +1,806 @@
"""
Requirements:
python3
fdb python bindings
optional packages:
dateparser (for human date parsing)
sortedcontainers (for estimating key range read/write density)
"""
import argparse
from collections import defaultdict
from enum import Enum
import fdb
from fdb.impl import strinc
import json
from json import JSONEncoder
import logging
import struct
from bisect import bisect_left
import time
PROTOCOL_VERSION_5_2 = 0x0FDB00A552000001
PROTOCOL_VERSION_6_0 = 0x0FDB00A570010001
PROTOCOL_VERSION_6_1 = 0x0FDB00B061060001
PROTOCOL_VERSION_6_2 = 0x0FDB00B062010001
supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_6_0, PROTOCOL_VERSION_6_1,
PROTOCOL_VERSION_6_2])
fdb.api_version(600)
BASIC_FORMAT = "%(asctime)s - %(levelname)-8s %(message)s"
LOG_PATH = "transaction_profiling_analyzer.log"


def setup_logger(name):
    """Return a DEBUG-level, non-propagating logger writing to LOG_PATH."""
    handler = logging.FileHandler(LOG_PATH)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(BASIC_FORMAT))

    result = logging.getLogger(name)
    result.setLevel(logging.DEBUG)
    result.propagate = False
    result.addHandler(handler)
    return result


logger = setup_logger(__name__)
class ByteBuffer(object):
    """Sequential reader over a bytes value; fixed-width numbers are little-endian."""

    def __init__(self, val):
        self.val = val
        self._offset = 0

    def get_bytes(self, n):
        # Refuse short reads rather than returning a truncated slice.
        if n + self._offset > len(self.val):
            raise IndexError("Request to read %d bytes with only %d remaining" % (n, self.get_remaining_bytes()))
        start = self._offset
        self._offset = start + n
        return self.val[start:start + n]

    def _unpack(self, fmt, size):
        # Shared helper for the fixed-width numeric readers below.
        (value,) = struct.unpack(fmt, self.get_bytes(size))
        return value

    def get_int(self):
        return self._unpack("<i", 4)

    def get_long(self):
        return self._unpack("<q", 8)

    def get_double(self):
        return self._unpack("<d", 8)

    def get_bytes_with_length(self):
        # A 4-byte length prefix followed by that many bytes.
        return self.get_bytes(self.get_int())

    def get_key_range(self):
        begin = self.get_bytes_with_length()
        end = self.get_bytes_with_length()
        return KeyRange(begin, end)

    def get_key_range_list(self):
        return [self.get_key_range() for _ in range(self.get_int())]

    def get_mutation(self):
        code = ord(self.get_bytes(1))
        return Mutation(code, self.get_bytes_with_length(), self.get_bytes_with_length())

    def get_mutation_list(self):
        return [self.get_mutation() for _ in range(self.get_int())]

    def get_remaining_bytes(self):
        return len(self.val) - self._offset
class ObjJsonEncoder(JSONEncoder):
    """JSON encoder with fallbacks for types json cannot natively encode.

    Enum members are rendered via str(); objects with a __dict__ are rendered
    as that dict; anything else (e.g. bytes) falls back to str().
    """

    def default(self, o):
        try:
            # Bug fix: return the base-class result instead of discarding it.
            # JSONEncoder.default always raises TypeError today, so behavior is
            # unchanged, but dropping a successful return would be a real bug
            # if a cooperating base ever provided one.
            return super().default(o)
        except TypeError:
            if isinstance(o, Enum):
                return str(o)
            if hasattr(o, "__dict__"):
                return o.__dict__
            return str(o)
class TrInfoChunk(object):
    """One stored chunk of a (possibly multi-chunk) transaction info record."""

    def __init__(self, num_chunks, chunk_num, key, value):
        # chunk_num appears to be 1-based; a record is complete when
        # chunk_num == num_chunks (see the loader's reassembly logic).
        self.key, self.value = key, value
        self.num_chunks, self.chunk_num = num_chunks, chunk_num
class KeyRange(object):
    """A pair of keys delimiting a range (start_key / end_key)."""

    def __init__(self, start_key, end_key):
        self.start_key, self.end_key = start_key, end_key
class MutationType(Enum):
    """Mutation op codes as serialized in client transaction info (wire values 0-15)."""
    SET_VALUE = 0
    CLEAR_RANGE = 1
    ADD_VALUE = 2
    DEBUG_KEY_RANGE = 3
    DEBUG_KEY = 4
    NO_OP = 5
    AND = 6
    OR = 7
    XOR = 8
    APPEND_IF_FITS = 9
    AVAILABLE_FOR_REUSE = 10
    RESERVED_FOR_LOG_PROTOCOL_MESSAGE = 11
    MAX = 12
    MIN = 13
    SET_VERSION_STAMPED_KEY = 14
    SET_VERSION_STAMPED_VALUE = 15


class Mutation(object):
    """A single decoded mutation: op code plus its two wire parameters."""

    def __init__(self, code, param_one, param_two):
        # Raises ValueError if ``code`` is not a known MutationType value.
        self.code = MutationType(code)
        self.param_one = param_one
        self.param_two = param_two
class BaseInfo(object):
    """Common base for decoded events; every event begins with a timestamp."""

    def __init__(self, start_timestamp):
        self.start_timestamp = start_timestamp


class GetVersionInfo(BaseInfo):
    """Decoded get-read-version event."""

    def __init__(self, bb, protocol_version):
        super().__init__(bb.get_double())
        self.latency = bb.get_double()
        # The priority field only exists on the wire from protocol 6.2 onward.
        if protocol_version >= PROTOCOL_VERSION_6_2:
            self.transaction_priority_type = bb.get_int()


class GetInfo(BaseInfo):
    """Decoded successful point-read event."""

    def __init__(self, bb):
        super().__init__(bb.get_double())
        self.latency = bb.get_double()
        self.value_size = bb.get_int()
        self.key = bb.get_bytes_with_length()


class GetRangeInfo(BaseInfo):
    """Decoded successful range-read event."""

    def __init__(self, bb):
        super().__init__(bb.get_double())
        self.latency = bb.get_double()
        self.range_size = bb.get_int()
        self.key_range = bb.get_key_range()


class CommitInfo(BaseInfo):
    """Decoded successful commit event."""

    def __init__(self, bb, full_output=True):
        super().__init__(bb.get_double())
        self.latency = bb.get_double()
        self.num_mutations = bb.get_int()
        self.commit_bytes = bb.get_int()
        # Conflict ranges and mutations must always be consumed from the
        # buffer to keep it aligned; they are retained only for full output.
        for attr in ("read_conflict_range", "write_conflict_range"):
            ranges = bb.get_key_range_list()
            if full_output:
                setattr(self, attr, ranges)
        mutations = bb.get_mutation_list()
        if full_output:
            self.mutations = mutations
        self.read_snapshot_version = bb.get_long()


class ErrorGetInfo(BaseInfo):
    """Decoded failed point-read event."""

    def __init__(self, bb):
        super().__init__(bb.get_double())
        self.error_code = bb.get_int()
        self.key = bb.get_bytes_with_length()


class ErrorGetRangeInfo(BaseInfo):
    """Decoded failed range-read event."""

    def __init__(self, bb):
        super().__init__(bb.get_double())
        self.error_code = bb.get_int()
        self.key_range = bb.get_key_range()


class ErrorCommitInfo(BaseInfo):
    """Decoded failed commit event."""

    def __init__(self, bb, full_output=True):
        super().__init__(bb.get_double())
        self.error_code = bb.get_int()
        # Same consume-always / keep-conditionally pattern as CommitInfo.
        for attr in ("read_conflict_range", "write_conflict_range"):
            ranges = bb.get_key_range_list()
            if full_output:
                setattr(self, attr, ranges)
        mutations = bb.get_mutation_list()
        if full_output:
            self.mutations = mutations
        self.read_snapshot_version = bb.get_long()
class UnsupportedProtocolVersionError(Exception):
    """Raised when a record's protocol version is not in the supported set."""

    def __init__(self, protocol_version):
        message = "Unsupported protocol version 0x%0.2X" % protocol_version
        super().__init__(message)
class ClientTransactionInfo:
    """One fully reassembled client transaction info record.

    Decodes a protocol-version header followed by a sequence of typed events
    from ``bb`` (a ByteBuffer). Events not matching ``type_filter`` are still
    decoded (to keep the buffer aligned) but not stored. ``full_output``
    controls whether commits retain conflict ranges and mutations.

    Raises UnsupportedProtocolVersionError for unknown protocol versions and
    a generic Exception for unknown event type codes.
    """

    def __init__(self, bb, full_output=True, type_filter=None):
        # Event accumulators; empty/None means "not seen or filtered out".
        self.get_version = None
        self.gets = []
        self.get_ranges = []
        self.commit = None
        self.error_gets = []
        self.error_get_ranges = []
        self.error_commits = []

        protocol_version = bb.get_long()
        if protocol_version not in supported_protocol_versions:
            raise UnsupportedProtocolVersionError(protocol_version)
        # Each event is a 4-byte type tag followed by its payload.
        while bb.get_remaining_bytes():
            event = bb.get_int()
            if event == 0:
                # we need to read it to consume the buffer even if we don't want to store it
                get_version = GetVersionInfo(bb, protocol_version)
                if (not type_filter or "get_version" in type_filter):
                    self.get_version = get_version
            elif event == 1:
                get = GetInfo(bb)
                if (not type_filter or "get" in type_filter):
                    # because of the crappy json serialization using __dict__ we have to set the list here otherwise
                    # it doesn't print
                    if not self.gets: self.gets = []
                    self.gets.append(get)
            elif event == 2:
                get_range = GetRangeInfo(bb)
                if (not type_filter or "get_range" in type_filter):
                    if not self.get_ranges: self.get_ranges = []
                    self.get_ranges.append(get_range)
            elif event == 3:
                commit = CommitInfo(bb, full_output=full_output)
                if (not type_filter or "commit" in type_filter):
                    self.commit = commit
            elif event == 4:
                error_get = ErrorGetInfo(bb)
                if (not type_filter or "error_gets" in type_filter):
                    if not self.error_gets: self.error_gets = []
                    self.error_gets.append(error_get)
            elif event == 5:
                error_get_range = ErrorGetRangeInfo(bb)
                if (not type_filter or "error_get_range" in type_filter):
                    if not self.error_get_ranges: self.error_get_ranges = []
                    self.error_get_ranges.append(error_get_range)
            elif event == 6:
                error_commit = ErrorCommitInfo(bb, full_output=full_output)
                if (not type_filter or "error_commit" in type_filter):
                    if not self.error_commits: self.error_commits = []
                    self.error_commits.append(error_commit)
            else:
                raise Exception("Unknown event type %d" % event)

    def has_types(self):
        # True if at least one event survived decoding + filtering.
        return self.get_version or self.gets or self.get_ranges or self.commit or self.error_gets \
               or self.error_get_ranges or self.error_commits

    def to_json(self):
        # Serialized via __dict__ fallback in ObjJsonEncoder.
        return json.dumps(self, cls=ObjJsonEncoder, sort_keys=True)
class TransactionInfoLoader(object):
    """Streams client transaction info records out of the FDB system keyspace.

    Reads \\xff\\x02/fdbClientInfo/client_latency/ keys, reassembles records
    that were split across multiple chunks, and yields ClientTransactionInfo
    objects via fetch_transaction_info(). Optional min/max timestamps are
    translated to versions via the TimeKeeper system keyspace.
    """
    max_num_chunks_to_store = 1000  # Each chunk would be 100 KB in size

    def __init__(self, db, full_output=True, type_filter=None, min_timestamp=None, max_timestamp=None):
        self.db = db
        self.full_output = full_output
        self.type_filter = type_filter
        self.min_timestamp = min_timestamp
        self.max_timestamp = max_timestamp
        '''
        Keys look like this
        FF - 2 bytes \xff\x02
        SSSSSSSSSS - 10 bytes Version Stamp
        RRRRRRRRRRRRRRRR - 16 bytes Transaction id
        NNNN - 4 Bytes Chunk number
        TTTT - 4 Bytes Total number of chunks
        '''
        # sample_key mirrors the real key layout so the field offsets below
        # can be computed with index()/rindex() instead of magic numbers.
        sample_key = "FF/fdbClientInfo/client_latency/SSSSSSSSSS/RRRRRRRRRRRRRRRR/NNNNTTTT/"

        self.client_latency_start = b'\xff\x02/fdbClientInfo/client_latency/'
        self.client_latency_start_key_selector = fdb.KeySelector.first_greater_than(self.client_latency_start)
        self.client_latency_end_key_selector = fdb.KeySelector.first_greater_or_equal(strinc(self.client_latency_start))
        self.version_stamp_start_idx = sample_key.index('S')
        self.version_stamp_end_idx = sample_key.rindex('S')
        self.tr_id_start_idx = sample_key.index('R')
        self.tr_id_end_idx = sample_key.rindex('R')
        self.chunk_num_start_idx = sample_key.index('N')
        self.num_chunks_start_idx = sample_key.index('T')

        # tr_id -> list of TrInfoChunk for partially reassembled records.
        self.tr_info_map = {}
        self.num_chunks_stored = 0
        self.num_transactions_discarded = 0

    def _check_and_adjust_chunk_cache_size(self):
        # Evict the oldest partially-assembled transaction (insertion order)
        # when the chunk cache grows past max_num_chunks_to_store.
        if self.num_chunks_stored > self.max_num_chunks_to_store:
            c_list = self.tr_info_map.pop(next(iter(self.tr_info_map)))
            self.num_chunks_stored -= len(c_list)
            self.num_transactions_discarded += 1

    def parse_key(self, k):
        # Slice the fixed-offset fields out of a client_latency key.
        # Chunk counts are big-endian 4-byte ints.
        version_stamp_bytes = k[self.version_stamp_start_idx:self.version_stamp_end_idx + 1]
        tr_id = k[self.tr_id_start_idx:self.tr_id_end_idx + 1]
        num_chunks = struct.unpack(">i", k[self.num_chunks_start_idx:self.num_chunks_start_idx + 4])[0]
        chunk_num = struct.unpack(">i", k[self.chunk_num_start_idx:self.chunk_num_start_idx + 4])[0]
        return version_stamp_bytes, tr_id, num_chunks, chunk_num

    def get_key_prefix_for_version_stamp(self, version_stamp):
        # Version stamps are 10 bytes: an 8-byte version plus 2 trailing bytes.
        return self.client_latency_start + struct.pack(">Q", version_stamp) + b'\x00\x00'

    @fdb.transactional
    def find_version_for_timestamp(self, tr, timestamp, start):
        """
        Uses Timekeeper to find the closest version to a timestamp.
        If start is True, will find the greatest version at or before timestamp.
        If start is False, will find the smallest version at or after the timestamp.

        :param tr:
        :param timestamp:
        :param start:
        :return:
        """
        tr.options.set_read_system_keys()
        tr.options.set_read_lock_aware()
        timekeeper_prefix = b'\xff\x02/timeKeeper/map/'
        timestamp_packed = fdb.tuple.pack((timestamp,))
        if start:
            start_key = timekeeper_prefix
            end_key = fdb.KeySelector.first_greater_than(timekeeper_prefix + timestamp_packed)
            reverse = True
        else:
            start_key = fdb.KeySelector.first_greater_or_equal(timekeeper_prefix + timestamp_packed)
            end_key = fdb.KeySelector.first_greater_or_equal(strinc(timekeeper_prefix))
            reverse = False
        for k, v in tr.snapshot.get_range(start_key, end_key, limit=1, reverse=reverse):
            return fdb.tuple.unpack(v)[0]
        return 0 if start else 0x8000000000000000  # we didn't find any timekeeper data so find the max range

    def fetch_transaction_info(self):
        """Generator yielding ClientTransactionInfo for each complete record."""
        if self.min_timestamp:
            start_version = self.find_version_for_timestamp(self.db, self.min_timestamp, True)
            logger.debug("Using start version %s" % start_version)
            start_key = self.get_key_prefix_for_version_stamp(start_version)
        else:
            start_key = self.client_latency_start_key_selector

        if self.max_timestamp:
            end_version = self.find_version_for_timestamp(self.db, self.max_timestamp, False)
            logger.debug("Using end version %s" % end_version)
            end_key = self.get_key_prefix_for_version_stamp(end_version)
        else:
            end_key = self.client_latency_end_key_selector

        valid_transaction_infos = 0
        invalid_transaction_infos = 0

        def build_client_transaction_info(v):
            return ClientTransactionInfo(ByteBuffer(v), full_output=self.full_output, type_filter=self.type_filter)

        # Page through the keyspace, advancing start_key past each key read;
        # a pass that finds nothing terminates the loop.
        more = True
        tr = self.db.create_transaction()
        while more:
            tr.options.set_read_system_keys()
            tr.options.set_read_lock_aware()
            found = 0
            buffer = []
            try:
                logger.debug("Querying [%s:%s]" % (start_key, end_key))
                transaction_info_range = tr.snapshot.get_range(start_key, end_key,
                                                               streaming_mode=fdb.impl.StreamingMode.want_all)
                for k, v in transaction_info_range:
                    found += 1
                    #logger.debug(k)
                    start_key = fdb.KeySelector.first_greater_than(k)

                    _, tr_id, num_chunks, chunk_num = self.parse_key(k)

                    #logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num))

                    if num_chunks == 1:
                        # Single-chunk record: decode immediately.
                        assert chunk_num == 1
                        try:
                            info = build_client_transaction_info(v)
                            if info.has_types():
                                buffer.append(info)
                            valid_transaction_infos += 1
                        except UnsupportedProtocolVersionError as e:
                            invalid_transaction_infos += 1
                        except ValueError:
                            invalid_transaction_infos += 1
                    else:
                        if chunk_num == 1:
                            # first chunk
                            assert tr_id not in self.tr_info_map
                            self.tr_info_map[tr_id] = [TrInfoChunk(num_chunks, chunk_num, k, v)]
                            self.num_chunks_stored += 1
                            self._check_and_adjust_chunk_cache_size()
                        else:
                            if tr_id not in self.tr_info_map:
                                logger.error("Got a middle chunk without getting beginning part. Discarding transaction id: %s\n" % tr_id)
                                continue
                            c_list = self.tr_info_map[tr_id]
                            # Chunks must arrive contiguously and agree on the total count.
                            if c_list[-1].num_chunks != num_chunks or c_list[-1].chunk_num != chunk_num - 1:
                                self.tr_info_map.pop(tr_id)
                                self.num_chunks_stored -= len(c_list)
                                raise Exception("Chunk numbers do not match for Transaction id: %s" % tr_id)
                            c_list.append(TrInfoChunk(num_chunks, chunk_num, k, v))
                            self.num_chunks_stored += 1
                            if num_chunks == chunk_num:
                                # Last chunk: reassemble and decode the record.
                                self.tr_info_map.pop(tr_id)
                                self.num_chunks_stored -= len(c_list)
                                try:
                                    info = build_client_transaction_info(b''.join([chunk.value for chunk in c_list]))
                                    if info.has_types():
                                        buffer.append(info)
                                    valid_transaction_infos += 1
                                except UnsupportedProtocolVersionError as e:
                                    invalid_transaction_infos += 1
                                except ValueError:
                                    invalid_transaction_infos += 1
                            self._check_and_adjust_chunk_cache_size()

                    # Progress reporting every 1000 processed records.
                    if (valid_transaction_infos + invalid_transaction_infos) % 1000 == 0:
                        print("Processed valid: %d, invalid: %d" % (valid_transaction_infos, invalid_transaction_infos))
                if found == 0:
                    more = False
            except fdb.FDBError as e:
                # if too old then reset and don't wait
                if e.code == 1007:
                    tr.reset()
                else:
                    tr.on_error(e).wait()
            # Yield outside the try so retryable errors don't re-yield records.
            for item in buffer:
                yield item
def has_sortedcontainers():
    """Return True if the optional sortedcontainers package is importable.

    Logs a warning and returns False otherwise (RangeCounter is disabled).
    """
    try:
        import sortedcontainers
        return True
    except ImportError:
        # Logger.warn() is a deprecated alias; use warning().
        logger.warning("Can't find sortedcontainers so disabling RangeCounter")
        return False
def has_dateparser():
    """Return True if the optional dateparser package is importable.

    Logs a warning and returns False otherwise (human date parsing disabled).
    """
    try:
        import dateparser
        return True
    except ImportError:
        # Logger.warn() is a deprecated alias; use warning().
        logger.warning("Can't find dateparser so disabling human date parsing")
        return False
class RangeCounter(object):
    """Tracks how many observed get_range reads covered each key interval.

    Maintains a sorted map of disjoint intervals start_key -> (end_key, count);
    overlapping inserts split intervals and increment counts. Requires the
    optional sortedcontainers package. ``k`` is the number of output boundaries
    requested from get_range_boundaries().
    """

    def __init__(self, k):
        self.k = k
        from sortedcontainers import SortedDict
        self.ranges = SortedDict()

    def process(self, transaction_info):
        # Fold every range read from one transaction into the interval map.
        for get_range in transaction_info.get_ranges:
            self._insert_range(get_range.key_range.start_key, get_range.key_range.end_key)

    def _insert_range(self, start_key, end_key):
        """Add 1 to the count of every sub-interval of [start_key, end_key)."""
        keys = self.ranges.keys()
        if len(keys) == 0:
            self.ranges[start_key] = end_key, 1
            return

        start_pos = bisect_left(keys, start_key)
        end_pos = bisect_left(keys, end_key)
        #print("start_pos=%d, end_pos=%d" % (start_pos, end_pos))
        # Only intervals adjacent to [start_pos, end_pos) can overlap the insert.
        possible_intersection_keys = keys[max(0, start_pos - 1):min(len(keys), end_pos+1)]

        start_range_left = start_key

        for key in possible_intersection_keys:
            cur_end_key, cur_count = self.ranges[key]
            #logger.debug("key=%s, cur_end_key=%s, cur_count=%d, start_range_left=%s" % (key, cur_end_key, cur_count, start_range_left))
            if start_range_left < key:
                # Gap before this interval: insert the uncovered piece with count 1.
                if end_key <= key:
                    self.ranges[start_range_left] = end_key, 1
                    return
                self.ranges[start_range_left] = key, 1
                start_range_left = key

            assert start_range_left >= key
            if start_range_left >= cur_end_key:
                continue

            # [key, start_range_left) = cur_count
            # if key == start_range_left this will get overwritten below
            self.ranges[key] = start_range_left, cur_count

            if end_key <= cur_end_key:
                # [start_range_left, end_key) = cur_count+1
                # [end_key, cur_end_key) = cur_count
                self.ranges[start_range_left] = end_key, cur_count + 1
                if end_key != cur_end_key:
                    self.ranges[end_key] = cur_end_key, cur_count
                start_range_left = end_key
                break
            else:
                # [start_range_left, cur_end_key) = cur_count+1
                self.ranges[start_range_left] = cur_end_key, cur_count+1
                start_range_left = cur_end_key

        assert start_range_left <= end_key

        # there may be some range left
        if start_range_left < end_key:
            self.ranges[start_range_left] = end_key, 1

    def get_count_for_key(self, key):
        """Return the count of the interval containing ``key`` (0 if none)."""
        if key in self.ranges:
            return self.ranges[key][1]

        keys = self.ranges.keys()
        index = bisect_left(keys, key)
        if index == 0:
            return 0
        # Check whether key falls inside the closest interval starting before it.
        index_key = keys[index-1]
        if index_key <= key < self.ranges[index_key][0]:
            return self.ranges[index_key][1]
        return 0

    def get_range_boundaries(self, shard_finder=None):
        """Split the tracked keyspace into ~k buckets of roughly equal count.

        Returns (start, end, count, shard_count, addresses) tuples; the last
        two are None unless a shard_finder is supplied.
        """
        total = sum([count for _, (_, count) in self.ranges.items()])
        range_size = total // self.k
        output_range_counts = []

        def add_boundary(start, end, count):
            if shard_finder:
                shard_count = shard_finder.get_shard_count(start, end)
                if shard_count == 1:
                    addresses = shard_finder.get_addresses_for_key(start)
                else:
                    addresses = None
                output_range_counts.append((start, end, count, shard_count, addresses))
            else:
                output_range_counts.append((start, end, count, None, None))

        this_range_start_key = None
        count_this_range = 0
        for (start_key, (end_key, count)) in self.ranges.items():
            if not this_range_start_key:
                this_range_start_key = start_key
            count_this_range += count
            if count_this_range >= range_size:
                add_boundary(this_range_start_key, end_key, count_this_range)
                count_this_range = 0
                this_range_start_key = None

        # Flush any partial final bucket.
        if count_this_range > 0:
            add_boundary(this_range_start_key, end_key, count_this_range)

        return output_range_counts
class ShardFinder(object):
    """Thin wrapper over fdb.locality for shard counts and storage addresses."""

    def __init__(self, db):
        self.db = db

    @staticmethod
    @fdb.transactional
    def _get_boundary_keys(tr, begin, end):
        # Lock-aware so this works against a locked cluster.
        tr.options.set_read_lock_aware()
        return fdb.locality.get_boundary_keys(tr, begin, end)

    @staticmethod
    @fdb.transactional
    def _get_addresses_for_key(tr, key):
        tr.options.set_read_lock_aware()
        return fdb.locality.get_addresses_for_key(tr, key)

    def get_shard_count(self, start_key, end_key):
        # Number of boundaries inside the range, plus one for the first shard.
        return len(list(self._get_boundary_keys(self.db, start_key, end_key))) + 1

    def get_addresses_for_key(self, key):
        # Storage server addresses holding ``key``, decoded to str.
        return [a.decode('ascii') for a in self._get_addresses_for_key(self.db, key).wait()]
class TopKeysCounter(object):
    """Aggregates per-key read and write counts across transactions and
    reports the top-k hottest keys plus approximately-equal-count range
    boundaries for each."""

    # Only single-key mutation types count as writes to one key.
    mutation_types_to_consider = frozenset([MutationType.SET_VALUE, MutationType.ADD_VALUE])

    def __init__(self, k):
        # k: number of top entries / approximate ranges to report.
        self.k = k
        self.reads = defaultdict(lambda: 0)
        self.writes = defaultdict(lambda: 0)

    def process(self, transaction_info):
        """Tally one read per get and one write per qualifying mutation."""
        for get in transaction_info.gets:
            self.reads[get.key] += 1
        if transaction_info.commit:
            for mutation in transaction_info.commit.mutations:
                if mutation.code in self.mutation_types_to_consider:
                    # param_one is the mutated key for these mutation types.
                    self.writes[mutation.param_one] += 1

    def _get_range_boundaries(self, counts, shard_finder=None):
        """Split the sorted key space into ~k groups of roughly equal count;
        returns (start, end, count, shard_count, addresses) tuples."""
        total = sum([v for (k, v) in counts.items()])
        range_size = total // self.k
        key_counts_sorted = sorted(counts.items())
        output_range_counts = []

        def add_boundary(start, end, count):
            # Emit one output group, optionally annotated with shard info.
            if shard_finder:
                shard_count = shard_finder.get_shard_count(start, end)
                if shard_count == 1:
                    addresses = shard_finder.get_addresses_for_key(start)
                else:
                    addresses = None
                output_range_counts.append((start, end, count, shard_count, addresses))
            else:
                output_range_counts.append((start, end, count, None, None))

        start_key = None
        count_this_range = 0
        for (k, v) in key_counts_sorted:
            # BUG FIX: was `if not start_key`, which treated a falsy key
            # (e.g. the empty key b'') as "no group in progress" and silently
            # moved the group's start forward.
            if start_key is None:
                start_key = k
            count_this_range += v
            if count_this_range >= range_size:
                add_boundary(start_key, k, count_this_range)
                count_this_range = 0
                start_key = None
        # Flush any partially-filled trailing group.
        if count_this_range > 0:
            add_boundary(start_key, k, count_this_range)
        return output_range_counts

    def _get_top_k(self, counts):
        # Sort by (count, key) descending and keep the first k entries.
        count_key_pairs = sorted([(v, k) for (k, v) in counts.items()], reverse=True)
        return count_key_pairs[0:self.k]

    def get_top_k_reads(self):
        return self._get_top_k(self.reads)

    def get_top_k_writes(self):
        return self._get_top_k(self.writes)

    def get_k_read_range_boundaries(self, shard_finder=None):
        return self._get_range_boundaries(self.reads, shard_finder)

    def get_k_write_range_boundaries(self, shard_finder=None):
        return self._get_range_boundaries(self.writes, shard_finder)
def connect(cluster_file=None):
    """Open and return a Database handle for the given cluster file."""
    return fdb.open(cluster_file=cluster_file)
def main():
    """Parse command line arguments, load client transaction profiling data
    for the requested time window, and either dump each transaction as JSON
    or print top-key / range aggregations."""
    parser = argparse.ArgumentParser(description="TransactionProfilingAnalyzer")
    parser.add_argument("-C", "--cluster-file", type=str, help="Cluster file")
    parser.add_argument("--full-output", action="store_true", help="Print full output from mutations")
    parser.add_argument("--filter-get-version", action="store_true",
                        help="Include get_version type. If no filter args are given all will be returned.")
    parser.add_argument("--filter-get", action="store_true",
                        help="Include get type. If no filter args are given all will be returned.")
    parser.add_argument("--filter-get-range", action="store_true",
                        help="Include get_range type. If no filter args are given all will be returned.")
    parser.add_argument("--filter-commit", action="store_true",
                        help="Include commit type. If no filter args are given all will be returned.")
    parser.add_argument("--filter-error-get", action="store_true",
                        help="Include error_get type. If no filter args are given all will be returned.")
    parser.add_argument("--filter-error-get-range", action="store_true",
                        help="Include error_get_range type. If no filter args are given all will be returned.")
    parser.add_argument("--filter-error-commit", action="store_true",
                        help="Include error_commit type. If no filter args are given all will be returned.")
    start_time_group = parser.add_mutually_exclusive_group()
    start_time_group.add_argument("--min-timestamp", type=int, help="Don't return events older than this epoch time")
    start_time_group.add_argument("-s", "--start-time", type=str,
                                  help="Don't return events older than this parsed time")
    end_time_group = parser.add_mutually_exclusive_group()
    end_time_group.add_argument("--max-timestamp", type=int, help="Don't return events newer than this epoch time")
    end_time_group.add_argument("-e", "--end-time", type=str, help="Don't return events older than this parsed time")
    parser.add_argument("--top-keys", type=int, help="If specified will output this many top keys for reads or writes", default=0)
    args = parser.parse_args()

    # Build the transaction-type filter from the individual --filter-* flags.
    type_filter = set()
    if args.filter_get_version: type_filter.add("get_version")
    if args.filter_get: type_filter.add("get")
    if args.filter_get_range: type_filter.add("get_range")
    if args.filter_commit: type_filter.add("commit")
    if args.filter_error_get: type_filter.add("error_get")
    if args.filter_error_get_range: type_filter.add("error_get_range")
    if args.filter_error_commit: type_filter.add("error_commit")

    top_keys = args.top_keys
    key_counter = TopKeysCounter(top_keys) if top_keys else None
    # Range aggregation needs sortedcontainers; skip it silently when absent.
    range_counter = RangeCounter(top_keys) if (has_sortedcontainers() and top_keys) else None
    # BUG FIX: was `args.full_output or (top_keys is not None)`; --top-keys
    # defaults to 0, so `top_keys is not None` was always True and full output
    # was forced unconditionally. Full output is only required when top-key
    # aggregation is actually requested.
    full_output = args.full_output or top_keys > 0

    # Resolve the [min_timestamp, max_timestamp] window; human-readable times
    # require the optional dateparser package.
    if args.min_timestamp:
        min_timestamp = args.min_timestamp
    elif args.start_time:
        if not has_dateparser():
            raise Exception("Can't find dateparser needed to parse human dates")
        import dateparser
        min_timestamp = int(dateparser.parse(args.start_time).timestamp())
    else:
        raise Exception("Must specify start time")

    if args.max_timestamp:
        max_timestamp = args.max_timestamp
    elif args.end_time:
        if not has_dateparser():
            raise Exception("Can't find dateparser needed to parse human dates")
        import dateparser
        max_timestamp = int(dateparser.parse(args.end_time).timestamp())
    else:
        raise Exception("Must specify end time")

    # Sanity-check the window against the current wall clock.
    now = time.time()
    if max_timestamp > now:
        raise Exception("max_timestamp is %d seconds in the future" % (max_timestamp - now))
    if min_timestamp > now:
        raise Exception("min_timestamp is %d seconds in the future" % (min_timestamp - now))

    logger.info("Loading transactions from %d to %d" % (min_timestamp, max_timestamp))
    db = connect(cluster_file=args.cluster_file)
    loader = TransactionInfoLoader(db, full_output=full_output, type_filter=type_filter,
                                   min_timestamp=min_timestamp, max_timestamp=max_timestamp)
    for info in loader.fetch_transaction_info():
        if info.has_types():
            if not key_counter and not range_counter:
                # No aggregation requested: stream each transaction as JSON.
                print(info.to_json())
            else:
                if key_counter:
                    key_counter.process(info)
                if range_counter:
                    range_counter.process(info)

    if key_counter:
        def print_top(top):
            for (count, key) in top:
                print("%s %d" % (key, count))

        def print_range_boundaries(range_boundaries):
            for (start, end, count, shard_count, addresses) in range_boundaries:
                if not shard_count:
                    print("[%s, %s] %d" % (start, end, count))
                else:
                    addresses_string = "addresses=%s" % ','.join(addresses) if addresses else ''
                    print("[%s, %s] %d shards=%d %s" % (start, end, count, shard_count, addresses_string))

        shard_finder = ShardFinder(db)
        top_reads = key_counter.get_top_k_reads()
        if top_reads:
            print("Top %d reads:" % min(top_keys, len(top_reads)))
            print_top(top_reads)
            print("Approx equal sized gets range boundaries:")
            print_range_boundaries(key_counter.get_k_read_range_boundaries(shard_finder=shard_finder))
        top_writes = key_counter.get_top_k_writes()
        if top_writes:
            print("Top %d writes:" % min(top_keys, len(top_writes)))
            print_top(top_writes)
            print("Approx equal sized commits range boundaries:")
            print_range_boundaries(key_counter.get_k_write_range_boundaries(shard_finder=shard_finder))
        # range_counter is only ever set when top_keys > 0, i.e. when
        # key_counter is also set, so nesting here is safe.
        if range_counter:
            range_boundaries = range_counter.get_range_boundaries(shard_finder=shard_finder)
            if range_boundaries:
                print("Approx equal sized get_ranges boundaries:")
                print_range_boundaries(range_boundaries)
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,3 +1,4 @@
add_subdirectory(tutorial)
# build a virtualenv
set(sphinx_dir ${CMAKE_CURRENT_SOURCE_DIR}/sphinx)
set(venv_dir ${CMAKE_CURRENT_BINARY_DIR}/venv)

View File

@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.2.6.pkg <https://www.foundationdb.org/downloads/6.2.6/macOS/installers/FoundationDB-6.2.6.pkg>`_
* `FoundationDB-6.2.8.pkg <https://www.foundationdb.org/downloads/6.2.8/macOS/installers/FoundationDB-6.2.8.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.2.6-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.6/ubuntu/installers/foundationdb-clients_6.2.6-1_amd64.deb>`_
* `foundationdb-server-6.2.6-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.6/ubuntu/installers/foundationdb-server_6.2.6-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.2.8-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.8/ubuntu/installers/foundationdb-clients_6.2.8-1_amd64.deb>`_
* `foundationdb-server-6.2.8-1_amd64.deb <https://www.foundationdb.org/downloads/6.2.8/ubuntu/installers/foundationdb-server_6.2.8-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.2.6-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel6/installers/foundationdb-clients-6.2.6-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.6-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel6/installers/foundationdb-server-6.2.6-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.8-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.8/rhel6/installers/foundationdb-clients-6.2.8-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.2.8-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.8/rhel6/installers/foundationdb-server-6.2.8-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.2.6-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel7/installers/foundationdb-clients-6.2.6-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.6-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.6/rhel7/installers/foundationdb-server-6.2.6-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.2.8-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.8/rhel7/installers/foundationdb-clients-6.2.8-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.2.8-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.2.8/rhel7/installers/foundationdb-server-6.2.8-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.2.6-x64.msi <https://www.foundationdb.org/downloads/6.2.6/windows/installers/foundationdb-6.2.6-x64.msi>`_
* `foundationdb-6.2.8-x64.msi <https://www.foundationdb.org/downloads/6.2.8/windows/installers/foundationdb-6.2.8-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package:
* `foundationdb-6.2.6.tar.gz <https://www.foundationdb.org/downloads/6.2.6/bindings/python/foundationdb-6.2.6.tar.gz>`_
* `foundationdb-6.2.8.tar.gz <https://www.foundationdb.org/downloads/6.2.8/bindings/python/foundationdb-6.2.8.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.2.6.gem <https://www.foundationdb.org/downloads/6.2.6/bindings/ruby/fdb-6.2.6.gem>`_
* `fdb-6.2.8.gem <https://www.foundationdb.org/downloads/6.2.8/bindings/ruby/fdb-6.2.8.gem>`_
Java 8+
-------
* `fdb-java-6.2.6.jar <https://www.foundationdb.org/downloads/6.2.6/bindings/java/fdb-java-6.2.6.jar>`_
* `fdb-java-6.2.6-javadoc.jar <https://www.foundationdb.org/downloads/6.2.6/bindings/java/fdb-java-6.2.6-javadoc.jar>`_
* `fdb-java-6.2.8.jar <https://www.foundationdb.org/downloads/6.2.8/bindings/java/fdb-java-6.2.8.jar>`_
* `fdb-java-6.2.8-javadoc.jar <https://www.foundationdb.org/downloads/6.2.8/bindings/java/fdb-java-6.2.8-javadoc.jar>`_
Go 1.11+
--------

View File

@ -29,7 +29,8 @@
"resolution",
"proxy",
"master",
"test"
"test",
"storage_cache"
]
},
"degraded":true,
@ -66,6 +67,7 @@
"cluster_controller",
"data_distributor",
"ratekeeper",
"storage_cache",
"router",
"coordinator"
]

View File

@ -2,7 +2,16 @@
Release Notes
#############
6.2.6
6.2.8
=====
Fixes
-----
* Significantly improved the rate at which the transaction logs in a remote region can pull data from the primary region. `(PR #2307) <https://github.com/apple/foundationdb/pull/2307>`_ `(PR #2323) <https://github.com/apple/foundationdb/pull/2323>`_.
* The ``system_kv_size_bytes`` status field could report a size much larger than the actual size of the system keyspace. `(PR #2305) <https://github.com/apple/foundationdb/pull/2305>`_.
6.2.7
=====
Performance
@ -39,7 +48,6 @@ Fixes
* File descriptors opened by clients and servers set close-on-exec, if available on the platform. `(PR #1581) <https://github.com/apple/foundationdb/pull/1581>`_.
* ``fdbrestore`` commands other than ``start`` required a default cluster file to be found but did not actually use it. `(PR #1912) <https://github.com/apple/foundationdb/pull/1912>`_.
* Unneeded network connections were not being closed because peer reference counts were handled improperly. `(PR #1768) <https://github.com/apple/foundationdb/pull/1768>`_.
* Under certain conditions, cross region replication could stall for 10 minute periods. `(PR #1818) <https://github.com/apple/foundationdb/pull/1818>`_.
* In very rare scenarios, master recovery would restart because system metadata was loaded incorrectly. `(PR #1919) <https://github.com/apple/foundationdb/pull/1919>`_.
* Ratekeeper will aggressively throttle when unable to fetch the list of storage servers for a considerable period of time. `(PR #1858) <https://github.com/apple/foundationdb/pull/1858>`_.
* Proxies could become overloaded when all storage servers on a team fail. [6.2.1] `(PR #1976) <https://github.com/apple/foundationdb/pull/1976>`_.
@ -58,6 +66,10 @@ Fixes
* Committing transactions larger than 1 MB could cause the proxy to stall for up to a second. [6.2.6] `(PR #2250) <https://github.com/apple/foundationdb/pull/2250>`_.
* The cluster controller could become saturated in clusters with large numbers of connected clients using TLS. [6.2.6] `(PR #2252) <https://github.com/apple/foundationdb/pull/2252>`_.
* Backup and DR would not share a mutation stream if they were started on different versions of FoundationDB. Either backup or DR must be restarted to resolve this issue. [6.2.6] `(PR #2202) <https://github.com/apple/foundationdb/pull/2202>`_.
* Don't track batch priority GRV requests in latency bands. [6.2.7] `(PR #2279) <https://github.com/apple/foundationdb/pull/2279>`_.
* Transaction log processes used twice their normal memory when switching spill types. [6.2.7] `(PR #2256) <https://github.com/apple/foundationdb/pull/2256>`_.
* Under certain conditions, cross region replication could stall for 10 minute periods. [6.2.7] `(PR #1818) <https://github.com/apple/foundationdb/pull/1818>`_ `(PR #2276) <https://github.com/apple/foundationdb/pull/2276>`_.
* When dropping a remote region from the configuration after processes in the region have failed, data distribution would create teams from the dead servers for one minute. [6.2.7] `(PR #2286) <https://github.com/apple/foundationdb/pull/2286>`_.
Status
------
@ -130,10 +142,10 @@ Fixes only impacting 6.2.0+
* The cluster controller would saturate its CPU for a few seconds when sending configuration information to all of the worker processes. [6.2.4] `(PR #2086) <https://github.com/apple/foundationdb/pull/2086>`_.
* The data distributor would build all possible team combinations if it was tracking an unhealthy server with less than 10 teams. [6.2.4] `(PR #2099) <https://github.com/apple/foundationdb/pull/2099>`_.
* The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) <https://github.com/apple/foundationdb/pull/2065>`_.
* The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) <https://github.com/apple/foundationdb/pull/2065>`_.
* A storage server could crash if it took longer than 10 minutes to fetch a key range from another server. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
* Excluding or including servers would restart the data distributor. [6.2.5] `(PR #2170) <https://github.com/apple/foundationdb/pull/2170>`_.
* The data distributor could read invalid memory when estimating database size. [6.2.6] `(PR #2225) <https://github.com/apple/foundationdb/pull/2225>`_.
* Status could incorrectly report that backup and DR were not sharing a mutation stream. [6.2.7] `(PR #2274) <https://github.com/apple/foundationdb/pull/2274>`_.
Earlier release notes
---------------------

View File

@ -13,13 +13,14 @@ Fixes
Status
------
* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) <https://github.com/apple/foundationdb/pull/2058>`_.
* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) <https://github.com/apple/foundationdb/pull/2058>`_
Bindings
--------
Other Changes
-------------
* Double the number of shard locations that the client will cache locally. `(PR #2198) <https://github.com/apple/foundationdb/pull/2198>`_
Earlier release notes
---------------------

View File

@ -0,0 +1,4 @@
# Sources for the flow tutorial executable (.actor.cpp files are run through
# the actor compiler before compilation).
set(TUTORIAL_SRCS tutorial.actor.cpp)
# add_flow_target is the project-provided helper that wires in the actor
# compiler; quoting keeps the source list as a single list argument.
add_flow_target(EXECUTABLE NAME tutorial SRCS "${TUTORIAL_SRCS}")
target_link_libraries(tutorial PUBLIC fdbclient)

View File

@ -0,0 +1,467 @@
/*
* tutorial.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/flow.h"
#include "flow/Platform.h"
#include "flow/DeterministicRandom.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include <functional>
#include <unordered_map>
#include <memory>
#include <iostream>
#include "flow/actorcompiler.h"
// Address of the tutorial server; populated from the -s command line flag.
NetworkAddress serverAddress;

// this is a simple actor that will report how long
// it is already running once a second.
ACTOR Future<Void> simpleTimer() {
	// we need to remember the time when we first
	// started.
	// This needs to be a state-variable because
	// we will use it in different parts of the
	// actor. If you don't understand how state
	// variables work, it is a good idea to remove
	// the state keyword here and look at the
	// generated C++ code from the actor compiler.
	state double start_time = g_network->now();
	loop {
		wait(delay(1.0));
		// Print elapsed seconds since the actor started.
		std::cout << format("Time: %.2f\n", g_network->now() - start_time);
	}
}
// An actor that demonstrates how choose-when
// blocks work: prints a heartbeat every half second until `ready` fires,
// then sleeps for the delivered number of seconds before returning.
ACTOR Future<Void> someFuture(Future<int> ready) {
	// loop choose {} works as well here - the braces are optional
	loop choose {
		when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; }
		when(int r = wait(ready)) {
			std::cout << format("Ready %d\n", r);
			wait(delay(double(r)));
			std::cout << "Done\n";
			return Void();
		}
	}
}
// Demonstrates Promise/Future interplay: someFuture() waits on the promise's
// future, which this actor fulfills with 2 after three seconds.
ACTOR Future<Void> promiseDemo() {
	state Promise<int> promise;
	state Future<Void> f = someFuture(promise.getFuture());
	wait(delay(3.0));
	promise.send(2);
	wait(f);
	return Void();
}
// Waits on an AsyncTrigger: prints a heartbeat every half second and a
// message each time the trigger fires. Runs until cancelled.
ACTOR Future<Void> eventLoop(AsyncTrigger* trigger) {
	loop choose {
		when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; }
		when(wait(trigger->onTrigger())) { std::cout << "Triggered!\n"; }
	}
}
// Fires an AsyncTrigger once a second while eventLoop() listens on it.
ACTOR Future<Void> triggerDemo() {
	state int runs = 1;
	state AsyncTrigger trigger;
	state Future<Void> triggerLoop = eventLoop(&trigger);
	// Pre-increment with runs starting at 1 gives 8 iterations (runs 2..9).
	while (++runs < 10) {
		wait(delay(1.0));
		std::cout << "trigger..";
		trigger.trigger();
	}
	std::cout << "Done.";
	// triggerLoop is never waited on; it is cancelled when the state variable
	// goes out of scope on return.
	return Void();
}
// RPC interface of the echo server. getInterface is served on a well-known
// endpoint (see echoServer()), so only echo and reverse are serialized.
struct EchoServerInterface {
	constexpr static FileIdentifier file_identifier = 3152015;
	RequestStream<struct GetInterfaceRequest> getInterface;
	RequestStream<struct EchoRequest> echo;
	RequestStream<struct ReverseRequest> reverse;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, echo, reverse);
	}
};

// Asks the server to send back its EchoServerInterface.
struct GetInterfaceRequest {
	constexpr static FileIdentifier file_identifier = 12004156;
	ReplyPromise<EchoServerInterface> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, reply);
	}
};

// Asks the server to echo `message` back unchanged.
struct EchoRequest {
	constexpr static FileIdentifier file_identifier = 10624019;
	std::string message;
	// this variable has to be called reply!
	ReplyPromise<std::string> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, message, reply);
	}
};

// Asks the server to send `message` back reversed.
struct ReverseRequest {
	constexpr static FileIdentifier file_identifier = 10765955;
	std::string message;
	// this variable has to be called reply!
	ReplyPromise<std::string> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, message, reply);
	}
};
// Monotonic counter used to mint unique endpoint UIDs within this process.
uint64_t tokenCounter = 1;

// Serves the echo interface on a well-known endpoint: answers interface
// discovery, echoes messages back, and reverses messages. Runs forever.
ACTOR Future<Void> echoServer() {
	state EchoServerInterface echoServer;
	echoServer.getInterface.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint);
	loop {
		choose {
			when(GetInterfaceRequest req = waitNext(echoServer.getInterface.getFuture())) {
				// Hand the full interface (echo + reverse streams) to the client.
				req.reply.send(echoServer);
			}
			when(EchoRequest req = waitNext(echoServer.echo.getFuture())) { req.reply.send(req.message); }
			when(ReverseRequest req = waitNext(echoServer.reverse.getFuture())) {
				req.reply.send(std::string(req.message.rbegin(), req.message.rend()));
			}
		}
	}
}
// Connects to the echo server via its well-known endpoint, then performs one
// echo round trip and one reverse round trip, printing both results.
ACTOR Future<Void> echoClient() {
	state EchoServerInterface server;
	server.getInterface = RequestStream<GetInterfaceRequest>(Endpoint({ serverAddress }, UID(-1, ++tokenCounter)));
	EchoServerInterface s = wait(server.getInterface.getReply(GetInterfaceRequest()));
	server = s;

	EchoRequest echoRequest;
	echoRequest.message = "Hello World";
	std::string echoMessage = wait(server.echo.getReply(echoRequest));
	// BUG FIX: format() is printf-style; the original "{}" placeholders were
	// printed literally and the reply string was never formatted.
	std::cout << format("Sent %s to echo, received %s\n", "Hello World", echoMessage.c_str());

	ReverseRequest reverseRequest;
	reverseRequest.message = "Hello World";
	std::string reverseString = wait(server.reverse.getReply(reverseRequest));
	// BUG FIX: same as above -- "{}" is not a format() specifier.
	std::cout << format("Sent %s to reverse, received %s\n", "Hello World", reverseString.c_str());
	return Void();
}
// RPC interface of the in-memory key/value store server. ("Inteface" is a
// pre-existing typo in the public name; kept as-is for compatibility.)
struct SimpleKeyValueStoreInteface {
	constexpr static FileIdentifier file_identifier = 8226647;
	RequestStream<struct GetKVInterface> connect;
	RequestStream<struct GetRequest> get;
	RequestStream<struct SetRequest> set;
	RequestStream<struct ClearRequest> clear;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, connect, get, set, clear);
	}
};

// Interface discovery request for the key/value store server.
struct GetKVInterface {
	constexpr static FileIdentifier file_identifier = 8062308;
	ReplyPromise<SimpleKeyValueStoreInteface> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, reply);
	}
};

// Reads the value stored under `key`; the server replies with io_error when
// the key is absent (see kvStoreServer()).
struct GetRequest {
	constexpr static FileIdentifier file_identifier = 6983506;
	std::string key;
	ReplyPromise<std::string> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, key, reply);
	}
};

// Stores `value` under `key`.
struct SetRequest {
	constexpr static FileIdentifier file_identifier = 7554186;
	std::string key;
	std::string value;
	ReplyPromise<Void> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, key, value, reply);
	}
};

// Erases all keys in the half-open range [from, to).
struct ClearRequest {
	constexpr static FileIdentifier file_identifier = 8500026;
	std::string from;
	std::string to;
	ReplyPromise<Void> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, from, to, reply);
	}
};
// In-memory key/value store served over RPC on a well-known endpoint.
// Supports connect (interface discovery), get, set, and range clear.
ACTOR Future<Void> kvStoreServer() {
	state SimpleKeyValueStoreInteface inf;
	state std::map<std::string, std::string> store;
	inf.connect.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint);
	loop {
		choose {
			when(GetKVInterface req = waitNext(inf.connect.getFuture())) {
				std::cout << "Received connection attempt\n";
				req.reply.send(inf);
			}
			when(GetRequest req = waitNext(inf.get.getFuture())) {
				auto iter = store.find(req.key);
				if (iter == store.end()) {
					// A missing key is reported to the caller as io_error.
					req.reply.sendError(io_error());
				} else {
					req.reply.send(iter->second);
				}
			}
			when(SetRequest req = waitNext(inf.set.getFuture())) {
				store[req.key] = req.value;
				req.reply.send(Void());
			}
			when(ClearRequest req = waitNext(inf.clear.getFuture())) {
				// Erase every key in [req.from, req.to).
				auto from = store.lower_bound(req.from);
				auto to = store.lower_bound(req.to);
				while (from != store.end() && from != to) {
					auto next = from;
					++next;
					store.erase(from);
					from = next;
				}
				req.reply.send(Void());
			}
		}
	}
}
// Resolves the key/value store interface from the server's well-known
// endpoint, logging before and after so connection latency is visible.
ACTOR Future<SimpleKeyValueStoreInteface> connect() {
	// BUG FIX: "%ull" parses as %u followed by the literal text "ll", which
	// mismatches the 64-bit argument; use %llu with a matching cast.
	std::cout << format("%llu: Connect...\n", (unsigned long long)(g_network->now()));
	SimpleKeyValueStoreInteface c;
	c.connect = RequestStream<GetKVInterface>(Endpoint({ serverAddress }, UID(-1, ++tokenCounter)));
	SimpleKeyValueStoreInteface result = wait(c.connect.getReply(GetKVInterface()));
	std::cout << format("%llu: done..\n", (unsigned long long)(g_network->now()));
	return result;
}
// Minimal client: sets foo -> bar, reads it back, and prints the result.
ACTOR Future<Void> kvSimpleClient() {
	state SimpleKeyValueStoreInteface server = wait(connect());
	std::cout << format("Set %s -> %s\n", "foo", "bar");
	SetRequest setRequest;
	setRequest.key = "foo";
	setRequest.value = "bar";
	wait(server.set.getReply(setRequest));
	GetRequest getRequest;
	getRequest.key = "foo";
	std::string value = wait(server.get.getReply(getRequest));
	std::cout << format("get(%s) -> %s\n", "foo", value.c_str());
	return Void();
}
// Load generator: for roughly 20 seconds issues random set/get/clear
// operations against the server, counting completed operations in *ops.
ACTOR Future<Void> kvClient(SimpleKeyValueStoreInteface server, std::shared_ptr<uint64_t> ops) {
	state Future<Void> timeout = delay(20);
	state int rangeSize = 2 << 12; // keys are drawn from [0, 8192)
	loop {
		SetRequest setRequest;
		setRequest.key = std::to_string(deterministicRandom()->randomInt(0, rangeSize));
		setRequest.value = "foo";
		wait(server.set.getReply(setRequest));
		++(*ops);
		try {
			GetRequest getRequest;
			getRequest.key = std::to_string(deterministicRandom()->randomInt(0, rangeSize));
			std::string _ = wait(server.get.getReply(getRequest));
			++(*ops);
		} catch (Error& e) {
			// io_error just means the random key wasn't present (see
			// kvStoreServer); anything else is unexpected and is rethrown.
			if (e.code() != error_code_io_error) {
				throw e;
			}
		}
		int from = deterministicRandom()->randomInt(0, rangeSize);
		ClearRequest clearRequest;
		clearRequest.from = std::to_string(from);
		clearRequest.to = std::to_string(from + 100);
		wait(server.clear.getReply(clearRequest));
		++(*ops);
		// Checked only after a full iteration, so the last batch completes.
		if (timeout.isReady()) {
			// we are done
			return Void();
		}
	}
}
// Once per second prints the number of operations completed in the last
// second and resets the shared counter. Runs until cancelled.
ACTOR Future<Void> throughputMeasurement(std::shared_ptr<uint64_t> operations) {
	loop {
		wait(delay(1.0));
		// BUG FIX: "%ull" is %u plus literal "ll" and mismatches the 64-bit
		// counter; use %llu with a matching cast.
		std::cout << format("%llu op/s\n", (unsigned long long)*operations);
		*operations = 0;
	}
}
// Runs 100 concurrent kvClient actors sharing one operation counter and
// reports throughput until every client finishes.
ACTOR Future<Void> multipleClients() {
	SimpleKeyValueStoreInteface server = wait(connect());
	auto ops = std::make_shared<uint64_t>(0);
	std::vector<Future<Void>> clients(100);
	for (auto& f : clients) {
		f = kvClient(server, ops);
	}
	auto done = waitForAll(clients);
	// The || cancels the throughput reporter once all clients are done.
	wait(done || throughputMeasurement(ops));
	return Void();
}
// Cluster file used by the fdb* demo actors; overridden with the -C flag.
std::string clusterFile = "fdb.cluster";

// Simple native-API workload: repeatedly reads a 100-key window under /tut/
// starting at a random index and rewrites 10 random keys inside it.
ACTOR Future<Void> fdbClient() {
	wait(delay(30));
	state Database db = Database::createDatabase(clusterFile, 300);
	state Transaction tx(db);
	state std::string keyPrefix = "/tut/";
	state Key startKey;
	// "/tut0" sorts just after every "/tut/..." key, capping the range read.
	state KeyRef endKey = LiteralStringRef("/tut0");
	state int beginIdx = 0;
	loop {
		try {
			tx.reset();
			// this workload is stupidly simple:
			// 1. select a random key between 1
			// and 1e8
			// 2. select this key plus the 100
			// next ones
			// 3. write 10 values in [k, k+100]
			beginIdx = deterministicRandom()->randomInt(0, 1e8 - 100);
			startKey = keyPrefix + std::to_string(beginIdx);
			// NOTE(review): the read result is unused; presumably the read
			// exists only to create a read conflict range -- TODO confirm.
			Standalone<RangeResultRef> range = wait(tx.getRange(KeyRangeRef(startKey, endKey), 100));
			for (int i = 0; i < 10; ++i) {
				Key k = Key(keyPrefix + std::to_string(beginIdx + deterministicRandom()->randomInt(0, 100)));
				tx.set(k, LiteralStringRef("foo"));
			}
			wait(tx.commit());
			std::cout << "Committed\n";
			wait(delay(2.0));
		} catch (Error& e) {
			// Standard FDB retry loop: onError handles retryable errors.
			wait(tx.onError(e));
		}
	}
}
// Reads the \xff\xff/status/json special key in a tight loop to stress
// cluster status generation. Runs until cancelled.
ACTOR Future<Void> fdbStatusStresser() {
	state Database db = Database::createDatabase(clusterFile, 300);
	state ReadYourWritesTransaction tx(db);
	state Key statusJson(std::string("\xff\xff/status/json"));
	loop {
		try {
			tx.reset();
			Optional<Value> _ = wait(tx.get(statusJson));
		} catch (Error& e) {
			// Standard FDB retry loop: onError handles retryable errors.
			wait(tx.onError(e));
		}
	}
}
// Maps command line names to actor factories; main() looks up each
// non-flag argument here to decide which demos to run.
std::unordered_map<std::string, std::function<Future<Void>()>> actors = { { "timer", &simpleTimer },
	                                                                      { "promiseDemo", &promiseDemo },
	                                                                      { "triggerDemo", &triggerDemo },
	                                                                      { "echoServer", &echoServer },
	                                                                      { "echoClient", &echoClient },
	                                                                      { "kvStoreServer", &kvStoreServer },
	                                                                      { "kvSimpleClient", &kvSimpleClient },
	                                                                      { "multipleClients", &multipleClients },
	                                                                      { "fdbClient", &fdbClient },
	                                                                      { "fdbStatusStresser", &fdbStatusStresser } };
// Entry point: parses flags (-p <port> enables server mode, -s <addr> sets
// the server address, -C <file> sets the cluster file) followed by actor
// names, then starts the flow network and runs the selected actors.
int main(int argc, char* argv[]) {
	bool isServer = false;
	std::string port;
	std::vector<std::function<Future<Void>()>> toRun;
	// parse arguments
	for (int i = 1; i < argc; ++i) {
		std::string arg(argv[i]);
		if (arg == "-p") {
			isServer = true;
			if (i + 1 >= argc) {
				std::cout << "Expecting an argument after -p\n";
				return 1;
			}
			port = std::string(argv[++i]);
			continue;
		} else if (arg == "-s") {
			if (i + 1 >= argc) {
				std::cout << "Expecting an argument after -s\n";
				return 1;
			}
			serverAddress = NetworkAddress::parse(argv[++i]);
			continue;
		} else if (arg == "-C") {
			// BUG FIX: -C previously read argv[++i] without verifying that an
			// argument follows, reading past the end of argv when -C was the
			// last argument. Mirror the checks done for -p and -s.
			if (i + 1 >= argc) {
				std::cout << "Expecting an argument after -C\n";
				return 1;
			}
			clusterFile = argv[++i];
			std::cout << "Using cluster file " << clusterFile << std::endl;
			continue;
		}
		// Any non-flag argument must name an actor in the dispatch table.
		auto actor = actors.find(arg);
		if (actor == actors.end()) {
			std::cout << format("Error: actor %s does not exist\n", arg.c_str());
			return 1;
		}
		toRun.push_back(actor->second);
	}
	platformInit();
	g_network = newNet2(false, true);
	NetworkAddress publicAddress = NetworkAddress::parse("0.0.0.0:0");
	if (isServer) {
		publicAddress = NetworkAddress::parse("0.0.0.0:" + port);
	}
	// openTraceFile(publicAddress, TRACE_DEFAULT_ROLL_SIZE,
	// TRACE_DEFAULT_MAX_LOGS_SIZE);
	try {
		if (isServer) {
			auto listenError = FlowTransport::transport().bind(publicAddress, publicAddress);
			if (listenError.isError()) {
				// get() rethrows the underlying bind error.
				listenError.get();
			}
		}
	} catch (Error& e) {
		std::cout << format("Error while binding to address (%d): %s\n", e.code(), e.what());
	}
	// now we start the actors
	std::vector<Future<Void>> all;
	for (auto& f : toRun) {
		all.emplace_back(f());
	}
	// stopAfter shuts the network down once every actor has finished.
	auto f = stopAfter(waitForAll(all));
	g_network->run();
	return 0;
}

View File

@ -826,7 +826,7 @@ const KeyRef exeFastRestoreAgent = LiteralStringRef("fastrestore_agent"); // mus
const KeyRef exeDatabaseAgent = LiteralStringRef("dr_agent");
const KeyRef exeDatabaseBackup = LiteralStringRef("fdbdr");
extern const char* getHGVersion();
extern const char* getSourceVersion();
#ifdef _WIN32
void parentWatcher(void *parentHandle) {
@ -842,7 +842,7 @@ void parentWatcher(void *parentHandle) {
static void printVersion() {
printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n");
printf("source version %s\n", getHGVersion());
printf("source version %s\n", getSourceVersion());
printf("protocol %llx\n", (long long) currentProtocolVersion.version());
}
@ -913,7 +913,7 @@ void printBackupContainerInfo() {
static void printBackupUsage(bool devhelp) {
printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n");
printf("Usage: %s (start | status | abort | wait | discontinue | pause | resume | expire | delete | describe | list) [OPTIONS]\n\n", exeBackup.toString().c_str());
printf("Usage: %s (start | status | abort | wait | discontinue | pause | resume | expire | delete | describe | list | cleanup) [OPTIONS]\n\n", exeBackup.toString().c_str());
printf(" -C CONNFILE The path of a file containing the connection string for the\n"
" FoundationDB cluster. The default is first the value of the\n"
" FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n"
@ -964,6 +964,11 @@ static void printBackupUsage(bool devhelp) {
printf(" --trace_format FORMAT\n"
" Select the format of the trace files. xml (the default) and json are supported.\n"
" Has no effect unless --log is specified.\n");
printf(" --max_cleanup_seconds SECONDS\n"
" Specifies the amount of time a backup or DR needs to be stale before cleanup will\n"
" remove mutations for it. By default this is set to one hour.\n");
printf(" --delete_data\n"
" This flag will cause cleanup to remove mutations for the most stale backup or DR.\n");
#ifndef TLS_DISABLED
printf(TLS_HELP);
#endif
@ -3454,7 +3459,7 @@ int main(int argc, char* argv[]) {
TraceEvent("ProgramStart")
.setMaxEventLength(12000)
.detail("SourceVersion", getHGVersion())
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION )
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL))
@ -3948,7 +3953,7 @@ ACTOR static Future<Version> _fastRestore(Database cx, Key tagName, Key url, boo
ACTOR Future<Version> fastRestore(Database cx, Standalone<StringRef> tagName, Standalone<StringRef> url,
bool waitForComplete, long targetVersion, bool verbose, Standalone<KeyRangeRef> range,
Standalone<StringRef> addPrefix, Standalone<StringRef> removePrefix) {
Version targetVersion =
Version result =
wait(_fastRestore(cx, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix));
return targetVersion;
}
return result;
}

View File

@ -54,7 +54,7 @@
#include "flow/actorcompiler.h" // This must be the last #include.
extern const char* getHGVersion();
extern const char* getSourceVersion();
std::vector<std::string> validOptions;
@ -563,7 +563,7 @@ void initHelp() {
void printVersion() {
printf("FoundationDB CLI " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n");
printf("source version %s\n", getHGVersion());
printf("source version %s\n", getSourceVersion());
printf("protocol %" PRIx64 "\n", currentProtocolVersion.version());
}
@ -2632,7 +2632,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
if (opt.trace) {
TraceEvent("CLIProgramStart")
.setMaxEventLength(12000)
.detail("SourceVersion", getHGVersion())
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL))
@ -3511,7 +3511,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
printf("Data distribution is turned off.\n");
} else if (tokencmp(tokens[1], "disable")) {
if (tokencmp(tokens[2], "ssfailure")) {
bool _ = wait(makeInterruptable(setHealthyZone(db, ignoreSSFailuresZoneString, 0)));
wait(success(makeInterruptable(setHealthyZone(db, ignoreSSFailuresZoneString, 0))));
printf("Data distribution is disabled for storage server failures.\n");
} else if (tokencmp(tokens[2], "rebalance")) {
wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, true)));
@ -3523,7 +3523,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
}
} else if (tokencmp(tokens[1], "enable")) {
if (tokencmp(tokens[2], "ssfailure")) {
bool _ = wait(makeInterruptable(clearHealthyZone(db, false, true)));
wait(success(makeInterruptable(clearHealthyZone(db, false, true))));
printf("Data distribution is enabled for storage server failures.\n");
} else if (tokencmp(tokens[2], "rebalance")) {
wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, false)));

View File

@ -862,29 +862,33 @@ ACTOR Future<Void> cleanupLogMutations(Database cx, Value destUidValue, bool del
wait(success(foundDRKey) && success(foundBackupKey));
if(foundDRKey.get().present() && foundBackupKey.get().present()) {
printf("WARNING: Found a tag which looks like both a backup and a DR. This tag was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
printf("WARNING: Found a tag that looks like both a backup and a DR. This tag is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(foundDRKey.get().present() && !foundBackupKey.get().present()) {
printf("Found a DR which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
printf("Found a DR that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(!foundDRKey.get().present() && foundBackupKey.get().present()) {
printf("Found a Backup which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
printf("Found a Backup that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else {
printf("WARNING: Found a unknown tag which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
printf("WARNING: Found an unknown tag that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
}
loggedLogUids.insert(currLogUid);
}
}
if( readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && deleteData && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get()) ) {
removingLogUid = minVersionLogUid;
wait(eraseLogData(tr, minVersionLogUid, destUidValue));
wait(tr->commit());
printf("\nSuccessfully removed the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) {
printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n");
} else if( deleteData ) {
printf("\nWARNING: Did not delete data because the tag was not at least %.4f hours behind. Change `--min_cleanup_seconds' to adjust this threshold.\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0);
if(deleteData) {
if(readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get())) {
removingLogUid = minVersionLogUid;
wait(eraseLogData(tr, minVersionLogUid, destUidValue));
wait(tr->commit());
printf("\nSuccessfully removed the tag that was %.4f hours behind.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) {
printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n\n");
} else {
printf("\nWARNING: Did not delete data because the tag is not at least %.4f hours behind. Change `--min_cleanup_seconds' to adjust this threshold.\n\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0);
}
} else if(readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND) {
printf("\nPassing `--delete_data' would delete the tag that is %.4f hours behind.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
} else {
printf("\nPassing `--delete_data' would delete the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
printf("\nPassing `--delete_data' would not delete the tag that is %.4f hours behind. Change `--min_cleanup_seconds' to adjust the cleanup threshold.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND));
}
return Void();

View File

@ -173,7 +173,7 @@ struct RestorableFileSet {
Version targetVersion;
std::vector<LogFile> logs;
std::vector<RangeFile> ranges;
KeyspaceSnapshotFile snapshot;
KeyspaceSnapshotFile snapshot; // Info. for debug purposes
};
/* IBackupContainer is an interface to a set of backup data, which contains

View File

@ -48,6 +48,7 @@ set(FDBCLIENT_SRCS
Notified.h
ReadYourWrites.actor.cpp
ReadYourWrites.h
RestoreWorkerInterface.actor.h
RunTransaction.actor.h
RYWIterator.cpp
RYWIterator.h

View File

@ -93,7 +93,7 @@ struct struct_like_traits<Tag> : std::true_type {
}
template <int i, class Type, class Context>
static const void assign(Member& m, const Type& t, Context&) {
static void assign(Member& m, const Type& t, Context&) {
if constexpr (i == 0) {
m.id = t;
} else {
@ -105,15 +105,16 @@ struct struct_like_traits<Tag> : std::true_type {
static const Tag invalidTag {tagLocalitySpecial, 0};
static const Tag txsTag {tagLocalitySpecial, 1};
static const Tag cacheTag {tagLocalitySpecial, 2};
enum { txsTagOld = -1, invalidTagOld = -100 };
struct TagsAndMessage {
StringRef message;
std::vector<Tag> tags;
VectorRef<Tag> tags;
TagsAndMessage() {}
TagsAndMessage(StringRef message, const std::vector<Tag>& tags) : message(message), tags(tags) {}
TagsAndMessage(StringRef message, VectorRef<Tag> tags) : message(message), tags(tags) {}
// Loads tags and message from a serialized buffer. "rd" is checkpointed at
// its beginning position to allow the caller to rewind if needed.
@ -123,15 +124,11 @@ struct TagsAndMessage {
int32_t messageLength;
uint16_t tagCount;
uint32_t sub;
tags.clear();
rd->checkpoint();
*rd >> messageLength >> sub >> tagCount;
if (messageVersionSub) *messageVersionSub = sub;
tags.resize(tagCount);
for (int i = 0; i < tagCount; i++) {
*rd >> tags[i];
}
tags = VectorRef<Tag>((Tag*)rd->readBytes(tagCount*sizeof(Tag)), tagCount);
const int32_t rawLength = messageLength + sizeof(messageLength);
rd->rewind();
rd->checkpoint();
@ -553,6 +550,10 @@ inline KeySelectorRef operator + (const KeySelectorRef& s, int off) {
inline KeySelectorRef operator - (const KeySelectorRef& s, int off) {
return KeySelectorRef(s.getKey(), s.orEqual, s.offset-off);
}
inline bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) {
	// Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef.
	KeyRef k = sel.getKey();
	if (k < range.begin) {
		return false;
	}
	// A backward selector may begin resolving from the range's end key itself;
	// a forward selector must start strictly before it.
	return sel.isBackward() ? k <= range.end : k < range.end;
}
template <class Val>
struct KeyRangeWith : KeyRange {

View File

@ -572,8 +572,8 @@ namespace fileBackup {
// Functions for consuming big endian (network byte order) integers.
// Consumes a big endian number, swaps it to little endian, and returns it.
const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());}
const uint32_t consumeNetworkUInt32() { return bigEndian32( consume<uint32_t>());}
int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());}
uint32_t consumeNetworkUInt32() { return bigEndian32( consume<uint32_t>());}
bool eof() { return rptr == end; }

View File

@ -69,7 +69,7 @@ ClientKnobs::ClientKnobs(bool randomize) {
init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1;
init( BROADCAST_BATCH_SIZE, 20 ); if( randomize && BUGGIFY ) BROADCAST_BATCH_SIZE = 1;
init( LOCATION_CACHE_EVICTION_SIZE, 300000 );
init( LOCATION_CACHE_EVICTION_SIZE, 600000 );
init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3;
init( GET_RANGE_SHARD_LIMIT, 2 );

View File

@ -60,7 +60,7 @@
#endif
#include "flow/actorcompiler.h" // This must be the last #include.
extern const char* getHGVersion();
extern const char* getSourceVersion();
using std::max;
using std::min;
@ -791,7 +791,7 @@ Database Database::createDatabase( Reference<ClusterConnectionFile> connFile, in
openTraceFile(NetworkAddress(publicIP, ::getpid()), networkOptions.traceRollSize, networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace", networkOptions.traceLogGroup);
TraceEvent("ClientStart")
.detail("SourceVersion", getHGVersion())
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detail("ClusterFile", connFile->getFilename().c_str())

View File

@ -25,103 +25,70 @@
#include "fdbclient/FDBTypes.h"
#include "flow/TDMetric.actor.h"
struct NotifiedVersion {
NotifiedVersion( StringRef& name, StringRef const &id, Version version = 0 ) : val(name, id, version) { val = version; }
NotifiedVersion( Version version = 0 ) : val(StringRef(), StringRef(), version) {}
template <class T>
struct IsMetricHandle : std::false_type {};
template <class T>
struct IsMetricHandle<MetricHandle<T>> : std::true_type {};
void initMetric(const StringRef& name, const StringRef &id) {
Version version = val;
val.init(name, id);
val = version;
}
template <class T, class ValueType = T>
struct Notified {
explicit Notified(ValueType v = 0) { val = v; }
Future<Void> whenAtLeast( Version limit ) {
if (val >= limit)
return Void();
[[nodiscard]] Future<Void> whenAtLeast(const ValueType& limit) {
if (val >= limit) return Void();
Promise<Void> p;
waiting.push( std::make_pair(limit,p) );
waiting.push(std::make_pair(limit, p));
return p.getFuture();
}
Version get() const { return val; }
[[nodiscard]] ValueType get() const { return val; }
void set( Version v ) {
ASSERT( v >= val );
void initMetric(const StringRef& name, const StringRef& id) {
if constexpr (IsMetricHandle<T>::value) {
ValueType v = val;
val.init(name, id);
val = v;
} else {
TraceEvent(SevError, "InvalidNotifiedOperation")
.detail("Reason", "Notified<T> where T is not a metric: Can't use initMetric");
}
}
void set(const ValueType& v) {
ASSERT(v >= val);
if (v != val) {
val = v;
std::vector<Promise<Void>> toSend;
while ( waiting.size() && v >= waiting.top().first ) {
while (waiting.size() && v >= waiting.top().first) {
Promise<Void> p = std::move(waiting.top().second);
waiting.pop();
toSend.push_back(p);
}
for(auto& p : toSend) {
for (auto& p : toSend) {
p.send(Void());
}
}
}
void operator=( Version v ) {
set( v );
void operator=(const ValueType& v) { set(v); }
Notified(Notified&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(std::move(r.val)) {}
void operator=(Notified&& r) BOOST_NOEXCEPT {
waiting = std::move(r.waiting);
val = std::move(r.val);
}
NotifiedVersion(NotifiedVersion&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(std::move(r.val)) {}
void operator=(NotifiedVersion&& r) BOOST_NOEXCEPT { waiting = std::move(r.waiting); val = std::move(r.val); }
private:
typedef std::pair<Version,Promise<Void>> Item;
using Item = std::pair<ValueType, Promise<Void>>;
struct ItemCompare {
bool operator()(const Item& a, const Item& b) { return a.first > b.first; }
};
std::priority_queue<Item, std::vector<Item>, ItemCompare> waiting;
VersionMetricHandle val;
T val;
};
struct NotifiedDouble {
explicit NotifiedDouble( double val = 0 ) : val(val) {}
Future<Void> whenAtLeast( double limit ) {
if (val >= limit)
return Void();
Promise<Void> p;
waiting.push( std::make_pair(limit,p) );
return p.getFuture();
}
double get() const { return val; }
void set( double v ) {
ASSERT( v >= val );
if (v != val) {
val = v;
std::vector<Promise<Void>> toSend;
while ( waiting.size() && v >= waiting.top().first ) {
Promise<Void> p = std::move(waiting.top().second);
waiting.pop();
toSend.push_back(p);
}
for(auto& p : toSend) {
p.send(Void());
}
}
}
void operator=( double v ) {
set( v );
}
NotifiedDouble(NotifiedDouble&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(r.val) {}
void operator=(NotifiedDouble&& r) BOOST_NOEXCEPT { waiting = std::move(r.waiting); val = r.val; }
private:
typedef std::pair<double,Promise<Void>> Item;
struct ItemCompare {
bool operator()(const Item& a, const Item& b) { return a.first > b.first; }
};
std::priority_queue<Item, std::vector<Item>, ItemCompare> waiting;
double val;
};
using NotifiedVersion = Notified<VersionMetricHandle, VersionMetricHandle::ValueType>;
using NotifiedDouble = Notified<double>;
#endif

View File

@ -1,5 +1,5 @@
/*
* RestoreWorkerInterface.h
* RestoreWorkerInterface.actor.h
*
* This source file is part of the FoundationDB open source project
*
@ -22,8 +22,11 @@
// which are RestoreMaster, RestoreLoader, and RestoreApplier
#pragma once
#ifndef FDBSERVER_RESTORE_WORKER_INTERFACE_H
#define FDBSERVER_RESTORE_WORKER_INTERFACE_H
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H)
#define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H
#include "fdbclient/RestoreWorkerInterface.actor.g.h"
#elif !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H)
#define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H
#include <sstream>
#include "flow/Stats.h"
@ -35,6 +38,7 @@
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/RestoreUtil.h"
#include "flow/actorcompiler.h" // This must be the last #include.
class RestoreConfigFR;
@ -43,8 +47,8 @@ struct RestoreRecruitRoleRequest;
struct RestoreSysInfoRequest;
struct RestoreLoadFileRequest;
struct RestoreVersionBatchRequest;
struct RestoreSendMutationsToAppliersRequest;
struct RestoreSendMutationVectorVersionedRequest;
struct RestoreSetApplierKeyRangeVectorRequest;
struct RestoreSysInfo;
struct RestoreApplierInterface;
@ -121,10 +125,10 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
RequestStream<RestoreSimpleRequest> heartbeat;
RequestStream<RestoreSysInfoRequest> updateRestoreSysInfo;
RequestStream<RestoreSetApplierKeyRangeVectorRequest> setApplierKeyRangeVectorRequest;
RequestStream<RestoreLoadFileRequest> loadFile;
RequestStream<RestoreSendMutationsToAppliersRequest> sendMutations;
RequestStream<RestoreVersionBatchRequest> initVersionBatch;
RequestStream<RestoreSimpleRequest> collectRestoreRoleInterfaces; // TODO: Change to collectRestoreRoleInterfaces
RequestStream<RestoreSimpleRequest> collectRestoreRoleInterfaces;
RequestStream<RestoreVersionBatchRequest> finishRestore;
bool operator==(RestoreWorkerInterface const& r) const { return id() == r.id(); }
@ -140,8 +144,8 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
void initEndpoints() {
heartbeat.getEndpoint(TaskPriority::LoadBalancedEndpoint);
updateRestoreSysInfo.getEndpoint(TaskPriority::LoadBalancedEndpoint);
setApplierKeyRangeVectorRequest.getEndpoint(TaskPriority::LoadBalancedEndpoint);
loadFile.getEndpoint(TaskPriority::LoadBalancedEndpoint);
sendMutations.getEndpoint(TaskPriority::LoadBalancedEndpoint);
initVersionBatch.getEndpoint(TaskPriority::LoadBalancedEndpoint);
collectRestoreRoleInterfaces.getEndpoint(TaskPriority::LoadBalancedEndpoint);
finishRestore.getEndpoint(TaskPriority::LoadBalancedEndpoint);
@ -149,8 +153,8 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, setApplierKeyRangeVectorRequest,
loadFile, initVersionBatch, collectRestoreRoleInterfaces, finishRestore);
serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, loadFile, sendMutations,
initVersionBatch, collectRestoreRoleInterfaces, finishRestore);
}
};
@ -338,6 +342,31 @@ struct RestoreLoadFileRequest : TimedRequest {
}
};
// Request served by the restore loader (see RestoreLoaderInterface::sendMutations):
// instructs it to forward parsed mutations to the appliers named in rangeToApplier.
struct RestoreSendMutationsToAppliersRequest : TimedRequest {
	constexpr static FileIdentifier file_identifier = 68827305;

	// Begin key of each applier's assigned key range -> that applier's UID.
	std::map<Key, UID> rangeToApplier;
	bool useRangeFile; // Send mutations parsed from range file?

	ReplyPromise<RestoreCommonReply> reply;

	RestoreSendMutationsToAppliersRequest() = default;
	// Take the map by const reference: the original signature took it by value,
	// which copied the whole map an extra time on every construction.
	explicit RestoreSendMutationsToAppliersRequest(const std::map<Key, UID>& rangeToApplier, bool useRangeFile)
	  : rangeToApplier(rangeToApplier), useRangeFile(useRangeFile) {}

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, rangeToApplier, useRangeFile, reply);
	}

	std::string toString() {
		std::stringstream ss;
		ss << "RestoreSendMutationsToAppliersRequest keyToAppliers.size:" << rangeToApplier.size()
		   << " useRangeFile:" << useRangeFile;
		return ss.str();
	}
};
struct RestoreSendMutationVectorVersionedRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 69764565;
@ -356,7 +385,7 @@ struct RestoreSendMutationVectorVersionedRequest : TimedRequest {
std::string toString() {
std::stringstream ss;
ss << "fileIndex" << fileIndex << "prevVersion:" << prevVersion << " version:" << version
ss << "fileIndex:" << fileIndex << " prevVersion:" << prevVersion << " version:" << version
<< " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size();
return ss.str();
}
@ -389,29 +418,6 @@ struct RestoreVersionBatchRequest : TimedRequest {
}
};
// Carries the full applier key-range assignment: begin key of each applier's
// range -> that applier's UID.
struct RestoreSetApplierKeyRangeVectorRequest : TimedRequest {
	constexpr static FileIdentifier file_identifier = 92038306;

	// Begin key of an applier's assigned range -> that applier's UID.
	std::map<Standalone<KeyRef>, UID> rangeToApplier;

	ReplyPromise<RestoreCommonReply> reply;

	RestoreSetApplierKeyRangeVectorRequest() = default;
	explicit RestoreSetApplierKeyRangeVectorRequest(std::map<Standalone<KeyRef>, UID> rangeToApplier)
	: rangeToApplier(rangeToApplier) {}

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, rangeToApplier, reply);
	}

	std::string toString() {
		std::stringstream ss;
		// NOTE(review): the printed name "RestoreVersionBatchRequest" does not match this
		// struct's name — looks like a copy/paste slip in the debug string; confirm before
		// relying on it when grepping logs.
		ss << "RestoreVersionBatchRequest rangeToApplierSize:" << rangeToApplier.size();
		return ss.str();
	}
};
struct RestoreRequest {
constexpr static FileIdentifier file_identifier = 49589770;
@ -467,7 +473,8 @@ struct RestoreRequest {
std::string getRoleStr(RestoreRole role);
////--- Interface functions
Future<Void> _restoreWorker(Database const& cx, LocalityData const& locality);
Future<Void> restoreWorker(Reference<ClusterConnectionFile> const& ccf, LocalityData const& locality);
ACTOR Future<Void> _restoreWorker(Database cx, LocalityData locality);
ACTOR Future<Void> restoreWorker(Reference<ClusterConnectionFile> ccf, LocalityData locality);
#endif
#include "flow/unactorcompiler.h"
#endif

View File

@ -49,7 +49,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"resolution",
"proxy",
"master",
"test"
"test",
"storage_cache"
]
},
"degraded":true,
@ -86,6 +87,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"cluster_controller",
"data_distributor",
"ratekeeper",
"storage_cache",
"router",
"coordinator"
]

View File

@ -189,8 +189,9 @@ struct GetKeyValuesReply : public LoadBalancedReply {
VectorRef<KeyValueRef, VecSerStrategy::String> data;
Version version; // useful when latestVersion was requested
bool more;
bool cached;
GetKeyValuesReply() : version(invalidVersion), more(false) {}
GetKeyValuesReply() : version(invalidVersion), more(false), cached(false) {}
template <class Ar>
void serialize( Ar& ar ) {

View File

@ -58,6 +58,28 @@ void decodeKeyServersValue( const ValueRef& value, vector<UID>& src, vector<UID>
}
}
// System keyspace "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
// Maps the begin key of a cached range to the indices of the cache workers serving it.
const KeyRangeRef storageCacheKeys( LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0") );
const KeyRef storageCachePrefix = storageCacheKeys.begin;

// Prepends the storage-cache prefix to `k`, producing the system key for that range.
const Key storageCacheKey( const KeyRef& k ) {
	return k.withPrefix( storageCachePrefix );
}

// Serializes the list of cache-worker indices into a versioned value.
const Value storageCacheValue( const vector<uint16_t>& serverIndices ) {
	BinaryWriter wr((IncludeVersion()));
	wr << serverIndices;
	return wr.toValue();
}

// Inverse of storageCacheValue: clears `serverIndices`, then deserializes the
// index list if the value is non-empty. An empty value decodes to an empty list.
void decodeStorageCacheValue( const ValueRef& value, vector<uint16_t>& serverIndices ) {
	serverIndices.clear();
	if (value.size()) {
		BinaryReader rd(value, IncludeVersion());
		rd >> serverIndices;
	}
}
const Value logsValue( const vector<std::pair<UID, NetworkAddress>>& logs, const vector<std::pair<UID, NetworkAddress>>& oldLogs ) {
BinaryWriter wr(IncludeVersion());
wr << logs;
@ -73,7 +95,6 @@ std::pair<vector<std::pair<UID, NetworkAddress>>,vector<std::pair<UID, NetworkAd
return std::make_pair(logs, oldLogs);
}
const KeyRef serverKeysPrefix = LiteralStringRef("\xff/serverKeys/");
const ValueRef serverKeysTrue = LiteralStringRef("1"), // compatible with what was serverKeysTrue
serverKeysFalse;
@ -103,6 +124,49 @@ bool serverHasKey( ValueRef storedValue ) {
return storedValue == serverKeysTrue;
}
// Keys under "\xff\x02/cacheKeys/" index cached ranges by cache worker.
// Encoded layout: cacheKeysPrefix + idx (2 bytes, unversioned) + "/" + key.
const KeyRef cacheKeysPrefix = LiteralStringRef("\xff\x02/cacheKeys/");

// Builds the system key for cache worker `idx` and user key `key`.
const Key cacheKeysKey( uint16_t idx, const KeyRef& key ) {
	BinaryWriter wr(Unversioned());
	wr.serializeBytes( cacheKeysPrefix );
	wr << idx;
	wr.serializeBytes( LiteralStringRef("/") );
	wr.serializeBytes( key );
	return wr.toValue();
}

// Prefix covering every cache key belonging to cache worker `idx`.
const Key cacheKeysPrefixFor( uint16_t idx ) {
	BinaryWriter wr(Unversioned());
	wr.serializeBytes( cacheKeysPrefix );
	wr << idx;
	wr.serializeBytes( LiteralStringRef("/") );
	return wr.toValue();
}

// Recovers the cache-worker index from a key built by cacheKeysKey.
uint16_t cacheKeysDecodeIndex( const KeyRef& key ) {
	uint16_t idx;
	BinaryReader rd( key.removePrefix(cacheKeysPrefix), Unversioned() );
	rd >> idx;
	return idx;
}

// Recovers the user key: skip the prefix, the 2-byte index, and the '/' separator.
KeyRef cacheKeysDecodeKey( const KeyRef& key ) {
	return key.substr( cacheKeysPrefix.size() + sizeof(uint16_t) + 1);
}
// "\xff\x02/cacheChangeKey" and per-worker keys under "\xff\x02/cacheChangeKeys/".
// NOTE(review): presumably these are watched to signal cache-assignment changes
// (globally / per worker) — confirm against the consumers of these keys.
const KeyRef cacheChangeKey = LiteralStringRef("\xff\x02/cacheChangeKey");
const KeyRangeRef cacheChangeKeys( LiteralStringRef("\xff\x02/cacheChangeKeys/"), LiteralStringRef("\xff\x02/cacheChangeKeys0") );
const KeyRef cacheChangePrefix = cacheChangeKeys.begin;

// Builds the per-worker change key: cacheChangePrefix + idx (2 bytes, unversioned).
const Key cacheChangeKeyFor( uint16_t idx ) {
	BinaryWriter wr(Unversioned());
	wr.serializeBytes( cacheChangePrefix );
	wr << idx;
	return wr.toValue();
}

// Inverse of cacheChangeKeyFor: recovers the worker index from a change key.
uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key ) {
	uint16_t idx;
	BinaryReader rd( key.removePrefix(cacheChangePrefix), Unversioned() );
	rd >> idx;
	return idx;
}
const KeyRangeRef serverTagKeys(
LiteralStringRef("\xff/serverTag/"),
LiteralStringRef("\xff/serverTag0") );
@ -641,13 +705,22 @@ const KeyRangeRef restoreApplierKeys(LiteralStringRef("\xff\x02/restoreApplier/"
const KeyRef restoreApplierTxnValue = LiteralStringRef("1");
// restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once
// Version is passed in as LittleEndian, it must be converted to BigEndian to maintain ordering in lexical order
const Key restoreApplierKeyFor(UID const& applierID, Version version) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(restoreWorkersKeys.begin);
wr << applierID << version;
wr.serializeBytes(restoreApplierKeys.begin);
wr << applierID << bigEndian64(version);
return wr.toValue();
}
// Reads an applier UID followed by a big-endian Version out of `key` and returns
// the version converted back to host order via bigEndian64.
// NOTE(review): this does not strip the restoreApplierKeys.begin prefix first —
// presumably callers pass a key with the prefix already removed; confirm at call sites.
std::pair<UID, Version> decodeRestoreApplierKey(ValueRef const& key) {
	BinaryReader rd(key, Unversioned());
	UID applierID;
	Version version;
	rd >> applierID >> version;
	return std::make_pair(applierID, bigEndian64(version));
}
// Encode restore worker key for workerID
const Key restoreWorkerKeyFor(UID const& workerID) {
BinaryWriter wr(Unversioned());
@ -678,7 +751,7 @@ const Value restoreRequestTriggerValue(UID randomID, int const numRequests) {
wr << randomID;
return wr.toValue();
}
const int decodeRestoreRequestTriggerValue(ValueRef const& value) {
int decodeRestoreRequestTriggerValue(ValueRef const& value) {
int s;
UID randomID;
BinaryReader reader(value, IncludeVersion());

View File

@ -26,7 +26,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbserver/RestoreWorkerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
struct RestoreLoaderInterface;
struct RestoreApplierInterface;
@ -49,6 +49,13 @@ const Value keyServersValue(
void decodeKeyServersValue( const ValueRef& value,
vector<UID>& src, vector<UID>& dest );
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
extern const KeyRangeRef storageCacheKeys;
extern const KeyRef storageCachePrefix;
const Key storageCacheKey( const KeyRef& k );
const Value storageCacheValue( const vector<uint16_t>& serverIndices );
void decodeStorageCacheValue( const ValueRef& value, vector<uint16_t>& serverIndices );
// "\xff/serverKeys/[[serverID]]/[[begin]]" := "" | "1" | "2"
extern const KeyRef serverKeysPrefix;
extern const ValueRef serverKeysTrue, serverKeysFalse;
@ -57,6 +64,19 @@ const Key serverKeysPrefixFor( UID serverID );
UID serverKeysDecodeServer( const KeyRef& key );
bool serverHasKey( ValueRef storedValue );
extern const KeyRef cacheKeysPrefix;
const Key cacheKeysKey( uint16_t idx, const KeyRef& key );
const Key cacheKeysPrefixFor( uint16_t idx );
uint16_t cacheKeysDecodeIndex( const KeyRef& key );
KeyRef cacheKeysDecodeKey( const KeyRef& key );
extern const KeyRef cacheChangeKey;
extern const KeyRangeRef cacheChangeKeys;
extern const KeyRef cacheChangePrefix;
const Key cacheChangeKeyFor( uint16_t idx );
uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key );
extern const KeyRangeRef serverTagKeys;
extern const KeyRef serverTagPrefix;
extern const KeyRangeRef serverTagMaxKeys;
@ -298,11 +318,12 @@ extern const KeyRangeRef restoreApplierKeys;
extern const KeyRef restoreApplierTxnValue;
const Key restoreApplierKeyFor(UID const& applierID, Version version);
std::pair<UID, Version> decodeRestoreApplierKey(ValueRef const& key);
const Key restoreWorkerKeyFor(UID const& workerID);
const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& server);
RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value);
const Value restoreRequestTriggerValue(UID randomUID, int const numRequests);
const int decodeRestoreRequestTriggerValue(ValueRef const& value);
int decodeRestoreRequestTriggerValue(ValueRef const& value);
const Value restoreRequestDoneVersionValue(Version readVersion);
Version decodeRestoreRequestDoneVersionValue(ValueRef const& value);
const Key restoreRequestKeyFor(int const& index);

View File

@ -333,9 +333,9 @@ void ThreadSafeTransaction::reset() {
onMainThreadVoid( [tr](){ tr->reset(); }, NULL );
}
extern const char* getHGVersion();
extern const char* getSourceVersion();
ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getHGVersion(), currentProtocolVersion)), transportId(0) {}
ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion)), transportId(0) {}
void ThreadSafeApi::selectApiVersion(int apiVersion) {
this->apiVersion = apiVersion;

View File

@ -414,16 +414,19 @@ namespace PTreeImpl {
if (p->left(at)) printTree(p->left(at), at, depth+1);
for (int i=0;i<depth;i++)
printf(" ");
printf(":%s\n", describe(p->data).c_str());
//printf(":%s\n", describe(p->data.value.first).c_str());
printf(":%s\n", describe(p->data.key).c_str());
if (p->right(at)) printTree(p->right(at), at, depth+1);
}
template <class T>
void printTreeDetails(const Reference<PTree<T>>& p, int depth = 0) {
printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data).c_str());
//printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data.value.first).c_str());
printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data.key).c_str());
printf(" Left: %p\n", p->pointer[0].getPtr());
printf(" Right: %p\n", p->pointer[1].getPtr());
if (p->pointer[2])
//if (p->pointer[2])
if (p->updated)
printf(" Version %lld %s: %p\n", p->lastUpdateVersion, p->replacedPointer ? "Right" : "Left", p->pointer[2].getPtr());
for(int i=0; i<3; i++)
if (p->pointer[i]) printTreeDetails(p->pointer[i], depth+1);
@ -462,8 +465,47 @@ namespace PTreeImpl {
}
}
// Remove pointers to any child nodes that have been updated at or before the given version.
// This essentially gets rid of node versions that will never be read (beyond 5s worth of versions).
// TODO look into making this per-version compaction. (We could keep track of updated nodes at each version for example)
template <class T>
void compact(Reference<PTree<T>>& p, Version newOldestVersion){
	if (!p) {
		return;
	}
	if (p->updated && p->lastUpdateVersion <= newOldestVersion) {
		// The node was updated at a version no longer readable: promote the
		// updated pointer (slot 2) into the slot it replaced and drop slot 2.
		auto which = p->replacedPointer;
		p->pointer[which] = p->pointer[2];
		p->updated = false;
		p->pointer[2] = Reference<PTree<T>>();
	}
	// Recurse into the children as seen at the new oldest version.
	Reference<PTree<T>> left = p->left(newOldestVersion);
	Reference<PTree<T>> right = p->right(newOldestVersion);
	compact(left, newOldestVersion);
	compact(right, newOldestVersion);
}
}
// A tagged union holding either a value (a point write) or the exclusive end
// key of a "clear to" range. Construct only via the static factories.
class ValueOrClearToRef {
public:
	// Wraps a concrete value.
	static ValueOrClearToRef value(ValueRef const& v) { return ValueOrClearToRef(v, false); }
	// Wraps the end key of a clear range.
	static ValueOrClearToRef clearTo(KeyRef const& k) { return ValueOrClearToRef(k, true); }

	bool isValue() const { return !isClear; };
	bool isClearTo() const { return isClear; }

	// Precondition: isValue().
	ValueRef const& getValue() const { ASSERT( isValue() ); return item; };
	// Precondition: isClearTo().
	KeyRef const& getEndKey() const { ASSERT(isClearTo()); return item; };

private:
	ValueOrClearToRef( StringRef item, bool isClear ) : item(item), isClear(isClear) {}

	StringRef item; // either the value bytes or the end-key bytes, per isClear
	bool isClear;
};
// VersionedMap provides an interface to a partially persistent tree, allowing you to read the values at a particular version,
// create new versions, modify the current version of the tree, and forget versions prior to a specific version.
template <class K, class T>
@ -597,6 +639,26 @@ public:
erase(key);
}
// Debug helper: dump full node/version details of the latest root to stdout.
void printDetail() {
PTreeImpl::printTreeDetails(roots.back().second, 0);
}
// Debug helper: print the latest root's tree as it appears at version `at`.
void printTree(Version at) {
PTreeImpl::printTree(roots.back().second, at, 0);
}
// Forget history older than newOldestVersion: compact every root strictly
// before it (see PTreeImpl::compact for the per-node pointer collapse).
void compact(Version newOldestVersion) {
ASSERT( newOldestVersion <= latestVersion );
//auto newBegin = roots.lower_bound(newOldestVersion);
// roots is kept sorted by version, so binary-search for the first root to keep.
auto newBegin = lower_bound(roots.begin(), roots.end(), newOldestVersion, rootsComparator());
for(auto root = roots.begin(); root != newBegin; ++root) {
if(root->second)
PTreeImpl::compact(root->second, newOldestVersion);
}
//printf("\nPrinting the tree at latest version after compaction.\n");
//PTreeImpl::printTreeDetails(roots.back().second(), 0);
}
// for(auto i = vm.at(version).lower_bound(range.begin); i < range.end; ++i)
struct iterator{
explicit iterator(Tree const& root, Version at) : root(root), at(at) {}
@ -686,6 +748,11 @@ public:
// Read-only view of the map as of version v (v must still be remembered).
ViewAtVersion at( Version v ) const { return ViewAtVersion(getRoot(v), v); }
// Read-only view of the most recent version of the map.
ViewAtVersion atLatest() const { return ViewAtVersion(roots.back().second, latestVersion); }
// True iff `key` falls inside a clear range recorded in the given view: the
// nearest entry at or before key must be a clearTo whose end key is past key.
bool isClearContaining( ViewAtVersion const& view, KeyRef key ) {
auto i = view.lastLessOrEqual(key);
return i && i->isClearTo() && i->getEndKey() > key;
}
// TODO: getHistory?
};

View File

@ -89,6 +89,9 @@
<ClInclude Include="StorageServerInterface.h" />
<ClInclude Include="Subspace.h" />
<ClInclude Include="SystemData.h" />
<ActorCompiler Include="RestoreWorkerInterface.actor.h">
<EnableCompile>false</EnableCompile>
</ActorCompiler>
<ClInclude Include="TaskBucket.h" />
<ClInclude Include="ThreadSafeTransaction.h" />
<ClInclude Include="Tuple.h" />

View File

@ -45,7 +45,8 @@ class AsyncFileEIO : public IAsyncFile, public ReferenceCounted<AsyncFileEIO> {
public:
static void init() {
if (eio_init( &eio_want_poll, NULL )) {
eio_set_max_parallel(FLOW_KNOBS->EIO_MAX_PARALLELISM);
if (eio_init( &eio_want_poll, NULL )) {
TraceEvent("EioInitError").detail("ErrorNo", errno);
throw platform_error();
}
@ -246,6 +247,9 @@ private:
if( flags & OPEN_READONLY ) oflags |= O_RDONLY;
if( flags & OPEN_READWRITE ) oflags |= O_RDWR;
if( flags & OPEN_ATOMIC_WRITE_AND_CREATE ) oflags |= O_TRUNC;
#if defined(__linux__)
if ( flags & OPEN_UNBUFFERED && FLOW_KNOBS->EIO_USE_ODIRECT ) oflags |= O_DIRECT;
#endif
return oflags;
}

View File

@ -50,24 +50,27 @@ TEST_CASE("/flow/actorcompiler/lineNumbers") {
return Void();
}
TEST_CASE("/flow/delayOrdering") {
state double x = deterministicRandom()->random01();
state double y = deterministicRandom()->random01();
if (BUGGIFY) {
y = x;
TEST_CASE("/flow/buggifiedDelay") {
if (FLOW_KNOBS->MAX_BUGGIFIED_DELAY == 0) {
return Void();
}
loop {
state double x = deterministicRandom()->random01();
state int last = 0;
state Future<Void> f1 = map(delay(x), [last = &last](const Void&) {
*last = 1;
return Void();
});
state Future<Void> f2 = map(delay(x), [last = &last](const Void&) {
*last = 2;
return Void();
});
wait(f1 && f2);
if (last == 1) {
TEST(true); // Delays can become ready out of order
return Void();
}
}
state int last = 0;
state Future<Void> f1 = map(delay(x), [last = &last](const Void&) {
*last = 1;
return Void();
});
state Future<Void> f2 = map(delay(y), [last = &last](const Void&) {
*last = 2;
return Void();
});
wait(f1 && f2);
ASSERT((x <= y) == (last == 2));
return Void();
}
template <class T, class Func, class ErrFunc, class CallbackType>

View File

@ -40,8 +40,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::LogClass:
return ProcessClass::WorstFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::NeverAssign;
@ -57,8 +57,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::StorageClass:
return ProcessClass::WorstFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::NeverAssign;
@ -76,8 +76,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::TransactionClass:
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
@ -93,8 +93,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::ResolutionClass:
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
@ -110,8 +110,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::TransactionClass:
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
@ -129,8 +129,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::TransactionClass:
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
@ -154,8 +154,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
case ProcessClass::LogRouterClass:
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
return ProcessClass::NeverAssign;
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
@ -172,6 +172,7 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
@ -188,10 +189,18 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
}
case ProcessClass::StorageCache:
switch( _class ) {
case ProcessClass::StorageCacheClass:
return ProcessClass::BestFit;
default:
return ProcessClass::NeverAssign;
}
default:
return ProcessClass::NeverAssign;
}

View File

@ -43,11 +43,12 @@ struct ProcessClass {
DataDistributorClass,
CoordinatorClass,
RatekeeperClass,
StorageCacheClass,
InvalidClass = -1
};
enum Fitness { BestFit, GoodFit, UnsetFit, OkayFit, WorstFit, ExcludeFit, NeverAssign }; //cannot be larger than 7 because of leader election mask
enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, Ratekeeper, NoRole };
enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, Ratekeeper, StorageCache, NoRole };
enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 };
int16_t _class;
int16_t _source;
@ -72,6 +73,7 @@ public:
else if (s=="data_distributor") _class = DataDistributorClass;
else if (s=="coordinator") _class = CoordinatorClass;
else if (s=="ratekeeper") _class = RatekeeperClass;
else if (s=="storage_cache") _class = StorageCacheClass;
else _class = InvalidClass;
}
@ -91,6 +93,7 @@ public:
else if (classStr=="data_distributor") _class = DataDistributorClass;
else if (classStr=="coordinator") _class = CoordinatorClass;
else if (classStr=="ratekeeper") _class = RatekeeperClass;
else if (classStr=="storage_cache") _class = StorageCacheClass;
else _class = InvalidClass;
if (sourceStr=="command_line") _source = CommandLineSource;
@ -125,6 +128,7 @@ public:
case DataDistributorClass: return "data_distributor";
case CoordinatorClass: return "coordinator";
case RatekeeperClass: return "ratekeeper";
case StorageCacheClass: return "storage_cache";
default: return "invalid";
}
}

View File

@ -59,9 +59,10 @@ Future< Reference<class IAsyncFile> > Net2FileSystem::open( std::string filename
Future<Reference<IAsyncFile>> f;
#ifdef __linux__
// In the vast majority of cases, we wish to use Kernel AIO. However, some systems
// dont properly support dont properly support kernel async I/O without O_DIRECT
// or AIO at all. In such cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to
// EIO instead of Kernel AIO.
// dont properly support kernel async I/O without O_DIRECT or AIO at all. In such
// cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to EIO instead
// of Kernel AIO. And EIO_USE_ODIRECT can be used to turn on or off O_DIRECT within
// EIO.
if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) &&
!FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO)
f = AsyncFileKAIO::open(filename, flags, mode, NULL);

View File

@ -163,8 +163,8 @@
</PropertyGroup>
<ItemDefinitionGroup>
<CustomBuildStep>
<Command>echo const char *hgVersion = "Current version id not currently supported within Windows."; &gt; hgVersion.temp.h &amp;&amp; fc /b hgVersion.temp.h hgVersion.h &gt; nul || copy hgVersion.temp.h hgVersion.h &gt; nul</Command>
<Message>Checking HG source version</Message>
<Command>echo const char *sourceVersion = "Current version id not currently supported within Windows."; &gt; SourceVersion.temp.h &amp;&amp; fc /b SourceVersion.temp.h SourceVersion.h &gt; nul || copy SourceVersion.temp.h SourceVersion.h &gt; nul</Command>
<Message>Checking source version</Message>
<Outputs>fake.out</Outputs>
</CustomBuildStep>
</ItemDefinitionGroup>

View File

@ -98,6 +98,7 @@ public:
case ProcessClass::ClusterControllerClass: return false;
case ProcessClass::DataDistributorClass: return false;
case ProcessClass::RatekeeperClass: return false;
case ProcessClass::StorageCacheClass: return false;
default: return false;
}
}

View File

@ -46,8 +46,10 @@ Reference<StorageInfo> getStorageInfo(UID id, std::map<UID, Reference<StorageInf
// the same operations will be done on all proxies at the same time. Otherwise, the data stored in
// txnStateStore will become corrupted.
void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRef> const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference<ILogSystem> logSystem, Version popVersion,
KeyRangeMap<std::set<Key> >* vecBackupKeys, KeyRangeMap<ServerCacheInfo>* keyInfo, std::map<Key, applyMutationsData>* uid_applyMutationsData, RequestStream<CommitTransactionRequest> commit,
Database cx, NotifiedVersion* commitVersion, std::map<UID, Reference<StorageInfo>>* storageCache, std::map<Tag, Version>* tag_popped, bool initialCommit ) {
KeyRangeMap<std::set<Key> >* vecBackupKeys, KeyRangeMap<ServerCacheInfo>* keyInfo, KeyRangeMap<bool>* cacheInfo, std::map<Key, applyMutationsData>* uid_applyMutationsData, RequestStream<CommitTransactionRequest> commit,
Database cx, NotifiedVersion* commitVersion, std::map<UID, Reference<StorageInfo>>* storageCache, std::map<Tag, Version>* tag_popped, bool initialCommit ) {
//std::map<keyRef, vector<uint16_t>> cacheRangeInfo;
std::map<KeyRef, MutationRef> cachedRangeInfo;
for (auto const& m : mutations) {
//TraceEvent("MetadataMutation", dbgid).detail("M", m.toString());
@ -129,6 +131,37 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
}
}
}
} else if (m.param1.startsWith(storageCachePrefix)) {
if(cacheInfo) {
KeyRef k = m.param1.removePrefix(storageCachePrefix);
// Create a private mutation for storage servers
// This is done to make the storage servers aware of the cached key-ranges
if(toCommit)
{
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
cachedRangeInfo[k] = privatized;
}
if(k != allKeys.end) {
KeyRef end = cacheInfo->rangeContaining(k).end();
vector<uint16_t> serverIndices;
decodeStorageCacheValue(m.param2, serverIndices);
cacheInfo->insert(KeyRangeRef(k,end),serverIndices.size() > 0);
}
}
if(!initialCommit) txnStateStore->set(KeyValueRef(m.param1, m.param2));
} else if (m.param1.startsWith(cacheKeysPrefix)) {
// Create a private mutation for cache servers
// This is done to make the cache servers aware of the cached key-ranges
if(toCommit) {
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
toCommit->addTag( cacheTag );
toCommit->addTypedMessage(privatized);
}
}
else if (m.param1.startsWith(configKeysPrefix) || m.param1 == coordinatorsKey) {
if(Optional<StringRef>(m.param2) != txnStateStore->readValue(m.param1).get().castTo<StringRef>()) { // FIXME: Make this check more specific, here or by reading configuration whenever there is a change
@ -138,7 +171,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
TraceEvent("MutationRequiresRestart", dbgid)
.detail("M", m.toString())
.detail("PrevValue", t.present() ? t.get() : LiteralStringRef("(none)"))
.detail("ToCommit", toCommit!=NULL);
.detail("ToCommit", toCommit!=nullptr);
if(confChange) *confChange = true;
}
}
@ -171,7 +204,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
}
else if (m.param1.startsWith(applyMutationsEndRange.begin)) {
if(!initialCommit) txnStateStore->set(KeyValueRef(m.param1, m.param2));
if(uid_applyMutationsData != NULL) {
if(uid_applyMutationsData != nullptr) {
Key uid = m.param1.removePrefix(applyMutationsEndRange.begin);
auto &p = (*uid_applyMutationsData)[uid];
p.endVersion = BinaryReader::fromStringRef<Version>(m.param2, Unversioned());
@ -190,7 +223,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
}
else if (m.param1.startsWith(applyMutationsKeyVersionMapRange.begin)) {
if(!initialCommit) txnStateStore->set(KeyValueRef(m.param1, m.param2));
if(uid_applyMutationsData != NULL) {
if(uid_applyMutationsData != nullptr) {
if(m.param1.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)) {
Key uid = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID));
Key k = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID));
@ -205,7 +238,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
if(!initialCommit) txnStateStore->set(KeyValueRef(m.param1, m.param2));
if (vecBackupKeys) {
Key logDestination;
KeyRef logRangeBegin = logRangesDecodeKey(m.param1, NULL);
KeyRef logRangeBegin = logRangesDecodeKey(m.param1, nullptr);
Key logRangeEnd = logRangesDecodeValue(m.param2, &logDestination);
// Insert the logDestination into each range of vecBackupKeys overlapping the decoded range
@ -345,7 +378,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
if(range.intersects(applyMutationsEndRange)) {
KeyRangeRef commonEndRange(range & applyMutationsEndRange);
if(!initialCommit) txnStateStore->clear(commonEndRange);
if(uid_applyMutationsData != NULL) {
if(uid_applyMutationsData != nullptr) {
uid_applyMutationsData->erase(uid_applyMutationsData->lower_bound(m.param1.substr(applyMutationsEndRange.begin.size())),
m.param2 == applyMutationsEndRange.end ? uid_applyMutationsData->end() : uid_applyMutationsData->lower_bound(m.param2.substr(applyMutationsEndRange.begin.size())));
}
@ -353,7 +386,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
if(range.intersects(applyMutationsKeyVersionMapRange)) {
KeyRangeRef commonApplyRange(range & applyMutationsKeyVersionMapRange);
if(!initialCommit) txnStateStore->clear(commonApplyRange);
if(uid_applyMutationsData != NULL) {
if(uid_applyMutationsData != nullptr) {
if(m.param1.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID) && m.param2.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)) {
Key uid = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID));
Key uid2 = m.param2.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID));
@ -389,7 +422,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
for (auto logRangeAffected : logRangesAffected)
{
// Parse the backup key and name
logKeyBegin = logRangesDecodeKey(logRangeAffected.key, NULL);
logKeyBegin = logRangesDecodeKey(logRangeAffected.key, nullptr);
// Decode the log destination and key value
logKeyEnd = logRangesDecodeValue(logRangeAffected.value, &logDestination);
@ -434,4 +467,58 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
}
}
}
// If we accumulated private mutations for cached key-ranges, we also need to
// tag them with the relevant storage servers. This is done to make the storage
// servers aware of the cached key-ranges
// NOTE: we are assuming non-colliding cached key-ranges
// TODO Note that, we are currently not handling the case when cached key-ranges move out
// to different storage servers. This would require some checking when keys in the keyServersPrefix change.
// For the first implementation, we could just send the entire map to every storage server. Revisit!
if (cachedRangeInfo.size() != 0 && toCommit) {
std::map<KeyRef, MutationRef>::iterator itr;
KeyRef keyBegin, keyEnd;
vector<uint16_t> serverIndices;
MutationRef mutationBegin, mutationEnd;
for (itr = cachedRangeInfo.begin(); itr != cachedRangeInfo.end(); ++itr) {
// first figure out the begin and end keys for the cached-range,
// the begin and end mutations can be in any order
decodeStorageCacheValue(itr->second.param2, serverIndices);
// serverIndices count should be greater than zero for beginKey mutations
if (serverIndices.size() > 0) {
keyBegin = itr->first;
mutationBegin = itr->second;
++itr;
keyEnd = itr->first;
mutationEnd = itr->second;
} else {
keyEnd = itr->first;
mutationEnd = itr->second;
++itr;
keyBegin = itr->first;
mutationBegin = itr->second;
}
// Now get all the storage server tags for the cached key-ranges
std::set<Tag> allTags;
auto ranges = keyInfo->intersectingRanges(KeyRangeRef(keyBegin, keyEnd));
for(auto it : ranges) {
auto& r = it.value();
for(auto info : r.src_info) {
allTags.insert(info->tag);
}
for(auto info : r.dest_info) {
allTags.insert(info->tag);
}
}
// Add the tags to both begin and end mutations
toCommit->addTags(allTags);
toCommit->addTypedMessage(mutationBegin);
toCommit->addTags(allTags);
toCommit->addTypedMessage(mutationEnd);
}
}
}

View File

@ -45,7 +45,7 @@ struct applyMutationsData {
Reference<StorageInfo> getStorageInfo(UID id, std::map<UID, Reference<StorageInfo>>* storageCache, IKeyValueStore* txnStateStore);
void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRef> const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference<ILogSystem> logSystem = Reference<ILogSystem>(), Version popVersion = 0,
KeyRangeMap<std::set<Key> >* vecBackupKeys = NULL, KeyRangeMap<ServerCacheInfo>* keyInfo = NULL, std::map<Key, applyMutationsData>* uid_applyMutationsData = NULL, RequestStream<CommitTransactionRequest> commit = RequestStream<CommitTransactionRequest>(),
Database cx = Database(), NotifiedVersion* commitVersion = NULL, std::map<UID, Reference<StorageInfo>>* storageCache = NULL, std::map<Tag, Version>* tag_popped = NULL, bool initialCommit = false );
KeyRangeMap<std::set<Key> >* vecBackupKeys = nullptr, KeyRangeMap<ServerCacheInfo>* keyInfo = nullptr, KeyRangeMap<bool>* cacheInfo = nullptr, std::map<Key, applyMutationsData>* uid_applyMutationsData = nullptr, RequestStream<CommitTransactionRequest> commit = RequestStream<CommitTransactionRequest>(),
Database cx = Database(), NotifiedVersion* commitVersion = nullptr, std::map<UID, Reference<StorageInfo>>* storageCache = nullptr, std::map<Tag, Version>* tag_popped = nullptr, bool initialCommit = false );
#endif

View File

@ -24,8 +24,6 @@ set(FDBSERVER_SRCS
IKeyValueStore.h
IPager.h
IVersionedStore.h
IndirectShadowPager.actor.cpp
IndirectShadowPager.h
KeyValueStoreCompressTestData.actor.cpp
KeyValueStoreMemory.actor.cpp
KeyValueStoreSQLite.actor.cpp
@ -45,8 +43,6 @@ set(FDBSERVER_SRCS
MasterInterface.h
MasterProxyServer.actor.cpp
masterserver.actor.cpp
MemoryPager.actor.cpp
MemoryPager.h
MoveKeys.actor.cpp
MoveKeys.actor.h
networktest.actor.cpp
@ -76,7 +72,6 @@ set(FDBSERVER_SRCS
RestoreLoader.actor.cpp
RestoreWorker.actor.h
RestoreWorker.actor.cpp
RestoreWorkerInterface.h
Resolver.actor.cpp
ResolverInterface.h
ServerDBInfo.h
@ -85,6 +80,7 @@ set(FDBSERVER_SRCS
SkipList.cpp
Status.actor.cpp
Status.h
StorageCache.actor.cpp
StorageMetrics.actor.h
StorageMetrics.h
storageserver.actor.cpp

View File

@ -57,13 +57,15 @@ struct WorkerInfo : NonCopyable {
WorkerDetails details;
Future<Void> haltRatekeeper;
Future<Void> haltDistributor;
Optional<uint16_t> storageCacheInfo;
WorkerInfo() : gen(-1), reboots(0), lastAvailableTime(now()), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
WorkerInfo( Future<Void> watcher, ReplyPromise<RegisterWorkerReply> reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) :
watcher(watcher), reply(reply), gen(gen), reboots(0), lastAvailableTime(now()), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {}
WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)) {}
reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo) {}
void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT {
watcher = std::move(r.watcher);
reply = std::move(r.reply);
@ -73,6 +75,9 @@ struct WorkerInfo : NonCopyable {
initialClass = r.initialClass;
priorityInfo = r.priorityInfo;
details = std::move(r.details);
haltRatekeeper = r.haltRatekeeper;
haltDistributor = r.haltDistributor;
storageCacheInfo = r.storageCacheInfo;
}
};
@ -101,9 +106,11 @@ public:
Database db;
int unfinishedRecoveries;
int logGenerations;
std::map<uint16_t, std::pair<Optional<StorageServerInterface>, Optional<Key>>> cacheInterfaces;
bool cachePopulated;
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>> clientStatus;
DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0),
DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false),
clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ),
serverInfo( new AsyncVar<CachedSerialization<ServerDBInfo>>( CachedSerialization<ServerDBInfo>() ) ),
db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality!
@ -126,6 +133,27 @@ public:
serverInfo->set( newInfoCache );
}
// Register (or refresh) the storage cache interface published under `id` in
// the broadcast ServerDBInfo. ServerDBInfo.id is re-stamped only when the
// entry actually changes, so watchers are woken only on real updates.
void setStorageCache(uint16_t id, const StorageServerInterface& interf) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
bool found = false;
for(auto& it : newInfo.storageCaches) {
if(it.first == id) {
if(it.second != interf) {
// Interface for this cache id changed: bump the id so listeners notice.
newInfo.id = deterministicRandom()->randomUniqueID();
it.second = interf;
}
found = true;
break;
}
}
if(!found) {
// First registration under this cache id.
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.storageCaches.push_back(std::make_pair(id, interf));
}
serverInfo->set( newInfoCache );
}
void clearInterf(ProcessClass::ClassType t) {
CachedSerialization<ServerDBInfo> newInfoCache = serverInfo->get();
auto& newInfo = newInfoCache.mutate();
@ -137,6 +165,19 @@ public:
}
serverInfo->set( newInfoCache );
}
// Remove the storage cache interface registered under the given id (if any)
// from the broadcast ServerDBInfo, re-stamping its id so watchers see a change.
void clearStorageCache(uint16_t cacheId) {
	CachedSerialization<ServerDBInfo> updatedCache = serverInfo->get();
	auto& info = updatedCache.mutate();
	auto entry = info.storageCaches.begin();
	while (entry != info.storageCaches.end()) {
		if (entry->first == cacheId) {
			// Found the registration: bump the version id and drop the entry.
			info.id = deterministicRandom()->randomUniqueID();
			info.storageCaches.erase(entry);
			break;
		}
		++entry;
	}
	serverInfo->set( updatedCache );
}
};
struct UpdateWorkerList {
@ -201,6 +242,11 @@ public:
return ( now() - startTime < 2 * FLOW_KNOBS->SERVER_REQUEST_INTERVAL ) || ( IFailureMonitor::failureMonitor().getState(worker.details.interf.storage.getEndpoint()).isAvailable() && ( !checkStable || worker.reboots < 2 ) );
}
// True if `processId` currently hosts one of the long-lived stateless
// singletons (data distributor or ratekeeper). Callers use this to bucket
// such workers separately when selecting recruitment candidates.
bool isLongLivedStateless( Optional<Key> const& processId ) {
return (db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == processId) ||
(db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == processId);
}
WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) {
std::set<Optional<Standalone<StringRef>>> excludedMachines( req.excludeMachines.begin(), req.excludeMachines.end() );
std::set<Optional<Standalone<StringRef>>> includeDCs( req.includeDCs.begin(), req.includeDCs.end() );
@ -453,8 +499,7 @@ public:
fitness = std::max(fitness, ProcessClass::ExcludeFit);
}
if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) {
if ((db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == it.first) ||
(db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == it.first)) {
if (isLongLivedStateless(it.first)) {
fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details);
} else {
fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].first.push_back(it.second.details);
@ -486,8 +531,7 @@ public:
auto fitness = it.second.details.processClass.machineClassFitness( role );
if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId &&
( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) {
if ((db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == it.first) ||
(db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == it.first)) {
if (isLongLivedStateless(it.first)) {
fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details);
} else {
fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].first.push_back(it.second.details);
@ -1328,6 +1372,7 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
dbInfo.clusterInterface = db->serverInfo->get().read().clusterInterface;
dbInfo.distributor = db->serverInfo->get().read().distributor;
dbInfo.ratekeeper = db->serverInfo->get().read().ratekeeper;
dbInfo.storageCaches = db->serverInfo->get().read().storageCaches;
TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
db->serverInfo->set( cachedInfo );
@ -1580,8 +1625,27 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
}
when( wait( failed ) ) { // remove workers that have failed
WorkerInfo& failedWorkerInfo = cluster->id_worker[ worker.locality.processId() ];
if(failedWorkerInfo.storageCacheInfo.present()) {
bool found = false;
for(auto& it : cluster->id_worker) {
if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
found = true;
it.second.storageCacheInfo = failedWorkerInfo.storageCacheInfo;
cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), it.first);
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, failedWorkerInfo.storageCacheInfo) );
}
break;
}
}
if(!found) {
cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
}
cluster->db.clearStorageCache(failedWorkerInfo.storageCacheInfo.get());
}
if (!failedWorkerInfo.reply.isSet()) {
failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo) );
failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo, Optional<uint16_t>()) );
}
if (worker.locality.processId() == cluster->masterProcessId) {
cluster->masterProcessId = Optional<Key>();
@ -1855,7 +1919,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) {
it.second.priorityInfo.isExcluded = isExcludedFromConfig;
if( !it.second.reply.isSet() ) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
}
}
}
@ -1957,11 +2021,6 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
if ( self->gotFullyRecoveredConfig ) {
newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.address());
}
// Notify the worker to register again with new process class/exclusive property
if ( !req.reply.isSet() && newPriorityInfo != req.priorityInfo ) {
req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo) );
}
}
if( info == self->id_worker.end() ) {
@ -2021,6 +2080,57 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
}
}
}
Optional<uint16_t> newStorageCache = req.storageCacheInterf.present() ? req.storageCacheInterf.get().first : Optional<uint16_t>();
auto& cacheInfo = self->id_worker[w.locality.processId()].storageCacheInfo;
if (req.storageCacheInterf.present()) {
auto it = self->db.cacheInterfaces.find(req.storageCacheInterf.get().first);
if(it == self->db.cacheInterfaces.end()) {
if(self->db.cachePopulated) {
if(cacheInfo.present()) {
self->db.clearStorageCache(cacheInfo.get());
}
newStorageCache = Optional<uint16_t>();
cacheInfo = Optional<uint16_t>();
} else {
self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
self->db.cacheInterfaces[req.storageCacheInterf.get().first] = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
cacheInfo = req.storageCacheInterf.get().first;
}
} else {
if(!it->second.second.present() || (cacheInfo.present() && cacheInfo.get() == it->first) ) {
self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
it->second = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
cacheInfo = req.storageCacheInterf.get().first;
}
else {
if(cacheInfo.present()) {
self->db.clearStorageCache(cacheInfo.get());
}
newStorageCache = Optional<uint16_t>();
cacheInfo = Optional<uint16_t>();
}
}
} else {
newStorageCache = cacheInfo;
}
if(self->gotProcessClasses && newProcessClass == ProcessClass::StorageCacheClass && !newStorageCache.present()) {
for(auto& it : self->db.cacheInterfaces) {
if(!it.second.second.present()) {
it.second.second = w.locality.processId();
self->id_worker[w.locality.processId()].storageCacheInfo = it.first;
newStorageCache = it.first;
break;
}
}
}
// Notify the worker to register again with new process class/exclusive property
if ( !req.reply.isSet() && ( newPriorityInfo != req.priorityInfo ||
newStorageCache.present() != req.storageCacheInterf.present() ||
(newStorageCache.present() && newStorageCache.get() != req.storageCacheInterf.get().first) ) ) {
req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo, newStorageCache) );
}
}
#define TIME_KEEPER_VERSION LiteralStringRef("1")
@ -2240,7 +2350,7 @@ ACTOR Future<Void> monitorProcessClasses(ClusterControllerData *self) {
w.second.details.processClass = newProcessClass;
w.second.priorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
if (!w.second.reply.isSet()) {
w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo) );
w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo, w.second.storageCacheInfo) );
}
}
}
@ -2300,6 +2410,80 @@ ACTOR Future<Void> monitorServerInfoConfig(ClusterControllerData::DBInfo* db) {
}
}
// Keeps the cluster controller's view of storage cache assignments (self->db.cacheInterfaces)
// in sync with the set of cache IDs listed under cacheChangeKeys in the system keyspace, and
// assigns any unowned cache ID to a registered worker of StorageCacheClass.  Re-runs whenever
// cacheChangeKey changes.
ACTOR Future<Void> monitorStorageCache(ClusterControllerData* self) {
loop {
state ReadYourWritesTransaction tr(self->db.db);
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
// NOTE(review): changeVal itself is never used; presumably this read exists only for
// its transaction semantics alongside the watch set below -- confirm.
Optional<Value> changeVal = wait(tr.get(cacheChangeKey));
Standalone<RangeResultRef> changeKeys = wait(tr.getRange(cacheChangeKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !changeKeys.more && changeKeys.size() < CLIENT_KNOBS->TOO_MANY );
// Decode the set of cache IDs currently requested in the database.
std::set<uint16_t> changeIDs;
for(auto& it : changeKeys) {
changeIDs.insert(cacheChangeKeyDecodeIndex(it.key));
}
// Begin tracking newly requested IDs with no interface and no owning worker yet.
for(auto& it : changeIDs) {
if(!self->db.cacheInterfaces.count(it)) {
self->db.cacheInterfaces[it] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
}
}
// Drop IDs that are no longer requested: detach the owning worker (if any) and
// clear the cached interface from the served DB info.
std::vector<uint16_t> removeIDs;
for(auto& it : self->db.cacheInterfaces) {
if(!changeIDs.count(it.first)) {
removeIDs.push_back(it.first);
if(it.second.second.present()) {
self->id_worker[it.second.second.get()].storageCacheInfo = Optional<uint16_t>();
}
self->db.clearStorageCache(it.first);
}
}
// Erase outside the iteration loop above to avoid invalidating its iterator.
for(auto& it : removeIDs) {
self->db.cacheInterfaces.erase(it);
}
// Assign each unowned cache ID to an unassigned StorageCacheClass worker and tell
// that worker to re-register carrying its new cache ID.
for(auto& c : self->db.cacheInterfaces) {
if(!c.second.second.present()) {
bool found = false;
for(auto& it : self->id_worker) {
if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
found = true;
it.second.storageCacheInfo = c.first;
c.second.second = it.first;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, c.first) );
}
break;
}
}
// No eligible worker remains, so later unowned IDs cannot be assigned either.
if(!found) {
break;
}
}
}
// Block until the change key is written again, then reconcile once more.
state Future<Void> configChangeFuture = tr.watch(cacheChangeKey);
self->db.cachePopulated = true;
wait(tr.commit());
wait(configChangeFuture);
break;
}
catch (Error &e) {
wait(tr.onError(e));
}
}
}
}
ACTOR Future<Void> monitorClientTxnInfoConfigs(ClusterControllerData::DBInfo* db) {
loop {
state ReadYourWritesTransaction tr(db->db);
@ -2350,7 +2534,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
if ( worker.priorityInfo.dcFitness > newFitness ) {
worker.priorityInfo.dcFitness = newFitness;
if(!worker.reply.isSet()) {
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
}
} else {
state int currentFit = ProcessClass::BestFit;
@ -2363,7 +2547,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
updated = true;
it.second.priorityInfo.dcFitness = fitness;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
}
}
}
@ -2402,7 +2586,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
if( worker.priorityInfo.dcFitness != newFitness ) {
worker.priorityInfo.dcFitness = newFitness;
if(!worker.reply.isSet()) {
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
}
}
} else {
@ -2416,7 +2600,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
updated = true;
it.second.priorityInfo.dcFitness = fitness;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
}
}
}
@ -2703,8 +2887,8 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
self.addActor.send( handleForcedRecoveries(&self, interf) );
self.addActor.send( monitorDataDistributor(&self) );
self.addActor.send( monitorRatekeeper(&self) );
self.addActor.send( monitorStorageCache(&self) );
self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
//printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
loop choose {

View File

@ -175,13 +175,14 @@ struct RegisterWorkerReply {
constexpr static FileIdentifier file_identifier = 16475696;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Optional<uint16_t> storageCache;
RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo) : processClass(processClass), priorityInfo(priorityInfo) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional<uint16_t> storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, processClass, priorityInfo);
serializer(ar, processClass, priorityInfo, storageCache);
}
};
@ -194,16 +195,17 @@ struct RegisterWorkerRequest {
Generation generation;
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf;
ReplyPromise<RegisterWorkerReply> reply;
bool degraded;
RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), degraded(degraded) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, reply, degraded);
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, reply, degraded);
}
};

View File

@ -4986,7 +4986,6 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") {
state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize;
state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
state int teamSize = 3;
state int targetTeamsPerServer = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (teamSize + 1) / 2;
state DDTeamCollection* collection = testTeamCollection(teamSize, policy, processSize);
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);

View File

@ -443,6 +443,7 @@ Future<Void> shardMerger(
bool forwardComplete = false;
KeyRangeRef merged;
StorageMetrics endingStats = shardSize->get().get();
int64_t systemBytes = keys.begin >= systemKeys.begin ? shardSize->get().get().bytes : 0;
loop {
Optional<StorageMetrics> newMetrics;
@ -480,6 +481,9 @@ Future<Void> shardMerger(
merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
endingStats += newMetrics.get();
if((forwardComplete ? prevIter->range().begin : nextIter->range().begin) >= systemKeys.begin) {
systemBytes += newMetrics.get().bytes;
}
shardsMerged++;
auto shardBounds = getShardSizeBounds( merged, maxShardSize );
@ -498,6 +502,9 @@ Future<Void> shardMerger(
// If going forward, remove most recently added range
endingStats -= newMetrics.get();
if(nextIter->range().begin >= systemKeys.begin) {
systemBytes -= newMetrics.get().bytes;
}
shardsMerged--;
--nextIter;
merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end );
@ -514,6 +521,9 @@ Future<Void> shardMerger(
.detail("EndingSize", endingStats.bytes)
.detail("BatchedMerges", shardsMerged);
if(mergeRange.begin < systemKeys.begin) {
self->systemSizeEstimate -= systemBytes;
}
restartShardTrackers( self, mergeRange, endingStats );
self->shardsAffectedByTeamFailure->defineShard( mergeRange );
self->output.send( RelocateShard( mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD ) );

View File

@ -1,5 +1,5 @@
/*
* MutablePrefixTree.h
* DeltaTree.h
*
* This source file is part of the FoundationDB open source project
*
@ -24,9 +24,85 @@
#include "flow/Arena.h"
#include "fdbclient/FDBTypes.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/PrefixTree.h"
#include <string.h>
typedef uint64_t Word;
// Returns the length in bytes of the common prefix of the byte arrays ap and bp,
// comparing at most cl bytes.  Scans one machine Word (8 bytes) at a time, then
// finishes the tail byte-by-byte.
// NOTE(review): ctzll(a ^ b) / 8 locates the first differing byte counting from the
// lowest-order byte, which is the first byte in memory on little-endian hosts only;
// confirm if a big-endian target is ever supported.  Unaligned Word loads are also
// assumed safe on the target platform.
static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) {
int i = 0;
// Highest index from which a full Word can still be read without passing cl bytes.
// (For cl < sizeof(Word) this is <= 0 in practice, so the word loop is skipped.)
const int wordEnd = cl - sizeof(Word) + 1;
for(; i < wordEnd; i += sizeof(Word)) {
Word a = *(Word *)ap;
Word b = *(Word *)bp;
if(a != b) {
// First differing byte within this word.
return i + ctzll(a ^ b) / 8;
}
ap += sizeof(Word);
bp += sizeof(Word);
}
// Byte-at-a-time tail for the remaining (< sizeof(Word)) bytes.
for (; i < cl; i++) {
if (*ap != *bp) {
return i;
}
++ap;
++bp;
}
return cl;
}
// Convenience overload: common prefix length of two StringRefs, bounded by the
// shorter string's size.
static int commonPrefixLength(StringRef a, StringRef b) {
return commonPrefixLength(a.begin(), b.begin(), std::min(a.size(), b.size()));
}
// Returns the largest power of two that is <= n (for n >= 1).
// For n < 1 the doubling never starts and the result is 1.
// This simple doubling loop benchmarked fastest among the variants tried
// (see the commented-out bit-trick versions below).
static int lessOrEqualPowerOfTwo(int n) {
	int result = 1;
	while (result + result <= n) {
		result += result;
	}
	return result;
}
/*
static int _lessOrEqualPowerOfTwo(uint32_t n) {
if(n == 0)
return n;
int trailing = __builtin_ctz(n);
int leading = __builtin_clz(n);
if(trailing + leading == ((sizeof(n) * 8) - 1))
return n;
return 1 << ( (sizeof(n) * 8) - leading - 1);
}
static int __lessOrEqualPowerOfTwo(unsigned int n) {
int p = 1;
for(; p <= n; p <<= 1);
return p >> 1;
}
*/
static int perfectSubtreeSplitPoint(int subtree_size) {
// return the inorder index of the root node in a subtree of the given size
// consistent with the resulting binary search tree being "perfect" (having minimal height
// and all missing nodes as far right as possible).
// There has to be a simpler way to do this.
// s is one less than the largest power of two not exceeding the rounded-up half of
// the subtree size, i.e. the size of a maximal full left subtree.
int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1;
// Cap the split point so the right subtree keeps at least its required s nodes.
return std::min(s * 2 + 1, subtree_size - s - 1);
}
// Memoized wrapper for perfectSubtreeSplitPoint(): sizes below 'max' are answered from a
// lazily built lookup table; larger sizes fall through to the direct computation.
static int perfectSubtreeSplitPointCached(int subtree_size) {
// The table is built once on first use and intentionally never freed (process lifetime).
// NOTE(review): this lazy init of a raw pointer is not protected against concurrent
// first calls; presumably only ever invoked from the single flow network thread -- confirm.
static uint16_t *points = nullptr;
static const int max = 500;
if(points == nullptr) {
points = new uint16_t[max];
for(int i = 0; i < max; ++i)
points[i] = perfectSubtreeSplitPoint(i);
}
if(subtree_size < max)
return points[subtree_size];
return perfectSubtreeSplitPoint(subtree_size);
}
// Delta Tree is a memory mappable binary tree of T objects such that each node's item is
// stored as a Delta which can reproduce the node's T item given the node's greatest
// lesser ancestor and the node's least greater ancestor.
@ -209,7 +285,7 @@ public:
}
};
// Cursor provides a way to seek into a PrefixTree and iterate over its contents
// Cursor provides a way to seek into a DeltaTree and iterate over its contents
// All Cursors from a Reader share the same decoded node 'cache' (tree of DecodedNodes)
struct Cursor {
Cursor() : reader(nullptr), node(nullptr) {
@ -342,7 +418,7 @@ public:
// The boundary leading to the new page acts as the last time we branched right
if(begin != end) {
nodeBytes = build(root(), begin, end, prev, next);
nodeBytes = build(root(), begin, end, prev, next, prev->getCommonPrefixLen(*next, 0));
}
else {
nodeBytes = 0;
@ -351,7 +427,7 @@ public:
}
private:
static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next) {
static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next, int subtreeCommon) {
//printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str());
//printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), &root.delta());
ASSERT(end != begin);
@ -361,12 +437,8 @@ private:
int mid = perfectSubtreeSplitPointCached(count);
const T &item = begin[mid];
// Get the common prefix length between next and prev
// Since mid is between them, we can skip that length to determine the common prefix length
// between mid and prev and between mid and next.
int nextPrevCommon = prev->getCommonPrefixLen(*next, 0);
int commonWithPrev = item.getCommonPrefixLen(*prev, nextPrevCommon);
int commonWithNext = item.getCommonPrefixLen(*next, nextPrevCommon);
int commonWithPrev = item.getCommonPrefixLen(*prev, subtreeCommon);
int commonWithNext = item.getCommonPrefixLen(*next, subtreeCommon);
bool prefixSourcePrev;
int commonPrefix;
@ -391,7 +463,7 @@ private:
// Serialize left child
if(count > 1) {
wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item);
wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item, commonWithPrev);
root.leftChildOffset = deltaSize;
}
else {
@ -401,7 +473,7 @@ private:
// Serialize right child
if(count > 2) {
root.rightChildOffset = wptr - (uint8_t *)&root.delta();
wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next);
wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next, commonWithNext);
}
else {
root.rightChildOffset = 0;

View File

@ -142,7 +142,7 @@ ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> par
#endif
ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::string folder, std::string role) {
state Standalone<StringRef> uidStr = snapUID.toString();
state Standalone<StringRef> uidStr(snapUID.toString());
state int err = 0;
state Future<int> cmdErr;
state double maxWaitTime = SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT;

View File

@ -29,25 +29,33 @@
#define REDWOOD_DEBUG 0
#define debug_printf_always(...) { fprintf(stdout, "%s %f ", g_network->getLocalAddress().toString().c_str(), now()), fprintf(stdout, __VA_ARGS__); fflush(stdout); }
#define debug_printf_stream stderr
#define debug_printf_always(...) { fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); fprintf(debug_printf_stream, __VA_ARGS__); fflush(debug_printf_stream); }
#define debug_printf_noop(...)
#if REDWOOD_DEBUG
#define debug_printf debug_printf_always
#if defined(NO_INTELLISENSE)
#if REDWOOD_DEBUG
#define debug_printf debug_printf_always
#else
#define debug_printf debug_printf_noop
#endif
#else
#define debug_printf debug_printf_noop
// To get error-checking on debug_printf statements in IDE
#define debug_printf printf
#endif
#define BEACON fprintf(stderr, "%s: %s line %d \n", __FUNCTION__, __FILE__, __LINE__)
#define BEACON debug_printf_always("HERE\n")
#define TRACE debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str());
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
#endif
typedef uint32_t LogicalPageID; // uint64_t?
static const int invalidLogicalPageID = LogicalPageID(-1);
typedef uint32_t LogicalPageID;
typedef uint32_t PhysicalPageID;
#define invalidLogicalPageID std::numeric_limits<LogicalPageID>::max()
class IPage {
public:
@ -78,72 +86,96 @@ public:
class IPagerSnapshot {
public:
virtual Future<Reference<const IPage>> getPhysicalPage(LogicalPageID pageID) = 0;
virtual Future<Reference<const IPage>> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0;
virtual Version getVersion() const = 0;
virtual Key getMetaKey() const = 0;
virtual ~IPagerSnapshot() {}
virtual void addref() = 0;
virtual void delref() = 0;
};
class IPager : public IClosable {
// This API is probably customized to the behavior of DWALPager and probably needs some changes to be more generic.
class IPager2 : public IClosable {
public:
// Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed.
virtual Reference<IPage> newPageBuffer() = 0;
// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
// For a given pager instance, separate calls to this function must return the same value.
// Only valid to call after recovery is complete.
virtual int getUsablePageSize() = 0;
virtual StorageBytes getStorageBytes() = 0;
// Permitted to fail (ASSERT) during recovery.
virtual Reference<IPagerSnapshot> getReadSnapshot(Version version) = 0;
// Allocate a new page ID for a subsequent write. The page will be considered in-use after the next commit
// regardless of whether or not it was written to.
virtual Future<LogicalPageID> newPageID() = 0;
// Returns an unused LogicalPageID.
// LogicalPageIDs in the range [0, SERVER_KNOBS->PAGER_RESERVED_PAGES) do not need to be allocated.
// Permitted to fail (ASSERT) during recovery.
virtual LogicalPageID allocateLogicalPage() = 0;
// Replace the contents of a page with new data across *all* versions.
// Existing holders of a page reference for pageID, read from any version,
// may see the effects of this write.
virtual void updatePage(LogicalPageID pageID, Reference<IPage> data) = 0;
// Signals that the page will no longer be used as of the specified version. Versions prior to the specified version must be kept.
// Permitted to fail (ASSERT) during recovery.
virtual void freeLogicalPage(LogicalPageID pageID, Version version) = 0;
// Try to atomically update the contents of a page as of version v in the next commit.
// If the pager is unable to do this at this time, it may choose to write the data to a new page ID
// instead and return the new page ID to the caller. Otherwise the original pageID argument will be returned.
// If a new page ID is returned, the old page ID will be freed as of version v
virtual Future<LogicalPageID> atomicUpdatePage(LogicalPageID pageID, Reference<IPage> data, Version v) = 0;
// Writes a page with the given LogicalPageID at the specified version. LogicalPageIDs in the range [0, SERVER_KNOBS->PAGER_RESERVED_PAGES)
// can be written without being allocated. All other LogicalPageIDs must be allocated using allocateLogicalPage before writing them.
//
// If updateVersion is 0, we are signalling to the pager that we are reusing the LogicalPageID entry at the current latest version of pageID.
//
// Otherwise, we will add a new entry for LogicalPageID at the specified version. In that case, updateVersion must be larger than any version
// written to this page previously, and it must be larger than any version committed. If referencePageID is given, the latest version of that
// page will be used for the write, which *can* be less than the latest committed version.
//
// Permitted to fail (ASSERT) during recovery.
virtual void writePage(LogicalPageID pageID, Reference<IPage> contents, Version updateVersion, LogicalPageID referencePageID = invalidLogicalPageID) = 0;
// Free pageID to be used again after the commit that moves oldestVersion past v
virtual void freePage(LogicalPageID pageID, Version v) = 0;
// Signals to the pager that no more reads will be performed in the range [begin, end).
// Permitted to fail (ASSERT) during recovery.
virtual void forgetVersions(Version begin, Version end) = 0;
// Returns the latest data (regardless of version) for a page by LogicalPageID
// The data returned will be the later of
// - the most recent committed atomic
// - the most recent non-atomic write
// Cacheable indicates that the page should be added to the page cache (if applicable?) as a result of this read.
// NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are
// considered likely to be needed soon.
virtual Future<Reference<IPage>> readPage(LogicalPageID pageID, bool cacheable = true, bool noHit = false) = 0;
// Makes durable all writes and any data structures used for recovery.
// Permitted to fail (ASSERT) during recovery.
// Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion()
// Note that snapshots at any version may still see the results of updatePage() calls.
// The snapshot shall be usable until setOldVersion() is called with a version > v.
virtual Reference<IPagerSnapshot> getReadSnapshot(Version v) = 0;
// Atomically make durable all pending page writes, page frees, and update the metadata string.
virtual Future<Void> commit() = 0;
// Returns the latest version of the pager. Permitted to block until recovery is complete, at which point it should always be set immediately.
// Some functions in the IPager interface are permitted to fail (ASSERT) during recovery, so users should wait for getLatestVersion to complete
// before doing anything else.
virtual Future<Version> getLatestVersion() = 0;
// Get the latest meta key set or committed
virtual Key getMetaKey() const = 0;
// Sets the latest version of the pager. Must be monotonically increasing.
//
// Must be called prior to reading the specified version. SOMEDAY: It may be desirable in the future to relax this constraint for performance reasons.
//
// Permitted to fail (ASSERT) during recovery.
virtual void setLatestVersion(Version version) = 0;
// Set the metakey which will be stored in the next commit
virtual void setMetaKey(KeyRef metaKey) = 0;
// Sets the next commit version
virtual void setCommitVersion(Version v) = 0;
virtual StorageBytes getStorageBytes() = 0;
// Count of pages in use by the pager client
virtual Future<int64_t> getUserPageCount() = 0;
// Future returned is ready when pager has been initialized from disk and is ready for reads and writes.
// It is invalid to call most other functions until init() is ready.
// TODO: Document further.
virtual Future<Void> init() = 0;
// Returns latest committed version
virtual Version getLatestVersion() = 0;
// Returns the oldest readable version as of the most recent committed version
virtual Version getOldestVersion() = 0;
// Sets the oldest readable version to be put into affect at the next commit.
// The pager can reuse pages that were freed at a version less than v.
// If any snapshots are in use at a version less than v, the pager can either forcefully
// invalidate them or keep their versions around until the snapshots are no longer in use.
virtual void setOldestVersion(Version v) = 0;
protected:
~IPager() {} // Destruction should be done using close()/dispose() from the IClosable interface
~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface
};
#endif

View File

@ -30,22 +30,17 @@
class IStoreCursor {
public:
virtual Future<Void> findEqual(KeyRef key) = 0;
virtual Future<Void> findFirstEqualOrGreater(KeyRef key, bool needValue, int prefetchNextBytes) = 0;
virtual Future<Void> findLastLessOrEqual(KeyRef key, bool needValue, int prefetchPriorBytes) = 0;
virtual Future<Void> next(bool needValue) = 0;
virtual Future<Void> prev(bool needValue) = 0;
virtual Future<Void> findFirstEqualOrGreater(KeyRef key, int prefetchBytes = 0) = 0;
virtual Future<Void> findLastLessOrEqual(KeyRef key, int prefetchBytes = 0) = 0;
virtual Future<Void> next() = 0;
virtual Future<Void> prev() = 0;
virtual bool isValid() = 0;
virtual KeyRef getKey() = 0;
//virtual StringRef getCompressedKey() = 0;
virtual ValueRef getValue() = 0;
virtual void invalidateReturnedStrings() = 0;
virtual void addref() = 0;
virtual void delref() = 0;
virtual std::string toString() const = 0;
};
class IVersionedStore : public IClosable {
@ -61,10 +56,12 @@ public:
virtual void clear(KeyRangeRef range) = 0;
virtual void mutate(int op, StringRef param1, StringRef param2) = 0;
virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing
virtual void forgetVersions(Version begin, Version end) = 0; // Versions [begin, end) no longer readable
virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit
virtual Version getOldestVersion() = 0; // Get oldest readable version
virtual Future<Void> commit() = 0;
virtual Future<Version> getLatestVersion() = 0;
virtual Future<Void> init() = 0;
virtual Version getLatestVersion() = 0;
// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed
// to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.

View File

@ -1,960 +0,0 @@
/*
* IndirectShadowPager.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/IndirectShadowPager.h"
#include "fdbserver/Knobs.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h"
#include "fdbrpc/crc32c.h"
// Page checksum footer: a crc32c value stored in the trailing bytes of each page
// (see checksum() below).  Equality compares only the crc.
struct SumType {
bool operator==(const SumType &rhs) const { return crc == rhs.crc; }
uint32_t crc;
std::string toString() { return format("0x%08x", crc); }
};
// Computes (write == true) or verifies (write == false) the crc32c checksum stored in the
// last PAGE_OVERHEAD_BYTES of 'page'.  The crc is seeded with the logical page ID, so a
// page read back from the wrong logical location fails verification.  Returns false
// (after logging SevError) on a verification mismatch, true otherwise.  'file' and
// 'physical' are used only for diagnostics.
bool checksum(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical, bool write) {
// Calculates and then stores or verifies the checksum at the end of the page.
// If write is true then the checksum is written into the page
// If write is false then the checksum is compared to the in-page sum and
// and error will be thrown if they do not match.
ASSERT(sizeof(SumType) == IndirectShadowPage::PAGE_OVERHEAD_BYTES);
// Adjust pageSize to refer to only usable storage bytes
pageSize -= IndirectShadowPage::PAGE_OVERHEAD_BYTES;
SumType sum;
// The stored checksum lives immediately after the usable bytes.
SumType *pSumInPage = (SumType *)(page + pageSize);
// Write sum directly to page or to sum variable based on mode
SumType *sumOut = write ? pSumInPage : &sum;
sumOut->crc = crc32c_append(logical, page, pageSize);
VALGRIND_MAKE_MEM_DEFINED(sumOut, sizeof(SumType));
debug_printf("checksum %s%s logical %d physical %d size %d checksums page %s calculated %s data at %p %s\n",
write ? "write" : "read",
(!write && sum != *pSumInPage) ? " MISMATCH" : "",
logical, physical, pageSize,
write ? "NA" : pSumInPage->toString().c_str(),
sumOut->toString().c_str(), page, "");
// Verify if not in write mode
if(!write && sum != *pSumInPage) {
TraceEvent (SevError, "IndirectShadowPagerPageChecksumFailure")
.detail("UserPageSize", pageSize)
.detail("Filename", file->getFilename())
.detail("LogicalPage", logical)
.detail("PhysicalPage", physical)
.detail("ChecksumInPage", pSumInPage->toString())
.detail("ChecksumCalculated", sum.toString());
return false;
}
return true;
}
// Verify the in-page checksum; returns false on mismatch (see checksum()).
inline bool checksumRead(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical) {
return checksum(file, page, pageSize, logical, physical, false);
}
// Compute and store the checksum into the page tail (see checksum()).
inline void checksumWrite(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical) {
checksum(file, page, pageSize, logical, physical, true);
}
// Allocates the page buffer from the 4K fast allocator; fastAllocated marks that the
// destructor must return it to FastAllocator rather than via the file's zero-copy release.
IndirectShadowPage::IndirectShadowPage() : fastAllocated(true) {
data = (uint8_t*)FastAllocator<4096>::allocate();
}
// Releases the page buffer: fast-allocated pages go back to FastAllocator; otherwise, if
// the page was obtained zero-copy from a file, release that mapping at its physical offset.
// NOTE(review): if neither fastAllocated nor file is set, 'data' is not freed here --
// presumably such pages are owned elsewhere; confirm.
IndirectShadowPage::~IndirectShadowPage() {
if(fastAllocated) {
FastAllocator<4096>::release(data);
}
else if(file) {
file->releaseZeroCopy(data, PAGE_BYTES, (int64_t) physicalPageID * PAGE_BYTES);
}
}
// Read-only pointer to the start of the page contents.
uint8_t const* IndirectShadowPage::begin() const {
return data;
}
// Mutable pointer to the start of the page contents.
uint8_t* IndirectShadowPage::mutate() {
return data;
}
// Usable bytes per page: the raw page size minus the checksum footer.
int IndirectShadowPage::size() const {
return PAGE_BYTES - PAGE_OVERHEAD_BYTES;
}
const int IndirectShadowPage::PAGE_BYTES = 4096;
const int IndirectShadowPage::PAGE_OVERHEAD_BYTES = sizeof(SumType);
// Snapshot of the pager at a fixed version; captures the pager's error future so reads
// issued after a pager failure surface that error (see getPhysicalPage()).
IndirectShadowPagerSnapshot::IndirectShadowPagerSnapshot(IndirectShadowPager *pager, Version version)
: pager(pager), version(version), pagerError(pager->getError())
{
}
// Reads a page as of this snapshot's version.  If the pager has already failed,
// pagerError.get() rethrows its error immediately instead of issuing the read.
Future<Reference<const IPage>> IndirectShadowPagerSnapshot::getPhysicalPage(LogicalPageID pageID) {
if(pagerError.isReady())
pagerError.get();
return pager->getPage(Reference<IndirectShadowPagerSnapshot>::addRef(this), pageID, version);
}
// Converts an integral value of up to 8 bytes to its big-endian form by byte-swapping it
// as a 64-bit value and reading back the last sizeof(T) bytes of that buffer.
// NOTE(review): taking the trailing bytes of the swapped 64-bit buffer yields the intended
// result on little-endian hosts; confirm behavior if a big-endian target is ever supported.
template <class T>
T bigEndian(T val) {
static_assert(sizeof(T) <= 8, "Can't compute bigEndian on integers larger than 8 bytes");
uint64_t b = bigEndian64(val);
return *(T*)((uint8_t*)&b+8-sizeof(T));
}
// Recover pager state: open (or create) the data file, then rebuild the
// in-memory page table, version bounds, and the logical/physical free lists
// from the page table log.  Runs once at construction; most operations
// ASSERT(recovery.isReady()) before proceeding.
ACTOR Future<Void> recover(IndirectShadowPager *pager) {
	try {
		TraceEvent("PagerRecovering").detail("Filename", pager->pageFileName);
		pager->pageTableLog = keyValueStoreMemory(pager->basename, UID(), 1e9, "pagerlog");

		// TODO: this can be done synchronously with the log recovery
		int64_t flags = IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK;
		state bool exists = fileExists(pager->pageFileName);
		if(!exists) {
			flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE;
		}

		Reference<IAsyncFile> dataFile = wait(IAsyncFileSystem::filesystem()->open(pager->pageFileName, flags, 0600));
		pager->dataFile = dataFile;

		TraceEvent("PagerOpenedDataFile").detail("Filename", pager->pageFileName);

		// Sync a newly created file so its creation is durable before use.
		if(!exists) {
			wait(pager->dataFile->sync());
		}
		TraceEvent("PagerSyncdDataFile").detail("Filename", pager->pageFileName);

		state int64_t fileSize = wait(pager->dataFile->size());
		TraceEvent("PagerGotFileSize").detail("Size", fileSize).detail("Filename", pager->pageFileName);

		if(fileSize > 0) {
			// Existing pager: rebuild all in-memory state from the page table log.
			TraceEvent("PagerRecoveringFromLogs").detail("Filename", pager->pageFileName);
			Optional<Value> pagesAllocatedValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::PAGES_ALLOCATED_KEY));
			if(pagesAllocatedValue.present()) {
				BinaryReader pr(pagesAllocatedValue.get(), Unversioned());
				uint32_t pagesAllocated;
				pr >> pagesAllocated;
				pager->pagerFile.init(fileSize, pagesAllocated);

				debug_printf("%s: Recovered pages allocated: %d\n", pager->pageFileName.c_str(), pager->pagerFile.pagesAllocated);
				ASSERT(pager->pagerFile.pagesAllocated != PagerFile::INVALID_PAGE);

				// The latest version must have been logged if pages were ever allocated.
				Optional<Value> latestVersionValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::LATEST_VERSION_KEY));
				ASSERT(latestVersionValue.present());

				BinaryReader vr(latestVersionValue.get(), Unversioned());
				vr >> pager->latestVersion;

				// The oldest version is optional; it defaults to 0 if never logged.
				Optional<Value> oldestVersionValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::OLDEST_VERSION_KEY));
				if(oldestVersionValue.present()) {
					BinaryReader vr(oldestVersionValue.get(), Unversioned());
					vr >> pager->oldestVersion;
				}

				debug_printf("%s: Recovered version info: earliest v%lld latest v%lld\n", pager->pageFileName.c_str(), pager->oldestVersion, pager->latestVersion);
				pager->committedVersion = pager->latestVersion;

				// Page table entries are keyed (prefix, big-endian logicalPageID,
				// big-endian version) so they sort by page then version.
				Standalone<VectorRef<KeyValueRef>> tableEntries = wait(pager->pageTableLog->readRange(KeyRangeRef(IndirectShadowPager::TABLE_ENTRY_PREFIX, strinc(IndirectShadowPager::TABLE_ENTRY_PREFIX))));

				if(tableEntries.size() > 0) {
					// The last entry carries the largest logical page ID; size the table from it.
					BinaryReader kr(tableEntries.back().key, Unversioned());

					uint8_t prefix;
					LogicalPageID logicalPageID;

					kr >> prefix;
					ASSERT(prefix == IndirectShadowPager::TABLE_ENTRY_PREFIX.begin()[0]);

					kr >> logicalPageID;
					logicalPageID = bigEndian(logicalPageID);

					LogicalPageID pageTableSize = std::max<LogicalPageID>(logicalPageID+1, SERVER_KNOBS->PAGER_RESERVED_PAGES);
					pager->pageTable.resize(pageTableSize);
					debug_printf("%s: Recovered page table size: %d\n", pager->pageFileName.c_str(), pageTableSize);
				}
				else {
					debug_printf("%s: Recovered no page table entries\n", pager->pageFileName.c_str());
				}

				LogicalPageID nextPageID = SERVER_KNOBS->PAGER_RESERVED_PAGES;
				std::set<PhysicalPageID> allocatedPhysicalPages;
				for(auto entry : tableEntries) {
					// Decode key = (prefix, logicalPageID, version), value = physicalPageID.
					BinaryReader kr(entry.key, Unversioned());
					BinaryReader vr(entry.value, Unversioned());

					uint8_t prefix;
					LogicalPageID logicalPageID;
					Version version;
					PhysicalPageID physicalPageID;

					kr >> prefix;
					ASSERT(prefix == IndirectShadowPager::TABLE_ENTRY_PREFIX.begin()[0]);

					kr >> logicalPageID;
					logicalPageID = bigEndian(logicalPageID);

					kr >> version;
					version = bigEndian(version);
					vr >> physicalPageID;

					ASSERT(version <= pager->latestVersion);

					pager->pageTable[logicalPageID].push_back(std::make_pair(version, physicalPageID));

					if(physicalPageID != PagerFile::INVALID_PAGE) {
						allocatedPhysicalPages.insert(physicalPageID);
						pager->pagerFile.markPageAllocated(logicalPageID, version, physicalPageID);
					}

					// Logical IDs skipped between entries are free.
					while(nextPageID < logicalPageID) {
						pager->logicalFreeList.push_back(nextPageID++);
					}
					if(logicalPageID == nextPageID) {
						++nextPageID;
					}

					debug_printf("%s: Recovered page table entry logical %d -> (v%lld, physical %d)\n", pager->pageFileName.c_str(), logicalPageID, version, physicalPageID);
				}

				debug_printf("%s: Building physical free list\n", pager->pageFileName.c_str());
				// TODO: can we do this better? does it require storing extra info in the log?
				// Every physical page below pagesAllocated not referenced by a page
				// table entry is free.
				PhysicalPageID nextPhysicalPageID = 0;
				for(auto itr = allocatedPhysicalPages.begin(); itr != allocatedPhysicalPages.end(); ++itr) {
					while(nextPhysicalPageID < *itr) {
						pager->pagerFile.freePage(nextPhysicalPageID++);
					}
					++nextPhysicalPageID;
				}

				while(nextPhysicalPageID < pager->pagerFile.pagesAllocated) {
					pager->pagerFile.freePage(nextPhysicalPageID++);
				}
			}
		}

		// Ensure the reserved logical page range always exists.
		if(pager->pageTable.size() < SERVER_KNOBS->PAGER_RESERVED_PAGES) {
			pager->pageTable.resize(SERVER_KNOBS->PAGER_RESERVED_PAGES);
		}

		pager->pagerFile.finishedMarkingPages();
		pager->pagerFile.startVacuuming();

		debug_printf("%s: Finished recovery at v%lld\n", pager->pageFileName.c_str(), pager->latestVersion);
		TraceEvent("PagerFinishedRecovery").detail("LatestVersion", pager->latestVersion).detail("OldestVersion", pager->oldestVersion).detail("Filename", pager->pageFileName);
	}
	catch(Error &e) {
		// Cancellation is expected during shutdown; anything else is a real failure.
		if(e.code() != error_code_actor_cancelled) {
			TraceEvent(SevError, "PagerRecoveryFailed").error(e, true).detail("Filename", pager->pageFileName);
		}
		throw;
	}

	return Void();
}
// Background task: repeatedly walks the whole page table, reclaiming page
// versions older than oldestVersion and repopulating the vacuum queue.
// NOTE(review): the wait(Never()) below means the loop never executes, i.e.
// housekeeping is currently disabled -- confirm this is intentional.
ACTOR Future<Void> housekeeper(IndirectShadowPager *pager) {
	wait(pager->recovery);
	wait(Never());
	loop {
		state LogicalPageID pageID = 0;
		for(; pageID < pager->pageTable.size(); ++pageID) {
			// TODO: pick an appropriate rate for this loop and determine the right way to implement it
			// Right now, this delays 10ms every 400K pages, which means we have 1s of delay for every
			// 40M pages. In total, we introduce 100s delay for a max size 4B page file.
			if(pageID % 400000 == 0) {
				wait(delay(0.01));
			}
			else {
				wait(yield());
			}

			auto& pageVersionMap = pager->pageTable[pageID];

			if(pageVersionMap.size() > 0) {
				auto itr = pageVersionMap.begin();
				// Walk entries older than oldestVersion: free superseded physical
				// pages and trim stale entries from the map and the log.
				for(auto prev = itr; prev != pageVersionMap.end() && prev->first < pager->oldestVersion; prev=itr) {
					pager->pagerFile.markPageAllocated(pageID, itr->first, itr->second);
					++itr;
					if(prev->second != PagerFile::INVALID_PAGE && (itr == pageVersionMap.end() || itr->first <= pager->oldestVersion)) {
						pager->freePhysicalPageID(prev->second);
					}
					if(itr == pageVersionMap.end() || itr->first >= pager->oldestVersion) {
						debug_printf("%s: Updating oldest version for logical %u: v%lld\n", pager->pageFileName.c_str(), pageID, pager->oldestVersion);
						pager->logPageTableClear(pageID, 0, pager->oldestVersion);

						if(itr != pageVersionMap.end() && itr->first > pager->oldestVersion) {
							// Keep 'prev' as the mapping in effect at oldestVersion,
							// re-keyed to oldestVersion.
							debug_printf("%s: Erasing pages to prev from pageVersionMap for %d (itr=%lld, prev=%lld)\n", pager->pageFileName.c_str(), pageID, itr->first, prev->first);
							prev->first = pager->oldestVersion;
							pager->logPageTableUpdate(pageID, pager->oldestVersion, prev->second);
							itr = pageVersionMap.erase(pageVersionMap.begin(), prev);
						}
						else {
							debug_printf("%s: Erasing pages to itr from pageVersionMap for %d (%d) (itr=%lld, prev=%lld)\n", pager->pageFileName.c_str(), pageID, itr == pageVersionMap.end(), itr==pageVersionMap.end() ? -1 : itr->first, prev->first);
							itr = pageVersionMap.erase(pageVersionMap.begin(), itr);
						}
					}
				}

				// Mark the remaining, still-live versions for the vacuum queue.
				for(; itr != pageVersionMap.end(); ++itr) {
					pager->pagerFile.markPageAllocated(pageID, itr->first, itr->second);
				}

				if(pageVersionMap.size() == 0) {
					pager->freeLogicalPageID(pageID);
				}
			}
		}

		pager->pagerFile.finishedMarkingPages();
	}
}
// Wait on f; if it fails with anything other than cancellation, forward the
// error to 'target' (typically the pager's errorPromise) and rethrow.
ACTOR Future<Void> forwardError(Future<Void> f, Promise<Void> target) {
	try {
		wait(f);
	}
	catch(Error &e) {
		if(e.code() != error_code_actor_cancelled && target.canBeSet()) {
			target.sendError(e);
		}
		throw e;
	}

	return Void();
}
// Construction immediately starts recovery and housekeeping; their errors are
// routed to errorPromise so callers observe them via getError().
IndirectShadowPager::IndirectShadowPager(std::string basename)
	: basename(basename), latestVersion(0), committedVersion(0), committing(Void()), oldestVersion(0), pagerFile(this)
{
	pageFileName = basename;
	recovery = forwardError(recover(this), errorPromise);
	housekeeping = forwardError(housekeeper(this), errorPromise);
}
// Report disk usage: volume free/total bytes, bytes used by the page file, and
// an estimate of bytes available to this pager (volume free space plus the
// pager's own free pages).
StorageBytes IndirectShadowPager::getStorageBytes() {
	int64_t free;
	int64_t total;
	g_network->getDiskBytes(parentDirectory(basename), free, total);

	// Cast before multiplying: PAGE_BYTES (int) * getFreePages() (uint32_t)
	// would otherwise be computed in 32 bits and wrap once the pager holds
	// roughly 4GiB worth of free pages, under-reporting available space.
	return StorageBytes(free, total, pagerFile.size(), free + (int64_t)IndirectShadowPage::PAGE_BYTES * pagerFile.getFreePages());
}
// Allocate a fresh, fast-allocated page buffer for use with writePage().
Reference<IPage> IndirectShadowPager::newPageBuffer() {
	return Reference<IPage>(new IndirectShadowPage());
}

// Bytes per page available to callers: page size minus checksum overhead.
int IndirectShadowPager::getUsablePageSize() {
	return IndirectShadowPage::PAGE_BYTES - IndirectShadowPage::PAGE_OVERHEAD_BYTES;
}
// Create a read snapshot at 'version', which must lie within the
// [oldestVersion, latestVersion] window still retained by the pager.
Reference<IPagerSnapshot> IndirectShadowPager::getReadSnapshot(Version version) {
	debug_printf("%s: Getting read snapshot v%lld latest v%lld oldest v%lld\n", pageFileName.c_str(), version, latestVersion, oldestVersion);
	ASSERT(recovery.isReady());
	ASSERT(version <= latestVersion);
	ASSERT(version >= oldestVersion);

	return Reference<IPagerSnapshot>(new IndirectShadowPagerSnapshot(this, version));
}
// Hand out a logical page ID, preferring recycled IDs from the free list and
// growing the page table otherwise.  Never returns a reserved page ID.
LogicalPageID IndirectShadowPager::allocateLogicalPage() {
	ASSERT(recovery.isReady());

	LogicalPageID allocatedPage;
	if(logicalFreeList.empty()) {
		// No recycled IDs available; extend the table by one entry.
		ASSERT(pageTable.size() < std::numeric_limits<LogicalPageID>::max()); // TODO: different error?
		allocatedPage = pageTable.size();
		pageTable.push_back(PageVersionMap());
	}
	else {
		allocatedPage = logicalFreeList.front();
		logicalFreeList.pop_front();
	}

	ASSERT(allocatedPage >= SERVER_KNOBS->PAGER_RESERVED_PAGES);
	debug_printf("%s: op=allocate id=%u\n", pageFileName.c_str(), allocatedPage);
	return allocatedPage;
}
// Delete a logical page as of 'version' (0 means as of its latest version).
// All mappings at or after 'version' are removed and their physical pages
// freed; if older versions remain readable, a deletion marker (INVALID_PAGE)
// is appended at 'version'.
void IndirectShadowPager::freeLogicalPage(LogicalPageID pageID, Version version) {
	ASSERT(recovery.isReady());
	ASSERT(committing.isReady());

	ASSERT(pageID < pageTable.size());
	PageVersionMap &pageVersionMap = pageTable[pageID];
	ASSERT(!pageVersionMap.empty());

	// 0 will mean delete as of latest version, similar to write at latest version
	if(version == 0) {
		version = pageVersionMap.back().first;
	}

	auto itr = pageVersionMapLowerBound(pageVersionMap, version);

	// TODO: Is this correct, that versions from the past *forward* can be deleted?
	for(auto i = itr; i != pageVersionMap.end(); ++i) {
		freePhysicalPageID(i->second);
	}

	if(itr != pageVersionMap.end()) {
		debug_printf("%s: Clearing newest versions for logical %u: v%lld\n", pageFileName.c_str(), pageID, version);
		logPageTableClearToEnd(pageID, version);
		pageVersionMap.erase(itr, pageVersionMap.end());
	}

	if(pageVersionMap.size() == 0) {
		// Nothing remains; recycle the logical page ID.
		debug_printf("%s: Freeing logical %u (freeLogicalPage)\n", pageFileName.c_str(), pageID);
		logicalFreeList.push_back(pageID);
	}
	else if(pageVersionMap.back().second != PagerFile::INVALID_PAGE) {
		// Older versions remain; mark the page deleted from 'version' on.
		pageVersionMap.push_back(std::make_pair(version, PagerFile::INVALID_PAGE));
		logPageTableUpdate(pageID, version, PagerFile::INVALID_PAGE);
	}
}
// Free a physical page only after 'canFree' fires (e.g. after the page map
// change that orphaned it has been committed).
ACTOR Future<Void> waitAndFreePhysicalPageID(IndirectShadowPager *pager, PhysicalPageID pageID, Future<Void> canFree) {
	wait(canFree);
	pager->pagerFile.freePage(pageID);
	return Void();
}

// TODO: Freeing physical pages must be done *after* committing the page map changes that cause the physical page to no longer be used.
// Otherwise, the physical page could be reused by a write followed by a power loss in which case the mapping change would not
// have been committed and so the physical page should still contain its previous data but it's been overwritten.
void IndirectShadowPager::freePhysicalPageID(PhysicalPageID pageID) {
	debug_printf("%s: Freeing physical %u\n", pageFileName.c_str(), pageID);
	pagerFile.freePage(pageID);
}
// Write 'contents' as the given logical page at 'updateVersion'.
// updateVersion == 0 means "replace the latest existing version in place";
// if the page has no versions yet, the latest version number is taken from
// referencePageID and the write becomes a normal versioned write.
// Always writes to a freshly allocated physical page (shadow paging); the
// write itself is asynchronous and tracked in writeActors.
void IndirectShadowPager::writePage(LogicalPageID pageID, Reference<IPage> contents, Version updateVersion, LogicalPageID referencePageID) {
	ASSERT(recovery.isReady());
	ASSERT(committing.isReady());
	ASSERT(updateVersion > latestVersion || updateVersion == 0);
	ASSERT(pageID < pageTable.size());

	PageVersionMap &pageVersionMap = pageTable[pageID];
	// A page with a deletion marker as its newest entry must not be written to.
	ASSERT(pageVersionMap.empty() || pageVersionMap.back().second != PagerFile::INVALID_PAGE);

	// TODO: should this be conditional on the write succeeding?
	bool updateExisting = updateVersion == 0;
	if(updateExisting) {
		// If there is no existing latest version to update then there must be a referencePageID from which to get a latest version
		// so get that version and change this to a normal update
		if(pageVersionMap.empty()) {
			ASSERT(referencePageID != invalidLogicalPageID);
			PageVersionMap &rpv = pageTable[referencePageID];
			ASSERT(!rpv.empty());
			updateVersion = rpv.back().first;
			updateExisting = false;
		}
		else {
			ASSERT(pageVersionMap.size());
			updateVersion = pageVersionMap.back().first;
		}
	}

	PhysicalPageID physicalPageID = pagerFile.allocatePage(pageID, updateVersion);

	debug_printf("%s: Writing logical %d v%lld physical %d\n", pageFileName.c_str(), pageID, updateVersion, physicalPageID);

	if(updateExisting) {
		// TODO: Physical page cannot be freed now, it must be done after the page mapping change above is committed
		//freePhysicalPageID(pageVersionMap.back().second);
		pageVersionMap.back().second = physicalPageID;
	}
	else {
		ASSERT(pageVersionMap.empty() || pageVersionMap.back().first < updateVersion);
		pageVersionMap.push_back(std::make_pair(updateVersion, physicalPageID));
	}

	logPageTableUpdate(pageID, updateVersion, physicalPageID);

	// Stamp the checksum into the buffer, then start the async write; holdWhile
	// keeps the page buffer alive until the write completes.
	checksumWrite(dataFile.getPtr(), contents->mutate(), IndirectShadowPage::PAGE_BYTES, pageID, physicalPageID);

	Future<Void> write = holdWhile(contents, dataFile->write(contents->begin(), IndirectShadowPage::PAGE_BYTES, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES));

	if(write.isError()) {
		if(errorPromise.canBeSet()) {
			errorPromise.sendError(write.getError());
		}
		throw write.getError();
	}
	writeActors.add(forwardError(write, errorPromise));
}
// Advance the oldest retained version.  Only forgetting from the current
// oldest version forward is supported.
void IndirectShadowPager::forgetVersions(Version begin, Version end) {
	ASSERT(recovery.isReady());
	ASSERT(begin <= end);
	ASSERT(end <= latestVersion);

	// TODO: support forgetting arbitrary ranges
	if(begin <= oldestVersion) {
		if(oldestVersion < end) {
			oldestVersion = end;
		}
		logVersion(OLDEST_VERSION_KEY, oldestVersion);
	}
}
// One commit: wait for the previous commit to finish, record the commit
// version in the page table log, then wait for outstanding page writes and
// sync the data file before committing the log.
ACTOR Future<Void> commitImpl(IndirectShadowPager *pager, Future<Void> previousCommit) {
	// Capture the writes and version outstanding at the time commit was requested.
	state Future<Void> outstandingWrites = pager->writeActors.signalAndCollapse();
	state Version commitVersion = pager->latestVersion;

	wait(previousCommit);

	pager->logVersion(IndirectShadowPager::LATEST_VERSION_KEY, commitVersion);

	// TODO: we need to prevent writes that happen now from being committed in the subsequent log commit
	// This is probably best done once we have better control of the log, where we can write a commit entry
	// here without syncing the file.

	wait(outstandingWrites);
	wait(pager->dataFile->sync());
	wait(pager->pageTableLog->commit());

	pager->committedVersion = std::max(pager->committedVersion, commitVersion);

	return Void();
}
// Start a commit, serialized behind any commit already in flight.
Future<Void> IndirectShadowPager::commit() {
	ASSERT(recovery.isReady());
	committing = commitImpl(this, committing);
	return committing;
}
// Set the in-memory latest version; it becomes durable at the next commit().
void IndirectShadowPager::setLatestVersion(Version version) {
	ASSERT(recovery.isReady());
	latestVersion = version;
}

// The latest version is only known once recovery has finished.
ACTOR Future<Version> getLatestVersionImpl(IndirectShadowPager *pager) {
	wait(pager->recovery);
	return pager->latestVersion;
}

Future<Version> IndirectShadowPager::getLatestVersion() {
	return getLatestVersionImpl(this);
}
// Future that is set (with an error) once the pager has failed or is shutting down.
Future<Void> IndirectShadowPager::getError() {
	return errorPromise.getFuture();
}

// Future that fires after close() or dispose() completes.
Future<Void> IndirectShadowPager::onClosed() {
	return closed.getFuture();
}
// Tear down the pager: fail pending work, cancel outstanding reads, drain
// writers/operations/commits, shut down the page table log, then delete the
// pager itself.  If dispose is true, the on-disk files are deleted as well.
ACTOR void shutdown(IndirectShadowPager *pager, bool dispose) {
	if(pager->errorPromise.canBeSet())
		pager->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress

	// Cancel all outstanding reads
	auto i = pager->busyPages.begin();
	auto iEnd = pager->busyPages.end();

	while(i != iEnd) {
		// Advance before calling cancel as the rawRead cancel will destroy the map entry it lives in
		(i++)->second.read.cancel();
	}
	ASSERT(pager->busyPages.empty());

	// Drain in-flight work; ready() is used because these may complete with errors.
	wait(ready(pager->writeActors.signal()));
	wait(ready(pager->operations.signal()));
	wait(ready(pager->committing));

	pager->housekeeping.cancel();
	pager->pagerFile.shutdown();

	state Future<Void> pageTableClosed = pager->pageTableLog->onClosed();
	if(dispose) {
		wait(ready(IAsyncFileSystem::filesystem()->deleteFile(pager->pageFileName, true)));
		pager->pageTableLog->dispose();
	}
	else {
		pager->pageTableLog->close();
	}
	wait(ready(pageTableClosed));

	pager->closed.send(Void());
	// The pager owns itself once shutdown starts; free it last.
	delete pager;
}
// Shut down and delete the underlying files.
void IndirectShadowPager::dispose() {
	shutdown(this, true);
}

// Shut down, keeping the files on disk.
void IndirectShadowPager::close() {
	shutdown(this, false);
}
// Read a physical page via zero-copy, verify its checksum, and wrap the buffer
// in an IndirectShadowPage that releases it back to the file when dropped.
// The busyPages entry for this physical page is removed on every exit path.
ACTOR Future<Reference<const IPage>> rawRead(IndirectShadowPager *pager, LogicalPageID logicalPageID, PhysicalPageID physicalPageID) {
	state void *data;
	state int len = IndirectShadowPage::PAGE_BYTES;
	state bool readSuccess = false;

	try {
		wait(pager->dataFile->readZeroCopy(&data, &len, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES));
		readSuccess = true;

		if(!checksumRead(pager->dataFile.getPtr(), (uint8_t *)data, len, logicalPageID, physicalPageID)) {
			throw checksum_failed();
		}

		pager->busyPages.erase(physicalPageID);

		// The returned page takes ownership of the zero-copy buffer.
		return Reference<const IPage>(new IndirectShadowPage((uint8_t *)data, pager->dataFile, physicalPageID));
	}
	catch(Error &e) {
		pager->busyPages.erase(physicalPageID);
		// Release the buffer if the read mapped it (or may have, on cancellation).
		if(readSuccess || e.code() == error_code_actor_cancelled) {
			pager->dataFile->releaseZeroCopy(data, len, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES);
		}
		throw;
	}
}
// Resolve (logicalPageID, version) to a physical page and read it.  Concurrent
// reads of the same physical page share one in-flight rawRead via busyPages.
Future<Reference<const IPage>> getPageImpl(IndirectShadowPager *pager, Reference<IndirectShadowPagerSnapshot> snapshot, LogicalPageID logicalPageID, Version version) {
	ASSERT(logicalPageID < pager->pageTable.size());
	PageVersionMap &pageVersionMap = pager->pageTable[logicalPageID];

	// Find the last mapping with version <= the requested version.
	auto itr = IndirectShadowPager::pageVersionMapUpperBound(pageVersionMap, version);
	if(itr == pageVersionMap.begin()) {
		debug_printf("%s: Page version map empty! op=error id=%u @%lld\n", pager->pageFileName.c_str(), logicalPageID, version);
		ASSERT(false);
	}
	--itr;

	PhysicalPageID physicalPageID = itr->second;
	// A deletion marker here means the page was freed at or before this version.
	ASSERT(physicalPageID != PagerFile::INVALID_PAGE);

	debug_printf("%s: Reading logical %d v%lld physical %d mapSize %lu\n", pager->pageFileName.c_str(), logicalPageID, version, physicalPageID, pageVersionMap.size());

	IndirectShadowPager::BusyPage &bp = pager->busyPages[physicalPageID];
	if(!bp.read.isValid()) {
		Future<Reference<const IPage>> get = rawRead(pager, logicalPageID, physicalPageID);
		if(!get.isReady()) {
			// Cache the in-flight read so concurrent readers share it.
			bp.read = get;
		}
		return get;
	}
	return bp.read;
}
// Public read entry point; recovery must have completed.  The read is tracked
// in 'operations' so shutdown can drain it.
Future<Reference<const IPage>> IndirectShadowPager::getPage(Reference<IndirectShadowPagerSnapshot> snapshot, LogicalPageID pageID, Version version) {
	if(!recovery.isReady()) {
		debug_printf("%s: getPage failure, recovery not ready - op=error id=%u @%lld\n", pageFileName.c_str(), pageID, version);
		ASSERT(false);
	}

	Future<Reference<const IPage>> f = getPageImpl(this, snapshot, pageID, version);
	operations.add(forwardError(ready(f), errorPromise)); // For some reason if success is ready() then shutdown hangs when waiting on operations
	return f;
}
// First entry with version >= the requested version.
PageVersionMap::iterator IndirectShadowPager::pageVersionMapLowerBound(PageVersionMap &pageVersionMap, Version version) {
	auto versionLess = [](const std::pair<Version, PhysicalPageID> &entry, Version v) {
		return entry.first < v;
	};
	return std::lower_bound(pageVersionMap.begin(), pageVersionMap.end(), version, versionLess);
}

// First entry with version > the requested version.
PageVersionMap::iterator IndirectShadowPager::pageVersionMapUpperBound(PageVersionMap &pageVersionMap, Version version) {
	auto versionGreater = [](Version v, const std::pair<Version, PhysicalPageID> &entry) {
		return v < entry.first;
	};
	return std::upper_bound(pageVersionMap.begin(), pageVersionMap.end(), version, versionGreater);
}
// Return a logical page ID to the free list; reserved IDs are never recycled.
void IndirectShadowPager::freeLogicalPageID(LogicalPageID pageID) {
	if(pageID < SERVER_KNOBS->PAGER_RESERVED_PAGES) {
		return;
	}
	debug_printf("%s: Freeing logical %u\n", pageFileName.c_str(), pageID);
	logicalFreeList.push_back(pageID);
}
// Persist a version value (latest or oldest) in the page table log.
void IndirectShadowPager::logVersion(StringRef versionKey, Version version) {
	BinaryWriter v(Unversioned());
	v << version;

	pageTableLog->set(KeyValueRef(versionKey, v.toValue()));
}

// Persist the physical pages-allocated count.
void IndirectShadowPager::logPagesAllocated() {
	BinaryWriter v(Unversioned());
	v << pagerFile.getPagesAllocated();

	pageTableLog->set(KeyValueRef(PAGES_ALLOCATED_KEY, v.toValue()));
}

// Record (logicalPageID, version) -> physicalPageID.  Key components are
// written big-endian so entries sort by logical page, then by version.
void IndirectShadowPager::logPageTableUpdate(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID) {
	BinaryWriter k(Unversioned());
	k << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(version);

	BinaryWriter v(Unversioned());
	v << physicalPageID;

	pageTableLog->set(KeyValueRef(k.toValue(), v.toValue()));
}

// Clear all logged entries for a logical page from version 'start' onward.
void IndirectShadowPager::logPageTableClearToEnd(LogicalPageID logicalPageID, Version start) {
	BinaryWriter b(Unversioned());
	b << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(start);

	BinaryWriter e(Unversioned());
	e << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID);

	pageTableLog->clear(KeyRangeRef(b.toValue(), strinc(e.toValue())));
}

// Clear logged entries for a logical page in the version range [start, end).
void IndirectShadowPager::logPageTableClear(LogicalPageID logicalPageID, Version start, Version end) {
	BinaryWriter b(Unversioned());
	b << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(start);

	BinaryWriter e(Unversioned());
	e << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(end);

	pageTableLog->clear(KeyRangeRef(b.toValue(), e.toValue()));
}
// Keys used in the page table log: version/allocation metadata lives under
// "\xff"-prefixed keys; page table entries live under TABLE_ENTRY_PREFIX ("\x00").
const StringRef IndirectShadowPager::LATEST_VERSION_KEY = LiteralStringRef("\xff/LatestVersion");
const StringRef IndirectShadowPager::OLDEST_VERSION_KEY = LiteralStringRef("\xff/OldestVersion");
const StringRef IndirectShadowPager::PAGES_ALLOCATED_KEY = LiteralStringRef("\xff/PagesAllocated");
const StringRef IndirectShadowPager::TABLE_ENTRY_PREFIX = LiteralStringRef("\x00");
// Copy the contents of physical page 'from' to physical page 'to' (used by the
// vacuumer).  Prefers a zero-copy read, falling back to reading into 'page'.
// On failure, 'to' is freed again.
// NOTE(review): in the zero-copy path, checksumWrite stamps the new checksum
// into 'page' while 'data' (the zero-copy buffer) is what gets written -- this
// looks inconsistent; confirm (the vacuumer is currently disabled).
ACTOR Future<Void> copyPage(IndirectShadowPager *pager, Reference<IPage> page, LogicalPageID logical, PhysicalPageID from, PhysicalPageID to) {
	state bool zeroCopied = true;
	state int bytes = IndirectShadowPage::PAGE_BYTES;
	state void *data = nullptr;

	try {
		try {
			wait(pager->dataFile->readZeroCopy(&data, &bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES));
		}
		catch(Error &e) {
			// Zero-copy read unavailable or failed; fall back to a regular read.
			zeroCopied = false;
			data = page->mutate();
			int _bytes = wait(pager->dataFile->read(data, page->size(), (int64_t)from * IndirectShadowPage::PAGE_BYTES));
			bytes = _bytes;
		}

		ASSERT(bytes == IndirectShadowPage::PAGE_BYTES);
		// Re-stamp the checksum for the new physical location before writing.
		checksumWrite(pager->dataFile.getPtr(), page->mutate(), bytes, logical, to);
		wait(pager->dataFile->write(data, bytes, (int64_t)to * IndirectShadowPage::PAGE_BYTES));
		if(zeroCopied) {
			pager->dataFile->releaseZeroCopy(data, bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES);
		}
	}
	catch(Error &e) {
		if(zeroCopied) {
			pager->dataFile->releaseZeroCopy(data, bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES);
		}
		pager->pagerFile.freePage(to);
		throw e;
	}

	return Void();
}
// Background defragmenter: moves live pages from the end of the data file into
// lower free slots, then truncates trailing free pages off the file.
// NOTE(review): currently never scheduled -- startVacuuming() installs Never()
// instead of this actor.
ACTOR Future<Void> vacuumer(IndirectShadowPager *pager, PagerFile *pagerFile) {
	state Reference<IPage> page(new IndirectShadowPage());

	loop {
		state double start = now();

		// Wait until there are enough free pages and the vacuum queue is populated.
		while(!pagerFile->canVacuum()) {
			wait(delay(1.0));
		}

		ASSERT(!pagerFile->freePages.empty());

		if(!pagerFile->vacuumQueue.empty()) {
			state PhysicalPageID lastUsedPage = pagerFile->vacuumQueue.rbegin()->first;
			PhysicalPageID lastFreePage = *pagerFile->freePages.rbegin();
			debug_printf("%s: Vacuuming: evaluating (free list size=%lu, lastFreePage=%u, lastUsedPage=%u, pagesAllocated=%u)\n", pager->pageFileName.c_str(), pagerFile->freePages.size(), lastFreePage, lastUsedPage, pagerFile->pagesAllocated);
			ASSERT(lastFreePage < pagerFile->pagesAllocated);
			ASSERT(lastUsedPage < pagerFile->pagesAllocated);
			ASSERT(lastFreePage != lastUsedPage);

			if(lastFreePage < lastUsedPage) {
				// The highest used page can move down into a lower free slot.
				state std::pair<LogicalPageID, Version> logicalPageInfo = pagerFile->vacuumQueue[lastUsedPage];
				state PhysicalPageID newPage = pagerFile->allocatePage(logicalPageInfo.first, logicalPageInfo.second);

				debug_printf("%s: Vacuuming: copying page %u to %u\n", pager->pageFileName.c_str(), lastUsedPage, newPage);
				wait(copyPage(pager, page, logicalPageInfo.first, lastUsedPage, newPage));

				// Swap the mapping only if the page wasn't freed while copying.
				auto &pageVersionMap = pager->pageTable[logicalPageInfo.first];
				auto itr = IndirectShadowPager::pageVersionMapLowerBound(pageVersionMap, logicalPageInfo.second);
				if(itr != pageVersionMap.end() && itr->second == lastUsedPage) {
					itr->second = newPage;
					pager->logPageTableUpdate(logicalPageInfo.first, itr->first, newPage);
					pagerFile->freePage(lastUsedPage);
				}
				else {
					TEST(true); // page was freed while vacuuming
					pagerFile->freePage(newPage);
				}
			}
		}

		// Compute how many trailing free pages can be truncated off the file.
		PhysicalPageID firstFreePage = pagerFile->vacuumQueue.empty() ? pagerFile->minVacuumQueuePage : (pagerFile->vacuumQueue.rbegin()->first + 1);
		ASSERT(pagerFile->pagesAllocated >= firstFreePage);

		uint64_t pagesToErase = 0;
		if(pagerFile->freePages.size() >= SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD) {
			pagesToErase = std::min<uint64_t>(pagerFile->freePages.size() - SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD + 1, pagerFile->pagesAllocated - firstFreePage);
		}

		debug_printf("%s: Vacuuming: got %llu pages to erase (freePages=%lu, pagesAllocated=%u, vacuumQueueEmpty=%u, minVacuumQueuePage=%u, firstFreePage=%u)\n", pager->pageFileName.c_str(), pagesToErase, pagerFile->freePages.size(), pagerFile->pagesAllocated, pagerFile->vacuumQueue.empty(), pagerFile->minVacuumQueuePage, firstFreePage);

		if(pagesToErase > 0) {
			PhysicalPageID eraseStartPage = pagerFile->pagesAllocated - pagesToErase;
			debug_printf("%s: Vacuuming: truncating last %llu pages starting at %u\n", pager->pageFileName.c_str(), pagesToErase, eraseStartPage);

			ASSERT(pagesToErase <= pagerFile->pagesAllocated);

			// Shrink the allocation count and free set before truncating the file.
			pagerFile->pagesAllocated = eraseStartPage;
			pager->logPagesAllocated();

			auto freePageItr = pagerFile->freePages.find(eraseStartPage);
			ASSERT(freePageItr != pagerFile->freePages.end());

			pagerFile->freePages.erase(freePageItr, pagerFile->freePages.end());
			ASSERT(pagerFile->vacuumQueue.empty() || pagerFile->vacuumQueue.rbegin()->first < eraseStartPage);

			wait(pager->dataFile->truncate((int64_t)pagerFile->pagesAllocated * IndirectShadowPage::PAGE_BYTES));
		}

		wait(delayUntil(start + (double)IndirectShadowPage::PAGE_BYTES / SERVER_KNOBS->VACUUM_BYTES_PER_SECOND)); // TODO: figure out the correct mechanism here
	}
}
PagerFile::PagerFile(IndirectShadowPager *pager) : fileSize(0), pagesAllocated(0), pager(pager), vacuumQueueReady(false), minVacuumQueuePage(0) {}
// Allocate a physical page: reuse the lowest free page if any; otherwise grow
// the file (in 16MB increments when needed) and take the next page ID.
PhysicalPageID PagerFile::allocatePage(LogicalPageID logicalPageID, Version version) {
	ASSERT((int64_t)pagesAllocated * IndirectShadowPage::PAGE_BYTES <= fileSize);
	ASSERT(fileSize % IndirectShadowPage::PAGE_BYTES == 0);

	PhysicalPageID allocatedPage;
	if(!freePages.empty()) {
		allocatedPage = *freePages.begin();
		freePages.erase(freePages.begin());
	}
	else {
		// No free pages; extend the logical file size if we've reached the end.
		if((int64_t)pagesAllocated * IndirectShadowPage::PAGE_BYTES == fileSize) {
			fileSize += (1 << 24);
			// TODO: extend the file before writing beyond the end.
		}
		ASSERT(pagesAllocated < INVALID_PAGE); // TODO: we should throw a better error here
		allocatedPage = pagesAllocated++;
		pager->logPagesAllocated();
	}

	markPageAllocated(logicalPageID, version, allocatedPage);

	debug_printf("%s: Allocated physical %u\n", pager->pageFileName.c_str(), allocatedPage);

	return allocatedPage;
}
// Return a physical page to the free set, dropping it from the vacuum queue if
// it lies within the tracked window.
void PagerFile::freePage(PhysicalPageID pageID) {
	freePages.insert(pageID);

	bool inVacuumWindow = pageID >= minVacuumQueuePage;
	if(inVacuumWindow) {
		vacuumQueue.erase(pageID);
	}
}

// Record the (logicalPageID, version) owner of a physical page for the vacuum
// queue.  Invalid pages and pages below the tracked window are ignored.
void PagerFile::markPageAllocated(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID) {
	if(physicalPageID == INVALID_PAGE || physicalPageID < minVacuumQueuePage) {
		return;
	}
	vacuumQueue[physicalPageID] = std::make_pair(logicalPageID, version);
}
// Called after a full markPageAllocated pass.  Slides the vacuum queue window
// forward so it covers at most the last VACUUM_QUEUE_SIZE allocated pages, and
// marks the queue ready once the window start is in range.
void PagerFile::finishedMarkingPages() {
	if(minVacuumQueuePage >= pagesAllocated) {
		// Window start is past the end of the file; restart it and repopulate.
		minVacuumQueuePage = pagesAllocated >= SERVER_KNOBS->VACUUM_QUEUE_SIZE ? pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE : 0;
		vacuumQueueReady = false;
	}
	else {
		if(!vacuumQueueReady) {
			vacuumQueueReady = true;
		}
		if(pagesAllocated > SERVER_KNOBS->VACUUM_QUEUE_SIZE && minVacuumQueuePage < pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE) {
			// Advance the window start and drop entries that fell out of the window.
			minVacuumQueuePage = pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE;
			auto itr = vacuumQueue.lower_bound(minVacuumQueuePage);
			vacuumQueue.erase(vacuumQueue.begin(), itr);
		}
	}
}
// Current size of the data file in bytes (including free pages).
uint64_t PagerFile::size() {
	return fileSize;
}

// High-water mark of physical pages allocated from the file.
uint32_t PagerFile::getPagesAllocated() {
	return pagesAllocated;
}

// Number of physical pages currently on the free list.
uint32_t PagerFile::getFreePages() {
	return freePages.size();
}
// Seed the tracker with recovered state; the vacuum queue window starts at
// most VACUUM_QUEUE_SIZE pages before the end of the allocated range.
void PagerFile::init(uint64_t fileSize, uint64_t pagesAllocated) = delete;
void PagerFile::init(uint64_t fileSize, uint32_t pagesAllocated) {
	this->fileSize = fileSize;
	this->pagesAllocated = pagesAllocated;
	if(pagesAllocated >= SERVER_KNOBS->VACUUM_QUEUE_SIZE) {
		this->minVacuumQueuePage = pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE;
	}
	else {
		this->minVacuumQueuePage = 0;
	}
}
// Begin background vacuuming.
// NOTE(review): the vacuumer is currently disabled -- Never() is installed in
// place of the commented-out vacuumer actor; confirm this is intentional.
void PagerFile::startVacuuming() {
	vacuuming = Never(); //vacuumer(pager, this);
}

// Stop the vacuumer.
void PagerFile::shutdown() {
	vacuuming.cancel();
}
// Vacuuming can proceed only when there are enough free pages, the vacuum
// queue has been populated, and the queue window hasn't run off the end.
bool PagerFile::canVacuum() {
	bool tooFewFreePages = freePages.size() < SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD; // Not enough free pages
	bool queueExhausted = minVacuumQueuePage >= pagesAllocated;                         // We finished processing all pages in the vacuum queue
	bool queueNotReady = !vacuumQueueReady;                                             // Populating vacuum queue

	if(tooFewFreePages || queueExhausted || queueNotReady) {
		debug_printf("%s: Vacuuming: waiting for vacuumable pages (free list size=%lu, minVacuumQueuePage=%u, pages allocated=%u, vacuumQueueReady=%d)\n", pager->pageFileName.c_str(), freePages.size(), minVacuumQueuePage, pagesAllocated, vacuumQueueReady);
		return false;
	}
	return true;
}
const PhysicalPageID PagerFile::INVALID_PAGE = std::numeric_limits<PhysicalPageID>::max();
extern Future<Void> simplePagerTest(IPager* const& pager);

// Smoke test: run the generic pager test against IndirectShadowPager, then close it.
TEST_CASE("/fdbserver/indirectshadowpager/simple") {
	state IPager *pager = new IndirectShadowPager("unittest_pageFile");

	wait(simplePagerTest(pager));

	Future<Void> closedFuture = pager->onClosed();
	pager->close();
	wait(closedFuture);

	return Void();
}

View File

@ -1,215 +0,0 @@
/*
* IndirectShadowPager.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_INDIRECTSHADOWPAGER_H
#define FDBSERVER_INDIRECTSHADOWPAGER_H
#pragma once
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/IPager.h"
#include "flow/ActorCollection.h"
#include "fdbclient/Notified.h"
#include "fdbrpc/IAsyncFile.h"
// Physical pages are 32-bit page indexes into the data file.
typedef uint32_t PhysicalPageID;
// Per logical page: version-ordered history of (version, physical page) mappings.
typedef std::vector<std::pair<Version, PhysicalPageID>> PageVersionMap;
// Page table indexed by LogicalPageID.
typedef std::vector<PageVersionMap> LogicalPageTable;

class IndirectShadowPager;
// A 4KB page whose buffer is either owned (allocated via FastAllocator) or
// borrowed zero-copy from the data file and released back on destruction.
class IndirectShadowPage : public IPage, ReferenceCounted<IndirectShadowPage> {
public:
	// Owned-buffer constructor: allocates from FastAllocator<4096>.
	IndirectShadowPage();
	// Zero-copy constructor: wraps a buffer owned by 'file'; the destructor
	// releases it back to the file.
	IndirectShadowPage(uint8_t *data, Reference<IAsyncFile> file, PhysicalPageID pageID)
		: file(file), physicalPageID(pageID), fastAllocated(false), data(data) {}
	virtual ~IndirectShadowPage();

	virtual void addref() const {
		ReferenceCounted<IndirectShadowPage>::addref();
	}

	virtual void delref() const {
		ReferenceCounted<IndirectShadowPage>::delref();
	}

	// Usable size: PAGE_BYTES - PAGE_OVERHEAD_BYTES.
	virtual int size() const;
	virtual uint8_t const* begin() const;
	virtual uint8_t* mutate();

//private:
	static const int PAGE_BYTES;          // total on-disk page size (4096)
	static const int PAGE_OVERHEAD_BYTES; // bytes reserved for the checksum

private:
	Reference<IAsyncFile> file;    // set only for zero-copy pages
	PhysicalPageID physicalPageID;
	bool fastAllocated;            // true if 'data' came from FastAllocator
	uint8_t *data;
};
// Read-only view of the pager at a fixed version; reads rethrow the pager's
// error once it has failed.
class IndirectShadowPagerSnapshot : public IPagerSnapshot, ReferenceCounted<IndirectShadowPagerSnapshot> {
public:
	IndirectShadowPagerSnapshot(IndirectShadowPager *pager, Version version);

	virtual Future<Reference<const IPage>> getPhysicalPage(LogicalPageID pageID);

	virtual Version getVersion() const {
		return version;
	}

	virtual ~IndirectShadowPagerSnapshot() {
	}

	virtual void addref() {
		ReferenceCounted<IndirectShadowPagerSnapshot>::addref();
	}

	virtual void delref() {
		ReferenceCounted<IndirectShadowPagerSnapshot>::delref();
	}

private:
	IndirectShadowPager *pager;
	Version version;
	Future<Void> pagerError; // captured from pager->getError() at construction
};
// Tracks physical-page allocation within the data file: the free set, the
// allocation high-water mark, and a bounded vacuum queue of the most recently
// allocated pages used by the (currently disabled) vacuumer.
class PagerFile {
public:
	PagerFile(IndirectShadowPager *pager);

	PhysicalPageID allocatePage(LogicalPageID logicalPageID, Version version);
	void freePage(PhysicalPageID physicalPageID);
	void markPageAllocated(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID);

	// Called after a full markPageAllocated pass to advance the vacuum queue window.
	void finishedMarkingPages();

	uint64_t size();              // data file size in bytes
	uint32_t getPagesAllocated();
	uint32_t getFreePages();

	// Seed recovered state.
	void init(uint64_t fileSize, uint32_t pagesAllocated);
	void startVacuuming();
	void shutdown();

//private:
	Future<Void> vacuuming;
	IndirectShadowPager *pager;

	uint32_t pagesAllocated; // high-water mark of allocated physical pages
	uint64_t fileSize;       // data file size in bytes

	std::set<PhysicalPageID> freePages;

	PhysicalPageID minVacuumQueuePage; // lowest physical page tracked in vacuumQueue
	bool vacuumQueueReady;             // true once the queue has been fully populated
	std::map<PhysicalPageID, std::pair<LogicalPageID, Version>> vacuumQueue;

	bool canVacuum();

	static const PhysicalPageID INVALID_PAGE;
};
// An IPager implementation that keeps multiple versions of each logical page
// by indirection: a per-page version map (see LogicalPageTable) maps
// (logical page, version) -> physical page in a single data file. The page
// table itself is persisted in an IKeyValueStore ('pageTableLog'), and page
// contents live in 'dataFile'.
class IndirectShadowPager : public IPager {
public:
IndirectShadowPager(std::string basename);
virtual ~IndirectShadowPager() {
}
// --- IPager interface (definitions in the .cpp) ---
virtual Reference<IPage> newPageBuffer();
virtual int getUsablePageSize();
// Returns a snapshot that reads pages as of 'version'.
virtual Reference<IPagerSnapshot> getReadSnapshot(Version version);
virtual LogicalPageID allocateLogicalPage();
virtual void freeLogicalPage(LogicalPageID pageID, Version version);
// Writes 'contents' for 'pageID' effective at 'updateVersion';
// 'referencePageID' semantics are defined by IPager — TODO confirm.
virtual void writePage(LogicalPageID pageID, Reference<IPage> contents, Version updateVersion, LogicalPageID referencePageID);
// Forgets page versions in [begin, end), allowing their physical pages to
// be reclaimed — TODO confirm interval convention against IPager.
virtual void forgetVersions(Version begin, Version end);
virtual Future<Void> commit();
virtual void setLatestVersion(Version version);
virtual Future<Version> getLatestVersion();
virtual StorageBytes getStorageBytes();
virtual Future<Void> getError();
virtual Future<Void> onClosed();
virtual void dispose();
virtual void close();
// Resolves a logical page to its physical contents as of 'version' on
// behalf of a snapshot.
Future<Reference<const IPage>> getPage(Reference<IndirectShadowPagerSnapshot> snapshot, LogicalPageID pageID, Version version);
//private:
std::string basename;        // prefix for the pager's on-disk files
std::string pageFileName;    // name of the data file derived from basename
Version latestVersion;       // newest writable version
Version committedVersion;    // newest durable version
LogicalPageTable pageTable;  // in-memory (logical page, version) -> physical page map
IKeyValueStore *pageTableLog; // durable log of page table updates (see log* methods below)
Reference<IAsyncFile> dataFile; // file holding physical page contents
Future<Void> recovery;
Future<Void> housekeeping;
Future<Void> vacuuming;
Version oldestVersion;       // oldest version still readable
// TODO: This structure maybe isn't needed
struct BusyPage {
Future<Reference<const IPage>> read;
};
// De-duplicates concurrent reads of the same physical page — TODO confirm.
typedef std::map<PhysicalPageID, BusyPage> BusyPageMapT;
BusyPageMapT busyPages;
SignalableActorCollection operations;
SignalableActorCollection writeActors;
Future<Void> committing;     // in-flight commit, if any
Promise<Void> closed;        // fulfilled on close/dispose
Promise<Void> errorPromise;  // fulfilled on unrecoverable error
std::deque<LogicalPageID> logicalFreeList; // logical IDs available for reuse
PagerFile pagerFile;         // physical-page allocator for dataFile
// Binary-search helpers over a page's version map.
static PageVersionMap::iterator pageVersionMapLowerBound(PageVersionMap &pageVersionMap, Version v);
static PageVersionMap::iterator pageVersionMapUpperBound(PageVersionMap &pageVersionMap, Version v);
void freeLogicalPageID(LogicalPageID pageID);
void freePhysicalPageID(PhysicalPageID pageID);
// Persistence of pager metadata and page table updates into pageTableLog.
void logVersion(StringRef versionKey, Version version);
void logPagesAllocated();
void logPageTableUpdate(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID);
void logPageTableClearToEnd(LogicalPageID logicalPageID, Version start);
void logPageTableClear(LogicalPageID logicalPageID, Version start, Version end);
// Keys used in pageTableLog for the metadata persisted above.
static const StringRef LATEST_VERSION_KEY;
static const StringRef OLDEST_VERSION_KEY;
static const StringRef PAGES_ALLOCATED_KEY;
static const StringRef TABLE_ENTRY_PREFIX;
};
#endif

View File

@ -67,7 +67,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );
init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000;
init( VERSIONS_PER_BATCH, VERSIONS_PER_SECOND/20 ); if( randomize && BUGGIFY ) VERSIONS_PER_BATCH = std::max<int64_t>(1,VERSIONS_PER_SECOND/1000);
init( DESIRED_OUTSTANDING_MESSAGES, 5000 ); if( randomize && BUGGIFY ) DESIRED_OUTSTANDING_MESSAGES = deterministicRandom()->randomInt(0,100);
init( DESIRED_GET_MORE_DELAY, 0.005 );
init( CONCURRENT_LOG_ROUTER_READS, 1 );
init( LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED, 1 ); if( randomize && BUGGIFY ) LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED = 0;
init( DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME, 1.0 );
@ -80,6 +81,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( DISK_QUEUE_MAX_TRUNCATE_BYTES, 2<<30 ); if ( randomize && BUGGIFY ) DISK_QUEUE_MAX_TRUNCATE_BYTES = 0;
init( TLOG_DEGRADED_DELAY_COUNT, 5 );
init( TLOG_DEGRADED_DURATION, 5.0 );
init( MAX_CACHE_VERSIONS, 10e6 );
init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 );
init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01();
@ -130,8 +132,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MAX_SHARD_BYTES, 500000000 );
init( KEY_SERVER_SHARD_BYTES, 500000000 );
bool buggifySmallReadBandwidth = randomize && BUGGIFY;
init( SHARD_MAX_BYTES_READ_PER_KSEC, 100LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000;
/* 100*1MB/sec * 1000sec/ksec
init( SHARD_MAX_BYTES_READ_PER_KSEC, 8LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000;
/* 8*1MB/sec * 1000sec/ksec
Shards with more than this read bandwidth will be considered as a read cache candidate
*/
init( SHARD_MAX_BYTES_READ_PER_KSEC_JITTER, 0.1 );
@ -327,6 +329,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( ENFORCED_MIN_RECOVERY_DURATION, 0.085 ); if( shortRecoveryDuration ) ENFORCED_MIN_RECOVERY_DURATION = 0.01;
init( REQUIRED_MIN_RECOVERY_DURATION, 0.080 ); if( shortRecoveryDuration ) REQUIRED_MIN_RECOVERY_DURATION = 0.01;
init( ALWAYS_CAUSAL_READ_RISKY, false );
init( MAX_COMMIT_UPDATES, 100000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1;
// Master Server
// masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution)
@ -456,7 +459,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2;
init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 );
init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 );
init( BYTES_READ_UNITS_PER_SAMPLE, 100); // Effectively weight up read on small or non-existing key/values.
init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes
init( EMPTY_READ_PENALTY, 20 ); // 20 bytes
//Storage Server
init( STORAGE_LOGGING_DELAY, 5.0 );

View File

@ -70,7 +70,8 @@ public:
int PARALLEL_GET_MORE_REQUESTS;
int MULTI_CURSOR_PRE_FETCH_LIMIT;
int64_t MAX_QUEUE_COMMIT_BYTES;
int64_t VERSIONS_PER_BATCH;
int DESIRED_OUTSTANDING_MESSAGES;
double DESIRED_GET_MORE_DELAY;
int CONCURRENT_LOG_ROUTER_READS;
int LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED; // 0==peek from primary, non-zero==peek from satellites
double DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME;
@ -83,6 +84,7 @@ public:
int DISK_QUEUE_MAX_TRUNCATE_BYTES; // A truncate larger than this will cause the file to be replaced instead.
int TLOG_DEGRADED_DELAY_COUNT;
double TLOG_DEGRADED_DURATION;
int64_t MAX_CACHE_VERSIONS;
double TXS_POPPED_MAX_DELAY;
// Data distribution queue
@ -269,6 +271,7 @@ public:
double ENFORCED_MIN_RECOVERY_DURATION;
double REQUIRED_MIN_RECOVERY_DURATION;
bool ALWAYS_CAUSAL_READ_RISKY;
int MAX_COMMIT_UPDATES;
// Master Server
double COMMIT_SLEEP_TIME;
@ -394,6 +397,7 @@ public:
int64_t IOPS_UNITS_PER_SAMPLE;
int64_t BANDWIDTH_UNITS_PER_SAMPLE;
int64_t BYTES_READ_UNITS_PER_SAMPLE;
int64_t EMPTY_READ_PENALTY;
//Storage Server
double STORAGE_LOGGING_DELAY;

View File

@ -243,6 +243,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
state Version ver = 0;
state std::vector<TagsAndMessage> messages;
state Arena arena;
while (true) {
state bool foundMessage = r->hasMessage();
if (!foundMessage || r->version().version != ver) {
@ -258,6 +259,7 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
lastVer = ver;
ver = r->version().version;
messages.clear();
arena = Arena();
if (!foundMessage) {
ver--; //ver is the next possible version we will get data for
@ -275,8 +277,9 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
tagAndMsg.message = r->getMessageWithTags();
tags.clear();
self->logSet.getPushLocations(r->getTags(), tags, 0);
tagAndMsg.tags.reserve(arena, tags.size());
for (const auto& t : tags) {
tagAndMsg.tags.emplace_back(tagLocalityRemoteLog, t);
tagAndMsg.tags.push_back(arena, Tag(tagLocalityRemoteLog, t));
}
messages.push_back(std::move(tagAndMsg));
@ -337,6 +340,9 @@ ACTOR Future<Void> logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == self->peekTracker.end()) {
throw timed_out();
}
auto& trackerData = self->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));

View File

@ -232,7 +232,7 @@ public:
return resultEntries.size() == 0;
}
void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, int locationOffset,
void getPushLocations(VectorRef<Tag> tags, std::vector<int>& locations, int locationOffset,
bool allLocations = false) {
if(locality == tagLocalitySatellite) {
for(auto& t : tags) {
@ -310,7 +310,7 @@ struct ILogSystem {
//pre: only callable if hasMessage() returns true
//return the tags associated with the message for the current sequence
virtual const std::vector<Tag>& getTags() = 0;
virtual VectorRef<Tag> getTags() = 0;
//pre: only callable if hasMessage() returns true
//returns the arena containing the contents of getMessage(), getMessageWithTags(), and reader()
@ -405,7 +405,7 @@ struct ILogSystem {
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual VectorRef<Tag> getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
@ -438,6 +438,7 @@ struct ILogSystem {
bool hasNextMessage;
UID randomID;
int tLogReplicationFactor;
Future<Void> more;
MergedPeekCursor( std::vector< Reference<ILogSystem::IPeekCursor> > const& serverCursors, Version begin );
MergedPeekCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore, std::vector<LocalityData> const& tLogLocalities, Reference<IReplicationPolicy> const tLogPolicy, int tLogReplicationFactor );
@ -453,7 +454,7 @@ struct ILogSystem {
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual VectorRef<Tag> getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
@ -484,6 +485,7 @@ struct ILogSystem {
bool hasNextMessage;
bool useBestSet;
UID randomID;
Future<Void> more;
SetPeekCursor( std::vector<Reference<LogSet>> const& logSets, int bestSet, int bestServer, Tag tag, Version begin, Version end, bool parallelGetMore );
SetPeekCursor( std::vector<Reference<LogSet>> const& logSets, std::vector< std::vector< Reference<IPeekCursor> > > const& serverCursors, LogMessageVersion const& messageVersion, int bestSet, int bestServer, Optional<LogMessageVersion> nextVersion, bool useBestSet );
@ -498,7 +500,7 @@ struct ILogSystem {
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual VectorRef<Tag> getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
@ -532,7 +534,7 @@ struct ILogSystem {
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual VectorRef<Tag> getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
@ -555,12 +557,12 @@ struct ILogSystem {
struct BufferedMessage {
Arena arena;
StringRef message;
std::vector<Tag> tags;
VectorRef<Tag> tags;
LogMessageVersion version;
BufferedMessage() {}
explicit BufferedMessage( Version version ) : version(version) {}
BufferedMessage( Arena arena, StringRef message, const std::vector<Tag>& tags, const LogMessageVersion& version ) : arena(arena), message(message), tags(tags), version(version) {}
BufferedMessage( Arena arena, StringRef message, const VectorRef<Tag>& tags, const LogMessageVersion& version ) : arena(arena), message(message), tags(tags), version(version) {}
bool operator < (BufferedMessage const& r) const {
return version < r.version;
@ -572,23 +574,28 @@ struct ILogSystem {
};
std::vector<Reference<IPeekCursor>> cursors;
std::vector<Deque<BufferedMessage>> cursorMessages;
std::vector<BufferedMessage> messages;
int messageIndex;
LogMessageVersion messageVersion;
Version end;
bool hasNextMessage;
bool withTags;
bool knownUnique;
Version minKnownCommittedVersion;
Version poppedVersion;
Version initialPoppedVersion;
bool canDiscardPopped;
Future<Void> more;
int targetQueueSize;
UID randomID;
//FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade.
bool collectTags;
std::vector<Tag> tags;
void combineMessages();
BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped );
BufferedCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore );
virtual Reference<IPeekCursor> cloneNoMore();
virtual void setProtocolVersion( ProtocolVersion version );
@ -598,7 +605,7 @@ struct ILogSystem {
virtual void nextMessage();
virtual StringRef getMessage();
virtual StringRef getMessageWithTags();
virtual const std::vector<Tag>& getTags();
virtual VectorRef<Tag> getTags();
virtual void advanceTo(LogMessageVersion n);
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply);
virtual Future<Void> onFailed();
@ -644,7 +651,7 @@ struct ILogSystem {
// Returns when the preceding changes are durable. (Later we will need multiple return signals for diffferent durability levels)
// If the current epoch has ended, push will not return, and the pushed messages will not be visible in any subsequent epoch (but may become visible in this epoch)
virtual Reference<IPeekCursor> peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore = false ) = 0;
virtual Reference<IPeekCursor> peek( UID dbgid, Version begin, Optional<Version> end, Tag tag, bool parallelGetMore = false ) = 0;
// Returns (via cursor interface) a stream of messages with the given tag and message versions >= (begin, 0), ordered by message version
// If pop was previously or concurrently called with upTo > begin, the cursor may not return all such messages. In that case cursor->popped() will
// be greater than begin to reflect that.
@ -710,7 +717,11 @@ struct ILogSystem {
virtual Future<Void> onLogSystemConfigChange() = 0;
// Returns when the log system configuration has changed due to a tlog rejoin.
virtual void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations = false) = 0;
virtual void getPushLocations(VectorRef<Tag> tags, std::vector<int>& locations, bool allLocations = false) = 0;
void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations = false) {
getPushLocations(VectorRef<Tag>((Tag*)&tags.front(), tags.size()), locations, allLocations);
}
virtual bool hasRemoteLogs() const = 0;

View File

@ -110,7 +110,7 @@ StringRef ILogSystem::ServerPeekCursor::getMessageWithTags() {
return rawMessage;
}
const std::vector<Tag>& ILogSystem::ServerPeekCursor::getTags() {
VectorRef<Tag> ILogSystem::ServerPeekCursor::getTags() {
return messageAndTags.tags;
}
@ -150,6 +150,12 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
while(self->futureResults.size() < SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->interf->get().present()) {
self->futureResults.push_back( brokenPromiseToNever( self->interf->get().interf().peekMessages.getReply(TLogPeekRequest(self->messageVersion.version,self->tag,self->returnIfBlocked, self->onlySpilled, std::make_pair(self->randomID, self->sequence++)), taskID) ) );
}
if (self->sequence == std::numeric_limits<decltype(self->sequence)>::max()) {
throw timed_out();
}
} else if (self->futureResults.size() == 1) {
self->randomID = deterministicRandom()->randomUniqueID();
self->sequence = 0;
} else if (self->futureResults.size() == 0) {
return Void();
}
@ -430,7 +436,7 @@ StringRef ILogSystem::MergedPeekCursor::getMessageWithTags() {
return serverCursors[currentCursor]->getMessageWithTags();
}
const std::vector<Tag>& ILogSystem::MergedPeekCursor::getTags() {
VectorRef<Tag> ILogSystem::MergedPeekCursor::getTags() {
return serverCursors[currentCursor]->getTags();
}
@ -469,6 +475,10 @@ ACTOR Future<Void> mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess
}
Future<Void> ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) {
if( more.isValid() && !more.isReady() ) {
return more;
}
if(!serverCursors.size())
return Never();
@ -482,7 +492,8 @@ Future<Void> ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) {
if (version() > startVersion)
return Void();
return mergedPeekGetMore(this, startVersion, taskID);
more = mergedPeekGetMore(this, startVersion, taskID);
return more;
}
Future<Void> ILogSystem::MergedPeekCursor::onFailed() {
@ -689,7 +700,7 @@ StringRef ILogSystem::SetPeekCursor::getMessage() { return serverCursors[current
StringRef ILogSystem::SetPeekCursor::getMessageWithTags() { return serverCursors[currentSet][currentCursor]->getMessageWithTags(); }
const std::vector<Tag>& ILogSystem::SetPeekCursor::getTags() {
VectorRef<Tag> ILogSystem::SetPeekCursor::getTags() {
return serverCursors[currentSet][currentCursor]->getTags();
}
@ -770,6 +781,10 @@ ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer
}
Future<Void> ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) {
if( more.isValid() && !more.isReady() ) {
return more;
}
auto startVersion = version();
calcHasMessage();
if( hasMessage() )
@ -780,7 +795,8 @@ Future<Void> ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) {
if (version() > startVersion)
return Void();
return setPeekGetMore(this, startVersion, taskID);
more = setPeekGetMore(this, startVersion, taskID);
return more;
}
Future<Void> ILogSystem::SetPeekCursor::onFailed() {
@ -851,7 +867,7 @@ StringRef ILogSystem::MultiCursor::getMessageWithTags() {
return cursors.back()->getMessageWithTags();
}
const std::vector<Tag>& ILogSystem::MultiCursor::getTags() {
VectorRef<Tag> ILogSystem::MultiCursor::getTags() {
return cursors.back()->getTags();
}
@ -901,8 +917,20 @@ Version ILogSystem::MultiCursor::popped() {
return std::max(poppedVersion, cursors.back()->popped());
}
ILogSystem::BufferedCursor::BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped) {
messages.reserve(10000);
ILogSystem::BufferedCursor::BufferedCursor( std::vector<Reference<IPeekCursor>> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped), knownUnique(false), minKnownCommittedVersion(0), randomID(deterministicRandom()->randomUniqueID()) {
targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/cursors.size();
messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES);
cursorMessages.resize(cursors.size());
}
ILogSystem::BufferedCursor::BufferedCursor( std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ) : messageVersion(begin), end(end), withTags(true), collectTags(false), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(false), knownUnique(true), minKnownCommittedVersion(0), randomID(deterministicRandom()->randomUniqueID()) {
targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/logServers.size();
messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES);
cursorMessages.resize(logServers.size());
for( int i = 0; i < logServers.size(); i++ ) {
Reference<ILogSystem::ServerPeekCursor> cursor( new ILogSystem::ServerPeekCursor( logServers[i], tag, begin, end, false, parallelGetMore ) );
cursors.push_back( cursor );
}
}
void ILogSystem::BufferedCursor::combineMessages() {
@ -910,7 +938,7 @@ void ILogSystem::BufferedCursor::combineMessages() {
return;
}
tags.clear();
std::vector<Tag> tags;
tags.push_back(messages[messageIndex].tags[0]);
for(int i = messageIndex + 1; i < messages.size() && messages[messageIndex].version == messages[i].version; i++) {
tags.push_back(messages[i].tags[0]);
@ -919,14 +947,17 @@ void ILogSystem::BufferedCursor::combineMessages() {
auto& msg = messages[messageIndex];
BinaryWriter messageWriter(Unversioned());
messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size());
for(auto& t : tags) {
for(auto t : tags) {
messageWriter << t;
}
messageWriter.serializeBytes(msg.message);
Standalone<StringRef> val = messageWriter.toValue();
msg.arena = val.arena();
msg.tags = tags;
msg.message = val;
msg.tags = VectorRef<Tag>();
for(auto t : tags) {
msg.tags.push_back(msg.arena, t);
}
}
Reference<ILogSystem::IPeekCursor> ILogSystem::BufferedCursor::cloneNoMore() {
@ -973,7 +1004,7 @@ StringRef ILogSystem::BufferedCursor::getMessageWithTags() {
return messages[messageIndex].message;
}
const std::vector<Tag>& ILogSystem::BufferedCursor::getTags() {
VectorRef<Tag> ILogSystem::BufferedCursor::getTags() {
ASSERT(withTags);
return messages[messageIndex].tags;
}
@ -982,24 +1013,25 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) {
ASSERT(false);
}
ACTOR Future<Void> bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference<ILogSystem::IPeekCursor> cursor, Version maxVersion, TaskPriority taskID ) {
ACTOR Future<Void> bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference<ILogSystem::IPeekCursor> cursor, int idx, TaskPriority taskID ) {
loop {
wait(yield());
if(cursor->version().version >= maxVersion) {
if(cursor->version().version >= self->end || self->cursorMessages[idx].size() > self->targetQueueSize) {
return Void();
}
while(cursor->hasMessage()) {
self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? std::vector<Tag>() : cursor->getTags(), cursor->version()));
cursor->nextMessage();
if(cursor->version().version >= maxVersion) {
return Void();
}
}
wait(cursor->getMore(taskID));
self->poppedVersion = std::max(self->poppedVersion, cursor->popped());
self->minKnownCommittedVersion = std::max(self->minKnownCommittedVersion, cursor->getMinKnownCommittedVersion());
if(self->canDiscardPopped) {
self->initialPoppedVersion = std::max(self->initialPoppedVersion, cursor->popped());
}
if(cursor->version().version >= self->end) {
return Void();
}
while(cursor->hasMessage()) {
self->cursorMessages[idx].push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? VectorRef<Tag>() : cursor->getTags(), cursor->version()));
cursor->nextMessage();
}
}
}
@ -1009,39 +1041,57 @@ ACTOR Future<Void> bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriori
throw internal_error();
}
state Version targetVersion = std::min(self->end, self->messageVersion.version + SERVER_KNOBS->VERSIONS_PER_BATCH);
self->messages.clear();
std::vector<Future<Void>> loaders;
loaders.reserve(self->cursors.size());
for(auto& cursor : self->cursors) {
loaders.push_back(bufferedGetMoreLoader(self, cursor, targetVersion, taskID));
}
wait( waitForAll(loaders) );
wait(yield());
if(self->collectTags) {
for(int i = 0; i < self->cursors.size(); i++) {
loaders.push_back(bufferedGetMoreLoader(self, self->cursors[i], i, taskID));
}
state Future<Void> allLoaders = waitForAll(loaders);
state Version minVersion;
loop {
wait( allLoaders || delay(SERVER_KNOBS->DESIRED_GET_MORE_DELAY, taskID) );
minVersion = self->end;
for(auto cursor : self->cursors) {
minVersion = std::min(minVersion, cursor->version().version);
}
if(minVersion > self->messageVersion.version) {
break;
}
if(allLoaders.isReady()) {
wait(Future<Void>(Never()));
}
}
wait( yield() );
for(auto &it : self->cursorMessages) {
while(!it.empty() && it.front().version.version < minVersion) {
self->messages.push_back(it.front());
it.pop_front();
}
}
if(self->collectTags || self->knownUnique) {
std::sort(self->messages.begin(), self->messages.end());
} else {
uniquify(self->messages);
}
self->messageVersion = LogMessageVersion(minVersion);
self->messageIndex = 0;
self->hasNextMessage = self->messages.size() > 0;
Version minVersion = self->end;
for(auto& cursor : self->cursors) {
minVersion = std::min(minVersion, cursor->version().version);
}
self->messageVersion = LogMessageVersion(minVersion);
if(self->collectTags) {
self->combineMessages();
}
wait(yield());
if(self->canDiscardPopped && self->poppedVersion > self->version().version) {
TraceEvent(SevWarn, "DiscardingPoppedData").detail("Version", self->version().version).detail("Popped", self->poppedVersion);
TraceEvent(SevWarn, "DiscardingPoppedData", self->randomID).detail("Version", self->version().version).detail("Popped", self->poppedVersion);
self->messageVersion = std::max(self->messageVersion, LogMessageVersion(self->poppedVersion));
for(auto& cursor : self->cursors) {
for(auto cursor : self->cursors) {
cursor->advanceTo(self->messageVersion);
}
self->messageIndex = self->messages.size();
@ -1096,8 +1146,7 @@ const LogMessageVersion& ILogSystem::BufferedCursor::version() {
}
Version ILogSystem::BufferedCursor::getMinKnownCommittedVersion() {
ASSERT(false);
return invalidVersion;
return minKnownCommittedVersion;
}
Version ILogSystem::BufferedCursor::popped() {

View File

@ -207,6 +207,7 @@ struct ProxyCommitData {
uint64_t mostRecentProcessedRequestNumber;
KeyRangeMap<Deque<std::pair<Version,int>>> keyResolvers;
KeyRangeMap<ServerCacheInfo> keyInfo;
KeyRangeMap<bool> cacheInfo;
std::map<Key, applyMutationsData> uid_applyMutationsData;
bool firstProxy;
double lastCoalesceTime;
@ -236,6 +237,7 @@ struct ProxyCommitData {
Optional<LatencyBandConfig> latencyBandConfig;
double lastStartCommit;
double lastCommitLatency;
int updateCommitRequests = 0;
NotifiedDouble lastCommitTime;
//The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient.
@ -257,6 +259,16 @@ struct ProxyCommitData {
return tags;
}
const bool needsCacheTag(KeyRangeRef range) {
auto ranges = cacheInfo.intersectingRanges(range);
for(auto r : ranges) {
if(r.value()) {
return true;
}
}
return false;
}
ProxyCommitData(UID dbgid, MasterInterface master, RequestStream<GetReadVersionRequest> getConsistentReadVersion, Version recoveryTransactionVersion, RequestStream<CommitTransactionRequest> commit, Reference<AsyncVar<ServerDBInfo>> db, bool firstProxy)
: dbgid(dbgid), stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount), master(master),
logAdapter(NULL), txnStateStore(NULL), popRemoteTxs(false),
@ -657,7 +669,7 @@ ACTOR Future<Void> commitBatch(
for (int resolver = 0; resolver < resolution.size(); resolver++)
committed = committed && resolution[resolver].stateMutations[versionIndex][transactionIndex].committed;
if (committed)
applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, NULL, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped);
applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, nullptr, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, &self->cacheInfo, self->firstProxy ? &self->uid_applyMutationsData : nullptr, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped);
if( resolution[0].stateMutations[versionIndex][transactionIndex].mutations.size() && firstStateMutations ) {
ASSERT(committed);
@ -737,7 +749,7 @@ ACTOR Future<Void> commitBatch(
{
if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) {
commitCount++;
applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped);
applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, &self->cacheInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped);
}
if(firstStateMutations) {
ASSERT(committed[t] == ConflictBatch::TransactionCommitted);
@ -808,11 +820,16 @@ ACTOR Future<Void> commitBatch(
if (debugMutation("ProxyCommit", commitVersion, m))
TraceEvent("ProxyCommitTo", self->dbgid).detail("To", describe(tags)).detail("Mutation", m.toString()).detail("Version", commitVersion);
toCommit.addTags(tags);
if(self->cacheInfo[m.param1]) {
toCommit.addTag(cacheTag);
}
toCommit.addTypedMessage(m);
}
else if (m.type == MutationRef::ClearRange) {
auto ranges = self->keyInfo.intersectingRanges(KeyRangeRef(m.param1, m.param2));
KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2));
auto ranges = self->keyInfo.intersectingRanges(clearRange);
auto firstRange = ranges.begin();
++firstRange;
if (firstRange == ranges.end()) {
@ -832,8 +849,12 @@ ACTOR Future<Void> commitBatch(
}
if (debugMutation("ProxyCommit", commitVersion, m))
TraceEvent("ProxyCommitTo", self->dbgid).detail("To", describe(allSources)).detail("Mutation", m.toString()).detail("Version", commitVersion);
toCommit.addTags(allSources);
}
if(self->needsCacheTag(clearRange)) {
toCommit.addTag(cacheTag);
}
toCommit.addTypedMessage(m);
} else
UNREACHABLE();
@ -1052,7 +1073,9 @@ ACTOR Future<Void> commitBatch(
ACTOR Future<Void> updateLastCommit(ProxyCommitData* self, Optional<UID> debugID = Optional<UID>()) {
state double confirmStart = now();
self->lastStartCommit = confirmStart;
self->updateCommitRequests++;
wait(self->logSystem->confirmEpochLive(debugID));
self->updateCommitRequests--;
self->lastCommitLatency = now()-confirmStart;
self->lastCommitTime = std::max(self->lastCommitTime.get(), confirmStart);
return Void();
@ -1135,7 +1158,9 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture, std::
GetReadVersionReply reply = wait(replyFuture);
double end = timer();
for(GetReadVersionRequest const& request : requests) {
stats->grvLatencyBands.addMeasurement(end - request.requestTime());
if(request.priority() >= GetReadVersionRequest::PRIORITY_DEFAULT) {
stats->grvLatencyBands.addMeasurement(end - request.requestTime());
}
request.reply.send(reply);
}
@ -1460,7 +1485,12 @@ ACTOR Future<Void> lastCommitUpdater(ProxyCommitData* self, PromiseStream<Future
if(elapsed < interval) {
wait( delay(interval + 0.0001 - elapsed) );
} else {
addActor.send(updateLastCommit(self));
if(self->updateCommitRequests < SERVER_KNOBS->MAX_COMMIT_UPDATES) {
addActor.send(updateLastCommit(self));
} else {
TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "TooManyLastCommitUpdates").suppressFor(1.0);
self->lastStartCommit = now();
}
}
}
}
@ -1770,7 +1800,7 @@ ACTOR Future<Void> masterProxyServerCore(
Arena arena;
bool confChanges;
applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, NULL, &confChanges, Reference<ILogSystem>(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : NULL, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true );
applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, nullptr, &confChanges, Reference<ILogSystem>(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, &commitData.cacheInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : nullptr, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true );
}
auto lockedKey = commitData.txnStateStore->readValue(databaseLockedKey).get();

View File

@ -1,456 +0,0 @@
/*
* MemoryPager.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cinttypes>
#include "fdbserver/MemoryPager.h"
#include "fdbserver/Knobs.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h"
typedef uint8_t* PhysicalPageID;
typedef std::vector<std::pair<Version, PhysicalPageID>> PageVersionMap;
typedef std::vector<PageVersionMap> LogicalPageTable;
class MemoryPager;
// A single fixed-size (4KB) in-memory page. It either owns its buffer
// (allocated from FastAllocator<4096>) or wraps borrowed memory owned by the pager.
class MemoryPage : public IPage, ReferenceCounted<MemoryPage> {
public:
// Allocates a fresh, owned 4KB buffer.
MemoryPage();
// Wraps an existing buffer without taking ownership (used by MemoryPager::getPage).
MemoryPage(uint8_t *data);
virtual ~MemoryPage();
virtual void addref() const {
ReferenceCounted<MemoryPage>::addref();
}
virtual void delref() const {
ReferenceCounted<MemoryPage>::delref();
}
// Always PAGE_BYTES.
virtual int size() const;
virtual uint8_t const* begin() const;
virtual uint8_t* mutate();
private:
friend class MemoryPager;
uint8_t *data;
// True only when this object allocated `data` itself and must release it.
bool allocated;
static const int PAGE_BYTES;
};
// A read-only view of the pager at a fixed version. Holds a raw pointer to the
// pager, so it must not outlive the MemoryPager it was created from.
class MemoryPagerSnapshot : public IPagerSnapshot, ReferenceCounted<MemoryPagerSnapshot> {
public:
MemoryPagerSnapshot(MemoryPager *pager, Version version) : pager(pager), version(version) {}
// Resolves a logical page to its physical contents as of `version`.
virtual Future<Reference<const IPage>> getPhysicalPage(LogicalPageID pageID);
virtual Version getVersion() const {
return version;
}
virtual void addref() {
ReferenceCounted<MemoryPagerSnapshot>::addref();
}
virtual void delref() {
ReferenceCounted<MemoryPagerSnapshot>::delref();
}
private:
MemoryPager *pager;
// The version this snapshot reads at; fixed for the snapshot's lifetime.
Version version;
};
// An entirely in-memory IPager implementation. Page storage is appended into a
// single Standalone arena; each logical page maps to a list of (version, physical
// page) pairs so multiple versions of a page can be read concurrently.
class MemoryPager : public IPager, ReferenceCounted<MemoryPager> {
public:
MemoryPager();
virtual Reference<IPage> newPageBuffer();
virtual int getUsablePageSize();
virtual Reference<IPagerSnapshot> getReadSnapshot(Version version);
virtual LogicalPageID allocateLogicalPage();
virtual void freeLogicalPage(LogicalPageID pageID, Version version);
virtual void writePage(LogicalPageID pageID, Reference<IPage> contents, Version updateVersion, LogicalPageID referencePageID);
virtual void forgetVersions(Version begin, Version end);
virtual Future<Void> commit();
virtual StorageBytes getStorageBytes() {
// TODO: Get actual values for used and free memory
return StorageBytes();
}
virtual void setLatestVersion(Version version);
virtual Future<Version> getLatestVersion();
virtual Future<Void> getError();
virtual Future<Void> onClosed();
virtual void dispose();
virtual void close();
// Internal read path used by snapshots: resolve (pageID, version) to page bytes.
virtual Reference<const IPage> getPage(LogicalPageID pageID, Version version);
private:
Version latestVersion;
Version committedVersion;
// Backing storage: a growing list of large arena-allocated byte blocks,
// carved into pages by allocatePage()/extendData().
Standalone<VectorRef<VectorRef<uint8_t>>> data;
// pageTable[logicalPageID] -> sorted (version, physical page) history.
LogicalPageTable pageTable;
Promise<Void> closed;
std::vector<PhysicalPageID> freeList; // TODO: is this good enough for now?
PhysicalPageID allocatePage(Reference<IPage> contents);
void extendData();
// Sentinel marking "page freed as of this version" entries in a page's history.
static const PhysicalPageID INVALID_PAGE;
};
// Factory declared in fdbserver/MemoryPager.h; returns a heap-allocated pager
// which frees itself in dispose().
IPager * createMemoryPager() {
return new MemoryPager();
}
// Owned-buffer constructor: grabs a 4KB block from the fast allocator.
MemoryPage::MemoryPage() : allocated(true) {
data = (uint8_t*)FastAllocator<4096>::allocate();
}
// Borrowed-buffer constructor: no ownership, nothing released on destruction.
MemoryPage::MemoryPage(uint8_t *data) : data(data), allocated(false) {}
MemoryPage::~MemoryPage() {
if(allocated) {
FastAllocator<4096>::release(data);
}
}
uint8_t const* MemoryPage::begin() const {
return data;
}
uint8_t* MemoryPage::mutate() {
return data;
}
int MemoryPage::size() const {
return PAGE_BYTES;
}
// Must match the FastAllocator block size used in the default constructor.
const int MemoryPage::PAGE_BYTES = 4096;
// Delegates to the pager's synchronous read path; the result is an
// already-available Future since everything lives in memory.
Future<Reference<const IPage>> MemoryPagerSnapshot::getPhysicalPage(LogicalPageID pageID) {
return pager->getPage(pageID, version);
}
// Starts with one storage block and pre-sizes the page table with the knob-configured
// number of reserved logical pages (IDs below PAGER_RESERVED_PAGES are reserved).
MemoryPager::MemoryPager() : latestVersion(0), committedVersion(0) {
extendData();
pageTable.resize(SERVER_KNOBS->PAGER_RESERVED_PAGES);
}
Reference<IPage> MemoryPager::newPageBuffer() {
return Reference<IPage>(new MemoryPage());
}
int MemoryPager::getUsablePageSize() {
return MemoryPage::PAGE_BYTES;
}
// Snapshots may only be taken at versions the pager has already reached.
Reference<IPagerSnapshot> MemoryPager::getReadSnapshot(Version version) {
ASSERT(version <= latestVersion);
return Reference<IPagerSnapshot>(new MemoryPagerSnapshot(this, version));
}
// Appends an empty version history and uses its index as the new logical page ID.
// Freed logical IDs are never reused by this implementation.
LogicalPageID MemoryPager::allocateLogicalPage() {
ASSERT(pageTable.size() >= SERVER_KNOBS->PAGER_RESERVED_PAGES);
pageTable.push_back(PageVersionMap());
return pageTable.size() - 1;
}
// Frees a logical page as of `version`: drops all history entries at or after
// `version`, then (if the page is still live at the truncated history's tail)
// appends an INVALID_PAGE marker recording the free.
// NOTE(review): the underlying physical pages are not returned to freeList here —
// presumably intentional for this prototype; confirm before reuse.
void MemoryPager::freeLogicalPage(LogicalPageID pageID, Version version) {
ASSERT(pageID < pageTable.size());
PageVersionMap &pageVersionMap = pageTable[pageID];
ASSERT(!pageVersionMap.empty());
// First entry with entry.first >= version; everything from there on is discarded.
auto itr = std::lower_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](std::pair<Version, PhysicalPageID> p, Version v) {
return p.first < v;
});
pageVersionMap.erase(itr, pageVersionMap.end());
if(pageVersionMap.size() > 0 && pageVersionMap.back().second != INVALID_PAGE) {
pageVersionMap.push_back(std::make_pair(version, INVALID_PAGE));
}
}
// Writes `contents` as a new physical page for `pageID`.
//   updateVersion > latestVersion : append a new (version, page) history entry.
//   updateVersion == 0            : atomically replace the most recent entry in place.
//   referencePageID != invalid    : use the reference page's latest version instead
//                                   of the supplied updateVersion.
void MemoryPager::writePage(LogicalPageID pageID, Reference<IPage> contents, Version updateVersion, LogicalPageID referencePageID) {
ASSERT(updateVersion > latestVersion || updateVersion == 0);
ASSERT(pageID < pageTable.size());
if(referencePageID != invalidLogicalPageID) {
PageVersionMap &rpv = pageTable[referencePageID];
ASSERT(!rpv.empty());
updateVersion = rpv.back().first;
}
PageVersionMap &pageVersionMap = pageTable[pageID];
// Committed versions are immutable except via the updateVersion==0 replace path.
ASSERT(updateVersion >= committedVersion || updateVersion == 0);
PhysicalPageID physicalPageID = allocatePage(contents);
// The page must not currently be freed (tail must not be the INVALID_PAGE marker).
ASSERT(pageVersionMap.empty() || pageVersionMap.back().second != INVALID_PAGE);
if(updateVersion == 0) {
ASSERT(pageVersionMap.size());
updateVersion = pageVersionMap.back().first;
pageVersionMap.back().second = physicalPageID;
// TODO: what to do with old page?
}
else {
// History must stay strictly sorted by version.
ASSERT(pageVersionMap.empty() || pageVersionMap.back().first < updateVersion);
pageVersionMap.push_back(std::make_pair(updateVersion, physicalPageID));
}
}
// Declares versions in [begin, end) reclaimable. Currently a no-op (see TODO).
void MemoryPager::forgetVersions(Version begin, Version end) {
ASSERT(begin <= end);
ASSERT(end <= latestVersion);
// TODO
}
// Marks everything up to latestVersion as committed. Everything is in memory,
// so there is nothing to make durable and the future is immediately ready.
// NOTE(review): the assert requires setLatestVersion() to have advanced the
// version since the previous commit — a second commit without an intervening
// setLatestVersion() will trip it; confirm that is the intended contract.
Future<Void> MemoryPager::commit() {
ASSERT(committedVersion < latestVersion);
committedVersion = latestVersion;
return Void();
}
// Versions must be strictly increasing.
void MemoryPager::setLatestVersion(Version version) {
ASSERT(version > latestVersion);
latestVersion = version;
}
Future<Version> MemoryPager::getLatestVersion() {
return latestVersion;
}
// Returns the newest physical page for `pageID` with version <= `version`,
// wrapped in a non-owning MemoryPage. Returns a null Reference when the page
// has no entry at or before `version` (see TODO about whether that should be
// an error instead).
Reference<const IPage> MemoryPager::getPage(LogicalPageID pageID, Version version) {
ASSERT(pageID < pageTable.size());
PageVersionMap const& pageVersionMap = pageTable[pageID];
// First entry strictly newer than `version`; the entry before it is the answer.
auto itr = std::upper_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](Version v, std::pair<Version, PhysicalPageID> p) {
return v < p.first;
});
if(itr == pageVersionMap.begin()) {
return Reference<IPage>(); // TODO: should this be an error?
}
--itr;
// Reading a freed page is a programming error.
ASSERT(itr->second != INVALID_PAGE);
return Reference<const IPage>(new MemoryPage(itr->second)); // TODO: Page memory owned by the pager. Change this?
}
// An in-memory pager can't fail asynchronously; the error future is always ready.
Future<Void> MemoryPager::getError() {
return Void();
}
Future<Void> MemoryPager::onClosed() {
return closed.getFuture();
}
// Signals onClosed() waiters, then self-deletes. The object must be heap-allocated
// and must not be used after this call.
void MemoryPager::dispose() {
closed.send(Void());
delete this;
}
// No durable state to flush, so close is identical to dispose.
void MemoryPager::close() {
dispose();
}
// Copies `contents` into physical storage and returns its address.
// Prefers recycling a page from freeList; otherwise appends into the current
// arena block, extending the arena when the block fills up.
PhysicalPageID MemoryPager::allocatePage(Reference<IPage> contents) {
if(freeList.size()) {
PhysicalPageID pageID = freeList.back();
freeList.pop_back();
memcpy(pageID, contents->begin(), contents->size());
return pageID;
}
else {
// The current block must have room for one whole page (blocks are
// multiples of the page size, so pages never straddle blocks).
ASSERT(data.size() && data.back().capacity() - data.back().size() >= contents->size());
PhysicalPageID pageID = data.back().end();
data.back().append(data.arena(), contents->begin(), contents->size());
if(data.back().size() == data.back().capacity()) {
extendData();
}
else {
// Remaining space is still at least one page.
ASSERT(data.back().size() <= data.back().capacity() - 4096);
}
return pageID;
}
}
// Appends a fresh 4MB block to the backing arena; throws io_error once more
// than 1000 blocks (~4GB) have been allocated.
void MemoryPager::extendData() {
if(data.size() > 1000) { // TODO: is this an ok way to handle large data size?
throw io_error();
}
VectorRef<uint8_t> d;
// 1 << 22 = 4MB, an exact multiple of the 4KB page size.
d.reserve(data.arena(), 1 << 22);
data.push_back(data.arena(), d);
}
// TODO: these tests are not MemoryPager specific, we should make them more general
// Test helper: zero the page, then stamp its logical ID and version at the front
// so validatePage() can later verify what was read back.
void fillPage(Reference<IPage> page, LogicalPageID pageID, Version version) {
	ASSERT(page->size() > sizeof(LogicalPageID) + sizeof(Version));
	uint8_t *bytes = page->mutate();
	memset(bytes, 0, page->size());
	memcpy(bytes, (void*)&pageID, sizeof(LogicalPageID));
	memcpy(bytes + sizeof(LogicalPageID), (void*)&version, sizeof(Version));
}
// Test helper: check the ID/version stamp written by fillPage(). Reports each
// mismatch on stderr and returns false if either field is wrong.
bool validatePage(Reference<const IPage> page, LogicalPageID pageID, Version version) {
	uint8_t const *bytes = page->begin();
	LogicalPageID storedPageID = *(LogicalPageID*)bytes;
	Version storedVersion = *(Version*)(bytes + sizeof(LogicalPageID));
	bool ok = true;
	if(storedPageID != pageID) {
		fprintf(stderr, "Invalid PageID detected: %u (expected %u)\n", storedPageID, pageID);
		ok = false;
	}
	if(storedVersion != version) {
		fprintf(stderr, "Invalid Version detected on page %u: %" PRId64 "(expected %" PRId64 ")\n", pageID, storedVersion, version);
		ok = false;
	}
	return ok;
}
// Test helper: stamp the page and write it. When updateVersion is false the
// pager's updateVersion==0 "replace latest in place" path is exercised.
void writePage(IPager *pager, Reference<IPage> page, LogicalPageID pageID, Version version, bool updateVersion=true) {
fillPage(page, pageID, version);
pager->writePage(pageID, page, updateVersion ? version : 0);
}
// Test helper actor: commit the pager, logging entry/exit with a sequence
// number so overlapping commits can be told apart in debug output.
ACTOR Future<Void> commit(IPager *pager) {
static int commitNum = 1;
state int myCommit = commitNum++;
debug_printf("Commit%d\n", myCommit);
wait(pager->commit());
debug_printf("FinishedCommit%d\n", myCommit);
return Void();
}
// Test helper actor: snapshot at `version`, read `pageID`, and assert its stamp.
// `expectedVersion` overrides the expected stamp when the page was last written
// via the in-place (updateVersion==0) path; -1 means "expect `version` itself".
ACTOR Future<Void> read(IPager *pager, LogicalPageID pageID, Version version, Version expectedVersion=-1) {
static int readNum = 1;
state int myRead = readNum++;
state Reference<IPagerSnapshot> readSnapshot = pager->getReadSnapshot(version);
debug_printf("Read%d\n", myRead);
Reference<const IPage> readPage = wait(readSnapshot->getPhysicalPage(pageID));
debug_printf("FinishedRead%d\n", myRead);
ASSERT(validatePage(readPage, pageID, expectedVersion >= 0 ? expectedVersion : version));
return Void();
}
// End-to-end smoke test of the IPager contract: multi-version writes, reads at
// old snapshots, in-place updates, frees, and forgetVersions. (Invoked only by
// the commented-out TEST_CASE below.)
ACTOR Future<Void> simplePagerTest(IPager *pager) {
state Reference<IPage> page = pager->newPageBuffer();
Version latestVersion = wait(pager->getLatestVersion());
debug_printf("Got latest version: %lld\n", latestVersion);
state Version version = latestVersion+1;
state Version v1 = version;
// Write page1 at v1 and commit.
state LogicalPageID pageID1 = pager->allocateLogicalPage();
writePage(pager, page, pageID1, v1);
pager->setLatestVersion(v1);
wait(commit(pager));
// Write page1 and page2 at v2; old v1 contents of page1 must stay readable.
state LogicalPageID pageID2 = pager->allocateLogicalPage();
state Version v2 = ++version;
writePage(pager, page, pageID1, v2);
writePage(pager, page, pageID2, v2);
pager->setLatestVersion(v2);
wait(commit(pager));
wait(read(pager, pageID1, v2));
wait(read(pager, pageID1, v1));
// In-place update (updateVersion=false): a read at v2 now sees the v3 stamp.
state Version v3 = ++version;
writePage(pager, page, pageID1, v3, false);
pager->setLatestVersion(v3);
wait(read(pager, pageID1, v2, v3));
wait(read(pager, pageID1, v3, v3));
state LogicalPageID pageID3 = pager->allocateLogicalPage();
state Version v4 = ++version;
writePage(pager, page, pageID2, v4);
writePage(pager, page, pageID3, v4);
pager->setLatestVersion(v4);
wait(commit(pager));
wait(read(pager, pageID2, v4, v4));
// Uncommitted writes at v5, then free two pages and commit at v6.
state Version v5 = ++version;
writePage(pager, page, pageID2, v5);
state LogicalPageID pageID4 = pager->allocateLogicalPage();
writePage(pager, page, pageID4, v5);
state Version v6 = ++version;
pager->freeLogicalPage(pageID2, v5);
pager->freeLogicalPage(pageID3, v3);
pager->setLatestVersion(v6);
wait(commit(pager));
pager->forgetVersions(0, v4);
// NOTE(review): this commit happens without a setLatestVersion() after v6;
// MemoryPager::commit() asserts committedVersion < latestVersion, so this
// looks like it would fire that assert — confirm before re-enabling the test.
wait(commit(pager));
wait(delay(3.0));
wait(commit(pager));
return Void();
}
/*
TEST_CASE("/fdbserver/memorypager/simple") {
state IPager *pager = new MemoryPager();
wait(simplePagerTest(pager));
Future<Void> closedFuture = pager->onClosed();
pager->dispose();
wait(closedFuture);
return Void();
}
*/
// Null physical address; marks "freed" entries in a logical page's version history.
const PhysicalPageID MemoryPager::INVALID_PAGE = nullptr;

View File

@ -1,29 +0,0 @@
/*
* MemoryPager.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_MEMORYPAGER_H
#define FDBSERVER_MEMORYPAGER_H
#pragma once
#include "fdbserver/IPager.h"
// Creates a purely in-memory IPager (no durability); caller releases it via
// dispose()/close(), which delete the object.
IPager * createMemoryPager();
#endif

View File

@ -875,6 +875,9 @@ namespace oldTLog_4_6 {
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == self->peekTracker.end()) {
throw timed_out();
}
if(sequence > 0) {
auto& trackerData = self->peekTracker[peekId];
trackerData.lastUpdate = now();

View File

@ -284,6 +284,7 @@ struct TLogData : NonCopyable {
std::map<Tag, Version> toBePopped; // map of Tag->Version for all the pops
// that came when ignorePopRequest was set
Reference<AsyncVar<bool>> degraded;
std::vector<TagsAndMessage> tempTagMessages;
TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded, std::string folder)
: dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()),
@ -677,6 +678,80 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
return Void();
}
// Applies a pop (advance the popped-version watermark) for `inputTag` up to `to`.
// While pops are being ignored (self->ignorePopRequest), the pop is only recorded
// in toBePopped, keeping the maximum requested version per tag, and replayed later.
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest) {
// NOTE(review): this "IgnoringPopRequest" TraceEvent is duplicated by the
// more detailed one a few lines below — one of the two looks redundant.
TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
|| to > self->toBePopped[inputTag]) {
self->toBePopped[inputTag] = to;
}
// add the pop to the toBePopped map
TraceEvent(SevDebug, "IgnoringPopRequest")
.detail("IgnorePopDeadline", self->ignorePopDeadline)
.detail("Tag", inputTag.toString())
.detail("Version", to);
return Void();
}
state Version upTo = to;
int8_t tagLocality = inputTag.locality;
// Pseudo-locality tags are translated to the log-router locality, and the pop
// version is adjusted to the minimum across the pseudo tags sharing the data.
if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) {
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to);
tagLocality = tagLocalityLogRouter;
}
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
} else if (upTo > tagData->popped) {
tagData->popped = upTo;
tagData->poppedRecently = true;
// Popping past recoveredAt means this tag no longer blocks recovery completion.
if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) {
tagData->unpoppedRecovered = false;
logData->unpoppedRecoveredTags--;
TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt);
if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) {
logData->recoveryComplete.send(Void());
}
}
// Reclaim in-memory messages below the new watermark (only those not yet durable).
if (upTo > logData->persistentDataDurableVersion)
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
}
// Handles a TLogPopRequest. If the ignore-pops window has expired, first replays
// every deferred pop accumulated in toBePopped, then performs the requested pop
// and acknowledges the request.
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
// timeout check for ignorePopRequest
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
TraceEvent("EnableTLogPlayAllIgnoredPops");
// use toBePopped and issue all the pops
state std::map<Tag, Version>::iterator it;
state vector<Future<Void>> ignoredPops;
// Clear the ignore state before replaying so the replayed pops take effect.
self->ignorePopRequest = false;
self->ignorePopUid = "";
self->ignorePopDeadline = 0.0;
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
wait(waitForAll(ignoredPops));
TraceEvent("ResetIgnorePopRequest")
.detail("Now", g_network->now())
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("IgnorePopDeadline", self->ignorePopDeadline);
}
wait(tLogPopCore(self, req.tag, req.to, logData));
// Acknowledge only after the pop (and any replayed pops) have been applied.
req.reply.send(Void());
return Void();
}
// This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources.
// For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important
// work (e.g. commits).
@ -696,6 +771,26 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
state FlowLock::Releaser commitLockReleaser;
//FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries.
// It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag; which is not intended to ever happen.
Optional<Version> cachePopVersion;
for(auto& it : self->id_data) {
if(!it.second->stopped) {
if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) {
cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS;
}
break;
}
}
if(cachePopVersion.present()) {
state std::vector<Future<Void>> cachePopFutures;
for(auto& it : self->id_data) {
cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second));
}
wait( waitForAll(cachePopFutures) );
}
if(logData->stopped) {
if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) {
while(logData->persistentDataDurableVersion != logData->version.get()) {
@ -886,13 +981,13 @@ void commitMessages( TLogData* self, Reference<LogData> logData, Version version
void commitMessages( TLogData *self, Reference<LogData> logData, Version version, Arena arena, StringRef messages ) {
ArenaReader rd( arena, messages, Unversioned() );
std::vector<TagsAndMessage> msgs;
self->tempTagMessages.clear();
while(!rd.empty()) {
TagsAndMessage tagsAndMsg;
tagsAndMsg.loadFromArena(&rd, nullptr);
msgs.push_back(std::move(tagsAndMsg));
self->tempTagMessages.push_back(std::move(tagsAndMsg));
}
commitMessages(self, logData, version, msgs);
commitMessages(self, logData, version, self->tempTagMessages);
}
Version poppedVersion( Reference<LogData> self, Tag tag) {
@ -915,80 +1010,6 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>> & getVersionMessages( Re
return tagData->versionMessages;
};
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest) {
TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
|| to > self->toBePopped[inputTag]) {
self->toBePopped[inputTag] = to;
}
// add the pop to the toBePopped map
TraceEvent(SevDebug, "IgnoringPopRequest")
.detail("IgnorePopDeadline", self->ignorePopDeadline)
.detail("Tag", inputTag.toString())
.detail("Version", to);
return Void();
}
state Version upTo = to;
int8_t tagLocality = inputTag.locality;
if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) {
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to);
tagLocality = tagLocalityLogRouter;
}
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
} else if (upTo > tagData->popped) {
tagData->popped = upTo;
tagData->poppedRecently = true;
if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) {
tagData->unpoppedRecovered = false;
logData->unpoppedRecoveredTags--;
TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt);
if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) {
logData->recoveryComplete.send(Void());
}
}
if (upTo > logData->persistentDataDurableVersion)
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
}
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
// timeout check for ignorePopRequest
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
TraceEvent("EnableTLogPlayAllIgnoredPops");
// use toBePopped and issue all the pops
state std::map<Tag, Version>::iterator it;
state vector<Future<Void>> ignoredPops;
self->ignorePopRequest = false;
self->ignorePopUid = "";
self->ignorePopDeadline = 0.0;
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
wait(waitForAll(ignoredPops));
TraceEvent("ResetIgnorePopRequest")
.detail("Now", g_network->now())
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("IgnorePopDeadline", self->ignorePopDeadline);
}
wait(tLogPopCore(self, req.tag, req.to, logData));
req.reply.send(Void());
return Void();
}
void peekMessagesFromMemory( Reference<LogData> self, TLogPeekRequest const& req, BinaryWriter& messages, Version& endVersion ) {
ASSERT( !messages.getLength() );
@ -1025,6 +1046,9 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw timed_out();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
@ -1226,6 +1250,7 @@ ACTOR Future<Void> doQueueCommit( TLogData* self, Reference<LogData> logData, st
self->queueCommitBegin = commitNumber;
logData->queueCommittingVersion = ver;
g_network->setCurrentTask(TaskPriority::TLogCommitReply);
Future<Void> c = self->persistentQueue->commit();
self->diskQueueCommitBytes = 0;
self->largeDiskQueueCommitBytes.set(false);
@ -1716,7 +1741,7 @@ void removeLog( TLogData* self, Reference<LogData> logData ) {
}
}
ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, std::vector<Tag> tags, Version beginVersion, Optional<Version> endVersion, bool poppedIsKnownCommitted, bool parallelGetMore ) {
ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, std::vector<Tag> tags, Version beginVersion, Optional<Version> endVersion, bool poppedIsKnownCommitted ) {
state Future<Void> dbInfoChange = Void();
state Reference<ILogSystem::IPeekCursor> r;
state Version tagAt = beginVersion;
@ -1730,7 +1755,7 @@ ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, st
}
when( wait( dbInfoChange ) ) {
if( logData->logSystem->get() ) {
r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, parallelGetMore );
r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, true );
} else {
r = Reference<ILogSystem::IPeekCursor>();
}
@ -1867,7 +1892,7 @@ ACTOR Future<Void> tLogCore( TLogData* self, Reference<LogData> logData, TLogInt
if(!logData->isPrimary) {
std::vector<Tag> tags;
tags.push_back(logData->remoteTag);
logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional<Version>(), true, true) );
logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional<Version>(), true) );
}
try {
@ -2230,10 +2255,10 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
logData->logRouterPopToVersion = req.recoverAt;
std::vector<Tag> tags;
tags.push_back(logData->remoteTag);
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true, false) || logData->removed);
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) || logData->removed);
} else if(!req.recoverTags.empty()) {
ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion);
wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false, true) || logData->removed);
wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) || logData->removed);
}
pulledRecoveryVersions = true;
logData->knownCommittedVersion = req.recoverAt;
@ -2331,6 +2356,7 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ
self.sharedActors.send( commitQueue(&self) );
self.sharedActors.send( updateStorageLoop(&self) );
state Future<Void> activeSharedChange = Void();
loop {
choose {
@ -2343,12 +2369,13 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ
}
}
when ( wait( error ) ) { throw internal_error(); }
when ( wait( activeSharedTLog->onChange() ) ) {
when ( wait( activeSharedChange ) ) {
if (activeSharedTLog->get() == tlogId) {
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
} else {
self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) );
}
activeSharedChange = activeSharedTLog->onChange();
}
}
}

View File

@ -943,6 +943,8 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
return Void();
}
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData );
// This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources.
// For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important
// work (e.g. commits).
@ -962,6 +964,26 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
state FlowLock::Releaser commitLockReleaser;
//FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries.
// It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag; which is not intended to ever happen.
Optional<Version> cachePopVersion;
for(auto& it : self->id_data) {
if(!it.second->stopped) {
if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) {
cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS;
}
break;
}
}
if(cachePopVersion.present()) {
state std::vector<Future<Void>> cachePopFutures;
for(auto& it : self->id_data) {
cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second));
}
wait( waitForAll(cachePopFutures) );
}
if(logData->stopped) {
if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) {
while(logData->persistentDataDurableVersion != logData->version.get()) {

File diff suppressed because it is too large Load Diff

View File

@ -65,11 +65,12 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
}
when(RestoreVersionBatchRequest req = waitNext(applierInterf.initVersionBatch.getFuture())) {
requestTypeStr = "initVersionBatch";
actors.add(handleInitVersionBatchRequest(req, self));
wait(handleInitVersionBatchRequest(req, self));
}
when(RestoreVersionBatchRequest req = waitNext(applierInterf.finishRestore.getFuture())) {
requestTypeStr = "finishRestore";
exitRole = handleFinishRestoreRequest(req, self);
handleFinishRestoreRequest(req, self);
exitRole = Void();
}
when(wait(exitRole)) {
TraceEvent("FastRestore").detail("RestoreApplierCore", "ExitRole").detail("NodeID", self->id());
@ -115,12 +116,14 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendMutationVec
state int mIndex = 0;
for (mIndex = 0; mIndex < mutations.size(); mIndex++) {
MutationRef mutation = mutations[mIndex];
// TraceEvent(SevDebug, "FastRestore")
// .detail("ApplierNode", self->id())
// .detail("FileUID", req.fileUID)
// .detail("Version", commitVersion)
// .detail("MutationReceived", mutation.toString());
TraceEvent(SevDebug, "FastRestore")
.detail("ApplierNode", self->id())
.detail("FileUID", req.fileIndex)
.detail("Version", commitVersion)
.detail("Index", mIndex)
.detail("MutationReceived", mutation.toString());
self->kvOps[commitVersion].push_back_deep(self->kvOps[commitVersion].arena(), mutation);
// TODO: What if log file's mutations are delivered out-of-order (behind) the range file's mutations?!
}
curFilePos.set(req.version);
}
@ -218,9 +221,8 @@ struct DBApplyProgress {
}
bool shouldCommit() {
// TODO: Change transactionSize > 0 to transactionSize > opConfig.transactionBatchSizeThreshold to batch
// mutations in a txn
return (!lastTxnHasError && (startNextVersion || transactionSize > 0 || curItInCurTxn == self->kvOps.end()));
return (!lastTxnHasError && (startNextVersion || transactionSize >= opConfig.transactionBatchSizeThreshold ||
curItInCurTxn == self->kvOps.end()));
}
bool hasError() { return lastTxnHasError; }
@ -270,6 +272,29 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
}
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
// Sanity check the restoreApplierKeys, which should be empty at this point
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Key begin = restoreApplierKeyFor(self->id(), 0);
Key end = restoreApplierKeyFor(self->id(), std::numeric_limits<int64_t>::max());
Standalone<RangeResultRef> txnIds = wait(tr->getRange(KeyRangeRef(begin, end), CLIENT_KNOBS->TOO_MANY));
if (txnIds.size() > 0) {
TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean").detail("TxnIds", txnIds.size());
for (auto& kv : txnIds) {
std::pair<UID, Version> applierInfo = decodeRestoreApplierKey(kv.key);
TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean")
.detail("Applier", applierInfo.first)
.detail("ResidueTxnID", applierInfo.second);
}
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
loop { // Transaction retry loop
try {
@ -299,7 +324,7 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
TraceEvent("FastRestore_ApplierTxn")
.detail("ApplierApplyToDB", self->id())
.detail("TxnId", progress.curTxnId)
.detail("StartIndexInCurrentTxn", progress.curIndexInCurTxn)
.detail("CurrentIndexInCurrentTxn", progress.curIndexInCurTxn)
.detail("CurrentIteratorMutations", progress.curItInCurTxn->second.size())
.detail("Version", progress.curItInCurTxn->first);
@ -315,7 +340,13 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
TraceEvent(SevError, "FastRestore").detail("InvalidMutationType", m.type);
}
// TraceEvent(SevDebug, "FastRestore_Debug").detail("ApplierApplyToDB", self->describeNode()).detail("Version", progress.curItInCurTxn->first).detail("Mutation", m.toString());
TraceEvent(SevDebug, "FastRestore_Debug")
.detail("ApplierApplyToDB", self->describeNode())
.detail("Version", progress.curItInCurTxn->first)
.detail("Index", progress.curIndexInCurTxn)
.detail("Mutation", m.toString())
.detail("MutationSize", m.expectedSize())
.detail("TxnSize", progress.transactionSize);
if (m.type == MutationRef::SetValue) {
tr->set(m.param1, m.param2);
} else if (m.type == MutationRef::ClearRange) {
@ -332,14 +363,10 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
progress.transactionSize += m.expectedSize();
if (progress.transactionSize >= opConfig.transactionBatchSizeThreshold) { // commit per 512B
progress.nextMutation(); // Prepare for the next mutation
// commit per transactionBatchSizeThreshold bytes; and commit does not cross version boundary
if (progress.shouldCommit()) {
break; // Got enough mutation in the txn
} else {
progress.nextMutation();
// Mutations in the same transaction come from the same version
if (progress.startNextVersion || progress.isDone()) {
break;
}
}
}
} // !lastTxnHasError
@ -348,8 +375,7 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
if (progress.shouldCommit()) {
wait(tr->commit());
}
// Logic for a successful transaction: Update current txn info and uncommitted txn info
progress.nextMutation();
if (progress.isDone()) { // Are all mutations processed?
break;
}
@ -359,7 +385,7 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
.detail("TxnStatus", "?")
.detail("ApplierApplyToDB", self->id())
.detail("TxnId", progress.curTxnId)
.detail("StartIndexInCurrentTxn", progress.curIndexInCurTxn)
.detail("CurrentIndexInCurrentTxn", progress.curIndexInCurTxn)
.detail("Version", progress.curItInCurTxn->first)
.error(e, true);
progress.lastTxnHasError = true;
@ -381,8 +407,9 @@ ACTOR Future<Void> applyToDB(Reference<RestoreApplierData> self, Database cx) {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
// Clear txnIds in [0, progress.curTxnId). We add 100 to curTxnId just to be safe.
tr->clear(KeyRangeRef(restoreApplierKeyFor(self->id(), 0),
restoreApplierKeyFor(self->id(), progress.curTxnId + 1)));
restoreApplierKeyFor(self->id(), progress.curTxnId + 100)));
wait(tr->commit());
break;
} catch (Error& e) {

View File

@ -34,7 +34,7 @@
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/Locality.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/RestoreWorkerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
@ -128,4 +128,4 @@ struct RestoreApplierData : RestoreRoleData, public ReferenceCounted<RestoreAppl
ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int nodeIndex, Database cx);
#include "flow/unactorcompiler.h"
#endif
#endif

View File

@ -32,6 +32,7 @@
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/MutationList.h"
#include "fdbclient/BackupContainer.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// Split RestoreConfigFR defined in FileBackupAgent.actor.cpp to declaration in Restore.actor.h and implementation in
// RestoreCommon.actor.cpp
@ -268,7 +269,6 @@ ACTOR Future<std::string> RestoreConfigFR::getFullStatus_impl(Reference<RestoreC
state Future<std::string> progress = restore->getProgress(tr);
// restore might no longer be valid after the first wait so make sure it is not needed anymore.
state UID uid = restore->getUid();
wait(success(ranges) && success(addPrefix) && success(removePrefix) &&
success(url) && success(restoreVersion) && success(progress));
@ -322,8 +322,8 @@ struct StringRefReader {
// Functions for consuming big endian (network byte order) integers.
// Consumes a big endian number, swaps it to little endian, and returns it.
const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume<int32_t>()); }
const uint32_t consumeNetworkUInt32() { return bigEndian32(consume<uint32_t>()); }
int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume<int32_t>()); }
uint32_t consumeNetworkUInt32() { return bigEndian32(consume<uint32_t>()); }
bool eof() { return rptr == end; }
@ -433,4 +433,4 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeLogFileBlock(Reference<IA
}
}
} // namespace parallelFileRestore
} // namespace parallelFileRestore

View File

@ -236,7 +236,7 @@ struct RestoreFileFR {
ss << "version:" << std::to_string(version) << " fileName:" << fileName
<< " isRange:" << std::to_string(isRange) << " blockSize:" << std::to_string(blockSize)
<< " fileSize:" << std::to_string(fileSize) << " endVersion:" << std::to_string(endVersion)
<< std::to_string(beginVersion) << " cursor:" << std::to_string(cursor)
<< " beginVersion:" << std::to_string(beginVersion) << " cursor:" << std::to_string(cursor)
<< " fileIndex:" << std::to_string(fileIndex);
return ss.str();
}

View File

@ -36,24 +36,23 @@ typedef std::map<Standalone<StringRef>, uint32_t> SerializedMutationPartMap;
bool isRangeMutation(MutationRef m);
void splitMutation(Reference<RestoreLoaderData> self, MutationRef m, Arena& mvector_arena,
VectorRef<MutationRef>& mvector, Arena& nodeIDs_arena, VectorRef<UID>& nodeIDs);
void _parseSerializedMutation(VersionedMutationsMap* kvOps, SerializedMutationListMap* mutationMap,
bool isSampling = false);
void _parseSerializedMutation(std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
SerializedMutationListMap* mutationMap, bool isSampling = false);
ACTOR Future<Void> handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference<RestoreLoaderData> self);
ACTOR Future<Void> handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req,
Reference<RestoreLoaderData> self);
void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference<RestoreLoaderData> self);
ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self,
bool isSampling = false);
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
Reference<RestoreLoaderData> self);
ACTOR Future<Void> sendMutationsToApplier(Reference<RestoreLoaderData> self, VersionedMutationsMap* kvOps,
bool isRangeFile, Version startVersion, Version endVersion, int fileIndex);
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(
NotifiedVersion* pProcessedFileOffset, SerializedMutationListMap* mutationMap,
SerializedMutationPartMap* mutationPartMap, Reference<IBackupContainer> bc, Version version, std::string fileName,
int64_t readOffset, int64_t readLen, KeyRange restoreRange, Key addPrefix, Key removePrefix, Key mutationLogPrefix);
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(VersionedMutationsMap* kvOps,
Reference<IBackupContainer> bc, Version version,
std::string fileName, int64_t readOffset_input,
int64_t readLen_input, KeyRange restoreRange);
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter, Reference<IBackupContainer> bc, Version version,
std::string fileName, int64_t readOffset_input, int64_t readLen_input, KeyRange restoreRange);
ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx) {
state Reference<RestoreLoaderData> self =
@ -72,25 +71,25 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
}
when(RestoreSysInfoRequest req = waitNext(loaderInterf.updateRestoreSysInfo.getFuture())) {
requestTypeStr = "updateRestoreSysInfo";
actors.add(handleRestoreSysInfoRequest(req, self));
}
when(RestoreSetApplierKeyRangeVectorRequest req =
waitNext(loaderInterf.setApplierKeyRangeVectorRequest.getFuture())) {
requestTypeStr = "setApplierKeyRangeVectorRequest";
actors.add(handleSetApplierKeyRangeVectorRequest(req, self));
handleRestoreSysInfoRequest(req, self);
}
when(RestoreLoadFileRequest req = waitNext(loaderInterf.loadFile.getFuture())) {
requestTypeStr = "loadFile";
self->initBackupContainer(req.param.url);
actors.add(handleLoadFileRequest(req, self, false));
}
when(RestoreSendMutationsToAppliersRequest req = waitNext(loaderInterf.sendMutations.getFuture())) {
requestTypeStr = "sendMutations";
actors.add(handleSendMutationsRequest(req, self));
}
when(RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture())) {
requestTypeStr = "initVersionBatch";
actors.add(handleInitVersionBatchRequest(req, self));
wait(handleInitVersionBatchRequest(req, self));
}
when(RestoreVersionBatchRequest req = waitNext(loaderInterf.finishRestore.getFuture())) {
requestTypeStr = "finishRestore";
exitRole = handleFinishRestoreRequest(req, self);
handleFinishRestoreRequest(req, self);
exitRole = Void();
}
when(wait(exitRole)) {
TraceEvent("FastRestore").detail("RestoreLoaderCore", "ExitRole").detail("NodeID", self->id());
@ -109,31 +108,19 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
}
// Assume: Only update the local data if it (applierInterf) has not been set
ACTOR Future<Void> handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference<RestoreLoaderData> self) {
void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference<RestoreLoaderData> self) {
TraceEvent("FastRestore").detail("HandleRestoreSysInfoRequest", self->id());
ASSERT(self.isValid());
// The loader has received the appliers interfaces
if (!self->appliersInterf.empty()) {
req.reply.send(RestoreCommonReply(self->id()));
return Void();
return;
}
self->appliersInterf = req.sysInfo.appliers;
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
ACTOR Future<Void> handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req,
Reference<RestoreLoaderData> self) {
// Idempotent operation. OK to re-execute the duplicate cmd
if (self->rangeToApplier.empty()) {
self->rangeToApplier = req.rangeToApplier;
}
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
ACTOR Future<Void> _processLoadingParam(LoadingParam param, Reference<RestoreLoaderData> self) {
@ -141,10 +128,14 @@ ACTOR Future<Void> _processLoadingParam(LoadingParam param, Reference<RestoreLoa
TraceEvent("FastRestore").detail("Loader", self->id()).detail("StartProcessLoadParam", param.toString());
ASSERT(param.blockSize > 0);
ASSERT(param.offset % param.blockSize == 0); // Parse file must be at block boundary.
ASSERT(self->kvOpsPerLP.find(param) == self->kvOpsPerLP.end());
// NOTE: map's iterator is guaranteed to be stable, but pointer may not.
// state VersionedMutationsMap* kvOps = &self->kvOpsPerLP[param];
self->kvOpsPerLP.emplace(param, VersionedMutationsMap());
state std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsPerLPIter = self->kvOpsPerLP.find(param);
// Temporary data structure for parsing range and log files into (version, <K, V, mutationType>)
// Temporary data structure for parsing log files into (version, <K, V, mutationType>)
// Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted
state VersionedMutationsMap kvOps;
// mutationMap: Key is the unique identifier for a batch of mutation logs at the same version
state SerializedMutationListMap mutationMap;
state std::map<Standalone<StringRef>, uint32_t> mutationPartMap; // Sanity check the data parsing is correct
@ -159,7 +150,7 @@ ACTOR Future<Void> _processLoadingParam(LoadingParam param, Reference<RestoreLoa
readLen = std::min<int64_t>(param.blockSize, param.length - j);
if (param.isRangeFile) {
fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader(
&kvOps, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange));
kvOpsPerLPIter, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange));
} else {
fileParserFutures.push_back(_parseLogFileToMutationsOnLoader(
&processedFileOffset, &mutationMap, &mutationPartMap, self->bc, param.version, param.filename,
@ -169,12 +160,9 @@ ACTOR Future<Void> _processLoadingParam(LoadingParam param, Reference<RestoreLoa
wait(waitForAll(fileParserFutures));
if (!param.isRangeFile) {
_parseSerializedMutation(&kvOps, &mutationMap);
_parseSerializedMutation(kvOpsPerLPIter, &mutationMap);
}
// Send the parsed mutation to applier who will apply the mutation to DB
wait(sendMutationsToApplier(self, &kvOps, param.isRangeFile, param.prevVersion, param.endVersion, param.fileIndex));
TraceEvent("FastRestore").detail("Loader", self->id()).detail("FinishLoadingFile", param.filename);
return Void();
@ -187,10 +175,35 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
TraceEvent("FastRestore").detail("Loader", self->id()).detail("ProcessLoadParam", req.param.toString());
self->processedFileParams[req.param] = Never();
self->processedFileParams[req.param] = _processLoadingParam(req.param, self);
} else {
TraceEvent("FastRestore").detail("Loader", self->id()).detail("WaitOnProcessLoadParam", req.param.toString());
}
ASSERT(self->processedFileParams.find(req.param) != self->processedFileParams.end());
wait(self->processedFileParams[req.param]); // wait on the processing of the req.param.
// TODO: Send sampled mutations back to master
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
// Handle the master's request that this loader forward its buffered, parsed
// mutations to the responsible appliers.
// req.rangeToApplier carries the key-range -> applier UID map; on the first
// request it is cached, on retries it must match the cached copy (the request
// is idempotent).
// req.useRangeFile selects which buffered loading params to flush: range-file
// mutations or log-file mutations.
ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req,
Reference<RestoreLoaderData> self) {
if (self->rangeToApplier.empty()) {
self->rangeToApplier = req.rangeToApplier;
} else {
// Duplicate/retried request: the mapping must not have changed.
ASSERT(self->rangeToApplier == req.rangeToApplier);
}
// Send mutations from log files first to ensure log mutation at the same version is before the range kv
state std::map<LoadingParam, VersionedMutationsMap>::iterator item = self->kvOpsPerLP.begin();
for (; item != self->kvOpsPerLP.end(); item++) {
// Only flush the buffered params whose file kind matches this request.
if (item->first.isRangeFile == req.useRangeFile) {
// Send the parsed mutation to applier who will apply the mutation to DB
wait(sendMutationsToApplier(self, &item->second, item->first.isRangeFile, item->first.prevVersion,
item->first.endVersion, item->first.fileIndex));
}
}
// Acknowledge completion to the master.
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
@ -345,8 +358,6 @@ void splitMutation(Reference<RestoreLoaderData> self, MutationRef m, Arena& mvec
mvector.push_back_deep(mvector_arena, curm);
nodeIDs.push_back(nodeIDs_arena, itApplier->second);
}
return;
}
// key_input format:
@ -360,13 +371,14 @@ bool concatenateBackupMutationForLogFile(std::map<Standalone<StringRef>, Standal
std::string prefix = "||\t";
std::stringstream ss;
StringRef val = val_input.contents();
const int key_prefix_len = sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t);
StringRefReaderMX reader(val, restore_corrupted_data());
StringRefReaderMX readerKey(key_input, restore_corrupted_data()); // read key_input!
int logRangeMutationFirstLength = key_input.size() - 1 - 8 - 4;
int logRangeMutationFirstLength = key_input.size() - key_prefix_len;
bool concatenated = false;
ASSERT_WE_THINK(key_input.size() >= 1 + 8 + 4);
ASSERT_WE_THINK(key_input.size() >= key_prefix_len);
if (logRangeMutationFirstLength > 0) {
// Strip out the [logRangeMutation.first]; otherwise, the following readerKey.consume will produce wrong value
@ -374,10 +386,10 @@ bool concatenateBackupMutationForLogFile(std::map<Standalone<StringRef>, Standal
}
readerKey.consume<uint8_t>(); // uint8_t hashValue = readerKey.consume<uint8_t>()
uint64_t commitVersion = readerKey.consumeNetworkUInt64();
Version commitVersion = readerKey.consumeNetworkUInt64();
uint32_t part = readerKey.consumeNetworkUInt32();
// Use commitVersion as id
Standalone<StringRef> id = StringRef((uint8_t*)&commitVersion, 8);
Standalone<StringRef> id = StringRef((uint8_t*)&commitVersion, sizeof(Version));
if (mutationMap.find(id) == mutationMap.end()) {
mutationMap.insert(std::make_pair(id, val_input));
@ -425,8 +437,9 @@ bool isRangeMutation(MutationRef m) {
// we may not get the entire mutation list for the version encoded_list_of_mutations:
// [mutation1][mutation2]...[mutationk], where
// a mutation is encoded as [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][keyContent][valueContent]
void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationListMap* pmutationMap, bool isSampling) {
VersionedMutationsMap& kvOps = *pkvOps;
void _parseSerializedMutation(std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
SerializedMutationListMap* pmutationMap, bool isSampling) {
VersionedMutationsMap& kvOps = kvOpsIter->second;
SerializedMutationListMap& mutationMap = *pmutationMap;
for (auto& m : mutationMap) {
@ -439,10 +452,11 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL
StringRefReaderMX vReader(val, restore_corrupted_data());
vReader.consume<uint64_t>(); // Consume the includeVersion
uint32_t val_length_decoded =
vReader.consume<uint32_t>(); // Parse little endian value, confirmed it is correct!
ASSERT(val_length_decoded ==
val.size() - 12); // 12 is the length of [includeVersion:uint64_t][val_length:uint32_t]
// TODO(xumengpanda): verify the protocol version is compatible and raise error if needed
// Parse little endian value, confirmed it is correct!
uint32_t val_length_decoded = vReader.consume<uint32_t>();
ASSERT(val_length_decoded == val.size() - sizeof(uint64_t) - sizeof(uint32_t));
while (1) {
// stop when reach the end of the string
@ -457,7 +471,9 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL
const uint8_t* v = vReader.consume(vLen);
MutationRef mutation((MutationRef::Type)type, KeyRef(k, kLen), KeyRef(v, vLen));
//TraceEvent(SevDebug, "FastRestore_VerboseDebug").detail("CommitVersion", commitVersion).detail("ParsedMutation", mutation.toString());
TraceEvent(SevFRMutationInfo, "FastRestore_VerboseDebug")
.detail("CommitVersion", commitVersion)
.detail("ParsedMutation", mutation.toString());
kvOps[commitVersion].push_back_deep(kvOps[commitVersion].arena(), mutation);
ASSERT_WE_THINK(kLen >= 0 && kLen < val.size());
ASSERT_WE_THINK(vLen >= 0 && vLen < val.size());
@ -466,11 +482,10 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL
}
// Parsing the data blocks in a range file
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(VersionedMutationsMap* pkvOps,
Reference<IBackupContainer> bc, Version version,
std::string fileName, int64_t readOffset, int64_t readLen,
KeyRange restoreRange) {
state VersionedMutationsMap& kvOps = *pkvOps;
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter, Reference<IBackupContainer> bc, Version version,
std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange) {
state VersionedMutationsMap& kvOps = kvOpsIter->second;
// The set of key value version is rangeFile.version. the key-value set in the same range file has the same version
Reference<IAsyncFile> inFile = wait(bc->readFile(fileName));
@ -519,7 +534,9 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(VersionedMutationsM
// We cache all kv operations into kvOps, and apply all kv operations later in one place
kvOps.insert(std::make_pair(version, VectorRef<MutationRef>()));
//TraceEvent(SevDebug, "FastRestore_VerboseDebug").detail("CommitVersion", version).detail("ParsedMutationKV", m.toString());
TraceEvent(SevFRMutationInfo, "FastRestore_VerboseDebug")
.detail("CommitVersion", version)
.detail("ParsedMutationKV", m.toString());
ASSERT_WE_THINK(kvOps.find(version) != kvOps.end());
kvOps[version].push_back_deep(kvOps[version].arena(), m);

View File

@ -34,7 +34,7 @@
#include "fdbrpc/fdbrpc.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbrpc/Locality.h"
#include "fdbserver/RestoreWorkerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreCommon.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
@ -44,6 +44,7 @@
struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoaderData> {
std::map<LoadingParam, Future<Void>> processedFileParams;
std::map<LoadingParam, VersionedMutationsMap> kvOpsPerLP; // Buffered kvOps for each loading param
// rangeToApplier is in master and loader. Loader uses this to determine which applier a mutation should be sent
// KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for
@ -79,6 +80,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
keyOpsCount.clear();
numSampledMutations = 0;
processedFileParams.clear();
kvOpsPerLP.clear();
}
// Only get the appliers that are responsible for a range
@ -104,4 +106,4 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx);
#include "flow/unactorcompiler.h"
#endif
#endif

View File

@ -51,7 +51,6 @@ ACTOR static Future<Void> distributeRestoreSysInfo(Reference<RestoreWorkerData>
ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequests(Database cx);
ACTOR static Future<Void> initializeVersionBatch(Reference<RestoreMasterData> self);
ACTOR static Future<Void> notifyLoaderAppliersKeyRange(Reference<RestoreMasterData> self);
ACTOR static Future<Void> notifyApplierToApplyMutations(Reference<RestoreMasterData> self);
ACTOR static Future<Void> notifyRestoreCompleted(Reference<RestoreMasterData> self, Database cx);
@ -193,7 +192,7 @@ ACTOR Future<Void> startProcessRestoreRequests(Reference<RestoreMasterData> self
for (restoreIndex = 0; restoreIndex < restoreRequests.size(); restoreIndex++) {
RestoreRequest& request = restoreRequests[restoreIndex];
TraceEvent("FastRestore").detail("RestoreRequestInfo", request.toString());
Version ver = wait(processRestoreRequest(self, cx, request));
wait(success(processRestoreRequest(self, cx, request)));
}
} catch (Error& e) {
TraceEvent(SevError, "FastRestoreFailed").detail("RestoreRequest", restoreRequests[restoreIndex].toString());
@ -308,6 +307,21 @@ ACTOR static Future<Void> loadFilesOnLoaders(Reference<RestoreMasterData> self,
return Void();
}
// Ask loaders to send its buffered mutations to appliers.
// useRangeFile selects which kind of buffered mutations the loaders should
// flush on this round (range-file vs. log-file mutations); the caller invokes
// this once per kind so log mutations reach appliers before range mutations.
ACTOR static Future<Void> sendMutationsFromLoaders(Reference<RestoreMasterData> self, bool useRangeFile) {
TraceEvent("FastRestore")
.detail("SendMutationsFromLoaders", self->batchIndex)
.detail("UseRangeFiles", useRangeFile);
// Build one request per loader, each carrying the key-range -> applier map.
std::vector<std::pair<UID, RestoreSendMutationsToAppliersRequest>> requests;
for (auto& loader : self->loadersInterf) {
requests.emplace_back(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier, useRangeFile));
}
// Block until every loader acknowledges it has forwarded its mutations.
wait(sendBatchRequests(&RestoreLoaderInterface::sendMutations, self->loadersInterf, requests));
return Void();
}
ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMasterData> self, Database cx,
RestoreRequest request, VersionBatch versionBatch) {
ASSERT(!versionBatch.isEmpty());
@ -315,13 +329,19 @@ ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMas
ASSERT(self->loadersInterf.size() > 0);
ASSERT(self->appliersInterf.size() > 0);
dummySampleWorkload(self);
wait(notifyLoaderAppliersKeyRange(self));
dummySampleWorkload(self); // TODO: Delete
// Parse log files and send mutations to appliers before we parse range files
// TODO: Allow loading both range and log files in parallel
wait(loadFilesOnLoaders(self, cx, request, versionBatch, false));
wait(loadFilesOnLoaders(self, cx, request, versionBatch, true));
// Loaders should ensure log files' mutations sent to appliers before range files' mutations
// TODO: Let applier buffer mutations from log and range files differently so that loaders can send mutations in
// parallel
wait(sendMutationsFromLoaders(self, false));
wait(sendMutationsFromLoaders(self, true));
wait(notifyApplierToApplyMutations(self));
return Void();
@ -331,20 +351,22 @@ ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMas
// Produce the key-range for each applier
void dummySampleWorkload(Reference<RestoreMasterData> self) {
int numAppliers = self->appliersInterf.size();
std::vector<UID> keyrangeSplitter;
std::vector<Key> keyrangeSplitter;
// We will use the splitter at [1, numAppliers - 1]. The first splitter is normalKeys.begin
int i;
for (i = 0; i < numAppliers - 1; i++) {
keyrangeSplitter.push_back(deterministicRandom()->randomUniqueID());
for (i = 0; i < numAppliers; i++) {
keyrangeSplitter.push_back(Key(deterministicRandom()->randomUniqueID().toString()));
}
std::sort(keyrangeSplitter.begin(), keyrangeSplitter.end());
i = 0;
self->rangeToApplier.clear();
for (auto& applier : self->appliersInterf) {
if (i == 0) {
self->rangeToApplier[normalKeys.begin] = applier.first;
} else {
self->rangeToApplier[StringRef(keyrangeSplitter[i].toString())] = applier.first;
self->rangeToApplier[keyrangeSplitter[i]] = applier.first;
}
i++;
}
self->logApplierKeyRange();
}
@ -412,11 +434,13 @@ ACTOR static Future<Void> collectBackupFiles(Reference<IBackupContainer> bc, std
for (const RangeFile& f : restorable.get().ranges) {
TraceEvent("FastRestore").detail("RangeFile", f.toString());
RestoreFileFR file(f.version, f.fileName, true, f.blockSize, f.fileSize, f.version, f.version);
TraceEvent("FastRestore").detail("RangeFileFR", file.toString());
files->push_back(file);
}
for (const LogFile& f : restorable.get().logs) {
TraceEvent("FastRestore").detail("LogFile", f.toString());
RestoreFileFR file(f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.beginVersion);
TraceEvent("FastRestore").detail("LogFileFR", file.toString());
files->push_back(file);
}
@ -464,17 +488,6 @@ ACTOR static Future<Void> notifyApplierToApplyMutations(Reference<RestoreMasterD
return Void();
}
// Send the map of key-range to applier to each loader.
// Broadcasts self->rangeToApplier (inclusive lower bound of each applier's
// key range -> applier UID) to every registered loader and waits for all
// acknowledgements before returning.
ACTOR static Future<Void> notifyLoaderAppliersKeyRange(Reference<RestoreMasterData> self) {
// One request per loader; each carries a copy of the full mapping.
std::vector<std::pair<UID, RestoreSetApplierKeyRangeVectorRequest>> requests;
for (auto& loader : self->loadersInterf) {
requests.push_back(std::make_pair(loader.first, RestoreSetApplierKeyRangeVectorRequest(self->rangeToApplier)));
}
// Wait until all loaders have received the key-range map.
wait(sendBatchRequests(&RestoreLoaderInterface::setApplierKeyRangeVectorRequest, self->loadersInterf, requests));
return Void();
}
// Ask all loaders and appliers to perform housecleaning at the end of restore and
// Register the restoreRequestDoneKey to signal the end of restore
ACTOR static Future<Void> notifyRestoreCompleted(Reference<RestoreMasterData> self, Database cx) {
@ -514,4 +527,4 @@ ACTOR static Future<Void> notifyRestoreCompleted(Reference<RestoreMasterData> se
TraceEvent("FastRestore").detail("RestoreMaster", "RestoreCompleted");
return Void();
}
}

View File

@ -54,7 +54,7 @@ struct VersionBatch {
struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMasterData> {
// rangeToApplier is in master and loader node. Loader uses this to determine which applier a mutation should be sent.
// KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for
std::map<Standalone<KeyRef>, UID> rangeToApplier;
std::map<Key, UID> rangeToApplier;
std::map<Version, VersionBatch> versionBatches; // key is the beginVersion of the version batch
int batchIndex;
@ -68,7 +68,7 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMaste
RestoreMasterData() {
role = RestoreRole::Master;
nodeID = UID();
batchIndex = 0;
batchIndex = 1; // starts with 1 because batchId (NotifiedVersion) in loaders and appliers start with 0
}
~RestoreMasterData() = default;
@ -128,15 +128,23 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMaste
// Assumption: fileIndex starts at 1. Each loader's initialized fileIndex (NotifiedVersion type) starts at 0
int fileIndex = 0; // fileIndex must be unique; ideally it continuously increases across versionBatches for
// easier progress tracking
int versionBatchId = 1;
for (auto versionBatch = versionBatches->begin(); versionBatch != versionBatches->end(); versionBatch++) {
std::sort(versionBatch->second.rangeFiles.begin(), versionBatch->second.rangeFiles.end());
std::sort(versionBatch->second.logFiles.begin(), versionBatch->second.logFiles.end());
for (auto& logFile : versionBatch->second.logFiles) {
logFile.fileIndex = ++fileIndex;
TraceEvent("FastRestore")
.detail("VersionBatchId", versionBatchId)
.detail("LogFile", logFile.toString());
}
for (auto& rangeFile : versionBatch->second.rangeFiles) {
rangeFile.fileIndex = ++fileIndex;
TraceEvent("FastRestore")
.detail("VersionBatchId", versionBatchId)
.detail("RangeFile", rangeFile.toString());
}
versionBatchId++;
}
TraceEvent("FastRestore").detail("VersionBatches", versionBatches->size());

View File

@ -39,11 +39,10 @@ struct RestoreWorkerData;
// Reply to a heartbeat probe from the restore master after a jittered delay,
// so that many workers do not all answer in lockstep.
ACTOR Future<Void> handleHeartbeat(RestoreSimpleRequest req, UID id) {
wait(delayJittered(5.0)); // Random jitter reduces the heartbeat monitor's pressure
req.reply.send(RestoreCommonReply(id));
return Void();
}
ACTOR Future<Void> handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference<RestoreRoleData> self) {
void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference<RestoreRoleData> self) {
if (self->versionBatchStart) {
self->versionBatchStart = false;
}
@ -54,19 +53,22 @@ ACTOR Future<Void> handleFinishRestoreRequest(RestoreVersionBatchRequest req, Re
.detail("Node", self->id());
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
ACTOR Future<Void> handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference<RestoreRoleData> self) {
self->resetPerVersionBatch();
TraceEvent("FastRestore")
.detail("InitVersionBatch", req.batchID)
.detail("Role", getRoleStr(self->role))
.detail("Node", self->id());
// batchId is continuous. (req.batchID-1) is the id of the just finished batch.
wait(self->versionBatchId.whenAtLeast(req.batchID - 1));
if (self->versionBatchId.get() == req.batchID - 1) {
self->resetPerVersionBatch();
TraceEvent("FastRestore")
.detail("InitVersionBatch", req.batchID)
.detail("Role", getRoleStr(self->role))
.detail("Node", self->id());
self->versionBatchId.set(req.batchID);
}
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}

View File

@ -32,10 +32,11 @@
#include "flow/Stats.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/Notified.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/Locality.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/RestoreWorkerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "flow/actorcompiler.h" // has to be last include
@ -55,7 +56,7 @@ typedef std::map<Version, Standalone<VectorRef<MutationRef>>> VersionedMutations
ACTOR Future<Void> handleHeartbeat(RestoreSimpleRequest req, UID id);
ACTOR Future<Void> handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference<RestoreRoleData> self);
ACTOR Future<Void> handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference<RestoreRoleData> self);
void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference<RestoreRoleData> self);
// Helper class for reading restore data from a buffer and throwing the right errors.
// This struct is mostly copied from StringRefReader. We add a sanity check in this struct.
@ -90,12 +91,12 @@ struct StringRefReaderMX {
// Functions for consuming big endian (network byte oselfer) integers.
// Consumes a big endian number, swaps it to little endian, and returns it.
const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume<int32_t>()); }
const uint32_t consumeNetworkUInt32() { return bigEndian32(consume<uint32_t>()); }
int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume<int32_t>()); }
uint32_t consumeNetworkUInt32() { return bigEndian32(consume<uint32_t>()); }
// Convert big Endian value (e.g., encoded in log file) into a littleEndian uint64_t value.
const int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume<int64_t>()); }
const uint64_t consumeNetworkUInt64() { return bigEndian64(consume<uint64_t>()); }
int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume<int64_t>()); }
uint64_t consumeNetworkUInt64() { return bigEndian64(consume<uint64_t>()); }
bool eof() { return rptr == end; }
@ -114,6 +115,8 @@ public:
std::map<UID, RestoreApplierInterface> appliersInterf;
RestoreApplierInterface masterApplierInterf;
NotifiedVersion versionBatchId; // Continuously increase for each versionBatch
bool versionBatchStart = false;
uint32_t inProgressFlag = 0;
@ -135,4 +138,4 @@ public:
};
#include "flow/unactorcompiler.h"
#endif
#endif

View File

@ -34,6 +34,9 @@
#include <cstdint>
#include <cstdarg>
//#define SevFRMutationInfo SevVerbose
#define SevFRMutationInfo SevInfo
enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier };
BINARY_SERIALIZABLE(RestoreRole);
std::string getRoleStr(RestoreRole role);

View File

@ -1,5 +1,5 @@
/*
* Restore.actor.cpp
* RestoreWorker.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -98,8 +98,9 @@ ACTOR Future<Void> handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Refer
self->loaderInterf = RestoreLoaderInterface();
self->loaderInterf.get().initEndpoints();
RestoreLoaderInterface& recruited = self->loaderInterf.get();
DUMPTOKEN(recruited.setApplierKeyRangeVectorRequest);
DUMPTOKEN(recruited.initVersionBatch);
DUMPTOKEN(recruited.loadFile);
DUMPTOKEN(recruited.sendMutations);
DUMPTOKEN(recruited.collectRestoreRoleInterfaces);
DUMPTOKEN(recruited.finishRestore);
actors->add(restoreLoaderCore(self->loaderInterf.get(), req.nodeIndex, cx));
@ -183,7 +184,7 @@ void initRestoreWorkerConfig() {
opConfig.num_loaders = g_network->isSimulated() ? 3 : opConfig.num_loaders;
opConfig.num_appliers = g_network->isSimulated() ? 3 : opConfig.num_appliers;
opConfig.transactionBatchSizeThreshold =
g_network->isSimulated() ? 1 : opConfig.transactionBatchSizeThreshold; // Byte
g_network->isSimulated() ? 512 : opConfig.transactionBatchSizeThreshold; // Byte
TraceEvent("FastRestore")
.detail("InitOpConfig", "Result")
.detail("NumLoaders", opConfig.num_loaders)

View File

@ -34,7 +34,7 @@
#include <cstdint>
#include <cstdarg>
#include "fdbserver/RestoreWorkerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreCommon.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
@ -70,4 +70,4 @@ struct RestoreWorkerData : NonCopyable, public ReferenceCounted<RestoreWorkerDa
};
#include "flow/unactorcompiler.h"
#endif //FDBSERVER_RESTOREWORKER_H
#endif // FDBSERVER_RESTOREWORKER_H

View File

@ -50,6 +50,7 @@ struct ServerDBInfo {
LogSystemConfig logSystemConfig;
std::vector<UID> priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails
Optional<LatencyBandConfig> latencyBandConfig;
std::vector<std::pair<uint16_t,StorageServerInterface>> storageCaches;
explicit ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED) {}
@ -58,7 +59,7 @@ struct ServerDBInfo {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig);
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches);
}
};

View File

@ -43,7 +43,7 @@
#undef min
extern "C" int g_expect_full_pointermap;
extern const char* getHGVersion();
extern const char* getSourceVersion();
const int MACHINE_REBOOT_TIME = 10;
@ -232,7 +232,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<ClusterConnec
.detail("Excluded", process->excluded)
.detail("UsingSSL", sslEnabled);
TraceEvent("ProgramStart").detail("Cycles", cycles).detail("RandomId", randomId)
.detail("SourceVersion", getHGVersion())
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detail("DataFolder", *dataFolder)
@ -1254,6 +1254,13 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
int dcCoordinators = coordinatorCount / dataCenters + (dc < coordinatorCount%dataCenters);
printf("Datacenter %d: %d/%d machines, %d/%d coordinators\n", dc, machines, machineCount, dcCoordinators, coordinatorCount);
ASSERT( dcCoordinators <= machines );
//FIXME: temporarily code to test storage cache
//TODO: caching disabled for this merge
//if(dc==0) {
// machines++;
//}
int useSeedForMachine = deterministicRandom()->randomInt(0, machines);
Standalone<StringRef> zoneId;
Standalone<StringRef> newZoneId;
@ -1277,6 +1284,13 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
nonVersatileMachines++;
}
//FIXME: temporarily code to test storage cache
//TODO: caching disabled for this merge
//if(machine==machines-1 && dc==0) {
// processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource);
// nonVersatileMachines++;
//}
std::vector<IPAddress> ips;
for (int i = 0; i < processesPerMachine; i++) {
ips.push_back(makeIPAddressForSim(useIPv6, { 2, dc, deterministicRandom()->randomInt(1, i + 2), machine }));
@ -1395,8 +1409,6 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot
state int extraDB = 0;
state int minimumReplication = 0;
state int minimumRegions = 0;
state float timeout = 5400; // old default is 5400 seconds
state float buggify_timeout = 36000.0; // old default is 36000 seconds
checkExtraDB(testFile, extraDB, minimumReplication, minimumRegions);
// TODO (IPv6) Use IPv6?

View File

@ -1151,26 +1151,61 @@ ACTOR static Future<Void> consistencyCheckStatusFetcher(Database cx, JsonBuilder
return Void();
}
struct LogRangeAndUID {
KeyRange range;
UID destID;
LogRangeAndUID(KeyRange const& range, UID const& destID) : range(range), destID(destID) {}
bool operator < (LogRangeAndUID const& r) const {
if(range.begin != r.range.begin) return range.begin < r.range.begin;
if(range.end != r.range.end) return range.end < r.range.end;
return destID < r.destID;
}
};
ACTOR static Future<Void> logRangeWarningFetcher(Database cx, JsonBuilderArray *messages, std::set<std::string> *incomplete_reasons) {
try {
state Transaction tr(cx);
state Future<Void> timeoutFuture = timeoutError(Future<Void>(Never()), 5.0);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Standalone<RangeResultRef> existingDestUidValues = wait(timeoutError(tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY), 5.0));
std::set<std::pair<Key,Key>> existingRanges;
for(auto it : existingDestUidValues) {
KeyRange range = BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion());
std::pair<Key,Key> rangePair = std::make_pair(range.begin,range.end);
if(existingRanges.count(rangePair)) {
messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - `%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str()));
break;
}
existingRanges.insert(rangePair);
state Future<Standalone<RangeResultRef>> existingDestUidValues = tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY);
state Future<Standalone<RangeResultRef>> existingLogRanges = tr.getRange(logRangesRange, CLIENT_KNOBS->TOO_MANY);
wait( (success(existingDestUidValues) && success(existingLogRanges)) || timeoutFuture );
std::set<LogRangeAndUID> loggingRanges;
for(auto& it : existingLogRanges.get()) {
Key logDestination;
UID logUid;
KeyRef logRangeBegin = logRangesDecodeKey(it.key, &logUid);
Key logRangeEnd = logRangesDecodeValue(it.value, &logDestination);
loggingRanges.insert(LogRangeAndUID(KeyRangeRef(logRangeBegin, logRangeEnd), logUid));
}
std::set<std::pair<Key,Key>> existingRanges;
for(auto& it : existingDestUidValues.get()) {
KeyRange range = BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion());
UID logUid = BinaryReader::fromStringRef<UID>(it.value, Unversioned());
if(loggingRanges.count(LogRangeAndUID(range, logUid))) {
std::pair<Key,Key> rangePair = std::make_pair(range.begin,range.end);
if(existingRanges.count(rangePair)) {
messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - `%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str()));
break;
}
existingRanges.insert(rangePair);
} else {
//This cleanup is done during status, because it should only be required once after upgrading to 6.2.7 or later.
//There is no other good location to detect that the metadata is mismatched.
TraceEvent(SevWarnAlways, "CleaningDestUidLookup").detail("K", it.key.printable()).detail("V", it.value.printable());
tr.clear(it.key);
}
}
wait(tr.commit() || timeoutFuture);
break;
} catch(Error &e) {
if(e.code() == error_code_timed_out) {

File diff suppressed because it is too large Load Diff

View File

@ -344,6 +344,7 @@ struct TLogData : NonCopyable {
std::map<Tag, Version> toBePopped; // map of Tag->Version for all the pops
// that came when ignorePopRequest was set
Reference<AsyncVar<bool>> degraded;
std::vector<TagsAndMessage> tempTagMessages;
TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> dbInfo, Reference<AsyncVar<bool>> degraded, std::string folder)
: dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()),
@ -958,6 +959,81 @@ ACTOR Future<Void> updatePersistentData( TLogData* self, Reference<LogData> logD
return Void();
}
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest) {
TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
|| to > self->toBePopped[inputTag]) {
self->toBePopped[inputTag] = to;
}
// add the pop to the toBePopped map
TraceEvent(SevDebug, "IgnoringPopRequest")
.detail("IgnorePopDeadline", self->ignorePopDeadline)
.detail("Tag", inputTag.toString())
.detail("Version", to);
return Void();
}
state Version upTo = to;
int8_t tagLocality = inputTag.locality;
if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) {
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to);
tagLocality = tagLocalityLogRouter;
}
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
} else if (upTo > tagData->popped) {
tagData->popped = upTo;
tagData->poppedRecently = true;
tagData->requiresPoppedLocationUpdate = true;
if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) {
tagData->unpoppedRecovered = false;
logData->unpoppedRecoveredTags--;
TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt);
if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) {
logData->recoveryComplete.send(Void());
}
}
if (upTo > logData->persistentDataDurableVersion)
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
}
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
// timeout check for ignorePopRequest
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
TraceEvent("EnableTLogPlayAllIgnoredPops");
// use toBePopped and issue all the pops
std::map<Tag, Version>::iterator it;
vector<Future<Void>> ignoredPops;
self->ignorePopRequest = false;
self->ignorePopUid = "";
self->ignorePopDeadline = 0.0;
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
wait(waitForAll(ignoredPops));
TraceEvent("ResetIgnorePopRequest")
.detail("Now", g_network->now())
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("IgnorePopDeadline", self->ignorePopDeadline);
}
wait(tLogPopCore(self, req.tag, req.to, logData));
req.reply.send(Void());
return Void();
}
// This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources.
// For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important
// work (e.g. commits).
@ -977,6 +1053,26 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
state FlowLock::Releaser commitLockReleaser;
//FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries.
// It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag; which is not intended to ever happen.
Optional<Version> cachePopVersion;
for(auto& it : self->id_data) {
if(!it.second->stopped) {
if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) {
cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS;
}
break;
}
}
if(cachePopVersion.present()) {
state std::vector<Future<Void>> cachePopFutures;
for(auto& it : self->id_data) {
cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second));
}
wait( waitForAll(cachePopFutures) );
}
if(logData->stopped) {
if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) {
while(logData->persistentDataDurableVersion != logData->version.get()) {
@ -1178,13 +1274,13 @@ void commitMessages( TLogData* self, Reference<LogData> logData, Version version
void commitMessages( TLogData *self, Reference<LogData> logData, Version version, Arena arena, StringRef messages ) {
ArenaReader rd( arena, messages, Unversioned() );
std::vector<TagsAndMessage> msgs;
self->tempTagMessages.clear();
while(!rd.empty()) {
TagsAndMessage tagsAndMsg;
tagsAndMsg.loadFromArena(&rd, nullptr);
msgs.push_back(std::move(tagsAndMsg));
self->tempTagMessages.push_back(std::move(tagsAndMsg));
}
commitMessages(self, logData, version, msgs);
commitMessages(self, logData, version, self->tempTagMessages);
}
Version poppedVersion( Reference<LogData> self, Tag tag) {
@ -1207,81 +1303,6 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>> & getVersionMessages( Re
return tagData->versionMessages;
};
ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference<LogData> logData ) {
if (self->ignorePopRequest) {
TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline);
if (self->toBePopped.find(inputTag) == self->toBePopped.end()
|| to > self->toBePopped[inputTag]) {
self->toBePopped[inputTag] = to;
}
// add the pop to the toBePopped map
TraceEvent(SevDebug, "IgnoringPopRequest")
.detail("IgnorePopDeadline", self->ignorePopDeadline)
.detail("Tag", inputTag.toString())
.detail("Version", to);
return Void();
}
state Version upTo = to;
int8_t tagLocality = inputTag.locality;
if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) {
upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to);
tagLocality = tagLocalityLogRouter;
}
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
} else if (upTo > tagData->popped) {
tagData->popped = upTo;
tagData->poppedRecently = true;
tagData->requiresPoppedLocationUpdate = true;
if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) {
tagData->unpoppedRecovered = false;
logData->unpoppedRecoveredTags--;
TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt);
if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) {
logData->recoveryComplete.send(Void());
}
}
if (upTo > logData->persistentDataDurableVersion)
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
//TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo);
}
return Void();
}
ACTOR Future<Void> tLogPop( TLogData* self, TLogPopRequest req, Reference<LogData> logData ) {
// timeout check for ignorePopRequest
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) {
TraceEvent("EnableTLogPlayAllIgnoredPops");
// use toBePopped and issue all the pops
std::map<Tag, Version>::iterator it;
vector<Future<Void>> ignoredPops;
self->ignorePopRequest = false;
self->ignorePopUid = "";
self->ignorePopDeadline = 0.0;
for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) {
TraceEvent("PlayIgnoredPop")
.detail("Tag", it->first.toString())
.detail("Version", it->second);
ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData));
}
self->toBePopped.clear();
wait(waitForAll(ignoredPops));
TraceEvent("ResetIgnorePopRequest")
.detail("Now", g_network->now())
.detail("IgnorePopRequest", self->ignorePopRequest)
.detail("IgnorePopDeadline", self->ignorePopDeadline);
}
wait(tLogPopCore(self, req.tag, req.to, logData));
req.reply.send(Void());
return Void();
}
void peekMessagesFromMemory( Reference<LogData> self, TLogPeekRequest const& req, BinaryWriter& messages, Version& endVersion ) {
ASSERT( !messages.getLength() );
@ -1340,6 +1361,9 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw timed_out();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
@ -1630,6 +1654,7 @@ ACTOR Future<Void> doQueueCommit( TLogData* self, Reference<LogData> logData, st
self->queueCommitBegin = commitNumber;
logData->queueCommittingVersion = ver;
g_network->setCurrentTask(TaskPriority::TLogCommitReply);
Future<Void> c = self->persistentQueue->commit();
self->diskQueueCommitBytes = 0;
self->largeDiskQueueCommitBytes.set(false);
@ -2128,8 +2153,7 @@ void removeLog( TLogData* self, Reference<LogData> logData ) {
}
}
// copy data from old gene to new gene without desiarlzing
ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, std::vector<Tag> tags, Version beginVersion, Optional<Version> endVersion, bool poppedIsKnownCommitted, bool parallelGetMore ) {
ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, std::vector<Tag> tags, Version beginVersion, Optional<Version> endVersion, bool poppedIsKnownCommitted ) {
state Future<Void> dbInfoChange = Void();
state Reference<ILogSystem::IPeekCursor> r;
state Version tagAt = beginVersion;
@ -2147,7 +2171,7 @@ ACTOR Future<Void> pullAsyncData( TLogData* self, Reference<LogData> logData, st
}
when( wait( dbInfoChange ) ) {
if( logData->logSystem->get() ) {
r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, parallelGetMore );
r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, true );
} else {
r = Reference<ILogSystem::IPeekCursor>();
}
@ -2284,7 +2308,7 @@ ACTOR Future<Void> tLogCore( TLogData* self, Reference<LogData> logData, TLogInt
if(!logData->isPrimary) {
std::vector<Tag> tags;
tags.push_back(logData->remoteTag);
logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional<Version>(), true, true) );
logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional<Version>(), true) );
}
try {
@ -2679,10 +2703,10 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
logData->logRouterPopToVersion = req.recoverAt;
std::vector<Tag> tags;
tags.push_back(logData->remoteTag);
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true, false) || logData->removed);
wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) || logData->removed);
} else if(!req.recoverTags.empty()) {
ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion);
wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false, true) || logData->removed);
wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) || logData->removed);
}
pulledRecoveryVersions = true;
logData->knownCommittedVersion = req.recoverAt;
@ -2783,6 +2807,7 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ
self.sharedActors.send( commitQueue(&self) );
self.sharedActors.send( updateStorageLoop(&self) );
state Future<Void> activeSharedChange = Void();
loop {
choose {
@ -2795,7 +2820,7 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ
}
}
when ( wait( error ) ) { throw internal_error(); }
when ( wait( activeSharedTLog->onChange() ) ) {
when ( wait( activeSharedChange ) ) {
if (activeSharedTLog->get() == tlogId) {
TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
@ -2804,6 +2829,7 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ
TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) );
}
activeSharedChange = activeSharedTLog->onChange();
}
}
}

View File

@ -459,7 +459,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
foundSpecial = true;
}
if(log->isLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality ||
tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) {
tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) {
lastBegin = std::max(lastBegin, log->startVersion);
localSets.push_back(log);
if(log->locality != tagLocalitySatellite) {
@ -486,7 +486,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
int i = 0;
while(begin < lastBegin) {
if(i == oldLogData.size()) {
if(tag == txsTag || tag.locality == tagLocalityTxs) {
if(tag == txsTag || tag.locality == tagLocalityTxs || tag == cacheTag) {
break;
}
TraceEvent("TLogPeekAllDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size());
@ -502,7 +502,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
thisSpecial = true;
}
if(log->isLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality ||
tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) {
tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) {
thisBegin = std::max(thisBegin, log->startVersion);
localOldSets.push_back(log);
if(log->locality != tagLocalitySatellite) {
@ -538,7 +538,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
}
Reference<IPeekCursor> peekRemote( UID dbgid, Version begin, Tag tag, bool parallelGetMore ) {
Reference<IPeekCursor> peekRemote( UID dbgid, Version begin, Optional<Version> end, Tag tag, bool parallelGetMore ) {
int bestSet = -1;
Version lastBegin = recoveredAt.present() ? recoveredAt.get() + 1 : 0;
for(int t = 0; t < tLogs.size(); t++) {
@ -552,22 +552,22 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
}
if(bestSet == -1) {
TraceEvent("TLogPeekRemoteNoBestSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin);
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, false ) );
TraceEvent("TLogPeekRemoteNoBestSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd());
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) );
}
if(begin >= lastBegin) {
TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString());
return Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), false, std::vector<LocalityData>(), Reference<IReplicationPolicy>(), 0 ) );
TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString());
return Reference<ILogSystem::BufferedCursor>( new ILogSystem::BufferedCursor( tLogs[bestSet]->logRouters, tag, begin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore ) );
} else {
std::vector< Reference<ILogSystem::IPeekCursor> > cursors;
std::vector< LogMessageVersion > epochEnds;
TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString());
cursors.emplace_back(new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), false, std::vector<LocalityData>(), Reference<IReplicationPolicy>(), 0 ) );
TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString());
cursors.emplace_back(new ILogSystem::BufferedCursor( tLogs[bestSet]->logRouters, tag, lastBegin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore ) );
int i = 0;
while(begin < lastBegin) {
if(i == oldLogData.size()) {
TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size());
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, false ) );
TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size());
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) );
}
int bestOldSet = -1;
@ -583,15 +583,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
}
if(bestOldSet == -1) {
TraceEvent("TLogPeekRemoteNoOldBestSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin);
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, false ) );
TraceEvent("TLogPeekRemoteNoOldBestSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd());
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) );
}
if(thisBegin < lastBegin) {
TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString())
TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString())
.detail("LastBegin", lastBegin).detail("ThisBegin", thisBegin).detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion);
cursors.emplace_back(new ILogSystem::MergedPeekCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, -1, (int)oldLogData[i].tLogs[bestOldSet]->logRouters.size(), tag,
thisBegin, lastBegin, false, std::vector<LocalityData>(), Reference<IReplicationPolicy>(), 0));
cursors.emplace_back(new ILogSystem::BufferedCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, tag, thisBegin, lastBegin, parallelGetMore));
epochEnds.emplace_back(lastBegin);
lastBegin = thisBegin;
}
@ -602,14 +601,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
}
virtual Reference<IPeekCursor> peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore ) {
virtual Reference<IPeekCursor> peek( UID dbgid, Version begin, Optional<Version> end, Tag tag, bool parallelGetMore ) {
if(!tLogs.size()) {
TraceEvent("TLogPeekNoLogSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin);
return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, false ) );
}
if(tag.locality == tagLocalityRemoteLog) {
return peekRemote(dbgid, begin, tag, parallelGetMore);
return peekRemote(dbgid, begin, end, tag, parallelGetMore);
} else {
return peekAll(dbgid, begin, getPeekEnd(), tag, parallelGetMore);
}
@ -622,12 +621,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
if(tags.size() == 1) {
return peek(dbgid, begin, tags[0], parallelGetMore);
return peek(dbgid, begin, end, tags[0], parallelGetMore);
}
std::vector< Reference<ILogSystem::IPeekCursor> > cursors;
for(auto tag : tags) {
cursors.push_back(peek(dbgid, begin, tag, parallelGetMore));
cursors.push_back(peek(dbgid, begin, end, tag, parallelGetMore));
}
return Reference<ILogSystem::BufferedCursor>( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? end.get() + 1 : getPeekEnd(), true, tLogs[0]->locality == tagLocalityUpgraded, false) );
}
@ -1033,7 +1032,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
ACTOR static Future<Void> popFromLog( TagPartitionedLogSystem* self, Reference<AsyncVar<OptionalInterface<TLogInterface>>> log, Tag tag, double time ) {
state Version last = 0;
loop {
wait( delay(time) );
wait( delay(time, TaskPriority::TLogPop) );
state std::pair<Version,Version> to = self->outstandingPops[ std::make_pair(log->get().id(),tag) ];
@ -1045,7 +1044,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
try {
if( !log->get().present() )
return Void();
wait(log->get().interf().popMessages.getReply( TLogPopRequest( to.first, to.second, tag ) ) );
wait(log->get().interf().popMessages.getReply( TLogPopRequest( to.first, to.second, tag ), TaskPriority::TLogPop ) );
last = to.first;
} catch (Error& e) {
@ -1270,7 +1269,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
return std::numeric_limits<Version>::max();
}
virtual void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations) {
virtual void getPushLocations(VectorRef<Tag> tags, std::vector<int>& locations, bool allLocations) {
int locationOffset = 0;
for(auto& log : tLogs) {
if(log->isLocal && log->logServers.size()) {
@ -1907,7 +1906,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
std::vector<int> locations;
for( Tag tag : localTags ) {
locations.clear();
logSet->getPushLocations( vector<Tag>(1, tag), locations, 0 );
logSet->getPushLocations( VectorRef<Tag>(&tag, 1), locations, 0 );
for(int loc : locations)
remoteTLogReqs[ loc ].recoverTags.push_back( tag );
}
@ -1923,7 +1922,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i);
Tag pushTag = (i==-1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i%self->txsTags);
locations.clear();
logSet->getPushLocations( {pushTag}, locations, 0 );
logSet->getPushLocations( VectorRef<Tag>(&pushTag, 1), locations, 0 );
for(int loc : locations)
remoteTLogReqs[ loc ].recoverTags.push_back( tag );
}
@ -2117,7 +2116,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
std::vector<int> locations;
for( Tag tag : localTags ) {
locations.clear();
logSystem->tLogs[0]->getPushLocations( vector<Tag>(1, tag), locations, 0 );
logSystem->tLogs[0]->getPushLocations( VectorRef<Tag>(&tag, 1), locations, 0 );
for(int loc : locations)
reqs[ loc ].recoverTags.push_back( tag );
}
@ -2131,7 +2130,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i);
Tag pushTag = (i==-1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i%logSystem->txsTags);
locations.clear();
logSystem->tLogs[0]->getPushLocations( vector<Tag>(1, pushTag), locations, 0 );
logSystem->tLogs[0]->getPushLocations( VectorRef<Tag>(&pushTag, 1), locations, 0 );
for(int loc : locations)
reqs[ loc ].recoverTags.push_back( tag );
}
@ -2183,7 +2182,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
// are the preferred location for id%logRouterTags.
Tag pushLocation = Tag(tagLocalityLogRouter, i%logSystem->logRouterTags);
locations.clear();
logSystem->tLogs[1]->getPushLocations( {pushLocation}, locations, 0 );
logSystem->tLogs[1]->getPushLocations( VectorRef<Tag>(&pushLocation,1), locations, 0 );
for(int loc : locations)
sreqs[ loc ].recoverTags.push_back( tag );
}
@ -2193,7 +2192,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i);
Tag pushTag = (i==-1 || nonShardedTxs) ? txsTag : Tag(tagLocalityTxs, i%logSystem->txsTags);
locations.clear();
logSystem->tLogs[1]->getPushLocations( {pushTag}, locations, 0 );
logSystem->tLogs[1]->getPushLocations( VectorRef<Tag>(&pushTag,1), locations, 0 );
for(int loc : locations)
sreqs[ loc ].recoverTags.push_back( tag );
}

File diff suppressed because it is too large Load Diff

View File

@ -386,6 +386,7 @@ struct Role {
static const Role LOG_ROUTER;
static const Role DATA_DISTRIBUTOR;
static const Role RATEKEEPER;
static const Role STORAGE_CACHE;
static const Role COORDINATOR;
std::string roleName;
@ -455,6 +456,7 @@ ACTOR Future<Void> logRouter(TLogInterface interf, InitializeLogRouterRequest re
Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> dataDistributor(DataDistributorInterface ddi, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> ratekeeper(RatekeeperInterface rki, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> storageCache(StorageServerInterface interf, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db);
void registerThreadForProfiling();
void updateCpuProfiler(ProfilerRequest req);

View File

@ -34,7 +34,7 @@
#include "fdbclient/FailureMonitorClient.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/RestoreWorkerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/ClusterRecruitmentInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/MoveKeys.actor.h"
@ -183,7 +183,7 @@ extern void createTemplateDatabase();
// FIXME: this really belongs in a header somewhere since it is actually used.
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs);
extern const char* getHGVersion();
extern const char* getSourceVersion();
extern void flushTraceFileVoid();
@ -518,7 +518,7 @@ void* parentWatcher(void *arg) {
static void printVersion() {
printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n");
printf("source version %s\n", getHGVersion());
printf("source version %s\n", getSourceVersion());
printf("protocol %" PRIx64 "\n", currentProtocolVersion.version());
}
@ -1672,7 +1672,7 @@ int main(int argc, char* argv[]) {
TraceEvent("ProgramStart")
.setMaxEventLength(12000)
.detail("RandomSeed", opts.randomSeed)
.detail("SourceVersion", getHGVersion())
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detail("FileSystem", opts.fileSystemPath)

View File

@ -46,7 +46,6 @@
<ActorCompiler Include="KeyValueStoreMemory.actor.cpp" />
<ActorCompiler Include="SimulatedCluster.actor.cpp" />
<ActorCompiler Include="KeyValueStoreCompressTestData.actor.cpp" />
<ActorCompiler Include="IndirectShadowPager.actor.cpp" />
<ClCompile Include="Knobs.cpp" />
<ActorCompiler Include="FDBExecHelper.actor.cpp" />
<ActorCompiler Include="QuietDatabase.actor.cpp" />
@ -54,6 +53,7 @@
<ActorCompiler Include="workloads\Unreadable.actor.cpp" />
<ActorCompiler Include="workloads\SaveAndKill.actor.cpp" />
<ActorCompiler Include="Resolver.actor.cpp" />
<ActorCompiler Include="StorageCache.actor.cpp" />
<ActorCompiler Include="RestoreWorker.actor.cpp" />
<ActorCompiler Include="RestoreUtil.actor.cpp" />
<ActorCompiler Include="RestoreCommon.actor.cpp" />
@ -63,7 +63,6 @@
<ActorCompiler Include="RestoreApplier.actor.cpp" />
<ActorCompiler Include="LogSystemDiskQueueAdapter.actor.cpp" />
<ActorCompiler Include="LogSystemPeekCursor.actor.cpp" />
<ActorCompiler Include="MemoryPager.actor.cpp" />
<ActorCompiler Include="LogRouter.actor.cpp" />
<ClCompile Include="LatencyBandConfig.cpp" />
<ActorCompiler Include="OldTLogServer_4_6.actor.cpp" />
@ -188,7 +187,6 @@
</ActorCompiler>
<ClInclude Include="IDiskQueue.h" />
<ClInclude Include="IKeyValueStore.h" />
<ClInclude Include="IndirectShadowPager.h" />
<ClInclude Include="IPager.h" />
<ClInclude Include="IVersionedStore.h" />
<ClInclude Include="LatencyBandConfig.h" />
@ -198,7 +196,6 @@
<ClInclude Include="LogSystemConfig.h" />
<ClInclude Include="LogSystemDiskQueueAdapter.h" />
<ClInclude Include="MasterInterface.h" />
<ClInclude Include="MemoryPager.h" />
<ActorCompiler Include="MoveKeys.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>

View File

@ -197,6 +197,7 @@
<Filter>workloads</Filter>
</ActorCompiler>
<ActorCompiler Include="Resolver.actor.cpp" />
<ActorCompiler Include="StorageCache.actor.cpp" />
<ActorCompiler Include="LogSystemDiskQueueAdapter.actor.cpp" />
<ActorCompiler Include="Orderer.actor.h" />
<ActorCompiler Include="workloads\DiskDurabilityTest.actor.cpp">
@ -274,8 +275,6 @@
<ActorCompiler Include="workloads\AtomicRestore.actor.cpp">
<Filter>workloads</Filter>
</ActorCompiler>
<ActorCompiler Include="MemoryPager.actor.cpp" />
<ActorCompiler Include="IndirectShadowPager.actor.cpp" />
<ActorCompiler Include="OldTLogServer.actor.cpp" />
<ActorCompiler Include="LogRouter.actor.cpp" />
<ActorCompiler Include="workloads\SlowTaskWorkload.actor.cpp">
@ -330,7 +329,6 @@
<ClInclude Include="pubsub.h" />
<ClInclude Include="Knobs.h" />
<ClInclude Include="WorkerInterface.h" />
<ClInclude Include="RestoreWorkerInterface.h" />
<ClInclude Include="RestoreCommon.actor.h" />
<ClInclude Include="WaitFailure.h" />
<ClInclude Include="TesterInterface.actor.h" />
@ -387,8 +385,6 @@
<ClInclude Include="LogProtocolMessage.h" />
<ClInclude Include="IPager.h" />
<ClInclude Include="IVersionedStore.h" />
<ClInclude Include="MemoryPager.h" />
<ClInclude Include="IndirectShadowPager.h" />
<ClInclude Include="template_fdb.h" />
<ClInclude Include="LatencyBandConfig.h" />
</ItemGroup>

View File

@ -684,6 +684,9 @@ ACTOR Future<Void> readTransactionSystemState( Reference<MasterData> self, Refer
Standalone<VectorRef<KeyValueRef>> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) );
self->allTags.clear();
if(self->lastEpochEnd > 0) {
self->allTags.push_back(cacheTag);
}
if(self->forceRecovery) {
self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality;
@ -1345,6 +1348,15 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccf->getConnectionString().toString());
tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue());
tr.set(recoveryCommitRequest.arena, primaryDatacenterKey, self->myInterface.locality.dcId().present() ? self->myInterface.locality.dcId().get() : StringRef());
//FIXME: remove this code, caching the entire normal keyspace as a test of functionality
//TODO: caching disabled for this merge
//tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.begin), storageCacheValue({0}));
//tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.end), storageCacheValue({}));
//tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.begin), serverKeysTrue);
//tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.end), serverKeysFalse);
//tr.set(recoveryCommitRequest.arena, cacheChangeKeyFor(0), BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned()));
//tr.set(recoveryCommitRequest.arena, cacheChangeKey, BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned()));
tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys);
for(auto& dc : self->primaryDcId) {
@ -1356,7 +1368,7 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
}
}
applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, NULL, NULL);
applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, nullptr, nullptr);
mmApplied = tr.mutations.size();
tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial window of the resolver(s)

View File

@ -75,25 +75,6 @@ inline bool canReplyWith(Error e) {
};
}
struct StorageServer;
class ValueOrClearToRef {
public:
static ValueOrClearToRef value(ValueRef const& v) { return ValueOrClearToRef(v, false); }
static ValueOrClearToRef clearTo(KeyRef const& k) { return ValueOrClearToRef(k, true); }
bool isValue() const { return !isClear; };
bool isClearTo() const { return isClear; }
ValueRef const& getValue() const { ASSERT( isValue() ); return item; };
KeyRef const& getEndKey() const { ASSERT(isClearTo()); return item; };
private:
ValueOrClearToRef( StringRef item, bool isClear ) : item(item), isClear(isClear) {}
StringRef item;
bool isClear;
};
struct AddingShard : NonCopyable {
KeyRange keys;
Future<Void> fetchClient; // holds FetchKeys() actor
@ -390,6 +371,8 @@ public:
KeyRangeMap< Reference<ShardInfo> > shards;
uint64_t shardChangeCounter; // max( shards->changecounter )
KeyRangeMap <bool> cachedRangeMap; // indicates if a key-range is being cached
// newestAvailableVersion[k]
// == invalidVersion -> k is unavailable at all versions
// <= storageVersion -> k is unavailable at all versions (but might be read anyway from storage if we are in the process of committing makeShardDurable)
@ -516,6 +499,8 @@ public:
specialCounter(cc, "VersionLag", [self](){ return self->versionLag; });
specialCounter(cc, "LocalRate", [self]{ return self->currentRate() * 100; });
specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); });
specialCounter(cc, "FetchKeysFetchActive", [self](){ return self->fetchKeysParallelismLock.activePermits(); });
specialCounter(cc, "FetchKeysWaiting", [self](){ return self->fetchKeysParallelismLock.waiters(); });
@ -890,9 +875,10 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
}
StorageMetrics metrics;
metrics.bytesReadPerKSecond = v.present() ? std::max((int64_t)(req.key.size() + v.get().size()),
SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE)
: SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE;
// If the read yields no value, randomly sample the empty read.
metrics.bytesReadPerKSecond =
v.present() ? std::max((int64_t)(req.key.size() + v.get().size()), SERVER_KNOBS->EMPTY_READ_PENALTY)
: SERVER_KNOBS->EMPTY_READ_PENALTY;
data->metrics.notify(req.key, metrics);
if( req.debugID.present() )
@ -1082,7 +1068,6 @@ void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output
ASSERT( output.size() <= originalLimit );
}
// readRange reads up to |limit| rows from the given range and version, combining data->storage and data->versionedData.
// If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending).
// readRange has O(|result|) + O(log |data|) cost
ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes ) {
@ -1100,6 +1085,12 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
//state int originalLimitBytes = *pLimitBytes;
//state bool track = rrid.first() == 0x1bc134c2f752187cLL;
// Check if the desired key-range intersects the cached key-ranges
// TODO Find a more efficient way to do it
// TODO Also need this check in single key/value lookup
auto cached = data->cachedRangeMap.intersectingRanges(range);
result.cached = (cached.begin() != cached.end());
// FIXME: Review pLimitBytes behavior
// if (limit >= 0) we are reading forward, else backward
@ -1271,15 +1262,15 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
result.more = limit == 0 || *pLimitBytes<=0; // FIXME: Does this have to be exact?
result.version = version;
StorageMetrics metrics;
metrics.bytesReadPerKSecond = std::max(readSize, SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE);
metrics.bytesReadPerKSecond = std::max(readSize, SERVER_KNOBS->EMPTY_READ_PENALTY);
data->metrics.notify(limit >= 0 ? range.begin : range.end, metrics);
return result;
}
bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) {
//bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) {
// Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef
return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end);
}
// return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end);
//}
ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset)
// Attempts to find the key indicated by sel in the data at version, within range.
@ -1327,14 +1318,13 @@ ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version vers
*pOffset = 0;
StorageMetrics metrics;
metrics.bytesReadPerKSecond =
std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE);
metrics.bytesReadPerKSecond = std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->EMPTY_READ_PENALTY);
data->metrics.notify(sel.getKey(), metrics);
return rep.data[ index ].key;
} else {
StorageMetrics metrics;
metrics.bytesReadPerKSecond = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE;
metrics.bytesReadPerKSecond = SERVER_KNOBS->EMPTY_READ_PENALTY;
data->metrics.notify(sel.getKey(), metrics);
// FIXME: If range.begin=="" && !forward, return success?
@ -1466,7 +1456,7 @@ ACTOR Future<Void> getKeyValues( StorageServer* data, GetKeyValuesRequest req )
for (int i = 0; i < r.data.size(); i++) {
StorageMetrics m;
m.bytesReadPerKSecond = r.data[i].expectedSize();
m.bytesReadPerKSecond = std::max((int64_t)r.data[i].expectedSize(), SERVER_KNOBS->EMPTY_READ_PENALTY);
data->metrics.notify(r.data[i].key, m);
}
@ -1772,11 +1762,6 @@ bool expandMutation( MutationRef& m, StorageServer::VersionedData const& data, U
return true;
}
bool isClearContaining( StorageServer::VersionedData::ViewAtVersion const& view, KeyRef key ) {
auto i = view.lastLessOrEqual(key);
return i && i->isClearTo() && i->getEndKey() > key;
}
void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, StorageServer::VersionedData &data ) {
// m is expected to be in arena already
// Clear split keys are added to arena
@ -1806,7 +1791,7 @@ void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, Sto
} else if (m.type == MutationRef::ClearRange) {
data.erase( m.param1, m.param2 );
ASSERT( m.param2 > m.param1 );
ASSERT( !isClearContaining( data.atLatest(), m.param1 ) );
ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) );
data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) );
self->watches.triggerRange( m.param1, m.param2 );
}
@ -2461,6 +2446,8 @@ void StorageServer::addMutation(Version version, MutationRef const& mutation, Ke
printf(" eager: %s\n", printable( eagerReads->getKeyEnd( mutation.param2 ) ).c_str() );
}
applyMutation( this, expanded, mLog.arena(), mutableData() );
//printf("\nSSUpdate: Printing versioned tree after applying mutation\n");
//mutableData().printTree(version);
}
struct OrderByVersion {
@ -2490,8 +2477,8 @@ static const KeyRef persistPrimaryLocality = LiteralStringRef( PERSIST_PREFIX "P
class StorageUpdater {
public:
StorageUpdater() : fromVersion(invalidVersion), currentVersion(invalidVersion), restoredVersion(invalidVersion), processedStartKey(false) {}
StorageUpdater(Version fromVersion, Version restoredVersion) : fromVersion(fromVersion), currentVersion(fromVersion), restoredVersion(restoredVersion), processedStartKey(false) {}
StorageUpdater() : fromVersion(invalidVersion), currentVersion(invalidVersion), restoredVersion(invalidVersion), processedStartKey(false), processedCacheStartKey(false) {}
StorageUpdater(Version fromVersion, Version restoredVersion) : fromVersion(fromVersion), currentVersion(fromVersion), restoredVersion(restoredVersion), processedStartKey(false), processedCacheStartKey(false) {}
void applyMutation(StorageServer* data, MutationRef const& m, Version ver) {
//TraceEvent("SSNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver);
@ -2503,8 +2490,12 @@ public:
}
if (m.param1.startsWith( systemKeys.end )) {
//TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver);
applyPrivateData( data, m );
if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix))
applyPrivateCacheData( data, m);
else {
//TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver);
applyPrivateData( data, m );
}
} else {
// FIXME: enable when debugMutation is active
//for(auto m = changes[c].mutations.begin(); m; ++m) {
@ -2526,6 +2517,9 @@ private:
bool nowAssigned;
bool processedStartKey;
KeyRef cacheStartKey;
bool processedCacheStartKey;
void applyPrivateData( StorageServer* data, MutationRef const& m ) {
TraceEvent(SevDebug, "SSPrivateMutation", data->thisServerID).detail("Mutation", m.toString());
@ -2586,6 +2580,37 @@ private:
ASSERT(false); // Unknown private mutation
}
}
void applyPrivateCacheData( StorageServer* data, MutationRef const& m ) {
TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
if (processedCacheStartKey) {
// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
ASSERT((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix));
KeyRangeRef keys( cacheStartKey.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ),
m.param1.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ));
data->cachedRangeMap.insert(keys, true);
//TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Begin", keys.begin).detail("End", keys.end);
//fprintf(stderr, "applyPrivateCacheData : begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str());
//Figure out the affected shard ranges and maintain the cached key-range information in the in-memory map
// TODO revisit- we are not splitting the cached ranges based on shards as of now.
if (0) {
auto cachedRanges = data->shards.intersectingRanges(keys);
for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) {
KeyRangeRef intersectingRange = shard.range() & keys;
data->cachedRangeMap.insert(KeyRangeRef(intersectingRange.begin, intersectingRange.end), true);
}
}
processedStartKey = false;
} else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) {
// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
cacheStartKey = m.param1;
processedCacheStartKey = true;
} else {
ASSERT(false); // Unknown private mutation
}
}
};
ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )

View File

@ -360,6 +360,7 @@ ACTOR Future<Void> pingDatabase( Database cx ) {
loop {
try {
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
tr.setOption( FDBTransactionOptions::LOCK_AWARE );
Optional<Value> v = wait( tr.get( StringRef("/Liveness/" + deterministicRandom()->randomUniqueID().toString() ) ) );
tr.makeSelfConflicting();
wait( tr.commit() );
@ -1092,11 +1093,12 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle
// do we handle a failure here?
}
printf("\n%d tests passed; %d tests failed, waiting for DD to end...\n\n", passCount, failCount);
printf("\n%d tests passed; %d tests failed.\n", passCount, failCount);
//If the database was deleted during the workload we need to recreate the database
if(tests.empty() || useDB) {
if(waitForQuiescenceEnd) {
printf("Waiting for DD to end...\n");
try {
wait(quietDatabase(cx, dbInfo, "End", 0, 2e6, 2e6) ||
(databasePingDelay == 0.0 ? Never()
@ -1107,6 +1109,7 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle
}
}
}
printf("\n");
return Void();
}

Some files were not shown because too many files have changed in this diff Show More