Merge branch 'master' into feature-small-endpoint

# Conflicts:
#	fdbclient/StorageServerInterface.h
This commit is contained in:
Evan Tschannen 2020-05-08 16:37:35 -07:00
commit f0f52fb2be
125 changed files with 2763 additions and 1209 deletions

View File

@ -449,7 +449,7 @@ FDBFuture* fdb_transaction_get_range_impl(
/* _ITERATOR mode maps to one of the known streaming modes
depending on iteration */
static const int mode_bytes_array[] = {CLIENT_KNOBS->BYTE_LIMIT_UNLIMITED, 256, 1000, 4096, 80000};
const int mode_bytes_array[] = { CLIENT_KNOBS->BYTE_LIMIT_UNLIMITED, 256, 1000, 4096, 80000 };
/* The progression used for FDB_STREAMING_MODE_ITERATOR.
Goes from small -> medium -> large. Then 1.5 * previous until serial. */

View File

@ -18,6 +18,12 @@ set(SRCS
add_flow_target(STATIC_LIBRARY NAME fdb_flow SRCS ${SRCS})
target_link_libraries(fdb_flow PUBLIC fdb_c)
target_include_directories(fdb_flow PUBLIC
"${CMAKE_CURRENT_BINARY_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/tester"
"${CMAKE_CURRENT_BINARY_DIR}/tester"
)
add_subdirectory(tester)

View File

@ -34,11 +34,11 @@
#include "bindings/flow/DirectoryLayer.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#define LOG_ALL 0
#define LOG_INSTRUCTIONS LOG_ALL || 0
#define LOG_OPS LOG_ALL || 0
#define LOG_DIRS LOG_ALL || 0
#define LOG_ERRORS LOG_ALL || 0
constexpr bool LOG_ALL = false;
constexpr bool LOG_INSTRUCTIONS = LOG_ALL || false;
constexpr bool LOG_OPS = LOG_ALL || false;
constexpr bool LOG_DIRS = LOG_ALL || false;
constexpr bool LOG_ERRORS = LOG_ALL || false;
struct FlowTesterData;

View File

@ -88,13 +88,20 @@ func (o NetworkOptions) SetTraceFormat(param string) error {
return o.setOpt(34, []byte(param))
}
// Select clock source for trace files. now (default) or realtime are supported.
// Select clock source for trace files. now (the default) or realtime are supported.
//
// Parameter: Trace clock source
func (o NetworkOptions) SetTraceClockSource(param string) error {
return o.setOpt(35, []byte(param))
}
// Once provided, this string will be used to replace the port/PID in the log file names.
//
// Parameter: The identifier that will be part of all trace file names
func (o NetworkOptions) SetTraceFileIdentifier(param string) error {
return o.setOpt(36, []byte(param))
}
// Set internal tuning or debugging knobs
//
// Parameter: knob_name=knob_value
@ -223,11 +230,16 @@ func (o NetworkOptions) SetDisableClientStatisticsLogging() error {
return o.setOpt(70, nil)
}
// Enables debugging feature to perform slow task profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production.
// Deprecated
func (o NetworkOptions) SetEnableSlowTaskProfiling() error {
return o.setOpt(71, nil)
}
// Enables debugging feature to perform run loop profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production.
func (o NetworkOptions) SetEnableRunLoopProfiling() error {
return o.setOpt(71, nil)
}
// Enable client buggify - will make requests randomly fail (intended for client testing)
func (o NetworkOptions) SetClientBuggifyEnable() error {
return o.setOpt(80, nil)
@ -441,6 +453,11 @@ func (o TransactionOptions) SetTransactionLoggingMaxFieldLength(param int64) err
return o.setOpt(405, int64ToBytes(param))
}
// Sets an identifier for server tracing of this transaction. When committed, this identifier triggers logging when each part of the transaction authority encounters it, which is helpful in diagnosing slowness in misbehaving clusters. The identifier is randomly generated. When there is also a debug_transaction_identifier, both IDs are logged together.
func (o TransactionOptions) SetServerRequestTracing() error {
return o.setOpt(406, nil)
}
// Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option.
//
// Parameter: value in milliseconds of timeout
@ -499,6 +516,11 @@ func (o TransactionOptions) SetUseProvisionalProxies() error {
return o.setOpt(711, nil)
}
// The transaction can retrieve keys that are conflicting with other transactions.
func (o TransactionOptions) SetReportConflictingKeys() error {
return o.setOpt(712, nil)
}
type StreamingMode int
const (
@ -636,15 +658,15 @@ type ErrorPredicate int
const (
// Returns ``true`` if the error indicates the operations in the
// transactions should be retried because of transient error.
// Returns ``true`` if the error indicates the operations in the transactions
// should be retried because of transient error.
ErrorPredicateRetryable ErrorPredicate = 50000
// Returns ``true`` if the error indicates the transaction may have
// succeeded, though not in a way the system can verify.
// Returns ``true`` if the error indicates the transaction may have succeeded,
// though not in a way the system can verify.
ErrorPredicateMaybeCommitted ErrorPredicate = 50001
// Returns ``true`` if the error indicates the transaction has not
// committed, though in a way that can be retried.
// Returns ``true`` if the error indicates the transaction has not committed,
// though in a way that can be retried.
ErrorPredicateRetryableNotCommitted ErrorPredicate = 50002
)

View File

@ -140,7 +140,7 @@ set_target_properties(java_workloads PROPERTIES
target_link_libraries(java_workloads PUBLIC fdb_c ${JNI_LIBRARIES})
target_include_directories(java_workloads PUBLIC ${JNI_INCLUDE_DIRS})
set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.8" "-target" "1.8")
set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.8" "-target" "1.8" "-XDignore.symbol.file")
set(CMAKE_JNI_TARGET TRUE)
# build a manifest file

View File

@ -123,6 +123,7 @@ function(add_fdb_test)
-b ${PROJECT_BINARY_DIR}
-t ${test_type}
-O ${OLD_FDBSERVER_BINARY}
--crash
--aggregate-traces ${TEST_AGGREGATE_TRACES}
--log-format ${TEST_LOG_FORMAT}
--keep-logs ${TEST_KEEP_LOGS}

View File

@ -8,6 +8,7 @@ env_set(ALLOC_INSTRUMENTATION OFF BOOL "Instrument alloc")
env_set(WITH_UNDODB OFF BOOL "Use rr or undodb")
env_set(USE_ASAN OFF BOOL "Compile with address sanitizer")
env_set(USE_UBSAN OFF BOOL "Compile with undefined behavior sanitizer")
env_set(USE_TSAN OFF BOOL "Compile with thread sanitizer")
env_set(FDB_RELEASE OFF BOOL "This is a building of a final release")
env_set(USE_CCACHE OFF BOOL "Use ccache for compilation if available")
env_set(RELATIVE_DEBUG_PATHS OFF BOOL "Use relative file paths in debug info")
@ -57,7 +58,7 @@ if(FDB_RELEASE)
endif()
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CMAKE_BINARY_DIR})
if (NOT OPEN_FOR_IDE)
add_definitions(-DNO_INTELLISENSE)
endif()
@ -81,6 +82,7 @@ include(CheckFunctionExists)
set(CMAKE_REQUIRED_INCLUDES stdlib.h malloc.h)
set(CMAKE_REQUIRED_LIBRARIES c)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_C_STANDARD 11)
if(WIN32)
# see: https://docs.microsoft.com/en-us/windows/desktop/WinProg/using-the-windows-headers
@ -164,6 +166,15 @@ else()
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined ${CMAKE_THREAD_LIBS_INIT}")
endif()
if(USE_TSAN)
add_compile_options(
-fsanitize=thread
-DUSE_SANITIZER)
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -fsanitize=thread")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=thread")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread ${CMAKE_THREAD_LIBS_INIT}")
endif()
if(PORTABLE_BINARY)
message(STATUS "Create a more portable binary")
set(CMAKE_MODULE_LINKER_FLAGS "-static-libstdc++ -static-libgcc ${CMAKE_MODULE_LINKER_FLAGS}")

View File

@ -237,5 +237,4 @@ function(add_flow_target)
strip_debug_symbols(${AFT_NAME})
endif()
endif()
target_include_directories(${AFT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
endfunction()

View File

@ -211,6 +211,12 @@ set(CPACK_PACKAGE_CHECKSUM SHA256)
configure_file("${CMAKE_SOURCE_DIR}/cmake/CPackConfig.cmake" "${CMAKE_BINARY_DIR}/packaging/CPackConfig.cmake")
set(CPACK_PROJECT_CONFIG_FILE "${CMAKE_BINARY_DIR}/packaging/CPackConfig.cmake")
################################################################################
# User config
################################################################################
set(GENERATE_DEBUG_PACKAGES "${FDB_RELEASE}" CACHE BOOL "Build debug rpm/deb packages (default: only ON for FDB_RELEASE)")
################################################################################
# Version information
################################################################################
@ -337,7 +343,7 @@ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
"/lib/systemd"
"/lib/systemd/system"
"/etc/rc.d/init.d")
set(CPACK_RPM_DEBUGINFO_PACKAGE ON)
set(CPACK_RPM_DEBUGINFO_PACKAGE ${GENERATE_DEBUG_PACKAGES})
#set(CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX /usr/src)
set(CPACK_RPM_COMPONENT_INSTALL ON)
@ -382,7 +388,7 @@ set(CPACK_RPM_SERVER-EL7_PACKAGE_REQUIRES
set(CPACK_DEBIAN_CLIENTS-DEB_FILE_NAME "${deb-clients-filename}_amd64.deb")
set(CPACK_DEBIAN_SERVER-DEB_FILE_NAME "${deb-server-filename}_amd64.deb")
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_DEBIAN_DEBUGINFO_PACKAGE ON)
set(CPACK_DEBIAN_DEBUGINFO_PACKAGE ${GENERATE_DEBUG_PACKAGES})
set(CPACK_DEBIAN_PACKAGE_SECTION "database")
set(CPACK_DEBIAN_ENABLE_COMPONENT_DEPENDS ON)

View File

@ -45,11 +45,12 @@ PROTOCOL_VERSION_5_2 = 0x0FDB00A552000001
PROTOCOL_VERSION_6_0 = 0x0FDB00A570010001
PROTOCOL_VERSION_6_1 = 0x0FDB00B061060001
PROTOCOL_VERSION_6_2 = 0x0FDB00B062010001
PROTOCOL_VERSION_6_3 = 0x0FDB00B063010001
supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_6_0, PROTOCOL_VERSION_6_1,
PROTOCOL_VERSION_6_2])
PROTOCOL_VERSION_6_2, PROTOCOL_VERSION_6_3])
fdb.api_version(600)
fdb.api_version(520)
BASIC_FORMAT = "%(asctime)s - %(levelname)-8s %(message)s"
LOG_PATH = "transaction_profiling_analyzer.log"
@ -180,7 +181,8 @@ class GetVersionInfo(BaseInfo):
self.latency = bb.get_double()
if protocol_version >= PROTOCOL_VERSION_6_2:
self.transaction_priority_type = bb.get_int()
if protocol_version >= PROTOCOL_VERSION_6_3:
self.read_version = bb.get_long()
class GetInfo(BaseInfo):
def __init__(self, bb):
@ -205,6 +207,8 @@ class CommitInfo(BaseInfo):
self.num_mutations = bb.get_int()
self.commit_bytes = bb.get_int()
if protocol_version >= PROTOCOL_VERSION_6_3:
self.commit_version = bb.get_long()
read_conflict_range = bb.get_key_range_list()
if full_output:
self.read_conflict_range = read_conflict_range

View File

@ -837,6 +837,8 @@ Transaction options
.. warning:: |option-priority-system-immediate-warning|
.. _api-python-option-set-causal-read-risky:
.. method:: Transaction.options.set_causal_read_risky
|option-causal-read-risky-blurb|

View File

@ -218,7 +218,7 @@ The ``start`` subcommand is used to start a backup. If there is already a backu
::
user@host$ fdbbackup start [-t <TAG>] -d <BACKUP_URL> [-z] [-s <DURATION>] [-w] [-k '<BEGIN>[ <END>]']...
user@host$ fdbbackup start [-t <TAG>] -d <BACKUP_URL> [-z] [-s <DURATION>] [--partitioned_log_experimental] [-w] [-k '<BEGIN>[ <END>]']...
``-z``
Perform the backup continuously rather than terminating once a restorable backup is achieved. Database mutations within the backup's target key ranges will be continuously written to the backup as well as repeated inconsistent snapshots at the configured snapshot rate.
@ -226,6 +226,9 @@ The ``start`` subcommand is used to start a backup. If there is already a backu
``-s <DURATION>`` or ``--snapshot_interval <DURATION>``
Specifies the duration, in seconds, of the inconsistent snapshots written to the backup in continuous mode. The default is 864000 which is 10 days.
``--partitioned_log_experimental``
Specifies the backup uses the partitioned mutation logs generated by backup workers. Since FDB version 6.3, this option is experimental and requires using fast restore for restoring the database from the generated files. The default is to use non-partitioned mutation logs generated by backup agents.
``-w``
Wait for the backup to complete with behavior identical to that of the :ref:`wait command <backup-wait>`.

View File

@ -371,7 +371,7 @@ An additional important property, though technically not part of ACID, is also g
FoundationDB implements these properties using multiversion concurrency control (MVCC) for reads and optimistic concurrency for writes. As a result, neither reads nor writes are blocked by other readers or writers. Instead, conflicting transactions will fail at commit time and will usually be retried by the client.
In particular, the reads in a transaction take place from an instantaneous snapshot of the database. From the perspective of the transaction this snapshot is not modified by the writes of other, concurrent transactions. When the transaction is ready to be committed, the FoundationDB cluster checks that it does not conflict with any previously committed transaction (i.e. that no value read by a transaction has been modified by another transaction since the read occurred) and, if it does conflict, rejects it. Rejected conflicting transactions are usually retried by the client. Accepted transactions are written to disk on multiple cluster nodes and then reported accepted to the client.
In particular, the reads in a transaction take place from an instantaneous snapshot of the database. From the perspective of the transaction this snapshot is not modified by the writes of other, concurrent transactions. When the read-write transaction is ready to be committed (read-only transactions don't get committed and therefore never conflict), the FoundationDB cluster checks that it does not conflict with any previously committed transaction (i.e. that no value read by a transaction has been modified by another transaction since the read occurred) and, if it does conflict, rejects it. Rejected conflicting transactions are usually retried by the client. Accepted transactions are written to disk on multiple cluster nodes and then reported accepted to the client.
* For more background on transactions, see Wikipedia articles for `Database transaction <http://en.wikipedia.org/wiki/Database_transaction>`_, `Atomicity (database systems) <http://en.wikipedia.org/wiki/Atomicity_(database_systems)>`_, and `Concurrency Control <http://en.wikipedia.org/wiki/Concurrency_control>`_.
@ -823,3 +823,136 @@ Loading data is a common task in any database. Loading data in FoundationDB will
* Use multiple processes loading in parallel if a single one is CPU-bound.
Using these techniques, our cluster of 24 nodes and 48 SSDs loads about 3 billion (100 byte) key-value pairs per hour.
Implementation Details
======================
The following sections go into some of the gritty details of FoundationDB. Most users don't need to read or understand this in order to use FoundationDB efficiently.
How FoundationDB Detects Conflicts
----------------------------------
As written above, FoundationDB implements serializable transactions with external consistency. The underlying algorithm uses multi-version concurrency control. At commit time, each transaction is checked for read-write conflicts.
Conceptually this algorithm is quite simple. Each transaction will get a read version assigned when it issues the first read or before it tries to commit. All reads that happen during that transaction will be read as of that version. Writes will go into a local cache and will be sent to FoundationDB during commit time. The transaction can successfully commit if it is conflict free; it will then get a commit-version assigned. A transaction is conflict free if and only if there have been no writes to any key that was read by that transaction between the time the transaction started and the commit time. This is true if there was no transaction with a commit version larger than our read version but smaller than our commit version that wrote to any of the keys that we read.
This form of conflict detection, while simple, can often be confusing for people who are familiar with databases that check for write-write conflicts.
Some interesting properties of FoundationDB transactions are:
* FoundationDB transactions are optimistic: we never block on reads or writes (there are no locks), instead we abort transactions at commit time.
* Read-only transactions will never conflict and never cause conflicts with other transactions.
* Write-only transactions will never conflict but might cause future transactions to conflict.
* For read-write transactions: A read will never cause any other transaction to be aborted - but reading a key might result in the current transaction being aborted at commit time. A write will never cause a conflict in the current transaction but might cause conflicts in transactions that try to commit in the future.
* FoundationDB only uses the read conflict set and the write conflict set to resolve transactions. A user can read from and write to FoundationDB without adding entries to these sets. If not done carefully, this can cause non-serializable executions (see :ref:`Snapshot Reads <api-python-snapshot-reads>` and the :ref:`no-write-conflict-range option <api-python-no-write-conflict-range>` option).
How Versions are Generated and Assigned
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Versions are generated by the process that runs the *master* role. FoundationDB guarantees that no version will be generated twice and that the versions are monotonically increasing.
In order to assign read and commit versions to transactions, a client will never talk to the master. Instead it will get both from a proxy. Getting a read version is more complex than a commit version. Let's first look at commit versions:
1. The client will send a commit message to a proxy.
1. The proxy will put this commit message in a queue in order to build a batch.
1. In parallel, the proxy will ask for a new version from the master (note that this means that only proxies will ever ask for new versions - which scales much better as it puts less stress on the network).
1. The proxy will then resolve all transactions within that batch (discussed later) and assign the version it got from the master to *all* transactions within that batch. It will then write the transactions to the transaction log system to make it durable.
1. If the transaction succeeded, it will send back the version as commit version to the client. Otherwise it will send back an error.
As mentioned before, the algorithm to assign read versions is a bit more complex. At the start of a transaction, a client will ask a proxy server for a read version. The proxy will reply with the last committed version as of the time it received the request - this is important to guarantee external consistency. This is how this is achieved:
#. The client will send a GRV (get read version) request to a proxy.
#. The proxy will batch GRV requests for a short amount of time (it depends on load and configuration how big these batches will be).
#. The proxy will do the following steps in parallel:
* Ask all other proxies for their most recent committed version (the largest version they received from the master for which it successfully wrote the transactions to the transaction log system).
* Send a message to the transaction log system to verify that it is still writable. This is to prevent fetching read versions from a proxy that has been declared dead.
#. It will then take the largest committed version from all proxies (including its own) and send it back to the clients.
Checking whether the log-system is still writeable can be especially expensive if a cluster runs in a multi-region configuration. If a user is willing to sacrifice strict serializability they can use :ref:`option-causal-read-risky <api-python-option-set-causal-read-risky>`.
Conflict Detection
~~~~~~~~~~~~~~~~~~
This section will only explain conceptually how transactions are resolved in FoundationDB. The implementation will use multiple servers running the *Resolver* role and the keyspace will be sharded across them. It will also only allow resolving transactions whose read versions are less than 5 million versions older than their commit version (around 5 seconds).
A resolver will keep a map in memory which stores the written keys of each commit version. A simplified resolver state could look like this:
======= =======
Version Keys
======= =======
1000 a, b
1200 f, q, c
1210 a
1340 t, u, x
======= =======
Now let's assume we have a transaction with read version *1200* and the assigned commit version will be something larger than 1340 - let's say it is *1450*. In that transaction we read keys ``b, m, s`` and we want to write to ``a``. Note that we didn't read ``a`` - so we will issue a blind write. The resolver will check whether any of the read keys (``b, m, or s``) appears in any line between version *1200* and the most recent version, *1450*. The last write to ``b`` was at version 1000 which was before the read version. This means that the transaction read the most recent value. We don't know about any recent writes to the other keys. Therefore the resolver will decide that this transaction does *NOT* conflict and it can be committed. It will then add this new write set to its internal state so that it can resolve future transactions. The new state will look like this:
======= =======
Version Keys
======= =======
1000 a, b
1200 f, q, c
1210 a
1340 t, u, x
1450 a
======= =======
Note that the resolver didn't use the write set at all in order to make a decision whether the transaction can commit or not. This means that blind writes (writes to keys without reading them first) will never cause a conflict. But since the resolver will then remember these writes, blind writes can cause future transactions to conflict.
Error Handling
--------------
When using FoundationDB we strongly recommend users to use the retry-loop. In Python the retry loop would look like this:
.. code-block:: python
tr = tr.create_transaction()
while True:
try:
# execute reads and writes on FDB using the tr object
tr.commit().wait()
break
except FDBError as e:
tr.on_error(e.code).wait()
This is also what the transaction decoration in Python does, if you pass a ``Database`` object to a decorated function. There are some interesting properties of this retry loop:
* We never create a new transaction within that loop. Instead ``tr.on_error`` will create a soft reset on the transaction.
* ``tr.on_error`` returns a future. This is because ``on_error`` will back off to make sure we don't overwhelm the cluster.
* If ``tr.on_error`` throws an error, we exit the retry loop.
If you use this retry loop, there are very few caveats. If you write your own and you are not careful, some things might behave differently than you would expect. The following sections will go over the most common errors you will see, the guarantees FoundationDB provides during failures, and common caveats. This retry loop will take care of most of these errors, but it might still be beneficial to understand those.
Errors where we know the State of the Transaction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The most common errors you will see are errors where we know that the transaction failed to commit. In this case, we're guaranteed that nothing that we attempted to write was written to the database. The most common error codes for this are:
* ``not_committed`` is thrown whenever there was a conflict. This will only be thrown by a ``commit``, read and write operations won't generate this error.
* ``transaction_too_old`` is thrown if your transaction runs for more than five seconds. If you see this error often, you should try to make your transactions shorter.
* ``future_version`` is one of the slightly more complex errors. There are a couple ways this error could be generated: if you set the read version of your transaction manually to something larger than exists or if the storage servers are falling behind. The second case should be more common. This is usually caused by a write load that is too high for FoundationDB to handle or by faulty/slow disks.
The good thing about these errors is that retrying is simple: you know that the transaction didn't commit and therefore you can retry even without thinking much about weird corner cases.
The ``commit_unknown_result`` Error
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``commit_unknown_result`` can be thrown during a commit. This error is difficult to handle as you won't know whether your transaction was committed or not. There are two main reasons why you might see this error:
#. The client lost the connection to the proxy to which it sent the commit. So it never got a reply and therefore can't know whether the commit was successful or not.
#. There was a FoundationDB failure - for example a proxy failed during the commit. In that case there is no way for the client to know whether the transaction succeeded or not.
However, there is one guarantee FoundationDB gives to the caller: at the point of time where you receive this error, the transaction either committed or not and if it didn't commit, it will never commit in the future. Or: it is guaranteed that the transaction is not in-flight anymore. This is an important guarantee as it means that if your transaction is idempotent you can simply retry. For more explanations see developer-guide-unknown-results_.
Non-Retryable Errors
~~~~~~~~~~~~~~~~~~~~
The trickiest errors are non-retryable errors. ``Transaction.on_error`` will rethrow these. Some examples of non-retryable errors are:
#. ``transaction_timed_out``. If you set a timeout for a transaction, the transaction will throw this error as soon as that timeout occurs.
#. ``operation_cancelled``. This error is thrown if you call ``cancel()`` on any future returned by a transaction. So if this future is shared by multiple threads or coroutines, all other waiters will see this error.
If you see one of those errors, the best course of action is to fail the client.
At first glance this looks very similar to a ``commit_unknown_result``. However, these errors lack the one guarantee ``commit_unknown_result`` still gives to the user: if the commit has already been sent to the database, the transaction could get committed at a later point in time. This means that if you retry the transaction, your new transaction might race with the old transaction. While this technically doesn't violate any consistency guarantees, abandoning a transaction means that there are no causality guarantees.

View File

@ -38,7 +38,7 @@ The transaction logs make mutations durable to disk for fast commit latencies. T
Resolvers
=========
The resolvers are responsible for determining conflicts between transactions. A transaction conflicts if it reads a key that has been written between the transaction's read version and commit version. The resolver does this by holding the last 5 seconds of committed writes in memory, and comparing a new transaction's reads against this set of commits.
The resolvers are responsible for determining conflicts between transactions. A read-write transaction conflicts if it reads a key that has been written between the transaction's read version and commit version. The resolver does this by holding the last 5 seconds of committed writes in memory, and comparing a new transaction's reads against this set of commits.
Storage Servers
===============

View File

@ -63,9 +63,7 @@ using std::endl;
#endif
#endif
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#include "flow/SimpleOpt.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -170,8 +168,9 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = {
{ OPT_NOSTOPWHENDONE, "--no-stop-when-done",SO_NONE },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_USE_PARTITIONED_LOG, "-p", SO_NONE },
{ OPT_USE_PARTITIONED_LOG, "--partitioned_log", SO_NONE },
// Enable "-p" option after GA
// { OPT_USE_PARTITIONED_LOG, "-p", SO_NONE },
{ OPT_USE_PARTITIONED_LOG, "--partitioned_log_experimental", SO_NONE },
{ OPT_SNAPSHOTINTERVAL, "-s", SO_REQ_SEP },
{ OPT_SNAPSHOTINTERVAL, "--snapshot_interval", SO_REQ_SEP },
{ OPT_TAGNAME, "-t", SO_REQ_SEP },

View File

@ -48,9 +48,7 @@
#include "fdbcli/linenoise/linenoise.h"
#endif
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#include "flow/actorcompiler.h" // This must be the last #include.

View File

@ -505,8 +505,6 @@ Standalone<VectorRef<KeyRangeRef>> getApplyRanges(Version beginVersion, Version
Future<Void> eraseLogData(Reference<ReadYourWritesTransaction> tr, Key logUidValue, Key destUidValue, Optional<Version> endVersion = Optional<Version>(), bool checkBackupUid = false, Version backupUid = 0);
Key getApplyKey( Version version, Key backupUid );
std::pair<Version, uint32_t> decodeBKMutationLogKey(Key key);
Standalone<VectorRef<MutationRef>> decodeBackupLogValue(StringRef value);
void decodeBackupLogValue(Arena& arena, VectorRef<MutationRef>& result, int64_t& mutationSize, StringRef value, StringRef addPrefix = StringRef(), StringRef removePrefix = StringRef());
Future<Void> logError(Database cx, Key keyErrors, const std::string& message);
Future<Void> logError(Reference<ReadYourWritesTransaction> tr, Key keyErrors, const std::string& message);
Future<Void> checkVersion(Reference<ReadYourWritesTransaction> const& tr);

View File

@ -209,69 +209,6 @@ std::pair<Version, uint32_t> decodeBKMutationLogKey(Key key) {
bigEndian32(*(int32_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t) + sizeof(int64_t))));
}
// Decodes a single backup-log value blob into the mutations it contains.
// Wire layout, as parsed below:
//   [uint64 protocolVersion][uint32 totalBytes]
//   then per mutation: [uint32 type][uint32 len1][uint32 len2][param1 bytes][param2 bytes]
// Returns the decoded mutations, deep-copied into the result's own arena.
// Throws incompatible_protocol_version for protocol versions at or below
// 0x0FDB00A200090001, restore_missing_data if the declared payload extends
// past the end of 'value', and restore_corrupted_data if bytes remain after
// all mutations have been consumed.
Standalone<VectorRef<MutationRef>> decodeBackupLogValue(StringRef value) {
try {
uint64_t offset(0);
// First 8 bytes: the protocol version the data was written with.
uint64_t protocolVersion = 0;
memcpy(&protocolVersion, value.begin(), sizeof(uint64_t));
offset += sizeof(uint64_t);
if (protocolVersion <= 0x0FDB00A200090001){
TraceEvent(SevError, "DecodeBackupLogValue").detail("IncompatibleProtocolVersion", protocolVersion)
.detail("ValueSize", value.size()).detail("Value", value);
throw incompatible_protocol_version();
}
Standalone<VectorRef<MutationRef>> result;
// Next 4 bytes: total size of the mutation payload that follows.
uint32_t totalBytes = 0;
memcpy(&totalBytes, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t consumed = 0;
// Bail out before parsing if the header promises more data than 'value' holds.
if(totalBytes + offset > value.size())
throw restore_missing_data();
int originalOffset = offset;
// Walk the payload one mutation at a time until the declared byte count is used up.
while (consumed < totalBytes){
uint32_t type = 0;
memcpy(&type, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t len1 = 0;
memcpy(&len1, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
uint32_t len2 = 0;
memcpy(&len2, value.begin() + offset, sizeof(uint32_t));
offset += sizeof(uint32_t);
MutationRef logValue;
logValue.type = type;
// substr() yields views into 'value'; push_back_deep copies them into result's arena.
logValue.param1 = value.substr(offset, len1);
offset += len1;
logValue.param2 = value.substr(offset, len2);
offset += len2;
result.push_back_deep(result.arena(), logValue);
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
}
ASSERT(consumed == totalBytes);
// Any trailing bytes after the declared payload indicate a corrupt value.
if (value.size() != offset) {
TraceEvent(SevError, "BA_DecodeBackupLogValue").detail("UnexpectedExtraDataSize", value.size()).detail("Offset", offset).detail("TotalBytes", totalBytes).detail("Consumed", consumed).detail("OriginalOffset", originalOffset);
throw restore_corrupted_data();
}
return result;
}
catch (Error& e) {
// Missing data is an expected restore condition (warn); anything else is an error.
TraceEvent(e.code() == error_code_restore_missing_data ? SevWarn : SevError, "BA_DecodeBackupLogValue").error(e).GetLastError().detail("ValueSize", value.size()).detail("Value", value);
throw;
}
}
void decodeBackupLogValue(Arena& arena, VectorRef<MutationRef>& result, int& mutationSize, StringRef value, StringRef addPrefix, StringRef removePrefix, Version version, Reference<KeyRangeMap<Version>> key_version) {
try {
uint64_t offset(0);

View File

@ -27,7 +27,7 @@
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include "fdbrpc/IAsyncFile.h"
#include "rapidxml/rapidxml.hpp"
#include "fdbclient/rapidxml/rapidxml.hpp"
#include "flow/actorcompiler.h" // has to be last include
using namespace rapidxml;

View File

@ -71,6 +71,7 @@ set(FDBCLIENT_SRCS
Tuple.h
VersionedMap.actor.h
VersionedMap.h
VersionedMap.cpp
WriteMap.h
json_spirit/json_spirit_error_position.h
json_spirit/json_spirit_reader_template.h

View File

@ -108,6 +108,41 @@ namespace FdbClientLogEvents {
}
};
// Version V3 of EventGetVersion starting at 6.3
// Client-trace event recording a get-read-version (GRV) request: its latency, the
// priority it was issued at, and the read version that was returned. V3 adds the
// read version relative to V2.
struct EventGetVersion_V3 : public Event {
// 'type' is the GetReadVersionRequest priority flag; it is mapped onto the
// trace-log priority enum here. Any other value is a programming error.
EventGetVersion_V3(double ts, double lat, uint32_t type, Version version) : Event(GET_VERSION_LATENCY, ts), latency(lat), readVersion(version) {
if(type == GetReadVersionRequest::PRIORITY_DEFAULT) {
priorityType = PRIORITY_DEFAULT;
} else if (type == GetReadVersionRequest::PRIORITY_BATCH) {
priorityType = PRIORITY_BATCH;
} else if (type == GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE){
priorityType = PRIORITY_IMMEDIATE;
} else {
ASSERT(0);
}
}
EventGetVersion_V3() { }
// Serializes the base Event header first when writing; reads only the payload
// when deserializing (the header has already been consumed by the caller).
template <typename Ar> Ar& serialize(Ar &ar) {
if (!ar.isDeserializing)
return serializer(Event::serialize(ar), latency, priorityType, readVersion);
else
return serializer(ar, latency, priorityType, readVersion);
}
double latency;
// NOTE(review): 'TrasactionPriorityType' appears to be a misspelling of
// "TransactionPriorityType" in the enum's declaration elsewhere — confirm and fix there.
TrasactionPriorityType priorityType {PRIORITY_END};
Version readVersion;
// Emits the event as a TransactionTrace_GetVersion trace line.
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetVersion")
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("PriorityType", priorityType)
.detail("ReadVersion", readVersion);
}
};
struct EventGet : public Event {
EventGet(double ts, double lat, int size, const KeyRef &in_key) : Event(GET_LATENCY, ts), latency(lat), valueSize(size), key(in_key) { }
EventGet() { }
@ -213,6 +248,61 @@ namespace FdbClientLogEvents {
}
};
// Version V2 of EventCommit starting at 6.3
// Client-trace event recording a successful commit: latency, mutation count/bytes,
// the commit version, and the full transaction contents (conflict ranges and
// mutations). V2 adds the commit version relative to the original EventCommit.
struct EventCommit_V2 : public Event {
EventCommit_V2(double ts, double lat, int mut, int bytes, Version version, const CommitTransactionRequest &commit_req)
: Event(COMMIT_LATENCY, ts), latency(lat), numMutations(mut), commitBytes(bytes), commitVersion(version), req(commit_req) { }
EventCommit_V2() { }
// Serializes the base Event header first when writing; reads only the payload
// when deserializing. Only req.transaction and req.arena are serialized from req.
template <typename Ar> Ar& serialize(Ar &ar) {
if (!ar.isDeserializing)
return serializer(Event::serialize(ar), latency, numMutations, commitBytes, commitVersion, req.transaction, req.arena);
else
return serializer(ar, latency, numMutations, commitBytes, commitVersion, req.transaction, req.arena);
}
double latency;
int numMutations;
int commitBytes;
Version commitVersion;
CommitTransactionRequest req; // Only CommitTransactionRef and Arena object within CommitTransactionRequest is serialized
// Emits one trace line per read conflict range, write conflict range, and
// mutation, followed by a summary TransactionTrace_Commit line.
void logEvent(std::string id, int maxFieldLength) const {
for (auto &read_range : req.transaction.read_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_ReadConflictRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", read_range.begin)
.detail("End", read_range.end);
}
for (auto &write_range : req.transaction.write_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_WriteConflictRange")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", write_range.begin)
.detail("End", write_range.end);
}
for (auto &mutation : req.transaction.mutations) {
TraceEvent("TransactionTrace_Commit_Mutation")
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation.toString());
}
TraceEvent("TransactionTrace_Commit")
.detail("TransactionID", id)
.detail("CommitVersion", commitVersion)
.detail("Latency", latency)
.detail("NumMutations", numMutations)
.detail("CommitSizeBytes", commitBytes);
}
};
struct EventGetError : public Event {
EventGetError(double ts, int err_code, const KeyRef &in_key) : Event(ERROR_GET, ts), errCode(err_code), key(in_key) { }
EventGetError() { }

View File

@ -98,19 +98,16 @@ struct MutationRef {
}
std::string toString() const {
if (type < MutationRef::MAX_ATOMIC_OP) {
return format("code: %s param1: %s param2: %s", typeString[type], printable(param1).c_str(), printable(param2).c_str());
}
else {
return format("code: Invalid param1: %s param2: %s", printable(param1).c_str(), printable(param2).c_str());
}
return format("code: %s param1: %s param2: %s",
type < MutationRef::MAX_ATOMIC_OP ? typeString[(int)type] : "Unset", printable(param1).c_str(),
printable(param2).c_str());
}
bool isAtomicOp() const { return (ATOMIC_MASK & (1 << type)) != 0; }
template <class Ar>
void serialize( Ar& ar ) {
if (!ar.isDeserializing && type == ClearRange && equalsKeyAfter(param1, param2)) {
if (ar.isSerializing && type == ClearRange && equalsKeyAfter(param1, param2)) {
StringRef empty;
serializer(ar, type, param2, empty);
} else {
@ -139,6 +136,10 @@ static inline std::string getTypeString(MutationRef::Type type) {
return type < MutationRef::MAX_ATOMIC_OP ? typeString[(int)type] : "Unset";
}
// Returns the human-readable name for a raw mutation type byte, or "Unset" for
// any value outside the known atomic-op range.
static inline std::string getTypeString(uint8_t type) {
	if (type < MutationRef::MAX_ATOMIC_OP) {
		return typeString[type];
	}
	return "Unset";
}
// A 'single key mutation' is one which affects exactly the value of the key specified by its param1
static inline bool isSingleKeyMutation(MutationRef::Type type) {
return (MutationRef::SINGLE_KEY_MASK & (1<<type)) != 0;

View File

@ -231,6 +231,8 @@ public:
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaults;
std::shared_ptr<SpecialKeySpace> specialKeySpace;
std::shared_ptr<ConflictingKeysImpl> cKImpl;
std::shared_ptr<ReadConflictRangeImpl> rCRImpl;
std::shared_ptr<WriteConflictRangeImpl> wCRImpl;
};
#endif

View File

@ -268,6 +268,10 @@ struct KeyRangeRef {
return KeyRangeRef( begin.withPrefix(prefix), end.withPrefix(prefix) );
}
// Returns a copy of this range with 'prefix' prepended to both endpoints; the
// prefixed keys are allocated from 'arena'.
KeyRangeRef withPrefix(const StringRef& prefix, Arena& arena) const {
return KeyRangeRef(begin.withPrefix(prefix, arena), end.withPrefix(prefix, arena));
}
// Returns this range with 'prefix' stripped from both endpoints. Both keys must
// actually start with 'prefix'.
KeyRangeRef removePrefix( const StringRef& prefix ) const {
return KeyRangeRef( begin.removePrefix(prefix), end.removePrefix(prefix) );
}

View File

@ -3691,8 +3691,7 @@ public:
auto range = backupRanges[restoreIndex];
Standalone<StringRef> restoreTag(backupTag.toString() + "_" + std::to_string(restoreIndex));
// Register the request request in DB, which will be picked up by restore worker leader
struct RestoreRequest restoreRequest(restoreIndex, restoreTag, bcUrl, true, targetVersion, true,
range, Key(), Key(), lockDB,
struct RestoreRequest restoreRequest(restoreIndex, restoreTag, bcUrl, targetVersion, range,
deterministicRandom()->randomUniqueID());
tr->set(restoreRequestKeyFor(restoreRequest.index), restoreRequestValue(restoreRequest));
}
@ -4614,11 +4613,10 @@ public:
// Similar to atomicRestore, only used in simulation test.
// locks the database before discontinuing the backup and that same lock is then used while doing the restore.
// the tagname of the backup must be the same as the restore.
ACTOR static Future<Void> atomicParallelRestore(FileBackupAgent* backupAgent, Database cx, Key tagName,
static Future<Void> atomicParallelRestore(FileBackupAgent* backupAgent, Database cx, Key tagName,
Standalone<VectorRef<KeyRangeRef>> ranges, Key addPrefix,
Key removePrefix) {
Version ver = wait(atomicRestore(backupAgent, cx, tagName, ranges, addPrefix, removePrefix, true));
return Void();
return success(atomicRestore(backupAgent, cx, tagName, ranges, addPrefix, removePrefix, true));
}
};

View File

@ -0,0 +1,28 @@
/*
* IncludeVersions.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This is a simple header that isolates the inconsistent versions.h include
// directives required by the two different build systems.
// CMake builds generate versions.h under the fdbclient/ path; other non-Windows
// builds expect versions.h directly on the include path. Windows non-CMake builds
// include no versions header at all.
#if defined(CMAKE_BUILD)
# include "fdbclient/versions.h"
#elif !defined(WIN32)
# include "versions.h"
#endif

View File

@ -49,9 +49,7 @@
#include "flow/TLSConfig.actor.h"
#include "flow/UnitTest.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
@ -530,7 +528,9 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal),
specialKeySpace(std::make_shared<SpecialKeySpace>(normalKeys.begin, specialKeys.end)),
cKImpl(std::make_shared<ConflictingKeysImpl>(conflictingKeysRange)) {
cKImpl(std::make_shared<ConflictingKeysImpl>(conflictingKeysRange)),
rCRImpl(std::make_shared<ReadConflictRangeImpl>(readConflictRangeKeysRange)),
wCRImpl(std::make_shared<WriteConflictRangeImpl>(writeConflictRangeKeysRange)) {
dbId = deterministicRandom()->randomUniqueID();
connected = clientInfo->get().proxies.size() ? Void() : clientInfo->onChange();
@ -550,6 +550,8 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
monitorMasterProxiesInfoChange = monitorMasterProxiesChange(clientInfo, &masterProxiesChangeTrigger);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
specialKeySpace->registerKeyRange(conflictingKeysRange, cKImpl.get());
specialKeySpace->registerKeyRange(readConflictRangeKeysRange, rCRImpl.get());
specialKeySpace->registerKeyRange(writeConflictRangeKeysRange, wCRImpl.get());
}
DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc),
@ -2429,7 +2431,7 @@ void Transaction::atomicOp(const KeyRef& key, const ValueRef& operand, MutationR
t.mutations.push_back( req.arena, MutationRef( operationType, r.begin, v ) );
if( addConflictRange )
if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
t.write_conflict_ranges.push_back( req.arena, r );
TEST(true); //NativeAPI atomic operation
@ -2786,7 +2788,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
cx->commitLatencies.addSample(latency);
cx->latencies.addSample(now() - tr->startTime);
if (trLogInfo)
trLogInfo->addLog(FdbClientLogEvents::EventCommit(startTime, latency, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), req));
trLogInfo->addLog(FdbClientLogEvents::EventCommit_V2(startTime, latency, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), ci.version, req));
return Void();
} else {
// clear the RYW transaction which contains previous conflicting keys
@ -3224,7 +3226,7 @@ ACTOR Future<Version> extractReadVersion(DatabaseContext* cx, uint32_t flags, Re
double latency = now() - startTime;
cx->GRVLatencies.addSample(latency);
if (trLogInfo)
trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V2(startTime, latency, flags & GetReadVersionRequest::FLAG_PRIORITY_MASK));
trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3(startTime, latency, flags & GetReadVersionRequest::FLAG_PRIORITY_MASK, rep.version));
if (rep.version == 1 && rep.locked) {
throw proxy_memory_limit_exceeded();
}
@ -3453,6 +3455,51 @@ ACTOR Future< StorageMetrics > extractMetrics( Future<std::pair<Optional<Storage
return x.first.get();
}
// Queries every storage server owning part of 'keys' for its read-hot subranges and
// returns the union of the replies. Retries (after invalidating the location cache)
// on wrong_shard_server / all_alternatives_failed; rethrows any other error.
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getReadHotRanges(Database cx, KeyRange keys) {
loop {
int64_t shardLimit = 100; // Shard limit here does not really matter since this function is currently only used
// to find the read-hot sub ranges within a read-hot shard.
// NOTE(review): this lookup sits outside the try below, so a wrong_shard_server
// thrown here would escape the retry loop — confirm that is intended.
vector<pair<KeyRange, Reference<LocationInfo>>> locations =
wait(getKeyRangeLocations(cx, keys, shardLimit, false, &StorageServerInterface::getReadHotRanges,
TransactionInfo(TaskPriority::DataDistribution)));
try {
// TODO: how to handle this?
// This function is called whenever a shard becomes read-hot. But the shard may have been split across
// more than one storage server after becoming read-hot and before this function is called, i.e. a race
// condition. Should we abort and wait for the newly split shards to become hot again?
state int nLocs = locations.size();
// if (nLocs > 1) {
// TraceEvent("RHDDebug")
// .detail("NumSSIs", nLocs)
// .detail("KeysBegin", keys.begin.printable().c_str())
// .detail("KeysEnd", keys.end.printable().c_str());
// }
// Fan out one request per storage-server location and wait for all replies.
state vector<Future<ReadHotSubRangeReply>> fReplies(nLocs);
for (int i = 0; i < nLocs; i++) {
ReadHotSubRangeRequest req(locations[i].first);
fReplies[i] = loadBalance(locations[i].second, &StorageServerInterface::getReadHotRanges, req,
TaskPriority::DataDistribution);
}
wait(waitForAll(fReplies));
// Concatenate the per-server range lists into a single result.
Standalone<VectorRef<KeyRangeRef>> results;
for (int i = 0; i < nLocs; i++)
results.append(results.arena(), fReplies[i].get().readHotRanges.begin(),
fReplies[i].get().readHotRanges.size());
return results;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "GetReadHotSubRangesError").error(e);
throw;
}
// Stale shard mapping: drop the cached locations and retry after a delay.
cx->invalidateCache(keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
}
ACTOR Future< std::pair<Optional<StorageMetrics>, int> > waitStorageMetrics(
Database cx,
KeyRange keys,
@ -3521,6 +3568,10 @@ Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, i
}
}
// Thin wrapper delegating to the file-local ::getReadHotRanges actor for this
// transaction's database.
Future<Standalone<VectorRef<KeyRangeRef>>> Transaction::getReadHotRanges(KeyRange const& keys) {
return ::getReadHotRanges(cx, keys);
}
ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated )
{
loop {

View File

@ -251,6 +251,7 @@ public:
// Pass a negative value for `shardLimit` to indicate no limit on the shard number.
Future< StorageMetrics > getStorageMetrics( KeyRange const& keys, int shardLimit );
Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( KeyRange const& keys, StorageMetrics const& limit, StorageMetrics const& estimated );
Future<Standalone<VectorRef<KeyRangeRef>>> getReadHotRanges(KeyRange const& keys);
// If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key
void set( const KeyRef& key, const ValueRef& value, bool addConflictRange = true );
@ -300,6 +301,15 @@ public:
TransactionOptions options;
double startTime;
Reference<TransactionLogInfo> trLogInfo;
// Returns the futures for conflict ranges recorded in addition to those already in
// the CommitTransactionRef. NOTE(review): how extraConflictRanges is populated is
// not visible here — presumably by versionstamped operations; confirm at the writer.
const vector<Future<std::pair<Key, Key>>>& getExtraReadConflictRanges() const { return extraConflictRanges; }
// Snapshot of the transaction's accumulated read conflict ranges; the returned
// Standalone shares tr.arena rather than deep-copying the keys.
Standalone<VectorRef<KeyRangeRef>> readConflictRanges() const {
return Standalone<VectorRef<KeyRangeRef>>(tr.transaction.read_conflict_ranges, tr.arena);
}
// Snapshot of the accumulated write conflict ranges, likewise backed by tr.arena.
Standalone<VectorRef<KeyRangeRef>> writeConflictRanges() const {
return Standalone<VectorRef<KeyRangeRef>>(tr.transaction.write_conflict_ranges, tr.arena);
}
private:
Future<Version> getReadVersion(uint32_t flags);
void setPriority(uint32_t priorityFlag);

View File

@ -72,7 +72,7 @@ RYWIterator& RYWIterator::operator++() {
if (end_key_cmp <= 0) ++cache;
if (end_key_cmp >= 0) ++writes;
begin_key_cmp = -end_key_cmp;
end_key_cmp = cache.endKey().cmp(writes.endKey());
end_key_cmp = cache.endKey().compare(writes.endKey());
return *this;
}
@ -80,7 +80,7 @@ RYWIterator& RYWIterator::operator--() {
if (begin_key_cmp >= 0) --cache;
if (begin_key_cmp <= 0) --writes;
end_key_cmp = -begin_key_cmp;
begin_key_cmp = cache.beginKey().cmp(writes.beginKey());
begin_key_cmp = cache.beginKey().compare(writes.beginKey());
return *this;
}
@ -117,8 +117,8 @@ void RYWIterator::dbg() {
}
void RYWIterator::updateCmp() {
begin_key_cmp = cache.beginKey().cmp(writes.beginKey());
end_key_cmp = cache.endKey().cmp(writes.endKey());
begin_key_cmp = cache.beginKey().compare(writes.beginKey());
end_key_cmp = cache.endKey().compare(writes.endKey());
}
void testESR() {
@ -160,7 +160,7 @@ void testESR() {
/*
int c = ssrs[i] < ssrs[j] ? -1 : ssrs[i] == ssrs[j] ? 0 : 1;
int c2 = srs[i].cmp(srs[j]);
int c2 = srs[i].compare(srs[j]);
if ( c != (0<c2)-(c2<0) ) {
printf("Error: '%s' cmp '%s' = %d\n", printable(ssrs[i]).c_str(), printable(ssrs[j]).c_str(), c2);
return;
@ -413,8 +413,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
it.skip(allKeys.begin);
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -423,8 +423,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -434,8 +434,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -444,8 +444,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -455,8 +455,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("\xff\xff")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -486,8 +486,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
it.skip(allKeys.begin);
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -496,8 +496,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -507,8 +507,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp123")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp123")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -517,8 +517,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp123")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp123")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -528,8 +528,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("\xff\xff")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());

View File

@ -1040,6 +1040,18 @@ public:
wait( ryw->resetPromise.getFuture() || ready );
if( ryw->options.readYourWritesDisabled ) {
// Stash away conflict ranges to read after commit
ryw->nativeReadRanges = ryw->tr.readConflictRanges();
ryw->nativeWriteRanges = ryw->tr.writeConflictRanges();
for (const auto& f : ryw->tr.getExtraReadConflictRanges()) {
if (f.isReady() && f.get().first < f.get().second)
ryw->nativeReadRanges.push_back(
ryw->nativeReadRanges.arena(),
KeyRangeRef(f.get().first, f.get().second)
.withPrefix(readConflictRangeKeysRange.begin, ryw->nativeReadRanges.arena()));
}
if (ryw->resetPromise.isSet())
throw ryw->resetPromise.getFuture().getError();
wait( ryw->resetPromise.getFuture() || ryw->tr.commit() );
@ -1132,7 +1144,7 @@ public:
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx)
: cache(&arena), writes(&arena), tr(cx), retries(0), approximateSize(0), creationTime(now()), commitStarted(false),
options(tr), deferredError(cx->deferredError) {
options(tr), deferredError(cx->deferredError), versionStampFuture(tr.getVersionstamp()) {
std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(),
std::back_inserter(persistentOptions));
applyPersistentOptions();
@ -1290,7 +1302,7 @@ Future< Standalone<RangeResultRef> > ReadYourWritesTransaction::getRange(
}
// special key space are only allowed to query if both begin and end are in \xff\xff, \xff\xff\xff
if (specialKeys.contains(begin.getKey()) && specialKeys.contains(end.getKey()))
if (specialKeys.contains(begin.getKey()) && end.getKey() <= specialKeys.end)
return getDatabase()->specialKeySpace->getRange(Reference<ReadYourWritesTransaction>::addRef(this), begin, end,
limits, reverse);
@ -1545,6 +1557,104 @@ void ReadYourWritesTransaction::getWriteConflicts( KeyRangeMap<bool> *result ) {
}
}
// Materializes, as special-key-space key/value pairs, the portion of this
// transaction's read conflict ranges that intersects 'kr' (which must lie inside
// readConflictRangeKeysRange). With RYW enabled the member 'readConflicts' map is
// walked directly; with RYW disabled the ranges are reconstructed from the native
// transaction (tr.readConflictRanges), the ranges stashed at commit time
// (nativeReadRanges), and any ready extra conflict-range futures. Values are "1"
// for keys inside a conflict range and "0" otherwise.
Standalone<RangeResultRef> ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRangeRef kr) {
	ASSERT(readConflictRangeKeysRange.contains(kr));
	// Fixed: the ASSERT below was missing its terminating semicolon.
	ASSERT(!tr.options.checkWritesEnabled);
	Standalone<RangeResultRef> result;
	if (!options.readYourWritesDisabled) {
		kr = kr.removePrefix(readConflictRangeKeysRange.begin);
		auto iter = readConflicts.rangeContainingKeyBefore(kr.begin);
		if (iter->begin() == allKeys.begin && !iter->value()) {
			++iter; // Conventionally '' is missing from the result range if it's not part of a read conflict
		}
		for (; iter->begin() < kr.end; ++iter) {
			if (kr.begin <= iter->begin() && iter->begin() < kr.end) {
				result.push_back(result.arena(),
				                 KeyValueRef(iter->begin().withPrefix(readConflictRangeKeysRange.begin, result.arena()),
				                             iter->value() ? LiteralStringRef("1") : LiteralStringRef("0")));
			}
		}
	} else {
		// Renamed from 'readConflicts' to avoid shadowing the member of the same name.
		CoalescedKeyRefRangeMap<ValueRef> reconstructed{ LiteralStringRef("0"), specialKeys.end };
		for (const auto& range : tr.readConflictRanges())
			reconstructed.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
			                     LiteralStringRef("1"));
		for (const auto& range : nativeReadRanges)
			reconstructed.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
			                     LiteralStringRef("1"));
		// Only ready, non-empty extra ranges can be reported.
		for (const auto& f : tr.getExtraReadConflictRanges()) {
			if (f.isReady() && f.get().first < f.get().second)
				reconstructed.insert(KeyRangeRef(f.get().first, f.get().second)
				                         .withPrefix(readConflictRangeKeysRange.begin, result.arena()),
				                     LiteralStringRef("1"));
		}
		auto beginIter = reconstructed.rangeContaining(kr.begin);
		if (beginIter->begin() != kr.begin) ++beginIter;
		for (auto it = beginIter; it->begin() < kr.end; ++it) {
			result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
		}
	}
	return result;
}
// Materializes, as special-key-space key/value pairs, the portion of this
// transaction's write conflict ranges that intersects 'kr' (which must lie inside
// writeConflictRangeKeysRange). Ranges come from the RYW write map (RYW enabled)
// or from the native transaction plus stashed nativeWriteRanges (RYW disabled);
// versionstamped keys are added in either mode. Values are "1" inside a conflict
// range and "0" otherwise.
Standalone<RangeResultRef> ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRangeRef kr) {
ASSERT(writeConflictRangeKeysRange.contains(kr));
Standalone<RangeResultRef> result;
// Memory owned by result
CoalescedKeyRefRangeMap<ValueRef> writeConflicts{ LiteralStringRef("0"), specialKeys.end };
if (!options.readYourWritesDisabled) {
// Walk the RYW write map over the (prefix-stripped) query range, collecting
// every segment flagged as a conflict range.
KeyRangeRef strippedWriteRangePrefix = kr.removePrefix(writeConflictRangeKeysRange.begin);
WriteMap::iterator it(&writes);
it.skip(strippedWriteRangePrefix.begin);
if (it.beginKey() > allKeys.begin) --it;
for (; it.beginKey() < strippedWriteRangePrefix.end; ++it) {
if (it.is_conflict_range())
writeConflicts.insert(
KeyRangeRef(it.beginKey().toArena(result.arena()), it.endKey().toArena(result.arena()))
.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
} else {
// RYW disabled: reconstruct from the native transaction's ranges and the
// ranges stashed at commit time.
for (const auto& range : tr.writeConflictRanges())
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& range : nativeWriteRanges)
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
// Versionstamped keys: if the versionstamp is known, splice it into the key at the
// position encoded in the key's trailing 4 (little-endian) bytes and report the
// exact single-key range; otherwise report the possible key range for the stamp.
for (const auto& k : versionStampKeys) {
KeyRange range;
if (versionStampFuture.isValid() && versionStampFuture.isReady() && !versionStampFuture.isError()) {
const auto& stamp = versionStampFuture.get();
StringRef key(range.arena(), k); // Copy
ASSERT(k.size() >= 4);
int32_t pos;
memcpy(&pos, k.end() - sizeof(int32_t), sizeof(int32_t));
pos = littleEndian32(pos);
ASSERT(pos >= 0 && pos + stamp.size() <= key.size());
memcpy(mutateString(key) + pos, stamp.begin(), stamp.size());
*(mutateString(key) + key.size() - 4) = '\x00';
// singleKeyRange, but share begin and end's memory
range = KeyRangeRef(key.substr(0, key.size() - 4), key.substr(0, key.size() - 3));
} else {
range = getVersionstampKeyRange(result.arena(), k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
}
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
// Emit every coalesced segment whose begin falls inside the query range.
auto beginIter = writeConflicts.rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin) ++beginIter;
for (auto it = beginIter; it->begin() < kr.end; ++it) {
result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
}
return result;
}
void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& operand, uint32_t operationType ) {
bool addWriteConflict = !options.getAndResetWriteConflictDisabled();
@ -1590,8 +1700,11 @@ void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& ope
}
if(operationType == MutationRef::SetVersionstampedKey) {
TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled
// this does validation of the key and needs to be performed before the readYourWritesDisabled path
KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
versionStampKeys.push_back(arena, k);
addWriteConflict = false;
if(!options.readYourWritesDisabled) {
writeRangeToNativeTransaction(range);
writes.addUnmodifiedAndUnreadableRange(range);
@ -1910,6 +2023,9 @@ void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) BOOST_N
cache.arena = &arena;
writes.arena = &arena;
persistentOptions = std::move(r.persistentOptions);
nativeReadRanges = std::move(r.nativeReadRanges);
nativeWriteRanges = std::move(r.nativeWriteRanges);
versionStampKeys = std::move(r.versionStampKeys);
}
ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) BOOST_NOEXCEPT :
@ -1934,6 +2050,9 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&&
watchMap = std::move( r.watchMap );
r.resetPromise = Promise<Void>();
persistentOptions = std::move(r.persistentOptions);
nativeReadRanges = std::move(r.nativeReadRanges);
nativeWriteRanges = std::move(r.nativeWriteRanges);
versionStampKeys = std::move(r.versionStampKeys);
}
Future<Void> ReadYourWritesTransaction::onError(Error const& e) {
@ -1968,6 +2087,9 @@ void ReadYourWritesTransaction::resetRyow() {
cache = SnapshotCache(&arena);
writes = WriteMap(&arena);
readConflicts = CoalescedKeyRefRangeMap<bool>();
versionStampKeys = VectorRef<KeyRef>();
nativeReadRanges = Standalone<VectorRef<KeyRangeRef>>();
nativeWriteRanges = Standalone<VectorRef<KeyRangeRef>>();
watchMap.clear();
reading = AndFuture();
approximateSize = 0;
@ -1998,6 +2120,7 @@ void ReadYourWritesTransaction::reset() {
options.reset(tr);
transactionDebugInfo.clear();
tr.fullReset();
versionStampFuture = tr.getVersionstamp();
std::copy(tr.getDatabase().getTransactionDefaults().begin(), tr.getDatabase().getTransactionDefaults().end(), std::back_inserter(persistentOptions));
resetRyow();
}

View File

@ -121,6 +121,9 @@ public:
Future<Void> debug_onIdle() { return reading; }
// Wait for all reads that are currently pending to complete
Future<Void> pendingReads() { return resetPromise.getFuture() || reading; }
// Used by ThreadSafeTransaction for exceptions thrown in void methods
Error deferredError;
@ -135,6 +138,12 @@ public:
const TransactionInfo& getTransactionInfo() const {
return tr.info;
}
// Read from the special key space readConflictRangeKeysRange
Standalone<RangeResultRef> getReadConflictRangeIntersecting(KeyRangeRef kr);
// Read from the special key space writeConflictRangeKeysRange
Standalone<RangeResultRef> getWriteConflictRangeIntersecting(KeyRangeRef kr);
private:
friend class RYWImpl;
@ -152,6 +161,14 @@ private:
double creationTime;
bool commitStarted;
// For reading conflict ranges from the special key space
VectorRef<KeyRef> versionStampKeys;
Future<Standalone<StringRef>> versionStampFuture;
Standalone<VectorRef<KeyRangeRef>>
nativeReadRanges; // Used to read conflict ranges after committing an ryw disabled transaction
Standalone<VectorRef<KeyRangeRef>>
nativeWriteRanges; // Used to read conflict ranges after committing an ryw disabled transaction
Reference<TransactionDebugInfo> transactionDebugInfo;
void resetTimeout();

View File

@ -466,29 +466,26 @@ struct RestoreSendVersionedMutationsRequest : TimedRequest {
Version msgIndex; // Monitonically increasing index of mutation messages
bool isRangeFile;
MutationsVec mutations; // Mutations that may be at different versions parsed by one loader
LogMessageVersionVec mVersions; // (version, subversion) of each mutation in mutations field
VersionedMutationsVec versionedMutations; // Versioned mutations may be at different versions parsed by one loader
ReplyPromise<RestoreCommonReply> reply;
RestoreSendVersionedMutationsRequest() = default;
explicit RestoreSendVersionedMutationsRequest(int batchIndex, const RestoreAsset& asset, Version msgIndex,
bool isRangeFile, MutationsVec mutations,
LogMessageVersionVec mVersions)
: batchIndex(batchIndex), asset(asset), msgIndex(msgIndex), isRangeFile(isRangeFile), mutations(mutations),
mVersions(mVersions) {}
bool isRangeFile, VersionedMutationsVec versionedMutations)
: batchIndex(batchIndex), asset(asset), msgIndex(msgIndex), isRangeFile(isRangeFile),
versionedMutations(versionedMutations) {}
std::string toString() {
std::stringstream ss;
ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString() << " msgIndex:" << msgIndex
<< " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size()
<< " mVersions.size:" << mVersions.size();
<< " isRangeFile:" << isRangeFile << " versionedMutations.size:" << versionedMutations.size();
return ss.str();
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, batchIndex, asset, msgIndex, isRangeFile, mutations, mVersions, reply);
serializer(ar, batchIndex, asset, msgIndex, isRangeFile, versionedMutations, reply);
}
};
@ -543,42 +540,27 @@ struct RestoreRequest {
int index;
Key tagName;
Key url;
bool waitForComplete;
Version targetVersion;
bool verbose;
KeyRange range;
Key addPrefix;
Key removePrefix;
bool lockDB;
UID randomUid;
std::vector<int> restoreRequests;
// Key restoreTag;
ReplyPromise<struct RestoreCommonReply> reply;
RestoreRequest() = default;
explicit RestoreRequest(const int index, const Key& tagName, const Key& url, bool waitForComplete,
Version targetVersion, bool verbose, const KeyRange& range, const Key& addPrefix,
const Key& removePrefix, bool lockDB, const UID& randomUid)
: index(index), tagName(tagName), url(url), waitForComplete(waitForComplete), targetVersion(targetVersion),
verbose(verbose), range(range), addPrefix(addPrefix), removePrefix(removePrefix), lockDB(lockDB),
randomUid(randomUid) {}
explicit RestoreRequest(const int index, const Key& tagName, const Key& url, Version targetVersion,
const KeyRange& range, const UID& randomUid)
: index(index), tagName(tagName), url(url), targetVersion(targetVersion), range(range), randomUid(randomUid) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, index, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix,
lockDB, randomUid, restoreRequests, reply);
serializer(ar, index, tagName, url, targetVersion, range, randomUid, reply);
}
std::string toString() const {
std::stringstream ss;
ss << "index:" << std::to_string(index) << " tagName:" << tagName.contents().toString()
<< " url:" << url.contents().toString() << " waitForComplete:" << std::to_string(waitForComplete)
<< " targetVersion:" << std::to_string(targetVersion) << " verbose:" << std::to_string(verbose)
<< " range:" << range.toString() << " addPrefix:" << addPrefix.contents().toString()
<< " removePrefix:" << removePrefix.contents().toString() << " lockDB:" << std::to_string(lockDB)
<< " randomUid:" << randomUid.toString();
<< " url:" << url.contents().toString() << " targetVersion:" << std::to_string(targetVersion)
<< " range:" << range.toString() << " randomUid:" << randomUid.toString();
return ss.str();
}
};

View File

@ -71,7 +71,7 @@ struct ExtStringRef {
int size() const { return base.size() + extra_zero_bytes; }
int cmp(ExtStringRef const& rhs) const {
int compare(ExtStringRef const& rhs) const {
int cbl = std::min(base.size(), rhs.base.size());
if (cbl > 0) {
int c = memcmp(base.begin(), rhs.base.begin(), cbl);
@ -82,7 +82,7 @@ struct ExtStringRef {
if (base[i]) return 1;
for(int i=cbl; i<rhs.base.size(); i++)
if (rhs.base[i]) return -1;
return size() - rhs.size();
return ::compare(size(), rhs.size());
}
bool startsWith( const ExtStringRef& s ) const {
@ -114,13 +114,21 @@ private:
int extra_zero_bytes;
};
inline bool operator == (const ExtStringRef& lhs, const ExtStringRef& rhs ) {
return lhs.size() == rhs.size() && !lhs.cmp(rhs);
return lhs.size() == rhs.size() && !lhs.compare(rhs);
}
inline bool operator != (const ExtStringRef& lhs, const ExtStringRef& rhs ) { return !(lhs==rhs); }
inline bool operator < ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)<0; }
inline bool operator > ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)>0; }
inline bool operator <= ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)<=0; }
inline bool operator >= ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)>=0; }
// Relational operators for ExtStringRef, all defined via the three-way
// compare() member so that every operator agrees on a single ordering.
inline bool operator<(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) < 0;
}
inline bool operator>(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) > 0;
}
inline bool operator<=(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) <= 0;
}
inline bool operator>=(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) >= 0;
}
template<>
struct Traceable<ExtStringRef> : std::true_type {
@ -152,25 +160,10 @@ private:
{
values.push_back( arena, kv );
}
int compare(Entry const& r) const { return ::compare(beginKey, r.beginKey); }
bool operator < (Entry const& r) const {
return beginKey < r.beginKey;
}
bool operator < (StringRef const& r) const {
return beginKey < r;
}
bool operator <= (Entry const& r) const {
return beginKey <= r.beginKey;
}
bool operator <= (StringRef const& r) const {
return beginKey <= r;
}
bool operator == (Entry const& r) const {
return beginKey == r.beginKey;
}
bool operator == (StringRef const& r) const {
return beginKey == r;
}
int segments() const { return 2*(values.size()+1); }
};

View File

@ -243,6 +243,26 @@ Future<Optional<Value>> SpecialKeySpace::get(Reference<ReadYourWritesTransaction
return getActor(this, ryw, key);
}
ReadConflictRangeImpl::ReadConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
// Waits for all in-flight reads to complete before collecting the read
// conflict ranges, so that ranges added by still-pending reads are included
// in the result.
ACTOR static Future<Standalone<RangeResultRef>> getReadConflictRangeImpl(Reference<ReadYourWritesTransaction> ryw,
KeyRange kr) {
wait(ryw->pendingReads());
return ryw->getReadConflictRangeIntersecting(kr);
}
// Special-key-space read of the transaction's read conflict ranges
// intersecting kr.
Future<Standalone<RangeResultRef>> ReadConflictRangeImpl::getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const {
return getReadConflictRangeImpl(ryw, kr);
}
WriteConflictRangeImpl::WriteConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
// Special-key-space read of the transaction's write conflict ranges
// intersecting kr. NOTE(review): unlike the read path above, this does not
// wait for pendingReads() — presumably in-flight reads cannot add write
// conflict ranges; confirm that assumption.
Future<Standalone<RangeResultRef>> WriteConflictRangeImpl::getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const {
return ryw->getWriteConflictRangeIntersecting(kr);
}
ConflictingKeysImpl::ConflictingKeysImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
Future<Standalone<RangeResultRef>> ConflictingKeysImpl::getRange(Reference<ReadYourWritesTransaction> ryw,

View File

@ -95,5 +95,19 @@ public:
KeyRangeRef kr) const override;
};
// Special key space module exposing the current transaction's read conflict
// ranges as a read-only key range (backed by
// ReadYourWritesTransaction::getReadConflictRangeIntersecting).
class ReadConflictRangeImpl : public SpecialKeyRangeBaseImpl {
public:
explicit ReadConflictRangeImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override;
};
// Special key space module exposing the current transaction's write conflict
// ranges as a read-only key range (backed by
// ReadYourWritesTransaction::getWriteConflictRangeIntersecting).
class WriteConflictRangeImpl : public SpecialKeyRangeBaseImpl {
public:
explicit WriteConflictRangeImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override;
};
#include "flow/unactorcompiler.h"
#endif

View File

@ -71,6 +71,7 @@ struct StorageServerInterface {
RequestStream<ReplyPromise<KeyValueStoreType>> getKeyValueStoreType;
RequestStream<struct WatchValueRequest> watchValue;
RequestStream<struct ReadHotSubRangeRequest> getReadHotRanges;
explicit StorageServerInterface(UID uid) : uniqueID( uid ) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ) {}
@ -98,6 +99,7 @@ struct StorageServerInterface {
getQueuingMetrics = RequestStream<struct StorageQueuingMetricsRequest>( base.getAdjustedEndpoint(8) );
getKeyValueStoreType = RequestStream<ReplyPromise<KeyValueStoreType>>( base.getAdjustedEndpoint(9) );
watchValue = RequestStream<struct WatchValueRequest>( base.getAdjustedEndpoint(10) );
getReadHotRanges = RequestStream<struct ReadHotSubRangeRequest>( base.getAdjustedEndpoint(11) );
}
} else {
ASSERT(Ar::isDeserializing);
@ -125,6 +127,7 @@ struct StorageServerInterface {
streams.push_back(getQueuingMetrics.getReceiver());
streams.push_back(getKeyValueStoreType.getReceiver());
streams.push_back(watchValue.getReceiver());
streams.push_back(getReadHotRanges.getReceiver());
base = FlowTransport::transport().addEndpoints(streams);
}
};
@ -416,6 +419,30 @@ struct SplitMetricsRequest {
}
};
// Reply to ReadHotSubRangeRequest: the sub-ranges of the requested range that
// the storage server reports as read hot (see
// StorageServerInterface::getReadHotRanges).
struct ReadHotSubRangeReply {
constexpr static FileIdentifier file_identifier = 10424537;
Standalone<VectorRef<KeyRangeRef>> readHotRanges;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, readHotRanges);
}
};
// Request asking a storage server which sub-ranges of `keys` are read hot.
// Served via the StorageServerInterface::getReadHotRanges stream.
struct ReadHotSubRangeRequest {
constexpr static FileIdentifier file_identifier = 10259266;
// Backing memory for `keys`; listed last in serialize() per the usual
// convention for requests carrying Refs, so the deserialized KeyRangeRef
// can point into it.
Arena arena;
KeyRangeRef keys;
ReplyPromise<ReadHotSubRangeReply> reply;
ReadHotSubRangeRequest() {}
// Deep-copies `keys` into this request's arena.
// NOTE(review): single-argument constructor is implicit; consider explicit.
ReadHotSubRangeRequest(KeyRangeRef const& keys) : keys(arena, keys) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keys, reply, arena);
}
};
struct GetStorageMetricsReply {
constexpr static FileIdentifier file_identifier = 15491478;
StorageMetrics load;

View File

@ -138,11 +138,20 @@ void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& v
}
}
const KeyRangeRef conflictingKeysRange = KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"),
LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff"));
const KeyRangeRef conflictingKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"),
LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff\xff"));
const ValueRef conflictingKeysTrue = LiteralStringRef("1");
const ValueRef conflictingKeysFalse = LiteralStringRef("0");
const KeyRangeRef readConflictRangeKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/read_conflict_range/"),
LiteralStringRef("\xff\xff/transaction/read_conflict_range/\xff\xff"));
const KeyRangeRef writeConflictRangeKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"),
LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff"));
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
const KeyRangeRef storageCacheKeys( LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0") );
const KeyRef storageCachePrefix = storageCacheKeys.begin;

View File

@ -77,6 +77,8 @@ bool serverHasKey( ValueRef storedValue );
extern const KeyRangeRef conflictingKeysRange;
extern const ValueRef conflictingKeysTrue, conflictingKeysFalse;
extern const KeyRangeRef writeConflictRangeKeysRange;
extern const KeyRangeRef readConflictRangeKeysRange;
extern const KeyRef cacheKeysPrefix;

View File

@ -21,10 +21,7 @@
#include "fdbclient/ThreadSafeTransaction.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/DatabaseContext.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include <new>
#include "fdbclient/IncludeVersions.h"
// Users of ThreadSafeTransaction might share Reference<ThreadSafe...> between different threads as long as they don't call addRef (e.g. C API follows this).
// Therefore, it is unsafe to call (explicitly or implicitly) this->addRef in any of these functions.

View File

@ -0,0 +1,56 @@
#include "fdbclient/VersionedMap.h"
#include "flow/TreeBenchmark.h"
#include "flow/UnitTest.h"
// Adapter exposing VersionedMap<K, int> through the minimal set-like
// interface expected by treeBenchmark(): insert/find/erase, begin/end,
// lower_bound/upper_bound. All reads go through the latest version of the
// map (s.atLatest()); the mapped value is always 1 (only keys matter here).
template <typename K>
struct VersionedMapHarness {
	using map = VersionedMap<K, int>;
	using key_type = K;

	// Thin iterator wrapper so that dereferencing yields the key.
	struct result {
		typename map::iterator it;

		result(typename map::iterator it) : it(it) {}

		result& operator++() {
			++it;
			return *this;
		}

		const K& operator*() const { return it.key(); }

		// operator-> must return a pointer (or a type with its own
		// operator->). The previous version returned `const K&`, which does
		// not compile for any actual use of `->` (e.g. with K = int).
		const K* operator->() const { return &it.key(); }

		bool operator==(result const& k) const { return it == k.it; }
	};

	map s;

	void insert(K const& k) { s.insert(k, 1); }
	result find(K const& k) const { return result(s.atLatest().find(k)); }
	result not_found() const { return result(s.atLatest().end()); }
	result begin() const { return result(s.atLatest().begin()); }
	result end() const { return result(s.atLatest().end()); }
	result lower_bound(K const& k) const { return result(s.atLatest().lower_bound(k)); }
	result upper_bound(K const& k) const { return result(s.atLatest().upper_bound(k)); }
	void erase(K const& k) { s.erase(k); }
};
// Microbenchmark: integer keys through the VersionedMap harness, driven by
// the shared treeBenchmark() driver with randomInt as the key generator.
TEST_CASE("performance/map/int/VersionedMap") {
VersionedMapHarness<int> tree;
treeBenchmark(tree, *randomInt);
return Void();
}
// Microbenchmark: StringRef keys through the VersionedMap harness. Keys are
// generated by randomStr into a shared Arena so the StringRefs remain valid
// for the whole benchmark (the arena grows for the duration of the test).
TEST_CASE("performance/map/StringRef/VersionedMap") {
Arena arena;
VersionedMapHarness<StringRef> tree;
treeBenchmark(tree, [&arena]() { return randomStr(arena); });
return Void();
}
// No-op referenced from elsewhere so the linker keeps this translation unit
// (and the TEST_CASEs above) in the binary.
void forceLinkVersionedMapTests() {}

View File

@ -67,6 +67,62 @@ namespace PTreeImpl {
PTree(PTree const&);
};
// A fixed-capacity stack of ancestor pointers (a "finger") recording a path
// from the root of a PTree down to a node; replaces the previous
// heap-allocating std::vector<const PTree<T>*> representation used by the
// search/iteration routines below.
template <class T>
class PTreeFinger {
using PTreeFingerEntry = PTree<T> const*;
// This finger size supports trees with up to exp(96/4.3) ~= 4,964,514,749 entries.
// see also: check().
static constexpr size_t N = 96;
PTreeFingerEntry entries_[N];
size_t size_ = 0;
// Depth recorded by push_for_bound() at the deepest node where the probe
// compared "less"; trim_to_bound() truncates the finger back to it.
size_t bound_sz_ = 0;
public:
PTreeFinger() {}
// Explicit copy constructors ensure we copy the live values in entries_.
PTreeFinger(PTreeFinger const& f) { *this = f; }
// NOTE(review): inside this body `f` is an lvalue, so `*this = f` invokes
// the copy-assignment operator, not the move one — f.size_ is therefore
// not reset by this constructor, unlike move-assignment below. The entries
// are plain pointers so the copy is cheap; confirm the asymmetry is
// intentional.
PTreeFinger(PTreeFinger&& f) { *this = f; }
PTreeFinger& operator=(PTreeFinger const& f) {
size_ = f.size_;
bound_sz_ = f.bound_sz_;
// Only the live prefix of entries_ needs to be copied.
std::copy(f.entries_, f.entries_ + size_, entries_);
return *this;
}
PTreeFinger& operator=(PTreeFinger&& f) {
// "Move" still copies the (trivially copyable) pointer prefix, but
// empties the source finger.
size_ = std::exchange(f.size_, 0);
bound_sz_ = f.bound_sz_;
std::copy(f.entries_, f.entries_ + size_, entries_);
return *this;
}
size_t size() const { return size_; }
PTree<T> const* back() const { return entries_[size_ - 1]; }
void pop_back() { size_--; }
void clear() { size_ = 0; }
PTree<T> const* operator[](size_t i) const { return entries_[i]; }
void resize(size_t sz) {
size_ = sz;
ASSERT(size_ < N);
}
void push_back(PTree<T> const* node) {
entries_[size_++] = { node };
// NOTE(review): asserting strictly-less *after* the increment caps the
// usable capacity at N-1 entries — confirm that is the intent.
ASSERT(size_ < N);
}
// Push `node`, and if the probe compared less at this node remember the
// current depth as the bound used by trim_to_bound().
void push_for_bound(PTree<T> const* node, bool less) {
push_back(node);
bound_sz_ = less ? size_ : bound_sz_;
}
// remove the end of the finger so that the last entry is less than the probe
void trim_to_bound() { size_ = bound_sz_; }
};
template<class T>
static Reference<PTree<T>> update( Reference<PTree<T>> const& node, bool which, Reference<PTree<T>> const& ptr, Version at ) {
if (ptr.getPtr() == node->child(which, at).getPtr()/* && node->replacedVersion <= at*/) {
@ -109,37 +165,40 @@ namespace PTreeImpl {
template<class T, class X>
bool contains(const Reference<PTree<T>>& p, Version at, const X& x) {
if (!p) return false;
bool less = x < p->data;
if (!less && !(p->data<x)) return true; // x == p->data
int cmp = compare(x, p->data);
bool less = cmp < 0;
if (cmp == 0) return true;
return contains(p->child(!less, at), at, x);
}
template<class T, class X>
void lower_bound(const Reference<PTree<T>>& p, Version at, const X& x, std::vector<const PTree<T>*>& f){
// TODO: Remove the number of invocations of operator<, and replace with something closer to memcmp.
// and same for upper_bound.
template <class T, class X>
void lower_bound(const Reference<PTree<T>>& p, Version at, const X& x, PTreeFinger<T>& f) {
if (!p) {
while (f.size() && !(x < f.back()->data))
f.pop_back();
f.trim_to_bound();
return;
}
f.push_back(p.getPtr());
bool less = x < p->data;
if (!less && !(p->data<x)) return; // x == p->data
int cmp = compare(x, p->data);
bool less = cmp < 0;
f.push_for_bound(p.getPtr(), less);
if (cmp == 0) return;
lower_bound(p->child(!less, at), at, x, f);
}
template<class T, class X>
void upper_bound(const Reference<PTree<T>>& p, Version at, const X& x, std::vector<const PTree<T>*>& f){
template <class T, class X>
void upper_bound(const Reference<PTree<T>>& p, Version at, const X& x, PTreeFinger<T>& f) {
if (!p) {
while (f.size() && !(x < f.back()->data))
f.pop_back();
f.trim_to_bound();
return;
}
f.push_back(p.getPtr());
upper_bound(p->child(!(x < p->data), at), at, x, f);
bool less = x < p->data;
f.push_for_bound(p.getPtr(), less);
upper_bound(p->child(!less, at), at, x, f);
}
template<class T, bool forward>
void move(Version at, std::vector<const PTree<T>*>& f){
template <class T, bool forward>
void move(Version at, PTreeFinger<T>& f) {
ASSERT(f.size());
const PTree<T> *n;
n = f.back();
@ -157,8 +216,8 @@ namespace PTreeImpl {
}
}
template<class T, bool forward>
int halfMove(Version at, std::vector<const PTree<T>*>& f) {
template <class T, bool forward>
int halfMove(Version at, PTreeFinger<T>& f) {
// Post: f[:return_value] is the finger that would have been returned by move<forward>(at,f), and f[:original_length_of_f] is unmodified
ASSERT(f.size());
const PTree<T> *n;
@ -180,28 +239,28 @@ namespace PTreeImpl {
}
}
template<class T>
void next(Version at, std::vector<const PTree<T>*>& f){
template <class T>
void next(Version at, PTreeFinger<T>& f) {
move<T,true>(at, f);
}
template<class T>
void previous(Version at, std::vector<const PTree<T>*>& f){
template <class T>
void previous(Version at, PTreeFinger<T>& f) {
move<T,false>(at, f);
}
template<class T>
int halfNext(Version at, std::vector<const PTree<T>*>& f){
template <class T>
int halfNext(Version at, PTreeFinger<T>& f) {
return halfMove<T,true>(at, f);
}
template<class T>
int halfPrevious(Version at, std::vector<const PTree<T>*>& f){
template <class T>
int halfPrevious(Version at, PTreeFinger<T>& f) {
return halfMove<T,false>(at, f);
}
template<class T>
T get(std::vector<const PTree<T>*>& f){
template <class T>
T get(PTreeFinger<T>& f) {
ASSERT(f.size());
return f.back()->data;
}
@ -235,20 +294,20 @@ namespace PTreeImpl {
return lastNode(p->right(at), at);
}
template<class T, bool last>
void firstOrLastFinger(const Reference<PTree<T>>& p, Version at, std::vector<const PTree<T>*>& f) {
template <class T, bool last>
void firstOrLastFinger(const Reference<PTree<T>>& p, Version at, PTreeFinger<T>& f) {
if (!p) return;
f.push_back(p.getPtr());
firstOrLastFinger<T, last>(p->child(last, at), at, f);
}
template<class T>
void first(const Reference<PTree<T>>& p, Version at, std::vector<const PTree<T>*>& f) {
template <class T>
void first(const Reference<PTree<T>>& p, Version at, PTreeFinger<T>& f) {
return firstOrLastFinger<T, false>(p, at, f);
}
template<class T>
void last(const Reference<PTree<T>>& p, Version at, std::vector<const PTree<T>*>& f) {
template <class T>
void last(const Reference<PTree<T>>& p, Version at, PTreeFinger<T>& f) {
return firstOrLastFinger<T, true>(p, at, f);
}
@ -272,24 +331,27 @@ namespace PTreeImpl {
template<class T, class X>
void remove(Reference<PTree<T>>& p, Version at, const X& x) {
if (!p) ASSERT(false); // attempt to remove item not present in PTree
if (x < p->data) {
int cmp = compare(x, p->data);
if (cmp < 0) {
Reference<PTree<T>> child = p->child(0, at);
remove(child, at, x);
p = update(p, 0, child, at);
} else if (p->data < x) {
} else if (cmp > 0) {
Reference<PTree<T>> child = p->child(1, at);
remove(child, at, x);
p = update(p, 1, child, at);
} else
} else {
removeRoot(p, at);
}
}
template<class T, class X>
void remove(Reference<PTree<T>>& p, Version at, const X& begin, const X& end) {
if (!p) return;
int beginDir, endDir;
if (begin < p->data) beginDir = -1;
else if (p->data < begin) beginDir = +1;
int beginCmp = compare(begin, p->data);
if (beginCmp < 0) beginDir = -1;
else if (beginCmp > 0) beginDir = +1;
else beginDir = 0;
if (!(p->data < end)) endDir = -1;
else endDir = +1;
@ -364,7 +426,9 @@ namespace PTreeImpl {
if (!right) return left;
Reference<PTree<T>> r = Reference<PTree<T>>(new PTree<T>(lastNode(left, at)->data, at));
if (EXPENSIVE_VALIDATION) {
ASSERT( r->data < firstNode(right, at)->data);
}
Reference<PTree<T>> a = left;
remove(a, at, r->data);
@ -513,6 +577,7 @@ class VersionedMap : NonCopyable {
//private:
public:
typedef PTreeImpl::PTree<MapPair<K,std::pair<T,Version>>> PTreeT;
typedef PTreeImpl::PTreeFinger<MapPair<K, std::pair<T, Version>>> PTreeFingerT;
typedef Reference< PTreeT > Tree;
Version oldestVersion, latestVersion;
@ -589,7 +654,7 @@ public:
UNSTOPPABLE_ASSERT(r->first == newOldestVersion);
vector<Tree> toFree;
std::vector<Tree> toFree;
toFree.reserve(10000);
auto newBegin = r;
Tree *lastRoot = nullptr;
@ -679,7 +744,7 @@ public:
friend class VersionedMap<K,T>;
Tree root;
Version at;
vector< PTreeT const* > finger;
PTreeFingerT finger;
};
class ViewAtVersion {

View File

@ -107,18 +107,35 @@ struct WriteMapEntry {
WriteMapEntry( KeyRef const& key, OperationStack && stack, bool following_keys_cleared, bool following_keys_conflict, bool is_conflict, bool following_keys_unreadable, bool is_unreadable ) : key(key), stack(std::move(stack)), following_keys_cleared(following_keys_cleared), following_keys_conflict(following_keys_conflict), is_conflict(is_conflict), following_keys_unreadable(following_keys_unreadable), is_unreadable(is_unreadable) {}
int compare(StringRef const& r) const { return key.compare(r); }
int compare(ExtStringRef const& r) const { return -r.compare(key); }
std::string toString() const { return printable(key); }
};
// Three-way comparisons between raw keys and WriteMapEntry, ordering by the
// entry's key. The PTree search routines look up entries heterogeneously via
// a free compare(x, entry) overload, which these provide.
inline int compare(StringRef const& l, WriteMapEntry const& r) {
return l.compare(r.key);
}
inline int compare(ExtStringRef const& l, WriteMapEntry const& r) {
return l.compare(r.key);
}
inline bool operator < ( const WriteMapEntry& lhs, const WriteMapEntry& rhs ) { return lhs.key < rhs.key; }
inline bool operator < ( const WriteMapEntry& lhs, const StringRef& rhs ) { return lhs.key < rhs; }
inline bool operator < ( const StringRef& lhs, const WriteMapEntry& rhs ) { return lhs < rhs.key; }
inline bool operator < ( const WriteMapEntry& lhs, const ExtStringRef& rhs ) { return rhs.cmp(lhs.key)>0; }
inline bool operator < ( const ExtStringRef& lhs, const WriteMapEntry& rhs ) { return lhs.cmp(rhs.key)<0; }
// Heterogeneous orderings between WriteMapEntry and ExtStringRef, both
// expressed through ExtStringRef::compare against the entry's key so the
// ordering matches the compare() overloads above.
inline bool operator<(const WriteMapEntry& lhs, const ExtStringRef& rhs) {
return rhs.compare(lhs.key) > 0;
}
inline bool operator<(const ExtStringRef& lhs, const WriteMapEntry& rhs) {
return lhs.compare(rhs.key) < 0;
}
class WriteMap {
private:
typedef PTreeImpl::PTree< WriteMapEntry > PTreeT;
typedef PTreeImpl::PTree<WriteMapEntry> PTreeT;
typedef PTreeImpl::PTreeFinger<WriteMapEntry> PTreeFingerT;
typedef Reference<PTreeT> Tree;
public:
@ -374,7 +391,7 @@ public:
Tree tree;
Version at;
int beginLen, endLen;
vector< PTreeT const* > finger;
PTreeFingerT finger;
bool offset; // false-> the operation stack at entry(); true-> the following cleared or unmodified range
};

View File

@ -1,248 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.1" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(SolutionDir)versions.target" />
<PropertyGroup Condition="'$(Release)' != 'true' ">
<PreReleaseDecoration>-PRERELEASE</PreReleaseDecoration>
</PropertyGroup>
<PropertyGroup Condition="'$(Release)' == 'true' ">
<PreReleaseDecoration>
</PreReleaseDecoration>
<PreprocessorDefinitions>FDB_CLEAN_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|X64">
<Configuration>Debug</Configuration>
<Platform>X64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|X64">
<Configuration>Release</Configuration>
<Platform>X64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ActorCompiler Include="AsyncFileBlobStore.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="Atomic.h" />
<ClInclude Include="BackupContainer.h" />
<ActorCompiler Include="BackupAgent.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="BlobStore.h" />
<ClInclude Include="ClientLogEvents.h" />
<ClInclude Include="ClientWorkerInterface.h" />
<ClInclude Include="ClusterInterface.h" />
<ClInclude Include="CommitTransaction.h" />
<ClInclude Include="CoordinationInterface.h" />
<ClInclude Include="DatabaseConfiguration.h" />
<ClInclude Include="DatabaseContext.h" />
<ActorCompiler Include="EventTypes.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="FDBOptions.g.h" />
<ClInclude Include="FDBOptions.h" />
<ClInclude Include="FDBTypes.h" />
<ClInclude Include="HTTP.h" />
<ClInclude Include="KeyBackedTypes.h" />
<ClInclude Include="MetricLogger.h" />
<ClInclude Include="IClientApi.h" />
<ClInclude Include="JsonBuilder.h" />
<ClInclude Include="JSONDoc.h" />
<ClInclude Include="json_spirit\json_spirit_error_position.h" />
<ClInclude Include="json_spirit\json_spirit_reader_template.h" />
<ClInclude Include="json_spirit\json_spirit_value.h" />
<ClInclude Include="json_spirit\json_spirit_writer_options.h" />
<ClInclude Include="json_spirit\json_spirit_writer_template.h" />
<ClInclude Include="KeyRangeMap.h" />
<ClInclude Include="Knobs.h" />
<ClInclude Include="libb64\cdecode.h" />
<ClInclude Include="libb64\cencode.h" />
<ClInclude Include="libb64\decode.h" />
<ClInclude Include="libb64\encode.h" />
<ActorCompiler Include="ManagementAPI.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="MasterProxyInterface.h" />
<ClInclude Include="md5\md5.h" />
<ClInclude Include="MonitorLeader.h" />
<ClInclude Include="MultiVersionAssignmentVars.h" />
<ClInclude Include="MultiVersionTransaction.h" />
<ClInclude Include="MutationList.h" />
<ActorCompiler Include="NativeAPI.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="Notified.h" />
<ClInclude Include="ReadYourWrites.h" />
<ActorCompiler Include="RunTransaction.actor.h" />
<ClInclude Include="RYWIterator.h" />
<ClInclude Include="Schemas.h" />
<ClInclude Include="sha1\SHA1.h" />
<ClInclude Include="SnapshotCache.h" />
<ActorCompiler Include="SpecialKeySpace.actor.h" />
<ClInclude Include="Status.h" />
<ClInclude Include="StatusClient.h" />
<ClInclude Include="StorageServerInterface.h" />
<ClInclude Include="Subspace.h" />
<ClInclude Include="SystemData.h" />
<ActorCompiler Include="RestoreWorkerInterface.actor.h">
<EnableCompile>false</EnableCompile>
</ActorCompiler>
<ClInclude Include="TaskBucket.h" />
<ClInclude Include="ThreadSafeTransaction.h" />
<ClInclude Include="Tuple.h" />
<ActorCompiler Include="VersionedMap.actor.h">
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
<EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
</ActorCompiler>
<ClInclude Include="VersionedMap.h" />
<ClInclude Include="WriteMap.h" />
<ClInclude Include="zipf.h" />
</ItemGroup>
<ItemGroup>
<ActorCompiler Include="AsyncFileBlobStore.actor.cpp" />
<ClCompile Include="AutoPublicAddress.cpp" />
<ActorCompiler Include="BackupAgentBase.actor.cpp" />
<ActorCompiler Include="BackupContainer.actor.cpp" />
<ActorCompiler Include="BlobStore.actor.cpp" />
<ActorCompiler Include="DatabaseBackupAgent.actor.cpp" />
<ClCompile Include="DatabaseConfiguration.cpp" />
<ClCompile Include="FDBOptions.g.cpp" />
<ActorCompiler Include="FileBackupAgent.actor.cpp" />
<ActorCompiler Include="HTTP.actor.cpp" />
<ActorCompiler Include="KeyRangeMap.actor.cpp" />
<ClCompile Include="Knobs.cpp" />
<ClCompile Include="libb64\cdecode.c" />
<ClCompile Include="libb64\cencode.c" />
<ClCompile Include="md5\md5.c" />
<ActorCompiler Include="MetricLogger.actor.cpp" />
<ActorCompiler Include="MonitorLeader.actor.cpp" />
<ActorCompiler Include="ManagementAPI.actor.cpp" />
<ActorCompiler Include="MultiVersionTransaction.actor.cpp" />
<ActorCompiler Include="NativeAPI.actor.cpp" />
<ActorCompiler Include="ReadYourWrites.actor.cpp" />
<ClCompile Include="RYWIterator.cpp" />
<ActorCompiler Include="StatusClient.actor.cpp" />
<ClCompile Include="Schemas.cpp" />
<ClCompile Include="SystemData.cpp" />
<ClCompile Include="sha1\SHA1.cpp" />
<ActorCompiler Include="SpecialKeySpace.actor.cpp" />
<ActorCompiler Include="ThreadSafeTransaction.actor.cpp" />
<ActorCompiler Include="TaskBucket.actor.cpp" />
<ClCompile Include="Subspace.cpp" />
<ClCompile Include="Tuple.cpp" />
<ClCompile Include="JsonBuilder.cpp" />
<ClCompile Include="zipf.c" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGUID>{E2939DAA-238E-4970-96C4-4C57980F93BD}</ProjectGUID>
<TargetFrameworkVersion>v4.5.2</TargetFrameworkVersion>
<Keyword>Win32Proj</Keyword>
<RootNamespace>flow</RootNamespace>
</PropertyGroup>
<PropertyGroup>
<OutDir>$(SolutionDir)bin\$(Configuration)\</OutDir>
<IntDir>$(SystemDrive)\temp\msvcfdb\$(Platform)$(Configuration)\$(MSBuildProjectName)\</IntDir>
<BuildLogFile>$(IntDir)\$(MSBuildProjectName).log</BuildLogFile>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Project="$(LocalAppData)\Microsoft\VisualStudio\10.0\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(LocalAppData)\Microsoft\VisualStudio\10.0\Microsoft.Cpp.$(Platform).user.props')" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
<LinkIncremental>true</LinkIncremental>
<IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>$(IncludePath);../;C:\Program Files\boost_1_67_0</IncludePath>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<PreprocessorDefinitions>FDB_VT_VERSION="$(Version)$(PreReleaseDecoration)";FDB_VT_PACKAGE_NAME="$(PackageName)";%(PreprocessorDefinitions)</PreprocessorDefinitions>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<MinimalRebuild>false</MinimalRebuild>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<Optimization>Disabled</Optimization>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<PreprocessorDefinitions>TLS_DISABLED;WIN32;_WIN32_WINNT=0x0502;WINVER=0x0502;BOOST_ALL_NO_LIB;NTDDI_VERSION=0x05020000;_DEBUG;_HAS_ITERATOR_DEBUGGING=0;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<AdditionalOptions>/bigobj @../flow/no_intellisense.opt %(AdditionalOptions)</AdditionalOptions>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Advapi32.lib</AdditionalDependencies>
</Link>
<Lib>
<AdditionalDependencies>$(TargetDir)flow.lib;$(TargetDir)fdbrpc.lib;winmm.lib</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|X64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
<Optimization>Full</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>TLS_DISABLED;WIN32;_WIN32_WINNT=0x0502;WINVER=0x0502;BOOST_ALL_NO_LIB;NTDDI_VERSION=0x05020000;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<EnablePREfast>false</EnablePREfast>
<AdditionalOptions>/bigobj @../flow/no_intellisense.opt %(AdditionalOptions)</AdditionalOptions>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<MinimalRebuild>false</MinimalRebuild>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>false</EnableCOMDATFolding>
<OptimizeReferences>false</OptimizeReferences>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
<AdditionalDependencies>Advapi32.lib</AdditionalDependencies>
<AdditionalOptions>/LTCG %(AdditionalOptions)</AdditionalOptions>
</Link>
<Lib>
<AdditionalDependencies>$(TargetDir)flow.lib;$(TargetDir)fdbrpc.lib;winmm.lib</AdditionalDependencies>
</Lib>
</ItemDefinitionGroup>
<ImportGroup Label="ExtensionTargets">
<Import Project="..\flow\actorcompiler\ActorCompiler.targets" />
</ImportGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<Target Name="MyPreCompileSteps" AfterTargets="CLCompile">
<Exec Command="&quot;$(SolutionDir)bin\$(Configuration)\coveragetool.exe&quot; &quot;$(OutDir)coverage.$(TargetName).xml&quot; @(ActorCompiler -> '%(RelativeDir)%(Filename)%(Extension)', ' ') @(CLInclude -> '%(RelativeDir)%(Filename)%(Extension)', ' ') @(CLCompile -> '%(RelativeDir)%(Filename)%(Extension)', ' ')" />
</Target>
</Project>

View File

@ -8,7 +8,6 @@ if(UNIX AND NOT APPLE)
endif()
# FIXME: This include directory is an ugly hack. We probably want to fix this
# as soon as we get rid of the old build system
target_include_directories(fdbmonitor PRIVATE ${CMAKE_BINARY_DIR}/fdbclient)
target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)

View File

@ -77,9 +77,7 @@
#include "flow/SimpleOpt.h"
#include "SimpleIni.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#ifdef __linux__
typedef fd_set* fdb_fd_set;

View File

@ -22,7 +22,8 @@
// THIS FILE WAS GENERATED BY actorFuzz.py; DO NOT MODIFY IT DIRECTLY
#include "ActorFuzz.h"
#include "fdbrpc/ActorFuzz.h"
#include "flow/actorcompiler.h" // has to be last include
#ifndef WIN32

View File

@ -415,7 +415,7 @@ private:
return data.result.get();
}
static volatile int32_t want_poll;
static std::atomic<int32_t> want_poll;
ACTOR static void poll_eio() {
while (eio_poll() == -1)
@ -445,7 +445,7 @@ private:
};
#ifdef FILESYSTEM_IMPL
volatile int32_t AsyncFileEIO::want_poll = 0;
std::atomic<int32_t> AsyncFileEIO::want_poll = 0;
#endif
#include "flow/unactorcompiler.h"

View File

@ -23,7 +23,6 @@
#pragma once
#include "flow/flow.h"
#include "flow/IndexedSet.h"
#include "fdbrpc/FlowTransport.h" // Endpoint
#include <unordered_map>

View File

@ -80,6 +80,7 @@ class LambdaCallback : public CallbackType, public FastAllocated<LambdaCallback<
ErrFunc errFunc;
virtual void fire(T const& t) { CallbackType::remove(); func(t); delete this; }
virtual void fire(T && t) { CallbackType::remove(); func(std::move(t)); delete this; }
virtual void error(Error e) { CallbackType::remove(); errFunc(e); delete this; }
public:
@ -1384,21 +1385,23 @@ struct Tracker {
this->copied = other.copied + 1;
return *this;
}
~Tracker() = default;
ACTOR static Future<Void> listen(FutureStream<Tracker> stream) {
Tracker t = waitNext(stream);
ASSERT(!t.moved);
ASSERT(t.copied == 0);
Tracker movedTracker = waitNext(stream);
ASSERT(!movedTracker.moved);
ASSERT(movedTracker.copied == 0);
return Void();
}
};
TEST_CASE("/flow/flow/PromiseStream/move") {
state PromiseStream<Tracker> stream;
state Future<Void> listener;
{
// This tests the case when a callback is added before
// a movable value is sent
state Future<Void> listener = Tracker::listen(stream.getFuture());
listener = Tracker::listen(stream.getFuture());
stream.send(Tracker{});
wait(listener);
}
@ -1417,15 +1420,14 @@ TEST_CASE("/flow/flow/PromiseStream/move") {
stream.send(Tracker{});
stream.send(Tracker{});
{
Tracker t = waitNext(stream.getFuture());
ASSERT(!t.moved);
ASSERT(t.copied == 0);
}
choose {
when(Tracker t = waitNext(stream.getFuture())) {
ASSERT(!t.moved);
ASSERT(t.copied == 0);
state Tracker movedTracker = waitNext(stream.getFuture());
ASSERT(!movedTracker.moved);
ASSERT(movedTracker.copied == 0);
}
{
Tracker movedTracker = waitNext(stream.getFuture());
ASSERT(!movedTracker.moved);
ASSERT(movedTracker.copied == 0);
}
}
{
@ -1436,19 +1438,29 @@ TEST_CASE("/flow/flow/PromiseStream/move") {
stream.send(namedTracker1);
stream.send(namedTracker2);
{
Tracker t = waitNext(stream.getFuture());
ASSERT(!t.moved);
state Tracker copiedTracker = waitNext(stream.getFuture());
ASSERT(!copiedTracker.moved);
// must copy onto queue
ASSERT(t.copied == 1);
ASSERT(copiedTracker.copied == 1);
}
choose {
when(Tracker t = waitNext(stream.getFuture())) {
ASSERT(!t.moved);
{
Tracker copiedTracker = waitNext(stream.getFuture());
ASSERT(!copiedTracker.moved);
// must copy onto queue
ASSERT(t.copied == 1);
}
ASSERT(copiedTracker.copied == 1);
}
}
return Void();
}
TEST_CASE("/flow/flow/PromiseStream/move2") {
PromiseStream<Tracker> stream;
stream.send(Tracker{});
Tracker tracker = waitNext(stream.getFuture());
Tracker movedTracker = std::move(tracker);
ASSERT(tracker.moved);
ASSERT(!movedTracker.moved);
ASSERT(movedTracker.copied == 0);
return Void();
}

View File

@ -473,12 +473,13 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
.detail("ConnSet", (bool)conn);
ASSERT_WE_THINK(FlowTransport::transport().getLocalAddress() != self->destination);
state Future<Void> delayedHealthUpdateF;
state Optional<double> firstConnFailedTime = Optional<double>();
state int retryConnect = false;
loop {
try {
state Future<Void> delayedHealthUpdateF = Future<Void>();
delayedHealthUpdateF = Future<Void>();
if (!conn) { // Always, except for the first loop with an incoming connection
self->outgoingConnectionIdle = true;

View File

@ -366,19 +366,19 @@ tvdiff (struct timeval *tv1, struct timeval *tv2)
+ ((tv2->tv_usec - tv1->tv_usec) >> 10);
}
static unsigned int started, idle, wanted = 4;
static _Atomic(unsigned int) started, idle, wanted = 4;
static void (*want_poll_cb) (void);
static void (*done_poll_cb) (void);
static unsigned int max_poll_time; /* reslock */
static unsigned int max_poll_reqs; /* reslock */
static _Atomic(unsigned int) max_poll_time; /* reslock */
static _Atomic(unsigned int) max_poll_reqs; /* reslock */
static unsigned int nreqs; /* reqlock */
static unsigned int nready; /* reqlock */
static unsigned int npending; /* reqlock */
static unsigned int max_idle = 4; /* maximum number of threads that can idle indefinitely */
static unsigned int idle_timeout = 10; /* number of seconds after which an idle threads exit */
static _Atomic(unsigned int) nreqs; /* reqlock */
static _Atomic(unsigned int) nready; /* reqlock */
static _Atomic(unsigned int) npending; /* reqlock */
static _Atomic(unsigned int) max_idle = 4; /* maximum number of threads that can idle indefinitely */
static _Atomic(unsigned int) idle_timeout = 10; /* number of seconds after which an idle threads exit */
static xmutex_t wrklock;
static xmutex_t reslock;
@ -435,9 +435,7 @@ static unsigned int
etp_nreqs (void)
{
int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = nreqs;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -446,9 +444,7 @@ etp_nready (void)
{
unsigned int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = nready;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -458,9 +454,7 @@ etp_npending (void)
{
unsigned int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = npending;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -470,9 +464,7 @@ etp_nthreads (void)
{
unsigned int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = started;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -744,33 +736,25 @@ etp_submit (ETP_REQ *req)
static void ecb_cold
etp_set_max_poll_time (double nseconds)
{
if (WORDACCESS_UNSAFE) X_LOCK (reslock);
max_poll_time = nseconds * EIO_TICKS;
if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
}
static void ecb_cold
etp_set_max_poll_reqs (unsigned int maxreqs)
{
if (WORDACCESS_UNSAFE) X_LOCK (reslock);
max_poll_reqs = maxreqs;
if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
}
static void ecb_cold
etp_set_max_idle (unsigned int nthreads)
{
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
max_idle = nthreads;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
}
static void ecb_cold
etp_set_idle_timeout (unsigned int seconds)
{
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
idle_timeout = seconds;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
}
static void ecb_cold

View File

@ -1,18 +1,6 @@
#ifndef XTHREAD_H_
#define XTHREAD_H_
/* whether word reads are potentially non-atomic.
* this is conservative, likely most arches this runs
* on have atomic word read/writes.
*/
#ifndef WORDACCESS_UNSAFE
# if __i386 || __x86_64
# define WORDACCESS_UNSAFE 0
# else
# define WORDACCESS_UNSAFE 1
# endif
#endif
/////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32

View File

@ -77,6 +77,7 @@ set(FDBSERVER_SRCS
RestoreWorker.actor.cpp
Resolver.actor.cpp
ResolverInterface.h
ServerDBInfo.actor.h
ServerDBInfo.h
SimulatedCluster.actor.cpp
SimulatedCluster.h
@ -108,6 +109,7 @@ set(FDBSERVER_SRCS
workloads/AsyncFileRead.actor.cpp
workloads/AsyncFileWrite.actor.cpp
workloads/AtomicOps.actor.cpp
workloads/ReadHotDetection.actor.cpp
workloads/AtomicOpsApiCorrectness.actor.cpp
workloads/AtomicRestore.actor.cpp
workloads/AtomicSwitchover.actor.cpp

View File

@ -57,10 +57,13 @@ BandwidthStatus getBandwidthStatus( StorageMetrics const& metrics ) {
}
ReadBandwidthStatus getReadBandwidthStatus(StorageMetrics const& metrics) {
if (metrics.bytesReadPerKSecond > SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC)
return ReadBandwidthStatusHigh;
else
if (metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS ||
metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * metrics.bytes *
SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS) {
return ReadBandwidthStatusNormal;
} else {
return ReadBandwidthStatusHigh;
}
}
ACTOR Future<Void> updateMaxShardSize( Reference<AsyncVar<int64_t>> dbSizeEstimate, Reference<AsyncVar<Optional<int64_t>>> maxShardSize ) {
@ -102,6 +105,9 @@ struct DataDistributionTracker {
Promise<Void> readyToStart;
Reference<AsyncVar<bool>> anyZeroHealthyTeams;
// Read hot detection
PromiseStream<KeyRange> readHotShard;
DataDistributionTracker(Database cx, UID distributorId, Promise<Void> const& readyToStart, PromiseStream<RelocateShard> const& output, Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure, Reference<AsyncVar<bool>> anyZeroHealthyTeams)
: cx(cx), distributorId( distributorId ), dbSizeEstimate( new AsyncVar<int64_t>() ), systemSizeEstimate(0),
maxShardSize( new AsyncVar<Optional<int64_t>>() ),
@ -115,10 +121,8 @@ struct DataDistributionTracker {
}
};
void restartShardTrackers(
DataDistributionTracker* self,
KeyRangeRef keys,
Optional<ShardMetrics> startingSize = Optional<ShardMetrics>());
void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys,
Optional<ShardMetrics> startingMetrics = Optional<ShardMetrics>());
// Gets the permitted size and IO bounds for a shard. A shard that starts at allKeys.begin
// (i.e. '') will have a permitted size of 0, since the database can contain no data.
@ -160,11 +164,8 @@ int64_t getMaxShardSize( double dbSizeEstimate ) {
(int64_t)SERVER_KNOBS->MAX_SHARD_BYTES);
}
ACTOR Future<Void> trackShardBytes(
DataDistributionTracker* self,
KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardMetrics)
{
ACTOR Future<Void> trackShardMetrics(DataDistributionTracker* self, KeyRange keys,
Reference<AsyncVar<Optional<ShardMetrics>>> shardMetrics) {
state BandwidthStatus bandwidthStatus = shardMetrics->get().present() ? getBandwidthStatus( shardMetrics->get().get().metrics ) : BandwidthStatusNormal;
state double lastLowBandwidthStartTime = shardMetrics->get().present() ? shardMetrics->get().get().lastLowBandwidthStartTime : now();
state int shardCount = shardMetrics->get().present() ? shardMetrics->get().get().shardCount : 1;
@ -172,7 +173,7 @@ ACTOR Future<Void> trackShardBytes(
wait( delay( 0, TaskPriority::DataDistribution ) );
/*TraceEvent("TrackShardBytesStarting")
/*TraceEvent("TrackShardMetricsStarting")
.detail("TrackerID", trackerID)
.detail("Keys", keys)
.detail("TrackedBytesInitiallyPresent", shardMetrics->get().present())
@ -184,7 +185,8 @@ ACTOR Future<Void> trackShardBytes(
state ShardSizeBounds bounds;
if( shardMetrics->get().present() ) {
auto bytes = shardMetrics->get().get().metrics.bytes;
auto newReadBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
auto readBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
bounds.max.bytes = std::max( int64_t(bytes * 1.1), (int64_t)SERVER_KNOBS->MIN_SHARD_BYTES );
bounds.min.bytes = std::min( int64_t(bytes * 0.9), std::max(int64_t(bytes - (SERVER_KNOBS->MIN_SHARD_BYTES * 0.1)), (int64_t)0) );
bounds.permittedError.bytes = bytes * 0.1;
@ -204,24 +206,24 @@ ACTOR Future<Void> trackShardBytes(
ASSERT( false );
}
// handle read bandkwith status
if (newReadBandwidthStatus != readBandwidthStatus) {
TraceEvent("ReadBandwidthStatusChanged")
.detail("From", readBandwidthStatus == ReadBandwidthStatusNormal ? "Normal" : "High")
.detail("To", newReadBandwidthStatus == ReadBandwidthStatusNormal ? "Normal" : "High");
readBandwidthStatus = newReadBandwidthStatus;
}
if (newReadBandwidthStatus == ReadBandwidthStatusNormal) {
TEST(true);
bounds.max.bytesReadPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC *
(1.0 + SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC_JITTER);
if (readBandwidthStatus == ReadBandwidthStatusNormal) {
bounds.max.bytesReadPerKSecond =
std::max((int64_t)(SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * bytes *
SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS *
(1.0 + SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC_JITTER)),
SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS);
bounds.min.bytesReadPerKSecond = 0;
bounds.permittedError.bytesReadPerKSecond = bounds.min.bytesReadPerKSecond / 4;
} else if (newReadBandwidthStatus == ReadBandwidthStatusHigh) {
TEST(true);
} else if (readBandwidthStatus == ReadBandwidthStatusHigh) {
bounds.max.bytesReadPerKSecond = bounds.max.infinity;
bounds.min.bytesReadPerKSecond = SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC *
bounds.min.bytesReadPerKSecond = SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * bytes *
SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS *
(1.0 - SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC_JITTER);
bounds.permittedError.bytesReadPerKSecond = bounds.min.bytesReadPerKSecond / 4;
// TraceEvent("RHDTriggerReadHotLoggingForShard")
// .detail("ShardBegin", keys.begin.printable().c_str())
// .detail("ShardEnd", keys.end.printable().c_str());
self->readHotShard.send(keys);
} else {
ASSERT(false);
}
@ -290,6 +292,32 @@ ACTOR Future<Void> trackShardBytes(
}
}
ACTOR Future<Void> readHotDetector(DataDistributionTracker* self) {
try {
loop {
state KeyRange keys = waitNext(self->readHotShard.getFuture());
state Transaction tr(self->cx);
loop {
try {
Standalone<VectorRef<KeyRangeRef>> readHotRanges = wait(tr.getReadHotRanges(keys));
for (auto& keyRange : readHotRanges) {
TraceEvent("ReadHotRangeLog")
.detail("KeyRangeBegin", keyRange.begin)
.detail("KeyRangeEnd", keyRange.end);
}
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
self->output.sendError(e); // Propagate failure to dataDistributionTracker
throw e;
}
}
/*
ACTOR Future<Void> extrapolateShardBytes( Reference<AsyncVar<Optional<int64_t>>> inBytes, Reference<AsyncVar<Optional<int64_t>>> outBytes ) {
state std::deque< std::pair<double,int64_t> > past;
@ -690,7 +718,7 @@ ACTOR Future<Void> shardTracker(
}
}
void restartShardTrackers( DataDistributionTracker* self, KeyRangeRef keys, Optional<ShardMetrics> startingSize ) {
void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optional<ShardMetrics> startingMetrics) {
auto ranges = self->shards.getAffectedRangesAfterInsertion( keys, ShardTrackedData() );
for(int i=0; i<ranges.size(); i++) {
if( !ranges[i].value.trackShard.isValid() && ranges[i].begin != keys.begin ) {
@ -700,24 +728,24 @@ void restartShardTrackers( DataDistributionTracker* self, KeyRangeRef keys, Opti
continue;
}
Reference<AsyncVar<Optional<ShardMetrics>>> shardSize( new AsyncVar<Optional<ShardMetrics>>() );
Reference<AsyncVar<Optional<ShardMetrics>>> shardMetrics(new AsyncVar<Optional<ShardMetrics>>());
// For the case where the new tracker will take over at the boundaries of current shard(s)
// we can use the old size if it is available. This will be the case when merging shards.
if( startingSize.present() ) {
if (startingMetrics.present()) {
ASSERT( ranges.size() == 1 );
/*TraceEvent("ShardTrackerSizePreset", self->distributorId)
.detail("Keys", keys)
.detail("Size", startingSize.get().metrics.bytes)
.detail("Merges", startingSize.get().merges);*/
.detail("Size", startingMetrics.get().metrics.bytes)
.detail("Merges", startingMetrics.get().merges);*/
TEST( true ); // shardTracker started with trackedBytes already set
shardSize->set( startingSize );
shardMetrics->set(startingMetrics);
}
ShardTrackedData data;
data.stats = shardSize;
data.trackShard = shardTracker( self, ranges[i], shardSize );
data.trackBytes = trackShardBytes( self, ranges[i], shardSize );
data.stats = shardMetrics;
data.trackShard = shardTracker(self, ranges[i], shardMetrics);
data.trackBytes = trackShardMetrics(self, ranges[i], shardMetrics);
self->shards.insert( ranges[i], data );
}
}
@ -798,6 +826,7 @@ ACTOR Future<Void> dataDistributionTracker(
{
state DataDistributionTracker self(cx, distributorId, readyToStart, output, shardsAffectedByTeamFailure, anyZeroHealthyTeams);
state Future<Void> loggingTrigger = Void();
state Future<Void> readHotDetect = readHotDetector(&self);
try {
wait( trackInitialShards( &self, initData ) );
initData = Reference<InitialDataDistribution>();

View File

@ -7,9 +7,7 @@
#include "fdbserver/FDBExecHelper.actor.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#if defined(CMAKE_BUILD) || !defined(_WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -79,21 +77,25 @@ ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> par
return 0;
}
#else
ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> paramList, double maxWaitTime, bool isSync, double maxSimDelayTime)
pid_t fork_child(const std::string& path,
std::vector<char*>& paramList)
{
state std::string argsString;
for (auto const& elem : paramList) {
argsString += elem + ",";
pid_t pid = fork();
if (pid == -1) {
return -1;
}
TraceEvent("SpawnProcess").detail("Cmd", binPath).detail("Args", argsString);
if (pid == 0) {
execv(const_cast<char*>(path.c_str()), &paramList[0]);
_exit(EXIT_FAILURE);
}
return pid;
}
state int err = 0;
state double runTime = 0;
state boost::process::child c(binPath, boost::process::args(paramList),
boost::process::std_err > boost::process::null);
// for async calls in simulator, always delay by a deterinistic amount of time and do the call
// synchronously, otherwise the predictability of the simulator breaks
ACTOR Future<int> spawnProcess(std::string path, std::vector<std::string> args, double maxWaitTime, bool isSync, double maxSimDelayTime)
{
// for async calls in simulator, always delay by a deterministic amount of time and then
// do the call synchronously, otherwise the predictability of the simulator breaks
if (!isSync && g_network->isSimulated()) {
double snapDelay = std::max(maxSimDelayTime - 1, 0.0);
// add some randomness
@ -103,41 +105,70 @@ ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> par
wait(delay(snapDelay));
}
if (!isSync && !g_network->isSimulated()) {
while (c.running() && runTime <= maxWaitTime) {
wait(delay(0.1));
runTime += 0.1;
}
} else {
if (g_network->isSimulated()) {
// to keep the simulator deterministic, wait till the process exits,
// hence giving a large wait time
c.wait_for(std::chrono::hours(24));
ASSERT(!c.running());
} else {
int maxWaitTimeInt = static_cast<int>(maxWaitTime + 1.0);
c.wait_for(std::chrono::seconds(maxWaitTimeInt));
std::vector<char*> paramList;
for (int i = 0; i < args.size(); i++) {
paramList.push_back(const_cast<char*>(args[i].c_str()));
}
paramList.push_back(nullptr);
state std::string allArgs;
for (int i = 0; i < args.size(); i++) {
allArgs += args[i];
}
if (c.running()) {
TraceEvent(SevWarnAlways, "ChildTermination")
.detail("Cmd", binPath)
.detail("Args", argsString);
c.terminate();
err = -1;
if (!c.wait_for(std::chrono::seconds(1))) {
TraceEvent(SevWarnAlways, "SpawnProcessFailedToExit")
.detail("Cmd", binPath)
.detail("Args", argsString);
state pid_t pid = fork_child(path, paramList);
if (pid == -1) {
TraceEvent(SevWarnAlways, "SpawnProcess: Command failed to spawn")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
} else if (pid > 0) {
state int status = -1;
state double runTime = 0;
while (true) {
if (runTime > maxWaitTime) {
// timing out
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed, timeout")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
}
int err = waitpid(pid, &status, WNOHANG);
if (err < 0) {
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return -1;
} else if (err == 0) {
// child process has not completed yet
if (isSync || g_network->isSimulated()) {
// synchronously sleep
threadSleep(0.1);
} else {
err = c.exit_code();
// yield for other actors to run
wait(delay(0.1));
}
TraceEvent("SpawnProcess")
.detail("Cmd", binPath)
.detail("Error", err);
return err;
runTime += 0.1;
} else {
// child process completed
if (!(WIFEXITED(status) && WEXITSTATUS(status) == 0)) {
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
TraceEvent("SpawnProcess : Command status")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : 0);
return 0;
}
}
}
return -1;
}
#endif
@ -150,6 +181,7 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
// get bin path
auto snapBin = execArg->getBinaryPath();
std::vector<std::string> paramList;
paramList.push_back(snapBin.toString());
// get user passed arguments
auto listArgs = execArg->getBinaryArgs();
for (auto elem : listArgs) {
@ -176,6 +208,7 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
folderTo = folder + "-snap-" + uidStr.toString() + "-" + role;
std::vector<std::string> paramList;
std::string mkdirBin = "/bin/mkdir";
paramList.push_back(mkdirBin);
paramList.push_back(folderTo);
cmdErr = spawnProcess(mkdirBin, paramList, maxWaitTime, false /*isSync*/, maxSimDelayTime);
wait(success(cmdErr));
@ -183,6 +216,7 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
if (err == 0) {
std::vector<std::string> paramList;
std::string cpBin = "/bin/cp";
paramList.push_back(cpBin);
paramList.push_back("-a");
paramList.push_back(folderFrom);
paramList.push_back(folderTo);

View File

@ -138,10 +138,18 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( SHARD_BYTES_PER_SQRT_BYTES, 45 ); if( buggifySmallShards ) SHARD_BYTES_PER_SQRT_BYTES = 0;//Approximately 10000 bytes per shard
init( MAX_SHARD_BYTES, 500000000 );
init( KEY_SERVER_SHARD_BYTES, 500000000 );
bool buggifySmallReadBandwidth = randomize && BUGGIFY;
init( SHARD_MAX_BYTES_READ_PER_KSEC, 8LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000;
/* 8*1MB/sec * 1000sec/ksec
Shards with more than this read bandwidth will be considered as a read cache candidate
init( SHARD_MAX_READ_DENSITY_RATIO, 2.0);
/*
The bytesRead/byteSize radio. Will be declared as read hot when larger than this. 2.0 was chosen to avoid reporting table scan as read hot.
*/
init ( SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS, 166667 * 1000);
/*
The read bandwidth of a given shard needs to be larger than this value in order to be evaluated if it's read hot. The roughly 167KB per second is calculated as following:
- Heuristic data suggests that each storage process can do max 50K read operations per second
- Each read has a minimum cost of EMPTY_READ_PENALTY, which is 20 bytes
- Thus that gives a minimum 1MB per second
- But to be conservative, set that number to be 1/6 of 1MB, which is roughly 166,667 bytes per second
Shard with a read bandwidth smaller than this value will never be too busy to handle the reads.
*/
init( SHARD_MAX_BYTES_READ_PER_KSEC_JITTER, 0.1 );
bool buggifySmallBandwidthSplit = randomize && BUGGIFY;
@ -492,6 +500,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 );
init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 );
init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes
init( READ_HOT_SUB_RANGE_CHUNK_SIZE, 10000000); // 10MB
init( EMPTY_READ_PENALTY, 20 ); // 20 bytes
init( READ_SAMPLING_ENABLED, true ); if ( randomize && BUGGIFY ) READ_SAMPLING_ENABLED = false;// enable/disable read sampling
@ -577,7 +586,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_VB_PARALLELISM, 3 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_PARALLELISM = deterministicRandom()->random01() * 20 + 1; }
init( FASTRESTORE_VB_MONITOR_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_MONITOR_DELAY = deterministicRandom()->random01() * 20 + 1; }
init( FASTRESTORE_VB_LAUNCH_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_VB_LAUNCH_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ROLE_LOGGING_DELAY, 60 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; }
init( FASTRESTORE_ATOMICOP_WEIGHT, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; }
init( FASTRESTORE_APPLYING_PARALLELISM, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; }
@ -592,6 +601,10 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_APPLIER_FETCH_KEYS_SIZE, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLIER_FETCH_KEYS_SIZE = deterministicRandom()->random01() * 10240 + 1; }
init( FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES, 1.0 * 1024.0 * 1024.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 + 1; }
init( FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE, false ); if( randomize && BUGGIFY ) { FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE = deterministicRandom()->random01() < 0.5 ? true : false; }
init( FASTRESTORE_REQBATCH_PARALLEL, 50 ); if( randomize && BUGGIFY ) { FASTRESTORE_REQBATCH_PARALLEL = deterministicRandom()->random01() * 100 + 1; }
init( FASTRESTORE_REQBATCH_LOG, false ); if( randomize && BUGGIFY ) { FASTRESTORE_REQBATCH_LOG = deterministicRandom()->random01() < 0.2 ? true : false; }
init( FASTRESTORE_TXN_CLEAR_MAX, 1000 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_CLEAR_MAX = deterministicRandom()->random01() * 100 + 1; }
init( FASTRESTORE_TXN_RETRY_MAX, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_RETRY_MAX = deterministicRandom()->random01() * 100 + 1; }
// clang-format on

View File

@ -137,7 +137,8 @@ public:
int64_t SHARD_MAX_BYTES_PER_KSEC, // Shards with more than this bandwidth will be split immediately
SHARD_MIN_BYTES_PER_KSEC, // Shards with more than this bandwidth will not be merged
SHARD_SPLIT_BYTES_PER_KSEC; // When splitting a shard, it is split into pieces with less than this bandwidth
int64_t SHARD_MAX_BYTES_READ_PER_KSEC;
double SHARD_MAX_READ_DENSITY_RATIO;
int64_t SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS;
double SHARD_MAX_BYTES_READ_PER_KSEC_JITTER;
double STORAGE_METRIC_TIMEOUT;
double METRIC_DELAY;
@ -424,6 +425,7 @@ public:
int64_t IOPS_UNITS_PER_SAMPLE;
int64_t BANDWIDTH_UNITS_PER_SAMPLE;
int64_t BYTES_READ_UNITS_PER_SAMPLE;
int64_t READ_HOT_SUB_RANGE_CHUNK_SIZE;
int64_t EMPTY_READ_PENALTY;
bool READ_SAMPLING_ENABLED;
@ -530,6 +532,10 @@ public:
int64_t FASTRESTORE_APPLIER_FETCH_KEYS_SIZE; // number of keys to fetch in a txn on applier
int64_t FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES; // desired size of mutation message sent from loader to appliers
bool FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE; // parse each range file to get (range, version) it has?
int64_t FASTRESTORE_REQBATCH_PARALLEL; // number of requests to wait on for getBatchReplies()
bool FASTRESTORE_REQBATCH_LOG; // verbose log information for getReplyBatches
int FASTRESTORE_TXN_CLEAR_MAX; // threshold to start tracking each clear op in a txn
int FASTRESTORE_TXN_RETRY_MAX; // threshold to start output error on too many retries
ServerKnobs();
void initialize(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false);

View File

@ -522,7 +522,8 @@ ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerD
int64_t maxTLogQueueGate = 5e6, int64_t maxStorageServerQueueGate = 5e6, int64_t maxDataDistributionQueueSize = 0, int64_t maxPoppedVersionLag = 30e6 ) {
state Future<Void> reconfig = reconfigureAfter(cx, 100 + (deterministicRandom()->random01()*100), dbInfo, "QuietDatabase");
TraceEvent(("QuietDatabase" + phase + "Begin").c_str());
auto traceMessage = "QuietDatabase" + phase + "Begin";
TraceEvent(traceMessage.c_str());
//In a simulated environment, wait 5 seconds so that workers can move to their optimal locations
if(g_network->isSimulated())
@ -575,7 +576,8 @@ ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerD
numSuccesses = 0;
} else {
if(++numSuccesses == 3) {
TraceEvent(("QuietDatabase" + phase + "Done").c_str());
auto msg = "QuietDatabase" + phase + "Done";
TraceEvent(msg.c_str());
break;
} else {
wait(delay( g_network->isSimulated() ? 2.0 : 30.0));

View File

@ -47,8 +47,8 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
state Future<Void> exitRole = Never();
state Future<Void> updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
actors.add(traceProcessMetrics(self, "Applier"));
actors.add(traceRoleVersionBatchProgress(self, "Applier"));
actors.add(traceProcessMetrics(self, "RestoreApplier"));
actors.add(traceRoleVersionBatchProgress(self, "RestoreApplier"));
loop {
state std::string requestTypeStr = "[Init]";
@ -84,14 +84,14 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
when(wait(exitRole)) {
TraceEvent("FastRestore").detail("RestoreApplierCore", "ExitRole").detail("NodeID", self->id());
TraceEvent("RestoreApplierCoreExitRole", self->id());
break;
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestore")
.detail("RestoreLoaderError", e.what())
.detail("RequestType", requestTypeStr);
TraceEvent(SevWarn, "FastRestoreApplierError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
break;
}
}
@ -113,6 +113,7 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
state NotifiedVersion& curMsgIndex = batchData->processedFileState[req.asset];
TraceEvent(SevInfo, "FastRestoreApplierPhaseReceiveMutations", self->id())
.suppressFor(1.0)
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("RestoreAssetMesssageIndex", curMsgIndex.get())
@ -128,35 +129,36 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
state bool isDuplicated = true;
if (curMsgIndex.get() == req.msgIndex - 1) {
isDuplicated = false;
ASSERT(req.mutations.size() == req.mVersions.size());
for (int mIndex = 0; mIndex < req.mutations.size(); mIndex++) {
const MutationRef& mutation = req.mutations[mIndex];
const LogMessageVersion mutationVersion(req.mVersions[mIndex]);
for (int mIndex = 0; mIndex < req.versionedMutations.size(); mIndex++) {
const VersionedMutation& versionedMutation = req.versionedMutations[mIndex];
TraceEvent(SevFRMutationInfo, "FastRestoreApplierPhaseReceiveMutations", self->id())
.detail("RestoreAsset", req.asset.toString())
.detail("Version", mutationVersion.toString())
.detail("Version", versionedMutation.version.toString())
.detail("Index", mIndex)
.detail("MutationReceived", mutation.toString());
batchData->counters.receivedBytes += mutation.totalSize();
batchData->counters.receivedWeightedBytes += mutation.weightedTotalSize(); // atomicOp will be amplified
.detail("MutationReceived", versionedMutation.mutation.toString());
batchData->counters.receivedBytes += versionedMutation.mutation.totalSize();
batchData->counters.receivedWeightedBytes +=
versionedMutation.mutation.weightedTotalSize(); // atomicOp will be amplified
batchData->counters.receivedMutations += 1;
batchData->counters.receivedAtomicOps += isAtomicOp((MutationRef::Type)mutation.type) ? 1 : 0;
batchData->counters.receivedAtomicOps +=
isAtomicOp((MutationRef::Type)versionedMutation.mutation.type) ? 1 : 0;
// Sanity check
ASSERT_WE_THINK(req.asset.isInVersionRange(mutationVersion.version));
ASSERT_WE_THINK(req.asset.isInKeyRange(mutation));
ASSERT_WE_THINK(req.asset.isInVersionRange(versionedMutation.version.version));
ASSERT_WE_THINK(req.asset.isInKeyRange(versionedMutation.mutation));
// Note: Log and range mutations may be delivered out of order. Can we handle it?
batchData->addMutation(mutation, mutationVersion);
batchData->addMutation(versionedMutation.mutation, versionedMutation.version);
ASSERT(mutation.type != MutationRef::SetVersionstampedKey &&
mutation.type != MutationRef::SetVersionstampedValue);
ASSERT(versionedMutation.mutation.type != MutationRef::SetVersionstampedKey &&
versionedMutation.mutation.type != MutationRef::SetVersionstampedValue);
}
curMsgIndex.set(req.msgIndex);
}
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
TraceEvent(SevInfo, "FastRestoreApplierPhaseReceiveMutationsDone", self->id())
.suppressFor(1.0)
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("ProcessedMessageIndex", curMsgIndex.get())
@ -165,8 +167,16 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
}
// Clear all ranges in input ranges
ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRangeRef>> ranges, Database cx) {
ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRangeRef>> ranges, double delayTime,
Database cx, UID applierID, int batchIndex) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state int retries = 0;
state double numOps = 0;
wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
TraceEvent("FastRestoreApplierClearRangeMutationsStart", applierID)
.detail("BatchIndex", batchIndex)
.detail("Ranges", ranges.size())
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
@ -176,10 +186,25 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
debugFRMutation("FastRestoreApplierApplyClearRangeMutation", 0,
MutationRef(MutationRef::ClearRange, range.begin, range.end));
tr->clear(range);
++numOps;
if (numOps >= SERVER_KNOBS->FASTRESTORE_TXN_CLEAR_MAX) {
TraceEvent(SevWarnAlways, "FastRestoreApplierClearRangeMutationsTooManyClearsInTxn")
.suppressFor(1.0)
.detail("Clears", numOps)
.detail("Ranges", ranges.size())
.detail("Range", range.toString());
}
}
wait(tr->commit());
break;
} catch (Error& e) {
retries++;
if (retries > SERVER_KNOBS->FASTRESTORE_TXN_RETRY_MAX) {
TraceEvent(SevWarnAlways, "RestoreApplierApplyClearRangeMutationsStuck", applierID)
.detail("BatchIndex", batchIndex)
.detail("ClearRanges", ranges.size())
.error(e);
}
wait(tr->onError(e));
}
}
@ -188,13 +213,17 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
// Get keys in incompleteStagingKeys and precompute the stagingKey which is stored in batchData->stagingKeys
ACTOR static Future<Void> getAndComputeStagingKeys(
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys, Database cx, UID applierID) {
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys, double delayTime, Database cx,
UID applierID, int batchIndex) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state std::vector<Future<Optional<Value>>> fValues;
state int retries = 0;
wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysStart", applierID)
.detail("GetKeys", incompleteStagingKeys.size());
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
@ -207,11 +236,12 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
break;
} catch (Error& e) {
if (retries++ > 10) {
TraceEvent(SevError, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck")
TraceEvent(SevError, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck", applierID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.error(e);
break;
}
wait(tr->onError(e));
fValues.clear();
}
@ -220,31 +250,31 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
ASSERT(fValues.size() == incompleteStagingKeys.size());
int i = 0;
for (auto& key : incompleteStagingKeys) {
if (!fValues[i].get().present()) {
TraceEvent(SevDebug, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
if (!fValues[i].get().present()) { // Debug info to understand which key does not exist in DB
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB", applierID)
.detail("BatchIndex", batchIndex)
.detail("Key", key.first)
.detail("Reason", "Not found in DB")
.detail("PendingMutations", key.second->second.pendingMutations.size())
.detail("StagingKeyType", (int)key.second->second.type);
for (auto& vm : key.second->second.pendingMutations) {
TraceEvent(SevDebug, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
.detail("PendingMutationVersion", vm.first.toString())
.detail("PendingMutation", vm.second.toString());
}
key.second->second.precomputeResult("GetAndComputeStagingKeysNoBaseValueInDB");
i++;
continue;
key.second->second.precomputeResult("GetAndComputeStagingKeysNoBaseValueInDB", applierID, batchIndex);
} else {
// The key's version ideally should be the most recently committed version.
// But as long as it is > 1 and less than the start version of the version batch, it is the same result.
MutationRef m(MutationRef::SetValue, key.first, fValues[i].get().get());
key.second->second.add(m, LogMessageVersion(1));
key.second->second.precomputeResult("GetAndComputeStagingKeys");
i++;
key.second->second.precomputeResult("GetAndComputeStagingKeys", applierID, batchIndex);
}
i++;
}
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysDone", applierID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size());
return Void();
@ -253,43 +283,44 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData> batchData, UID applierID,
int64_t batchIndex, Database cx) {
// Apply range mutations (i.e., clearRange) to database cx
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResultStart", applierID)
.detail("BatchIndex", batchIndex)
.detail("Step", "Applying clear range mutations to DB")
.detail("ClearRanges", batchData->stagingKeyRanges.size());
state std::vector<Future<Void>> fClearRanges;
std::vector<Standalone<VectorRef<KeyRangeRef>>> clearBuf;
clearBuf.push_back(Standalone<VectorRef<KeyRangeRef>>());
Standalone<VectorRef<KeyRangeRef>> clearRanges = clearBuf.back();
Standalone<VectorRef<KeyRangeRef>> clearRanges;
double curTxnSize = 0;
double delayTime = 0;
for (auto& rangeMutation : batchData->stagingKeyRanges) {
KeyRangeRef range(rangeMutation.mutation.param1, rangeMutation.mutation.param2);
debugFRMutation("FastRestoreApplierPrecomputeMutationsResultClearRange", rangeMutation.version.version,
MutationRef(MutationRef::ClearRange, range.begin, range.end));
clearRanges.push_back(clearRanges.arena(), range);
clearRanges.push_back_deep(clearRanges.arena(), range);
curTxnSize += range.expectedSize();
if (curTxnSize >= SERVER_KNOBS->FASTRESTORE_TXN_BATCH_MAX_BYTES) {
fClearRanges.push_back(applyClearRangeMutations(clearRanges, cx));
clearBuf.push_back(Standalone<VectorRef<KeyRangeRef>>());
clearRanges = clearBuf.back();
fClearRanges.push_back(applyClearRangeMutations(clearRanges, delayTime, cx, applierID, batchIndex));
delayTime += 0.1;
clearRanges = Standalone<VectorRef<KeyRangeRef>>();
curTxnSize = 0;
}
}
if (curTxnSize > 0) {
fClearRanges.push_back(applyClearRangeMutations(clearRanges, cx));
fClearRanges.push_back(applyClearRangeMutations(clearRanges, delayTime, cx, applierID, batchIndex));
}
// Apply range mutations (i.e., clearRange) to stagingKeyRanges
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
.detail("BatchIndex", batchIndex)
.detail("Step", "Applying clear range mutations to staging keys")
.detail("ClearRanges", batchData->stagingKeyRanges.size());
.detail("ClearRanges", batchData->stagingKeyRanges.size())
.detail("FutureClearRanges", fClearRanges.size());
for (auto& rangeMutation : batchData->stagingKeyRanges) {
ASSERT(rangeMutation.mutation.param1 <= rangeMutation.mutation.param2);
std::map<Key, StagingKey>::iterator lb = batchData->stagingKeys.lower_bound(rangeMutation.mutation.param1);
std::map<Key, StagingKey>::iterator ub = batchData->stagingKeys.lower_bound(rangeMutation.mutation.param2);
while (lb != ub) {
if (lb->first >= rangeMutation.mutation.param2) {
TraceEvent(SevError, "FastRestoreApplerPhasePrecomputeMutationsResult_IncorrectUpperBound")
TraceEvent(SevError, "FastRestoreApplerPhasePrecomputeMutationsResultIncorrectUpperBound")
.detail("Key", lb->first)
.detail("ClearRangeUpperBound", rangeMutation.mutation.param2)
.detail("UsedUpperBound", ub->first);
@ -301,6 +332,10 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
lb++;
}
}
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
.detail("BatchIndex", batchIndex)
.detail("Step", "Wait on applying clear range mutations to DB")
.detail("FutureClearRanges", fClearRanges.size());
wait(waitForAll(fClearRanges));
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
@ -313,6 +348,7 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys;
std::map<Key, StagingKey>::iterator stagingKeyIter = batchData->stagingKeys.begin();
int numKeysInBatch = 0;
double delayTime = 0; // Start transactions at different time to avoid overwelming FDB.
for (; stagingKeyIter != batchData->stagingKeys.end(); stagingKeyIter++) {
if (!stagingKeyIter->second.hasBaseValue()) {
incompleteStagingKeys.emplace(stagingKeyIter->first, stagingKeyIter);
@ -320,13 +356,16 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
numKeysInBatch++;
}
if (numKeysInBatch == SERVER_KNOBS->FASTRESTORE_APPLIER_FETCH_KEYS_SIZE) {
fGetAndComputeKeys.push_back(getAndComputeStagingKeys(incompleteStagingKeys, cx, applierID));
fGetAndComputeKeys.push_back(
getAndComputeStagingKeys(incompleteStagingKeys, delayTime, cx, applierID, batchIndex));
delayTime += 0.1;
numKeysInBatch = 0;
incompleteStagingKeys.clear();
}
}
if (numKeysInBatch > 0) {
fGetAndComputeKeys.push_back(getAndComputeStagingKeys(incompleteStagingKeys, cx, applierID));
fGetAndComputeKeys.push_back(
getAndComputeStagingKeys(incompleteStagingKeys, delayTime, cx, applierID, batchIndex));
}
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
@ -337,7 +376,7 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
for (stagingKeyIter = batchData->stagingKeys.begin(); stagingKeyIter != batchData->stagingKeys.end();
stagingKeyIter++) {
if (stagingKeyIter->second.hasBaseValue()) {
stagingKeyIter->second.precomputeResult("HasBaseValue");
stagingKeyIter->second.precomputeResult("HasBaseValue", applierID, batchIndex);
}
}
@ -420,7 +459,7 @@ ACTOR static Future<Void> applyStagingKeys(Reference<ApplierBatchData> batchData
std::map<Key, StagingKey>::iterator cur = begin;
double txnSize = 0;
std::vector<Future<Void>> fBatches;
TraceEvent("FastRestoreApplerPhaseApplyStagingKeys", applierID)
TraceEvent("FastRestoreApplerPhaseApplyStagingKeysStart", applierID)
.detail("BatchIndex", batchIndex)
.detail("StagingKeys", batchData->stagingKeys.size());
while (cur != batchData->stagingKeys.end()) {
@ -447,7 +486,7 @@ ACTOR static Future<Void> applyStagingKeys(Reference<ApplierBatchData> batchData
// Write mutations to the destination DB
ACTOR Future<Void> writeMutationsToDB(UID applierID, int64_t batchIndex, Reference<ApplierBatchData> batchData,
Database cx) {
TraceEvent("FastRestoreApplerPhaseApplyTxn", applierID).detail("BatchIndex", batchIndex);
TraceEvent("FastRestoreApplerPhaseApplyTxnStart", applierID).detail("BatchIndex", batchIndex);
wait(precomputeMutationsResult(batchData, applierID, batchIndex, cx));
wait(applyStagingKeys(batchData, applierID, batchIndex, cx));
@ -458,23 +497,29 @@ ACTOR Future<Void> writeMutationsToDB(UID applierID, int64_t batchIndex, Referen
ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req, Reference<RestoreApplierData> self,
Database cx) {
TraceEvent("FastRestoreApplierPhaseHandleApplyToDBStart", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get());
// Ensure batch (i-1) is applied before batch i
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
state bool isDuplicated = true;
if (self->finishedBatch.get() == req.batchIndex - 1) {
Reference<ApplierBatchData> batchData = self->batch[req.batchIndex];
TraceEvent("FastRestoreApplierPhaseHandleApplyToDB", self->id())
TraceEvent("FastRestoreApplierPhaseHandleApplyToDBRunning", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("HasStarted", batchData->dbApplier.present())
.detail("WroteToDBDone", batchData->dbApplier.present() ? batchData->dbApplier.get().isReady() : 0)
.detail("PreviousVersionBatchState", batchData->vbState.get());
batchData->vbState = ApplierVersionBatchState::WRITE_TO_DB;
if (self->finishedBatch.get() == req.batchIndex - 1) {
ASSERT(batchData.isValid());
if (!batchData->dbApplier.present()) {
isDuplicated = false;
batchData->dbApplier = Never();
batchData->dbApplier = writeMutationsToDB(self->id(), req.batchIndex, batchData, cx);
batchData->vbState = ApplierVersionBatchState::WRITE_TO_DB;
}
ASSERT(batchData->dbApplier.present());
@ -485,14 +530,22 @@ ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req,
// Avoid setting finishedBatch when finishedBatch > req.batchIndex
if (self->finishedBatch.get() == req.batchIndex - 1) {
self->finishedBatch.set(req.batchIndex);
}
}
self->batch[req.batchIndex]->vbState = ApplierVersionBatchState::DONE;
// Free memory for the version batch
self->batch.erase(req.batchIndex);
if (self->delayedActors > 0) {
self->checkMemory.trigger();
}
}
}
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
TraceEvent("FastRestoreApplierPhaseHandleApplyToDBDone", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("IsDuplicated", isDuplicated);
return Void();
}
@ -521,7 +574,7 @@ Value applyAtomicOp(Optional<StringRef> existingValue, Value value, MutationRef:
else {
TraceEvent(SevError, "ApplyAtomicOpUnhandledType")
.detail("TypeCode", (int)type)
.detail("TypeName", typeString[type]);
.detail("TypeName", getTypeString(type));
ASSERT(false);
}
return Value();

View File

@ -88,7 +88,7 @@ struct StagingKey {
TraceEvent("StagingKeyAdd")
.detail("Version", version.toString())
.detail("NewVersion", newVersion.toString())
.detail("MType", typeString[(int)type])
.detail("MType", getTypeString(type))
.detail("Key", key)
.detail("Val", val)
.detail("NewMutation", m.toString());
@ -117,14 +117,14 @@ struct StagingKey {
// Precompute the final value of the key.
// TODO: Look at the last LogMessageVersion, if it set or clear, we can ignore the rest of versions.
void precomputeResult(const char* context) {
// TODO: Change typeString[(int)type] to a safe function that validate type range
TraceEvent(SevDebug, "FastRestoreApplierPrecomputeResult")
void precomputeResult(const char* context, UID applierID, int batchIndex) {
TraceEvent(SevDebug, "FastRestoreApplierPrecomputeResult", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("Version", version.toString())
.detail("Key", key)
.detail("Value", val)
.detail("MType", type < MutationRef::MAX_ATOMIC_OP ? typeString[(int)type] : "[Unset]")
.detail("MType", type < MutationRef::MAX_ATOMIC_OP ? getTypeString(type) : "[Unset]")
.detail("LargestPendingVersion",
(pendingMutations.empty() ? "[none]" : pendingMutations.rbegin()->first.toString()));
std::map<LogMessageVersion, Standalone<MutationRef>>::iterator lb = pendingMutations.lower_bound(version);
@ -137,9 +137,11 @@ struct StagingKey {
MutationRef m = lb->second;
if (m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) {
if (std::tie(type, key, val) != std::tie(m.type, m.param1, m.param2)) {
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnhandledSituation")
.detail("BufferedType", typeString[type])
.detail("PendingType", typeString[m.type])
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnhandledSituation", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("BufferedType", getTypeString(type))
.detail("PendingType", getTypeString(m.type))
.detail("BufferedVal", val.toString())
.detail("PendingVal", m.param2.toString());
}
@ -168,12 +170,16 @@ struct StagingKey {
type = MutationRef::SetValue; // Precomputed result should be set to DB.
} else if (mutation.type == MutationRef::SetValue || mutation.type == MutationRef::ClearRange) {
type = MutationRef::SetValue; // Precomputed result should be set to DB.
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnexpectedSet")
.detail("MutationType", typeString[mutation.type])
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnexpectedSet", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("MutationType", getTypeString(mutation.type))
.detail("Version", lb->first.toString());
} else {
TraceEvent(SevWarnAlways, "FastRestoreApplierPrecomputeResultSkipUnexpectedBackupMutation")
.detail("MutationType", typeString[mutation.type])
TraceEvent(SevWarnAlways, "FastRestoreApplierPrecomputeResultSkipUnexpectedBackupMutation", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("MutationType", getTypeString(mutation.type))
.detail("Version", lb->first.toString());
}
ASSERT(lb->first > version);
@ -219,7 +225,8 @@ public:
static const int INIT = 1;
static const int RECEIVE_MUTATIONS = 2;
static const int WRITE_TO_DB = 3;
static const int INVALID = 4;
static const int DONE = 4;
static const int INVALID = 5;
explicit ApplierVersionBatchState(int newState) {
vbState = newState;
@ -267,9 +274,9 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
explicit ApplierBatchData(UID nodeID, int batchIndex)
: counters(this, nodeID, batchIndex), applyStagingKeysBatchLock(SERVER_KNOBS->FASTRESTORE_APPLYING_PARALLELISM),
vbState(ApplierVersionBatchState::NOT_INIT) {
pollMetrics =
traceCounters("FastRestoreApplierMetrics", nodeID, SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
&counters.cc, nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex));
pollMetrics = traceCounters(format("FastRestoreApplierMetrics%d", batchIndex), nodeID,
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY, &counters.cc,
nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex));
TraceEvent("FastRestoreApplierMetricsCreated").detail("Node", nodeID);
}
~ApplierBatchData() = default;
@ -328,7 +335,7 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
isAtomicOp((MutationRef::Type)m->type))
continue;
else {
TraceEvent(SevError, "FastRestore").detail("UnknownMutationType", m->type);
TraceEvent(SevError, "FastRestoreApplier").detail("UnknownMutationType", m->type);
return false;
}
}

View File

@ -281,22 +281,28 @@ Future<Void> getBatchReplies(RequestStream<Request> Interface::*channel, std::ma
ongoingReplies.clear();
ongoingRepliesIndex.clear();
for (int i = 0; i < cmdReplies.size(); ++i) {
// TraceEvent(SevDebug, "FastRestoreGetBatchReplies")
// .detail("Requests", requests.size())
// .detail("OutstandingReplies", oustandingReplies)
// .detail("ReplyIndex", i)
// .detail("ReplyReady", cmdReplies[i].isReady())
// .detail("RequestNode", requests[i].first)
// .detail("Request", requests[i].second.toString());
if (SERVER_KNOBS->FASTRESTORE_REQBATCH_LOG) {
TraceEvent(SevInfo, "FastRestoreGetBatchReplies")
.suppressFor(1.0)
.detail("Requests", requests.size())
.detail("OutstandingReplies", oustandingReplies)
.detail("ReplyIndex", i)
.detail("ReplyIsReady", cmdReplies[i].isReady())
.detail("ReplyIsError", cmdReplies[i].isError())
.detail("RequestNode", requests[i].first)
.detail("Request", requests[i].second.toString());
}
if (!cmdReplies[i].isReady()) { // still wait for reply
ongoingReplies.push_back(cmdReplies[i]);
ongoingRepliesIndex.push_back(i);
}
}
ASSERT(ongoingReplies.size() == oustandingReplies);
if (ongoingReplies.empty()) {
break;
} else {
wait(waitForAll(ongoingReplies));
wait(quorum(ongoingReplies, std::min((int)SERVER_KNOBS->FASTRESTORE_REQBATCH_PARALLEL,
(int)ongoingReplies.size())));
}
// At least one reply is received; Calculate the reply duration
for (int j = 0; j < ongoingReplies.size(); ++j) {
@ -352,12 +358,14 @@ Future<Void> getBatchReplies(RequestStream<Request> Interface::*channel, std::ma
break;
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) break;
fprintf(stdout, "sendBatchRequests Error code:%d, error message:%s\n", e.code(), e.what());
// fprintf(stdout, "sendBatchRequests Error code:%d, error message:%s\n", e.code(), e.what());
TraceEvent(SevWarn, "FastRestoreSendBatchRequests").error(e);
for (auto& request : requests) {
TraceEvent(SevWarn, "FastRestore")
TraceEvent(SevWarn, "FastRestoreSendBatchRequests")
.detail("SendBatchRequests", requests.size())
.detail("RequestID", request.first)
.detail("Request", request.second.toString());
resetReply(request.second);
}
}
}

View File

@ -29,12 +29,11 @@
#include "flow/actorcompiler.h" // This must be the last #include.
// SerializedMutationListMap:
// Key is the signature/version of the mutation list, Value is the mutation list (or part of the mutation list)
typedef std::map<Standalone<StringRef>, Standalone<StringRef>> SerializedMutationListMap;
// SerializedMutationPartMap:
// Key has the same semantics as SerializedMutationListMap; Value is the part number of the splitted mutation list
typedef std::map<Standalone<StringRef>, uint32_t> SerializedMutationPartMap;
// SerializedMutationListMap: Buffered mutation lists from data blocks in log files
// Key is the signature/version of the mutation list; Value.first is the mutation list which may come from multiple
// data blocks of log file; Value.second is the largest part number of the mutation list, which is used to sanity check
// the data blocks for the same mutation list are concatenated in increasing order of part number.
typedef std::map<Standalone<StringRef>, std::pair<Standalone<StringRef>, uint32_t>> SerializedMutationListMap;
std::vector<UID> getApplierIDs(std::map<Key, UID>& rangeToApplier);
void splitMutation(std::map<Key, UID>* pRangeToApplier, MutationRef m, Arena& mvector_arena,
@ -54,7 +53,6 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
std::map<UID, RestoreApplierInterface>* pApplierInterfaces);
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
SerializedMutationListMap* mutationMap,
SerializedMutationPartMap* mutationPartMap,
Reference<IBackupContainer> bc, RestoreAsset asset);
ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsIter,
@ -69,7 +67,7 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
state Future<Void> exitRole = Never();
state Future<Void> updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
actors.add(traceProcessMetrics(self, "Loader"));
actors.add(traceProcessMetrics(self, "RestoreLoader"));
loop {
state std::string requestTypeStr = "[Init]";
@ -113,14 +111,12 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
when(wait(exitRole)) {
TraceEvent("FastRestore").detail("RestoreLoaderCore", "ExitRole").detail("NodeID", self->id());
TraceEvent("FastRestoreLoaderCoreExitRole", self->id());
break;
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestore")
.detail("RestoreLoaderError", e.what())
.detail("RequestType", requestTypeStr);
TraceEvent(SevWarn, "FastRestoreLoader", self->id()).detail("RequestType", requestTypeStr).error(e, true);
break;
}
}
@ -188,7 +184,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
int rLen = wait(file->read(mutateString(buf), asset.len, asset.offset));
if (rLen != asset.len) throw restore_bad_read();
TraceEvent("FastRestore")
TraceEvent("FastRestoreLoader")
.detail("DecodingLogFile", asset.filename)
.detail("Offset", asset.offset)
.detail("Length", asset.len);
@ -277,7 +273,6 @@ ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions, Lo
// Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted
// mutationMap: Key is the unique identifier for a batch of mutation logs at the same version
state SerializedMutationListMap mutationMap;
state std::map<Standalone<StringRef>, uint32_t> mutationPartMap; // Sanity check the data parsing is correct
state NotifiedVersion processedFileOffset(0);
state std::vector<Future<Void>> fileParserFutures;
state std::map<LoadingParam, VersionedMutationsMap>::iterator kvOpsPerLPIter = batchData->kvOpsPerLP.end();
@ -310,8 +305,8 @@ ACTOR Future<Void> _processLoadingParam(KeyRangeMap<Version>* pRangeVersions, Lo
kvOpsPerLPIter, samplesIter,
&batchData->counters, bc, subAsset));
} else {
fileParserFutures.push_back(_parseLogFileToMutationsOnLoader(&processedFileOffset, &mutationMap,
&mutationPartMap, bc, subAsset));
fileParserFutures.push_back(
_parseLogFileToMutationsOnLoader(&processedFileOffset, &mutationMap, bc, subAsset));
}
}
}
@ -341,6 +336,8 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
.detail("NotProcessed", !paramExist)
.detail("Processed", isReady)
.detail("CurrentMemory", getSystemStatistics().processMemory);
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
wait(isSchedulable(self, req.batchIndex, __FUNCTION__));
@ -381,6 +378,8 @@ ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequ
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile)
.detail("LoaderSendStatus", batchStatus->toString());
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
// Ensure each file is sent exactly once by using batchStatus->sendAllLogs and batchStatus->sendAllRanges
if (!req.useRangeFile) {
@ -458,7 +457,6 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
state VersionedMutationsMap::iterator kvOp = kvOps.begin();
state int kvCount = 0;
state int splitMutationIndex = 0;
state std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
state Version msgIndex = 1; // Monotonically increased index for send message, must start at 1
state std::vector<UID> applierIDs = getApplierIDs(*pRangeToApplier);
state double msgSize = 0; // size of mutations in the message
@ -483,22 +481,20 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
splitMutationIndex = 0;
kvCount = 0;
// applierMutationsBuffer is the mutation vector to be sent to each applier
// applierMutationsSize is buffered mutation vector size for each applier
state std::map<UID, MutationsVec> applierMutationsBuffer;
state std::map<UID, LogMessageVersionVec> applierVersionsBuffer;
state std::map<UID, double> applierMutationsSize;
// applierVersionedMutationsBuffer is the mutation-and-its-version vector to be sent to each applier
state std::map<UID, VersionedMutationsVec> applierVersionedMutationsBuffer;
state int mIndex = 0;
state LogMessageVersion commitVersion;
state std::vector<Future<Void>> fSends;
for (auto& applierID : applierIDs) {
applierMutationsBuffer[applierID] = MutationsVec();
applierVersionsBuffer[applierID] = LogMessageVersionVec();
applierMutationsSize[applierID] = 0.0;
applierVersionedMutationsBuffer[applierID] = VersionedMutationsVec();
}
for (kvOp = kvOps.begin(); kvOp != kvOps.end(); kvOp++) {
const LogMessageVersion& commitVersion = kvOp->first;
commitVersion = kvOp->first;
ASSERT(commitVersion.version >= asset.beginVersion);
ASSERT(commitVersion.version <= asset.endVersion); // endVersion is an empty commit to ensure progress
for (const MutationRef& kvm : kvOp->second) {
for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) {
MutationRef& kvm = kvOp->second[mIndex];
// Send the mutation to applier
if (isRangeMutation(kvm)) {
MutationsVec mvector;
@ -526,9 +522,10 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
.detail("Version", commitVersion.toString())
.detail("Mutation", mutation.toString());
}
applierMutationsBuffer[applierID].push_back_deep(applierMutationsBuffer[applierID].arena(), mutation);
applierVersionsBuffer[applierID].push_back(applierVersionsBuffer[applierID].arena(), commitVersion);
applierMutationsSize[applierID] += mutation.expectedSize();
// CAREFUL: The splitted mutations' lifetime is shorter than the for-loop
// Must use deep copy for splitted mutations
applierVersionedMutationsBuffer[applierID].push_back_deep(
applierVersionedMutationsBuffer[applierID].arena(), VersionedMutation(mutation, commitVersion));
msgSize += mutation.expectedSize();
kvCount++;
@ -546,44 +543,59 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
.detail("Version", commitVersion.toString())
.detail("Mutation", kvm.toString());
}
applierMutationsBuffer[applierID].push_back_deep(applierMutationsBuffer[applierID].arena(), kvm);
applierVersionsBuffer[applierID].push_back(applierVersionsBuffer[applierID].arena(), commitVersion);
applierMutationsSize[applierID] += kvm.expectedSize();
// kvm data is saved in pkvOps in batchData, so shallow copy is ok here.
applierVersionedMutationsBuffer[applierID].push_back(applierVersionedMutationsBuffer[applierID].arena(),
VersionedMutation(kvm, commitVersion));
msgSize += kvm.expectedSize();
}
} // Mutations at the same LogMessageVersion
// Batch same Version's mutations in one request. We could batch more by
// changing the version comparison below.
auto next = std::next(kvOp, 1);
if (next == kvOps.end() || commitVersion.version < next->first.version) {
// if (next == kvOps.end() || msgSize >= SERVER_KNOBS->FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES) {
// TODO: Sanity check each asset has been received exactly once!
// Send the mutations to appliers for each version
// Batch mutations at multiple versions up to FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES size
// to improve bandwidth from a loader to appliers
if (msgSize >= SERVER_KNOBS->FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES) {
std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
for (const UID& applierID : applierIDs) {
requests.emplace_back(applierID,
RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
applierMutationsBuffer[applierID],
applierVersionsBuffer[applierID]));
requests.emplace_back(
applierID, RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
applierVersionedMutationsBuffer[applierID]));
}
TraceEvent(SevDebug, "FastRestoreLoaderSendMutationToApplier")
.detail("MessageIndex", msgIndex)
.detail("RestoreAsset", asset.toString())
.detail("Requests", requests.size());
wait(sendBatchRequests(&RestoreApplierInterface::sendMutationVector, *pApplierInterfaces, requests,
TaskPriority::RestoreLoaderSendMutations));
fSends.push_back(sendBatchRequests(&RestoreApplierInterface::sendMutationVector, *pApplierInterfaces,
requests, TaskPriority::RestoreLoaderSendMutations));
msgIndex++;
msgSize = 0;
requests.clear();
for (auto& applierID : applierIDs) {
applierMutationsBuffer[applierID] = MutationsVec();
applierVersionsBuffer[applierID] = LogMessageVersionVec();
applierMutationsSize[applierID] = 0.0;
applierVersionedMutationsBuffer[applierID] = VersionedMutationsVec();
}
}
} // Mutations at the same LogMessageVersion
} // all versions of mutations in the same file
TraceEvent("FastRestore").detail("LoaderSendMutationOnAppliers", kvCount);
// Send the remaining mutations in the applierMutationsBuffer
if (msgSize > 0) {
// TODO: Sanity check each asset has been received exactly once!
std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
for (const UID& applierID : applierIDs) {
requests.emplace_back(applierID,
RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
applierVersionedMutationsBuffer[applierID]));
}
TraceEvent(SevDebug, "FastRestoreLoaderSendMutationToApplier")
.detail("MessageIndex", msgIndex)
.detail("RestoreAsset", asset.toString())
.detail("Requests", requests.size());
fSends.push_back(sendBatchRequests(&RestoreApplierInterface::sendMutationVector, *pApplierInterfaces, requests,
TaskPriority::RestoreLoaderSendMutations));
}
wait(waitForAll(fSends));
kvOps = VersionedMutationsMap(); // Free memory for parsed mutations at the restore asset.
TraceEvent("FastRestoreLoaderSendMutationToAppliers")
.detail("BatchIndex", batchIndex)
.detail("RestoreAsset", asset.toString())
.detail("Mutations", kvCount);
return Void();
}
@ -646,12 +658,9 @@ void splitMutation(std::map<Key, UID>* pRangeToApplier, MutationRef m, Arena& mv
// key_input format:
// [logRangeMutation.first][hash_value_of_commit_version:1B][bigEndian64(commitVersion)][bigEndian32(part)]
// value_input: serialized binary of mutations at the same version
bool concatenateBackupMutationForLogFile(std::map<Standalone<StringRef>, Standalone<StringRef>>* pMutationMap,
std::map<Standalone<StringRef>, uint32_t>* pMutationPartMap,
Standalone<StringRef> key_input, Standalone<StringRef> val_input,
const RestoreAsset& asset) {
bool concatenateBackupMutationForLogFile(SerializedMutationListMap* pMutationMap, Standalone<StringRef> key_input,
Standalone<StringRef> val_input, const RestoreAsset& asset) {
SerializedMutationListMap& mutationMap = *pMutationMap;
std::map<Standalone<StringRef>, uint32_t>& mutationPartMap = *pMutationPartMap;
const int key_prefix_len = sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t);
StringRefReader readerKey(key_input, restore_corrupted_data()); // read key_input!
@ -678,19 +687,19 @@ bool concatenateBackupMutationForLogFile(std::map<Standalone<StringRef>, Standal
auto it = mutationMap.find(id);
if (it == mutationMap.end()) {
mutationMap.insert(std::make_pair(id, val_input));
mutationMap.emplace(id, std::make_pair(val_input, 0));
if (part != 0) {
TraceEvent(SevError, "FastRestore")
TraceEvent(SevError, "FastRestoreLoader")
.detail("FirstPartNotZero", part)
.detail("KeyInput", getHexString(key_input));
}
mutationPartMap.insert(std::make_pair(id, part));
} else { // Concatenate the val string with the same commitVersion
it->second = it->second.contents().withSuffix(val_input.contents()); // Assign the new Areana to the map's value
auto& currentPart = mutationPartMap[id];
it->second.first =
it->second.first.contents().withSuffix(val_input.contents()); // Assign the new Arena to the map's value
auto& currentPart = it->second.second;
if (part != (currentPart + 1)) {
// Check if the same range or log file has been processed more than once!
TraceEvent(SevError, "FastRestore")
TraceEvent(SevError, "FastRestoreLoader")
.detail("CurrentPart1", currentPart)
.detail("CurrentPart2", part)
.detail("KeyInput", getHexString(key_input))
@ -726,7 +735,7 @@ void _parseSerializedMutation(KeyRangeMap<Version>* pRangeVersions,
for (auto& m : mutationMap) {
StringRef k = m.first.contents();
StringRef val = m.second.contents();
StringRef val = m.second.first.contents();
StringRefReader kReader(k, restore_corrupted_data());
uint64_t commitVersion = kReader.consume<uint64_t>(); // Consume little Endian data
@ -821,7 +830,7 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
try {
Standalone<VectorRef<KeyValueRef>> kvs =
wait(fileBackup::decodeRangeFileBlock(inFile, asset.offset, asset.len));
TraceEvent("FastRestore")
TraceEvent("FastRestoreLoader")
.detail("DecodedRangeFile", asset.filename)
.detail("DataSize", kvs.contents().size());
blockData = kvs;
@ -894,13 +903,12 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
// pMutationMap: concatenated mutation list string at the mutation's commit version
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(NotifiedVersion* pProcessedFileOffset,
SerializedMutationListMap* pMutationMap,
SerializedMutationPartMap* pMutationPartMap,
Reference<IBackupContainer> bc, RestoreAsset asset) {
Reference<IAsyncFile> inFile = wait(bc->readFile(asset.filename));
// decodeLogFileBlock() must read block by block!
state Standalone<VectorRef<KeyValueRef>> data =
wait(parallelFileRestore::decodeLogFileBlock(inFile, asset.offset, asset.len));
TraceEvent("FastRestore")
TraceEvent("FastRestoreLoader")
.detail("DecodedLogFile", asset.filename)
.detail("Offset", asset.offset)
.detail("Length", asset.len)
@ -912,7 +920,7 @@ ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(NotifiedVersion* pPro
if (pProcessedFileOffset->get() == asset.offset) {
for (const KeyValueRef& kv : data) {
// Concatenate the backed-up param1 and param2 (KV) at the same version.
concatenateBackupMutationForLogFile(pMutationMap, pMutationPartMap, kv.key, kv.value, asset);
concatenateBackupMutationForLogFile(pMutationMap, kv.key, kv.value, asset);
}
pProcessedFileOffset->set(asset.offset + asset.len);
}
@ -941,6 +949,9 @@ ACTOR Future<Void> handleFinishVersionBatchRequest(RestoreVersionBatchRequest re
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
if (self->finishedBatch.get() == req.batchIndex - 1) {
self->finishedBatch.set(req.batchIndex);
// Clean up batchData
self->batch.erase(req.batchIndex);
self->status.erase(req.batchIndex);
}
if (self->delayedActors > 0) {
self->checkMemory.trigger();

View File

@ -92,9 +92,9 @@ struct LoaderBatchData : public ReferenceCounted<LoaderBatchData> {
} counters;
explicit LoaderBatchData(UID nodeID, int batchIndex) : counters(this, nodeID, batchIndex), vbState(LoaderVersionBatchState::NOT_INIT) {
pollMetrics =
traceCounters("FastRestoreLoaderMetrics", nodeID, SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
&counters.cc, nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex));
pollMetrics = traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex), nodeID,
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY, &counters.cc,
nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex));
TraceEvent("FastRestoreLoaderMetricsCreated").detail("Node", nodeID);
}
@ -169,7 +169,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
}
void initVersionBatch(int batchIndex) {
TraceEvent("FastRestore").detail("InitVersionBatchOnLoader", nodeID);
TraceEvent("FastRestoreLoaderInitVersionBatch", nodeID).detail("BatchIndex", batchIndex);
batch[batchIndex] = Reference<LoaderBatchData>(new LoaderBatchData(nodeID, batchIndex));
status[batchIndex] = Reference<LoaderBatchStatus>(new LoaderBatchStatus());
}
@ -177,6 +177,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
void resetPerRestoreRequest() {
batch.clear();
status.clear();
finishedBatch = NotifiedVersion(0);
}
void initBackupContainer(Key url) {

View File

@ -81,6 +81,7 @@ ACTOR Future<Void> startRestoreMaster(Reference<RestoreWorkerData> masterWorker,
actors.add(updateHeartbeatTime(self));
actors.add(checkRolesLiveness(self));
actors.add(traceProcessMetrics(self, "RestoreMaster"));
wait(startProcessRestoreRequests(self, cx));
} catch (Error& e) {
@ -193,7 +194,6 @@ ACTOR Future<Void> distributeRestoreSysInfo(Reference<RestoreMasterData> masterD
ACTOR Future<Void> startProcessRestoreRequests(Reference<RestoreMasterData> self, Database cx) {
state UID randomUID = deterministicRandom()->randomUniqueID();
state Standalone<VectorRef<RestoreRequest>> restoreRequests = wait(collectRestoreRequests(cx));
state int numTries = 0;
state int restoreIndex = 0;
TraceEvent("FastRestoreMasterWaitOnRestoreRequests", self->id()).detail("RestoreRequests", restoreRequests.size());
@ -316,7 +316,8 @@ ACTOR static Future<Version> processRestoreRequest(Reference<RestoreMasterData>
TraceEvent("FastRestoreMasterDispatchVersionBatches")
.detail("BatchIndex", batchIndex)
.detail("BatchSize", versionBatch->size)
.detail("RunningVersionBatches", self->runningVersionBatches.get());
.detail("RunningVersionBatches", self->runningVersionBatches.get())
.detail("VersionBatches", versionBatches.size());
self->batch[batchIndex] = Reference<MasterBatchData>(new MasterBatchData());
self->batchStatus[batchIndex] = Reference<MasterBatchStatus>(new MasterBatchStatus());
fBatches.push_back(distributeWorkloadPerVersionBatch(self, batchIndex, cx, request, *versionBatch));
@ -326,7 +327,7 @@ ACTOR static Future<Version> processRestoreRequest(Reference<RestoreMasterData>
wait(waitForAll(fBatches));
TraceEvent("FastRestore").detail("RestoreToVersion", request.targetVersion);
TraceEvent("FastRestoreMaster").detail("RestoreToVersion", request.targetVersion);
return request.targetVersion;
}
@ -403,6 +404,7 @@ ACTOR static Future<Void> loadFilesOnLoaders(Reference<MasterBatchData> batchDat
++paramIdx;
}
TraceEvent(files->size() != paramIdx ? SevError : SevInfo, "FastRestoreMasterPhaseLoadFiles")
.detail("BatchIndex", batchIndex)
.detail("Files", files->size())
.detail("LoadParams", paramIdx);
@ -494,6 +496,12 @@ ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMas
VersionBatch versionBatch) {
state Reference<MasterBatchData> batchData = self->batch[batchIndex];
state Reference<MasterBatchStatus> batchStatus = self->batchStatus[batchIndex];
state double startTime = now();
TraceEvent("FastRestoreMasterDispatchVersionBatchesStart")
.detail("BatchIndex", batchIndex)
.detail("BatchSize", versionBatch.size)
.detail("RunningVersionBatches", self->runningVersionBatches.get());
self->runningVersionBatches.set(self->runningVersionBatches.get() + 1);
@ -540,6 +548,13 @@ ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMas
if (self->delayedActors > 0) {
self->checkMemory.trigger();
}
TraceEvent("FastRestoreMasterDispatchVersionBatchesDone")
.detail("BatchIndex", batchIndex)
.detail("BatchSize", versionBatch.size)
.detail("RunningVersionBatches", self->runningVersionBatches.get())
.detail("Latency", now() - startTime);
return Void();
}
@ -549,6 +564,9 @@ ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMas
void splitKeyRangeForAppliers(Reference<MasterBatchData> batchData,
std::map<UID, RestoreApplierInterface> appliersInterf, int batchIndex) {
ASSERT(batchData->samplesSize >= 0);
// Sanity check: samples should not be used after freed
ASSERT((batchData->samplesSize > 0 && !batchData->samples.empty()) ||
batchData->samplesSize == 0 && batchData->samples.empty());
int numAppliers = appliersInterf.size();
double slotSize = std::max(batchData->samplesSize / numAppliers, 1.0);
double cumulativeSize = slotSize;
@ -607,6 +625,7 @@ void splitKeyRangeForAppliers(Reference<MasterBatchData> batchData,
.detail("BatchIndex", batchIndex)
.detail("SamplingSize", batchData->samplesSize)
.detail("SlotSize", slotSize);
batchData->samples.clear();
}
ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequests(Database cx) {
@ -816,17 +835,17 @@ ACTOR static Future<Void> notifyApplierToApplyMutations(Reference<MasterBatchDat
Reference<MasterBatchStatus> batchStatus,
std::map<UID, RestoreApplierInterface> appliersInterf,
int batchIndex, NotifiedVersion* finishedBatch) {
wait(finishedBatch->whenAtLeast(batchIndex - 1));
TraceEvent("FastRestoreMasterPhaseApplyToDB")
TraceEvent("FastRestoreMasterPhaseApplyToDBStart")
.detail("BatchIndex", batchIndex)
.detail("FinishedBatch", finishedBatch->get());
wait(finishedBatch->whenAtLeast(batchIndex - 1));
if (finishedBatch->get() == batchIndex - 1) {
// Prepare the applyToDB requests
std::vector<std::pair<UID, RestoreVersionBatchRequest>> requests;
TraceEvent("FastRestoreMasterPhaseApplyToDB")
TraceEvent("FastRestoreMasterPhaseApplyToDBRunning")
.detail("BatchIndex", batchIndex)
.detail("Appliers", appliersInterf.size());
for (auto& applier : appliersInterf) {
@ -945,7 +964,7 @@ ACTOR static Future<Void> signalRestoreCompleted(Reference<RestoreMasterData> se
}
}
TraceEvent("FastRestore").detail("RestoreMaster", "AllRestoreCompleted");
TraceEvent("FastRestoreMasterAllRestoreCompleted");
return Void();
}

View File

@ -89,7 +89,7 @@ struct MasterBatchData : public ReferenceCounted<MasterBatchData> {
if (applierToRange.find(applier.second) == applierToRange.end()) {
applierToRange[applier.second] = applier.first;
} else {
TraceEvent(SevError, "FastRestore")
TraceEvent(SevError, "FastRestoreMaster")
.detail("SanityCheckApplierKeyRange", applierToRange.size())
.detail("ApplierID", applier.second)
.detail("Key1", applierToRange[applier.second])

View File

@ -57,6 +57,9 @@ ACTOR Future<Void> handleInitVersionBatchRequest(RestoreVersionBatchRequest req,
.detail("BatchIndex", req.batchIndex)
.detail("Role", getRoleStr(self->role))
.detail("VersionBatchNotifiedVersion", self->versionBatchId.get());
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
// batchId is continuous. (req.batchIndex-1) is the id of the just finished batch.
wait(self->versionBatchId.whenAtLeast(req.batchIndex - 1));
@ -110,7 +113,8 @@ ACTOR Future<Void> isSchedulable(Reference<RestoreRoleData> self, int actorBatch
}
if (memory < memoryThresholdBytes || self->finishedBatch.get() + 1 == actorBatchIndex) {
if (memory >= memoryThresholdBytes) {
TraceEvent(SevWarn, "FastRestoreMemoryUsageAboveThreshold")
TraceEvent(SevWarn, "FastRestoreMemoryUsageAboveThreshold", self->id())
.detail("Role", getRoleStr(self->role))
.detail("BatchIndex", actorBatchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("Actor", name)
@ -119,10 +123,12 @@ ACTOR Future<Void> isSchedulable(Reference<RestoreRoleData> self, int actorBatch
self->delayedActors--;
break;
} else {
TraceEvent(SevDebug, "FastRestoreMemoryUsageAboveThresholdWait")
TraceEvent(SevInfo, "FastRestoreMemoryUsageAboveThresholdWait", self->id())
.detail("Role", getRoleStr(self->role))
.detail("BatchIndex", actorBatchIndex)
.detail("Actor", name)
.detail("CurrentMemory", memory);
// TODO: Set FASTRESTORE_WAIT_FOR_MEMORY_LATENCY to a large value; this wait should then be avoidable.
wait(delay(SERVER_KNOBS->FASTRESTORE_WAIT_FOR_MEMORY_LATENCY) || self->checkMemory.onTrigger());
}
}

View File

@ -104,8 +104,6 @@ public:
NotifiedVersion versionBatchId; // The index of the version batch that has been initialized and put into pipeline
NotifiedVersion finishedBatch; // The highest batch index all appliers have applied mutations
bool versionBatchStart = false;
RestoreRoleData() : role(RestoreRole::Invalid), cpuUsage(0.0), memory(0.0), residentMemory(0.0), delayedActors(0){};
virtual ~RestoreRoleData() = default;

View File

@ -38,8 +38,25 @@
#define SevFRMutationInfo SevVerbose
//#define SevFRMutationInfo SevInfo
// A mutation paired with the LogMessageVersion at which it was committed.
// Lets the fast-restore loader ship (mutation, version) pairs to appliers in a
// single buffer instead of two parallel vectors.
// NOTE: member order here is the wire order used by serialize(); do not reorder.
struct VersionedMutation {
	MutationRef mutation;
	LogMessageVersion version;

	VersionedMutation() = default;
	explicit VersionedMutation(MutationRef mutation, LogMessageVersion version)
	  : mutation(mutation), version(version) {}
	// Deep-copying constructor: copies the mutation's bytes into arena so the
	// copy can outlive the buffer backing vm.mutation.
	explicit VersionedMutation(Arena& arena, const VersionedMutation& vm)
	  : mutation(arena, vm.mutation), version(vm.version) {}

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, mutation, version);
	}
};
using MutationsVec = Standalone<VectorRef<MutationRef>>;
using LogMessageVersionVec = Standalone<VectorRef<LogMessageVersion>>;
using VersionedMutationsVec = Standalone<VectorRef<VersionedMutation>>;
enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier };
BINARY_SERIALIZABLE(RestoreRole);

View File

@ -66,7 +66,7 @@ ACTOR Future<Void> handlerTerminateWorkerRequest(RestoreSimpleRequest req, Refer
return Void();
}));
TraceEvent("FastRestore").detail("HandleTerminateWorkerReq", self->id());
TraceEvent("FastRestoreWorker").detail("HandleTerminateWorkerReq", self->id());
return Void();
}
@ -97,7 +97,7 @@ void handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Reference<RestoreWo
DUMPTOKEN(recruited.collectRestoreRoleInterfaces);
DUMPTOKEN(recruited.finishRestore);
actors->add(restoreLoaderCore(self->loaderInterf.get(), req.nodeIndex, cx));
TraceEvent("FastRestore").detail("RecruitedLoaderNodeIndex", req.nodeIndex);
TraceEvent("FastRestoreWorker").detail("RecruitedLoaderNodeIndex", req.nodeIndex);
req.reply.send(
RestoreRecruitRoleReply(self->loaderInterf.get().id(), RestoreRole::Loader, self->loaderInterf.get()));
} else if (req.role == RestoreRole::Applier) {
@ -111,12 +111,11 @@ void handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Reference<RestoreWo
DUMPTOKEN(recruited.collectRestoreRoleInterfaces);
DUMPTOKEN(recruited.finishRestore);
actors->add(restoreApplierCore(self->applierInterf.get(), req.nodeIndex, cx));
TraceEvent("FastRestore").detail("RecruitedApplierNodeIndex", req.nodeIndex);
TraceEvent("FastRestoreWorker").detail("RecruitedApplierNodeIndex", req.nodeIndex);
req.reply.send(
RestoreRecruitRoleReply(self->applierInterf.get().id(), RestoreRole::Applier, self->applierInterf.get()));
} else {
TraceEvent(SevError, "FastRestore")
.detail("HandleRecruitRoleRequest", "UnknownRole"); //.detail("Request", req.printable());
TraceEvent(SevError, "FastRestoreWorkerHandleRecruitRoleRequestUnknownRole").detail("Request", req.toString());
}
return;
@ -147,7 +146,7 @@ ACTOR Future<Void> collectRestoreWorkerInterface(Reference<RestoreWorkerData> se
}
break;
}
TraceEvent("FastRestore")
TraceEvent("FastRestoreWorker")
.suppressFor(10.0)
.detail("NotEnoughWorkers", agentValues.size())
.detail("MinWorkers", min_num_workers);
@ -158,7 +157,7 @@ ACTOR Future<Void> collectRestoreWorkerInterface(Reference<RestoreWorkerData> se
}
ASSERT(agents.size() >= min_num_workers); // ASSUMPTION: We must have at least 1 loader and 1 applier
TraceEvent("FastRestore").detail("CollectWorkerInterfaceNumWorkers", self->workerInterfaces.size());
TraceEvent("FastRestoreWorker").detail("CollectWorkerInterfaceNumWorkers", self->workerInterfaces.size());
return Void();
}
@ -182,12 +181,12 @@ ACTOR Future<Void> monitorWorkerLiveness(Reference<RestoreWorkerData> self) {
ACTOR Future<Void> startRestoreWorkerLeader(Reference<RestoreWorkerData> self, RestoreWorkerInterface workerInterf,
Database cx) {
// We must wait for enough time to make sure all restore workers have registered their workerInterfaces into the DB
TraceEvent("FastRestore")
TraceEvent("FastRestoreWorker")
.detail("Master", workerInterf.id())
.detail("WaitForRestoreWorkerInterfaces",
SERVER_KNOBS->FASTRESTORE_NUM_LOADERS + SERVER_KNOBS->FASTRESTORE_NUM_APPLIERS);
wait(delay(10.0));
TraceEvent("FastRestore")
TraceEvent("FastRestoreWorker")
.detail("Master", workerInterf.id())
.detail("CollectRestoreWorkerInterfaces",
SERVER_KNOBS->FASTRESTORE_NUM_LOADERS + SERVER_KNOBS->FASTRESTORE_NUM_APPLIERS);
@ -236,14 +235,12 @@ ACTOR Future<Void> startRestoreWorker(Reference<RestoreWorkerData> self, Restore
exitRole = handlerTerminateWorkerRequest(req, self, interf, cx);
}
when(wait(exitRole)) {
TraceEvent("FastRestore").detail("RestoreWorkerCore", "ExitRole").detail("NodeID", self->id());
TraceEvent("FastRestoreWorkerCoreExitRole", self->id());
break;
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestore")
.detail("RestoreWorkerError", e.what())
.detail("RequestType", requestTypeStr);
TraceEvent(SevWarn, "FastRestoreWorkerError").detail("RequestType", requestTypeStr).error(e, true);
break;
}
}

View File

@ -57,6 +57,7 @@ struct RestoreWorkerData : NonCopyable, public ReferenceCounted<RestoreWorkerDa
RestoreWorkerData() = default;
~RestoreWorkerData() {
TraceEvent("RestoreWorkerDataDeleted").detail("WorkerID", workerID.toString());
printf("[Exit] Worker:%s RestoreWorkerData is deleted\n", workerID.toString().c_str());
}

View File

@ -0,0 +1,101 @@
/*
* ServerDBInfo.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_SERVERDBINFO_ACTOR_G_H)
#define FDBSERVER_SERVERDBINFO_ACTOR_G_H
#include "fdbserver/ServerDBInfo.actor.g.h"
#elif !defined(FDBSERVER_SERVERDBINFO_ACTOR_H)
#define FDBSERVER_SERVERDBINFO_ACTOR_H
#define FDBSERVER_SERVERDBINFO_H
#pragma once
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/LogSystemConfig.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/LatencyBandConfig.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct ServerDBInfo {
	constexpr static FileIdentifier file_identifier = 13838807;
	// This structure contains transient information which is broadcast to all workers for a database,
	// permitting them to communicate with each other. It is not available to the client. This mechanism
	// (see GetServerDBInfoRequest) is closely parallel to OpenDatabaseRequest for the client.

	UID id; // Changes each time any other member changes
	ClusterControllerFullInterface clusterInterface;
	ClientDBInfo client; // After a successful recovery, eventually proxies that communicate with it
	Optional<DataDistributorInterface> distributor; // The best guess of current data distributor.
	MasterInterface master; // The best guess as to the most recent master, which might still be recovering
	Optional<RatekeeperInterface> ratekeeper;
	std::vector<ResolverInterface> resolvers;
	DBRecoveryCount recoveryCount; // A recovery count from DBCoreState. A successful master recovery increments it twice; unsuccessful recoveries may increment it once. Depending on where the current master is in its recovery process, this might not have been written by the current master.
	RecoveryState recoveryState;
	LifetimeToken masterLifetime; // Used by masterserver to detect not being the currently chosen master
	LocalityData myLocality; // (Not serialized) Locality information, if available, for the *local* process
	LogSystemConfig logSystemConfig;
	std::vector<UID> priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails
	Optional<LatencyBandConfig> latencyBandConfig;
	std::vector<std::pair<uint16_t,StorageServerInterface>> storageCaches; // (16-bit id, interface) pairs for storage caches
	int64_t infoGeneration; // Generation number of this broadcast info; presumably monotonic — confirm against the broadcaster

	ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0), infoGeneration(0) {}

	// Equality compares only id, which changes whenever any other member changes.
	bool operator == (ServerDBInfo const& r) const { return id == r.id; }
	bool operator != (ServerDBInfo const& r) const { return id != r.id; }

	template <class Ar>
	void serialize( Ar& ar ) {
		// myLocality is intentionally omitted: it is local-only (see member comment).
		serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches, infoGeneration);
	}
};
// Request that pushes a pre-serialized ServerDBInfo payload to workers.
// broadcastInfo lists endpoints the receiver should presumably relay the
// update to (see broadcastDBInfoRequest) — confirm against the handler.
// The reply carries back a vector of Endpoints.
struct UpdateServerDBInfoRequest {
	constexpr static FileIdentifier file_identifier = 9467438;
	Standalone<StringRef> serializedDbInfo; // ServerDBInfo already serialized by the sender
	std::vector<Endpoint> broadcastInfo;
	ReplyPromise<std::vector<Endpoint>> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, serializedDbInfo, broadcastInfo, reply);
	}
};
// Request for the current ServerDBInfo. knownServerInfoID carries the id the
// requester already has; presumably the server uses it to defer or skip replies
// when nothing changed — TODO confirm against the request handler.
struct GetServerDBInfoRequest {
	constexpr static FileIdentifier file_identifier = 9467439;
	UID knownServerInfoID;
	ReplyPromise<struct ServerDBInfo> reply;

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, knownServerInfoID, reply);
	}
};
ACTOR Future<Void> broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply);
ACTOR Future<std::vector<Endpoint>> broadcastDBInfoRequest(UpdateServerDBInfoRequest req, int sendAmount, Optional<Endpoint> sender, bool sendReply);
#include "flow/unactorcompiler.h"
#endif

View File

@ -22,74 +22,6 @@
#define FDBSERVER_SERVERDBINFO_H
#pragma once
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/LogSystemConfig.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/LatencyBandConfig.h"
#include "fdbserver/WorkerInterface.actor.h"
struct ServerDBInfo {
constexpr static FileIdentifier file_identifier = 13838807;
// This structure contains transient information which is broadcast to all workers for a database,
// permitting them to communicate with each other. It is not available to the client. This mechanism
// (see GetServerDBInfoRequest) is closely parallel to OpenDatabaseRequest for the client.
UID id; // Changes each time any other member changes
ClusterControllerFullInterface clusterInterface;
ClientDBInfo client; // After a successful recovery, eventually proxies that communicate with it
Optional<DataDistributorInterface> distributor; // The best guess of current data distributor.
MasterInterface master; // The best guess as to the most recent master, which might still be recovering
Optional<RatekeeperInterface> ratekeeper;
std::vector<ResolverInterface> resolvers;
DBRecoveryCount recoveryCount; // A recovery count from DBCoreState. A successful master recovery increments it twice; unsuccessful recoveries may increment it once. Depending on where the current master is in its recovery process, this might not have been written by the current master.
RecoveryState recoveryState;
LifetimeToken masterLifetime; // Used by masterserver to detect not being the currently chosen master
LocalityData myLocality; // (Not serialized) Locality information, if available, for the *local* process
LogSystemConfig logSystemConfig;
std::vector<UID> priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails
Optional<LatencyBandConfig> latencyBandConfig;
std::vector<std::pair<uint16_t,StorageServerInterface>> storageCaches;
int64_t infoGeneration;
ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0), infoGeneration(0) {}
bool operator == (ServerDBInfo const& r) const { return id == r.id; }
bool operator != (ServerDBInfo const& r) const { return id != r.id; }
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches, infoGeneration);
}
};
struct UpdateServerDBInfoRequest {
constexpr static FileIdentifier file_identifier = 9467438;
Standalone<StringRef> serializedDbInfo;
std::vector<Endpoint> broadcastInfo;
ReplyPromise<std::vector<Endpoint>> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, serializedDbInfo, broadcastInfo, reply);
}
};
struct GetServerDBInfoRequest {
constexpr static FileIdentifier file_identifier = 9467439;
UID knownServerInfoID;
ReplyPromise<struct ServerDBInfo> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, knownServerInfoID, reply);
}
};
Future<Void> broadcastTxnRequest(TxnStateRequest const& req, int const& sendAmount, bool const& sendReply);
Future<std::vector<Endpoint>> broadcastDBInfoRequest(UpdateServerDBInfoRequest const& req, int const& sendAmount, Optional<Endpoint> const& sender, bool const& sendReply);
#include "fdbserver/ServerDBInfo.actor.h"
#endif

View File

@ -31,9 +31,7 @@
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/BackupAgent.actor.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#undef max

View File

@ -843,7 +843,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
double networkMetricsElapsed = networkMetrics.getDouble("Elapsed");
try {
double runLoopBusy = networkMetrics.getDouble("PriorityBusy1");
double runLoopBusy = networkMetrics.getDouble("PriorityStarvedBelow1");
statusObj["run_loop_busy"] = runLoopBusy / networkMetricsElapsed;
}
catch(Error &e) {
@ -974,9 +974,9 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(WorkerDetails
// TODO: time_in_recovery: 0.5
// time_in_state: 0.1
TraceEventFields md = wait(activeGens);
if(md.size()) {
int activeGenerations = md.getInt("ActiveGenerations");
TraceEventFields mdActiveGens = wait(activeGens);
if(mdActiveGens.size()) {
int activeGenerations = mdActiveGens.getInt("ActiveGenerations");
message["active_generations"] = activeGenerations;
}

View File

@ -413,6 +413,62 @@ struct StorageServerMetrics {
Future<Void> waitMetrics(WaitMetricsRequest req, Future<Void> delay);
// Given a read hot shard, this function will divide the shard into chunks and find those chunks whose
// readBytes/sizeBytes exceeds the `readDensityRatio`. Please make sure to run unit tests
// `StorageMetricsSampleTests.txt` after change made.
	// Divide `shard` into chunks of roughly `baseChunkSize` bytes (per the byte sample)
	// and return those chunks whose sampled read bandwidth exceeds `readDensityRatio`
	// times their sampled size. Adjacent qualifying chunks are merged into one range.
	// Returns empty when the whole shard's read bandwidth (scaled by
	// STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS) is at or below
	// `minShardReadBandwidthPerKSeconds`.
	// NOTE(review): the returned KeyRangeRefs alias keys owned by the byte sample;
	// confirm callers do not hold them across sample mutation.
	std::vector<KeyRangeRef> getReadHotRanges(KeyRangeRef shard, double readDensityRatio, int64_t baseChunkSize,
	                                          int64_t minShardReadBandwidthPerKSeconds) {
		std::vector<KeyRangeRef> toReturn;
		double shardSize = (double)byteSample.getEstimate(shard);
		int64_t shardReadBandwidth = bytesReadSample.getEstimate(shard);
		// Cheap early-out: the whole shard is not read-hot enough to bother chunking.
		if (shardReadBandwidth * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS <=
		    minShardReadBandwidthPerKSeconds) {
			return toReturn;
		}
		if (shardSize <= baseChunkSize) {
			// Shard is small, use it as is
			if (bytesReadSample.getEstimate(shard) > (readDensityRatio * shardSize)) {
				toReturn.push_back(shard);
			}
			return toReturn;
		}
		KeyRef beginKey = shard.begin;
		// Advance endKey to the sampled key roughly baseChunkSize bytes past beginKey.
		IndexedSet<Key, int64_t>::iterator endKey =
		    byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + baseChunkSize);
		while (endKey != byteSample.sample.end()) {
			// Clamp the chunk end to the shard boundary.
			// NOTE(review): once beginKey reaches a key >= shard.end, this clamp can keep
			// producing *endKey == beginKey after the ++endKey below — verify termination
			// for shards whose end precedes the last sampled key.
			if (*endKey > shard.end) endKey = byteSample.sample.lower_bound(shard.end);
			if (*endKey == beginKey) {
				++endKey;
				continue;
			}
			// Chunk qualifies when its read estimate exceeds ratio * max(chunk size, baseChunkSize).
			if (bytesReadSample.getEstimate(KeyRangeRef(beginKey, *endKey)) >
			    (readDensityRatio * std::max(baseChunkSize, byteSample.getEstimate(KeyRangeRef(beginKey, *endKey))))) {
				auto range = KeyRangeRef(beginKey, *endKey);
				if (!toReturn.empty() && toReturn.back().end == range.begin) {
					// in case two consecutive chunks both are over the ratio, merge them.
					auto updatedTail = KeyRangeRef(toReturn.back().begin, *endKey);
					toReturn.pop_back();
					toReturn.push_back(updatedTail);
				} else {
					toReturn.push_back(range);
				}
			}
			beginKey = *endKey;
			endKey = byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) +
			                                 baseChunkSize);
		}
		return toReturn;
	}
void getReadHotRanges(ReadHotSubRangeRequest req) {
ReadHotSubRangeReply reply;
std::vector<KeyRangeRef> v = getReadHotRanges(req.keys, SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO,
SERVER_KNOBS->READ_HOT_SUB_RANGE_CHUNK_SIZE,
SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS);
reply.readHotRanges = VectorRef<KeyRangeRef>(v.data(), v.size());
req.reply.send(reply);
}
private:
static void collapse( KeyRangeMap<int>& map, KeyRef const& key ) {
auto range = map.rangeContaining(key);
@ -433,6 +489,100 @@ private:
}
};
TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/simple") {
int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE;
StorageServerMetrics ssm;
ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 1000 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit);
vector<KeyRangeRef> t =
ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 2.0, 200 * sampleUnit, 0);
ASSERT(t.size() == 1 && (*t.begin()).begin == LiteralStringRef("Bah") &&
(*t.begin()).end == LiteralStringRef("Bob"));
return Void();
}
TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/moreThanOneRange") {
int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE;
StorageServerMetrics ssm;
ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Final"), 2000 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Dah"), 300 * sampleUnit);
vector<KeyRangeRef> t =
ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 2.0, 200 * sampleUnit, 0);
ASSERT(t.size() == 2 && (*t.begin()).begin == LiteralStringRef("Bah") &&
(*t.begin()).end == LiteralStringRef("Bob"));
ASSERT(t.at(1).begin == LiteralStringRef("Cat") && t.at(1).end == LiteralStringRef("Dah"));
return Void();
}
TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/consecutiveRanges") {
int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE;
StorageServerMetrics ssm;
ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Bucket"), 2000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 5000 * sampleUnit);
ssm.bytesReadSample.sample.insert(LiteralStringRef("Final"), 2000 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit);
ssm.byteSample.sample.insert(LiteralStringRef("Dah"), 300 * sampleUnit);
vector<KeyRangeRef> t =
ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 2.0, 200 * sampleUnit, 0);
ASSERT(t.size() == 2 && (*t.begin()).begin == LiteralStringRef("Bah") &&
(*t.begin()).end == LiteralStringRef("But"));
ASSERT(t.at(1).begin == LiteralStringRef("Cat") && t.at(1).end == LiteralStringRef("Dah"));
return Void();
}
//Contains information about whether or not a key-value pair should be included in a byte sample
//Also contains size information about the byte sample
struct ByteSampleInfo {

View File

@ -3074,7 +3074,7 @@ private:
};
public:
#include "ArtMutationBuffer.h"
#include "fdbserver/ArtMutationBuffer.h"
struct MutationBufferStdMap {
MutationBufferStdMap() {
// Create range representing the entire keyspace. This reduces edge cases to applying mutations
@ -4691,7 +4691,7 @@ public:
};
};
#include "art_impl.h"
#include "fdbserver/art_impl.h"
RedwoodRecordRef VersionedBTree::dbBegin(StringRef(), 0);
RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff"));

View File

@ -20,6 +20,7 @@
#include <stdint.h>
#include "fdbclient/FDBTypes.h"
#include "flow/Arena.h"
#include "flow/Platform.h"
@ -47,7 +48,11 @@ struct art_tree {
#define ART_NEITHER 0
#define ART_IS_LEAF(x) ( (*((ART_NODE_TYPE*)x) == ART_LEAF))
//#define ART_IS_LEAF(x) ( (*((ART_NODE_TYPE*)x) == ART_LEAF))
template<class T>
static inline bool ART_IS_LEAF(T const& x) {
return *((ART_NODE_TYPE*)x) == ART_LEAF;
}
#define ART_LEAF_RAW(x) ((art_leaf*)(x))

View File

@ -54,9 +54,7 @@
#include "fdbrpc/AsyncFileCached.actor.h"
#include "fdbserver/CoroFlow.h"
#include "flow/TLSConfig.actor.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
#include "fdbmonitor/SimpleIni.h"
@ -1868,7 +1866,7 @@ int main(int argc, char* argv[]) {
vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
actors.push_back(restoreWorker(opts.connectionFile, opts.localities, dataFolder));
f = stopAfter(waitForAll(actors));
printf("Fast restore worker exits\n");
printf("Fast restore worker started\n");
g_network->run();
printf("g_network->run() done\n");
} else { // Call fdbd roles in conventional way

View File

@ -1046,9 +1046,10 @@ ACTOR Future<Void> getShardStateQ( StorageServer* data, GetShardStateRequest req
return Void();
}
void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output, VectorRef<KeyValueRef> const& base,
StorageServer::VersionedData::iterator& start, StorageServer::VersionedData::iterator const& end,
int versionedDataCount, int limit, bool stopAtEndOfBase, int limitBytes = 1<<30 )
void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output,
VectorRef<KeyValueRef> const& vm_output,
VectorRef<KeyValueRef> const& base,
int& vCount, int limit, bool stopAtEndOfBase, int& pos, int limitBytes = 1<<30 )
// Combines data from base (at an older version) with sets from newer versions in [start, end) and appends the first (up to) |limit| rows to output
// If limit<0, base and output are in descending order, and start->key()>end->key(), but start is still inclusive and end is exclusive
{
@ -1058,17 +1059,17 @@ void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output
if (!forward) limit = -limit;
int adjustedLimit = limit + output.size();
int accumulatedBytes = 0;
KeyValueRef const* baseStart = base.begin();
KeyValueRef const* baseEnd = base.end();
while (baseStart!=baseEnd && start!=end && output.size() < adjustedLimit && accumulatedBytes < limitBytes) {
if (forward ? baseStart->key < start.key() : baseStart->key > start.key()) {
while (baseStart!=baseEnd && vCount>0 && output.size() < adjustedLimit && accumulatedBytes < limitBytes) {
if (forward ? baseStart->key < vm_output[pos].key : baseStart->key > vm_output[pos].key) {
output.push_back_deep( arena, *baseStart++ );
}
else {
output.push_back_deep( arena, KeyValueRef(start.key(), start->getValue()) );
if (baseStart->key == start.key()) ++baseStart;
if (forward) ++start; else --start;
output.push_back_deep( arena, vm_output[pos]);
if (baseStart->key == vm_output[pos].key) ++baseStart;
++pos;
vCount--;
}
accumulatedBytes += sizeof(KeyValueRef) + output.end()[-1].expectedSize();
}
@ -1077,10 +1078,11 @@ void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output
accumulatedBytes += sizeof(KeyValueRef) + output.end()[-1].expectedSize();
}
if( !stopAtEndOfBase ) {
while (start!=end && output.size() < adjustedLimit && accumulatedBytes < limitBytes) {
output.push_back_deep( arena, KeyValueRef(start.key(), start->getValue()) );
while (vCount>0 && output.size() < adjustedLimit && accumulatedBytes < limitBytes) {
output.push_back_deep( arena, vm_output[pos]);
accumulatedBytes += sizeof(KeyValueRef) + output.end()[-1].expectedSize();
if (forward) ++start; else --start;
++pos;
vCount--;
}
}
}
@ -1090,12 +1092,19 @@ void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output
ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes ) {
state GetKeyValuesReply result;
state StorageServer::VersionedData::ViewAtVersion view = data->data().at(version);
state StorageServer::VersionedData::iterator vStart = view.end();
state StorageServer::VersionedData::iterator vEnd = view.end();
state StorageServer::VersionedData::iterator vCurrent = view.end();
state KeyRef readBegin;
state KeyRef readEnd;
state Key readBeginTemp;
state int vCount;
state int vCount = 0;
// for caching the storage queue results during the first PTree traversal
state VectorRef<KeyValueRef> resultCache;
// for remembering the position in the resultCache
state int pos = 0;
// Check if the desired key-range intersects the cached key-ranges
// TODO Find a more efficient way to do it
@ -1107,40 +1116,50 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
if (limit >= 0) {
// We might care about a clear beginning before start that
// runs into range
vStart = view.lastLessOrEqual(range.begin);
if (vStart && vStart->isClearTo() && vStart->getEndKey() > range.begin)
readBegin = vStart->getEndKey();
vCurrent = view.lastLessOrEqual(range.begin);
if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() > range.begin)
readBegin = vCurrent->getEndKey();
else
readBegin = range.begin;
vStart = view.lower_bound(readBegin);
vCurrent = view.lower_bound(readBegin);
while (limit>0 && *pLimitBytes>0 && readBegin < range.end) {
ASSERT( !vStart || vStart.key() >= readBegin );
if (vStart) { auto b = vStart; --b; ASSERT( !b || b.key() < readBegin ); }
ASSERT( !vCurrent || vCurrent.key() >= readBegin );
ASSERT( data->storageVersion() <= version );
// Read up to limit items from the view, stopping at the next clear (or the end of the range)
vEnd = vStart;
vCount = 0;
int vSize = 0;
while (vEnd && vEnd.key() < range.end && !vEnd->isClearTo() && vCount < limit && vSize < *pLimitBytes){
vSize += sizeof(KeyValueRef) + vEnd->getValue().expectedSize() + vEnd.key().expectedSize();
++vCount;
++vEnd;
/* Traverse the PTree further, if thare are no unconsumed resultCache items */
if (pos == resultCache.size()) {
if (vCurrent) {
auto b = vCurrent;
--b;
ASSERT(!b || b.key() < readBegin);
}
// Read the data on disk up to vEnd (or the end of the range)
readEnd = vEnd ? std::min( vEnd.key(), range.end ) : range.end;
// Read up to limit items from the view, stopping at the next clear (or the end of the range)
int vSize = 0;
while (vCurrent && vCurrent.key() < range.end && !vCurrent->isClearTo() && vCount < limit &&
vSize < *pLimitBytes) {
// Store the versionedData results in resultCache
resultCache.push_back(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue()));
vSize += sizeof(KeyValueRef) + resultCache.cback().expectedSize();
++vCount;
++vCurrent;
}
}
// Read the data on disk up to vCurrent (or the end of the range)
readEnd = vCurrent ? std::min( vCurrent.key(), range.end ) : range.end;
Standalone<RangeResultRef> atStorageVersion = wait(
data->storage.readRange( KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes ) );
ASSERT( atStorageVersion.size() <= limit );
if (data->storageVersion() > version) throw transaction_too_old();
// merge the sets in [vStart,vEnd) with the sets on disk, stopping at the last key from disk if we were limited
// merge the sets in resultCache with the sets on disk, stopping at the last key from disk if there is 'more'
int prevSize = result.data.size();
merge( result.arena, result.data, atStorageVersion, vStart, vEnd, vCount, limit, atStorageVersion.more, *pLimitBytes );
merge( result.arena, result.data, resultCache,
atStorageVersion, vCount, limit, atStorageVersion.more, pos, *pLimitBytes );
limit -= result.data.size() - prevSize;
for (auto i = result.data.begin() + prevSize; i != result.data.end(); i++) {
@ -1151,50 +1170,56 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
break;
}
// Setup for the next iteration
// If we hit our limits reading from disk but then combining with MVCC gave us back more room
if (atStorageVersion.more) {
if (atStorageVersion.more) { // if there might be more data, begin reading right after what we already found to find out
ASSERT(result.data.end()[-1].key == atStorageVersion.end()[-1].key);
readBegin = readBeginTemp = keyAfter(result.data.end()[-1].key);
} else if (vEnd && vEnd->isClearTo()) {
ASSERT(vStart == vEnd); // vStart will have been advanced by merge()
ASSERT(vEnd->getEndKey() > readBegin);
readBegin = vEnd->getEndKey();
++vStart;
readBegin = readBeginTemp = keyAfter( result.data.end()[-1].key );
} else if (vCurrent && vCurrent->isClearTo()){ // if vCurrent is a clear, skip it.
ASSERT(vCurrent->getEndKey() > readBegin);
readBegin = vCurrent->getEndKey(); // next disk read should start at the end of the clear
++vCurrent;
} else {
ASSERT(readEnd == range.end);
break;
}
}
} else {
vStart = view.lastLess(range.end);
vCurrent = view.lastLess(range.end);
// A clear might extend all the way to range.end
if (vStart && vStart->isClearTo() && vStart->getEndKey() >= range.end) {
readEnd = vStart.key();
--vStart;
if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() >= range.end) {
readEnd = vCurrent.key();
--vCurrent;
} else {
readEnd = range.end;
}
while (limit < 0 && *pLimitBytes > 0 && readEnd > range.begin) {
ASSERT(!vStart || vStart.key() < readEnd);
if (vStart) {
auto b = vStart;
ASSERT(!vCurrent || vCurrent.key() < readEnd);
ASSERT(data->storageVersion() <= version);
/* Traverse the PTree further, if thare are no unconsumed resultCache items */
if (pos == resultCache.size()) {
if (vCurrent) {
auto b = vCurrent;
++b;
ASSERT(!b || b.key() >= readEnd);
}
ASSERT(data->storageVersion() <= version);
vEnd = vStart;
vCount = 0;
int vSize=0;
while (vEnd && vEnd.key() >= range.begin && !vEnd->isClearTo() && vCount < -limit && vSize < *pLimitBytes){
vSize += sizeof(KeyValueRef) + vEnd->getValue().expectedSize() + vEnd.key().expectedSize();
int vSize = 0;
while (vCurrent && vCurrent.key() >= range.begin && !vCurrent->isClearTo() && vCount < -limit &&
vSize < *pLimitBytes) {
// Store the versionedData results in resultCache
resultCache.push_back(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue()));
vSize += sizeof(KeyValueRef) + resultCache.cback().expectedSize();
++vCount;
--vEnd;
--vCurrent;
}
}
readBegin = vEnd ? std::max(vEnd->isClearTo() ? vEnd->getEndKey() : vEnd.key(), range.begin) : range.begin;
readBegin = vCurrent ? std::max(vCurrent->isClearTo() ? vCurrent->getEndKey() : vCurrent.key(), range.begin) : range.begin;
Standalone<RangeResultRef> atStorageVersion =
wait(data->storage.readRange(KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes));
@ -1202,7 +1227,8 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
if (data->storageVersion() > version) throw transaction_too_old();
int prevSize = result.data.size();
merge(result.arena, result.data, atStorageVersion, vStart, vEnd, vCount, limit, atStorageVersion.more, *pLimitBytes);
merge( result.arena, result.data, resultCache,
atStorageVersion, vCount, limit, atStorageVersion.more, pos, *pLimitBytes );
limit += result.data.size() - prevSize;
for (auto i = result.data.begin() + prevSize; i != result.data.end(); i++) {
@ -1216,11 +1242,10 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
if (atStorageVersion.more) {
ASSERT(result.data.end()[-1].key == atStorageVersion.end()[-1].key);
readEnd = result.data.end()[-1].key;
} else if (vEnd && vEnd->isClearTo()) {
ASSERT(vStart == vEnd);
ASSERT(vEnd.key() < readEnd)
readEnd = vEnd.key();
--vStart;
} else if (vCurrent && vCurrent->isClearTo()) {
ASSERT(vCurrent.key() < readEnd)
readEnd = vCurrent.key();
--vCurrent;
} else {
ASSERT(readBegin == range.begin);
break;
@ -1230,7 +1255,6 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
// all but the last item are less than *pLimitBytes
ASSERT(result.data.size() == 0 || *pLimitBytes + result.data.end()[-1].expectedSize() + sizeof(KeyValueRef) > 0);
result.more = limit == 0 || *pLimitBytes<=0; // FIXME: Does this have to be exact?
result.version = version;
return result;
@ -3486,6 +3510,14 @@ ACTOR Future<Void> metricsCore( StorageServer* self, StorageServerInterface ssi
StorageBytes sb = self->storage.getStorageBytes();
self->metrics.getStorageMetrics( req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate );
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
if (!self->isReadable(req.keys)) {
TEST(true); // readHotSubRanges immediate wrong_shard_server()
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.getReadHotRanges(req);
}
}
when (wait(doPollMetrics) ) {
self->metrics.poll();
doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);

View File

@ -376,7 +376,8 @@ ACTOR Future<Void> testDatabaseLiveness( Database cx, double databasePingDelay,
loop {
try {
state double start = now();
TraceEvent(("PingingDatabaseLiveness_" + context).c_str());
auto traceMsg = "PingingDatabaseLiveness_" + context;
TraceEvent(traceMsg.c_str());
wait( timeoutError( pingDatabase( cx ), databasePingDelay ) );
double pingTime = now() - start;
ASSERT( pingTime > 0 );

View File

@ -506,6 +506,7 @@ ACTOR Future<Void> registrationClient(
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
@ -695,6 +696,7 @@ ACTOR Future<Void> storageServerRollbackRebooter( Future<Void> prevStorageServer
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
@ -977,6 +979,7 @@ ACTOR Future<Void> workerServer(
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
@ -1252,6 +1255,7 @@ ACTOR Future<Void> workerServer(
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
@ -1551,7 +1555,8 @@ ACTOR Future<UID> createAndLockProcessIdFile(std::string folder) {
if(!g_network->isSimulated()) {
throw;
}
deleteFile(lockFilePath);
lockFile = ErrorOr<Reference<IAsyncFile>>();
wait(IAsyncFileSystem::filesystem()->deleteFile(lockFilePath, true));
}
}
}

View File

@ -205,7 +205,7 @@ struct AtomicOpsWorkload : TestWorkload {
} catch( Error &e ) {
if (e.code() == 1021) {
self->ubsum += intValue;
TraceEvent(SevWarnAlways, "TxnCommitUnknownResult")
TraceEvent(SevInfo, "TxnCommitUnknownResult")
.detail("Value", intValue)
.detail("LogKey", logDebugKey.first)
.detail("OpsKey", opsKey);

View File

@ -86,7 +86,7 @@ struct AtomicRestoreWorkload : TestWorkload {
TraceEvent("AtomicRestore_RestoreStart");
if (self->fastRestore) { // New fast parallel restore
TraceEvent(SevWarnAlways, "AtomicParallelRestore");
TraceEvent(SevInfo, "AtomicParallelRestore");
wait(backupAgent.atomicParallelRestore(cx, BackupAgentBase::getDefaultTag(), self->backupRanges,
StringRef(), StringRef()));
} else { // Old style restore

View File

@ -335,7 +335,6 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
state bool extraTasks = false;
state UID randomID = nondeterministicRandom()->randomUniqueID();
state int restoreIndex = 0;
state bool restoreDone = false;
state ReadYourWritesTransaction tr2(cx);
TraceEvent("BARW_Arguments")

View File

@ -39,6 +39,13 @@ namespace ClientLogEventsParser {
ASSERT(gv.priorityType >= 0 && gv.priorityType < FdbClientLogEvents::PRIORITY_END);
}
void parseEventGetVersion_V3(BinaryReader &reader) {
FdbClientLogEvents::EventGetVersion_V3 gv;
reader >> gv;
ASSERT(gv.latency < 10000);
ASSERT(gv.priorityType >= 0 && gv.priorityType < FdbClientLogEvents::PRIORITY_END && gv.readVersion > 0);
}
void parseEventGet(BinaryReader &reader) {
FdbClientLogEvents::EventGet g;
reader >> g;
@ -57,6 +64,12 @@ namespace ClientLogEventsParser {
ASSERT(c.latency < 10000 && c.commitBytes < CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT && c.numMutations < 1000000);
}
void parseEventCommit_V2(BinaryReader &reader) {
FdbClientLogEvents::EventCommit_V2 c;
reader >> c;
ASSERT(c.latency < 10000 && c.commitBytes < CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT && c.numMutations < 1000000 && c.commitVersion > 0);
}
void parseEventErrorGet(BinaryReader &reader) {
FdbClientLogEvents::EventGetError ge;
reader >> ge;
@ -94,10 +107,19 @@ namespace ClientLogEventsParser {
Parser_V2() { parseGetVersion = parseEventGetVersion_V2; }
virtual ~Parser_V2() override {}
};
struct Parser_V3 : ParserBase {
Parser_V3() {
parseGetVersion = parseEventGetVersion_V3;
parseCommit = parseEventCommit_V2;
}
virtual ~Parser_V3() override {}
};
struct ParserFactory {
static std::unique_ptr<ParserBase> getParser(ProtocolVersion version) {
if(version.version() >= (uint64_t) 0x0FDB00B062000001LL) {
if(version.version() >= (uint64_t) 0x0FDB00B063010001LL) {
return std::unique_ptr<ParserBase>(new Parser_V3());
} else if(version.version() >= (uint64_t) 0x0FDB00B062000001LL) {
return std::unique_ptr<ParserBase>(new Parser_V2());
} else {
return std::unique_ptr<ParserBase>(new Parser_V1());

View File

@ -652,7 +652,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
}
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && keysel2.getKey() <= specialKeys.end;
contract = {
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0) ),
@ -685,7 +685,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
keysel2 = makeKeySel();
limits = makeRangeLimits();
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && keysel2.getKey() <= specialKeys.end;
contract = {
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf( !limits.isReached() && !limits.isValid()) ),
@ -729,7 +729,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
}
bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
bool isSpecialKeyRange = specialKeys.contains(key1) && key2 <= specialKeys.end;
contract = {
std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),
@ -764,7 +764,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
key2 = makeKey();
limits = makeRangeLimits();
bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
bool isSpecialKeyRange = specialKeys.contains(key1) && key2 <= specialKeys.end;
contract = {
std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),

View File

@ -0,0 +1,161 @@
/*
* ReadHotDetection.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbrpc/ContinuousSample.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct ReadHotDetectionWorkload : TestWorkload {
int actorCount, keyCount;
double testDuration, transactionsPerSecond;
vector<Future<Void>> clients;
Future<Void> readHotCheck;
Key readKey;
KeyRange wholeRange;
bool passed;
ReadHotDetectionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
testDuration = getOption(options, LiteralStringRef("testDuration"), 120.0);
transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 1000.0) / clientCount;
actorCount = getOption(options, LiteralStringRef("actorsPerClient"), transactionsPerSecond / 5);
keyCount = getOption(options, LiteralStringRef("keyCount"), 100);
readKey = StringRef(format("testkey%08x", deterministicRandom()->randomInt(0, keyCount)));
}
virtual std::string description() { return "ReadHotDetection"; }
virtual Future<Void> setup(Database const& cx) { return _setup(cx, this); }
virtual Future<Void> start(Database const& cx) {
for (int c = 0; c < actorCount; c++) {
clients.push_back(timeout(keyReader(cx->clone(), this, actorCount / transactionsPerSecond,
deterministicRandom()->random01() > 0.4),
testDuration, Void()));
}
readHotCheck = clientId == 0 ? _check(cx->clone(), this) : Void();
return delay(testDuration);
}
virtual Future<bool> check(Database const& cx) {
if (clientId != 0) return true;
return passed;
}
ACTOR Future<Void> _setup(Database cx, ReadHotDetectionWorkload* self) {
state int g = 0;
state Standalone<StringRef> largeValue;
state Standalone<StringRef> smallValue;
largeValue = self->randomString(largeValue.arena(), 100000);
smallValue = self->randomString(smallValue.arena(), 100);
state ReadYourWritesTransaction tr(cx);
loop {
try {
for (int i = 0; i < self->keyCount; i++) {
auto key = StringRef(format("testkey%08x", i));
if (key == self->readKey) {
tr.set(key, largeValue);
} else {
tr.set(key, deterministicRandom()->random01() > 0.8 ? largeValue : smallValue);
}
}
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
self->wholeRange = KeyRangeRef(LiteralStringRef(""), LiteralStringRef("\xff"));
// TraceEvent("RHDLog").detail("Phase", "DoneSetup");
return Void();
}
ACTOR Future<Void> _check(Database cx, ReadHotDetectionWorkload* self) {
loop {
state Transaction tr(cx);
try {
StorageMetrics sm = wait(tr.getStorageMetrics(self->wholeRange, 100));
// TraceEvent("RHDCheckPhaseLog")
// .detail("KeyRangeSize", sm.bytes)
// .detail("KeyRangeReadBandwith", sm.bytesReadPerKSecond);
Standalone<VectorRef<KeyRangeRef>> keyRanges = wait(tr.getReadHotRanges(self->wholeRange));
// TraceEvent("RHDCheckPhaseLog")
// .detail("KeyRangesSize", keyRanges.size())
// .detail("ReadKey", self->readKey.printable().c_str())
// .detail("KeyRangesBackBeginKey", keyRanges.back().begin)
// .detail("KeyRangesBackEndKey", keyRanges.back().end);
// Loose check.
for (auto kr : keyRanges) {
if (kr.contains(self->readKey)) {
self->passed = true;
}
}
// The key ranges deemed read hot does not contain the readKey, which is impossible here.
// TraceEvent("RHDCheckPhaseFailed")
// .detail("KeyRangesSize", keyRanges.size())
// .detail("ReadKey", self->readKey.printable().c_str())
// .detail("KeyRangesBackBeginKey", keyRanges.back().begin)
// .detail("KeyRangesBackEndKey", keyRanges.back().end);
// for(auto kr : keyRanges) {
// TraceEvent("RHDCheckPhaseFailed").detail("KeyRagneBegin", kr.begin).detail("KeyRagneEnd", kr.end);
// }
self->passed = false;
} catch (Error& e) {
// TraceEvent("RHDCheckPhaseReadGotError").error(e);
wait(tr.onError(e));
}
}
}
virtual void getMetrics(vector<PerfMetric>& m) {}
ACTOR Future<Void> keyReader(Database cx, ReadHotDetectionWorkload* self, double delay, bool useReadKey) {
state double lastTime = now();
loop {
wait(poisson(&lastTime, delay));
state ReadYourWritesTransaction tr(cx);
loop {
try {
Optional<Value> v = wait(tr.get(
useReadKey
? self->readKey
: StringRef(format("testkey%08x", deterministicRandom()->randomInt(0, self->keyCount)))));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
}
StringRef randomString(Arena& arena, int len, char firstChar = 'a', char lastChar = 'z') {
++lastChar;
StringRef s = makeString(len, arena);
for (int i = 0; i < len; ++i) {
*(uint8_t*)(s.begin() + i) = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar);
}
return s;
}
};
WorkloadFactory<ReadHotDetectionWorkload> ReadHotDetectionWorkloadFactory("ReadHotDetection");

View File

@ -42,7 +42,7 @@ public:
struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
int actorCount, minKeysPerRange, maxKeysPerRange, rangeCount, keyBytes, valBytes;
int actorCount, minKeysPerRange, maxKeysPerRange, rangeCount, keyBytes, valBytes, conflictRangeSizeFactor;
double testDuration, absoluteRandomProb, transactionsPerSecond;
PerfIntCounter wrongResults, keysCount;
Reference<ReadYourWritesTransaction> ryw; // used to store all populated data
@ -60,6 +60,9 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 100.0);
actorCount = getOption(options, LiteralStringRef("actorCount"), 1);
absoluteRandomProb = getOption(options, LiteralStringRef("absoluteRandomProb"), 0.5);
// Controls the relative size of read/write conflict ranges and the number of random getranges
conflictRangeSizeFactor = getOption(options, LiteralStringRef("conflictRangeSizeFactor"), 10);
ASSERT(conflictRangeSizeFactor >= 1);
}
virtual std::string description() { return "SpecialKeySpaceCorrectness"; }
@ -72,6 +75,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
double getCheckTimeout() override { return std::numeric_limits<double>::max(); }
Future<Void> _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
cx->specialKeySpace = std::make_shared<SpecialKeySpace>();
if (self->clientId == 0) {
self->ryw = Reference(new ReadYourWritesTransaction(cx));
self->ryw->setVersion(100);
@ -97,7 +101,11 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
return Void();
}
ACTOR Future<Void> _start(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
if (self->clientId == 0) wait(timeout(self->getRangeCallActor(cx, self), self->testDuration, Void()));
if (self->clientId == 0) {
wait(timeout(self->getRangeCallActor(cx, self) && testConflictRanges(cx, /*read*/ true, self) &&
testConflictRanges(cx, /*read*/ false, self),
self->testDuration, Void()));
}
return Void();
}
@ -161,6 +169,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
.detail("TestValue", printable(res2[i].value));
return false;
}
TEST(true); // Special key space keys equal
}
return true;
}
@ -201,6 +210,131 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
return GetRangeLimits(rowLimits, byteLimits);
}
ACTOR static Future<Void> testConflictRanges(Database cx_, bool read, SpecialKeySpaceCorrectnessWorkload* self) {
state StringRef prefix = read ? readConflictRangeKeysRange.begin : writeConflictRangeKeysRange.begin;
TEST(read); // test read conflict range special key implementation
TEST(!read); // test write conflict range special key implementation
// Get a default special key range instance
Database cx = cx_->clone();
state Reference<ReadYourWritesTransaction> tx = Reference(new ReadYourWritesTransaction(cx));
state Reference<ReadYourWritesTransaction> referenceTx = Reference(new ReadYourWritesTransaction(cx));
state bool ryw = deterministicRandom()->coinflip();
if (!ryw) {
tx->setOption(FDBTransactionOptions::READ_YOUR_WRITES_DISABLE);
}
referenceTx->setVersion(100); // Prevent this from doing a GRV or committing
referenceTx->clear(normalKeys);
referenceTx->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
int numKeys = deterministicRandom()->randomInt(1, self->conflictRangeSizeFactor) * 4;
state std::vector<std::string> keys; // Must all be distinct
keys.resize(numKeys);
int lastKey = 0;
for (auto& key : keys) {
key = std::to_string(lastKey++);
}
if (deterministicRandom()->coinflip()) {
// Include beginning of keyspace
keys.push_back("");
}
if (deterministicRandom()->coinflip()) {
// Include end of keyspace
keys.push_back("\xff");
}
std::mt19937 g(deterministicRandom()->randomUInt32());
std::shuffle(keys.begin(), keys.end(), g);
// First half of the keys will be ranges, the other keys will mix in some read boundaries that aren't range
// boundaries
std::sort(keys.begin(), keys.begin() + keys.size() / 2);
for (auto iter = keys.begin(); iter + 1 < keys.begin() + keys.size() / 2; iter += 2) {
Standalone<KeyRangeRef> range = KeyRangeRef(*iter, *(iter + 1));
if (read) {
tx->addReadConflictRange(range);
// Add it twice so that we can observe the de-duplication that should get done
tx->addReadConflictRange(range);
} else {
tx->addWriteConflictRange(range);
tx->addWriteConflictRange(range);
}
// TODO test that fails if we don't wait on tx->pendingReads()
referenceTx->set(range.begin, LiteralStringRef("1"));
referenceTx->set(range.end, LiteralStringRef("0"));
}
if (!read && deterministicRandom()->coinflip()) {
try {
wait(tx->commit());
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) throw;
return Void();
}
TEST(true); // Read write conflict range of committed transaction
}
for (int i = 0; i < self->conflictRangeSizeFactor; ++i) {
GetRangeLimits limit;
KeySelector begin;
KeySelector end;
loop {
begin = firstGreaterOrEqual(deterministicRandom()->randomChoice(keys));
end = firstGreaterOrEqual(deterministicRandom()->randomChoice(keys));
if (begin.getKey() <= end.getKey()) break;
}
bool reverse = deterministicRandom()->coinflip();
auto correctResultFuture = referenceTx->getRange(begin, end, limit, false, reverse);
ASSERT(correctResultFuture.isReady());
begin.setKey(begin.getKey().withPrefix(prefix, begin.arena()));
end.setKey(end.getKey().withPrefix(prefix, begin.arena()));
auto testResultFuture = tx->getRange(begin, end, limit, false, reverse);
ASSERT(testResultFuture.isReady());
auto correct_iter = correctResultFuture.get().begin();
auto test_iter = testResultFuture.get().begin();
bool had_error = false;
while (correct_iter != correctResultFuture.get().end() && test_iter != testResultFuture.get().end()) {
if (correct_iter->key != test_iter->key.removePrefix(prefix) ||
correct_iter->value != test_iter->value) {
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Mismatched keys")
.detail("ConflictType", read ? "read" : "write")
.detail("CorrectKey", correct_iter->key)
.detail("TestKey", test_iter->key)
.detail("CorrectValue", correct_iter->value)
.detail("TestValue", test_iter->value)
.detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Ryw", ryw);
had_error = true;
}
++correct_iter;
++test_iter;
}
while (correct_iter != correctResultFuture.get().end()) {
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Extra correct key")
.detail("ConflictType", read ? "read" : "write")
.detail("CorrectKey", correct_iter->key)
.detail("CorrectValue", correct_iter->value)
.detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Ryw", ryw);
++correct_iter;
had_error = true;
}
while (test_iter != testResultFuture.get().end()) {
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Extra test key")
.detail("ConflictType", read ? "read" : "write")
.detail("TestKey", test_iter->key)
.detail("TestValue", test_iter->value)
.detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Ryw", ryw);
++test_iter;
had_error = true;
}
if (had_error) break;
}
return Void();
}
};
WorkloadFactory<SpecialKeySpaceCorrectnessWorkload> SpecialKeySpaceCorrectnessFactory("SpecialKeySpaceCorrectness");

View File

@ -91,7 +91,7 @@ struct TPCCMetrics {
++failedCounter;
}
TraceEvent("TransactionComplete")
.detail("Type", txnType)
.detail("TransactionType", txnType)
.detail("Latency", responseTime)
.detail("Begin", txnStartTime)
.detail("End", txnStartTime + responseTime)

View File

@ -25,6 +25,7 @@
void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@ -43,6 +44,7 @@ struct UnitTestWorkload : TestWorkload {
forceLinkIndexedSetTests();
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
}
virtual std::string description() { return "UnitTests"; }

View File

@ -321,7 +321,11 @@ struct VersionStampWorkload : TestWorkload {
versionStampValue = value.withSuffix(LiteralStringRef("\x00\x00\x00\x00"));
}
state bool ryw = deterministicRandom()->coinflip();
loop{
if (!ryw) {
tr.setOption(FDBTransactionOptions::READ_YOUR_WRITES_DISABLE);
}
state bool error = false;
state Error err;
//TraceEvent("VST_CommitBegin").detail("Key", printable(key)).detail("VsKey", printable(versionStampKey)).detail("Clear", printable(range));

View File

@ -1,7 +1,3 @@
set(FDBSERVICE_SRCS FDBService.cpp ServiceBase.cpp)
add_executable(fdbmonitor ${FDBSERVICE_SRCS})
#
# FIXME: This include directory is an ugly hack. We probably want to fix this
# as soon as we get rid of the old build system
target_include_directories(fdbmonitor PRIVATE ${CMAKE_BINARY_DIR}/fdbclient)

View File

@ -30,9 +30,7 @@
#include "..\flow\SimpleOpt.h"
#include "..\fdbmonitor\SimpleIni.h"
#if defined(CMAKE_BUILD) || !defined(WIN32)
#include "versions.h"
#endif
#include "fdbclient/IncludeVersions.h"
// For PathFileExists
#include "Shlwapi.h"

View File

@ -530,11 +530,12 @@ public:
int expectedSize() const { return size(); }
int compare(StringRef const& other) const {
if (std::min(size(), other.size()) > 0) {
int c = memcmp(begin(), other.begin(), std::min(size(), other.size()));
size_t minSize = std::min(size(), other.size());
if (minSize != 0) {
int c = memcmp(begin(), other.begin(), minSize);
if (c != 0) return c;
}
return size() - other.size();
return ::compare(size(), other.size());
}
// Removes bytes from begin up to and including the sep string, returns StringRef of the part before sep
@ -845,6 +846,12 @@ public:
bool empty() const { return m_size == 0; }
const T& operator[](int i) const { return data[i]; }
// const versions of some VectorRef operators
const T* cbegin() const { return data; }
const T* cend() const { return data + m_size; }
T const& cfront() const { return *begin(); }
T const& cback() const { return end()[-1]; }
std::reverse_iterator<const T*> rbegin() const { return std::reverse_iterator<const T*>(end()); }
std::reverse_iterator<const T*> rend() const { return std::reverse_iterator<const T*>(begin()); }

View File

@ -61,6 +61,7 @@ set(FLOW_SRCS
ThreadSafeQueue.h
Trace.cpp
Trace.h
TreeBenchmark.h
UnitTest.cpp
UnitTest.h
XmlTraceLogFormatter.cpp
@ -87,7 +88,6 @@ set(FLOW_SRCS
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)
add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})
target_include_directories(flow PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
if (NOT APPLE AND NOT WIN32)
set (FLOW_LIBS ${FLOW_LIBS} rt)
elseif(WIN32)

View File

@ -41,11 +41,22 @@ struct KeyValueMapPair {
KeyValueMapPair(KeyRef key, ValueRef value)
: arena(key.expectedSize() + value.expectedSize()), key(arena, key), value(arena, value) {}
int compare(KeyValueMapPair const& r) const { return ::compare(key, r.key); }
template <class CompatibleWithKey>
int compare(CompatibleWithKey const& r) const {
return ::compare(key, r);
}
bool operator<(KeyValueMapPair const& r) const { return key < r.key; }
bool operator==(KeyValueMapPair const& r) const { return key == r.key; }
bool operator!=(KeyValueMapPair const& r) const { return key != r.key; }
};
template <class CompatibleWithKey>
int compare(CompatibleWithKey const& l, KeyValueMapPair const& r) {
return ::compare(l, r.key);
}
template <class CompatibleWithKey>
bool operator<(KeyValueMapPair const& l, CompatibleWithKey const& r) {
return l.key < r;

View File

@ -34,6 +34,32 @@
#endif
#include <functional>
// Until we move to C++20, we'll need something to take the place of operator<=>.
// This is as good a place as any, I guess.
template <typename T>
typename std::enable_if<std::is_integral<T>::value, int>::type compare(T l, T r) {
const int gt = l > r;
const int lt = l < r;
return gt - lt;
// GCC also emits branchless code for the following, but the above performs
// slightly better in benchmarks as of this writing.
// return l < r ? -1 : l == r ? 0 : 1;
}
template <typename T, typename U>
typename std::enable_if<!std::is_integral<T>::value, int>::type compare(T const& l, U const& r) {
return l.compare(r);
}
template <class K, class V>
int compare(std::pair<K, V> const& l, std::pair<K, V> const& r) {
if (int cmp = compare(l.first, r.first)) {
return cmp;
}
return compare(l.second, r.second);
}
class UID {
uint64_t part[2];
public:
@ -44,6 +70,12 @@ public:
std::string shortString() const;
bool isValid() const { return part[0] || part[1]; }
int compare(const UID& r) const {
if (int cmp = ::compare(part[0], r.part[0])) {
return cmp;
}
return ::compare(part[1], r.part[1]);
}
bool operator == ( const UID& r ) const { return part[0]==r.part[0] && part[1]==r.part[1]; }
bool operator != ( const UID& r ) const { return part[0]!=r.part[0] || part[1]!=r.part[1]; }
bool operator < ( const UID& r ) const { return part[0] < r.part[0] || (part[0] == r.part[0] && part[1] < r.part[1]); }

View File

@ -92,12 +92,16 @@ public:
void send( T const& t ) { // Can be called safely from another thread. Call send or sendError at most once.
Promise<Void> signal;
tagAndForward( &promise, t, signal.getFuture() );
g_network->onMainThread( std::move(signal), incrementPriorityIfEven( g_network->getCurrentTask() ) );
g_network->onMainThread(std::move(signal), g_network->isOnMainThread()
? incrementPriorityIfEven(g_network->getCurrentTask())
: TaskPriority::DefaultOnMainThread);
}
void sendError( Error const& e ) { // Can be called safely from another thread. Call send or sendError at most once.
Promise<Void> signal;
tagAndForwardError( &promise, e, signal.getFuture() );
g_network->onMainThread( std::move(signal), incrementPriorityIfEven( g_network->getCurrentTask() ) );
g_network->onMainThread(std::move(signal), g_network->isOnMainThread()
? incrementPriorityIfEven(g_network->getCurrentTask())
: TaskPriority::DefaultOnMainThread);
}
private:
Promise<T> promise;

View File

@ -31,8 +31,8 @@
#include <cstring>
#include <deque>
#include <random>
#include "flow/TreeBenchmark.h"
#include "flow/UnitTest.h"
template <class Node>
int ISGetHeight(Node* n){
if (!n) return 0;
@ -137,7 +137,123 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
return Void();
}
/*TEST_CASE("/flow/IndexedSet/performance") {
TEST_CASE("/flow/IndexedSet/random ops") {
for (int t = 0; t < 100; t++) {
IndexedSet<int, int> is;
int rr = deterministicRandom()->randomInt(0, 600) * deterministicRandom()->randomInt(0, 600);
for (int n = 0; n < rr; n++) {
if (deterministicRandom()->random01() < (double)is.sumTo(is.end()) / rr * 2)
is.erase(is.lower_bound(deterministicRandom()->randomInt(0, 10000000)));
else
is.insert(deterministicRandom()->randomInt(0, 10000000), 3);
}
int b = deterministicRandom()->randomInt(0, 10000000);
// int e = b + deterministicRandom()->randomInt(0, 10);
int e = deterministicRandom()->randomInt(0, 10000000);
if (e < b) std::swap(b, e);
auto ib = is.lower_bound(b);
auto ie = is.lower_bound(e);
int original_count = is.sumTo(is.end()) / 3;
int original_incount = is.sumRange(ib, ie) / 3;
// printf("\n#%d Erasing %d of %d items\n", t, original_incount, original_count);
is.erase(ib, ie);
is.testonly_assertBalanced();
int count = 0, incount = 0;
for (auto i : is) {
++count;
if (i >= b && i < e) {
// printf("Remaining item: %d (%d - %d)\n", i, b, e);
incount++;
}
}
// printf("%d items remain, totalling %d\n", count, is.sumTo(is.end()));
// printf("%d items remain in erased range\n", incount);
ASSERT(incount == 0);
ASSERT(count == original_count - original_incount);
ASSERT(is.sumTo(is.end()) == count * 3);
}
return Void();
}
TEST_CASE("/flow/IndexedSet/strings") {
Map<std::string, int> myMap;
std::map<std::string, int> aMap;
myMap["Hello"] = 1;
myMap["Planet"] = 5;
for (auto i = myMap.begin(); i != myMap.end(); ++i) aMap[i->key] = i->value;
ASSERT(myMap.find(std::string("Hello"))->value == 1);
ASSERT(myMap.find(std::string("World")) == myMap.end());
ASSERT(myMap["Hello"] == 1);
auto a = myMap.upper_bound("A")->key;
auto x = myMap.lower_bound("M")->key;
ASSERT((a + x) == (std::string) "HelloPlanet");
return Void();
}
template <typename K>
struct IndexedSetHarness {
using map = IndexedSet<K, int>;
using result = typename map::iterator;
using key_type = K;
map s;
void insert(K const& k) { s.insert(K(k), 1); }
result find(K const& k) const { return s.find(k); }
result not_found() const { return s.end(); }
result begin() const { return s.begin(); }
result end() const { return s.end(); }
result lower_bound(K const& k) const { return s.lower_bound(k); }
result upper_bound(K const& k) const { return s.upper_bound(k); }
void erase(K const& k) { s.erase(k); }
};
TEST_CASE("performance/map/StringRef/IndexedSet") {
Arena arena;
IndexedSetHarness<StringRef> is;
treeBenchmark(is, [&arena]() { return randomStr(arena); });
return Void();
}
TEST_CASE("performance/map/StringRef/StdMap") {
Arena arena;
MapHarness<StringRef> is;
treeBenchmark(is, [&arena]() { return randomStr(arena); });
return Void();
}
TEST_CASE("performance/map/int/IndexedSet") {
IndexedSetHarness<int> is;
treeBenchmark(is, &randomInt);
return Void();
}
TEST_CASE("performance/map/int/StdMap") {
MapHarness<int> is;
treeBenchmark(is, &randomInt);
return Void();
}
TEST_CASE("performance/flow/IndexedSet/integers") {
std::mt19937_64 urng(deterministicRandom()->randomUInt32());
std::vector<int> x;
for (int i = 0; i<1000000; i++)
x.push_back(deterministicRandom()->randomInt(0, 10000000));
@ -151,7 +267,6 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
double end = timer();
double kps = x.size() / 1000.0 / (end - start);
printf("%0.1f Kinsert/sec\n", kps);
ASSERT(kps >= 500); //< Or something?
start = timer();
for (int i = 0; i<x.size(); i++)
@ -159,7 +274,6 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
end = timer();
kps = x.size() / 1000.0 / (end - start);
printf("%0.1f Kfind/sec\n", kps);
ASSERT(kps >= 500);
{
//std::set<int> ss;
@ -194,7 +308,7 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
is.testonly_assertBalanced();
std::random_shuffle(x.begin(), x.end());
std::shuffle(x.begin(), x.end(), urng);
start = timer();
for (int i = 0; i<x.size(); i++) {
is.erase(x[i]);
@ -204,87 +318,41 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
printf("%0.1f Kerase/sec\n", x.size() / 1000.0 / (end - start));
is.testonly_assertBalanced();
for (int i = 0; i<x.size() / 2; i++)
for (int i = 0; i < x.size() / 2; i++) {
ASSERT(is.find(x[i]) == is.end());
}*/
TEST_CASE("/flow/IndexedSet/random ops") {
for (int t = 0; t<100; t++) {
IndexedSet<int, int> is;
int rr = deterministicRandom()->randomInt(0, 600) * deterministicRandom()->randomInt(0, 600);
for (int n = 0; n<rr; n++) {
if (deterministicRandom()->random01() < (double)is.sumTo(is.end()) / rr * 2)
is.erase(is.lower_bound(deterministicRandom()->randomInt(0, 10000000)));
else
is.insert(deterministicRandom()->randomInt(0, 10000000), 3);
}
int b = deterministicRandom()->randomInt(0, 10000000);
//int e = b + deterministicRandom()->randomInt(0, 10);
int e = deterministicRandom()->randomInt(0, 10000000);
if (e<b) std::swap(b, e);
auto ib = is.lower_bound(b);
auto ie = is.lower_bound(e);
int original_count = is.sumTo(is.end())/3;
int original_incount = is.sumRange(ib, ie)/3;
//printf("\n#%d Erasing %d of %d items\n", t, original_incount, original_count);
is.erase(ib, ie);
is.testonly_assertBalanced();
int count = 0, incount = 0;
for (auto i : is) {
++count;
if (i >= b && i < e) {
//printf("Remaining item: %d (%d - %d)\n", i, b, e);
incount++;
}
}
//printf("%d items remain, totalling %d\n", count, is.sumTo(is.end()));
//printf("%d items remain in erased range\n", incount);
ASSERT(incount == 0);
ASSERT(count == original_count - original_incount);
ASSERT(is.sumTo(is.end()) == count*3);
}
return Void();
}
TEST_CASE("/flow/IndexedSet/strings") {
TEST_CASE("performance/flow/IndexedSet/strings") {
constexpr size_t count = 1000000;
Map< std::string, int > myMap;
std::map< std::string, int > aMap;
myMap["Hello"] = 1;
myMap["Planet"] = 5;
for (auto i = myMap.begin(); i != myMap.end(); ++i)
aMap[i->key] = i->value;
double start, end;
int tt = 0;
ASSERT(myMap.find("Hello")->value == 1);
ASSERT(myMap.find("World") == myMap.end());
ASSERT(myMap["Hello"] == 1);
auto a = myMap.upper_bound("A")->key;
auto x = myMap.lower_bound("M")->key;
ASSERT((a + x) == (std::string)"HelloPlanet");
/* This was a performance test:
double start = timer();
volatile int tt=0;
for(int i=0; i<1000000; i++)
tt += myMap.find( "Hello" )->value;
double end = timer();
printf("%0.1f Map.KfindStr/sec\n", 1000000/1000.0/(end-start));
std::string const hello{ "Hello" };
myMap[hello] = 1;
aMap["Hello"] = 1;
start = timer();
for(int i=0; i<1000000; i++)
aMap.find( "Hello" );
for (size_t i = 0; i < count; i++) {
tt += myMap.find(hello)->value;
}
end = timer();
printf("%0.1f std::map.KfindStr/sec\n", 1000000/1000.0/(end-start));
*/
ASSERT(tt == count);
printf("%0.1f Map.KfindStr/sec\n", count / 1000.0 / (end - start));
start = timer();
for (size_t i = 0; i < count; i++) {
aMap.find(hello);
}
end = timer();
printf("%0.1f std::map.KfindStr/sec\n", count / 1000.0 / (end - start));
return Void();
}
@ -340,6 +408,7 @@ TEST_CASE("/flow/IndexedSet/data constructor and destructor calls match") {
~Counter() { count--; }
Counter(const Counter& r) :value(r.value) { count++; }
void operator=(const Counter& r) { value = r.value; }
int compare(const Counter& r) const { return ::compare(value, r.value); }
bool operator<(const Counter& r) const { return value < r.value; }
};
IndexedSet<Counter, NoMetric> mySet;

View File

@ -22,6 +22,7 @@
#define FLOW_INDEXEDSET_H
#pragma once
#include "flow/Arena.h"
#include "flow/Platform.h"
#include "flow/FastAlloc.h"
#include "flow/Trace.h"
@ -199,7 +200,7 @@ private:
Node *root;
Metric eraseHalf( Node* start, Node* end, int eraseDir, int& heightDelta, std::vector<Node*>& toFree );
Metric eraseHalf(Node* start, Node* end, int eraseDir, int& heightDelta, std::vector<Node*>& toFree);
void erase( iterator begin, iterator end, std::vector<Node*>& toFree );
void replacePointer( Node* oldNode, Node* newNode ) {
@ -252,6 +253,11 @@ public:
MapPair(MapPair&& r) BOOST_NOEXCEPT : key(std::move(r.key)), value(std::move(r.value)) {}
void operator=(MapPair&& r) BOOST_NOEXCEPT { key = std::move(r.key); value = std::move(r.value); }
int compare(MapPair<Key, Value> const& r) const { return ::compare(key, r.key); }
template <class CompatibleWithKey>
int compare(CompatibleWithKey const& r) const {
return ::compare(key, r);
}
bool operator<(MapPair<Key,Value> const& r) const { return key < r.key; }
bool operator<=(MapPair<Key,Value> const& r) const { return key <= r.key; }
bool operator==(MapPair<Key,Value> const& r) const { return key == r.key; }
@ -260,6 +266,11 @@ public:
//private: MapPair( const MapPair& );
};
template <class Key, class Value, class CompatibleWithKey>
inline int compare(CompatibleWithKey const& l, MapPair<Key, Value> const& r) {
return compare(l, r.key);
}
template <class Key, class Value>
inline MapPair<typename std::decay<Key>::type, typename std::decay<Value>::type> mapPair(Key&& key, Value&& value) { return MapPair<typename std::decay<Key>::type, typename std::decay<Value>::type>(std::forward<Key>(key), std::forward<Value>(value)); }
@ -614,8 +625,8 @@ typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data,
int d; // direction
// traverse to find insert point
while (true){
d = t->data < data;
if (!d && !(data < t->data)) { // t->data == data
int cmp = compare(data, t->data);
if (cmp == 0) {
Node *returnNode = t;
if(replaceExisting) {
t->data = std::forward<T_>(data);
@ -633,6 +644,7 @@ typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data,
return returnNode;
}
d = cmp > 0;
Node *nextT = t->child[d];
if (!nextT) break;
t = nextT;
@ -689,7 +701,7 @@ int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVec
int d = 1; // direction
if(blockStart == NULL || (blockEnd != NULL && data >= blockEnd->data)) {
blockEnd = NULL;
if (root == NULL){
if (root == NULL) {
root = new Node(std::move(data), metric);
num_inserted++;
blockStart = root;
@ -699,11 +711,12 @@ int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVec
Node *t = root;
// traverse to find insert point
bool foundNode = false;
while (true){
d = t->data < data;
if (!d)
while (true) {
int cmp = compare(data, t->data);
d = cmp > 0;
if (d == 0)
blockEnd = t;
if (!d && !(data < t->data)) { // t->data == data
if (cmp == 0) {
Node *returnNode = t;
if(replaceExisting) {
num_inserted++;
@ -784,7 +797,8 @@ int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVec
}
template <class T, class Metric>
Metric IndexedSet<T,Metric>::eraseHalf( Node* start, Node* end, int eraseDir, int& heightDelta, std::vector<Node*>& toFree ) {
Metric IndexedSet<T, Metric>::eraseHalf(Node* start, Node* end, int eraseDir, int& heightDelta,
std::vector<Node*>& toFree) {
// Removes all nodes between start (inclusive) and end (exclusive) from the set, where start is equal to end or one of its descendants
// eraseDir 1 means erase the right half (nodes > at) of the left subtree of end. eraseDir 0 means the left half of the right subtree
// toFree is extended with the roots of completely removed subtrees
@ -860,7 +874,7 @@ void IndexedSet<T,Metric>::erase( typename IndexedSet<T,Metric>::iterator begin,
// Removes all nodes in the set between first and last, inclusive.
// toFree is extended with the roots of completely removed subtrees.
ASSERT(!end.i || (begin.i && *begin <= *end));
ASSERT(!end.i || (begin.i && (::compare(*begin, *end) <= 0)));
if(begin == end)
return;
@ -876,8 +890,8 @@ void IndexedSet<T,Metric>::erase( typename IndexedSet<T,Metric>::iterator begin,
// Erase all matching nodes that descend from subRoot, by first erasing descendants of subRoot->child[0] and then erasing the descendants of subRoot->child[1]
// subRoot is not removed from the tree at this time
metricDelta = metricDelta + eraseHalf( first, subRoot, 1, leftHeightDelta, toFree );
metricDelta = metricDelta + eraseHalf( last, subRoot, 0, rightHeightDelta, toFree );
metricDelta = metricDelta + eraseHalf(first, subRoot, 1, leftHeightDelta, toFree);
metricDelta = metricDelta + eraseHalf(last, subRoot, 0, rightHeightDelta, toFree);
// Change in the height of subRoot due to past activity, before subRoot is rebalanced. subRoot->balance already reflects changes in height to its children.
int heightDelta = leftHeightDelta + rightHeightDelta;
@ -995,10 +1009,9 @@ template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::find(const Key &key) const {
Node* t = root;
while (t){
int d = t->data < key;
if (!d && !(key < t->data)) // t->data == key
return iterator(t);
t = t->child[d];
int cmp = compare(key, t->data);
if (cmp == 0) return iterator(t);
t = t->child[cmp > 0];
}
return end();
}
@ -1009,14 +1022,15 @@ template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::lower_bound(const Key &key) const {
Node* t = root;
if (!t) return iterator();
bool less;
while (true) {
Node *n = t->child[ t->data < key ];
less = t->data < key;
Node* n = t->child[less];
if (!n) break;
t = n;
}
if (t->data < key)
moveIterator<1>(t);
if (less) moveIterator<1>(t);
return iterator(t);
}
@ -1027,14 +1041,15 @@ template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::upper_bound(const Key &key) const {
Node* t = root;
if (!t) return iterator();
bool not_less;
while (true) {
Node *n = t->child[ !(key < t->data) ];
not_less = !(key < t->data);
Node* n = t->child[not_less];
if (!n) break;
t = n;
}
if (!(key < t->data))
moveIterator<1>(t);
if (not_less) moveIterator<1>(t);
return iterator(t);
}

Some files were not shown because too many files have changed in this diff Show More