Merge remote-tracking branch 'origin/master' into features/flatbuffers-debugtx

Markus Pilman 2020-06-10 15:31:29 -07:00
commit 4ab3441a95
79 changed files with 5119 additions and 1152 deletions


@ -536,3 +536,51 @@ sse2neon Authors (sse2neon)
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
rte_memcpy.h (from DPDK):
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
folly_memcpy:
Copyright (c) Facebook, Inc. and its affiliates.
Author: Bin Liu <binliu@fb.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -35,6 +35,7 @@ import com.apple.foundationdb.Transaction;
*
*/
public class ByteArrayUtil extends FastByteComparisons {
private static final byte[] EMPTY_BYTES = new byte[0];
/**
* Joins a set of byte arrays into a larger array. The {@code interlude} is placed
@ -45,36 +46,46 @@ public class ByteArrayUtil extends FastByteComparisons {
* concatenated elements.
* @param parts the pieces to be joined. May be {@code null}, but does not allow
* for elements in the list to be {@code null}.
*
*
* @return a newly created concatenation of the input
*/
public static byte[] join(byte[] interlude, List<byte[]> parts) {
return interludeJoin(interlude, parts.toArray(new byte[0][]));
}
/**
* Joins a set of byte arrays into a larger array. The {@code interlude} is placed
* between each of the elements, but not at the beginning or end. In the case that
* the list is empty or {@code null}, a zero-length byte array will be returned.
*
* @param interlude can be {@code null} or zero length. Placed internally between
* concatenated elements.
* @param parts the pieces to be joined. May be {@code null}, but does not allow
* for elements in the array to be {@code null}.
*
* @return a newly created concatenation of the input
*/
public static byte[] interludeJoin(byte[] interlude, byte[][] parts) {
if(parts == null)
return new byte[0];
int partCount = parts.size();
int partCount = parts.length;
if(partCount == 0)
return new byte[0];
return EMPTY_BYTES;
if(interlude == null)
interlude = new byte[0];
interlude = EMPTY_BYTES;
int elementTotals = 0;
int interludeSize = interlude.length;
for(byte[] e : parts) {
elementTotals += e.length;
for (int i = 0; i < partCount; i++) {
elementTotals += parts[i].length;
}
byte[] dest = new byte[(interludeSize * (partCount - 1)) + elementTotals];
//System.out.println(" interlude -> " + ArrayUtils.printable(interlude));
int startByte = 0;
int index = 0;
for(byte[] part : parts) {
//System.out.println(" section -> " + ArrayUtils.printable(parts.get(i)));
int length = part.length;
for (int i = 0; i < partCount; i++) {
int length = parts[i].length;
if(length > 0) {
System.arraycopy(part, 0, dest, startByte, length);
System.arraycopy(parts[i], 0, dest, startByte, length);
startByte += length;
}
if(index < partCount - 1 && interludeSize > 0) {
@ -84,8 +95,6 @@ public class ByteArrayUtil extends FastByteComparisons {
}
index++;
}
//System.out.println(" complete -> " + ArrayUtils.printable(dest));
return dest;
}
@ -97,7 +106,7 @@ public class ByteArrayUtil extends FastByteComparisons {
* @return a newly created concatenation of the input
*/
public static byte[] join(byte[]... parts) {
return join(null, Arrays.asList(parts));
return interludeJoin(null, parts);
}
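// Hypothetical usage sketch (not part of this change) illustrating the two join flavors above;
// it assumes java.util.Arrays is imported, as it already is elsewhere in this class.
private static void joinExample() {
	byte[] a = {0x01};
	byte[] b = {0x02, 0x03};
	// The interlude is placed between elements only: {0x01, 0x00, 0x02, 0x03}
	byte[] withInterlude = join(new byte[]{0x00}, Arrays.asList(a, b));
	// A null interlude degenerates to plain concatenation: {0x01, 0x02, 0x03}
	byte[] concatenated = interludeJoin(null, new byte[][]{a, b});
}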
/**


@ -1,207 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-------------------------------------------------------------------------------
SOFTWARE DISTRIBUTED WITH FOUNDATIONDB:
The FoundationDB software includes a number of subcomponents with separate
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
-------------------------------------------------------------------------------


@ -74,6 +74,14 @@ services:
<<: *snapshot-bindings-cmake
snapshot-cmake: &snapshot-testpackages
<<: *build-setup
command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=0 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}"'
prb-testpackages:
<<: *snapshot-testpackages
snapshot-ctest: &snapshot-ctest
<<: *build-setup
command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=1 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" && ctest -L fast -j "$${MAKEJOBS}" --output-on-failure'
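# A hypothetical way to invoke one of the services above locally (assumes the
# referenced environment variables such as BUILD_DIR and MAKEJOBS are provided
# by the shared build-setup section):
#   docker-compose run --rm snapshot-ctest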


@ -151,10 +151,15 @@ if(NOT WIN32)
set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. All files within each directory will be added to the test package")
endif()
function(create_test_package)
if(WIN32)
return()
endif()
# This sets up a directory with the correctness files common to all correctness packages.
# This function should be called with the following arguments:
#
# - OUT_DIR the directory where files will be staged
# - CONTEXT the type of correctness package being built (e.g. 'valgrind correctness')
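#
# Example call (hypothetical, mirroring the usage further below):
#   stage_correctness_package(OUT_DIR ${CMAKE_BINARY_DIR}/correctness CONTEXT "correctness")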
function(stage_correctness_package)
set(oneValueArgs OUT_DIR CONTEXT)
cmake_parse_arguments(STAGE "" "${oneValueArgs}" "" "${ARGN}")
file(MAKE_DIRECTORY ${STAGE_OUT_DIR}/bin)
string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length)
foreach(test IN LISTS TEST_NAMES)
if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND
@ -162,12 +167,14 @@ function(create_test_package)
(NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE}))
foreach(file IN LISTS TEST_FILES_${test})
string(SUBSTRING ${file} ${base_length} -1 rel_out_file)
set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file})
list(APPEND out_files ${out_file})
set(out_file ${STAGE_OUT_DIR}/tests/${rel_out_file})
list(APPEND test_files ${out_file})
add_custom_command(
OUTPUT ${out_file}
DEPENDS ${file}
COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file})
COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file}
COMMENT "Copying ${STAGE_CONTEXT} test file ${rel_out_file}"
)
endforeach()
endif()
endforeach()
@ -181,68 +188,83 @@ function(create_test_package)
# SUBSTRING will fail
set(src_dir "${src_dir}/")
string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir)
string(SUBSTRING ${file} ${dir_len} -1 out_file)
list(APPEND external_files ${CMAKE_BINARY_DIR}/packages/${out_file})
file(COPY ${file} DESTINATION ${CMAKE_BINARY_DIR}/packages/${dest_dir})
string(SUBSTRING ${file} ${dir_len} -1 rel_out_file)
set(out_file ${STAGE_OUT_DIR}/${rel_out_file})
list(APPEND external_files ${out_file})
add_custom_command(
OUTPUT ${out_file}
DEPENDS ${file}
COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file}
COMMENT "Copying ${STAGE_CONTEXT} external file ${file}"
)
endforeach()
endforeach()
if(NOT USE_VALGRIND)
set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"
)
add_custom_target(package_tests ALL DEPENDS ${tar_file})
# seems make needs this dependency while this does nothing with ninja
add_dependencies(package_tests strip_only_fdbserver TestHarness)
endif()
list(APPEND package_files ${STAGE_OUT_DIR}/bin/fdbserver
${STAGE_OUT_DIR}/bin/TestHarness.exe
${STAGE_OUT_DIR}/bin/TraceLogHelper.dll
${STAGE_OUT_DIR}/CMakeCache.txt
)
add_custom_command(
OUTPUT ${package_files}
DEPENDS ${CMAKE_BINARY_DIR}/CMakeCache.txt
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/CMakeCache.txt ${STAGE_OUT_DIR}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${STAGE_OUT_DIR}/bin
COMMENT "Copying files for ${STAGE_CONTEXT} package"
)
list(APPEND package_files ${test_files} ${external_files})
set(package_files ${package_files} PARENT_SCOPE)
endfunction()
function(create_correctness_package)
if(WIN32)
return()
endif()
set(out_dir "${CMAKE_BINARY_DIR}/correctness")
stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "correctness")
set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} *
WORKING_DIRECTORY ${out_dir}
COMMENT "Package correctness archive"
)
add_custom_target(package_tests ALL DEPENDS ${tar_file})
add_dependencies(package_tests strip_only_fdbserver TestHarness)
endfunction()
function(create_valgrind_correctness_package)
if(WIN32)
return()
endif()
if(USE_VALGRIND)
set(out_dir "${CMAKE_BINARY_DIR}/valgrind_correctness")
stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "valgrind correctness")
set(tar_file ${CMAKE_BINARY_DIR}/packages/valgrind-${CMAKE_PROJECT_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} *
WORKING_DIRECTORY ${out_dir}
COMMENT "Package valgrind correctness archive"
)
add_custom_target(package_valgrind_tests ALL DEPENDS ${tar_file})
add_dependencies(package_valgrind_tests strip_only_fdbserver TestHarness)
@ -262,7 +284,8 @@ function(package_bindingtester)
set(outfiles ${bdir}/fdbcli ${bdir}/fdbserver ${bdir}/${fdbcName} ${bdir}/joshua_test ${bdir}/joshua_timeout)
add_custom_command(
OUTPUT ${outfiles}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/packages/bin/fdbcli
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/CMakeCache.txt
${CMAKE_BINARY_DIR}/packages/bin/fdbcli
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/lib/${fdbcName}
${bdir}
@ -270,7 +293,7 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/bindingTimeout.sh ${bdir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/localClusterStart.sh ${bdir}/localClusterStart.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/bindingTestScript.sh ${bdir}/bindingTestScript.sh
COMMENT "Copy executes to bindingtester dir")
COMMENT "Copy executables and scripts to bindingtester dir")
file(GLOB_RECURSE test_files ${CMAKE_SOURCE_DIR}/bindings/*)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/bindingtester.touch"


@ -209,6 +209,25 @@ else()
# -mavx
# -msse4.2)
# Tentatively re-enabling vector instructions
set(USE_AVX512F OFF CACHE BOOL "Enable AVX 512F instructions")
if (USE_AVX512F)
add_compile_options(-mavx512f)
endif()
set(USE_AVX ON CACHE BOOL "Enable AVX instructions")
if (USE_AVX)
add_compile_options(-mavx)
endif()
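# Example (hypothetical): both options are CMake cache variables and can be toggled
# at configure time, e.g. cmake -G Ninja -DUSE_AVX512F=ON -DUSE_AVX=ON <source-dir>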
# Intentionally using builtin memcpy. G++ does a good job on small memcpy's when the size is known at compile time.
# If the size is not known, then it falls back on the memcpy that's available at runtime (rte_memcpy, as of this
# writing; see flow.cpp).
#
# The downside of the builtin memcpy is that it's slower at large copies, so if we spend a lot of time on large
# copies of sizes that are known at compile time, this might not be a win. See the output of performance/memcpy
# for more information.
#add_compile_options(-fno-builtin-memcpy)
if (USE_VALGRIND)
add_compile_options(-DVALGRIND -DUSE_VALGRIND)
endif()
@ -254,7 +273,6 @@ else()
endif()
if (GCC)
add_compile_options(-Wno-pragmas)
# Otherwise `state [[maybe_unused]] int x;` will issue a warning.
# https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is
add_compile_options(-Wno-attributes)
@ -268,6 +286,7 @@ else()
-fvisibility=hidden
-Wreturn-type
-fPIC)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-Wclass-memaccess>)
if (GPERFTOOLS_FOUND AND GCC)
add_compile_options(
-fno-builtin-malloc

View File

@ -493,7 +493,7 @@ If a process has had more than 10 TCP segments retransmitted in the last 5 secon
10.0.4.1:4500 ( 3% cpu; 2% machine; 0.004 Gbps; 0% disk; REXMIT! 2.5 GB / 4.1 GB RAM )
Machine-readable status
--------------------------------
-----------------------
The status command can provide a complete summary of statistics about the cluster and the database with the ``json`` argument. Full documentation for ``status json`` output can be found :doc:`here <mr-status>`.
From the output of ``status json``, operators can find useful health metrics to determine whether or not their cluster is hitting performance limits.
@ -505,6 +505,72 @@ Durable version lag ``cluster.qos.worst_durability_lag_storage_server`` cont
Transaction log queue ``cluster.qos.worst_queue_bytes_log_server`` contains the maximum size in bytes of the mutations stored on a transaction log that have not yet been popped by storage servers. A large transaction log queue size can potentially cause the ratekeeper to increase throttling.
====================== ==============================================================================================================
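For example, the same document that ``status json`` prints can be fetched programmatically. A minimal Java sketch (hypothetical; it assumes API version 630 and reads the ``\xff\xff/status/json`` special key) might look like::

    import com.apple.foundationdb.Database;
    import com.apple.foundationdb.FDB;
    import com.apple.foundationdb.tuple.ByteArrayUtil;
    import java.nio.charset.StandardCharsets;

    public class ReadStatusJson {
        public static void main(String[] args) {
            FDB fdb = FDB.selectAPIVersion(630);
            try (Database db = fdb.open()) {
                byte[] key = ByteArrayUtil.join(new byte[]{(byte) 0xFF, (byte) 0xFF},
                                                "/status/json".getBytes(StandardCharsets.US_ASCII));
                // Read the machine-readable status document inside a retry loop.
                byte[] raw = db.read(tr -> tr.get(key).join());
                String statusJson = new String(raw, StandardCharsets.UTF_8);
                // Parse statusJson and inspect e.g. cluster.qos.worst_queue_bytes_log_server.
                System.out.println(statusJson);
            }
        }
    }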
Server-side latency band tracking
---------------------------------
As part of the status document, ``status json`` provides some sampled latency metrics obtained by running probe transactions internally. While this can often be useful, it does not necessarily reflect the distribution of latencies for requests originated by clients.
FoundationDB additionally provides optional functionality to measure the latencies of all incoming get read version (GRV), read, and commit requests and report some basic details about those requests. The latencies are measured from the time the server receives the request to the point when it replies, and will therefore not include time spent in transit between the client and server or delays in the client process itself.
The latency band tracking works by configuring various latency thresholds and counting the number of requests that occur in each band (i.e. between two consecutive thresholds). For example, if you wanted to define a service-level objective (SLO) for your cluster where 99.9% of read requests were answered within N seconds, you could set a read latency threshold at N. You could then count the number of requests below and above the threshold and determine whether the required percentage of requests are answered sufficiently quickly.
Configuration of server-side latency bands is performed by setting the ``\xff\x02/latencyBandConfig`` key to a string encoding the following JSON document::
{
"get_read_version" : {
"bands" : [ 0.01, 0.1]
},
"read" : {
"bands" : [ 0.01, 0.1],
"max_key_selector_offset" : 1000,
"max_read_bytes" : 1000000
},
"commit" : {
"bands" : [ 0.01, 0.1],
"max_commit_bytes" : 1000000
}
}
Every field in this configuration is optional, and any missing fields will be left unset (i.e. no bands will be tracked or limits will not apply). The configuration takes the following arguments:
* ``bands`` - a list of thresholds (in seconds) to be measured for the given request type (``get_read_version``, ``read``, or ``commit``)
* ``max_key_selector_offset`` - an integer specifying the maximum key selector offset a read request can have and still be counted
* ``max_read_bytes`` - an integer specifying the maximum size in bytes of a read response that will be counted
* ``max_commit_bytes`` - an integer specifying the maximum size in bytes of a commit request that will be counted
Setting this configuration key to a value that changes the configuration will result in the cluster controller server process logging a ``LatencyBandConfigChanged`` event. This event will indicate whether a configuration is present or not using its ``Present`` field. Specifying an invalid configuration will result in the latency band feature being unconfigured, and the server process running the cluster controller will log an ``InvalidLatencyBandConfiguration`` trace event.
.. note:: GRV requests are counted only at default and immediate priority. Batch priority GRV requests are ignored for the purposes of latency band tracking.
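As a concrete illustration, a minimal Java sketch (hypothetical; it assumes API version 630 and requires the ``ACCESS_SYSTEM_KEYS`` transaction option) that installs a read latency band configuration could look like::

    import com.apple.foundationdb.Database;
    import com.apple.foundationdb.FDB;
    import com.apple.foundationdb.tuple.ByteArrayUtil;
    import java.nio.charset.StandardCharsets;

    public class SetLatencyBandConfig {
        public static void main(String[] args) {
            FDB fdb = FDB.selectAPIVersion(630);
            try (Database db = fdb.open()) {
                byte[] key = ByteArrayUtil.join(new byte[]{(byte) 0xFF, 0x02},
                                                "/latencyBandConfig".getBytes(StandardCharsets.US_ASCII));
                String config = "{\"read\": {\"bands\": [0.01, 0.1], \"max_read_bytes\": 1000000}}";
                db.run(tr -> {
                    // Writing a \xff key requires access to the system keyspace.
                    tr.options().setAccessSystemKeys();
                    tr.set(key, config.getBytes(StandardCharsets.UTF_8));
                    return null;
                });
            }
        }
    }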
When configured, the ``status json`` output will include additional fields to report the number of requests in each latency band located at ``cluster.processes.<ID>.roles[N].*_latency_bands``::
"grv_latency_bands" : {
0.01: 10,
0.1: 0,
inf: 1,
filtered: 0
},
"read_latency_bands" : {
0.01: 12,
0.1: 1,
inf: 0,
filtered: 0
},
"commit_latency_bands" : {
0.01: 5,
0.1: 5,
inf: 2,
filtered: 1
}
The ``grv_latency_bands`` and ``commit_latency_bands`` objects will only be logged for ``proxy`` roles, and ``read_latency_bands`` will only be logged for storage roles. Each threshold is represented as a key in the map, and its associated value will be the total number of requests in the lifetime of the process with a latency smaller than the threshold but larger than the next smaller threshold.
For example, ``0.1: 1`` in ``read_latency_bands`` indicates that there has been 1 read request with a latency in the range ``[0.01, 0.1)``. For the smallest specified threshold, the lower bound is 0 (e.g. ``[0, 0.01)`` in the example above). Requests that took longer than any defined latency band will be reported in the ``inf`` (infinity) band. Requests that were filtered by the configuration (e.g. using ``max_read_bytes``) are reported in the ``filtered`` category.
Because each threshold reports latencies strictly in the range between the next lower threshold and itself, it may be necessary to sum up the counts for multiple bands to determine the total number of requests below a certain threshold.
.. note:: No history of request counts is recorded for processes that ran in the past. This includes the history prior to restart for a process that has been restarted, for which the counts get reset to 0. For this reason, it is recommended that you collect this information periodically if you need to be able to track requests from such processes.
.. _administration_fdbmonitor:
``fdbmonitor`` and ``fdbserver``


@ -10,38 +10,38 @@ macOS
The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server.
* `FoundationDB-6.3.0.pkg <https://www.foundationdb.org/downloads/6.3.0/macOS/installers/FoundationDB-6.3.0.pkg>`_
* `FoundationDB-6.3.1.pkg <https://www.foundationdb.org/downloads/6.3.1/macOS/installers/FoundationDB-6.3.1.pkg>`_
Ubuntu
------
The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x.
* `foundationdb-clients-6.3.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.0/ubuntu/installers/foundationdb-clients_6.3.0-1_amd64.deb>`_
* `foundationdb-server-6.3.0-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.0/ubuntu/installers/foundationdb-server_6.3.0-1_amd64.deb>`_ (depends on the clients package)
* `foundationdb-clients-6.3.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.1/ubuntu/installers/foundationdb-clients_6.3.1-1_amd64.deb>`_
* `foundationdb-server-6.3.1-1_amd64.deb <https://www.foundationdb.org/downloads/6.3.1/ubuntu/installers/foundationdb-server_6.3.1-1_amd64.deb>`_ (depends on the clients package)
RHEL/CentOS EL6
---------------
The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x.
* `foundationdb-clients-6.3.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel6/installers/foundationdb-clients-6.3.0-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.3.0-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel6/installers/foundationdb-server-6.3.0-1.el6.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.3.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel6/installers/foundationdb-clients-6.3.1-1.el6.x86_64.rpm>`_
* `foundationdb-server-6.3.1-1.el6.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel6/installers/foundationdb-server-6.3.1-1.el6.x86_64.rpm>`_ (depends on the clients package)
RHEL/CentOS EL7
---------------
The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x.
* `foundationdb-clients-6.3.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel7/installers/foundationdb-clients-6.3.0-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.3.0-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.0/rhel7/installers/foundationdb-server-6.3.0-1.el7.x86_64.rpm>`_ (depends on the clients package)
* `foundationdb-clients-6.3.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel7/installers/foundationdb-clients-6.3.1-1.el7.x86_64.rpm>`_
* `foundationdb-server-6.3.1-1.el7.x86_64.rpm <https://www.foundationdb.org/downloads/6.3.1/rhel7/installers/foundationdb-server-6.3.1-1.el7.x86_64.rpm>`_ (depends on the clients package)
Windows
-------
The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server.
* `foundationdb-6.3.0-x64.msi <https://www.foundationdb.org/downloads/6.3.0/windows/installers/foundationdb-6.3.0-x64.msi>`_
* `foundationdb-6.3.1-x64.msi <https://www.foundationdb.org/downloads/6.3.1/windows/installers/foundationdb-6.3.1-x64.msi>`_
API Language Bindings
=====================
@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part
If you need to use the FoundationDB Python API from other Python installations or paths, use the Python package manager ``pip`` (``pip install foundationdb``) or download the Python package:
* `foundationdb-6.3.0.tar.gz <https://www.foundationdb.org/downloads/6.3.0/bindings/python/foundationdb-6.3.0.tar.gz>`_
* `foundationdb-6.3.1.tar.gz <https://www.foundationdb.org/downloads/6.3.1/bindings/python/foundationdb-6.3.1.tar.gz>`_
Ruby 1.9.3/2.0.0+
-----------------
* `fdb-6.3.0.gem <https://www.foundationdb.org/downloads/6.3.0/bindings/ruby/fdb-6.3.0.gem>`_
* `fdb-6.3.1.gem <https://www.foundationdb.org/downloads/6.3.1/bindings/ruby/fdb-6.3.1.gem>`_
Java 8+
-------
* `fdb-java-6.3.0.jar <https://www.foundationdb.org/downloads/6.3.0/bindings/java/fdb-java-6.3.0.jar>`_
* `fdb-java-6.3.0-javadoc.jar <https://www.foundationdb.org/downloads/6.3.0/bindings/java/fdb-java-6.3.0-javadoc.jar>`_
* `fdb-java-6.3.1.jar <https://www.foundationdb.org/downloads/6.3.1/bindings/java/fdb-java-6.3.1.jar>`_
* `fdb-java-6.3.1-javadoc.jar <https://www.foundationdb.org/downloads/6.3.1/bindings/java/fdb-java-6.3.1-javadoc.jar>`_
Go 1.11+
--------


@ -2,6 +2,15 @@
Release Notes
#############
6.2.22
======
Fixes
-----
* Coordinator class processes could be recruited as the cluster controller. `(PR #3282) <https://github.com/apple/foundationdb/pull/3282>`_
* HTTPS requests made by backup would fail (introduced in 6.2.21). `(PR #3284) <https://github.com/apple/foundationdb/pull/3284>`_
6.2.21
======
@ -11,6 +20,7 @@ Fixes
* HTTPS requests made by backup could hang indefinitely. `(PR #3027) <https://github.com/apple/foundationdb/pull/3027>`_
* ``fdbrestore`` prefix options required exactly a single hyphen instead of the standard two. `(PR #3056) <https://github.com/apple/foundationdb/pull/3056>`_
* Commits could stall on a newly elected proxy because of inaccurate compute estimates. `(PR #3123) <https://github.com/apple/foundationdb/pull/3123>`_
* A transaction class process with a bad disk could be repeatedly recruited as a transaction log. `(PR #3268) <https://github.com/apple/foundationdb/pull/3268>`_
Features
--------


@ -0,0 +1,123 @@
#############
Release Notes
#############
6.3.1
=====
Features
--------
* Added the ability to set arbitrary tags on transactions. Tags can be specifically throttled using ``fdbcli``, and certain types of tags can be automatically throttled by ratekeeper. `(PR #2942) <https://github.com/apple/foundationdb/pull/2942>`_
* Added an option for transactions to report conflicting keys by calling ``getRange`` with the special key prefix ``\xff\xff/transaction/conflicting_keys/``. `(PR #2257) <https://github.com/apple/foundationdb/pull/2257>`_
* Added the ``exclude failed`` command to ``fdbcli``. This command designates that a process is dead and will never come back, so the transaction logs can forget about mutations sent to that process. `(PR #1955) <https://github.com/apple/foundationdb/pull/1955>`_
* Added a new fast restore system that can restore a database to a point in time from backup files. It is a Spark-like parallel processing framework that processes backup data asynchronously, in parallel, and in a pipelined fashion. `(Fast Restore Project) <https://github.com/apple/foundationdb/projects/7>`_
* Added backup workers for pulling mutations from transaction logs and uploading them to blob storage. Switching from the previous backup implementation will double a cluster's maximum write bandwidth. `(PR #1625) <https://github.com/apple/foundationdb/pull/1625>`_ `(PR #2588) <https://github.com/apple/foundationdb/pull/2588>`_ `(PR #2642) <https://github.com/apple/foundationdb/pull/2642>`_
* Added a new API in all bindings that can be used to query the estimated byte size of a given range. `(PR #2537) <https://github.com/apple/foundationdb/pull/2537>`_
* Added the ``lock`` and ``unlock`` commands to ``fdbcli`` which lock or unlock a cluster. `(PR #2890) <https://github.com/apple/foundationdb/pull/2890>`_
* Added a framework that simplifies adding client functions backed by special keys (keys within ``[\xff\xff, \xff\xff\xff)``). `(PR #2662) <https://github.com/apple/foundationdb/pull/2662>`_
Performance
-----------
* Improved the client's load balancing algorithm so that each proxy processes an equal number of requests. `(PR #2520) <https://github.com/apple/foundationdb/pull/2520>`_
* Significantly reduced the amount of work done on the cluster controller by removing the centralized failure monitoring. `(PR #2518) <https://github.com/apple/foundationdb/pull/2518>`_
* Improved master recovery speeds by more efficiently broadcasting the recovery state between processes. `(PR #2941) <https://github.com/apple/foundationdb/pull/2941>`_
* Significantly reduced the number of network connections opened to the coordinators. `(PR #3069) <https://github.com/apple/foundationdb/pull/3069>`_
* Improve GRV tail latencies, particularly as the transaction rate gets nearer the ratekeeper limit. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
* The proxies are now more responsive to changes in workload when unthrottling lower priority transactions. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
* Removed a lot of unnecessary copying across the codebase. `(PR #2986) <https://github.com/apple/foundationdb/pull/2986>`_ `(PR #2915) <https://github.com/apple/foundationdb/pull/2915>`_ `(PR #3024) <https://github.com/apple/foundationdb/pull/3024>`_ `(PR #2999) <https://github.com/apple/foundationdb/pull/2999>`_
* Optimized the performance of the storage server. `(PR #1988) <https://github.com/apple/foundationdb/pull/1988>`_ `(PR #3103) <https://github.com/apple/foundationdb/pull/3103>`_
* Optimized the performance of the resolver. `(PR #2648) <https://github.com/apple/foundationdb/pull/2648>`_
* Replaced most uses of hashlittle2 with crc32 for better performance. `(PR #2538) <https://github.com/apple/foundationdb/pull/2538>`_
* Significantly reduced the serialized size of conflict ranges and single key clears. `(PR #2513) <https://github.com/apple/foundationdb/pull/2513>`_
* Improved range read performance when the reads overlap recently cleared key ranges. `(PR #2028) <https://github.com/apple/foundationdb/pull/2028>`_
* Reduced the number of comparisons used by various map implementations. `(PR #2882) <https://github.com/apple/foundationdb/pull/2882>`_
* Reduced the serialized size of empty strings. `(PR #3063) <https://github.com/apple/foundationdb/pull/3063>`_
* Reduced the serialized size of various interfaces by 10x. `(PR #3068) <https://github.com/apple/foundationdb/pull/3068>`_
Reliability
-----------
* Connections that disconnect frequently are not immediately marked available. `(PR #2932) <https://github.com/apple/foundationdb/pull/2932>`_
* The data distributor will consider storage servers that are continually lagging behind as if they were failed. `(PR #2917) <https://github.com/apple/foundationdb/pull/2917>`_
* Changing the storage engine type of a cluster will no longer cause the cluster to run out of memory. Instead, the cluster will gracefully migrate storage server processes to the new storage engine one by one. `(PR #1985) <https://github.com/apple/foundationdb/pull/1985>`_
* Batch priority transactions which are being throttled by ratekeeper will get a ``batch_transaction_throttled`` error instead of hanging indefinitely. `(PR #1868) <https://github.com/apple/foundationdb/pull/1868>`_
* Avoid using too much memory on the transaction logs when multiple types of transaction logs exist in the same process. `(PR #2213) <https://github.com/apple/foundationdb/pull/2213>`_
Fixes
-----
* The ``SetVersionstampedKey`` atomic operation no longer conflicts with versions smaller than the current read version of the transaction. `(PR #2557) <https://github.com/apple/foundationdb/pull/2557>`_
* Ratekeeper would measure durability lag a few seconds higher than reality. `(PR #2499) <https://github.com/apple/foundationdb/pull/2499>`_
* In very rare scenarios, the data distributor process could get stuck in an infinite loop. `(PR #2228) <https://github.com/apple/foundationdb/pull/2228>`_
* If the number of configured transaction logs was reduced at the exact same time a change to the system keyspace took place, it was possible for the transaction state store to become corrupted. `(PR #3051) <https://github.com/apple/foundationdb/pull/3051>`_
* Fix multiple data races between threads on the client. `(PR #3026) <https://github.com/apple/foundationdb/pull/3026>`_
* Transaction logs configured to spill by reference had an unintended delay between each spilled batch. `(PR #3153) <https://github.com/apple/foundationdb/pull/3153>`_
* Added guards to honor ``DISABLE_POSIX_KERNEL_AIO``. `(PR #2888) <https://github.com/apple/foundationdb/pull/2888>`_
Status
------
* A process's ``memory.available_bytes`` can no longer exceed the memory limit of the process. For purposes of this statistic, processes on the same machine will be allocated memory proportionally based on the size of their memory limits. `(PR #3174) <https://github.com/apple/foundationdb/pull/3174>`_
* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) <https://github.com/apple/foundationdb/pull/2058>`_
* Removed fields ``worst_version_lag_storage_server`` and ``limiting_version_lag_storage_server`` from the ``cluster.qos`` section. The ``worst_data_lag_storage_server`` and ``limiting_data_lag_storage_server`` objects can be used instead. `(PR #3196) <https://github.com/apple/foundationdb/pull/3196>`_
* If a process is unable to flush trace logs to disk, the problem will now be reported via the output of the ``status`` command inside ``fdbcli``. `(PR #2605) <https://github.com/apple/foundationdb/pull/2605>`_ `(PR #2820) <https://github.com/apple/foundationdb/pull/2820>`_
Bindings
--------
* API version updated to 630. See the :ref:`API version upgrade guide <api-version-upgrade-guide-630>` for upgrade details.
* Python: The ``@fdb.transactional`` decorator will now throw an error if the decorated function returns a generator. `(PR #1724) <https://github.com/apple/foundationdb/pull/1724>`_
* Java: Add caching for various JNI objects to improve performance. `(PR #2809) <https://github.com/apple/foundationdb/pull/2809>`_
* Java: Optimize byte array comparisons in ``ByteArrayUtil``. `(PR #2823) <https://github.com/apple/foundationdb/pull/2823>`_
* Java: Add ``FDB.disableShutdownHook`` that can be used to prevent the default shutdown hook from running. Users of this new function should make sure to call ``stopNetwork`` before terminating a client process. `(PR #2635) <https://github.com/apple/foundationdb/pull/2635>`_
* Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) <https://github.com/apple/foundationdb/pull/2458>`_
* Golang: The ``Transact`` function will unwrap errors that have been wrapped using ``xerrors`` to determine if a retryable FoundationDB error is in the error chain. `(PR #3131) <https://github.com/apple/foundationdb/pull/3131>`_
* Golang: Added ``Subspace.PackWithVersionstamp`` that can be used to pack a ``Tuple`` that contains a versionstamp. `(PR #2243) <https://github.com/apple/foundationdb/pull/2243>`_
* Golang: Implement ``Stringer`` interface for ``Tuple``, ``Subspace``, ``UUID``, and ``Versionstamp``. `(PR #3032) <https://github.com/apple/foundationdb/pull/3032>`_
* C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) <https://github.com/apple/foundationdb/pull/2622>`_
* Deprecated ``enable_slow_task_profiling`` network option and replaced it with ``enable_run_loop_profiling``. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
Other Changes
-------------
* Small key ranges which are being heavily read will be reported in the logs using the trace event ``ReadHotRangeLog``. `(PR #2046) <https://github.com/apple/foundationdb/pull/2046>`_ `(PR #2378) <https://github.com/apple/foundationdb/pull/2378>`_ `(PR #2532) <https://github.com/apple/foundationdb/pull/2532>`_
* Added the read version, commit version, and datacenter locality to the client transaction information. `(PR #3079) <https://github.com/apple/foundationdb/pull/3079>`_ `(PR #3205) <https://github.com/apple/foundationdb/pull/3205>`_
* Added a network option ``TRACE_FILE_IDENTIFIER`` that can be used to provide a custom identifier string that will be part of the file name for all trace log files created on the client. `(PR #2869) <https://github.com/apple/foundationdb/pull/2869>`_
* It is now possible to use the ``TRACE_LOG_GROUP`` option on a client process after the database has been created. `(PR #2862) <https://github.com/apple/foundationdb/pull/2862>`_
* Added a network option ``TRACE_CLOCK_SOURCE`` that can be used to switch the trace event timestamps to use a realtime clock source. `(PR #2329) <https://github.com/apple/foundationdb/pull/2329>`_
* The ``INCLUDE_PORT_IN_ADDRESS`` transaction option is now on by default. This means ``get_addresses_for_key`` will always return ports in the address strings. `(PR #2639) <https://github.com/apple/foundationdb/pull/2639>`_
* Added the ``getversion`` command to ``fdbcli`` which returns the current read version of the cluster. `(PR #2882) <https://github.com/apple/foundationdb/pull/2882>`_
* Added the ``advanceversion`` command to ``fdbcli`` which increases the current version of a cluster. `(PR #2965) <https://github.com/apple/foundationdb/pull/2965>`_
* Improved the slow task profiler to also report backtraces for periods when the run loop is saturated. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
* Double the number of shard locations that the client will cache locally. `(PR #2198) <https://github.com/apple/foundationdb/pull/2198>`_
* Replaced the ``-add_prefix`` and ``-remove_prefix`` options with ``--add_prefix`` and ``--remove_prefix`` in ``fdbrestore``. `(PR #3206) <https://github.com/apple/foundationdb/pull/3206>`_
* Data distribution metrics can now be read using the special keyspace ``\xff\xff/metrics/data_distribution_stats``. `(PR #2547) <https://github.com/apple/foundationdb/pull/2547>`_
* The ``\xff\xff/worker_interfaces/`` keyspace now begins at a key which includes a trailing ``/`` (previously ``\xff\xff/worker_interfaces``). Range reads to this range now respect the end key passed into the range and include the keyspace prefix in the resulting keys. `(PR #3095) <https://github.com/apple/foundationdb/pull/3095>`_
* Added FreeBSD support. `(PR #2634) <https://github.com/apple/foundationdb/pull/2634>`_
* Updated boost to 1.72. `(PR #2684) <https://github.com/apple/foundationdb/pull/2684>`_
Earlier release notes
---------------------
* :doc:`6.2 (API Version 620) </old-release-notes/release-notes-620>`
* :doc:`6.1 (API Version 610) </old-release-notes/release-notes-610>`
* :doc:`6.0 (API Version 600) </old-release-notes/release-notes-600>`
* :doc:`5.2 (API Version 520) </old-release-notes/release-notes-520>`
* :doc:`5.1 (API Version 510) </old-release-notes/release-notes-510>`
* :doc:`5.0 (API Version 500) </old-release-notes/release-notes-500>`
* :doc:`4.6 (API Version 460) </old-release-notes/release-notes-460>`
* :doc:`4.5 (API Version 450) </old-release-notes/release-notes-450>`
* :doc:`4.4 (API Version 440) </old-release-notes/release-notes-440>`
* :doc:`4.3 (API Version 430) </old-release-notes/release-notes-430>`
* :doc:`4.2 (API Version 420) </old-release-notes/release-notes-420>`
* :doc:`4.1 (API Version 410) </old-release-notes/release-notes-410>`
* :doc:`4.0 (API Version 400) </old-release-notes/release-notes-400>`
* :doc:`3.0 (API Version 300) </old-release-notes/release-notes-300>`
* :doc:`2.0 (API Version 200) </old-release-notes/release-notes-200>`
* :doc:`1.0 (API Version 100) </old-release-notes/release-notes-100>`
* :doc:`Beta 3 (API Version 23) </old-release-notes/release-notes-023>`
* :doc:`Beta 2 (API Version 22) </old-release-notes/release-notes-022>`
* :doc:`Beta 1 (API Version 21) </old-release-notes/release-notes-021>`
* :doc:`Alpha 6 (API Version 16) </old-release-notes/release-notes-016>`
* :doc:`Alpha 5 (API Version 14) </old-release-notes/release-notes-014>`


@ -2,7 +2,7 @@
Release Notes
#############
6.3.0
6.3.1
=====
Features
@ -100,6 +100,7 @@ Other Changes
Earlier release notes
---------------------
* :doc:`6.3 (API Version 630) </old-release-notes/release-notes-630>`
* :doc:`6.2 (API Version 620) </old-release-notes/release-notes-620>`
* :doc:`6.1 (API Version 610) </old-release-notes/release-notes-610>`
* :doc:`6.0 (API Version 600) </old-release-notes/release-notes-600>`


@ -883,7 +883,7 @@ public:
}
TraceEvent t(SevWarn, "FileBackupError");
t.error(e).detail("BackupUID", uid).detail("Description", details).detail("TaskInstance", (uint64_t)taskInstance);
// These should not happen
// key_not_found could happen
if(e.code() == error_code_key_not_found)
t.backtrace();


@ -20,6 +20,11 @@
#ifndef DatabaseContext_h
#define DatabaseContext_h
#include "flow/FastAlloc.h"
#include "flow/FastRef.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/genericactors.actor.h"
#include <vector>
#pragma once
#include "fdbclient/NativeAPI.actor.h"
@ -44,7 +49,25 @@ private:
StorageServerInfo( DatabaseContext *cx, StorageServerInterface const& interf, LocalityData const& locality ) : cx(cx), ReferencedInterface<StorageServerInterface>(interf, locality) {}
};
typedef MultiInterface<ReferencedInterface<StorageServerInterface>> LocationInfo;
struct LocationInfo : MultiInterface<ReferencedInterface<StorageServerInterface>>, FastAllocated<LocationInfo> {
using Locations = MultiInterface<ReferencedInterface<StorageServerInterface>>;
explicit LocationInfo(const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& v)
: Locations(v)
{}
LocationInfo(const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& v, bool hasCaches)
: Locations(v)
, hasCaches(hasCaches)
{}
LocationInfo(const LocationInfo&) = delete;
LocationInfo(LocationInfo&&) = delete;
LocationInfo& operator=(const LocationInfo&) = delete;
LocationInfo& operator=(LocationInfo&&) = delete;
bool hasCaches = false;
Reference<Locations> locations() {
return Reference<Locations>::addRef(this);
}
};
typedef ModelInterface<MasterProxyInterface> ProxyInfo;
class ClientTagThrottleData : NonCopyable {
@ -131,7 +154,7 @@ public:
Database clone() const { return Database(new DatabaseContext( connectionFile, clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, internal, apiVersion, switchable )); }
std::pair<KeyRange,Reference<LocationInfo>> getCachedLocation( const KeyRef&, bool isBackward = false );
std::pair<KeyRange, Reference<LocationInfo>> getCachedLocation( const KeyRef&, bool isBackward = false );
bool getCachedLocations( const KeyRangeRef&, vector<std::pair<KeyRange,Reference<LocationInfo>>>&, int limit, bool reverse );
Reference<LocationInfo> setCachedLocation( const KeyRangeRef&, const vector<struct StorageServerInterface>& );
void invalidateCache( const KeyRef&, bool isBackward = false );
@ -234,7 +257,7 @@ public:
// Cache of location information
int locationCacheSize;
CoalescedKeyRangeMap< Reference<LocationInfo> > locationCache;
CoalescedKeyRangeMap<Reference<LocationInfo>> locationCache;
std::map< UID, StorageServerInfo* > server_interf;
@ -316,7 +339,8 @@ public:
double detailedHealthMetricsLastUpdated;
UniqueOrderedOptionList<FDBTransactionOptions> transactionDefaults;
Future<Void> cacheListMonitor;
AsyncTrigger updateCache;
std::vector<std::unique_ptr<SpecialKeyRangeBaseImpl>> specialKeySpaceModules;
std::unique_ptr<SpecialKeySpace> specialKeySpace;
void registerSpecialKeySpaceModule(SpecialKeySpace::MODULE module, std::unique_ptr<SpecialKeyRangeBaseImpl> impl);


@ -26,6 +26,7 @@
#include <string>
#include <vector>
#include "flow/Arena.h"
#include "flow/flow.h"
#include "fdbclient/Knobs.h"
@ -78,6 +79,10 @@ struct Tag {
serializer(ar, locality, id);
}
};
template <>
struct flow_ref<Tag> : std::integral_constant<bool, false> {};
#pragma pack(pop)
template <class Ar> void load( Ar& ar, Tag& tag ) { tag.serialize_unversioned(ar); }


@ -235,7 +235,7 @@ public:
}
TraceEvent t(SevWarn, "FileRestoreError");
t.error(e).detail("RestoreUID", uid).detail("Description", details).detail("TaskInstance", (uint64_t)taskInstance);
// These should not happen
// key_not_found could happen
if(e.code() == error_code_key_not_found)
t.backtrace();
@ -3580,33 +3580,38 @@ public:
// Parallel restore
ACTOR static Future<Void> parallelRestoreFinish(Database cx, UID randomUID) {
state ReadYourWritesTransaction tr(cx);
state Future<Void> watchForRestoreRequestDone;
state bool restoreDone = false;
state Optional<Value> restoreRequestDoneKeyValue;
TraceEvent("FastRestoreAgentWaitForRestoreToFinish").detail("DBLock", randomUID);
// TODO: register watch first and then check if the key exists
loop {
try {
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
Optional<Value> _restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
restoreRequestDoneKeyValue = _restoreRequestDoneKeyValue;
// Restore may finish before restoreAgent waits on the restore finish event.
if (restoreRequestDoneKeyValue.present()) {
restoreDone = true; // In case the commit cleared the key but its result is unknown
tr.clear(restoreRequestDoneKey);
wait(tr.commit());
break;
} else if (!restoreDone) {
watchForRestoreRequestDone = tr.watch(restoreRequestDoneKey);
} else {
state Future<Void> watchForRestoreRequestDone = tr.watch(restoreRequestDoneKey);
wait(tr.commit());
wait(watchForRestoreRequestDone);
} else {
break;
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
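// Note on the watch idiom used above: the watch future returned by tr.watch() only
// becomes active once tr.commit() succeeds, so the pattern is register the watch,
// commit, then wait on the watch outside the transaction; any error goes through
// tr.onError() and the loop retries from the top.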
TraceEvent("FastRestoreAgentRestoreFinished")
.detail("ClearRestoreRequestDoneKey", restoreRequestDoneKeyValue.present());
// Only this agent can clear the restoreRequestDoneKey
wait(runRYWTransaction(cx, [](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->clear(restoreRequestDoneKey);
return Void();
}));
TraceEvent("FastRestoreAgentRestoreFinished").detail("UnlockDBStart", randomUID);
try {
wait(unlockDatabase(cx, randomUID));
@ -3671,18 +3676,18 @@ public:
TraceEvent("FastRestoreAgentSubmitRestoreRequests").detail("DBIsLocked", randomUID);
break;
} catch (Error& e) {
TraceEvent("FastRestoreAgentSubmitRestoreRequests").detail("CheckLockError", e.what());
TraceEvent(numTries > 50 ? SevError : SevWarnAlways, "FastRestoreMayFail")
TraceEvent(numTries > 50 ? SevError : SevWarnAlways, "FastRestoreAgentSubmitRestoreRequestsMayFail")
.detail("Reason", "DB is not properly locked")
.detail("ExpectedLockID", randomUID);
.detail("ExpectedLockID", randomUID)
.error(e);
numTries++;
wait(delay(5.0));
wait(tr->onError(e));
}
}
// set up restore request
tr->reset();
loop {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
try {
@ -4444,7 +4449,10 @@ public:
return r;
}
ACTOR static Future<Version> restore(FileBackupAgent* backupAgent, Database cx, Optional<Database> cxOrig, Key tagName, Key url, Standalone<VectorRef<KeyRangeRef>> ranges, bool waitForComplete, Version targetVersion, bool verbose, Key addPrefix, Key removePrefix, bool lockDB, UID randomUid) {
ACTOR static Future<Version> restore(FileBackupAgent* backupAgent, Database cx, Optional<Database> cxOrig,
Key tagName, Key url, Standalone<VectorRef<KeyRangeRef>> ranges,
bool waitForComplete, Version targetVersion, bool verbose, Key addPrefix,
Key removePrefix, bool lockDB, UID randomUid) {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString());
state BackupDescription desc = wait(bc->describeBackup());


@ -352,6 +352,9 @@ namespace HTTP {
send_start = timer();
loop {
wait(conn->onWritable());
wait( delay( 0, TaskPriority::WriteSocket ) );
// If we already got a response before finishing sending the request, then close the connection,
// set the Connection header to "close" as a hint to the caller that this connection can't be used
// again, and break out of the send loop.
@ -372,11 +375,6 @@ namespace HTTP {
pContent->sent(len);
if(pContent->empty())
break;
if(len == 0) {
wait(conn->onWritable());
wait( delay( 0, TaskPriority::WriteSocket ) );
}
}
wait(responseReading);


@ -19,7 +19,12 @@
*/
#include <cinttypes>
#include <vector>
#include "flow/Arena.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
@ -1858,6 +1863,69 @@ ACTOR Future<Void> waitForPrimaryDC( Database cx, StringRef dcId ) {
}
}
ACTOR Future<Void> changeCachedRange(Database cx, KeyRangeRef range, bool add) {
state ReadYourWritesTransaction tr(cx);
state KeyRange sysRange = KeyRangeRef(storageCacheKey(range.begin), storageCacheKey(range.end));
state KeyRange sysRangeClear = KeyRangeRef(storageCacheKey(range.begin), keyAfter(storageCacheKey(range.end)));
state KeyRange privateRange = KeyRangeRef(cacheKeysKey(0, range.begin), cacheKeysKey(0, range.end));
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
loop {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr.clear(sysRangeClear);
tr.clear(privateRange);
tr.addReadConflictRange(privateRange);
Standalone<RangeResultRef> previous =
wait(tr.getRange(KeyRangeRef(storageCachePrefix, sysRange.begin), 1, true));
bool prevIsCached = false;
if (!previous.empty()) {
std::vector<uint16_t> prevVal;
decodeStorageCacheValue(previous[0].value, prevVal);
prevIsCached = !prevVal.empty();
}
if (prevIsCached && !add) {
// we need to uncache from here
tr.set(sysRange.begin, falseValue);
tr.set(privateRange.begin, serverKeysFalse);
} else if (!prevIsCached && add) {
// we need to cache, starting from here
tr.set(sysRange.begin, trueValue);
tr.set(privateRange.begin, serverKeysTrue);
}
Standalone<RangeResultRef> after =
wait(tr.getRange(KeyRangeRef(sysRange.end, storageCacheKeys.end), 1, false));
bool afterIsCached = false;
if (!after.empty()) {
std::vector<uint16_t> afterVal;
decodeStorageCacheValue(after[0].value, afterVal);
afterIsCached = afterVal.empty();
}
if (afterIsCached && !add) {
tr.set(sysRange.end, trueValue);
tr.set(privateRange.end, serverKeysTrue);
} else if (!afterIsCached && add) {
tr.set(sysRange.end, falseValue);
tr.set(privateRange.end, serverKeysFalse);
}
wait(tr.commit());
return Void();
} catch (Error& e) {
state Error err = e;
wait(tr.onError(err));
TraceEvent(SevDebug, "ChangeCachedRangeError").error(err);
}
}
}
Future<Void> addCachedRange(const Database& cx, KeyRangeRef range) {
return changeCachedRange(cx, range, true);
}
Future<Void> removeCachedRange(const Database& cx, KeyRangeRef range) {
return changeCachedRange(cx, range, false);
}
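// Usage sketch (illustrative; `db` and `range` are assumed to be a connected Database
// and a normal-keyspace KeyRangeRef):
//
//   wait(addCachedRange(db, range));    // start caching this range
//   ...
//   wait(removeCachedRange(db, range)); // later, stop caching it
//
// Both calls run changeCachedRange() above, which retries via tr.onError() until the
// cache boundary keys are committed.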
json_spirit::Value_type normJSONType(json_spirit::Value_type type) {
if (type == json_spirit::int_type)
return json_spirit::real_type;


@ -201,5 +201,8 @@ bool schemaMatch( json_spirit::mValue const& schema, json_spirit::mValue const&
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
Future<Void> addCachedRange(const Database& cx, KeyRangeRef range);
Future<Void> removeCachedRange(const Database& cx, KeyRangeRef range);
#include "flow/unactorcompiler.h"
#endif


@ -167,7 +167,11 @@ class DLDatabase : public IDatabase, ThreadSafeReferenceCounted<DLDatabase> {
public:
DLDatabase(Reference<FdbCApi> api, FdbCApi::FDBDatabase *db) : api(api), db(db), ready(Void()) {}
DLDatabase(Reference<FdbCApi> api, ThreadFuture<FdbCApi::FDBDatabase*> dbFuture);
~DLDatabase() { api->databaseDestroy(db); }
~DLDatabase() {
if (db) {
api->databaseDestroy(db);
}
}
ThreadFuture<Void> onReady();


@ -20,9 +20,17 @@
#include "fdbclient/NativeAPI.actor.h"
#include <algorithm>
#include <iterator>
#include <regex>
#include <unordered_set>
#include <tuple>
#include <utility>
#include <vector>
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/ClusterInterface.h"
@ -42,15 +50,19 @@
#include "fdbrpc/LoadBalance.h"
#include "fdbrpc/Net2FileSystem.h"
#include "fdbrpc/simulator.h"
#include "flow/Arena.h"
#include "flow/ActorCollection.h"
#include "flow/DeterministicRandom.h"
#include "flow/IRandom.h"
#include "flow/Error.h"
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#include "flow/Knobs.h"
#include "flow/Platform.h"
#include "flow/SystemMonitor.h"
#include "flow/TLSConfig.actor.h"
#include "flow/Tracing.h"
#include "flow/UnitTest.h"
#include "flow/serialize.h"
#include "fdbclient/versions.h"
#include "flow/serialize.h"
@ -71,6 +83,33 @@ using std::max;
using std::min;
using std::pair;
namespace {
ACTOR template <class T, class Fun>
Future<T> runAfter(Future<T> in, Fun func) {
T res = wait(in);
return func(res);
}
template <class Interface, class Request>
Future<REPLY_TYPE(Request)> loadBalance(
DatabaseContext* ctx, const Reference<LocationInfo> alternatives, RequestStream<Request> Interface::*channel,
const Request& request = Request(), TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically
QueueModel* model = NULL) {
if (alternatives->hasCaches) {
return loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model);
}
return runAfter(loadBalance(alternatives->locations(), channel, request, taskID, atMostOnce, model),
[ctx](auto res) {
if (res.cached) {
ctx->updateCache.trigger();
}
return res;
});
}
} // namespace
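// The wrapper above is what the read paths later in this file call instead of the
// plain loadBalance(). A sketch of an equivalent call site (names as used further
// down, e.g. in getValue):
//
//   Future<GetValueReply> f =
//       loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getValue, req,
//                   TaskPriority::DefaultPromiseEndpoint, false,
//                   cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr);
//
// If the location already has cache interfaces, the plain overload is used; if it
// does not and the reply comes back with res.cached set, runAfter() triggers
// cx->updateCache so the location cache can pick up the cache servers.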
NetworkOptions networkOptions;
TLSConfig tlsConfig(TLSEndpointType::CLIENT);
@ -458,6 +497,166 @@ ACTOR static Future<Void> monitorMasterProxiesChange(Reference<AsyncVar<ClientDB
}
}
void updateLocationCacheWithCaches(DatabaseContext* self, const std::map<UID, StorageServerInterface>& removed,
const std::map<UID, StorageServerInterface>& added) {
// TODO: this needs to be more clever in the future
auto ranges = self->locationCache.ranges();
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (iter->value() && iter->value()->hasCaches) {
auto& val = iter->value();
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
interfaces.reserve(val->size() - removed.size() + added.size());
for (int i = 0; i < val->size(); ++i) {
const auto& interf = (*val)[i];
if (removed.count(interf->interf.id()) == 0) {
interfaces.emplace_back(interf);
}
}
for (const auto& p : added) {
interfaces.emplace_back(Reference<ReferencedInterface<StorageServerInterface>>{new ReferencedInterface<StorageServerInterface>{p.second}});
}
iter->value() = Reference<LocationInfo>{ new LocationInfo(interfaces, true) };
}
}
}
Reference<LocationInfo> addCaches(const Reference<LocationInfo>& loc,
const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& other) {
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
interfaces.reserve(loc->size() + other.size());
for (int i = 0; i < loc->size(); ++i) {
interfaces.emplace_back((*loc)[i]);
}
interfaces.insert(interfaces.end(), other.begin(), other.end());
return Reference<LocationInfo>{ new LocationInfo{ interfaces, true } };
}
ACTOR Future<Void> updateCachedRanges(DatabaseContext* self, std::map<UID, StorageServerInterface>* cacheServers) {
state Database db(self);
state ReadYourWritesTransaction tr(db);
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
try {
loop {
wait(self->updateCache.onTrigger());
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
try {
Standalone<RangeResultRef> range = wait(tr.getRange(storageCacheKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!range.more);
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> cacheInterfaces;
cacheInterfaces.reserve(cacheServers->size());
for (const auto& p : *cacheServers) {
cacheInterfaces.emplace_back(Reference<ReferencedInterface<StorageServerInterface>>{
new ReferencedInterface<StorageServerInterface>{ p.second } });
}
bool currCached = false;
KeyRef begin, end;
for (const auto& kv : range) {
// These booleans have to flip consistently
ASSERT(currCached == (kv.value == falseValue));
if (kv.value == trueValue) {
begin = kv.key.substr(storageCacheKeys.begin.size());
currCached = true;
} else {
currCached = false;
end = kv.key.substr(storageCacheKeys.begin.size());
KeyRangeRef cachedRange{begin, end};
auto ranges = self->locationCache.containedRanges(cachedRange);
KeyRef containedRangesBegin, containedRangesEnd, prevKey;
if (!ranges.empty()) {
containedRangesBegin = ranges.begin().range().begin;
}
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
// We probably don't want to do the code below? Otherwise we would never
// fetch the corresponding storage servers, which would give us different semantics
//if (containedRangesEnd > iter->range().begin) {
// self->locationCache.insert(
// KeyRangeRef{ containedRangesEnd, iter->range().begin },
// Reference<LocationInfo>{ new LocationInfo{ cacheInterfaces, true } });
//}
containedRangesEnd = iter->range().end;
if (iter->value() && !iter->value()->hasCaches) {
iter->value() = addCaches(iter->value(), cacheInterfaces);
}
}
auto iter = self->locationCache.rangeContaining(begin);
if (iter->value() && !iter->value()->hasCaches) {
if (end>=iter->range().end) {
self->locationCache.insert(KeyRangeRef{ begin, iter->range().end },
addCaches(iter->value(), cacheInterfaces));
} else {
self->locationCache.insert(KeyRangeRef{ begin, end },
addCaches(iter->value(), cacheInterfaces));
}
}
iter = self->locationCache.rangeContainingKeyBefore(end);
if (iter->value() && !iter->value()->hasCaches) {
self->locationCache.insert(KeyRangeRef{iter->range().begin, end}, addCaches(iter->value(), cacheInterfaces));
}
}
}
wait(delay(2.0)); // we want to wait at least some small amount of time before
// updating this list again
} catch (Error& e) {
wait(tr.onError(e));
}
}
} catch (Error& e) {
TraceEvent(SevError, "UpdateCachedRangesFailed")
.error(e);
throw;
}
}
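// Worked example of the boundary pairing above (keys are illustrative). Entries under
// \xff/storageCache/ alternate between trueValue ("cached from here on") and
// falseValue ("not cached from here on"), so consecutive pairs delimit one cached
// range each:
//
//   \xff/storageCache/apple := trueValue   -> begin = "apple"
//   \xff/storageCache/mango := falseValue  -> end   = "mango"  => cached range ["apple","mango")
//   \xff/storageCache/peach := trueValue   -> begin = "peach"  ... and so on
//
// This is why the loop asserts that currCached flips on every key it reads.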
ACTOR Future<Void> monitorCacheList(DatabaseContext* self) {
state Database db(self);
state Transaction tr(db);
state std::map<UID, StorageServerInterface> cacheServerMap;
state Future<Void> updateRanges = updateCachedRanges(self, &cacheServerMap);
// if no caches are configured, we don't want to run this actor at all
// so we just wait for the first trigger from a storage server
wait(self->updateCache.onTrigger());
try {
loop {
tr.reset();
try {
Standalone<RangeResultRef> cacheList =
wait(tr.getRange(storageCacheServerKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!cacheList.more);
bool hasChanges = false;
std::map<UID, StorageServerInterface> allCacheServers;
for (auto kv : cacheList) {
auto ssi = BinaryReader::fromStringRef<StorageServerInterface>(kv.value, IncludeVersion());
allCacheServers.emplace(ssi.id(), ssi);
}
std::map<UID, StorageServerInterface> newCacheServers;
std::map<UID, StorageServerInterface> deletedCacheServers;
std::set_difference(allCacheServers.begin(), allCacheServers.end(), cacheServerMap.begin(),
cacheServerMap.end(),
std::insert_iterator<std::map<UID, StorageServerInterface>>(
newCacheServers, newCacheServers.begin()));
std::set_difference(cacheServerMap.begin(), cacheServerMap.end(), allCacheServers.begin(),
allCacheServers.end(),
std::insert_iterator<std::map<UID, StorageServerInterface>>(
deletedCacheServers, deletedCacheServers.begin()));
hasChanges = !(newCacheServers.empty() && deletedCacheServers.empty());
if (hasChanges) {
updateLocationCacheWithCaches(self, deletedCacheServers, newCacheServers);
}
cacheServerMap = std::move(allCacheServers);
wait(delay(5.0));
} catch (Error& e) {
wait(tr.onError(e));
}
}
} catch (Error& e) {
TraceEvent(SevError, "MonitorCacheListFailed").error(e);
throw;
}
}
ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext *cx, bool detailed) {
if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) {
if (detailed) {
@ -604,6 +803,7 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
monitorMasterProxiesInfoChange = monitorMasterProxiesChange(clientInfo, &masterProxiesChangeTrigger);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
if (apiVersionAtLeast(630)) {
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, std::make_unique<ConflictingKeysImpl>(conflictingKeysRange));
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, std::make_unique<ReadConflictRangeImpl>(readConflictRangeKeysRange));
@ -685,14 +885,15 @@ Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo, F
}
DatabaseContext::~DatabaseContext() {
cacheListMonitor.cancel();
monitorMasterProxiesInfoChange.cancel();
for(auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it))
it->second->notifyContextDestroyed();
ASSERT_ABORT( server_interf.empty() );
locationCache.insert( allKeys, Reference<LocationInfo>() );
locationCache.insert(allKeys, Reference<LocationInfo>());
}
pair<KeyRange,Reference<LocationInfo>> DatabaseContext::getCachedLocation( const KeyRef& key, bool isBackward ) {
pair<KeyRange, Reference<LocationInfo>> DatabaseContext::getCachedLocation( const KeyRef& key, bool isBackward ) {
if( isBackward ) {
auto range = locationCache.rangeContainingKeyBefore(key);
return std::make_pair(range->range(), range->value());
@ -744,23 +945,24 @@ Reference<LocationInfo> DatabaseContext::setCachedLocation( const KeyRangeRef& k
attempts++;
auto r = locationCache.randomRange();
Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it
locationCache.insert( KeyRangeRef(begin, end), Reference<LocationInfo>() );
locationCache.insert(KeyRangeRef(begin, end), Reference<LocationInfo>());
}
locationCache.insert( keys, loc );
return loc;
}
void DatabaseContext::invalidateCache( const KeyRef& key, bool isBackward ) {
if( isBackward )
if( isBackward ) {
locationCache.rangeContainingKeyBefore(key)->value() = Reference<LocationInfo>();
else
} else {
locationCache.rangeContaining(key)->value() = Reference<LocationInfo>();
}
}
void DatabaseContext::invalidateCache( const KeyRangeRef& keys ) {
auto rs = locationCache.intersectingRanges(keys);
Key begin = rs.begin().begin(), end = rs.end().begin(); // insert invalidates rs, so can't be passed a mere reference into it
locationCache.insert( KeyRangeRef(begin, end), Reference<LocationInfo>() );
locationCache.insert(KeyRangeRef(begin, end), Reference<LocationInfo>());
}
Future<Void> DatabaseContext::onMasterProxiesChanged() {
@ -1370,7 +1572,11 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
}
template <class F>
Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation( Database const& cx, Key const& key, F StorageServerInterface::*member, TransactionInfo const& info, bool isBackward = false ) {
Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation(Database const& cx, Key const& key,
F StorageServerInterface::*member,
TransactionInfo const& info,
bool isBackward = false) {
// we first check whether this range is cached
auto ssi = cx->getCachedLocation( key, isBackward );
if (!ssi.second) {
return getKeyLocation_internal( cx, key, info, isBackward );
@ -1524,9 +1730,9 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
choose {
when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
when(GetValueReply _reply = wait(
loadBalance(ssi.second, &StorageServerInterface::getValue,
GetValueRequest(span->context, key, ver, cx->sampleReadTags() ? tags : Optional<TagSet>(),
getValueID),
loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getValue,
GetValueRequest(span->context, key, ver,
cx->sampleReadTags() ? tags : Optional<TagSet>(), getValueID),
TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
reply = _reply;
@ -1615,7 +1821,7 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
choose {
when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
when(GetKeyReply _reply =
wait(loadBalance(ssi.second, &StorageServerInterface::getKey,
wait(loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getKey,
GetKeyRequest(span->context, k, version.get(),
cx->sampleReadTags() ? tags : Optional<TagSet>(), getKeyID),
TaskPriority::DefaultPromiseEndpoint, false,
@ -1715,8 +1921,8 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>
state WatchValueReply resp;
choose {
when(WatchValueReply r = wait(
loadBalance(ssi.second, &StorageServerInterface::watchValue,
WatchValueRequest(info.span->context, key, value, ver,
loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::watchValue,
WatchValueRequest(span->context, key, value, ver,
cx->sampleReadTags() ? tags : Optional<TagSet>(), watchValueID),
TaskPriority::DefaultPromiseEndpoint))) {
resp = r;
@ -1822,10 +2028,10 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
try {
choose {
when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
when(GetKeyValuesReply _rep =
wait(loadBalance(locations[shard].second, &StorageServerInterface::getKeyValues, req,
TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
when(GetKeyValuesReply _rep = wait(
loadBalance(cx.getPtr(), locations[shard].second, &StorageServerInterface::getKeyValues,
req, TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
rep = _rep;
}
}
@ -2123,7 +2329,10 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
transaction_too_old(), future_version()
});
}
GetKeyValuesReply _rep = wait( loadBalance(beginServer.second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) );
GetKeyValuesReply _rep =
wait(loadBalance(cx.getPtr(), beginServer.second, &StorageServerInterface::getKeyValues, req,
TaskPriority::DefaultPromiseEndpoint, false,
cx->enableLocalityLoadBalance ? &cx->queueModel : NULL));
rep = _rep;
++cx->transactionPhysicalReadsCompleted;
} catch(Error&) {
@ -3716,7 +3925,7 @@ ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRangeRef keys,
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(
loadBalance(locationInfo, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
loadBalance(locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
@ -3758,8 +3967,8 @@ ACTOR Future<Void> trackBoundedStorageMetrics(
try {
loop {
WaitMetricsRequest req( keys, x - halfError, x + halfError );
StorageMetrics nextX = wait( loadBalance( location, &StorageServerInterface::waitMetrics, req ) );
deltaStream.send( nextX - x );
StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req));
deltaStream.send(nextX - x);
x = nextX;
}
} catch (Error& e) {
@ -3784,8 +3993,8 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(
WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
fx[i] =
loadBalance(locations[i].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution);
fx[i] = loadBalance(locations[i].second->locations(), &StorageServerInterface::waitMetrics, req,
TaskPriority::DataDistribution);
}
wait(waitForAll(fx));
@ -3833,7 +4042,7 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getReadHotRanges(Database cx, K
state vector<Future<ReadHotSubRangeReply>> fReplies(nLocs);
for (int i = 0; i < nLocs; i++) {
ReadHotSubRangeRequest req(locations[i].first);
fReplies[i] = loadBalance(locations[i].second, &StorageServerInterface::getReadHotRanges, req,
fReplies[i] = loadBalance(locations[i].second->locations(), &StorageServerInterface::getReadHotRanges, req,
TaskPriority::DataDistribution);
}
@ -3879,7 +4088,8 @@ ACTOR Future< std::pair<Optional<StorageMetrics>, int> > waitStorageMetrics(
fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError);
} else {
WaitMetricsRequest req( keys, min, max );
fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution );
fx = loadBalance(locations[0].second->locations(), &StorageServerInterface::waitMetrics, req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return std::make_pair(x,-1);
@ -3967,8 +4177,12 @@ ACTOR Future< Standalone<VectorRef<KeyRef>> > splitStorageMetrics( Database cx,
state int i = 0;
for(; i<locations.size(); i++) {
SplitMetricsRequest req( locations[i].first, limit, used, estimated, i == locations.size() - 1 );
SplitMetricsReply res = wait( loadBalance( locations[i].second, &StorageServerInterface::splitMetrics, req, TaskPriority::DataDistribution ) );
if( res.splits.size() && res.splits[0] <= results.back() ) { // split points are out of order, possibly because of moving data, throw error to retry
SplitMetricsReply res =
wait(loadBalance(locations[i].second->locations(), &StorageServerInterface::splitMetrics, req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly because of moving
// data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}


@ -1119,8 +1119,7 @@ public:
}
bool retry_limit_hit = ryw->options.maxRetries != -1 && ryw->retries >= ryw->options.maxRetries;
if (ryw->retries < std::numeric_limits<int>::max())
ryw->retries++;
if (ryw->retries < std::numeric_limits<int>::max()) ryw->retries++;
if(retry_limit_hit) {
throw e;
}
@ -1130,7 +1129,7 @@ public:
ryw->debugLogRetries(e);
ryw->resetRyow();
return Void();
return Void();
} catch( Error &e ) {
if ( !ryw->resetPromise.isSet() ) {
if(ryw->tr.apiVersionAtLeast(610)) {


@ -72,9 +72,9 @@ struct StorageServerInterface {
RequestStream<ReplyPromise<KeyValueStoreType>> getKeyValueStoreType;
RequestStream<struct WatchValueRequest> watchValue;
RequestStream<struct ReadHotSubRangeRequest> getReadHotRanges;
explicit StorageServerInterface(UID uid) : uniqueID( uid ) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ) {}
bool isCacheServer;
explicit StorageServerInterface(UID uid) : uniqueID( uid ), isCacheServer(false) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ), isCacheServer(false) {}
NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; }
@ -86,7 +86,7 @@ struct StorageServerInterface {
//To change this serialization, ProtocolVersion::ServerListValue must be updated, and downgrades need to be considered
if (ar.protocolVersion().hasSmallEndpoints()) {
serializer(ar, uniqueID, locality, getValue);
serializer(ar, uniqueID, locality, getValue, isCacheServer);
if( Ar::isDeserializing ) {
getKey = RequestStream<struct GetKeyRequest>( getValue.getEndpoint().getAdjustedEndpoint(1) );
getKeyValues = RequestStream<struct GetKeyValuesRequest>( getValue.getEndpoint().getAdjustedEndpoint(2) );
@ -108,6 +108,7 @@ struct StorageServerInterface {
serializer(ar, uniqueID, locality, getValue, getKey, getKeyValues, getShardState, waitMetrics,
splitMetrics, getStorageMetrics, waitFailure, getQueuingMetrics, getKeyValueStoreType);
if (ar.protocolVersion().hasWatches()) serializer(ar, watchValue);
if (ar.protocolVersion().hasCacheRole()) serializer(ar, isCacheServer);
}
}
bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; }
@ -157,13 +158,14 @@ struct ServerCacheInfo {
struct GetValueReply : public LoadBalancedReply {
constexpr static FileIdentifier file_identifier = 1378929;
Optional<Value> value;
bool cached;
GetValueReply() {}
GetValueReply(Optional<Value> value) : value(value) {}
GetValueReply() : cached(false) {}
GetValueReply(Optional<Value> value, bool cached) : value(value), cached(cached) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, value);
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, value, cached);
}
};
@ -190,12 +192,13 @@ struct WatchValueReply {
constexpr static FileIdentifier file_identifier = 3;
Version version;
bool cached = false;
WatchValueReply() = default;
explicit WatchValueReply(Version version) : version(version) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version);
serializer(ar, version, cached);
}
};
@ -226,13 +229,13 @@ struct GetKeyValuesReply : public LoadBalancedReply {
VectorRef<KeyValueRef, VecSerStrategy::String> data;
Version version; // useful when latestVersion was requested
bool more;
bool cached;
bool cached = false;
GetKeyValuesReply() : version(invalidVersion), more(false), cached(false) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, data, version, more, arena);
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, data, version, more, cached, arena);
}
};
@ -258,13 +261,14 @@ struct GetKeyValuesRequest : TimedRequest {
struct GetKeyReply : public LoadBalancedReply {
constexpr static FileIdentifier file_identifier = 11226513;
KeySelector sel;
bool cached;
GetKeyReply() {}
GetKeyReply(KeySelector sel) : sel(sel) {}
GetKeyReply() : cached(false) {}
GetKeyReply(KeySelector sel, bool cached) : sel(sel), cached(cached) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, sel);
serializer(ar, LoadBalancedReply::penalty, LoadBalancedReply::error, sel, cached);
}
};


@ -19,10 +19,12 @@
*/
#include "fdbclient/SystemData.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/TDMetric.actor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/Arena.h"
#include "flow/TDMetric.actor.h"
#include "flow/serialize.h"
const KeyRef systemKeysPrefix = LiteralStringRef("\xff");
const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix);
@ -200,6 +202,29 @@ const KeyRangeRef writeConflictRangeKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"),
LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff"));
// "\xff/cacheServer/[[UID]] := StorageServerInterface"
// This will be added by the cache server on initialization and removed by DD
// TODO[mpilman]: We will need a way to map uint16_t ids to UIDs in future
// versions. For now caches simply cache everything so the ids
// are not yet meaningful.
const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"),
LiteralStringRef("\xff/cacheServer0"));
const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin;
const KeyRef storageCacheServersEnd = storageCacheServerKeys.end;
const Key storageCacheServerKey(UID id) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(storageCacheServersPrefix);
wr << id;
return wr.toValue();
}
const Value storageCacheServerValue(const StorageServerInterface& ssi) {
BinaryWriter wr(IncludeVersion());
wr << ssi;
return wr.toValue();
}
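// Registration sketch (hypothetical writer-side code; per the comment above, the
// cache server writes this key on initialization and DD removes it; `tr` is an
// assumed Transaction):
//
//   StorageServerInterface ssi;      // default ctor picks a random UID
//   ssi.isCacheServer = true;
//   tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
//   tr.set(storageCacheServerKey(ssi.id()), storageCacheServerValue(ssi));
//   // followed by tr.commit() inside the usual onError() retry loop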
const KeyRangeRef ddStatsRange = KeyRangeRef(LiteralStringRef("\xff\xff/metrics/data_distribution_stats/"),
LiteralStringRef("\xff\xff/metrics/data_distribution_stats/\xff\xff"));
@ -526,6 +551,7 @@ StorageServerInterface decodeServerListValue( ValueRef const& value ) {
return s;
}
// processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0'
const KeyRangeRef processClassKeys(
LiteralStringRef("\xff/processClass/"),


@ -62,6 +62,12 @@ void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& v
void decodeKeyServersValue( std::map<Tag, UID> const& tag_uid, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest );
// "\xff/storageCacheServer/[[UID]] := StorageServerInterface"
extern const KeyRangeRef storageCacheServerKeys;
extern const KeyRef storageCacheServersPrefix, storageCacheServersEnd;
const Key storageCacheServerKey(UID id);
const Value storageCacheServerValue(const StorageServerInterface& ssi);
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
extern const KeyRangeRef storageCacheKeys;
extern const KeyRef storageCachePrefix;


@ -538,7 +538,8 @@ namespace PTreeImpl {
return;
}
if (p->updated && p->lastUpdateVersion <= newOldestVersion) {
/* If the node has been updated, figure out which pointer was repalced. And delete that pointer */
/* If the node has been updated, figure out which pointer was replaced. And replace that pointer with the updated pointer.
Then we can get rid of the updated child pointer and make room in the node for future updates */
auto which = p->replacedPointer;
p->pointer[which] = p->pointer[2];
p->updated = false;


@ -22,6 +22,11 @@
#define FLOW_MULTIINTERFACE_H
#pragma once
#include "flow/FastRef.h"
#include "fdbrpc/Locality.h"
#include <vector>
extern uint64_t debug_lastLoadBalanceResultEndpointToken;
template <class K, class V>
@ -168,7 +173,7 @@ class MultiInterface : public ReferenceCounted<MultiInterface<T>> {
template <class T>
class MultiInterface<ReferencedInterface<T>> : public ReferenceCounted<MultiInterface<ReferencedInterface<T>>> {
public:
MultiInterface( const vector<Reference<ReferencedInterface<T>>>& v ) : alternatives(v), bestCount(0) {
MultiInterface( const std::vector<Reference<ReferencedInterface<T>>>& v ) : alternatives(v), bestCount(0) {
deterministicRandom()->randomShuffle(alternatives);
if ( LBLocalityData<T>::Present ) {
std::stable_sort( alternatives.begin(), alternatives.end(), ReferencedInterface<T>::sort_by_distance );
@ -204,6 +209,18 @@ public:
T const& getInterface(int index) { return alternatives[index]->interf; }
UID getId( int index ) const { return alternatives[index]->interf.id(); }
bool hasInterface(UID id) const {
for (const auto& ref : alternatives) {
if (ref->interf.id() == id) {
return true;
}
}
return false;
}
Reference<ReferencedInterface<T>>& operator[](int i) { return alternatives[i]; }
const Reference<ReferencedInterface<T>>& operator[](int i) const { return alternatives[i]; }
virtual ~MultiInterface() {}
@ -211,7 +228,7 @@ public:
return describe( alternatives );
}
private:
vector<Reference<ReferencedInterface<T>>> alternatives;
std::vector<Reference<ReferencedInterface<T>>> alternatives;
int16_t bestCount;
};
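// Illustrative use of the new accessors (`loc` is an assumed Reference to this
// MultiInterface specialization):
//
//   if (loc->hasInterface(serverId)) { ... }      // linear scan of alternatives
//   const auto& ssi = (*loc)[0]->interf;          // first alternative's interface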


@ -99,7 +99,6 @@ int mostUsedZoneCount(Reference<LocalitySet>& logServerSet, std::vector<Locality
bool findBestPolicySetSimple(int targetUniqueValueCount, Reference<LocalitySet>& logServerSet, std::vector<LocalityEntry>& bestSet,
int desired) {
auto& mutableEntries = logServerSet->getMutableEntries();
deterministicRandom()->randomShuffle(mutableEntries);
// First make sure the current localitySet is able to fulfill the policy
AttribKey indexKey = logServerSet->keyIndex("zoneid");
int uniqueValueCount = logServerSet->getKeyValueArray()[indexKey._id].size();
@ -118,18 +117,24 @@ bool findBestPolicySetSimple(int targetUniqueValueCount, Reference<LocalitySet>&
}
ASSERT_WE_THINK(uniqueValueCount == entries.size());
std::vector<std::vector<int>> randomizedEntries;
randomizedEntries.resize(entries.size());
for(auto it : entries) {
randomizedEntries.push_back(it.second);
}
deterministicRandom()->randomShuffle(randomizedEntries);
desired = std::max(desired, targetUniqueValueCount);
auto it = entries.begin();
auto it = randomizedEntries.begin();
while (bestSet.size() < desired) {
if(it->second.size()) {
bestSet.push_back(mutableEntries[it->second.back()]);
it->second.pop_back();
if(it->size()) {
bestSet.push_back(mutableEntries[it->back()]);
it->pop_back();
}
++it;
if(it == entries.end()) {
it = entries.begin();
if(it == randomizedEntries.end()) {
it = randomizedEntries.begin();
}
}


@ -144,7 +144,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
{
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
cachedRangeInfo[k] = privatized;
}
if(k != allKeys.end) {
@ -161,7 +161,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
if(toCommit) {
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
//TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString());
toCommit->addTag( cacheTag );
toCommit->addTypedMessage(privatized);
}
@ -276,6 +276,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
allTags.insert(decodeServerTagValue(kv.value));
}
}
allTags.insert(cacheTag);
if (m.param1 == lastEpochEndKey) {
toCommit->addTags(allTags);
@ -494,14 +495,24 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
keyBegin = itr->first;
mutationBegin = itr->second;
++itr;
keyEnd = itr->first;
mutationEnd = itr->second;
if (itr != cachedRangeInfo.end()) {
keyEnd = itr->first;
mutationEnd = itr->second;
} else {
//TraceEvent(SevDebug, "EndKeyNotFound", dbgid).detail("KeyBegin", keyBegin.toString());
break;
}
} else {
keyEnd = itr->first;
mutationEnd = itr->second;
++itr;
keyBegin = itr->first;
mutationBegin = itr->second;
if (itr != cachedRangeInfo.end()) {
keyBegin = itr->first;
mutationBegin = itr->second;
} else {
//TraceEvent(SevDebug, "BeginKeyNotFound", dbgid).detail("KeyEnd", keyEnd.toString());
break;
}
}
// Now get all the storage server tags for the cached key-ranges


@ -115,7 +115,7 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
// ASSERT(info.logRouterTags == epochTags[rit->first]);
updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, adjustedBeginVersion, epoch);
break;
if (tags.empty()) break;
}
rit++;
}


@ -124,6 +124,7 @@ set(FDBSERVER_SRCS
workloads/BackupToDBUpgrade.actor.cpp
workloads/BulkLoad.actor.cpp
workloads/BulkSetup.actor.h
workloads/Cache.actor.cpp
workloads/ChangeConfig.actor.cpp
workloads/ClientTransactionProfileCorrectness.actor.cpp
workloads/TriggerRecovery.actor.cpp


@ -61,7 +61,6 @@ struct WorkerInfo : NonCopyable {
WorkerDetails details;
Future<Void> haltRatekeeper;
Future<Void> haltDistributor;
Optional<uint16_t> storageCacheInfo;
Standalone<VectorRef<StringRef>> issues;
WorkerInfo() : gen(-1), reboots(0), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
@ -70,7 +69,7 @@ struct WorkerInfo : NonCopyable {
WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen),
reboots(r.reboots), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)),
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo), issues(r.issues) {}
haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), issues(r.issues) {}
void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT {
watcher = std::move(r.watcher);
reply = std::move(r.reply);
@ -81,7 +80,6 @@ struct WorkerInfo : NonCopyable {
details = std::move(r.details);
haltRatekeeper = r.haltRatekeeper;
haltDistributor = r.haltDistributor;
storageCacheInfo = r.storageCacheInfo;
issues = r.issues;
}
};
@ -111,7 +109,6 @@ public:
Database db;
int unfinishedRecoveries;
int logGenerations;
std::map<uint16_t, std::pair<Optional<StorageServerInterface>, Optional<Key>>> cacheInterfaces;
bool cachePopulated;
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>> clientStatus;
@ -138,28 +135,6 @@ public:
serverInfo->set( newInfo );
}
void setStorageCache(uint16_t id, const StorageServerInterface& interf) {
auto newInfo = serverInfo->get();
bool found = false;
for(auto& it : newInfo.storageCaches) {
if(it.first == id) {
if(it.second != interf) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
it.second = interf;
}
found = true;
break;
}
}
if(!found) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.storageCaches.push_back(std::make_pair(id, interf));
}
serverInfo->set( newInfo );
}
void clearInterf(ProcessClass::ClassType t) {
auto newInfo = serverInfo->get();
newInfo.id = deterministicRandom()->randomUniqueID();
@ -172,18 +147,6 @@ public:
serverInfo->set( newInfo );
}
void clearStorageCache(uint16_t id) {
auto newInfo = serverInfo->get();
for(auto it = newInfo.storageCaches.begin(); it != newInfo.storageCaches.end(); ++it) {
if(it->first == id) {
newInfo.id = deterministicRandom()->randomUniqueID();
newInfo.infoGeneration = ++dbInfoCount;
newInfo.storageCaches.erase(it);
break;
}
}
serverInfo->set( newInfo );
}
};
struct UpdateWorkerList {
@ -365,7 +328,7 @@ public:
logServerMap->add(worker.interf.locality, &worker);
}
}
if (logServerSet->size() < (addingDegraded == 0 ? desired : required)) {
}
else if (logServerSet->size() == required || logServerSet->size() <= desired) {
@ -1441,7 +1404,6 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
dbInfo.clusterInterface = db->serverInfo->get().clusterInterface;
dbInfo.distributor = db->serverInfo->get().distributor;
dbInfo.ratekeeper = db->serverInfo->get().ratekeeper;
dbInfo.storageCaches = db->serverInfo->get().storageCaches;
dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig;
TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id);
@ -1496,7 +1458,7 @@ ACTOR Future<Void> clusterOpenDatabase(ClusterControllerData::DBInfo* db, OpenDa
if(db->clientStatus.size() > 10000) {
TraceEvent(SevWarnAlways, "TooManyClientStatusEntries").suppressFor(1.0);
}
while (db->clientInfo->get().id == req.knownClientInfoID) {
choose {
when (wait( db->clientInfo->onChange() )) {}
@ -1747,27 +1709,9 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
}
when( wait( failed ) ) { // remove workers that have failed
WorkerInfo& failedWorkerInfo = cluster->id_worker[ worker.locality.processId() ];
if(failedWorkerInfo.storageCacheInfo.present()) {
bool found = false;
for(auto& it : cluster->id_worker) {
if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
found = true;
it.second.storageCacheInfo = failedWorkerInfo.storageCacheInfo;
cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), it.first);
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, failedWorkerInfo.storageCacheInfo) );
}
break;
}
}
if(!found) {
cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
}
cluster->db.clearStorageCache(failedWorkerInfo.storageCacheInfo.get());
}
if (!failedWorkerInfo.reply.isSet()) {
failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo, Optional<uint16_t>()) );
failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo) );
}
if (worker.locality.processId() == cluster->masterProcessId) {
cluster->masterProcessId = Optional<Key>();
@ -2055,7 +1999,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) {
it.second.priorityInfo.isExcluded = isExcludedFromConfig;
if( !it.second.reply.isSet() ) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
}
}
}
@ -2228,56 +2172,10 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) {
}
}
}
Optional<uint16_t> newStorageCache = req.storageCacheInterf.present() ? req.storageCacheInterf.get().first : Optional<uint16_t>();
auto& cacheInfo = self->id_worker[w.locality.processId()].storageCacheInfo;
if (req.storageCacheInterf.present()) {
auto it = self->db.cacheInterfaces.find(req.storageCacheInterf.get().first);
if(it == self->db.cacheInterfaces.end()) {
if(self->db.cachePopulated) {
if(cacheInfo.present()) {
self->db.clearStorageCache(cacheInfo.get());
}
newStorageCache = Optional<uint16_t>();
cacheInfo = Optional<uint16_t>();
} else {
self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
self->db.cacheInterfaces[req.storageCacheInterf.get().first] = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
cacheInfo = req.storageCacheInterf.get().first;
}
} else {
if(!it->second.second.present() || (cacheInfo.present() && cacheInfo.get() == it->first) ) {
self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second);
it->second = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId());
cacheInfo = req.storageCacheInterf.get().first;
}
else {
if(cacheInfo.present()) {
self->db.clearStorageCache(cacheInfo.get());
}
newStorageCache = Optional<uint16_t>();
cacheInfo = Optional<uint16_t>();
}
}
} else {
newStorageCache = cacheInfo;
}
if(self->gotProcessClasses && newProcessClass == ProcessClass::StorageCacheClass && !newStorageCache.present()) {
for(auto& it : self->db.cacheInterfaces) {
if(!it.second.second.present()) {
it.second.second = w.locality.processId();
self->id_worker[w.locality.processId()].storageCacheInfo = it.first;
newStorageCache = it.first;
break;
}
}
}
// Notify the worker to register again with new process class/exclusive property
if ( !req.reply.isSet() && ( newPriorityInfo != req.priorityInfo ||
newStorageCache.present() != req.storageCacheInterf.present() ||
(newStorageCache.present() && newStorageCache.get() != req.storageCacheInterf.get().first) ) ) {
req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo, newStorageCache) );
if ( !req.reply.isSet() && newPriorityInfo != req.priorityInfo ) {
req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo) );
}
}
@ -2504,7 +2402,7 @@ ACTOR Future<Void> monitorProcessClasses(ClusterControllerData *self) {
w.second.details.processClass = newProcessClass;
w.second.priorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController);
if (!w.second.reply.isSet()) {
w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo, w.second.storageCacheInfo) );
w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo) );
}
}
}
@ -2558,81 +2456,7 @@ ACTOR Future<Void> monitorServerInfoConfig(ClusterControllerData::DBInfo* db) {
break;
}
catch (Error &e) {
wait(tr.onError(e));
}
}
}
}
ACTOR Future<Void> monitorStorageCache(ClusterControllerData* self) {
loop {
state ReadYourWritesTransaction tr(self->db.db);
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<Value> changeVal = wait(tr.get(cacheChangeKey));
Standalone<RangeResultRef> changeKeys = wait(tr.getRange(cacheChangeKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !changeKeys.more && changeKeys.size() < CLIENT_KNOBS->TOO_MANY );
std::set<uint16_t> changeIDs;
for(auto& it : changeKeys) {
changeIDs.insert(cacheChangeKeyDecodeIndex(it.key));
}
for(auto& it : changeIDs) {
if(!self->db.cacheInterfaces.count(it)) {
self->db.cacheInterfaces[it] = std::make_pair(Optional<StorageServerInterface>(), Optional<Key>());
}
}
std::vector<uint16_t> removeIDs;
for(auto& it : self->db.cacheInterfaces) {
if(!changeIDs.count(it.first)) {
removeIDs.push_back(it.first);
if(it.second.second.present()) {
self->id_worker[it.second.second.get()].storageCacheInfo = Optional<uint16_t>();
}
self->db.clearStorageCache(it.first);
}
}
for(auto& it : removeIDs) {
self->db.cacheInterfaces.erase(it);
}
for(auto& c : self->db.cacheInterfaces) {
if(!c.second.second.present()) {
bool found = false;
for(auto& it : self->id_worker) {
if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) {
found = true;
it.second.storageCacheInfo = c.first;
c.second.second = it.first;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, c.first) );
}
break;
}
}
if(!found) {
break;
}
}
}
state Future<Void> configChangeFuture = tr.watch(cacheChangeKey);
self->db.cachePopulated = true;
wait(tr.commit());
wait(configChangeFuture);
break;
}
catch (Error &e) {
wait(tr.onError(e));
wait(tr.onError(e));
}
}
}
@ -2688,7 +2512,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
if ( worker.priorityInfo.dcFitness > newFitness ) {
worker.priorityInfo.dcFitness = newFitness;
if(!worker.reply.isSet()) {
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
}
} else {
state int currentFit = ProcessClass::BestFit;
@ -2701,7 +2525,7 @@ ACTOR Future<Void> updatedChangingDatacenters(ClusterControllerData *self) {
updated = true;
it.second.priorityInfo.dcFitness = fitness;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
}
}
}
@ -2740,7 +2564,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
if( worker.priorityInfo.dcFitness != newFitness ) {
worker.priorityInfo.dcFitness = newFitness;
if(!worker.reply.isSet()) {
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) );
worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) );
}
}
} else {
@ -2754,7 +2578,7 @@ ACTOR Future<Void> updatedChangedDatacenters(ClusterControllerData *self) {
updated = true;
it.second.priorityInfo.dcFitness = fitness;
if(!it.second.reply.isSet()) {
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) );
it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) );
}
}
}
@ -2908,7 +2732,7 @@ ACTOR Future<DataDistributorInterface> startDataDistributor( ClusterControllerDa
if (self->onMasterIsBetter(worker, ProcessClass::DataDistributor)) {
worker = self->id_worker[self->masterProcessId.get()].details;
}
InitializeDataDistributorRequest req(deterministicRandom()->randomUniqueID());
TraceEvent("CCDataDistributorRecruit", self->id).detail("Addr", worker.interf.address());
@ -3091,7 +2915,6 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
self.addActor.send( handleForcedRecoveries(&self, interf) );
self.addActor.send( monitorDataDistributor(&self) );
self.addActor.send( monitorRatekeeper(&self) );
self.addActor.send( monitorStorageCache(&self) );
self.addActor.send( dbInfoUpdater(&self) );
self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") );
self.addActor.send( traceRole(Role::CLUSTER_CONTROLLER, interf.id()) );


@ -20,6 +20,10 @@
#include <set>
#include <sstream>
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -35,9 +39,11 @@
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WaitFailure.h"
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/serialize.h"
class TCTeamInfo;
struct TCMachineInfo;
@ -4853,6 +4859,56 @@ ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest
return Void();
}
ACTOR Future<Void> waitFailCacheServer(Database* db, StorageServerInterface ssi) {
state Transaction tr(*db);
state Key key = storageCacheServerKey(ssi.id());
wait(waitFailureClient(ssi.waitFailure));
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr.addReadConflictRange(storageCacheServerKeys);
tr.clear(key);
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
return Void();
}
ACTOR Future<Void> cacheServerWatcher(Database* db) {
state Transaction tr(*db);
state ActorCollection actors(false);
state std::set<UID> knownCaches;
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
Standalone<RangeResultRef> range = wait(tr.getRange(storageCacheServerKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!range.more);
std::set<UID> caches;
for (auto& kv : range) {
UID id;
BinaryReader reader{kv.key.removePrefix(storageCacheServersPrefix), Unversioned()};
reader >> id;
caches.insert(id);
if (knownCaches.find(id) == knownCaches.end()) {
StorageServerInterface ssi;
BinaryReader reader{kv.value, IncludeVersion()};
reader >> ssi;
actors.add(waitFailCacheServer(db, ssi));
}
}
knownCaches = std::move(caches);
tr.reset();
wait(delay(5.0) || actors.getResult());
ASSERT(!actors.getResult().isReady());
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<struct ServerDBInfo>> db ) {
state Reference<DataDistributorData> self( new DataDistributorData(db, di.id()) );
state Future<Void> collection = actorCollection( self->addActor.getFuture() );
@ -4865,6 +4921,7 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
try {
TraceEvent("DataDistributorRunning", di.id());
self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) );
self->addActor.send(cacheServerWatcher(&cx));
state Future<Void> distributor = reportErrorsExcept( dataDistribution(self, getShardMetricsList), "DataDistribution", di.id(), &normalDataDistributorErrors() );
loop choose {


@ -1013,7 +1013,7 @@ private:
ASSERT( nextPageSeq%sizeof(Page)==0 );
auto& p = backPage();
memset(&p, 0, sizeof(Page)); // FIXME: unnecessary?
memset(static_cast<void*>(&p), 0, sizeof(Page)); // FIXME: unnecessary?
p.magic = 0xFDB;
switch (diskQueueVersion) {
case DiskQueueVersion::V0:


@ -629,6 +629,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
init( REDWOOD_COMMIT_CONCURRENT_READS, 64 );
init( REDWOOD_PAGE_REBUILD_FILL_FACTOR, 0.66 );
init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
init( REDWOOD_LAZY_CLEAR_MIN_PAGES, 0 );


@ -561,6 +561,7 @@ public:
int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
int REDWOOD_COMMIT_CONCURRENT_READS; // Max number of concurrent reads done to support commit operations
double REDWOOD_PAGE_REBUILD_FILL_FACTOR; // When rebuilding pages, start a new page after this capacity
int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at once
int REDWOOD_LAZY_CLEAR_MIN_PAGES; // Minimum number of pages to free before ending a lazy clear cycle, unless the queue is empty
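// Example with the defaults above: REDWOOD_PAGE_REBUILD_FILL_FACTOR = 0.66 with a
// REDWOOD_DEFAULT_PAGE_SIZE of 4096 means a rebuilt page is cut off once roughly
// 0.66 * 4096 ≈ 2700 bytes of its capacity are used, presumably leaving slack for
// later updates before the page must be rebuilt again.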


@ -85,6 +85,10 @@ struct LogRouterData {
bool allowPops;
LogSet logSet;
bool foundEpochEnd;
double waitForVersionTime = 0;
double maxWaitForVersionTime = 0;
double getMoreTime = 0;
double maxGetMoreTime = 0;
struct PeekTrackerData {
std::map<int, Promise<std::pair<Version, bool>>> sequence_version;
@ -94,6 +98,7 @@ struct LogRouterData {
std::map<UID, PeekTrackerData> peekTracker;
CounterCollection cc;
Counter getMoreCount, getMoreBlockedCount;
Future<Void> logger;
Reference<EventCacheHolder> eventCacheHolder;
@ -116,7 +121,7 @@ struct LogRouterData {
LogRouterData(UID dbgid, const InitializeLogRouterRequest& req) : dbgid(dbgid), routerTag(req.routerTag), logSystem(new AsyncVar<Reference<ILogSystem>>()),
version(req.startVersion-1), minPopped(0), startVersion(req.startVersion), allowPops(false), minKnownCommittedVersion(0), poppedVersion(0), foundEpochEnd(false),
cc("LogRouter", dbgid.toString()) {
cc("LogRouter", dbgid.toString()), getMoreCount("GetMoreCount", cc), getMoreBlockedCount("GetMoreBlockedCount", cc) {
//setup just enough of a logSet to be able to call getPushLocations
logSet.logServers.resize(req.tLogLocalities.size());
logSet.tLogPolicy = req.tLogPolicy;
@ -133,11 +138,16 @@ struct LogRouterData {
eventCacheHolder = Reference<EventCacheHolder>( new EventCacheHolder(dbgid.shortString() + ".PeekLocation") );
specialCounter(cc, "Version", [this](){return this->version.get(); });
specialCounter(cc, "MinPopped", [this](){return this->minPopped.get(); });
specialCounter(cc, "Version", [this](){ return this->version.get(); });
specialCounter(cc, "MinPopped", [this](){ return this->minPopped.get(); });
specialCounter(cc, "FetchedVersions", [this](){ return std::max<Version>(0, std::min<Version>(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS, this->version.get() - this->minPopped.get())); });
specialCounter(cc, "MinKnownCommittedVersion", [this](){ return this->minKnownCommittedVersion; });
specialCounter(cc, "PoppedVersion", [this](){ return this->poppedVersion; });
specialCounter(cc, "FoundEpochEnd", [this](){ return this->foundEpochEnd; });
specialCounter(cc, "WaitForVersionMS", [this](){ double val = this->waitForVersionTime; this->waitForVersionTime = 0; return 1000*val; });
specialCounter(cc, "WaitForVersionMaxMS", [this](){ double val = this->maxWaitForVersionTime; this->maxWaitForVersionTime = 0; return 1000*val; });
specialCounter(cc, "GetMoreMS", [this](){ double val = this->getMoreTime; this->getMoreTime = 0; return 1000*val; });
specialCounter(cc, "GetMoreMaxMS", [this](){ double val = this->maxGetMoreTime; this->maxGetMoreTime = 0; return 1000*val; });
logger = traceCounters("LogRouterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LogRouterMetrics");
}
};
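
The four new timing members together with the WaitForVersionMS/GetMoreMS special counters implement a reset-on-read accumulator: each sample adds to a running total and updates a running maximum, and the metrics logger reports and zeroes both once per interval, converting seconds to milliseconds. A minimal standalone sketch of that pattern (plain C++, names are mine, not the LogRouter code itself):

#include <algorithm>

// Sketch of the reset-on-read accumulator behind WaitForVersionMS / GetMoreMS.
struct ResetOnReadTimer {
	double total = 0; // seconds accumulated since the last report
	double peak = 0;  // largest single sample since the last report

	void add(double seconds) {
		total += seconds;
		peak = std::max(peak, seconds);
	}
	// Called once per logging interval, mirroring the specialCounter lambdas.
	double takeTotalMS() { double v = total; total = 0; return 1000 * v; }
	double takePeakMS()  { double v = peak;  peak = 0;  return 1000 * v; }
};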
@ -195,11 +205,14 @@ void commitMessages( LogRouterData* self, Version version, const std::vector<Tag
ACTOR Future<Void> waitForVersion( LogRouterData *self, Version ver ) {
// The only time the log router should allow a gap in versions larger than MAX_READ_TRANSACTION_LIFE_VERSIONS is when processing epoch end.
// Since one set of log routers is created per generation of transaction logs, the gap caused by epoch end will be within MAX_VERSIONS_IN_FLIGHT of the log routers start version.
state double startTime = now();
if(self->version.get() < self->startVersion) {
if(ver > self->startVersion) {
self->version.set(self->startVersion);
wait(self->minPopped.whenAtLeast(self->version.get()));
}
self->waitForVersionTime += now() - startTime;
self->maxWaitForVersionTime = std::max(self->maxWaitForVersionTime, now() - startTime);
return Void();
}
if(!self->foundEpochEnd) {
@ -217,6 +230,8 @@ ACTOR Future<Void> waitForVersion( LogRouterData *self, Version ver ) {
if(ver >= self->startVersion + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT) {
self->foundEpochEnd = true;
}
self->waitForVersionTime += now() - startTime;
self->maxWaitForVersionTime = std::max(self->maxWaitForVersionTime, now() - startTime);
return Void();
}
@ -229,8 +244,19 @@ ACTOR Future<Void> pullAsyncData( LogRouterData *self ) {
loop {
loop {
Future<Void> getMoreF = Never();
if(r) {
getMoreF = r->getMore(TaskPriority::TLogCommit);
++self->getMoreCount;
if(!getMoreF.isReady()) {
++self->getMoreBlockedCount;
}
}
state double startTime = now();
choose {
when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) {
when(wait( getMoreF ) ) {
self->getMoreTime += now() - startTime;
self->maxGetMoreTime = std::max(self->maxGetMoreTime, now() - startTime);
break;
}
when( wait( dbInfoChange ) ) { //FIXME: does this actually happen?

View File

@ -738,10 +738,10 @@ void ILogSystem::SetPeekCursor::advanceTo(LogMessageVersion n) {
ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) {
loop {
//TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
//TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag.toString());
if(self->bestServer >= 0 && self->bestSet >= 0 && self->serverCursors[self->bestSet][self->bestServer]->isActive()) {
ASSERT(!self->serverCursors[self->bestSet][self->bestServer]->hasMessage());
//TraceEvent("LPC_GetMore2", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
//TraceEvent("LPC_GetMore2", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag.toString());
wait( self->serverCursors[self->bestSet][self->bestServer]->getMore(taskID) || self->serverCursors[self->bestSet][self->bestServer]->onFailed() );
self->useBestSet = true;
} else {
@ -778,7 +778,7 @@ ACTOR Future<Void> setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer
} else {
//FIXME: this will peek way too many cursors when satellites exist, and does not need to peek bestSet cursors since we cannot get any more data from them
vector<Future<Void>> q;
//TraceEvent("LPC_GetMore4", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag);
//TraceEvent("LPC_GetMore4", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag.toString());
for(auto& cursors : self->serverCursors) {
for (auto& c :cursors) {
if (!c->hasMessage()) {

View File

@ -25,7 +25,10 @@
#include "fdbrpc/fdbrpc.h"
#pragma once
#include "fdbrpc/Locality.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/CommitTransaction.h"
struct ResolverInterface {
constexpr static FileIdentifier file_identifier = 1755944;

View File

@ -83,6 +83,7 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
updateProcessStats(self);
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
when(wait(actors.getResult())) {}
when(wait(exitRole)) {
TraceEvent("RestoreApplierCoreExitRole", self->id());
break;
@ -92,6 +93,7 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
TraceEvent(SevWarn, "FastRestoreApplierError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
actors.clear(false);
break;
}
}
@ -179,7 +181,6 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
for (auto& range : ranges) {
@ -216,47 +217,50 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys, double delayTime, Database cx,
UID applierID, int batchIndex) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state std::vector<Future<Optional<Value>>> fValues;
state std::vector<Future<Optional<Value>>> fValues(incompleteStagingKeys.size(), Never());
state int retries = 0;
state UID randomID = deterministicRandom()->randomUniqueID();
wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysStart", applierID)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
int i = 0;
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
for (auto& key : incompleteStagingKeys) {
fValues.push_back(tr->get(key.first));
fValues[i++] = tr->get(key.first);
}
wait(waitForAll(fValues));
break;
} catch (Error& e) {
if (retries++ > 10) { // TODO: Can we stop retry at the first error?
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck", applierID)
if (retries++ > incompleteStagingKeys.size()) {
TraceEvent(SevWarnAlways, "GetAndComputeStagingKeys", applierID)
.suppressFor(1.0)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.error(e);
break;
}
wait(tr->onError(e));
fValues.clear();
}
}
ASSERT(fValues.size() == incompleteStagingKeys.size());
int i = 0;
for (auto& key : incompleteStagingKeys) {
if (!fValues[i].get().present()) { // Debug info to understand which key does not exist in DB
if (!fValues[i].get().present()) { // Key does not exist in DB
// if condition: fValues[i].Valid() && fValues[i].isReady() && !fValues[i].isError() &&
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB", applierID)
.detail("BatchIndex", batchIndex)
.detail("Key", key.first)
.detail("Reason", "Not found in DB")
.detail("IsReady", fValues[i].isReady())
.detail("PendingMutations", key.second->second.pendingMutations.size())
.detail("StagingKeyType", (int)key.second->second.type);
.detail("StagingKeyType", getTypeString(key.second->second.type));
for (auto& vm : key.second->second.pendingMutations) {
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
.detail("PendingMutationVersion", vm.first.toString())
@ -274,8 +278,10 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
}
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysDone", applierID)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size());
.detail("GetKeys", incompleteStagingKeys.size())
.detail("DelayTime", delayTime);
return Void();
}
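
The change above pre-sizes fValues with Never() placeholders and assigns each slot by index, so a transaction retry overwrites prior futures rather than appending duplicates. A minimal flow-style sketch of that retry shape, using generic names rather than the restore code's own:

ACTOR Future<Void> readBatch(Reference<ReadYourWritesTransaction> tr, std::vector<Key> keys) {
	// One slot per key; every retry overwrites the same slots.
	state std::vector<Future<Optional<Value>>> futures(keys.size(), Never());
	loop {
		try {
			for (int i = 0; i < (int)keys.size(); i++) {
				futures[i] = tr->get(keys[i]);
			}
			wait(waitForAll(futures));
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e)); // resets the transaction before the next attempt
		}
	}
}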
@ -404,7 +410,6 @@ ACTOR static Future<Void> applyStagingKeysBatch(std::map<Key, StagingKey>::itera
TraceEvent("FastRestoreApplierPhaseApplyStagingKeysBatch", applierID).detail("Begin", begin->first);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
std::map<Key, StagingKey>::iterator iter = begin;
@ -502,6 +507,7 @@ ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req,
.detail("FinishedBatch", self->finishedBatch.get());
// Ensure batch (i-1) is applied before batch i
// TODO: Add a counter to warn when too many requests are waiting on the actor
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
state bool isDuplicated = true;
@ -523,6 +529,8 @@ ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req,
}
ASSERT(batchData->dbApplier.present());
ASSERT(!batchData->dbApplier.get().isError()); // writeMutationsToDB actor cannot have error.
// We cannot blindly retry because it is not idempotent
wait(batchData->dbApplier.get());
@ -578,4 +586,4 @@ Value applyAtomicOp(Optional<StringRef> existingValue, Value value, MutationRef:
ASSERT(false);
}
return Value();
}
}

View File

@ -123,7 +123,8 @@ struct StagingKey {
.detail("Value", val)
.detail("MType", type < MutationRef::MAX_ATOMIC_OP ? getTypeString(type) : "[Unset]")
.detail("LargestPendingVersion",
(pendingMutations.empty() ? "[none]" : pendingMutations.rbegin()->first.toString()));
(pendingMutations.empty() ? "[none]" : pendingMutations.rbegin()->first.toString()))
.detail("PendingMutations", pendingMutations.size());
std::map<LogMessageVersion, Standalone<MutationRef>>::iterator lb = pendingMutations.lower_bound(version);
if (lb == pendingMutations.end()) {
return;

View File

@ -122,7 +122,7 @@ Future<Void> RestoreConfigFR::logError(Database cx, Error e, std::string const&
}
TraceEvent t(SevWarn, "FileRestoreError");
t.error(e).detail("RestoreUID", uid).detail("Description", details).detail("TaskInstance", (uint64_t)taskInstance);
// These should not happen
// key_not_found could happen
if (e.code() == error_code_key_not_found) t.backtrace();
return updateErrorInfo(cx, e, details);

View File

@ -111,13 +111,17 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
updateProcessStats(self);
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
when(wait(actors.getResult())) {}
when(wait(exitRole)) {
TraceEvent("FastRestoreLoaderCoreExitRole", self->id());
break;
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestoreLoader", self->id()).detail("RequestType", requestTypeStr).error(e, true);
TraceEvent(SevWarn, "FastRestoreLoaderError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
actors.clear(false);
break;
}
}

View File

@ -198,17 +198,25 @@ ACTOR Future<Void> startProcessRestoreRequests(Reference<RestoreMasterData> self
TraceEvent("FastRestoreMasterWaitOnRestoreRequests", self->id()).detail("RestoreRequests", restoreRequests.size());
// DB has been locked where restore request is submitted
wait(clearDB(cx));
// TODO: Sanity check restoreRequests' key ranges do not overlap
// Step: Perform the restore requests
try {
for (restoreIndex = 0; restoreIndex < restoreRequests.size(); restoreIndex++) {
RestoreRequest& request = restoreRequests[restoreIndex];
state RestoreRequest request = restoreRequests[restoreIndex];
TraceEvent("FastRestoreMasterProcessRestoreRequests", self->id())
.detail("RestoreRequestInfo", request.toString());
// TODO: Initialize MasterData and all loaders and appliers' data for each restore request!
self->resetPerRestoreRequest();
// clear the key range that will be restored
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->clear(request.range);
return Void();
}));
wait(success(processRestoreRequest(self, cx, request)));
wait(notifyRestoreCompleted(self, false));
}
@ -637,7 +645,6 @@ ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequest
loop {
try {
TraceEvent("FastRestoreMasterPhaseCollectRestoreRequestsWait");
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
@ -866,6 +873,7 @@ ACTOR static Future<Void> notifyApplierToApplyMutations(Reference<MasterBatchDat
}
ASSERT(batchData->applyToDB.present());
ASSERT(!batchData->applyToDB.get().isError());
wait(batchData->applyToDB.get());
// Sanity check all appliers have applied data to destination DB
@ -943,7 +951,7 @@ ACTOR static Future<Void> notifyRestoreCompleted(Reference<RestoreMasterData> se
ACTOR static Future<Void> signalRestoreCompleted(Reference<RestoreMasterData> self, Database cx) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
wait(notifyRestoreCompleted(self, true));
wait(notifyRestoreCompleted(self, true)); // notify workers the restore has completed
wait(delay(5.0)); // Give some time for loaders and appliers to exit

View File

@ -249,7 +249,6 @@ ACTOR Future<Void> startRestoreWorker(Reference<RestoreWorkerData> self, Restore
}
ACTOR static Future<Void> waitOnRestoreRequests(Database cx, UID nodeID = UID()) {
state Future<Void> watch4RestoreRequest;
state ReadYourWritesTransaction tr(cx);
state Optional<Value> numRequests;
@ -263,10 +262,10 @@ ACTOR static Future<Void> waitOnRestoreRequests(Database cx, UID nodeID = UID())
Optional<Value> _numRequests = wait(tr.get(restoreRequestTriggerKey));
numRequests = _numRequests;
if (!numRequests.present()) {
watch4RestoreRequest = tr.watch(restoreRequestTriggerKey);
state Future<Void> watchForRestoreRequest = tr.watch(restoreRequestTriggerKey);
wait(tr.commit());
TraceEvent(SevInfo, "FastRestoreWaitOnRestoreRequestTriggerKey", nodeID);
wait(watch4RestoreRequest);
wait(watchForRestoreRequest);
TraceEvent(SevInfo, "FastRestoreDetectRestoreRequestTriggerKeyChanged", nodeID);
} else {
TraceEvent(SevInfo, "FastRestoreRestoreRequestTriggerKey", nodeID)

View File

@ -55,7 +55,6 @@ struct ServerDBInfo {
LogSystemConfig logSystemConfig;
std::vector<UID> priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails
Optional<LatencyBandConfig> latencyBandConfig;
std::vector<std::pair<uint16_t,StorageServerInterface>> storageCaches;
int64_t infoGeneration;
ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED), logSystemConfig(0), infoGeneration(0) {}
@ -65,7 +64,7 @@ struct ServerDBInfo {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches, infoGeneration);
serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, infoGeneration);
}
};

View File

@ -1220,9 +1220,9 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
//FIXME: temporary code to test storage cache
//TODO: caching disabled for this merge
//if(dc==0) {
// machines++;
//}
if(dc==0) {
machines++;
}
int useSeedForMachine = deterministicRandom()->randomInt(0, machines);
Standalone<StringRef> zoneId;
@ -1249,10 +1249,10 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors, std::string baseFo
//FIXME: temporary code to test storage cache
//TODO: caching disabled for this merge
//if(machine==machines-1 && dc==0) {
// processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource);
// nonVersatileMachines++;
//}
if(machine==machines-1 && dc==0) {
processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource);
nonVersatileMachines++;
}
std::vector<IPAddress> ips;
for (int i = 0; i < processesPerMachine; i++) {

File diff suppressed because it is too large

View File

@ -28,6 +28,7 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/FDBTypes.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/IKeyValueStore.h"
@ -1677,7 +1678,10 @@ ACTOR Future<Void> tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere
reply.end = endVersion;
reply.onlySpilled = onlySpilled;
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("Tag", req.tag.toString()).
// detail("BeginVer", req.begin).detail("EndVer", reply.end).
// detail("MsgBytes", reply.messages.expectedSize()).
// detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
if(req.sequence.present()) {
auto& trackerData = logData->peekTracker[peekId];

View File

@ -28,6 +28,7 @@
#include "fdbrpc/Replication.h"
#include "fdbrpc/ReplicationUtils.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/LogProtocolMessage.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Version> minVersionWhenReady(Future<Void> f, std::vector<Future<TLogCommitReply>> replies) {
@ -690,10 +691,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
Reference<IPeekCursor> peekLocal( UID dbgid, Tag tag, Version begin, Version end, bool useMergePeekCursors, int8_t peekLocality = tagLocalityInvalid ) {
if(tag.locality >= 0 || tag.locality == tagLocalityUpgraded) {
if(tag.locality >= 0 || tag.locality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial) {
peekLocality = tag.locality;
}
ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded);
ASSERT(peekLocality >= 0 || peekLocality == tagLocalityUpgraded || tag.locality == tagLocalitySpecial);
int bestSet = -1;
bool foundSpecial = false;
@ -702,7 +703,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
if(tLogs[t]->logServers.size() && tLogs[t]->locality != tagLocalitySatellite) {
logCount++;
}
if(tLogs[t]->logServers.size() && (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded)) {
if(tLogs[t]->logServers.size() && (tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded || tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded || peekLocality == tagLocalitySpecial)) {
if( tLogs[t]->locality == tagLocalitySpecial || tLogs[t]->locality == tagLocalityUpgraded ) {
foundSpecial = true;
}
@ -757,7 +758,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
if(oldLogData[i].tLogs[t]->logServers.size() && oldLogData[i].tLogs[t]->locality != tagLocalitySatellite) {
logCount++;
}
if(oldLogData[i].tLogs[t]->logServers.size() && (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || oldLogData[i].tLogs[t]->locality == peekLocality || peekLocality == tagLocalityUpgraded)) {
if(oldLogData[i].tLogs[t]->logServers.size() && (oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded || oldLogData[i].tLogs[t]->locality == peekLocality ||
peekLocality == tagLocalityUpgraded || peekLocality == tagLocalitySpecial)) {
if( oldLogData[i].tLogs[t]->locality == tagLocalitySpecial || oldLogData[i].tLogs[t]->locality == tagLocalityUpgraded ) {
nextFoundSpecial = true;
}
@ -783,8 +785,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
Version thisBegin = std::max(oldLogData[i].tLogs[bestOldSet]->startVersion, begin);
if(thisBegin < lastBegin) {
if(thisBegin < end) {
TraceEvent("TLogPeekLocalAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end)
TraceEvent("TLogPeekLocalAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).detail("BestOldSet", bestOldSet)
.detail("LogServers", oldLogData[i].tLogs[bestOldSet]->logServerString()).detail("ThisBegin", thisBegin).detail("LastBegin", lastBegin);
//detail("LogId", oldLogData[i].tLogs[bestOldSet]->logServers[tLogs[bestOldSet]->bestLocationFor( tag )]->get().id());
cursors.emplace_back(new ILogSystem::MergedPeekCursor( oldLogData[i].tLogs[bestOldSet]->logServers, oldLogData[i].tLogs[bestOldSet]->bestLocationFor( tag ), oldLogData[i].tLogs[bestOldSet]->logServers.size() + 1 - oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor, tag,
thisBegin, std::min(lastBegin, end), useMergePeekCursors, oldLogData[i].tLogs[bestOldSet]->tLogLocalities, oldLogData[i].tLogs[bestOldSet]->tLogPolicy, oldLogData[i].tLogs[bestOldSet]->tLogReplicationFactor));
epochEnds.emplace_back(std::min(lastBegin, end));

View File

@ -531,7 +531,7 @@ static int asyncOpen(
if (flags & SQLITE_OPEN_WAL) oflags |= IAsyncFile::OPEN_LARGE_PAGES;
oflags |= IAsyncFile::OPEN_LOCK;
memset(p, 0, sizeof(VFSAsyncFile));
memset(static_cast<void*>(p), 0, sizeof(VFSAsyncFile));
new (p) VFSAsyncFile(zName, flags);
try {
// Note that SQLiteDB::open also opens the db file, so its flags and modes are important, too

View File

@ -2204,6 +2204,7 @@ struct SplitStringRef {
// A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together.
// NOTE: Uses host byte order
typedef VectorRef<LogicalPageID> BTreePageIDRef;
constexpr LogicalPageID maxPageID = (LogicalPageID)-1;
std::string toString(BTreePageIDRef id) {
return std::string("BTreePageID") + toString(id.begin(), id.end());
@ -2246,6 +2247,10 @@ struct RedwoodRecordRef {
inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key, version); }
inline RedwoodRecordRef withMaxPageID() const {
return RedwoodRecordRef(key, version, StringRef((uint8_t *)&maxPageID, sizeof(maxPageID)));
}
// Truncate (key, version, part) tuple to len bytes.
void truncate(int len) {
ASSERT(len <= key.size());
@ -2988,7 +2993,8 @@ public:
VersionedBTree(IPager2* pager, std::string name)
: m_pager(pager), m_writeVersion(invalidVersion), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr),
m_name(name) {
m_commitReadLock(SERVER_KNOBS->REDWOOD_COMMIT_CONCURRENT_READS), m_name(name) {
m_lazyClearActor = 0;
m_init = init_impl(this);
m_latestCommit = m_init;
@ -3435,6 +3441,7 @@ private:
Version m_writeVersion;
Version m_lastCommittedVersion;
Version m_newOldestVersion;
FlowLock m_commitReadLock;
Future<Void> m_latestCommit;
Future<Void> m_init;
std::string m_name;
@ -3872,7 +3879,7 @@ private:
// If the decode upper boundary is the subtree upper boundary the pointers will be the same
// For the lower boundary, if the pointers are not the same there is still a possibility
// that the keys are the same. This happens for the first remaining subtree of an internal page
// after the previous first subtree was cleared.
// after the prior subtree(s) were cleared.
return (decodeUpperBound == subtreeUpperBound) &&
(decodeLowerBound == subtreeLowerBound || decodeLowerBound->sameExceptValue(*subtreeLowerBound));
}
@ -4126,8 +4133,13 @@ private:
}
state Version writeVersion = self->getLastCommittedVersion() + 1;
wait(self->m_commitReadLock.take());
state FlowLock::Releaser readLock(self->m_commitReadLock);
state Reference<const IPage> page =
wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound));
readLock.release();
state BTreePage* btPage = (BTreePage*)page->begin();
ASSERT(isLeaf == btPage->isLeaf());
g_redwoodMetrics.level(btPage->height).pageCommitStart += 1;
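
The take()/Releaser pair added above bounds how many page reads the commit path issues at once (REDWOOD_COMMIT_CONCURRENT_READS), and the explicit release() right after readPage() returns frees the slot before the CPU-side work on the page begins. A rough standard-C++ analogue of that throttling pattern (a counting semaphore standing in for FlowLock; C++20, illustrative only):

#include <semaphore>

// Stand-in for FlowLock: at most 64 page reads in flight at a time.
std::counting_semaphore<> commitReads{ 64 };

void readPageThrottled() {
	commitReads.acquire();   // wait(m_commitReadLock.take())
	// ... issue and wait for the disk read ...
	commitReads.release();   // readLock.release() as soon as the read completes
	// ... decode and process the page without holding a slot ...
}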
@ -4984,6 +4996,246 @@ public:
Future<bool> moveLast() { return move_end(this, false); }
};
// Cursor designed for short lifespans.
// Holds references to all pages touched.
// All record references returned from it are valid until the cursor is destroyed.
class BTreeCursor {
Arena arena;
Reference<IPagerSnapshot> pager;
std::unordered_map<LogicalPageID, Reference<const IPage>> pages;
VersionedBTree* btree;
bool valid;
struct PathEntry {
BTreePage* btPage;
BTreePage::BinaryTree::Cursor cursor;
};
VectorRef<PathEntry> path;
public:
BTreeCursor() {}
bool isValid() const { return valid; }
std::string toString() const {
std::string r;
for (int i = 0; i < path.size(); ++i) {
r += format("[%d/%d: %s] ", i + 1, path.size(),
path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage->isLeaf()).c_str()
: "<invalid>");
}
if (!valid) {
r += " (invalid) ";
}
return r;
}
const RedwoodRecordRef& get() { return path.back().cursor.get(); }
bool inRoot() const { return path.size() == 1; }
// Pop and return the page cursor at the end of the path.
// This is meant to enable range scans to consume the contents of a leaf page more efficiently.
// Can only be used when inRoot() is true.
BTreePage::BinaryTree::Cursor popPath() {
BTreePage::BinaryTree::Cursor c = path.back().cursor;
path.pop_back();
return c;
}
Future<Void> pushPage(BTreePageIDRef id, const RedwoodRecordRef& lowerBound,
const RedwoodRecordRef& upperBound) {
Reference<const IPage>& page = pages[id.front()];
if (page.isValid()) {
path.push_back(arena, { (BTreePage*)page->begin(), getCursor(page) });
return Void();
}
return map(readPage(pager, id, &lowerBound, &upperBound), [this, &page, id](Reference<const IPage> p) {
page = p;
path.push_back(arena, { (BTreePage*)p->begin(), getCursor(p) });
return Void();
});
}
Future<Void> pushPage(BTreePage::BinaryTree::Cursor c) {
const RedwoodRecordRef& rec = c.get();
auto next = c;
next.moveNext();
BTreePageIDRef id = rec.getChildPage();
return pushPage(id, rec, next.getOrUpperBound());
}
Future<Void> init(VersionedBTree* btree_in, Reference<IPagerSnapshot> pager_in, BTreePageIDRef root) {
btree = btree_in;
pager = pager_in;
path.reserve(arena, 6);
valid = false;
return pushPage(root, dbBegin, dbEnd);
}
// Seeks cursor to query if it exists, the record before or after it, or an undefined and invalid
// position between those records
// If 0 is returned, then
// If the cursor is valid then it points to query
// If the cursor is not valid then the cursor points to some place in the btree such that
// If there is a record in the tree < query then movePrev() will move to it, and
// If there is a record in the tree > query then moveNext() will move to it.
// If non-zero is returned then the cursor is valid and the return value is logically equivalent
// to query.compare(cursor.get())
ACTOR Future<int> seek_impl(BTreeCursor* self, RedwoodRecordRef query, int prefetchBytes) {
state RedwoodRecordRef internalPageQuery = query.withMaxPageID();
self->path = self->path.slice(0, 1);
debug_printf("seek(%s, %d) start cursor = %s\n", query.toString().c_str(), prefetchBytes,
self->toString().c_str());
loop {
auto& entry = self->path.back();
if (entry.btPage->isLeaf()) {
int cmp = entry.cursor.seek(query);
self->valid = entry.cursor.valid() && !entry.cursor.node->isDeleted();
debug_printf("seek(%s, %d) loop exit cmp=%d cursor=%s\n", query.toString().c_str(), prefetchBytes,
cmp, self->toString().c_str());
return self->valid ? cmp : 0;
}
// Internal page, so seek to the branch where query must be
// Currently, after a subtree deletion internal page boundaries are still strictly adhered
// to and will be updated if anything is inserted into the cleared range, so if the seek fails
// or it finds an entry with a null child page then query does not exist in the BTree.
if (entry.cursor.seekLessThan(internalPageQuery) && entry.cursor.get().value.present()) {
debug_printf("seek(%s, %d) loop seek success cursor=%s\n", query.toString().c_str(), prefetchBytes,
self->toString().c_str());
Future<Void> f = self->pushPage(entry.cursor);
// Prefetch siblings, at least prefetchBytes, at level 2 but without jumping to another level 2
// sibling
if (prefetchBytes != 0 && entry.btPage->height == 2) {
auto c = entry.cursor;
bool fwd = prefetchBytes > 0;
prefetchBytes = abs(prefetchBytes);
// While we should still preload more bytes and a move in the target direction is successful
while (prefetchBytes > 0 && (fwd ? c.moveNext() : c.movePrev())) {
// If there is a page link, preload it.
if (c.get().value.present()) {
BTreePageIDRef childPage = c.get().getChildPage();
preLoadPage(self->pager.getPtr(), childPage);
prefetchBytes -= self->btree->m_blockSize * childPage.size();
}
}
}
wait(f);
} else {
self->valid = false;
debug_printf("seek(%s, %d) loop exit cmp=0 cursor=%s\n", query.toString().c_str(), prefetchBytes,
self->toString().c_str());
return 0;
}
}
}
Future<int> seek(RedwoodRecordRef query, int prefetchBytes) { return seek_impl(this, query, prefetchBytes); }
ACTOR Future<Void> seekGTE_impl(BTreeCursor* self, RedwoodRecordRef query, int prefetchBytes) {
debug_printf("seekGTE(%s, %d) start\n", query.toString().c_str(), prefetchBytes);
int cmp = wait(self->seek(query, prefetchBytes));
if (cmp > 0 || (cmp == 0 && !self->isValid())) {
wait(self->moveNext());
}
return Void();
}
Future<Void> seekGTE(RedwoodRecordRef query, int prefetchBytes) {
return seekGTE_impl(this, query, prefetchBytes);
}
ACTOR Future<Void> seekLT_impl(BTreeCursor* self, RedwoodRecordRef query, int prefetchBytes) {
debug_printf("seekLT(%s, %d) start\n", query.toString().c_str(), prefetchBytes);
int cmp = wait(self->seek(query, prefetchBytes));
if (cmp <= 0) {
wait(self->movePrev());
}
return Void();
}
Future<Void> seekLT(RedwoodRecordRef query, int prefetchBytes) {
return seekLT_impl(this, query, -prefetchBytes);
}
ACTOR Future<Void> move_impl(BTreeCursor* self, bool forward) {
// Try to move the cursor at the end of the path in the correct direction
debug_printf("move%s() start cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
while (1) {
debug_printf("move%s() first loop cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
auto& entry = self->path.back();
bool success;
if(entry.cursor.valid()) {
success = forward ? entry.cursor.moveNext() : entry.cursor.movePrev();
} else {
success = forward ? entry.cursor.moveFirst() : false;
}
// Skip over internal page entries that do not link to child pages. There should never be two in a row.
if (success && !entry.btPage->isLeaf() && !entry.cursor.get().value.present()) {
success = forward ? entry.cursor.moveNext() : entry.cursor.movePrev();
ASSERT(!success || entry.cursor.get().value.present());
}
// Stop if successful
if (success) {
break;
}
if (self->path.size() == 1) {
self->valid = false;
return Void();
}
// Move to parent
self->path = self->path.slice(0, self->path.size() - 1);
}
// While not on a leaf page, move down to get to one.
while (1) {
debug_printf("move%s() second loop cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
auto& entry = self->path.back();
if (entry.btPage->isLeaf()) {
break;
}
// The last entry in an internal page could be a null link, if so move back
if (!forward && !entry.cursor.get().value.present()) {
ASSERT(entry.cursor.movePrev());
ASSERT(entry.cursor.get().value.present());
}
wait(self->pushPage(entry.cursor));
auto& newEntry = self->path.back();
ASSERT(forward ? newEntry.cursor.moveFirst() : newEntry.cursor.moveLast());
}
self->valid = true;
debug_printf("move%s() exit cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
return Void();
}
Future<Void> moveNext() { return move_impl(this, true); }
Future<Void> movePrev() { return move_impl(this, false); }
};
Future<Void> initBTreeCursor(BTreeCursor* cursor, Version snapshotVersion) {
// Only committed versions can be read.
ASSERT(snapshotVersion <= m_lastCommittedVersion);
Reference<IPagerSnapshot> snapshot = m_pager->getReadSnapshot(snapshotVersion);
// This is a ref because snapshot will continue to hold the metakey value memory
KeyRef m = snapshot->getMetaKey();
return cursor->init(this, snapshot, ((MetaKey*)m.begin())->root.get());
}
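
A short-lived BTreeCursor is obtained via initBTreeCursor() and then driven with seekGTE/seekLT and moveNext/movePrev; readRange_impl below uses exactly this shape, additionally popping the leaf cursor so a page can be scanned without per-row waits. A minimal flow-style usage sketch (hypothetical actor mirroring that pattern):

ACTOR Future<Void> scanForward(VersionedBTree* tree, Key begin, Key end) {
	state VersionedBTree::BTreeCursor cur;
	wait(tree->initBTreeCursor(&cur, tree->getLastCommittedVersion()));
	wait(cur.seekGTE(begin, 0));
	while (cur.isValid() && cur.get().key < end) {
		// Records returned by get() stay valid for the lifetime of the cursor.
		wait(cur.moveNext());
	}
	return Void();
}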
// Cursor is for reading and iterating over user visible KV pairs at a specific version
// KeyValueRefs returned become invalid once the cursor is moved
class Cursor : public IStoreCursor, public ReferenceCounted<Cursor>, public FastAllocated<Cursor>, NonCopyable {
@ -5264,10 +5516,13 @@ public:
ACTOR static Future<Standalone<RangeResultRef>> readRange_impl(KeyValueStoreRedwoodUnversioned* self, KeyRange keys,
int rowLimit, int byteLimit) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion()));
wait(self->m_concurrentReads.take());
state FlowLock::Releaser releaser(self->m_concurrentReads);
++g_redwoodMetrics.opGetRange;
state Standalone<RangeResultRef> result;
state int accumulatedBytes = 0;
ASSERT(byteLimit > 0);
@ -5276,33 +5531,58 @@ public:
return result;
}
state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
// Prefetch is currently only done in the forward direction
state int prefetchBytes = rowLimit > 1 ? byteLimit : 0;
// Prefetch is disabled for now pending some decent logic for deciding how much to fetch
state int prefetchBytes = 0;
if (rowLimit > 0) {
wait(cur->findFirstEqualOrGreater(keys.begin, prefetchBytes));
while (cur->isValid() && cur->getKey() < keys.end) {
KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue()));
accumulatedBytes += kv.expectedSize();
result.push_back(result.arena(), kv);
if (--rowLimit == 0 || accumulatedBytes >= byteLimit) {
wait(cur.seekGTE(keys.begin, prefetchBytes));
while (cur.isValid()) {
// Read page contents without using waits
bool isRoot = cur.inRoot();
BTreePage::BinaryTree::Cursor leafCursor = cur.popPath();
while(leafCursor.valid()) {
KeyValueRef kv = leafCursor.get().toKeyValueRef();
if(kv.key >= keys.end) {
break;
}
accumulatedBytes += kv.expectedSize();
result.push_back_deep(result.arena(), kv);
if (--rowLimit == 0 || accumulatedBytes >= byteLimit) {
break;
}
leafCursor.moveNext();
}
// Stop if the leaf cursor is still valid which means we hit a key or size limit or
// if we started in the root page
if(leafCursor.valid() || isRoot) {
break;
}
wait(cur->next());
wait(cur.moveNext());
}
} else {
wait(cur->findLastLessOrEqual(keys.end));
if (cur->isValid() && cur->getKey() == keys.end) wait(cur->prev());
while (cur->isValid() && cur->getKey() >= keys.begin) {
KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue()));
accumulatedBytes += kv.expectedSize();
result.push_back(result.arena(), kv);
if (++rowLimit == 0 || accumulatedBytes >= byteLimit) {
wait(cur.seekLT(keys.end, prefetchBytes));
while (cur.isValid()) {
// Read page contents without using waits
bool isRoot = cur.inRoot();
BTreePage::BinaryTree::Cursor leafCursor = cur.popPath();
while(leafCursor.valid()) {
KeyValueRef kv = leafCursor.get().toKeyValueRef();
if(kv.key < keys.begin) {
break;
}
accumulatedBytes += kv.expectedSize();
result.push_back_deep(result.arena(), kv);
if (++rowLimit == 0 || accumulatedBytes >= byteLimit) {
break;
}
leafCursor.movePrev();
}
// Stop if the leaf cursor is still valid which means we hit a key or size limit or
// if we started in the root page
if(leafCursor.valid() || isRoot) {
break;
}
wait(cur->prev());
wait(cur.movePrev());
}
}
@ -5316,15 +5596,16 @@ public:
ACTOR static Future<Optional<Value>> readValue_impl(KeyValueStoreRedwoodUnversioned* self, Key key,
Optional<UID> debugID) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion()));
wait(self->m_concurrentReads.take());
state FlowLock::Releaser releaser(self->m_concurrentReads);
++g_redwoodMetrics.opGet;
state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
wait(cur->findEqual(key));
if (cur->isValid()) {
return cur->getValue();
wait(cur.seekGTE(key, 0));
if (cur.isValid() && cur.get().key == key) {
return cur.get().value.get();
}
return Optional<Value>();
}
@ -5335,18 +5616,20 @@ public:
ACTOR static Future<Optional<Value>> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key,
int maxLength, Optional<UID> debugID) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion()));
wait(self->m_concurrentReads.take());
state FlowLock::Releaser releaser(self->m_concurrentReads);
++g_redwoodMetrics.opGet;
state Reference<IStoreCursor> cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
wait(cur->findEqual(key));
if (cur->isValid()) {
Value v = cur->getValue();
wait(cur.seekGTE(key, 0));
if (cur.isValid() && cur.get().key == key) {
Value v = cur.get().value.get();
int len = std::min(v.size(), maxLength);
return Value(cur->getValue().substr(0, len));
return Value(v.substr(0, len));
}
return Optional<Value>();
}
@ -5411,6 +5694,157 @@ KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) {
return kv;
}
// Verify a range using a BTreeCursor.
// Assumes that the BTree holds a single data version and the version is 0.
ACTOR Future<int> verifyRangeBTreeCursor(VersionedBTree* btree, Key start, Key end, Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int* pErrorCount) {
state int errors = 0;
if (end <= start) end = keyAfter(start);
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i =
written->lower_bound(std::make_pair(start.toString(), 0));
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd =
written->upper_bound(std::make_pair(end.toString(), 0));
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iLast;
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v));
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start\n", v, start.printable().c_str(), end.printable().c_str());
// Randomly use the cursor for something else first.
if (deterministicRandom()->coinflip()) {
state Key randomKey = randomKV().key;
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.printable().c_str(),
end.printable().c_str(), randomKey.toString().c_str());
wait(success(cur.seek(randomKey, 0)));
}
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.printable().c_str(),
end.printable().c_str());
wait(cur.seekGTE(start, 0));
state std::vector<KeyValue> results;
while (cur.isValid() && cur.get().key < end) {
// Find the next written kv pair that would be present at this version
while (1) {
iLast = i;
if (i == iEnd) break;
++i;
if (iLast->first.second <= v && iLast->second.present() &&
(i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) {
debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n", v,
start.printable().c_str(), end.printable().c_str(), iLast->first.first.c_str());
break;
}
}
if (iLast == iEnd) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str());
break;
}
if (cur.get().key != iLast->first.first) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
iLast->first.first.c_str());
break;
}
if (cur.get().value.get() != iLast->second.get()) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
cur.get().value.get().toString().c_str(), iLast->second.get().c_str());
break;
}
ASSERT(errors == 0);
results.push_back(KeyValue(KeyValueRef(cur.get().key, cur.get().value.get())));
wait(cur.moveNext());
}
// Make sure there are no further written kv pairs that would be present at this version.
while (1) {
iLast = i;
if (i == iEnd) break;
++i;
if (iLast->first.second <= v && iLast->second.present() &&
(i == iEnd || i->first.first != iLast->first.first || i->first.second > v))
break;
}
if (iLast != iEnd) {
++errors;
++*pErrorCount;
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has @%" PRId64 " '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), iLast->first.second, iLast->first.first.c_str());
}
debug_printf("VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.printable().c_str(),
end.printable().c_str());
// Randomly use a new cursor at the same version for the reverse range read, if the version is still available for
// opening new cursors
if (v >= btree->getOldestVersion() && deterministicRandom()->coinflip()) {
cur = VersionedBTree::BTreeCursor();
wait(btree->initBTreeCursor(&cur, v));
}
// Now read the range from the tree in reverse order and compare to the saved results
wait(cur.seekLT(end, 0));
state std::vector<KeyValue>::const_reverse_iterator r = results.rbegin();
while (cur.isValid() && cur.get().key >= start) {
if (r == results.rend()) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' vs nothing in written map.\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str());
break;
}
if (cur.get().key != r->key) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree key '%s' but expected '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
r->key.toString().c_str());
break;
}
if (cur.get().value.get() != r->value) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64
", %s, %s) ERROR: Tree key '%s' has tree value '%s' but expected '%s'\n",
v, start.printable().c_str(), end.printable().c_str(), cur.get().key.toString().c_str(),
cur.get().value.get().toString().c_str(), r->value.toString().c_str());
break;
}
++r;
wait(cur.movePrev());
}
if (r != results.rend()) {
++errors;
++*pErrorCount;
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: Tree range ended but written has '%s'\n", v,
start.printable().c_str(), end.printable().c_str(), r->key.toString().c_str());
}
return errors;
}
ACTOR Future<int> verifyRange(VersionedBTree* btree, Key start, Key end, Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int* pErrorCount) {
@ -5607,6 +6041,58 @@ ACTOR Future<int> seekAll(VersionedBTree* btree, Version v,
return errors;
}
// Verify the result of point reads for every set or cleared key at the given version
ACTOR Future<int> seekAllBTreeCursor(VersionedBTree* btree, Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount) {
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i = written->cbegin();
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd = written->cend();
state int errors = 0;
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v));
while (i != iEnd) {
state std::string key = i->first.first;
state Version ver = i->first.second;
if (ver == v) {
state Optional<std::string> val = i->second;
debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str());
state Arena arena;
wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key), 0), 0));
bool foundKey = cur.isValid() && cur.get().key == key;
bool hasValue = foundKey && cur.get().value.present();
if (val.present()) {
bool valueMatch = hasValue && cur.get().value.get() == val.get();
if (!foundKey || !hasValue || !valueMatch) {
++errors;
++*pErrorCount;
if (!foundKey) {
printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(),
val.get().c_str(), ver);
}
else if (!hasValue) {
printf("Verify ERROR: value_not_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(),
val.get().c_str(), ver);
}
else if (!valueMatch) {
printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n",
key.c_str(), cur.get().value.get().toString().c_str(), val.get().c_str(),
ver);
}
}
} else if (foundKey && hasValue) {
++errors;
++*pErrorCount;
printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n", key.c_str(),
cur.get().value.get().toString().c_str(), ver);
}
}
++i;
}
return errors;
}
ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount,
bool serial) {
@ -5637,7 +6123,13 @@ ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream,
state Reference<IStoreCursor> cur = btree->readAtVersion(v);
debug_printf("Verifying entire key range at version %" PRId64 "\n", v);
fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pErrorCount);
if(deterministicRandom()->coinflip()) {
fRangeAll = verifyRange(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written,
pErrorCount);
} else {
fRangeAll = verifyRangeBTreeCursor(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written,
pErrorCount);
}
if (serial) {
wait(success(fRangeAll));
}
@ -5646,13 +6138,21 @@ ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream,
Key end = randomKV().key;
debug_printf("Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(),
toString(end).c_str(), v);
fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount);
if(deterministicRandom()->coinflip()) {
fRangeRandom = verifyRange(btree, begin, end, v, written, pErrorCount);
} else {
fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pErrorCount);
}
if (serial) {
wait(success(fRangeRandom));
}
debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v);
fSeekAll = seekAll(btree, v, written, pErrorCount);
if(deterministicRandom()->coinflip()) {
fSeekAll = seekAll(btree, v, written, pErrorCount);
} else {
fSeekAll = seekAllBTreeCursor(btree, v, written, pErrorCount);
}
if (serial) {
wait(success(fSeekAll));
}
@ -6485,11 +6985,11 @@ TEST_CASE("!/redwood/correctness/btree") {
state int maxKeySize = deterministicRandom()->randomInt(1, pageSize * 2);
state int maxValueSize = randomSize(pageSize * 25);
state int maxCommitSize = shortTest ? 1000 : randomSize(std::min<int>((maxKeySize + maxValueSize) * 20000, 10e6));
state int mutationBytesTarget = shortTest ? 100000 : randomSize(std::min<int>(maxCommitSize * 100, 100e6));
state int mutationBytesTarget = shortTest ? 100000 : randomSize(std::min<int>(maxCommitSize * 100, pageSize * 100000));
state double clearProbability = deterministicRandom()->random01() * .1;
state double clearSingleKeyProbability = deterministicRandom()->random01();
state double clearPostSetProbability = deterministicRandom()->random01() * .1;
state double coldStartProbability = pagerMemoryOnly ? 0 : deterministicRandom()->random01();
state double coldStartProbability = pagerMemoryOnly ? 0 : (deterministicRandom()->random01() * 0.3);
state double advanceOldVersionProbability = deterministicRandom()->random01();
state double maxDuration = 60;
state int64_t cacheSizeBytes =

View File

@ -163,14 +163,13 @@ struct RegisterWorkerReply {
constexpr static FileIdentifier file_identifier = 16475696;
ProcessClass processClass;
ClusterControllerPriorityInfo priorityInfo;
Optional<uint16_t> storageCache;
RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional<uint16_t> storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {}
RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo) : processClass(processClass), priorityInfo(priorityInfo) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, processClass, priorityInfo, storageCache);
serializer(ar, processClass, priorityInfo);
}
};
@ -302,19 +301,18 @@ struct RegisterWorkerRequest {
Generation generation;
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf;
Standalone<VectorRef<StringRef>> issues;
std::vector<NetworkAddress> incompatiblePeers;
ReplyPromise<RegisterWorkerReply> reply;
bool degraded;
RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, Optional<std::pair<uint16_t,StorageServerInterface>> storageCacheInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {}
RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional<DataDistributorInterface> ddInterf, Optional<RatekeeperInterface> rkInterf, bool degraded) :
wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), degraded(degraded) {}
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, issues, incompatiblePeers, reply, degraded);
serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, issues, incompatiblePeers, reply, degraded);
}
};
@ -712,7 +710,7 @@ ACTOR Future<Void> logRouter(TLogInterface interf, InitializeLogRouterRequest re
Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> dataDistributor(DataDistributorInterface ddi, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> ratekeeper(RatekeeperInterface rki, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> storageCache(StorageServerInterface interf, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> storageCacheServer(StorageServerInterface interf, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db);
ACTOR Future<Void> backupWorker(BackupInterface bi, InitializeBackupRequest req, Reference<AsyncVar<ServerDBInfo>> db);
void registerThreadForProfiling();

View File

@ -157,7 +157,7 @@ private:
} else {
self->fullyRecovered.send(Void());
}
return Void();
}
};
@ -373,7 +373,7 @@ ACTOR Future<Void> newSeedServers( Reference<MasterData> self, RecruitFromConfig
dcId_tags[recruits.storageServers[idx].locality.dcId()] = Tag(nextLocality, 0);
nextLocality++;
}
Tag& tag = dcId_tags[recruits.storageServers[idx].locality.dcId()];
tag.id++;
idx++;
@ -582,7 +582,7 @@ ACTOR Future<vector<Standalone<CommitTransactionRef>>> recruitEverything( Refere
.detail("DesiredResolvers", self->configuration.getDesiredResolvers())
.detail("StoreType", self->configuration.storageServerStoreType)
.trackLatest("MasterRecoveryState");
//FIXME: we only need log routers for the same locality as the master
int maxLogRouters = self->cstate.prevDBState.logRouterTags;
for(auto& old : self->cstate.prevDBState.oldTLogData) {
@ -1506,15 +1506,6 @@ ACTOR Future<Void> masterCore( Reference<MasterData> self ) {
tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccf->getConnectionString().toString());
tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue());
tr.set(recoveryCommitRequest.arena, primaryDatacenterKey, self->myInterface.locality.dcId().present() ? self->myInterface.locality.dcId().get() : StringRef());
//FIXME: remove this code, caching the entire normal keyspace as a test of functionality
//TODO: caching disabled for this merge
//tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.begin), storageCacheValue({0}));
//tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.end), storageCacheValue({}));
//tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.begin), serverKeysTrue);
//tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.end), serverKeysFalse);
//tr.set(recoveryCommitRequest.arena, cacheChangeKeyFor(0), BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned()));
//tr.set(recoveryCommitRequest.arena, cacheChangeKey, BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned()));
tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys);
for(auto& dc : self->primaryDcId) {
@ -1655,7 +1646,7 @@ ACTOR Future<Void> masterServer( MasterInterface mi, Reference<AsyncVar<ServerDB
while(!self->addActor.isEmpty()) {
self->addActor.getFuture().pop();
}
TEST(err.code() == error_code_master_tlog_failed); // Master: terminated because of a tLog failure
TEST(err.code() == error_code_master_proxy_failed); // Master: terminated because of a proxy failure
TEST(err.code() == error_code_master_resolver_failed); // Master: terminated because of a resolver failure

View File

@ -986,7 +986,12 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req, Span spa
if( req.debugID.present() )
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());
GetValueReply reply(v);
// Check if the desired key might be cached
auto cached = data->cachedRangeMap[req.key];
//if (cached)
// TraceEvent(SevDebug, "SSGetValueCached").detail("Key", req.key);
GetValueReply reply(v, cached);
reply.penalty = data->getPenalty();
req.reply.send(reply);
} catch (Error& e) {
@ -1198,11 +1203,14 @@ ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version,
state int pos = 0;
// Check if the desired key-range intersects the cached key-ranges
// TODO Find a more efficient way to do it
// TODO Also need this check in single key/value lookup
auto cached = data->cachedRangeMap.intersectingRanges(range);
result.cached = (cached.begin() != cached.end());
// Check if the desired key-range is cached
auto containingRange = data->cachedRangeMap.rangeContaining(range.begin);
if (containingRange.value() && containingRange->range().end >= range.end) {
//TraceEvent(SevDebug, "SSReadRangeCached").detail("Size",data->cachedRangeMap.size()).detail("ContainingRangeBegin",containingRange->range().begin).detail("ContainingRangeEnd",containingRange->range().end).
// detail("Begin", range.begin).detail("End",range.end);
result.cached = true;
} else
result.cached = false;
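The hunk above replaces the old intersection test (intersectingRanges) with a stricter containment test: the read range only counts as cached when a single cached range covers both its begin and its end, so partially cached requests are no longer reported as cached. A minimal standalone sketch of that containment check, using a plain std::map keyed by range start rather than the actual KeyRangeMap API:

// Hypothetical, simplified illustration (not the actual KeyRangeMap interface);
// cached ranges are stored as [begin, end) pairs keyed by their begin key.
#include <map>
#include <string>

using Key = std::string;

// True only if the query range [begin, end) lies entirely inside one cached range.
bool rangeFullyCached(const std::map<Key, Key>& cachedRanges, const Key& begin, const Key& end) {
    auto it = cachedRanges.upper_bound(begin); // first cached range starting strictly after 'begin'
    if (it == cachedRanges.begin()) return false;
    --it;                                      // cached range whose begin is <= the query begin
    return end <= it->second;                  // contained iff the query also ends within that range
}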
// if (limit >= 0) we are reading forward, else backward
if (limit >= 0) {
@ -1623,8 +1631,14 @@ ACTOR Future<Void> getKeyQ( StorageServer* data, GetKeyRequest req, Span span )
data->counters.bytesQueried += resultSize;
++data->counters.rowsQueried;
GetKeyReply reply(updated);
// Check if the desired key might be cached
auto cached = data->cachedRangeMap[k];
//if (cached)
// TraceEvent(SevDebug, "SSGetKeyCached").detail("Key", k).detail("Begin", shard.begin.printable()).detail("End", shard.end.printable());
GetKeyReply reply(updated, cached);
reply.penalty = data->getPenalty();
req.reply.send(reply);
}
catch (Error& e) {
@ -2597,7 +2611,6 @@ public:
if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix))
applyPrivateCacheData( data, m);
else {
//TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver);
applyPrivateData( data, m );
}
} else {
@ -2686,7 +2699,7 @@ private:
}
void applyPrivateCacheData( StorageServer* data, MutationRef const& m ) {
TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
//TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
if (processedCacheStartKey) {
// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
@ -2694,17 +2707,16 @@ private:
KeyRangeRef keys( cacheStartKey.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ),
m.param1.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ));
data->cachedRangeMap.insert(keys, true);
//TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Begin", keys.begin).detail("End", keys.end);
//fprintf(stderr, "applyPrivateCacheData : begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str());
//Figure out the affected shard ranges and maintain the cached key-range information in the in-memory map
// TODO revisit- we are not splitting the cached ranges based on shards as of now.
if (0) {
auto cachedRanges = data->shards.intersectingRanges(keys);
for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) {
KeyRangeRef intersectingRange = shard.range() & keys;
data->cachedRangeMap.insert(KeyRangeRef(intersectingRange.begin, intersectingRange.end), true);
}
auto cachedRanges = data->shards.intersectingRanges(keys);
for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) {
KeyRangeRef intersectingRange = shard.range() & keys;
TraceEvent(SevDebug, "SSPrivateCacheMutationInsertUnexpected", data->thisServerID).detail("Begin", intersectingRange.begin).detail("End", intersectingRange.end);
data->cachedRangeMap.insert(intersectingRange, true);
}
}
processedStartKey = false;
} else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) {
@ -2741,7 +2753,6 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
}
state Reference<ILogSystem::IPeekCursor> cursor = data->logCursor;
//TraceEvent("SSUpdatePeeking", data->thisServerID).detail("MyVer", data->version.get()).detail("Epoch", data->updateEpoch).detail("Seq", data->updateSequence);
loop {
wait( cursor->getMore() );
@ -2788,12 +2799,14 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
if (LogProtocolMessage::isNextIn(cloneReader)) {
LogProtocolMessage lpm;
cloneReader >> lpm;
//TraceEvent(SevDebug, "SSReadingLPM", data->thisServerID).detail("Mutation", lpm.toString());
dbgLastMessageWasProtocol = true;
cloneCursor1->setProtocolVersion(cloneReader.protocolVersion());
}
else {
MutationRef msg;
cloneReader >> msg;
//TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString());
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
@ -2857,7 +2870,6 @@ ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
state Version ver = invalidVersion;
cloneCursor2->setProtocolVersion(data->logProtocol);
//TraceEvent("SSUpdatePeeked", data->thisServerID).detail("FromEpoch", data->updateEpoch).detail("FromSeq", data->updateSequence).detail("ToEpoch", results.end_epoch).detail("ToSeq", results.end_seq).detail("MsgSize", results.messages.size());
for (;cloneCursor2->hasMessage(); cloneCursor2->nextMessage()) {
if(mutationBytes > SERVER_KNOBS->DESIRED_UPDATE_BYTES) {
mutationBytes = 0;

View File

@ -21,6 +21,8 @@
#include <tuple>
#include <boost/lexical_cast.hpp>
#include "fdbrpc/Locality.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "flow/SystemMonitor.h"
@ -212,7 +214,8 @@ ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
endRole(err.role, err.id, "Error", ok, err.error);
if (err.error.code() == error_code_please_reboot || err.error.code() == error_code_io_timeout) throw err.error;
if (err.error.code() == error_code_please_reboot || err.error.code() == error_code_io_timeout || (err.role == Role::SHARED_TRANSACTION_LOG && err.error.code() == error_code_io_error )) throw err.error;
}
}
}
@ -452,7 +455,7 @@ ACTOR Future<Void> registrationClient(
state Future<Void> cacheErrorsFuture;
state Optional<double> incorrectTime;
loop {
RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), scInterf->get(), degraded->get());
RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), degraded->get());
for (auto const& i : issues->get()) {
request.issues.push_back_deep(request.issues.arena(), i);
}
@ -490,41 +493,10 @@ ACTOR Future<Void> registrationClient(
when ( RegisterWorkerReply reply = wait( registrationReply )) {
processClass = reply.processClass;
asyncPriorityInfo->set( reply.priorityInfo );
if(!reply.storageCache.present()) {
cacheProcessFuture.cancel();
scInterf->set(Optional<std::pair<uint16_t,StorageServerInterface>>());
} else if (!scInterf->get().present() || scInterf->get().get().first != reply.storageCache.get()) {
StorageServerInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole( Role::STORAGE_CACHE, recruited.id(), interf.id(), details );
//DUMPTOKEN(recruited.getVersion);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getReadHotRanges);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
cacheProcessFuture = storageCache( recruited, reply.storageCache.get(), dbInfo );
cacheErrorsFuture = forwardError(errors, Role::STORAGE_CACHE, recruited.id(), setWhenDoneOrError(cacheProcessFuture, scInterf, Optional<std::pair<uint16_t,StorageServerInterface>>()));
scInterf->set(std::make_pair(reply.storageCache.get(), recruited));
}
}
when ( wait( ccInterface->onChange() )) {}
when ( wait( ddInterf->onChange() ) ) {}
when ( wait( rkInterf->onChange() ) ) {}
when ( wait( scInterf->onChange() ) ) {}
when ( wait( degraded->onChange() ) ) {}
when ( wait( FlowTransport::transport().onIncompatibleChanged() ) ) {}
when ( wait( issues->onChange() ) ) {}
@ -711,6 +683,41 @@ ACTOR Future<Void> storageServerRollbackRebooter( Future<Void> prevStorageServer
}
}
ACTOR Future<Void> storageCacheRollbackRebooter( Future<Void> prevStorageCache, UID id, LocalityData locality, Reference<AsyncVar<ServerDBInfo>> db) {
loop {
ErrorOr<Void> e = wait( errorOr( prevStorageCache) );
if (!e.isError()) {
TraceEvent("StorageCacheRequestedReboot1", id);
return Void();
}
else if (e.getError().code() != error_code_please_reboot && e.getError().code() != error_code_worker_removed) {
TraceEvent("StorageCacheRequestedReboot2", id).detail("Code",e.getError().code());
throw e.getError();
}
TraceEvent("StorageCacheRequestedReboot", id);
StorageServerInterface recruited;
recruited.uniqueID = deterministicRandom()->randomUniqueID();// id;
recruited.locality = locality;
recruited.initEndpoints();
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
prevStorageCache = storageCacheServer(recruited, 0, db);
}
}
// FIXME: This will not work correctly in simulation as all workers would share the same roles map
std::set<std::pair<std::string, std::string>> g_roles;
@ -1048,10 +1055,40 @@ ACTOR Future<Void> workerServer(
}
}
bool hasCache = false;
// start cache role if we have the right process class
if (initialClass.classType() == ProcessClass::StorageCacheClass) {
hasCache = true;
StorageServerInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole(Role::STORAGE_CACHE, recruited.id(), interf.id(), details);
// DUMPTOKEN(recruited.getVersion);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
DUMPTOKEN(recruited.getKeyValues);
DUMPTOKEN(recruited.getShardState);
DUMPTOKEN(recruited.waitMetrics);
DUMPTOKEN(recruited.splitMetrics);
DUMPTOKEN(recruited.getStorageMetrics);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.getQueuingMetrics);
DUMPTOKEN(recruited.getKeyValueStoreType);
DUMPTOKEN(recruited.watchValue);
auto f = storageCacheServer(recruited, 0, dbInfo);
f = storageCacheRollbackRebooter( f, recruited.id(), recruited.locality, dbInfo);
errorForwarders.add(forwardError(errors, Role::STORAGE_CACHE, recruited.id(), f));
}
std::map<std::string, std::string> details;
details["Locality"] = locality.toString();
details["DataFolder"] = folder;
details["StoresPresent"] = format("%d", stores.size());
details["CachePresent"] = hasCache ? "true" : "false";
startRole( Role::WORKER, interf.id(), interf.id(), details );
errorForwarders.add(traceRole(Role::WORKER, interf.id()));
@ -1346,7 +1383,7 @@ ACTOR Future<Void> workerServer(
DUMPTOKEN( recruited.getQueuingMetrics );
DUMPTOKEN( recruited.confirmRunning );
errorForwarders.add( zombie(recruited, forwardError( errors, Role::LOG_ROUTER, recruited.id(),
errorForwarders.add( zombie(recruited, forwardError( errors, Role::LOG_ROUTER, recruited.id(),
logRouter( recruited, req, dbInfo ) ) ) );
req.reply.send(recruited);
}
@ -1725,7 +1762,7 @@ ACTOR Future<Void> fdbd(
Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo>(ServerDBInfo()) );
actors.push_back(reportErrors(monitorAndWriteCCPriorityInfo(fitnessFilePath, asyncPriorityInfo), "MonitorAndWriteCCPriorityInfo"));
if (processClass == ProcessClass::TesterClass) {
if (processClass.machineClassFitness(ProcessClass::ClusterController) == ProcessClass::NeverAssign) {
actors.push_back( reportErrors( monitorLeader( connFile, cc ), "ClusterController" ) );
} else if (processClass == ProcessClass::StorageClass && SERVER_KNOBS->MAX_DELAY_STORAGE_CANDIDACY_SECONDS > 0) {
actors.push_back( reportErrors( monitorLeaderRemotelyWithDelayedCandidacy( connFile, cc, asyncPriorityInfo, recoveredDiskFiles.getFuture(), localities, dbInfo ), "ClusterController" ) );

View File

@ -507,9 +507,9 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
}
// Wait for parallel restore to finish before we can proceed
TraceEvent("FastRestore").detail("BackupAndParallelRestore", "WaitForRestoreToFinish");
TraceEvent("FastRestoreWorkload").detail("WaitForRestoreToFinish", randomID);
wait(backupAgent.parallelRestoreFinish(cx, randomID));
TraceEvent("FastRestore").detail("BackupAndParallelRestore", "RestoreFinished");
TraceEvent("FastRestoreWorkload").detail("RestoreFinished", randomID);
for (auto& restore : restores) {
ASSERT(!restore.isError());
@ -668,7 +668,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
g_simulator.backupAgents = ISimulator::NoBackupAgents;
}
} catch (Error& e) {
TraceEvent(SevError, "BackupAndRestoreCorrectness").error(e).GetLastError();
TraceEvent(SevError, "BackupAndParallelRestoreCorrectness").error(e).GetLastError();
throw;
}
return Void();

View File

@ -0,0 +1,33 @@
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
struct CacheWorkload : TestWorkload {
Key keyPrefix;
CacheWorkload(WorkloadContext const& wcx)
: TestWorkload(wcx)
{
keyPrefix = unprintable( getOption(options, LiteralStringRef("keyPrefix"), LiteralStringRef("")).toString() );
}
virtual std::string description() { return "CacheWorkload"; }
virtual Future<Void> setup( Database const& cx ) {
if (clientId == 0) {
//Call management API to cache keys under the given prefix
return addCachedRange(cx, prefixRange(keyPrefix));
}
return Void();
}
virtual Future<Void> start( Database const& cx ) {
return Void();
}
virtual Future<bool> check( Database const& cx ) {
return true;
}
virtual void getMetrics( vector<PerfMetric>& m ) {
}
};
WorkloadFactory<CacheWorkload> CacheWorkloadFactory("Cache");

View File

@ -713,6 +713,7 @@ struct ConsistencyCheckWorkload : TestWorkload
state vector<UID> storageServers = (isRelocating) ? destStorageServers : sourceStorageServers;
state vector<StorageServerInterface> storageServerInterfaces;
//TraceEvent("ConsistencyCheck_GetStorageInfo").detail("StorageServers", storageServers.size());
loop {
try {
vector< Future< Optional<Value> > > serverListEntries;
@ -725,6 +726,7 @@ struct ConsistencyCheckWorkload : TestWorkload
else if (self->performQuiescentChecks)
self->testFailure("/FF/serverList changing in a quiescent database");
}
break;
}
catch(Error &e) {
@ -922,7 +924,7 @@ struct ConsistencyCheckWorkload : TestWorkload
else if(!isRelocating)
{
TraceEvent("ConsistencyCheck_StorageServerUnavailable").suppressFor(1.0).detail("StorageServer", storageServers[j]).detail("ShardBegin", printable(range.begin)).detail("ShardEnd", printable(range.end))
.detail("Address", storageServerInterfaces[j].address()).detail("GetKeyValuesToken", storageServerInterfaces[j].getKeyValues.getEndpoint().token);
.detail("Address", storageServerInterfaces[j].address()).detail("UID", storageServerInterfaces[j].id()).detail("GetKeyValuesToken", storageServerInterfaces[j].getKeyValues.getEndpoint().token);
//All shards should be available in quiescence
if(self->performQuiescentChecks)

View File

@ -128,9 +128,9 @@ struct CycleWorkload : TestWorkload {
tr.set( self->key(r), self->value(r3) );
tr.set( self->key(r2), self->value(r4) );
tr.set( self->key(r3), self->value(r2) );
// TraceEvent("CyclicTestMX").detail("Key", self->key(r).toString()).detail("Value", self->value(r3).toString());
// TraceEvent("CyclicTestMX").detail("Key", self->key(r2).toString()).detail("Value", self->value(r4).toString());
// TraceEvent("CyclicTestMX").detail("Key", self->key(r3).toString()).detail("Value", self->value(r2).toString());
//TraceEvent("CyclicTestMX1").detail("Key", self->key(r).toString()).detail("Value", self->value(r3).toString());
//TraceEvent("CyclicTestMX2").detail("Key", self->key(r2).toString()).detail("Value", self->value(r4).toString());
//TraceEvent("CyclicTestMX3").detail("Key", self->key(r3).toString()).detail("Value", self->value(r2).toString());
wait( tr.commit() );
// TraceEvent("CycleCommit");
@ -174,7 +174,10 @@ struct CycleWorkload : TestWorkload {
return false;
}
int i=0;
for(int c=0; c<nodeCount; c++) {
int iPrev=0;
double d;
int c;
for(c=0; c<nodeCount; c++) {
if (c && !i) {
TraceEvent(SevError, "TestFailure").detail("Reason", "Cycle got shorter").detail("Before", nodeCount).detail("After", c).detail("KeyPrefix", keyPrefix.printable());
logTestData(data);
@ -185,7 +188,8 @@ struct CycleWorkload : TestWorkload {
logTestData(data);
return false;
}
double d = testKeyToDouble(data[i].value, keyPrefix);
d = testKeyToDouble(data[i].value, keyPrefix);
iPrev = i;
i = (int)d;
if ( i != d || i<0 || i>=nodeCount) {
TraceEvent(SevError, "TestFailure").detail("Reason", "Invalid value").detail("KeyPrefix", keyPrefix.printable());
@ -194,7 +198,8 @@ struct CycleWorkload : TestWorkload {
}
}
if (i != 0) {
TraceEvent(SevError, "TestFailure").detail("Reason", "Cycle got longer").detail("KeyPrefix", keyPrefix.printable());
TraceEvent(SevError, "TestFailure").detail("Reason", "Cycle got longer").detail("KeyPrefix", keyPrefix.printable()).detail("Key", key(i)).detail("Value", data[i].value).
detail("Iteration", c).detail("Nodecount", nodeCount).detail("Int", i).detail("Double", d).detail("ValuePrev", data[iPrev].value).detail("KeyPrev", data[iPrev].key);
logTestData(data);
return false;
}

View File

@ -59,7 +59,9 @@ struct ExceptionContract {
e.code() == error_code_transaction_cancelled ||
e.code() == error_code_key_too_large ||
e.code() == error_code_value_too_large ||
e.code() == error_code_process_behind)
e.code() == error_code_process_behind ||
e.code() == error_code_batch_transaction_throttled ||
e.code() == error_code_tag_throttled)
{
return;
}

View File

@ -226,15 +226,21 @@ struct ReadWriteWorkload : KVWorkload {
ACTOR static Future<bool> traceDumpWorkers( Reference<AsyncVar<ServerDBInfo>> db ) {
try {
loop {
ErrorOr<std::vector<WorkerDetails>> workerList = wait( db->get().clusterInterface.getWorkers.tryGetReply( GetWorkersRequest() ) );
if( workerList.present() ) {
std::vector<Future<ErrorOr<Void>>> dumpRequests;
for( int i = 0; i < workerList.get().size(); i++)
dumpRequests.push_back( workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( TraceBatchDumpRequest() ) );
wait( waitForAll( dumpRequests ) );
return true;
choose {
when( wait( db->onChange() ) ) {}
when (ErrorOr<std::vector<WorkerDetails>> workerList = wait( db->get().clusterInterface.getWorkers.tryGetReply( GetWorkersRequest() ) );)
{
if( workerList.present() ) {
std::vector<Future<ErrorOr<Void>>> dumpRequests;
for( int i = 0; i < workerList.get().size(); i++)
dumpRequests.push_back( workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( TraceBatchDumpRequest() ) );
wait( waitForAll( dumpRequests ) );
return true;
}
wait( delay( 1.0 ) );
}
}
wait( delay( 1.0 ) );
}
} catch( Error &e ) {
TraceEvent(SevError, "FailedToDumpWorkers").error(e);

View File

@ -26,6 +26,8 @@ void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@ -45,6 +47,8 @@ struct UnitTestWorkload : TestWorkload {
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
forceLinkMemcpyTests();
forceLinkMemcpyPerfTests();
}
virtual std::string description() { return "UnitTests"; }

View File

@ -380,12 +380,11 @@ public:
}
#else
Standalone( const T& t, const Arena& arena ) : Arena( arena ), T( t ) {}
Standalone( const Standalone<T> & t ) : Arena((Arena const&)t), T((T const&)t) {}
Standalone<T>& operator=( const Standalone<T> & t ) {
*(Arena*)this = (Arena const&)t;
*(T*)this = (T const&)t;
return *this;
}
Standalone(const Standalone<T>&) = default;
Standalone<T>& operator=(const Standalone<T>&) = default;
Standalone(Standalone<T>&&) = default;
Standalone<T>& operator=(Standalone<T>&&) = default;
~Standalone() = default;
#endif
template <class U> Standalone<U> castTo() const {
@ -713,15 +712,20 @@ inline bool operator != (const StringRef& lhs, const StringRef& rhs ) { return !
inline bool operator <= ( const StringRef& lhs, const StringRef& rhs ) { return !(lhs>rhs); }
inline bool operator >= ( const StringRef& lhs, const StringRef& rhs ) { return !(lhs<rhs); }
// This trait is used by VectorRef to determine if it should just memcpy the vector contents.
// FIXME: VectorRef really should use std::is_trivially_copyable for this BUT that is not implemented
// in gcc c++0x so instead we will use this custom trait which defaults to std::is_trivial, which
// handles most situations but others will have to be specialized.
// This trait is used by VectorRef to determine whether the deep copy constructor should
// recursively deep copy each element.
//
// TODO: There should be an easier way to identify the difference between flow_ref and non-flow_ref types.
// std::is_trivially_copyable does not work because some flow_ref types are trivially copyable
// and some non-flow_ref types are not trivially copyable.
template <typename T>
struct memcpy_able : std::is_trivial<T> {};
struct flow_ref : std::integral_constant<bool, !std::is_fundamental_v<T>> {};
template <>
struct memcpy_able<UID> : std::integral_constant<bool, true> {};
struct flow_ref<UID> : std::integral_constant<bool, false> {};
template <class A, class B>
struct flow_ref<std::pair<A, B>> : std::integral_constant<bool, false> {};
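The comment above explains that flow_ref<T> is how VectorRef decides between a flat, element-wise copy and a per-element, arena-aware deep copy. A minimal standalone sketch of the same enable_if dispatch, using hypothetical names (needs_deep_copy, TinyVec) rather than the real FoundationDB types:

#include <type_traits>
#include <vector>

struct Arena {}; // hypothetical stand-in for flow's Arena

// Analogous to flow_ref above: non-fundamental types are assumed to need a per-element deep copy.
template <typename T>
struct needs_deep_copy : std::integral_constant<bool, !std::is_fundamental_v<T>> {};

template <class T>
struct TinyVec {
    std::vector<T> data;
    TinyVec() = default;

    // Flat copy for types the trait marks as not needing a deep copy.
    template <class T2 = T>
    TinyVec(Arena&, const TinyVec& toCopy,
            typename std::enable_if<!needs_deep_copy<T2>::value, int>::type = 0)
        : data(toCopy.data) {}

    // Per-element deep copy for types the trait marks as needing one;
    // the real VectorRef passes the Arena to each element's copy constructor.
    template <class T2 = T>
    TinyVec(Arena& /*arena*/, const TinyVec& toCopy,
            typename std::enable_if<needs_deep_copy<T2>::value, int>::type = 0) {
        data.reserve(toCopy.data.size());
        for (const auto& e : toCopy.data) data.emplace_back(/* arena, */ e);
    }
};

For any given element type only one of the two constructors survives overload resolution, which is how the VectorRef arena constructors below choose between std::copy and invoking each element's Arena constructor.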
template<class T>
struct string_serialized_traits : std::false_type {
@ -797,7 +801,7 @@ public:
using value_type = T;
static_assert(SerStrategy == VecSerStrategy::FlatBuffers || string_serialized_traits<T>::value);
// T must be trivially destructible (and copyable)!
// T must be trivially destructible!
VectorRef() : data(0), m_size(0), m_capacity(0) {}
template <VecSerStrategy S>
@ -812,19 +816,19 @@ public:
return *this;
}
// Arena constructor for non-Ref types, identified by memcpy_able
// Arena constructor for non-Ref types, identified by !flow_ref
template <class T2 = T, VecSerStrategy S>
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<memcpy_able<T2>::value, int>::type = 0)
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<!flow_ref<T2>::value, int>::type = 0)
: VPS(toCopy), data((T*)new (p) uint8_t[sizeof(T) * toCopy.size()]), m_size(toCopy.size()),
m_capacity(toCopy.size()) {
if (m_size > 0) {
memcpy(data, toCopy.data, m_size * sizeof(T));
std::copy(toCopy.data, toCopy.data + m_size, data);
}
}
// Arena constructor for Ref types, which must have an Arena constructor
template <class T2 = T, VecSerStrategy S>
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<!memcpy_able<T2>::value, int>::type = 0)
VectorRef(Arena& p, const VectorRef<T, S>& toCopy, typename std::enable_if<flow_ref<T2>::value, int>::type = 0)
: VPS(), data((T*)new (p) uint8_t[sizeof(T) * toCopy.size()]), m_size(toCopy.size()), m_capacity(toCopy.size()) {
for (int i = 0; i < m_size; i++) {
auto ptr = new (&data[i]) T(p, toCopy[i]);
@ -920,7 +924,7 @@ public:
if (m_size + count > m_capacity) reallocate(p, m_size + count);
VPS::invalidate();
if (count > 0) {
memcpy(data + m_size, begin, sizeof(T) * count);
std::copy(begin, begin + count, data + m_size);
}
m_size += count;
}
@ -960,15 +964,15 @@ public:
if (size > m_capacity) reallocate(p, size);
}
// expectedSize() for non-Ref types, identified by memcpy_able
// expectedSize() for non-Ref types, identified by !flow_ref
template <class T2 = T>
typename std::enable_if<memcpy_able<T2>::value, size_t>::type expectedSize() const {
typename std::enable_if<!flow_ref<T2>::value, size_t>::type expectedSize() const {
return sizeof(T) * m_size;
}
// expectedSize() for Ref types, which must in turn have expectedSize() implemented.
template <class T2 = T>
typename std::enable_if<!memcpy_able<T2>::value, size_t>::type expectedSize() const {
typename std::enable_if<flow_ref<T2>::value, size_t>::type expectedSize() const {
size_t t = sizeof(T) * m_size;
for (int i = 0; i < m_size; i++) t += data[i].expectedSize();
return t;
@ -985,9 +989,9 @@ private:
void reallocate(Arena& p, int requiredCapacity) {
requiredCapacity = std::max(m_capacity * 2, requiredCapacity);
// SOMEDAY: Maybe we are right at the end of the arena and can expand cheaply
T* newData = (T*)new (p) uint8_t[requiredCapacity * sizeof(T)];
T* newData = new (p) T[requiredCapacity];
if (m_size > 0) {
memcpy(newData, data, m_size * sizeof(T));
std::move(data, data + m_size, newData);
}
data = newData;
m_capacity = requiredCapacity;

View File

@ -69,7 +69,7 @@ set(FLOW_SRCS
XmlTraceLogFormatter.cpp
XmlTraceLogFormatter.h
actorcompiler.h
crc32c.h
crc32c.h
crc32c.cpp
error_definitions.h
${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h
@ -77,14 +77,18 @@ set(FLOW_SRCS
flat_buffers.h
flow.cpp
flow.h
folly_memcpy.S
genericactors.actor.cpp
genericactors.actor.h
network.cpp
network.h
rte_memcpy.h
serialize.cpp
serialize.h
stacktrace.amalgamation.cpp
stacktrace.h
test_memcpy.cpp
test_memcpy_perf.cpp
version.cpp)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)

View File

@ -69,23 +69,33 @@ bool operator<(CompatibleWithKey const& l, KeyValueMapPair const& r) {
class IKeyValueContainer {
public:
typedef typename IndexedSet<KeyValueMapPair, uint64_t>::iterator iterator;
using const_iterator = IndexedSet<KeyValueMapPair, uint64_t>::const_iterator;
using iterator = IndexedSet<KeyValueMapPair, uint64_t>::iterator;
IKeyValueContainer() = default;
~IKeyValueContainer() = default;
bool empty() { return data.empty(); }
bool empty() const { return data.empty(); }
void clear() { return data.clear(); }
std::tuple<size_t, size_t, size_t> size() { return std::make_tuple(0, 0, 0); }
std::tuple<size_t, size_t, size_t> size() const { return std::make_tuple(0, 0, 0); }
const_iterator find(const StringRef& key) const { return data.find(key); }
iterator find(const StringRef& key) { return data.find(key); }
const_iterator begin() const { return data.begin(); }
iterator begin() { return data.begin(); }
const_iterator cbegin() const { return begin(); }
const_iterator end() const { return data.end(); }
iterator end() { return data.end(); }
const_iterator cend() const { return end(); }
const_iterator lower_bound(const StringRef& key) const { return data.lower_bound(key); }
iterator lower_bound(const StringRef& key) { return data.lower_bound(key); }
const_iterator upper_bound(const StringRef& key) const { return data.upper_bound(key); }
iterator upper_bound(const StringRef& key) { return data.upper_bound(key); }
iterator previous(iterator i) const { return data.previous(i); }
const_iterator previous(const_iterator i) const { return data.previous(i); }
const_iterator previous(iterator i) const { return data.previous(const_iterator{ i }); }
iterator previous(iterator i) { return data.previous(i); }
void erase(iterator begin, iterator end) { data.erase(begin, end); }
iterator insert(const StringRef& key, const StringRef& val, bool replaceExisting = true) {
@ -96,7 +106,8 @@ public:
return data.insert(pairs, replaceExisting);
}
uint64_t sumTo(iterator to) { return data.sumTo(to); }
uint64_t sumTo(const_iterator to) const { return data.sumTo(to); }
uint64_t sumTo(iterator to) const { return data.sumTo(const_iterator{ to }); }
static int getElementBytes() { return IndexedSet<KeyValueMapPair, uint64_t>::getElementBytes(); }

View File

@ -31,6 +31,7 @@
#include <cstring>
#include <deque>
#include <random>
#include <type_traits>
#include "flow/TreeBenchmark.h"
#include "flow/UnitTest.h"
template <class Node>
@ -204,18 +205,25 @@ TEST_CASE("/flow/IndexedSet/strings") {
template <typename K>
struct IndexedSetHarness {
using map = IndexedSet<K, int>;
using const_result = typename map::const_iterator;
using result = typename map::iterator;
using key_type = K;
map s;
void insert(K const& k) { s.insert(K(k), 1); }
result find(K const& k) const { return s.find(k); }
result not_found() const { return s.end(); }
result begin() const { return s.begin(); }
result end() const { return s.end(); }
result lower_bound(K const& k) const { return s.lower_bound(k); }
result upper_bound(K const& k) const { return s.upper_bound(k); }
const_result find(K const& k) const { return s.find(k); }
result find(K const& k) { return s.find(k); }
const_result not_found() const { return s.end(); }
result not_found() { return s.end(); }
const_result begin() const { return s.begin(); }
result begin() { return s.begin(); }
const_result end() const { return s.end(); }
result end() { return s.end(); }
const_result lower_bound(K const& k) const { return s.lower_bound(k); }
result lower_bound(K const& k) { return s.lower_bound(k); }
const_result upper_bound(K const& k) const { return s.upper_bound(k); }
result upper_bound(K const& k) { return s.upper_bound(k); }
void erase(K const& k) { s.erase(k); }
};
@ -494,4 +502,60 @@ TEST_CASE("/flow/IndexedSet/all numbers") {
return Void();
}
template <class T>
static constexpr bool is_const_ref_v = std::is_const_v<typename std::remove_reference_t<T>>;
TEST_CASE("/flow/IndexedSet/const_iterator") {
struct Key {
int key;
explicit Key(int key) : key(key) {}
};
struct Metric {
int metric;
explicit Metric(int metric) : metric(metric) {}
};
IndexedSet<int, int64_t> is;
for (int i = 0; i < 10; ++i) is.insert(i, 1);
IndexedSet<int, int64_t>& ncis = is;
static_assert(!is_const_ref_v<decltype(ncis)>);
static_assert(!is_const_ref_v<decltype(*ncis.begin())>);
static_assert(is_const_ref_v<decltype(*ncis.cbegin())>);
static_assert(!is_const_ref_v<decltype(*ncis.previous(ncis.end()))>);
static_assert(is_const_ref_v<decltype(*ncis.previous(ncis.cend()))>);
static_assert(!is_const_ref_v<decltype(*ncis.index(Metric{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.find(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.upper_bound(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.lower_bound(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.lastLessOrEqual(Key{ 5 }))>);
static_assert(!is_const_ref_v<decltype(*ncis.lastItem())>);
const IndexedSet<int, int64_t>& cis = is;
static_assert(is_const_ref_v<decltype(cis)>);
static_assert(is_const_ref_v<decltype(*cis.begin())>);
static_assert(is_const_ref_v<decltype(*cis.cbegin())>);
static_assert(is_const_ref_v<decltype(*cis.previous(cis.end()))>);
static_assert(is_const_ref_v<decltype(*cis.previous(cis.cend()))>);
static_assert(is_const_ref_v<decltype(*cis.previous(ncis.end()))>);
static_assert(is_const_ref_v<decltype(*cis.previous(ncis.cend()))>);
static_assert(is_const_ref_v<decltype(*cis.index(Metric{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.find(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.upper_bound(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.lower_bound(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.lastLessOrEqual(Key{ 5 }))>);
static_assert(is_const_ref_v<decltype(*cis.lastItem())>);
for (auto& val : ncis) {
static_assert(!is_const_ref_v<decltype(val)>);
}
for (const auto& val : ncis) {
static_assert(is_const_ref_v<decltype(val)>);
}
for (auto& val : cis) {
static_assert(is_const_ref_v<decltype(val)>);
}
return Void();
}
void forceLinkIndexedSetTests() {}

View File

@ -29,6 +29,7 @@
#include "flow/Error.h"
#include <deque>
#include <type_traits>
#include <vector>
// IndexedSet<T, Metric> is similar to a std::set<T>, with the following additional features:
@ -39,7 +40,6 @@
// - Search functions (find(), lower_bound(), etc) can accept a type comparable to T instead of T
// (e.g. StringRef when T is std::string or Standalone<StringRef>). This can save a lot of needless
// copying at query time for read-mostly sets with string keys.
// - iterators are not const; the responsibility of not changing the order lies with the caller
// - the size() function is missing; if the metric being used is a count sumTo(end()) will do instead
// A number of STL compatibility features are missing and should be added as needed.
// T must define operator <, which must define a total order. Unlike std::set,
@ -70,8 +70,10 @@ private: // Forward-declare IndexedSet::Node because Clang is much stricter abou
// combinations, but still take advantage of move constructors when available (or required).
template <class T_, class Metric_>
Node(T_&& data, Metric_&& m, Node* parent=0) : data(std::forward<T_>(data)), total(std::forward<Metric_>(m)), parent(parent), balance(0) {
child[0] = child[1] = NULL;
child[0] = child[1] = nullptr;
}
Node(Node const&) = delete;
Node& operator=(Node const&) = delete;
~Node(){
delete child[0];
delete child[1];
@ -84,35 +86,93 @@ private: // Forward-declare IndexedSet::Node because Clang is much stricter abou
Node *parent;
};
public:
struct iterator{
typename IndexedSet::Node *i;
iterator() : i(0) {};
iterator(typename IndexedSet::Node *n) : i(n) {};
T& operator*() { return i->data; };
T* operator->() { return &i->data; }
template <bool isConst>
struct IteratorImpl {
typename std::conditional_t<isConst, const IndexedSet::Node, IndexedSet::Node>* node;
explicit IteratorImpl<isConst>(const IteratorImpl<!isConst>& nonConstIter) : node(nonConstIter.node) {
static_assert(isConst);
}
explicit IteratorImpl(decltype(node) n = nullptr) : node(n){};
typename std::conditional_t<isConst, const T, T>& operator*() const { return node->data; }
typename std::conditional_t<isConst, const T, T>* operator->() const { return &node->data; }
void operator++();
void decrementNonEnd();
bool operator == ( const iterator& r ) const { return i == r.i; }
bool operator != ( const iterator& r ) const { return i != r.i; }
bool operator==(const IteratorImpl<isConst>& r) const { return node == r.node; }
bool operator!=(const IteratorImpl<isConst>& r) const { return node != r.node; }
// The following two methods are for memory storage engine (KeyValueStoreMemory class) use only,
// in order to have the same interface as radixtree
StringRef& getKey(uint8_t* dummyContent) const { return i->data.key; }
StringRef& getValue() const { return i->data.value; }
typename std::conditional_t<isConst, const StringRef, StringRef>& getKey(uint8_t* dummyContent) const {
return node->data.key;
}
typename std::conditional_t<isConst, const StringRef, StringRef>& getValue() const { return node->data.value; }
};
IndexedSet() : root(NULL) {};
template <bool isConst>
struct Impl {
using NodeT = std::conditional_t<isConst, const Node, Node>;
using IteratorT = IteratorImpl<isConst>;
using SetT = std::conditional_t<isConst, const IndexedSet<T, Metric>, IndexedSet<T, Metric>>;
static IteratorT begin(SetT&);
template <bool constIterator>
static IteratorImpl<isConst || constIterator> previous(SetT&, IteratorImpl<constIterator>);
template <class M>
static IteratorT index(SetT&, const M&);
template <class Key>
static IteratorT find(SetT&, const Key&);
template <class Key>
static IteratorT upper_bound(SetT&, const Key&);
template <class Key>
static IteratorT lower_bound(SetT&, const Key&);
template <class Key>
static IteratorT lastLessOrEqual(SetT&, const Key&);
static IteratorT lastItem(SetT&);
};
using ConstImpl = Impl<true>;
using NonConstImpl = Impl<false>;
public:
using iterator = IteratorImpl<false>;
using const_iterator = IteratorImpl<true>;
IndexedSet() : root(nullptr){};
~IndexedSet() { delete root; }
IndexedSet(IndexedSet&& r) BOOST_NOEXCEPT : root(r.root) { r.root = NULL; }
IndexedSet(IndexedSet&& r) BOOST_NOEXCEPT : root(r.root) { r.root = nullptr; }
IndexedSet& operator=(IndexedSet&& r) BOOST_NOEXCEPT { delete root; root = r.root; r.root = 0; return *this; }
iterator begin() const;
iterator end() const { return iterator(); }
iterator previous(iterator i) const;
iterator lastItem() const;
const_iterator begin() const { return ConstImpl::begin(*this); };
iterator begin() { return NonConstImpl::begin(*this); };
const_iterator cbegin() const { return begin(); }
const_iterator end() const { return const_iterator{}; }
iterator end() { return iterator{}; }
const_iterator cend() const { return end(); }
const_iterator previous(const_iterator i) const { return ConstImpl::previous(*this, i); }
const_iterator previous(iterator i) const { return ConstImpl::previous(*this, const_iterator{ i }); }
iterator previous(iterator i) { return NonConstImpl::previous(*this, i); }
const_iterator lastItem() const { return ConstImpl::lastItem(*this); }
iterator lastItem() { return NonConstImpl::lastItem(*this); }
bool empty() const { return !root; }
void clear() { delete root; root = NULL; }
void clear() {
delete root;
root = nullptr;
}
void swap( IndexedSet& r ) { std::swap( root, r.root ); }
// Place data in the set with the given metric. If an item equal to data is already in the set and,
@ -159,36 +219,78 @@ public:
// Returns x such that key==*x, or end()
template <class Key>
iterator find(const Key &key) const;
const_iterator find(const Key& key) const {
return ConstImpl::find(*this, key);
}
template <class Key>
iterator find(const Key& key) {
return NonConstImpl::find(*this, key);
}
// Returns the smallest x such that *x>=key, or end()
template <class Key>
iterator lower_bound(const Key &key) const;
const_iterator lower_bound(const Key& key) const {
return ConstImpl::lower_bound(*this, key);
}
template <class Key>
iterator lower_bound(const Key& key) {
return NonConstImpl::lower_bound(*this, key);
};
// Returns the smallest x such that *x>key, or end()
template <class Key>
iterator upper_bound(const Key &key) const;
const_iterator upper_bound(const Key& key) const {
return ConstImpl::upper_bound(*this, key);
}
template <class Key>
iterator upper_bound(const Key& key) {
return NonConstImpl::upper_bound(*this, key);
};
// Returns the largest x such that *x<=key, or end()
template <class Key>
iterator lastLessOrEqual( const Key &key ) const;
const_iterator lastLessOrEqual(const Key& key) const {
return ConstImpl::lastLessOrEqual(*this, key);
};
template <class Key>
iterator lastLessOrEqual(const Key& key) {
return NonConstImpl::lastLessOrEqual(*this, key);
}
// Returns smallest x such that sumTo(x+1) > metric, or end()
template <class M>
iterator index( M const& metric ) const;
const_iterator index(M const& metric) const {
return ConstImpl::index(*this, metric);
};
template <class M>
iterator index(M const& metric) {
return NonConstImpl::index(*this, metric);
}
// Return the metric inserted with item x
Metric getMetric(iterator x) const;
Metric getMetric(const_iterator x) const;
Metric getMetric(iterator x) const { return getMetric(const_iterator{ x }); }
// Return the sum of getMetric(x) for begin()<=x<to
Metric sumTo(iterator to) const;
Metric sumTo(const_iterator to) const;
Metric sumTo(iterator to) const { return sumTo(const_iterator{ to }); }
// Return the sum of getMetric(x) for begin<=x<end
Metric sumRange(iterator begin, iterator end) const { return sumTo(end) - sumTo(begin); }
Metric sumRange(const_iterator begin, const_iterator end) const { return sumTo(end) - sumTo(begin); }
Metric sumRange(iterator begin, iterator end) const {
return sumTo(const_iterator{ end }) - sumTo(const_iterator{ begin });
}
// Return the sum of getMetric(x) for all x s.t. begin <= *x && *x < end
template <class Key>
Metric sumRange(const Key& begin, const Key& end) const { return sumRange(lower_bound(begin), lower_bound(end)); }
template <class Key>
Metric sumRange(const Key& begin, const Key& end) const {
return sumRange(lower_bound(begin), lower_bound(end));
}
// Return the amount of memory used by an entry in the IndexedSet
static int getElementBytes() { return sizeof(Node); }
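Taken together, the methods in this hunk make IndexedSet usable as an order-statistics structure: every element carries a metric, sumTo/sumRange accumulate metrics over iterator ranges, and index() locates the element at a given cumulative metric. A minimal usage sketch, assuming only the interface shown in this diff:

#include "flow/IndexedSet.h"

void indexedSetMetricExample() {
    IndexedSet<int, int64_t> s;
    for (int i = 0; i < 10; ++i) s.insert(i, 1); // metric 1 per element, so sums behave like counts

    int64_t total = s.sumTo(s.end());            // 10: sum of all metrics
    auto third = s.index(int64_t(2));            // smallest x with sumTo(x+1) > 2, so *third == 2
    int64_t middle = s.sumRange(s.lower_bound(3), s.lower_bound(7)); // metrics of 3,4,5,6 -> 4
    (void)total; (void)third; (void)middle;
}

With a per-element metric of 1 these behave like rank queries; other metrics (for example, byte sizes) give the weighted analogues.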
@ -212,18 +314,25 @@ private:
newNode->parent = oldNode->parent;
}
template <int direction, bool isConst>
static void moveIteratorImpl(std::conditional_t<isConst, const Node, Node>*& node) {
if (node->child[0 ^ direction]) {
node = node->child[0 ^ direction];
while (node->child[1 ^ direction]) node = node->child[1 ^ direction];
} else {
while (node->parent && node->parent->child[0 ^ direction] == node) node = node->parent;
node = node->parent;
}
}
// direction 0 = left, 1 = right
template <int direction>
static void moveIterator(Node* &i){
if (i->child[0^direction]) {
i = i->child[0^direction];
while (i->child[1^direction])
i = i->child[1^direction];
} else {
while (i->parent && i->parent->child[0^direction] == i)
i = i->parent;
i = i->parent;
}
static void moveIterator(Node const*& node) {
moveIteratorImpl<direction, true>(node);
}
template <int direction>
static void moveIterator(Node*& node) {
moveIteratorImpl<direction, false>(node);
}
public: // but testonly
@ -284,12 +393,19 @@ template <class Key, class Value, class Pair = MapPair<Key,Value>, class Metric=
class Map {
public:
typedef typename IndexedSet<Pair,Metric>::iterator iterator;
typedef typename IndexedSet<Pair, Metric>::const_iterator const_iterator;
Map() {}
iterator begin() const { return set.begin(); }
iterator end() const { return set.end(); }
iterator lastItem() const { return set.lastItem(); }
iterator previous(iterator i) const { return set.previous(i); }
const_iterator begin() const { return set.begin(); }
iterator begin() { return set.begin(); }
const_iterator cbegin() const { return begin(); }
const_iterator end() const { return set.end(); }
iterator end() { return set.end(); }
const_iterator cend() const { return end(); }
const_iterator lastItem() const { return set.lastItem(); }
iterator lastItem() { return set.lastItem(); }
const_iterator previous(const_iterator i) const { return set.previous(i); }
iterator previous(iterator i) { return set.previous(i); }
bool empty() const { return set.empty(); }
Value& operator[]( const Key& key ) {
@ -317,18 +433,58 @@ public:
}
template <class KeyCompatible>
iterator find( KeyCompatible const& k ) const { return set.find(k); }
const_iterator find(KeyCompatible const& k) const {
return set.find(k);
}
template <class KeyCompatible>
iterator lower_bound( KeyCompatible const& k ) const { return set.lower_bound(k); }
iterator find(KeyCompatible const& k) {
return set.find(k);
}
template <class KeyCompatible>
iterator upper_bound( KeyCompatible const& k ) const { return set.upper_bound(k); }
const_iterator lower_bound(KeyCompatible const& k) const {
return set.lower_bound(k);
}
template <class KeyCompatible>
iterator lastLessOrEqual( KeyCompatible const& k ) const { return set.lastLessOrEqual(k); }
template <class M>
iterator index( M const& metric ) const { return set.index(metric); }
Metric getMetric(iterator x) const { return set.getMetric(x); }
Metric sumTo(iterator to) const { return set.sumTo(to); }
Metric sumRange(iterator begin, iterator end) const { return set.sumRange(begin,end); }
iterator lower_bound(KeyCompatible const& k) {
return set.lower_bound(k);
}
template <class KeyCompatible>
const_iterator upper_bound(KeyCompatible const& k) const {
return set.upper_bound(k);
}
template <class KeyCompatible>
iterator upper_bound(KeyCompatible const& k) {
return set.upper_bound(k);
}
template <class KeyCompatible>
const_iterator lastLessOrEqual(KeyCompatible const& k) const {
return set.lastLessOrEqual(k);
}
template <class KeyCompatible>
iterator lastLessOrEqual(KeyCompatible const& k) {
return set.lastLessOrEqual(k);
}
template <class M>
const_iterator index(M const& metric) const {
return set.index(metric);
}
template <class M>
iterator index(M const& metric) {
return set.index(metric);
}
Metric getMetric(const_iterator x) const { return set.getMetric(x); }
Metric getMetric(iterator x) const { return getMetric(const_iterator{ x }); }
Metric sumTo(const_iterator to) const { return set.sumTo(to); }
Metric sumTo(iterator to) const { return sumTo(const_iterator{ to }); }
Metric sumRange(const_iterator begin, const_iterator end) const { return set.sumRange(begin, end); }
Metric sumRange(iterator begin, iterator end) const { return set.sumRange(begin, end); }
template <class KeyCompatible>
Metric sumRange(const KeyCompatible& begin, const KeyCompatible& end) const { return set.sumRange(begin,end); }
@ -347,13 +503,15 @@ private:
/////////////////////// implementation //////////////////////////
template <class T, class Metric>
void IndexedSet<T,Metric>::iterator::operator++(){
moveIterator<1>(i);
template <bool isConst>
void IndexedSet<T, Metric>::IteratorImpl<isConst>::operator++() {
moveIterator<1>(node);
}
template <class T, class Metric>
void IndexedSet<T,Metric>::iterator::decrementNonEnd(){
moveIterator<0>(i);
template <bool isConst>
void IndexedSet<T, Metric>::IteratorImpl<isConst>::decrementNonEnd() {
moveIterator<0>(node);
}
template <class Node>
@ -578,28 +736,33 @@ Node* ISCommonSubtreeRoot(Node* first, Node* last) {
}
template <class T, class Metric>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::begin() const {
Node *x = root;
while (x && x->child[0])
x = x->child[0];
return x;
template <bool isConst>
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::begin(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self) {
NodeT* x = self.root;
while (x && x->child[0]) x = x->child[0];
return IteratorT{ x };
}
template <class T, class Metric>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::previous(typename IndexedSet<T,Metric>::iterator i) const {
if (i==end())
return lastItem();
template <bool isConst>
template <bool constIterator>
typename IndexedSet<T, Metric>::template IteratorImpl<isConst || constIterator>
IndexedSet<T, Metric>::Impl<isConst>::previous(IndexedSet<T, Metric>::Impl<isConst>::SetT& self,
IndexedSet<T, Metric>::IteratorImpl<constIterator> iter) {
if (iter == self.end()) return self.lastItem();
moveIterator<0>(i.i);
return i;
moveIterator<0>(iter.node);
return iter;
}
template <class T, class Metric>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::lastItem() const {
Node *x = root;
while (x && x->child[1])
x = x->child[1];
return x;
template <bool isConst>
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::lastItem(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self) {
NodeT* x = self.root;
while (x && x->child[1]) x = x->child[1];
return IteratorT{ x };
}
template <class T, class Metric> template<class T_, class Metric_>
@ -617,9 +780,9 @@ Metric IndexedSet<T,Metric>::addMetric(T_&& data, Metric_&& metric){
template <class T, class Metric> template<class T_, class Metric_>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data, Metric_&& metric, bool replaceExisting){
if (root == NULL){
if (root == nullptr) {
root = new Node(std::forward<T_>(data), std::forward<Metric_>(metric));
return root;
return iterator{ root };
}
Node *t = root;
int d; // direction
@ -642,7 +805,7 @@ typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data,
}
}
return returnNode;
return iterator{ returnNode };
}
d = cmp > 0;
Node *nextT = t->child[d];
@ -685,23 +848,23 @@ typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data,
t->total = t->total + metric;
}
return newNode;
return iterator{ newNode };
}
template <class T, class Metric>
int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVector, bool replaceExisting) {
int num_inserted = 0;
Node *blockStart = NULL;
Node *blockEnd = NULL;
Node* blockStart = nullptr;
Node* blockEnd = nullptr;
for(int i = 0; i < dataVector.size(); ++i) {
Metric metric = dataVector[i].second;
T data = std::move(dataVector[i].first);
int d = 1; // direction
if(blockStart == NULL || (blockEnd != NULL && data >= blockEnd->data)) {
blockEnd = NULL;
if (root == NULL) {
if (blockStart == nullptr || (blockEnd != nullptr && data >= blockEnd->data)) {
blockEnd = nullptr;
if (root == nullptr) {
root = new Node(std::move(data), metric);
num_inserted++;
blockStart = root;
@ -842,8 +1005,8 @@ Metric IndexedSet<T, Metric>::eraseHalf(Node* start, Node* end, int eraseDir, in
metricDelta = metricDelta - n->total;
n->parent = start->parent;
}
start->child[fromDir] = NULL;
start->child[fromDir] = nullptr;
toFree.push_back( start );
}
@ -874,13 +1037,13 @@ void IndexedSet<T,Metric>::erase( typename IndexedSet<T,Metric>::iterator begin,
// Removes all nodes in the set between first and last, inclusive.
// toFree is extended with the roots of completely removed subtrees.
ASSERT(!end.i || (begin.i && (::compare(*begin, *end) <= 0)));
ASSERT(!end.node || (begin.node && (::compare(*begin, *end) <= 0)));
if(begin == end)
return;
IndexedSet<T,Metric>::Node* first = begin.i;
IndexedSet<T,Metric>::Node* last = previous(end).i;
IndexedSet<T, Metric>::Node* first = begin.node;
IndexedSet<T, Metric>::Node* last = previous(end).node;
IndexedSet<T,Metric>::Node* subRoot = ISCommonSubtreeRoot(first, last);
@ -897,7 +1060,7 @@ void IndexedSet<T,Metric>::erase( typename IndexedSet<T,Metric>::iterator begin,
int heightDelta = leftHeightDelta + rightHeightDelta;
// Rebalance and update metrics for all nodes from subRoot up to the root
for(auto p = subRoot; p != NULL; p = p->parent) {
for (auto p = subRoot; p != nullptr; p = p->parent) {
p->total = p->total - metricDelta;
auto& pc = p->parent ? p->parent->child[p->parent->child[1]==p] : root;
@ -925,7 +1088,7 @@ void IndexedSet<T,Metric>::erase(iterator toErase) {
{
// Find the node to erase
Node* t = toErase.i;
Node* t = toErase.node;
if (!t) return;
if (!t->child[0] || !t->child[1]) {
@ -1005,101 +1168,106 @@ void IndexedSet<T,Metric>::erase(iterator toErase) {
// Returns x such that key==*x, or end()
template <class T, class Metric>
template <bool isConst>
template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::find(const Key &key) const {
Node* t = root;
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::find(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self, const Key& key) {
NodeT* t = self.root;
while (t){
int cmp = compare(key, t->data);
if (cmp == 0) return iterator(t);
if (cmp == 0) return IteratorT{ t };
t = t->child[cmp > 0];
}
return end();
return self.end();
}
// Returns the smallest x such that *x>=key, or end()
template <class T, class Metric>
template <bool isConst>
template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::lower_bound(const Key &key) const {
Node* t = root;
if (!t) return iterator();
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::lower_bound(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self, const Key& key) {
NodeT* t = self.root;
if (!t) return self.end();
bool less;
while (true) {
less = t->data < key;
Node* n = t->child[less];
NodeT* n = t->child[less];
if (!n) break;
t = n;
}
if (less) moveIterator<1>(t);
return iterator(t);
return IteratorT{ t };
}
// Returns the smallest x such that *x>key, or end()
template <class T, class Metric>
template <bool isConst>
template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::upper_bound(const Key &key) const {
Node* t = root;
if (!t) return iterator();
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::upper_bound(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self, const Key& key) {
NodeT* t = self.root;
if (!t) return self.end();
bool not_less;
while (true) {
not_less = !(key < t->data);
Node* n = t->child[not_less];
NodeT* n = t->child[not_less];
if (!n) break;
t = n;
}
if (not_less) moveIterator<1>(t);
return iterator(t);
return IteratorT{ t };
}
template <class T, class Metric>
template <bool isConst>
template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::lastLessOrEqual(const Key &key) const {
iterator i = upper_bound(key);
if (i == begin()) return end();
return previous(i);
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::lastLessOrEqual(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self, const Key& key) {
auto i = self.upper_bound(key);
if (i == self.begin()) return self.end();
return self.previous(i);
}
// Returns first x such that metric < sum(begin(), x+1), or end()
template <class T, class Metric>
template <bool isConst>
template <class M>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::index( M const& metric ) const
{
typename IndexedSet<T, Metric>::template Impl<isConst>::IteratorT IndexedSet<T, Metric>::Impl<isConst>::index(
IndexedSet<T, Metric>::Impl<isConst>::SetT& self, const M& metric) {
M m = metric;
Node* t = root;
NodeT* t = self.root;
while (t) {
if (t->child[0] && m < t->child[0]->total)
t = t->child[0];
else {
m = m - t->total;
if (t->child[1])
m = m + t->child[1]->total;
if (m < M())
return iterator(t);
if (t->child[1]) m = m + t->child[1]->total;
if (m < M()) return IteratorT{ t };
t = t->child[1];
}
}
return end();
return self.end();
}
template <class T, class Metric>
Metric IndexedSet<T,Metric>::getMetric(typename IndexedSet<T,Metric>::iterator x) const {
Metric m = x.i->total;
Metric IndexedSet<T, Metric>::getMetric(typename IndexedSet<T, Metric>::const_iterator x) const {
Metric m = x.node->total;
for(int i=0; i<2; i++)
if (x.i->child[i])
m = m - x.i->child[i]->total;
if (x.node->child[i]) m = m - x.node->child[i]->total;
return m;
}
template <class T, class Metric>
Metric IndexedSet<T,Metric>::sumTo(typename IndexedSet<T,Metric>::iterator end) const {
if (!end.i)
return root ? root->total : Metric();
Metric IndexedSet<T, Metric>::sumTo(typename IndexedSet<T, Metric>::const_iterator end) const {
if (!end.node) return root ? root->total : Metric();
Metric m = end.i->child[0] ? end.i->child[0]->total : Metric();
for(Node* p = end.i; p->parent; p=p->parent) {
Metric m = end.node->child[0] ? end.node->child[0]->total : Metric();
for (const Node* p = end.node; p->parent; p = p->parent) {
if (p->parent->child[1] == p) {
m = m - p->total;
m = m + p->parent->total;

View File

@ -296,6 +296,35 @@ public:
}
};
struct SendBufferIterator {
typedef boost::asio::const_buffer value_type;
typedef std::forward_iterator_tag iterator_category;
typedef size_t difference_type;
typedef boost::asio::const_buffer* pointer;
typedef boost::asio::const_buffer& reference;
SendBuffer const* p;
int limit;
SendBufferIterator(SendBuffer const* p=0, int limit = std::numeric_limits<int>::max()) : p(p), limit(limit) {
ASSERT(limit > 0);
}
bool operator == (SendBufferIterator const& r) const { return p == r.p; }
bool operator != (SendBufferIterator const& r) const { return p != r.p; }
void operator++() {
limit -= p->bytes_written - p->bytes_sent;
if(limit > 0)
p = p->next;
else
p = NULL;
}
boost::asio::const_buffer operator*() const {
return boost::asio::const_buffer( p->data + p->bytes_sent, std::min(limit, p->bytes_written - p->bytes_sent) );
}
};
class Connection : public IConnection, ReferenceCounted<Connection> {
public:
virtual void addref() { ReferenceCounted<Connection>::addref(); }
@ -420,35 +449,6 @@ private:
tcp::socket socket;
NetworkAddress peer_address;
struct SendBufferIterator {
typedef boost::asio::const_buffer value_type;
typedef std::forward_iterator_tag iterator_category;
typedef size_t difference_type;
typedef boost::asio::const_buffer* pointer;
typedef boost::asio::const_buffer& reference;
SendBuffer const* p;
int limit;
SendBufferIterator(SendBuffer const* p=0, int limit = std::numeric_limits<int>::max()) : p(p), limit(limit) {
ASSERT(limit > 0);
}
bool operator == (SendBufferIterator const& r) const { return p == r.p; }
bool operator != (SendBufferIterator const& r) const { return p != r.p; }
void operator++() {
limit -= p->bytes_written - p->bytes_sent;
if(limit > 0)
p = p->next;
else
p = NULL;
}
boost::asio::const_buffer operator*() const {
return boost::asio::const_buffer( p->data + p->bytes_sent, std::min(limit, p->bytes_written - p->bytes_sent) );
}
};
void init() {
// Socket settings that have to be set after connect or accept succeeds
socket.non_blocking(true);
@ -721,6 +721,10 @@ public:
// Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of bytes written (might be 0)
virtual int write( SendBuffer const* data, int limit ) {
#ifdef __APPLE__
// For some reason, writing more than 2016 bytes to ssl_sock when the socket is writable sometimes results in a broken pipe error.
limit = std::min(limit, 2016);
#endif
boost::system::error_code err;
++g_net2->countWrites;
@ -763,35 +767,6 @@ private:
NetworkAddress peer_address;
Reference<ReferencedObject<boost::asio::ssl::context>> sslContext;
struct SendBufferIterator {
typedef boost::asio::const_buffer value_type;
typedef std::forward_iterator_tag iterator_category;
typedef size_t difference_type;
typedef boost::asio::const_buffer* pointer;
typedef boost::asio::const_buffer& reference;
SendBuffer const* p;
int limit;
SendBufferIterator(SendBuffer const* p=0, int limit = std::numeric_limits<int>::max()) : p(p), limit(limit) {
ASSERT(limit > 0);
}
bool operator == (SendBufferIterator const& r) const { return p == r.p; }
bool operator != (SendBufferIterator const& r) const { return p != r.p; }
void operator++() {
limit -= p->bytes_written - p->bytes_sent;
if(limit > 0)
p = p->next;
else
p = NULL;
}
boost::asio::const_buffer operator*() const {
return boost::asio::const_buffer( p->data + p->bytes_sent, std::min(limit, p->bytes_written - p->bytes_sent) );
}
};
void init() {
// Socket settings that have to be set after connect or accept succeeds
socket.non_blocking(true);

View File

@@ -122,6 +122,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, BackupWorker);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, ReportConflictingKeys);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, SmallEndpoints);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, CacheRole);
};
// These impact both communications and the deserialization of certain database and IKeyValueStore keys.
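Each PROTOCOL_VERSION_FEATURE entry ties a named feature to the protocol version that introduced it, and version-gated code branches on the generated accessor rather than comparing raw version numbers. A hedged sketch of how the new entry would be used (hasCacheRole() follows the macro's usual naming; treat the accessor and surrounding names as assumptions):

    // Hypothetical fragment: only rely on storage-cache roles when the peer's
    // protocol version already includes the CacheRole feature.
    if (peerProtocolVersion.hasCacheRole()) {
        // route reads for cached ranges to cache servers
    } else {
        // pre-CacheRole behaviour: always use storage servers
    }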

View File

@@ -35,6 +35,7 @@ ERROR( end_of_stream, 1, "End of stream" )
ERROR( operation_failed, 1000, "Operation failed")
ERROR( wrong_shard_server, 1001, "Shard is not available from this server")
ERROR( operation_obsolete, 1002, "Operation result no longer necessary")
ERROR( cold_cache_server, 1003, "Cache server is not warm for this range")
ERROR( timed_out, 1004, "Operation timed out" )
ERROR( coordinated_state_conflict, 1005, "Conflict occurred while changing coordination information" )
ERROR( all_alternatives_failed, 1006, "All alternatives failed" )
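The ERROR table entry also defines the factory used to raise the error. A hedged sketch of how a cache server might surface the new code (cold_cache_server() follows the usual ERROR macro pattern; cacheIsWarmFor and req are hypothetical names):

    // Hypothetical fragment: refuse to serve a range the cache has not warmed yet,
    // so the client can retry against a storage server.
    if (!cacheIsWarmFor(req.keys)) {
        req.reply.sendError(cold_cache_server());
        return Void();
    }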

View File

@@ -21,9 +21,28 @@
#include "flow/flow.h"
#include "flow/DeterministicRandom.h"
#include "flow/UnitTest.h"
#include "flow/rte_memcpy.h"
#include "flow/folly_memcpy.h"
#include <stdarg.h>
#include <cinttypes>
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
// For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test.
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
return rte_memcpy(__dest, __src, __n);
}
// This compilation unit will be linked into the main binary, so this should override glibc memcpy
__attribute__((visibility ("default"))) void *memcpy (void *__restrict __dest, const void *__restrict __src, size_t __n) {
// folly_memcpy is faster for small copies, but rte seems to win out in most other circumstances
return rte_memcpy(__dest, __src, __n);
}
#else
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) {
return memcpy(__dest, __src, __n);
}
#endif // (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
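The override works through ordinary symbol interposition: a strong, default-visibility memcpy exported from the executable is found before the glibc definition, so calls that resolve through the dynamic symbol table land on rte_memcpy (libc-internal calls that go through hidden aliases are unaffected). A minimal standalone illustration of the mechanism, separate from the FDB build (a hypothetical toy program, not project code):

    #include <cstddef>
    #include <cstdio>

    static unsigned long g_calls = 0;   // bumped by the local override so main can show it ran

    extern "C" __attribute__((visibility("default")))
    void* memcpy(void* dst, const void* src, std::size_t n) noexcept {
        ++g_calls;
        auto* d = static_cast<unsigned char*>(dst);
        auto* s = static_cast<const unsigned char*>(src);
        for (std::size_t i = 0; i < n; ++i) d[i] = s[i];  // deliberately naive; FDB's override forwards to rte_memcpy
        return dst;
    }

    int main() {
        char buf[32] = {};
        memcpy(buf, "hello, interposed memcpy", 25);
        std::printf("%s (override called %lu times)\n", buf, g_calls);
    }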
INetwork *g_network = 0;
FILE* randLog = 0;

flow/folly_memcpy.S Normal file
View File

@@ -0,0 +1,178 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
* __AVX__ is defined, and uses SSE2 otherwise.
*
* @author Bin Liu <binliu@fb.com>
*/
#if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)
.file "memcpy.S"
.text
/*
* _memcpy_short is a local helper used when length < 8. It cannot be called
* from outside, because it expects a non-standard calling convention:
*
* %rax: destination buffer address.
* %rsi: source buffer address.
* %edx: length, in the range of [0, 7]
*/
.type _memcpy_short, @function
_memcpy_short:
.LSHORT:
.cfi_startproc
// if (length == 0) return;
test %edx, %edx
jz .LEND
movzbl (%rsi), %ecx
// if (length - 4 < 0) goto LS4;
sub $4, %edx
jb .LS4
mov (%rsi), %ecx
mov (%rsi, %rdx), %edi
mov %ecx, (%rax)
mov %edi, (%rax, %rdx)
.LEND:
rep
ret
nop
.LS4:
// At this point, length can be 1 or 2 or 3, and $cl contains
// the first byte.
mov %cl, (%rax)
// if (length - 4 + 2 < 0) return;
add $2, %edx
jnc .LEND
// length is 2 or 3 here. In either case, just copy the last
// two bytes.
movzwl (%rsi, %rdx), %ecx
mov %cx, (%rax, %rdx)
ret
.cfi_endproc
.size _memcpy_short, .-_memcpy_short
/*
* void* folly_memcpy(void* dst, const void* src, uint32_t length);
*
*/
.align 16
.globl folly_memcpy
.type folly_memcpy, @function
folly_memcpy:
.cfi_startproc
mov %rdx, %rcx
mov %rdi, %rax
cmp $8, %rdx
jb .LSHORT
mov -8(%rsi, %rdx), %r8
mov (%rsi), %r9
mov %r8, -8(%rdi, %rdx)
and $24, %rcx
jz .L32
mov %r9, (%rdi)
mov %rcx, %r8
sub $16, %rcx
jb .LT32
#ifndef __AVX__
movdqu (%rsi, %rcx), %xmm1
movdqu %xmm1, (%rdi, %rcx)
#else
vmovdqu (%rsi, %rcx), %xmm1
vmovdqu %xmm1, (%rdi, %rcx)
#endif
// Test if there are 32-byte groups
.LT32:
add %r8, %rsi
and $-32, %rdx
jnz .L32_adjDI
ret
.align 16
.L32_adjDI:
add %r8, %rdi
.L32:
#ifndef __AVX__
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
#else
vmovdqu (%rsi), %ymm0
#endif
shr $6, %rdx
jnc .L64_32read
#ifndef __AVX__
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
#else
vmovdqu %ymm0, (%rdi)
#endif
lea 32(%rsi), %rsi
jnz .L64_adjDI
#ifdef __AVX__
vzeroupper
#endif
ret
.L64_adjDI:
add $32, %rdi
.L64:
#ifndef __AVX__
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
#else
vmovdqu (%rsi), %ymm0
#endif
.L64_32read:
#ifndef __AVX__
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
add $64, %rsi
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
#else
vmovdqu 32(%rsi), %ymm1
add $64, %rsi
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, 32(%rdi)
#endif
add $64, %rdi
dec %rdx
jnz .L64
#ifdef __AVX__
vzeroupper
#endif
ret
.cfi_endproc
.size folly_memcpy, .-folly_memcpy
#endif
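Read as C, the structure of folly_memcpy above is: lengths under 8 finish in _memcpy_short with overlapping 1/2/4-byte stores; everything else writes the first and last 8 bytes up front, disposes of the ragged head, and then streams whole 32-byte groups (ymm moves under __AVX__, paired xmm moves otherwise). A hedged C++ rendering of that strategy, with plain std::memcpy standing in for the vector moves (a sketch, not a line-for-line translation of the assembly):

    #include <cstddef>
    #include <cstring>

    void* folly_memcpy_sketch(void* dst, const void* src, std::size_t n) {
        auto* d = static_cast<unsigned char*>(dst);
        auto* s = static_cast<const unsigned char*>(src);
        if (n < 8) {                                   // the _memcpy_short path
            if (n >= 4) {                              // overlapping 4-byte head and tail
                std::memcpy(d, s, 4);
                std::memcpy(d + n - 4, s + n - 4, 4);
            } else if (n > 0) {                        // 1..3 bytes: first byte, then last two
                d[0] = s[0];
                if (n > 1) std::memcpy(d + n - 2, s + n - 2, 2);
            }
            return dst;
        }
        std::memcpy(d, s, 8);                          // first 8 bytes
        std::memcpy(d + n - 8, s + n - 8, 8);          // last 8 bytes, may overlap the loop below
        std::size_t head = n & 31;                     // ragged head so the loop sees whole 32-byte groups
        std::memcpy(d, s, head);
        d += head; s += head; n -= head;
        while (n) {                                    // 32 bytes per step; the assembly unrolls to 64
            std::memcpy(d, s, 32);
            d += 32; s += 32; n -= 32;
        }
        return dst;
    }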

flow/folly_memcpy.h Normal file
View File

@@ -0,0 +1,33 @@
/*
* folly_memcpy.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLOW_FOLLY_MEMCPY_H
#define FLOW_FOLLY_MEMCPY_H
#pragma once
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
extern "C" {
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
#endif // linux or bsd and avx
#endif

flow/rte_memcpy.h Normal file
View File

@@ -0,0 +1,913 @@
/*
SPDX-License-Identifier: BSD-3-Clause
Copyright(c) 2010-2014 Intel Corporation
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_
/**
* @file
*
* Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <flow/Platform.h>
#if (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__)
#ifdef __cplusplus
extern "C" {
#endif
/**
* Copy bytes from one location to another. The locations must not overlap.
*
* @note This is implemented as a macro, so its address should not be taken
* and care is needed as parameter expressions may be evaluated multiple times.
*
* @param dst
* Pointer to the destination of the data.
* @param src
* Pointer to the source data.
* @param n
* Number of bytes to copy.
* @return
* Pointer to the destination data.
*/
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
#ifdef __AVX512F__
#define RTE_MACHINE_CPUFLAG_AVX512F
#elif defined(__AVX__)
#define RTE_MACHINE_CPUFLAG_AVX2
#endif
#ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_MASK 0x3F
/**
* AVX512 implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
ymm0 = _mm256_loadu_si256((const __m256i *)src);
_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
__m512i zmm0;
zmm0 = _mm512_loadu_si512((const void *)src);
_mm512_storeu_si512((void *)dst, zmm0);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
rte_mov64(dst + 1 * 64, src + 1 * 64);
}
/**
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov64(dst + 0 * 64, src + 0 * 64);
rte_mov64(dst + 1 * 64, src + 1 * 64);
rte_mov64(dst + 2 * 64, src + 2 * 64);
rte_mov64(dst + 3 * 64, src + 3 * 64);
}
/**
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1;
while (n >= 128) {
zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
n -= 128;
zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
src = src + 128;
_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
dst = dst + 128;
}
}
/**
* Copy 512-byte blocks from one location to another,
* locations should not overlap.
*/
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
while (n >= 512) {
zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
n -= 512;
zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
src = src + 512;
_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
dst = dst + 512;
}
}
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08)
*(uint64_t *)dstu = *(const uint64_t *)srcu;
return ret;
}
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n <= 512) {
if (n >= 256) {
n -= 256;
rte_mov256((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 256;
dst = (uint8_t *)dst + 256;
}
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK63:
if (n > 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
if (n > 0)
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
/**
* Make store aligned when copy size exceeds 512 bytes
*/
dstofss = ((uintptr_t)dst & 0x3F);
if (dstofss > 0) {
dstofss = 64 - dstofss;
n -= dstofss;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
/**
* Copy 512-byte blocks.
* Use copy block function for better instruction order control,
* which is important when load is unaligned.
*/
rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 511;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
/**
* Copy 128-byte blocks.
* Use copy block function for better instruction order control,
* which is important when load is unaligned.
*/
if (n >= 128) {
rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
}
/**
* Copy whatever left
*/
goto COPY_BLOCK_128_BACK63;
}
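A worked example of the block-trimming arithmetic above, with a hypothetical n = 1300: rte_mov512blocks copies two 512-byte blocks (1024 bytes); n & 511 leaves n = 276 and bits = 1300 - 276 = 1024, so src and dst advance by 1024. The 128-byte stage then copies another 256 bytes, leaving n = 276 & 127 = 20, and the jump to COPY_BLOCK_128_BACK63 finishes with a single backward-overlapping rte_mov64 that ends exactly at the last byte.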
#elif defined RTE_MACHINE_CPUFLAG_AVX2
#define ALIGNMENT_MASK 0x1F
/**
* AVX2 implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
__m256i ymm0;
ymm0 = _mm256_loadu_si256((const __m256i *)src);
_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}
/**
* Copy 128-byte blocks from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
__m256i ymm0, ymm1, ymm2, ymm3;
while (n >= 128) {
ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
n -= 128;
ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
src = (const uint8_t *)src + 128;
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
dst = (uint8_t *)dst + 128;
}
}
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
*(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
/**
* Fast way when copy size doesn't exceed 256 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 48) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n <= 256) {
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK31:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
if (n > 32) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
if (n > 0) {
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
}
return ret;
}
/**
* Make store aligned when copy size exceeds 256 bytes
*/
dstofss = (uintptr_t)dst & 0x1F;
if (dstofss > 0) {
dstofss = 32 - dstofss;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
/**
* Copy 128-byte blocks
*/
rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
bits = n;
n = n & 127;
bits -= n;
src = (const uint8_t *)src + bits;
dst = (uint8_t *)dst + bits;
/**
* Copy whatever left
*/
goto COPY_BLOCK_128_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */
#define ALIGNMENT_MASK 0x0F
/**
* SSE & AVX implementation below
*/
/**
* Copy 16 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
__m128i xmm0;
xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
* Copy 32 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}
/**
* Copy 128 bytes from one location to another,
* locations should not overlap.
*/
static force_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}
/**
* Copy 256 bytes from one location to another,
* locations should not overlap.
*/
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
/**
* Macro for copying unaligned block from one location to another with constant load offset,
* 47 bytes leftover maximum,
* locations should not overlap.
* Requirements:
* - Store is aligned
* - Load offset is <offset>, which must be immediate value within [1, 15]
* - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
* - <dst>, <src>, <len> must be variables
* - __m128i <xmm0> ~ <xmm8> must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
size_t tmp; \
while (len >= 128 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 128; \
xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
src = (const uint8_t *)src + 128; \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
dst = (uint8_t *)dst + 128; \
} \
tmp = len; \
len = ((len - 16 + offset) & 127) + 16 - offset; \
tmp -= len; \
src = (const uint8_t *)src + tmp; \
dst = (uint8_t *)dst + tmp; \
if (len >= 32 + 16 - offset) { \
while (len >= 32 + 16 - offset) { \
xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
len -= 32; \
xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
src = (const uint8_t *)src + 32; \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
dst = (uint8_t *)dst + 32; \
} \
tmp = len; \
len = ((len - 16 + offset) & 31) + 16 - offset; \
tmp -= len; \
src = (const uint8_t *)src + tmp; \
dst = (uint8_t *)dst + tmp; \
} \
})
/**
* Macro for copying unaligned block from one location to another,
* 47 bytes leftover maximum,
* locations should not overlap.
* Use switch here because the aligning instruction requires immediate value for shift count.
* Requirements:
* - Store is aligned
* - Load offset is <offset>, which must be within [1, 15]
* - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
* - <dst>, <src>, <len> must be variables
* - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
*/
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
switch (offset) { \
case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
default:; \
} \
})
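Both macros above exist because _mm_alignr_epi8 takes its byte shift as an immediate, so the outer switch expands one copy of the inner loop per possible srcofs value. The inner loop's trick: since offset equals src & 0x0F, the address src - offset is 16-byte aligned, every _mm_loadu_si128 therefore hits an aligned address, and palignr stitches neighbouring loads back together. Worked example for offset = 4: xmm0 holds src[-4..11], xmm1 holds src[12..27], and _mm_alignr_epi8(xmm1, xmm0, 4) yields exactly src[0..15], which is then stored to the already-aligned destination.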
static force_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t srcofs;
/**
* Copy less than 16 bytes
*/
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dstu = *(const uint8_t *)srcu;
srcu = (uintptr_t)((const uint8_t *)srcu + 1);
dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
*(uint16_t *)dstu = *(const uint16_t *)srcu;
srcu = (uintptr_t)((const uint16_t *)srcu + 1);
dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
*(uint32_t *)dstu = *(const uint32_t *)srcu;
srcu = (uintptr_t)((const uint32_t *)srcu + 1);
dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
*(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
/**
* Fast way when copy size doesn't exceed 512 bytes
*/
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 48) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n <= 128) {
goto COPY_BLOCK_128_BACK15;
}
if (n <= 512) {
if (n >= 256) {
n -= 256;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
src = (const uint8_t *)src + 256;
dst = (uint8_t *)dst + 256;
}
COPY_BLOCK_255_BACK15:
if (n >= 128) {
n -= 128;
rte_mov128((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 128;
dst = (uint8_t *)dst + 128;
}
COPY_BLOCK_128_BACK15:
if (n >= 64) {
n -= 64;
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 64;
dst = (uint8_t *)dst + 64;
}
COPY_BLOCK_64_BACK15:
if (n >= 32) {
n -= 32;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + 32;
dst = (uint8_t *)dst + 32;
}
if (n > 16) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
return ret;
}
if (n > 0) {
rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
}
return ret;
}
/**
* Make store aligned when copy size exceeds 512 bytes,
* and make sure the first 15 bytes are copied, because
* unaligned copy functions require up to 15 bytes
* backwards access.
*/
dstofss = (uintptr_t)dst & 0x0F;
if (dstofss > 0) {
dstofss = 16 - dstofss + 16;
n -= dstofss;
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
src = (const uint8_t *)src + dstofss;
dst = (uint8_t *)dst + dstofss;
}
srcofs = ((uintptr_t)src & 0x0F);
/**
* For aligned copy
*/
if (srcofs == 0) {
/**
* Copy 256-byte blocks
*/
for (; n >= 256; n -= 256) {
rte_mov256((uint8_t *)dst, (const uint8_t *)src);
dst = (uint8_t *)dst + 256;
src = (const uint8_t *)src + 256;
}
/**
* Copy whatever left
*/
goto COPY_BLOCK_255_BACK15;
}
/**
* For copy with unaligned load
*/
MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
/**
* Copy whatever left
*/
goto COPY_BLOCK_64_BACK15;
}
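The dstofss adjustment above (16 - dstofss + 16) does two jobs at once: after rte_mov32 copies the first 32 bytes and both pointers advance by dstofss, the destination is 16-byte aligned and both pointers have moved forward by at least 16, so the up-to-15-byte backward source loads in MOVEUNALIGNED_LEFT47 stay inside the caller's buffer. Worked example for a hypothetical destination with (dst & 0x0F) == 5: dstofss = 16 - 5 + 16 = 27, rte_mov32 writes bytes [0, 32), and the copy resumes at offset 27 with the destination now on a 16-byte boundary.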
#endif /* RTE_MACHINE_CPUFLAG */
static force_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
void *ret = dst;
/* Copy size <= 16 bytes */
if (n < 16) {
if (n & 0x01) {
*(uint8_t *)dst = *(const uint8_t *)src;
src = (const uint8_t *)src + 1;
dst = (uint8_t *)dst + 1;
}
if (n & 0x02) {
*(uint16_t *)dst = *(const uint16_t *)src;
src = (const uint16_t *)src + 1;
dst = (uint16_t *)dst + 1;
}
if (n & 0x04) {
*(uint32_t *)dst = *(const uint32_t *)src;
src = (const uint32_t *)src + 1;
dst = (uint32_t *)dst + 1;
}
if (n & 0x08)
*(uint64_t *)dst = *(const uint64_t *)src;
return ret;
}
/* Copy 16 <= size <= 32 bytes */
if (n <= 32) {
rte_mov16((uint8_t *)dst, (const uint8_t *)src);
rte_mov16((uint8_t *)dst - 16 + n,
(const uint8_t *)src - 16 + n);
return ret;
}
/* Copy 32 < size <= 64 bytes */
if (n <= 64) {
rte_mov32((uint8_t *)dst, (const uint8_t *)src);
rte_mov32((uint8_t *)dst - 32 + n,
(const uint8_t *)src - 32 + n);
return ret;
}
/* Copy 64 bytes blocks */
for (; n >= 64; n -= 64) {
rte_mov64((uint8_t *)dst, (const uint8_t *)src);
dst = (uint8_t *)dst + 64;
src = (const uint8_t *)src + 64;
}
/* Copy whatever left */
rte_mov64((uint8_t *)dst - 64 + n,
(const uint8_t *)src - 64 + n);
return ret;
}
static force_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
return rte_memcpy_aligned(dst, src, n);
else
return rte_memcpy_generic(dst, src, n);
}
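The dispatch above only takes rte_memcpy_aligned when both pointers share the build's vector alignment; ORing the two addresses lets a single mask test cover both. A worked example with hypothetical addresses under the AVX2 build (ALIGNMENT_MASK = 0x1F): dst = 0x7f0000001040 and src = 0x7f00000020c0 give (dst | src) & 0x1F == 0, so the aligned path runs; if src were 0x7f00000020c4 instead, the test would yield 0x04 and rte_memcpy_generic would handle the copy.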
static inline uint64_t
rte_rdtsc(void)
{
union {
uint64_t tsc_64;
struct {
uint32_t lo_32;
uint32_t hi_32;
};
} tsc;
asm volatile("rdtsc" :
"=a" (tsc.lo_32),
"=d" (tsc.hi_32));
return tsc.tsc_64;
}
#ifdef __cplusplus
}
#endif
#endif /* (defined (__linux__) || defined (__FreeBSD__)) && defined(__AVX__) */
#endif /* _RTE_MEMCPY_X86_64_H_ */

flow/test_memcpy.cpp Normal file
View File

@@ -0,0 +1,119 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "flow/folly_memcpy.h"
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
*/
#define TEST_VALUE_RANGE 0
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
2048, 3072, 4096, 5120, 6144, 7168, 8192
};
/* MUST be as large as largest packet size above */
#define SMALL_BUFFER_SIZE 8192
#else /* TEST_VALUE_RANGE != 0 */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
/* Data is aligned on this many bytes (power of 2) */
#define ALIGNMENT_UNIT 32
/*
* Create two buffers, and initialise one with random values. These are copied
* to the second buffer and then compared to see if the copy was successful.
* The bytes outside the copied area are also checked to make sure they were not
* changed.
*/
static int
test_single_memcpy(unsigned int off_src, unsigned int off_dst, size_t size)
{
unsigned int i;
uint8_t dest[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
uint8_t src[SMALL_BUFFER_SIZE + ALIGNMENT_UNIT];
void * ret;
/* Setup buffers */
for (i = 0; i < SMALL_BUFFER_SIZE + ALIGNMENT_UNIT; i++) {
dest[i] = 0;
src[i] = (uint8_t) deterministicRandom()->randomUInt32();
}
/* Do the copy */
ret = memcpy(dest + off_dst, src + off_src, size);
if (ret != (dest + off_dst)) {
printf("memcpy() returned %p, not %p\n",
ret, dest + off_dst);
}
/* Check nothing before offset is affected */
for (i = 0; i < off_dst; i++) {
if (dest[i] != 0) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[modified before start of dst].\n",
(unsigned)size, off_src, off_dst);
return -1;
}
}
/* Check everything was copied */
for (i = 0; i < size; i++) {
if (dest[i + off_dst] != src[i + off_src]) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[didn't copy byte %u].\n",
(unsigned)size, off_src, off_dst, i);
return -1;
}
}
/* Check nothing after copy was affected */
for (i = size; i < SMALL_BUFFER_SIZE; i++) {
if (dest[i + off_dst] != 0) {
printf("memcpy() failed for %u bytes (offsets=%u,%u): "
"[copied too many].\n",
(unsigned)size, off_src, off_dst);
return -1;
}
}
return 0;
}
/*
* Check functionality for various buffer sizes and data offsets/alignments.
*/
TEST_CASE("/rte/memcpy") {
unsigned int off_src, off_dst, i;
unsigned int num_buf_sizes = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
int ret;
for (off_src = 0; off_src < ALIGNMENT_UNIT; off_src++) {
for (off_dst = 0; off_dst < ALIGNMENT_UNIT; off_dst++) {
for (i = 0; i < num_buf_sizes; i++) {
ret = test_single_memcpy(off_src, off_dst,
buf_sizes[i]);
ASSERT(ret == 0);
}
}
}
return Void();
}
void forceLinkMemcpyTests() { }

flow/test_memcpy_perf.cpp Normal file
View File

@@ -0,0 +1,357 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2014 Intel Corporation
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "flow/rte_memcpy.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
#include "flow/flow.h"
#if (defined (__linux__) || defined (__FreeBSD__)) && defined (__AVX__)
extern "C" {
void* folly_memcpy(void* dst, const void* src, uint32_t length);
}
void * rte_memcpy_noinline(void* dst, const void* src, size_t length); // for performance comparisons
/*
* Set this to the maximum buffer size you want to test. If it is 0, then the
* values in the buf_sizes[] array below will be used.
*/
#define TEST_VALUE_RANGE 0
/* List of buffer sizes to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
};
/* MUST be as large as largest packet size above */
#define SMALL_BUFFER_SIZE 8192
#else /* TEST_VALUE_RANGE != 0 */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
/*
* Arrays of this size are used for measuring uncached memory accesses by
* picking a random location within the buffer. Make this smaller if there are
* memory allocation errors.
*/
#define LARGE_BUFFER_SIZE (100 * 1024 * 1024)
/* How many times to run timing loop for performance tests */
#define TEST_ITERATIONS 1000000
#define TEST_BATCH_SIZE 100
/* Data is aligned on this many bytes (power of 2) */
// #ifdef RTE_MACHINE_CPUFLAG_AVX512F
#define ALIGNMENT_UNIT 64
// #elif defined RTE_MACHINE_CPUFLAG_AVX2
// #define ALIGNMENT_UNIT 32
// #else /* RTE_MACHINE_CPUFLAG */
// #define ALIGNMENT_UNIT 16
// #endif /* RTE_MACHINE_CPUFLAG */
/*
* Pointers used in performance tests. The two large buffers are for uncached
* access where random addresses within the buffer are used for each
* memcpy. The two small buffers are for cached access.
*/
static uint8_t *large_buf_read, *large_buf_write;
static uint8_t *small_buf_read, *small_buf_write;
static size_t round_up(size_t sz, size_t alignment) {
return (((sz - 1) / alignment) + 1) * alignment;
}
static uint8_t * rte_malloc(char const * ignored, size_t sz, size_t align) {
return (uint8_t*) aligned_alloc(align, round_up(sz, align));
}
static void rte_free(void * ptr) {
if (!!ptr) {
free(ptr);
}
}
/* Initialise data buffers. */
static int
init_buffers(void)
{
unsigned i;
large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_read == NULL)
goto error_large_buf_read;
large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_write == NULL)
goto error_large_buf_write;
small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_read == NULL)
goto error_small_buf_read;
small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_write == NULL)
goto error_small_buf_write;
for (i = 0; i < LARGE_BUFFER_SIZE; i++)
large_buf_read[i] = deterministicRandom()->randomUInt32();
for (i = 0; i < SMALL_BUFFER_SIZE; i++)
small_buf_read[i] = deterministicRandom()->randomUInt32();
return 0;
error_small_buf_write:
rte_free(small_buf_read);
error_small_buf_read:
rte_free(large_buf_write);
error_large_buf_write:
rte_free(large_buf_read);
error_large_buf_read:
printf("ERROR: not enough memory\n");
return -1;
}
/* Cleanup data buffers */
static void
free_buffers(void)
{
rte_free(large_buf_read);
rte_free(large_buf_write);
rte_free(small_buf_read);
rte_free(small_buf_write);
}
/*
* Get a random offset into large array, with enough space needed to perform
* max copy size. Offset is aligned, uoffset is used for unalignment setting.
*/
static inline size_t
get_rand_offset(size_t uoffset)
{
return ((deterministicRandom()->randomUInt32() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
~(ALIGNMENT_UNIT - 1)) + uoffset;
}
/* Fill in source and destination addresses. */
static inline void
fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
size_t *src_addr, int is_src_cached, size_t src_uoffset)
{
unsigned int i;
for (i = 0; i < TEST_BATCH_SIZE; i++) {
dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
}
}
/*
* WORKAROUND: For some reason the first test doing an uncached write
* takes a very long time (~25 times longer than is expected). So we do
* it once without timing.
*/
static void
do_uncached_write(uint8_t *dst, int is_dst_cached,
const uint8_t *src, int is_src_cached, size_t size)
{
unsigned i, j;
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];
for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
fill_addr_arrays(dst_addrs, is_dst_cached, 0,
src_addrs, is_src_cached, 0);
for (j = 0; j < TEST_BATCH_SIZE; j++) {
memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
}
}
}
/*
* Run a single memcpy performance test. This is a macro to ensure that if
* the "size" parameter is a constant it won't be converted to a variable.
*/
#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset, \
src, is_src_cached, src_uoffset, size) \
do { \
unsigned int iter, t; \
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE]; \
uint64_t start_time, total_time = 0; \
uint64_t total_time2 = 0; \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
rte_memcpy_noinline(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time += rte_rdtsc() - start_time; \
} \
for (iter = 0; iter < (TEST_ITERATIONS / TEST_BATCH_SIZE); iter++) { \
fill_addr_arrays(dst_addrs, is_dst_cached, dst_uoffset, \
src_addrs, is_src_cached, src_uoffset); \
start_time = rte_rdtsc(); \
for (t = 0; t < TEST_BATCH_SIZE; t++) \
memcpy(dst+dst_addrs[t], src+src_addrs[t], size); \
total_time2 += rte_rdtsc() - start_time; \
} \
printf("%3.0f -", (double)total_time / TEST_ITERATIONS); \
printf("%3.0f", (double)total_time2 / TEST_ITERATIONS); \
printf("(%6.2f%%) ", ((double)total_time - total_time2)*100/total_time2); \
} while (0)
/* Run aligned memcpy tests for each cached/uncached permutation */
#define ALL_PERF_TESTS_FOR_SIZE(n) \
do { \
if (__builtin_constant_p(n)) \
printf("\nC%6u", (unsigned)n); \
else \
printf("\n%7u", (unsigned)n); \
SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, n); \
SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, n); \
} while (0)
/* Run unaligned memcpy tests for each cached/uncached permutation */
#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n) \
do { \
if (__builtin_constant_p(n)) \
printf("\nC%6u", (unsigned)n); \
else \
printf("\n%7u", (unsigned)n); \
SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, n); \
SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, n); \
SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, n); \
} while (0)
/* Run memcpy tests for constant length */
#define ALL_PERF_TEST_FOR_CONSTANT \
do { \
TEST_CONSTANT(6U); TEST_CONSTANT(64U); TEST_CONSTANT(128U); \
TEST_CONSTANT(192U); TEST_CONSTANT(256U); TEST_CONSTANT(512U); \
TEST_CONSTANT(768U); TEST_CONSTANT(1024U); TEST_CONSTANT(1536U); \
} while (0)
/* Run all memcpy tests for aligned constant cases */
static inline void
perf_test_constant_aligned(void)
{
#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
ALL_PERF_TEST_FOR_CONSTANT;
#undef TEST_CONSTANT
}
/* Run all memcpy tests for unaligned constant cases */
static inline void
perf_test_constant_unaligned(void)
{
#define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
ALL_PERF_TEST_FOR_CONSTANT;
#undef TEST_CONSTANT
}
/* Run all memcpy tests for aligned variable cases */
static inline void
perf_test_variable_aligned(void)
{
unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
unsigned i;
for (i = 0; i < n; i++) {
ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
}
}
/* Run all memcpy tests for unaligned variable cases */
static inline void
perf_test_variable_unaligned(void)
{
unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
unsigned i;
for (i = 0; i < n; i++) {
ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
}
}
/* Run all memcpy tests */
TEST_CASE("performance/memcpy/rte") {
int ret;
struct timeval tv_begin, tv_end;
double time_aligned, time_unaligned;
double time_aligned_const, time_unaligned_const;
ret = init_buffers();
ASSERT(ret == 0);
#if TEST_VALUE_RANGE != 0
/* Set up buf_sizes array, if required */
unsigned i;
for (i = 0; i < TEST_VALUE_RANGE; i++)
buf_sizes[i] = i;
#endif
/* See function comment */
do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE);
printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n"
"======= ================= ================= ================= =================\n"
" Size Cache to cache Cache to mem Mem to cache Mem to mem\n"
"(bytes) (ticks) (ticks) (ticks) (ticks)\n"
"------- ----------------- ----------------- ----------------- -----------------");
printf("\n================================= %2dB aligned =================================",
ALIGNMENT_UNIT);
/* Do aligned tests where size is a variable */
gettimeofday(&tv_begin, NULL);
perf_test_variable_aligned();
gettimeofday(&tv_end, NULL);
time_aligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n------- ----------------- ----------------- ----------------- -----------------");
/* Do aligned tests where size is a compile-time constant */
gettimeofday(&tv_begin, NULL);
perf_test_constant_aligned();
gettimeofday(&tv_end, NULL);
time_aligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n================================== Unaligned ==================================");
/* Do unaligned tests where size is a variable */
gettimeofday(&tv_begin, NULL);
perf_test_variable_unaligned();
gettimeofday(&tv_end, NULL);
time_unaligned = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n------- ----------------- ----------------- ----------------- -----------------");
/* Do unaligned tests where size is a compile-time constant */
gettimeofday(&tv_begin, NULL);
perf_test_constant_unaligned();
gettimeofday(&tv_end, NULL);
time_unaligned_const = (double)(tv_end.tv_sec - tv_begin.tv_sec)
+ ((double)tv_end.tv_usec - tv_begin.tv_usec)/1000000;
printf("\n======= ================= ================= ================= =================\n\n");
printf("Test Execution Time (seconds):\n");
printf("Aligned variable copy size = %8.3f\n", time_aligned);
printf("Aligned constant copy size = %8.3f\n", time_aligned_const);
printf("Unaligned variable copy size = %8.3f\n", time_unaligned);
printf("Unaligned constant copy size = %8.3f\n", time_unaligned_const);
free_buffers();
return Void();
}
#endif // (defined (__linux__) || defined (__FreeBSD__)) && defined (__AVX__)
void forceLinkMemcpyPerfTests() {}

View File

@@ -32,7 +32,7 @@
<Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
<Product Name='$(var.Title)'
Id='{3B1F197F-A65C-401D-AEAA-2C37CFEAF2F9}'
Id='{D1DF8A00-7A76-448F-AA71-8734A5D8F7D3}'
UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
Version='$(var.Version)'
Manufacturer='$(var.Manufacturer)'

View File

@@ -110,6 +110,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/BackupCorrectnessClean.txt)
add_fdb_test(TEST_FILES fast/BackupToDBCorrectness.txt)
add_fdb_test(TEST_FILES fast/BackupToDBCorrectnessClean.txt)
add_fdb_test(TEST_FILES fast/CacheTest.txt)
add_fdb_test(TEST_FILES fast/CloggedSideband.txt)
add_fdb_test(TEST_FILES fast/ConfigureLocked.txt)
add_fdb_test(TEST_FILES fast/ConstrainedRandomSelector.txt)
@@ -246,6 +247,9 @@ if(WITH_PYTHON)
verify_testing()
if (NOT OPEN_FOR_IDE AND NOT WIN32)
create_test_package()
create_correctness_package()
if (USE_VALGRIND)
create_valgrind_correctness_package()
endif()
endif()
endif()

tests/fast/CacheTest.txt Normal file
View File

@@ -0,0 +1,11 @@
testTitle=Cached
testName=Cache
keyPrefix=foo/

testTitle=Cycle
testName=Cycle
transactionsPerSecond=2500.0
testDuration=10.0
expectedRate=0.80
keyPrefix=foo/
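The two testTitle blocks are separate tests run in order: the first starts the Cache workload against the foo/ prefix (presumably marking that range as served by the new storage-cache role introduced elsewhere in this change), and the second runs the standard Cycle workload on the same prefix at 2500 transactions per second for 10 seconds, requiring at least 80% of the expected rate.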