Merge branch 'feature-tree-broadcast' into feature-small-endpoint

This commit is contained in:
Evan Tschannen 2020-04-17 17:16:04 -07:00
commit 0ee62badcd
74 changed files with 1443 additions and 525 deletions

View File

@ -27,7 +27,7 @@ package fdb
import "C"
import (
"runtime"
"sync"
)
// Database is a handle to a FoundationDB database. Database is a lightweight
@ -74,13 +74,14 @@ func (d Database) CreateTransaction() (Transaction, error) {
return Transaction{}, Error{int(err)}
}
t := &transaction{outt, d}
runtime.SetFinalizer(t, (*transaction).destroy)
t := &transaction{outt, d, sync.Once{}}
return Transaction{t}, nil
}
func retryable(wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) {
func retryable(t Transaction, wrapped func() (interface{}, error), onError func(Error) FutureNil) (ret interface{}, e error) {
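// retryable takes ownership of t and closes it, releasing the native transaction, once the retry loop returns.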
defer t.Close()
for {
ret, e = wrapped()
@ -140,7 +141,7 @@ func (d Database) Transact(f func(Transaction) (interface{}, error)) (interface{
return
}
return retryable(wrapped, tr.OnError)
return retryable(tr, wrapped, tr.OnError)
}
// ReadTransact runs a caller-provided function inside a retry loop, providing
@ -180,7 +181,7 @@ func (d Database) ReadTransact(f func(ReadTransaction) (interface{}, error)) (in
return
}
return retryable(wrapped, tr.OnError)
return retryable(tr, wrapped, tr.OnError)
}
// Options returns a DatabaseOptions instance suitable for setting options

View File

@ -417,6 +417,7 @@ func (dl directoryLayer) subdirNames(rtr fdb.ReadTransaction, node subspace.Subs
rr := rtr.GetRange(sd, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
var ret []string
@ -442,6 +443,7 @@ func (dl directoryLayer) subdirNodes(tr fdb.Transaction, node subspace.Subspace)
rr := tr.GetRange(sd, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
var ret []subspace.Subspace

View File

@ -246,6 +246,7 @@ func ExampleRangeIterator() {
rr := tr.GetRange(fdb.KeyRange{fdb.Key(""), fdb.Key{0xFF}}, fdb.RangeOptions{})
ri := rr.Iterator()
defer ri.Close()
// Advance will return true until the iterator is exhausted
for ri.Advance() {

View File

@ -39,7 +39,6 @@ package fdb
import "C"
import (
"runtime"
"sync"
"unsafe"
)
@ -75,9 +74,7 @@ type future struct {
}
func newFuture(ptr *C.FDBFuture) *future {
f := &future{ptr}
runtime.SetFinalizer(f, func(f *future) { C.fdb_future_destroy(f.ptr) })
return f
return &future{ptr}
}
// Note: This function guarantees the callback will be executed **at most once**.
@ -100,17 +97,14 @@ func fdb_future_block_until_ready(f *C.FDBFuture) {
}
func (f *future) BlockUntilReady() {
defer runtime.KeepAlive(f)
fdb_future_block_until_ready(f.ptr)
}
func (f *future) IsReady() bool {
defer runtime.KeepAlive(f)
return C.fdb_future_is_ready(f.ptr) != 0
}
func (f *future) Cancel() {
defer runtime.KeepAlive(f)
C.fdb_future_cancel(f.ptr)
}
@ -142,7 +136,7 @@ type futureByteSlice struct {
func (f *futureByteSlice) Get() ([]byte, error) {
f.o.Do(func() {
defer runtime.KeepAlive(f.future)
defer C.fdb_future_destroy(f.ptr)
var present C.fdb_bool_t
var value *C.uint8_t
@ -156,10 +150,14 @@ func (f *futureByteSlice) Get() ([]byte, error) {
}
if present != 0 {
f.v = C.GoBytes(unsafe.Pointer(value), length)
}
// Copy the native `value` into a Go byte slice so the underlying
// native Future can be freed. This avoids the need for finalizers.
valueDestination := make([]byte, length)
valueSource := C.GoBytes(unsafe.Pointer(value), length)
copy(valueDestination, valueSource)
C.fdb_future_release_memory(f.ptr)
f.v = valueDestination
}
})
return f.v, f.e
@ -199,7 +197,7 @@ type futureKey struct {
func (f *futureKey) Get() (Key, error) {
f.o.Do(func() {
defer runtime.KeepAlive(f.future)
defer C.fdb_future_destroy(f.ptr)
var value *C.uint8_t
var length C.int
@ -211,8 +209,11 @@ func (f *futureKey) Get() (Key, error) {
return
}
f.k = C.GoBytes(unsafe.Pointer(value), length)
C.fdb_future_release_memory(f.ptr)
keySource := C.GoBytes(unsafe.Pointer(value), length)
keyDestination := make([]byte, length)
copy(keyDestination, keySource)
f.k = keyDestination
})
return f.k, f.e
@ -245,17 +246,21 @@ type FutureNil interface {
type futureNil struct {
*future
o sync.Once
e error
}
func (f *futureNil) Get() error {
defer runtime.KeepAlive(f.future)
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
f.BlockUntilReady()
if err := C.fdb_future_get_error(f.ptr); err != 0 {
return Error{int(err)}
f.e = Error{int(err)}
}
})
return nil
return f.e
}
func (f *futureNil) MustGet() {
@ -281,8 +286,6 @@ func stringRefToSlice(ptr unsafe.Pointer) []byte {
}
func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) {
defer runtime.KeepAlive(f.future)
f.BlockUntilReady()
var kvs *C.FDBKeyValue
@ -293,13 +296,42 @@ func (f *futureKeyValueArray) Get() ([]KeyValue, bool, error) {
return nil, false, Error{int(err)}
}
// To minimize the number of individual allocations, we first calculate the
// final size used by all keys and values returned from this iteration,
// then perform one larger allocation and slice within it.
poolSize := 0
for i := 0; i < int(count); i++ {
kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24))
poolSize += len(stringRefToSlice(kvptr))
poolSize += len(stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12)))
}
poolOffset := 0
pool := make([]byte, poolSize)
ret := make([]KeyValue, int(count))
for i := 0; i < int(count); i++ {
kvptr := unsafe.Pointer(uintptr(unsafe.Pointer(kvs)) + uintptr(i*24))
ret[i].Key = stringRefToSlice(kvptr)
ret[i].Value = stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))
keySource := stringRefToSlice(kvptr)
valueSource := stringRefToSlice(unsafe.Pointer(uintptr(kvptr) + 12))
keyDestination := pool[poolOffset : poolOffset+len(keySource)]
poolOffset += len(keySource)
valueDestination := pool[poolOffset : poolOffset+len(valueSource)]
poolOffset += len(valueSource)
copy(keyDestination, keySource)
copy(valueDestination, valueSource)
ret[i] = KeyValue{
Key: keyDestination,
Value: valueDestination,
}
}
return ret, (more != 0), nil
@ -324,19 +356,28 @@ type FutureInt64 interface {
type futureInt64 struct {
*future
o sync.Once
e error
v int64
}
func (f *futureInt64) Get() (int64, error) {
defer runtime.KeepAlive(f.future)
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
f.BlockUntilReady()
var ver C.int64_t
if err := C.fdb_future_get_int64(f.ptr, &ver); err != 0 {
return 0, Error{int(err)}
f.v = 0
f.e = Error{int(err)}
return
}
return int64(ver), nil
f.v = int64(ver)
})
return f.v, f.e
}
func (f *futureInt64) MustGet() int64 {
@ -367,10 +408,14 @@ type FutureStringSlice interface {
type futureStringSlice struct {
*future
o sync.Once
e error
v []string
}
func (f *futureStringSlice) Get() ([]string, error) {
defer runtime.KeepAlive(f.future)
f.o.Do(func() {
defer C.fdb_future_destroy(f.ptr)
f.BlockUntilReady()
@ -378,16 +423,25 @@ func (f *futureStringSlice) Get() ([]string, error) {
var count C.int
if err := C.fdb_future_get_string_array(f.ptr, (***C.char)(unsafe.Pointer(&strings)), &count); err != 0 {
return nil, Error{int(err)}
f.e = Error{int(err)}
return
}
ret := make([]string, int(count))
for i := 0; i < int(count); i++ {
ret[i] = C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8)))))
source := C.GoString((*C.char)(*(**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(strings)) + uintptr(i*8)))))
destination := make([]byte, len(source))
copy(destination, source)
ret[i] = string(destination)
}
return ret, nil
f.v = ret
})
return f.v, f.e
}
func (f *futureStringSlice) MustGet() []string {

View File

@ -28,6 +28,7 @@ import "C"
import (
"fmt"
"sync"
)
// KeyValue represents a single key-value pair in the database.
@ -140,6 +141,7 @@ func (rr RangeResult) GetSliceWithError() ([]KeyValue, error) {
var ret []KeyValue
ri := rr.Iterator()
defer ri.Close()
if rr.options.Limit != 0 {
ri.options.Mode = StreamingModeExact
@ -207,6 +209,18 @@ type RangeIterator struct {
index int
err error
snapshot bool
o sync.Once
}
// Close releases the underlying native resources for all the `KeyValue`s
// ever returned by this iterator. The `KeyValue`s themselves are copied
// before they're returned, so they are still safe to use after calling
// this function. This is intended to be called with `defer` inside
// your transaction function.
func (ri *RangeIterator) Close() {
ri.o.Do(func() {
C.fdb_future_destroy(ri.f.ptr)
})
}
// Advance attempts to advance the iterator to the next key-value pair. Advance

View File

@ -25,6 +25,7 @@ package fdb
// #define FDB_API_VERSION 630
// #include <foundationdb/fdb_c.h>
import "C"
import "sync"
// A ReadTransaction can asynchronously read from a FoundationDB
// database. Transaction and Snapshot both satisfy the ReadTransaction
@ -70,6 +71,7 @@ type Transaction struct {
type transaction struct {
ptr *C.FDBTransaction
db Database
o sync.Once
}
// TransactionOptions is a handle with which to set options that affect a
@ -85,16 +87,18 @@ func (opt TransactionOptions) setOpt(code int, param []byte) error {
}, param)
}
func (t *transaction) destroy() {
C.fdb_transaction_destroy(t.ptr)
}
// GetDatabase returns a handle to the database with which this transaction is
// interacting.
func (t Transaction) GetDatabase() Database {
return t.transaction.db
}
func (t Transaction) Close() {
t.o.Do(func() {
C.fdb_transaction_destroy(t.ptr)
})
}
// Transact executes the caller-provided function, passing it the Transaction
// receiver object.
//

View File

@ -169,8 +169,6 @@ file(WRITE ${MANIFEST_FILE} ${MANIFEST_TEXT})
add_jar(fdb-java ${JAVA_BINDING_SRCS} ${GENERATED_JAVA_FILES} ${CMAKE_SOURCE_DIR}/LICENSE
OUTPUT_DIR ${PROJECT_BINARY_DIR}/lib VERSION ${CMAKE_PROJECT_VERSION} MANIFEST ${MANIFEST_FILE})
add_dependencies(fdb-java fdb_java_options fdb_java)
add_jar(foundationdb-tests SOURCES ${JAVA_TESTS_SRCS} INCLUDE_JARS fdb-java)
add_dependencies(foundationdb-tests fdb_java_options)
# TODO[mpilman]: The java RPM will require some more effort (mostly on debian). However,
# most people will use the fat-jar, so it is not clear how high this priority is.
@ -237,6 +235,16 @@ if(NOT OPEN_FOR_IDE)
WORKING_DIRECTORY ${unpack_dir}
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/lib_copied
COMMENT "Build ${target_jar}")
add_jar(foundationdb-tests SOURCES ${JAVA_TESTS_SRCS} INCLUDE_JARS fdb-java)
add_dependencies(foundationdb-tests fdb_java_options)
set(tests_jar ${jar_destination}/fdb-java-${CMAKE_PROJECT_VERSION}${prerelease_string}-tests.jar)
add_custom_command(OUTPUT ${tests_jar}
COMMAND ${CMAKE_COMMAND} -E copy foundationdb-tests.jar "${tests_jar}"
WORKING_DIRECTORY .
DEPENDS foundationdb-tests
COMMENT "Build ${tests_jar}")
add_custom_target(fdb-java-tests ALL DEPENDS ${tests_jar})
add_dependencies(fdb-java-tests foundationdb-tests)
add_custom_target(fat-jar ALL DEPENDS ${target_jar})
add_dependencies(fat-jar fdb-java)
add_dependencies(fat-jar copy_lib)

View File

@ -30,7 +30,7 @@ num_groups=${#gids[@]}
additional_groups="-G sudo"
for ((i=0;i<num_groups;i++))
do
echo "RUN groupadd -g ${gids[$i]} ${groups[$i]}" >> Dockerfile
echo "RUN groupadd -g ${gids[$i]} ${groups[$i]} || true" >> Dockerfile
if [ ${gids[i]} -ne ${gid} ]
then
additional_groups="${additional_groups},${gids[$i]}"
@ -72,9 +72,21 @@ sudo docker run --rm `# delete (temporary) image after return` \\
--security-opt seccomp=unconfined \\
-v "${HOME}:${HOME}" `# Mount home directory` \\
\${ccache_args} \\
${image}
${image} "\$@"
EOF
cat <<EOF > $HOME/bin/clangd
#!/usr/bin/bash
fdb-dev scl enable devtoolset-8 rh-python36 rh-ruby24 -- clangd
EOF
if [[ ":$PATH:" != *":$HOME/bin:"* ]]
then
echo "WARNING: $HOME/bin is not in your PATH!"
echo -e "\tThis can cause problems with some scripts (like fdb-clangd)"
fi
chmod +x $HOME/bin/fdb-dev $HOME/bin/clangd
echo "To start the dev docker image run $HOME/bin/fdb-dev"
echo "You can edit this file but be aware that this script will overwrite your changes if you rerun it"
echo "$HOME/bin/clangd can be used for IDE integration"
echo "You can edit these files but be aware that this script will overwrite your changes if you rerun it"

View File

@ -189,19 +189,29 @@ function(create_test_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh ${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh ${CMAKE_BINARY_DIR}/packages/joshua_timeout
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files} ${external_files}
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"
)
add_custom_target(package_tests ALL DEPENDS ${tar_file})
# make seems to need this dependency, while it does nothing with ninja
add_dependencies(package_tests strip_only_fdbserver TestHarness)
endif()
@ -210,14 +220,24 @@ function(create_test_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${out_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh ${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh ${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${external_files}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTest.sh
${CMAKE_BINARY_DIR}/packages/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/valgrindTimeout.sh
${CMAKE_BINARY_DIR}/packages/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file}
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe
${CMAKE_BINARY_DIR}/packages/bin/TraceLogHelper.dll
${CMAKE_BINARY_DIR}/packages/joshua_test
${CMAKE_BINARY_DIR}/packages/joshua_timeout
${out_files} ${external_files}
${out_files}
${external_files}
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/packages/joshua_test ${CMAKE_BINARY_DIR}/packages/joshua_timeout
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages
COMMENT "Package correctness archive"

View File

@ -21,6 +21,10 @@ set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info")
set(STATIC_LINK_LIBCXX ON CACHE BOOL "Statically link libstdcpp/libc++")
set(USE_WERROR OFF CACHE BOOL "Compile with -Werror. Recommended for local development and CI.")
if(USE_LIBCXX AND STATIC_LINK_LIBCXX AND NOT USE_LD STREQUAL "LLD")
message(FATAL_ERROR "Unsupported configuration: STATIC_LINK_LIBCXX with libc+++ only works if USE_LD=LLD")
endif()
set(rel_debug_paths OFF)
if(RELATIVE_DEBUG_PATHS)
set(rel_debug_paths ON)
@ -189,13 +193,16 @@ else()
add_compile_options()
# Clang has link errors unless `atomic` is specifically requested.
if(NOT APPLE)
add_link_options(-latomic)
#add_link_options(-latomic)
endif()
if (APPLE OR USE_LIBCXX)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-stdlib=libc++>)
add_compile_definitions(WITH_LIBCXX)
if (NOT APPLE)
add_link_options(-lc++ -lc++abi -Wl,-build-id=sha1)
if (STATIC_LINK_LIBCXX)
add_link_options(-static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic)
endif()
add_link_options(-stdlib=libc++ -Wl,-build-id=sha1)
endif()
endif()
if (OPEN_FOR_IDE)
@ -215,7 +222,7 @@ else()
if (USE_CCACHE)
add_compile_options(
-Wno-register
-Wno-error=unused-command-line-argument)
-Wno-unused-command-line-argument)
endif()
endif()
if (USE_WERROR)

View File

@ -135,11 +135,11 @@ function(strip_debug_symbols target)
add_custom_target(strip_only_${target} DEPENDS ${out_file})
if(is_exec AND NOT APPLE)
add_custom_command(OUTPUT "${out_file}.debug"
DEPENDS strip_only_${target}
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_dependencies(strip_${target} ${target} strip_only_${target})
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})

View File

@ -10,8 +10,6 @@ set(out_file ${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe)
add_custom_command(OUTPUT ${out_file}
COMMAND ${MCS_EXECUTABLE} ARGS ${TEST_HARNESS_REFERENCES} ${SRCS} "-target:exe" "-out:${out_file}"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${SRCS}
DEPENDS ${SRCS} TraceLogHelper
COMMENT "Compile TestHarness" VERBATIM)
add_custom_target(TestHarness DEPENDS ${out_file})
add_dependencies(TestHarness TraceLogHelper)
set(TestHarnesExe "${out_file}" PARENT_SCOPE)

View File

@ -1 +1,2 @@
add_executable(actor_flamegraph actor_flamegraph.cpp)
target_link_libraries(actor_flamegraph PRIVATE Threads::Threads)

View File

@ -0,0 +1,81 @@
# Special-Key-Space
This document discusses why we need the proposed special-key-space framework, what problems it aims to solve, and in what scenarios a developer should use it.
## Motivation
Currently, there are several client functions implemented as FDB calls that pass through special keys (prefixed with `\xff\xff`). Below are all existing features:
- **status/json**: `get("\xff\xff/status/json")`
- **cluster_file_path**: `get("\xff\xff/cluster_file_path")`
- **connection_string**: `get("\xff\xff/connection_string")`
- **worker_interfaces**: `getRange("\xff\xff/worker_interfaces", <any_key>)`
- **conflicting-keys**: `getRange("\xff\xff/transaction/conflicting_keys/", "\xff\xff/transaction/conflicting_keys/\xff")`
At present, the implementations are hard-coded, and the pain points are obvious:
- **Maintainability**: As more features are added, the hard-coded snippets become harder to maintain
- **Granularity**: It is impossible to scale up or down. For example, you may want a cheap call like `get("\xff\xff/status/json/<certain_field>")` instead of fetching all of `status/json` and parsing the result. Conversely, you may sometimes want to aggregate results from several similar features, such as `getRange("\xff\xff/transaction/", "\xff\xff/transaction/\xff")`, to get all transaction-related info. Neither is achievable at present.
- **Consistency**: The current behavior is inconsistent with general FDB calls: for example, the result of `get("\xff\xff/B")` is not included in `getRange("\xff\xff/A", "\xff\xff/C")`.
Consequently, the special-key-space framework aims to integrate all client functions that use special keys (prefixed with `\xff\xff`) and to solve the pain points listed above.
## When
If your feature exposes information to clients and the results are easily formatted as key-value pairs, then you can use the special-key-space to implement your client function.
## How
If you choose to use it, you need to implement a function class that inherits from `SpecialKeyRangeBaseImpl`, which has an abstract method `Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw, KeyRangeRef kr)`.
This method can be treated as a callback whose implementation details are determined by the developer.
Once you have implemented the method, register the function class for the corresponding key range.
Below is a detailed example.
```c++
// Implement the function class,
// the corresponding key range is [\xff\xff/example/, \xff\xff/example/\xff)
class SKRExampleImpl : public SpecialKeyRangeBaseImpl {
public:
explicit SKRExampleImpl(KeyRangeRef kr): SpecialKeyRangeBaseImpl(kr) {
// Our implementation is quite simple here, the key-value pairs are formatted as:
// \xff\xff/example/<country_name> : <capital_city_name>
CountryToCapitalCity[LiteralStringRef("USA")] = LiteralStringRef("Washington, D.C.");
CountryToCapitalCity[LiteralStringRef("UK")] = LiteralStringRef("London");
CountryToCapitalCity[LiteralStringRef("Japan")] = LiteralStringRef("Tokyo");
CountryToCapitalCity[LiteralStringRef("China")] = LiteralStringRef("Beijing");
}
// Implement the getRange interface
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override {
Standalone<RangeResultRef> result;
for (auto const& country : CountryToCapitalCity) {
// the registered range here: [\xff\xff/example/, \xff\xff/example/\xff)
Key keyWithPrefix = country.first.withPrefix(range.begin);
// check if any valid keys are given in the range
if (kr.contains(keyWithPrefix)) {
result.push_back(result.arena(), KeyValueRef(keyWithPrefix, country.second));
result.arena().dependsOn(keyWithPrefix.arena());
}
}
return result;
}
private:
std::map<Key, Value> CountryToCapitalCity;
};
// Instantiate the function object
// In development, you should have a function object pointer in DatabaseContext (DatabaseContext.h) and initialize it in DatabaseContext's constructor (NativeAPI.actor.cpp)
const KeyRangeRef exampleRange(LiteralStringRef("\xff\xff/example/"), LiteralStringRef("\xff\xff/example/\xff"));
SKRExampleImpl exampleImpl(exampleRange);
// Assuming the database handler is `cx`, register to special-key-space
// In development, you should register all function objects in the constructor of DatabaseContext (NativeAPI.actor.cpp)
cx->specialKeySpace->registerKeyRange(exampleRange, &exampleImpl);
// Now any ReadYourWritesTransaction associated with `cx` is able to query the info
state ReadYourWritesTransaction tr(cx);
// get
Optional<Value> res1 = wait(tr.get("\xff\xff/example/Japan"));
ASSERT(res1.present() && res1.get() == LiteralStringRef("Tokyo"));
// getRange
// Note: for getRange(key1, key2), both key1 and key2 should be prefixed with \xff\xff
// something like getRange("normal_key", "\xff\xff/...") is not supported yet
Standalone<RangeResultRef> res2 = wait(tr.getRange(LiteralStringRef("\xff\xff/example/U"), LiteralStringRef("\xff\xff/example/U\xff")));
// res2 should contain USA and UK
ASSERT(
res2.size() == 2 &&
res2[0].value == LiteralStringRef("London") &&
res2[1].value == LiteralStringRef("Washington, D.C.")
);
```

View File

@ -0,0 +1,217 @@
<meta charset="utf-8">
# Forward Compatibility for Transaction Logs
## Background
A repeated concern with adopting FoundationDB has been that upgrades are one
way, with no supported rollback. If one were to upgrade a cluster running 6.0
to 6.1, then there's no way to roll back to 6.0 if the new version results in
worse client application performance or unavailability. In the interest of
increasing adoption, work has begun on supporting on-disk forward
compatibility, which allows for upgrades to be rolled back.
The traditional way of allowing rollbacks is to have one version, `N`, that
introduces a feature but leaves it disabled. `N+1` enables the feature, and
then `N+2` removes whatever was deprecated in `N`. However, FDB currently has
a 6 month release cadence, and waiting 6 months to be able to use a new feature
in production is unacceptably long. Thus, the goal is a sane, user-friendly
upgrade path that supports rollback, while still allowing features to be used
immediately if desired.
This document also carries two specific restrictions to the scope of what it covers:
1. This document specifically is **not** a discussion of network protocol
compatibility nor supporting rolling upgrades. Rolling upgrades of FDB are
still discouraged, and minor versions are still protocol incompatible with
each other.
2. This only covers the proposed design of how forward compatibility for
transaction logs will be handled, and not forward compatibility for
FoundationDB as a whole. There are other parts of the system that durably
store data, the coordinators and storage servers, that will not be discussed.
## Overview
A new configuration option, `log_version`, will be introduced to allow a user
to control which on-disk format the transaction logs are allowed to use. Not
every release will affect the on-disk format of the transaction logs, so
`log_version` is an opaque integer that is incremented by one whenever the
on-disk format of the transaction log is changed.
`log_version` is set from `fdbcli`, with an invocation looking like
`$ fdbcli -C cluster.file --exec "configure log_version:=2"`. Note that `:=`
is used instead of `=`, to keep the convention in `fdbcli` that configuration
options that users aren't expected to need (or wish) to modify are set with
`:=`.
Right now, FDB releases and `log_version` values are as follows:
| Release | Log Version |
| ------- | ----------- |
| pre-5.2 | 1 |
| 5.2-6.0 | 2 |
| 6.1 | 3 |
| 6.2 | 4 |
| 6.3 | 5 |
If a user does not specify any configuration for `log_version`, then
`log_version` will be set so that rolling back to the previous minor version of
FDB will be possible. FDB will always support loading files generated by
default from the next minor version. It will be possible to configure
`log_version` to a higher value on the release that introduces it, if the user
is willing to sacrifice the ability to roll back.
This means FDB's releases will work like the following:
| | 6.0 | 6.1 | 6.2 | 6.3 |
|--------------|-----|-----|-------|---------|
| Configurable | 2 | 2,3 | 3,4 | 4,5 |
| Default | 2 | 2 | 3 | 4 |
| Recoverable | 2 | 2,3 | 2,3,4 | 2,3,4,5 |
Where...
* "configurable" means values considered an acceptable configuration setting for `fdbcli> configure log_version:=N`.
* "default" means what `log_version` will be if you don't configure it.
* "recoverable" means that FDB can load files that were generated from the specified `log_version`.
Configuring a `log_version` will cause FDB to use the maximum of that
`log_version` and the default `log_version`. The default `log_version` will always
be the minimum configurable log version. This is done so that manually setting
`log_version` once, and then upgrading FDB multiple times, will eventually
cause a low `log_version` left in the database configuration to act as a
request for the default. Configuring `log_version` to a very high number (e.g. 9999)
will cause FDB to always use the highest available log version.
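The selection rule above can be summarized in a minimal sketch (illustrative only, not FDB's actual code; the names and the `maxSupported` clamp are assumptions based on the preceding paragraph):

```c++
#include <algorithm>

// Pick the effective log version: the larger of the configured and default
// versions, clamped to the highest version this release supports (so a huge
// configured value such as 9999 always selects the newest available format).
int effectiveLogVersion(int configured, int defaultVersion, int maxSupported) {
    return std::min(maxSupported, std::max(configured, defaultVersion));
}
```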
As a concrete example, 6.1 will introduce a new transaction log feature with
on-disk format implications. If you wish to use it, you'll first have to
`configure log_version:=3`. Otherwise, it will become the default after
upgrading to FDB 6.2. If problems are discovered when upgrading to FDB 6.2, then
roll back to FDB 6.1. (Theoretically. See scope restrictions above.)
## Detailed Implementation
`fdbcli> configure log_version:=3` sets `\xff/conf/log_version` to `3`. This
version is also persisted as part of the `LogSystemConfig` and thus
`DBCoreState`, so that any code handling the log system will have access to the
`log_version` that was used to create it.
Changing `log_version` will result in a recovery, and FoundationDB will recover
into the requested transaction log implementation. This involves locking the
previous generation of transaction logs, and then recruiting a new generation
of transaction logs. FDB will load `\xff/conf/log_version` as the requested
`log_version`, and when sending an `InitializeTLogRequest` to recruit a new
transaction log, it uses the maximum of the requested log version and the
default `log_version`.
A worker, when receiving an `InitializeTLogRequest`, will initialize a
transaction log corresponding to the requested `log_version`. Transaction logs
can pack multiple generations of transaction logs into the same shared entity,
a `SharedTLog`. `SharedTLog` instances correspond to one set of files, and
will only contain transaction log generations of the same `log_version`.
This allows us to have multiple generations of transaction logs running within
one worker that have different `log_version`s, and if the worker crashes and
restarts, we need to be able to recreate those transaction log instances.
Transaction logs maintain two types of files: a pair of files prefixed with
`logqueue-` that form the DiskQueue, and the metadata store, which is
normally a mini `ssd-2` storage engine running within the transaction log.
When a worker first starts, it scans its data directory for any files that were
instances of a transaction log. It then needs to construct a transaction log
instance that can read the format of the file to be able to reconnect the data
in the files back to the FDB cluster, so that it can be used in a recovery if
needed.
This presents a problem that the worker needs to know all the configuration
options that were used to decide the file format of the transaction log
*before* it can rejoin a cluster and get far enough through a recovery to find
out what that configuration was. To get around this, the relevant
configuration options have been added to the file name so that they're
available when scanning the list of files.
Currently, FDB identifies a transaction log instance by seeing a file that starts
with `log-`, which represents the metadata store. This filename has the format
of `log-<UUID>.<SUFFIX>` where UUID is the `logId`, and SUFFIX tells us if the
metadata store is a memory or ssd storage engine file.
This format is being changed to `log2-<KV PAIRS>-<UUID>.<SUFFIX>`, where KV
PAIRS is a small amount of information encoded into the file name to give us
the required metadata *about* the file. According to POSIX, the characters
allowed in "fully portable filenames" are `A-Z a-z 0-9 . _ -`, and the
filename length should stay under 255 characters. This leaves `_` as the only
character not already used. Therefore, the KV pairs are encoded as
`K1_V1_K2_V2_...`: keys and values are separated by `_`, and KV pairs are
also separated by `_`.
The currently supported keys are:
V
: A copy of `log_version`
LS
: `log_spill`, a new configuration option in 6.1
and any unrecognized keys are ignored, which will likely help forward compatibility.
An example file name is `log2-V_3_LS_2-46a5f353ac18d787852d44c3a2e51527-0.fdq`.
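As a hedged sketch, a decoder for the KV PAIRS segment could look like the following (illustrative only; `parseKVPairs` is a hypothetical helper, not FDB's actual parser):

```c++
#include <map>
#include <sstream>
#include <string>

// Split the KV PAIRS segment of a "log2-<KV PAIRS>-<UUID>.<SUFFIX>" name,
// e.g. "V_3_LS_2" -> {{"V", "3"}, {"LS", "2"}}. Unrecognized keys simply
// end up in the map, where the caller is free to ignore them.
std::map<std::string, std::string> parseKVPairs(const std::string& segment) {
    std::map<std::string, std::string> kvs;
    std::istringstream in(segment);
    std::string key, value;
    while (std::getline(in, key, '_') && std::getline(in, value, '_')) {
        kvs[key] = value;
    }
    return kvs;
}
```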
### Testing
`SimulationConfig` has been changed to randomly set `log_version` according to
what is supported. This means that with restarting upgrade tests that simulate
upgrading from `N` to `N+1`, the `N+1` version will see files that came from an
FDB running with any `log_version` value that was previously supported. If
`N+1` can't handle the files correctly, then the simulation test will fail.
`ConfigureTest` tries randomly toggling `log_version` up and down in a live
database, along with all the other log-related options. Some combinations are
valid; others are invalid and should be rejected, or would cause ASSERTs in
later parts of the code.
I've added a new test, `ConfigureTestRestart`, that tests changing
configurations and then upgrading FDB, to cover testing that upgrades still
happen correctly when `log_version` has been changed. This also verifies that
on-disk formats for those `log_version`s are still loadable by future FDB
versions.
There are no tests that mix the `ConfigureDatabase` and `Attrition` workloads.
It would be good to do so, to cover the case of `log_version` changes in the
presence of failures, but one cannot be added easily. The simulator calculates
what processes/machines are safe to kill by looking at the current
configuration. For `ConfigureTest`, this isn't good enough, because `triple`
could mean that there are three replicas, or that the FDB cluster just changed
from `single` to `triple` and only has one replica of data until data
distribution finishes. It would be good to add a `ConfigureKillTest` sometime
in the future.
For FDB to actually announce that rolling back from `N+1` to `N` is supported,
there will need to be downgrade tests from `N+1` to `N` also. The default in
`N+1` should always be recoverable within `N`. As FDB isn't promising forward
compatibility yet, these tests haven't been implemented.
# Transaction Log Forward Compatibility Operational Guide
## Notable Behavior Changes
When release notes mention a new `log_version` is available, after deploying
that release, it's worth considering upgrading `log_version`. Doing so will
allow a controlled upgrade, and reduce the number of new changes that will
take effect when upgrading to the next release.
## Observability
* When running with a non-default `log_version`, the setting will appear in `fdbcli> status`.
## Monitoring and Alerting
If anything you run relies on the file names the transaction logs use, be aware that those names are changing.
<!-- Force long-style table of contents -->
<script>window.markdeepOptions={}; window.markdeepOptions.tocStyle="long";</script>
<!-- When printed, top level section headers should force page breaks -->
<style>.md h1, .md .nonumberh1 {page-break-before:always}</style>
<!-- Markdeep: -->
<style class="fallback">body{visibility:hidden;white-space:pre;font-family:monospace}</style><script src="markdeep.min.js" charset="utf-8"></script><script src="https://casual-effects.com/markdeep/latest/markdeep.min.js" charset="utf-8"></script><script>window.alreadyProcessedMarkdeep||(document.body.style.visibility="visible")</script>

View File

@ -5,9 +5,16 @@ Release Notes
7.0.0
=====
Features
--------
* Improved the slow task profiler to also report backtraces for periods when the run loop is saturated. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
Performance
-----------
* Improve GRV tail latencies, particularly as the transaction rate gets nearer the ratekeeper limit. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
* The proxies are now more responsive to changes in workload when unthrottling lower priority transactions. `(PR #2735) <https://github.com/apple/foundationdb/pull/2735>`_
Fixes
-----
@ -20,6 +27,10 @@ Bindings
* API version updated to 630. See the :ref:`API version upgrade guide <api-version-upgrade-guide-630>` for upgrade details.
* Java: Introduced ``keyAfter`` utility function that can be used to create the immediate next key for a given byte array. `(PR #2458) <https://github.com/apple/foundationdb/pull/2458>`_
* C: The ``FDBKeyValue`` struct's ``key`` and ``value`` members have changed type from ``void*`` to ``uint8_t*``. `(PR #2622) <https://github.com/apple/foundationdb/pull/2622>`_
* Deprecated ``enable_slow_task_profiling`` transaction option and replaced it with ``enable_run_loop_profiling``. `(PR #2608) <https://github.com/apple/foundationdb/pull/2608>`_
* Go: Added a ``Close`` function to ``RangeIterator`` which **must** be called to free resources returned from ``Transaction.GetRange``. `(PR #1910) <https://github.com/apple/foundationdb/pull/1910>`_.
* Go: Finalizers are no longer used to clean up native resources. ``Future`` results are now copied from the native heap to the Go heap, and native resources are freed immediately. `(PR #1910) <https://github.com/apple/foundationdb/pull/1910>`_.
Other Changes
-------------

View File

@ -115,6 +115,7 @@ std::string BackupDescription::toString() const {
info.append(format("URL: %s\n", url.c_str()));
info.append(format("Restorable: %s\n", maxRestorableVersion.present() ? "true" : "false"));
info.append(format("Partitioned logs: %s\n", partitioned ? "true" : "false"));
auto formatVersion = [&](Version v) {
std::string s;
@ -169,6 +170,7 @@ std::string BackupDescription::toJSON() const {
doc.setKey("SchemaVersion", "1.0.0");
doc.setKey("URL", url.c_str());
doc.setKey("Restorable", maxRestorableVersion.present());
doc.setKey("Partitioned", partitioned);
auto formatVersion = [&](Version v) {
JsonBuilderObject doc;
@ -243,10 +245,10 @@ std::string BackupDescription::toJSON() const {
* /plogs/...log,startVersion,endVersion,UID,tagID-of-N,blocksize
* /logs/.../log,startVersion,endVersion,UID,blockSize
* where ... is a multi level path which sorts lexically into version order and results in approximately 1
* unique folder per day containing about 5,000 files. Logs after 7.0 are stored in "plogs"
* directory and are partitioned according to tagIDs (0, 1, 2, ...) and the total number
* partitions is N. Logs before 7.0 are
* stored in "logs" directory and are not partitioned.
* unique folder per day containing about 5,000 files. Logs after FDB 6.3 are stored in "plogs"
* directory and are partitioned according to tagIDs (0, 1, 2, ...), and the total number of partitions is N.
* Old backup logs from FDB 6.2 and earlier are stored in the "logs" directory and are not partitioned.
* After FDB 6.3, users can choose to use the new partitioned logs or old logs.
*
*
* BACKWARD COMPATIBILITY
@ -657,18 +659,6 @@ public:
return dumpFileList_impl(Reference<BackupContainerFileSystem>::addRef(this), begin, end);
}
ACTOR static Future<bool> isPartitionedBackup_impl(Reference<BackupContainerFileSystem> bc) {
BackupFileList list = wait(bc->dumpFileList(0, std::numeric_limits<Version>::max()));
for (const auto& file : list.logs) {
if (file.isPartitionedLog()) return true;
}
return false;
}
Future<bool> isPartitionedBackup() final {
return isPartitionedBackup_impl(Reference<BackupContainerFileSystem>::addRef(this));
}
static Version resolveRelativeVersion(Optional<Version> max, Version v, const char *name, Error e) {
if(v == invalidVersion) {
TraceEvent(SevError, "BackupExpireInvalidVersion").detail(name, v);
@ -704,7 +694,8 @@ public:
}
}
ACTOR static Future<BackupDescription> describeBackup_impl(Reference<BackupContainerFileSystem> bc, bool deepScan, Version logStartVersionOverride, bool partitioned) {
ACTOR static Future<BackupDescription> describeBackup_impl(Reference<BackupContainerFileSystem> bc, bool deepScan,
Version logStartVersionOverride) {
state BackupDescription desc;
desc.url = bc->getURL();
@ -722,8 +713,7 @@ public:
// from which to resolve the relative version.
// This could be handled more efficiently without recursion but it's tricky, this will do for now.
if(logStartVersionOverride != invalidVersion && logStartVersionOverride < 0) {
BackupDescription tmp = wait(partitioned ? bc->describePartitionedBackup(false, invalidVersion)
: bc->describeBackup(false, invalidVersion));
BackupDescription tmp = wait(bc->describeBackup(false, invalidVersion));
logStartVersionOverride = resolveRelativeVersion(tmp.maxLogEnd, logStartVersionOverride,
"LogStartVersionOverride", invalid_option_value());
}
@ -733,10 +723,12 @@ public:
state Optional<Version> metaLogEnd;
state Optional<Version> metaExpiredEnd;
state Optional<Version> metaUnreliableEnd;
state Optional<Version> metaLogType;
std::vector<Future<Void>> metaReads;
metaReads.push_back(store(metaExpiredEnd, bc->expiredEndVersion().get()));
metaReads.push_back(store(metaUnreliableEnd, bc->unreliableEndVersion().get()));
metaReads.push_back(store(metaLogType, bc->logType().get()));
// Only read log begin/end versions if not doing a deep scan, otherwise scan files and recalculate them.
if(!deepScan) {
@ -752,7 +744,8 @@ public:
.detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion))
.detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion))
.detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion))
.detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion));
.detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion))
.detail("LogType", metaLogType.orDefault(-1));
// If the logStartVersionOverride is positive (not relative) then ensure that unreliableEndVersion is equal or greater
if(logStartVersionOverride != invalidVersion && metaUnreliableEnd.orDefault(invalidVersion) < logStartVersionOverride) {
@ -811,9 +804,18 @@ public:
}
state std::vector<LogFile> logs;
wait(store(logs, bc->listLogFiles(scanBegin, scanEnd, partitioned)) &&
state std::vector<LogFile> plogs;
wait(store(logs, bc->listLogFiles(scanBegin, scanEnd, false)) &&
store(plogs, bc->listLogFiles(scanBegin, scanEnd, true)) &&
store(desc.snapshots, bc->listKeyspaceSnapshots()));
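// If any partitioned log files exist, the backup is partitioned and those
// files are used; otherwise fall back to the log type recorded in the
// backup's metadata.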
if (plogs.size() > 0) {
desc.partitioned = true;
logs.swap(plogs);
} else {
desc.partitioned = metaLogType.present() && metaLogType.get() == PARTITIONED_MUTATION_LOG;
}
// List logs in version order so log continuity can be analyzed
std::sort(logs.begin(), logs.end());
@ -823,7 +825,7 @@ public:
// If we didn't get log versions above then seed them using the first log file
if (!desc.contiguousLogEnd.present()) {
desc.minLogBegin = logs.begin()->beginVersion;
if (partitioned) {
if (desc.partitioned) {
// Cannot use the first file's end version, which may not be contiguous
// for other partitions. Set to its beginVersion to be safe.
desc.contiguousLogEnd = logs.begin()->beginVersion;
@ -832,7 +834,7 @@ public:
}
}
if (partitioned) {
if (desc.partitioned) {
updatePartitionedLogsContinuousEnd(&desc, logs, scanBegin, scanEnd);
} else {
Version& end = desc.contiguousLogEnd.get();
@ -858,6 +860,11 @@ public:
updates = updates && bc->logEndVersion().set(desc.contiguousLogEnd.get());
}
if (!metaLogType.present()) {
updates = updates && bc->logType().set(desc.partitioned ? PARTITIONED_MUTATION_LOG
: NON_PARTITIONED_MUTATION_LOG);
}
wait(updates);
} catch(Error &e) {
if(e.code() == error_code_actor_cancelled)
@ -906,11 +913,8 @@ public:
// Uses the virtual methods to describe the backup contents
Future<BackupDescription> describeBackup(bool deepScan, Version logStartVersionOverride) final {
return describeBackup_impl(Reference<BackupContainerFileSystem>::addRef(this), deepScan, logStartVersionOverride, false);
}
Future<BackupDescription> describePartitionedBackup(bool deepScan, Version logStartVersionOverride) final {
return describeBackup_impl(Reference<BackupContainerFileSystem>::addRef(this), deepScan, logStartVersionOverride, true);
return describeBackup_impl(Reference<BackupContainerFileSystem>::addRef(this), deepScan,
logStartVersionOverride);
}
ACTOR static Future<Void> expireData_impl(Reference<BackupContainerFileSystem> bc, Version expireEndVersion, bool force, ExpireProgress *progress, Version restorableBeginVersion) {
@ -1287,7 +1291,7 @@ public:
return end;
}
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet_impl(Reference<BackupContainerFileSystem> bc, Version targetVersion, bool partitioned) {
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet_impl(Reference<BackupContainerFileSystem> bc, Version targetVersion) {
// Find the most recent keyrange snapshot to end at or before targetVersion
state Optional<KeyspaceSnapshotFile> snapshot;
std::vector<KeyspaceSnapshotFile> snapshots = wait(bc->listKeyspaceSnapshots());
@ -1311,9 +1315,13 @@ public:
}
// FIXME: check if there are tagged logs. for each tag, there is no version gap.
state std::vector<LogFile> logs = wait(bc->listLogFiles(snapshot.get().beginVersion, targetVersion, partitioned));
state std::vector<LogFile> logs;
state std::vector<LogFile> plogs;
wait(store(logs, bc->listLogFiles(snapshot.get().beginVersion, targetVersion, false)) &&
store(plogs, bc->listLogFiles(snapshot.get().beginVersion, targetVersion, true)));
if (partitioned) {
if (plogs.size() > 0) {
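// A non-empty plogs list means the backup uses partitioned mutation logs,
// which take precedence for building the restore set.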
logs.swap(plogs);
// sort by tag ID so that filterDuplicates works.
std::sort(logs.begin(), logs.end(), [](const LogFile& a, const LogFile& b) {
return std::tie(a.tagId, a.beginVersion, a.endVersion) <
@ -1349,11 +1357,7 @@ public:
}
Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion) final {
return getRestoreSet_impl(Reference<BackupContainerFileSystem>::addRef(this), targetVersion, false);
}
Future<Optional<RestorableFileSet>> getPartitionedRestoreSet(Version targetVersion) final {
return getRestoreSet_impl(Reference<BackupContainerFileSystem>::addRef(this), targetVersion, true);
return getRestoreSet_impl(Reference<BackupContainerFileSystem>::addRef(this), targetVersion);
}
private:
@ -1388,6 +1392,11 @@ public:
VersionProperty expiredEndVersion() { return {Reference<BackupContainerFileSystem>::addRef(this), "expired_end_version"}; }
VersionProperty unreliableEndVersion() { return {Reference<BackupContainerFileSystem>::addRef(this), "unreliable_end_version"}; }
// Backup log types
const static Version NON_PARTITIONED_MUTATION_LOG = 0;
const static Version PARTITIONED_MUTATION_LOG = 1;
VersionProperty logType() { return { Reference<BackupContainerFileSystem>::addRef(this), "mutation_log_type" }; }
ACTOR static Future<Void> writeVersionProperty(Reference<BackupContainerFileSystem> bc, std::string path, Version v) {
try {
state Reference<IBackupFile> f = wait(bc->writeFile(path));

View File

@ -178,6 +178,7 @@ struct BackupDescription {
// The minimum version which this backup can be used to restore to
Optional<Version> minRestorableVersion;
std::string extendedDetail; // Freeform container-specific info.
bool partitioned; // If this backup contains partitioned mutation logs.
// Resolves the versions above to timestamps using a given database's TimeKeeper data.
// toString will use this information if present.
@ -260,23 +261,12 @@ public:
// be after deleting all data prior to logStartVersionOverride.
virtual Future<BackupDescription> describeBackup(bool deepScan = false, Version logStartVersionOverride = invalidVersion) = 0;
// The same as above, except using partitioned mutation logs.
virtual Future<BackupDescription> describePartitionedBackup(bool deepScan = false, Version logStartVersionOverride = invalidVersion) = 0;
virtual Future<BackupFileList> dumpFileList(Version begin = 0, Version end = std::numeric_limits<Version>::max()) = 0;
// If there are partitioned log files, then returns true; otherwise, returns false.
virtual Future<bool> isPartitionedBackup() = 0;
// Get exactly the files necessary to restore to targetVersion. Returns non-present if
// restore to given version is not possible.
virtual Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion) = 0;
// Get exactly the files necessary to restore to targetVersion. Returns non-present if
// restore to given version is not possible. This is intended for parallel
// restore in FDB 7.0, which reads partitioned mutation logs.
virtual Future<Optional<RestorableFileSet>> getPartitionedRestoreSet(Version targetVersion) = 0;
// Get an IBackupContainer based on a container spec string
static Reference<IBackupContainer> openContainer(std::string url);
static std::vector<std::string> getURLFormats();

View File

@ -820,6 +820,11 @@ struct LogMessageVersion {
explicit LogMessageVersion(Version version) : version(version), sub(0) {}
LogMessageVersion() : version(0), sub(0) {}
bool empty() const { return (version == 0) && (sub == 0); }
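// Serialization lets (version, sub) pairs travel over the wire, e.g. in the mVersions field of RestoreSendVersionedMutationsRequest.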
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, sub);
}
};
struct AddressExclusion {

View File

@ -75,7 +75,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT);
NetworkOptions::NetworkOptions()
: localAddress(""), clusterFile(""), traceDirectory(Optional<std::string>()),
traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
traceFormat("xml"), traceClockSource("now"), slowTaskProfilingEnabled(false) {
traceFormat("xml"), traceClockSource("now"), runLoopProfilingEnabled(false) {
Standalone<VectorRef<ClientVersionRef>> defaultSupportedVersions;
@ -1010,9 +1010,9 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> valu
break;
}
case FDBNetworkOptions::ENABLE_SLOW_TASK_PROFILING:
case FDBNetworkOptions::ENABLE_RUN_LOOP_PROFILING: // Same as ENABLE_SLOW_TASK_PROFILING
validateOptionValue(value, false);
networkOptions.slowTaskProfilingEnabled = true;
networkOptions.runLoopProfilingEnabled = true;
break;
default:
break;
@ -1035,8 +1035,8 @@ void runNetwork() {
if(!g_network)
throw network_not_setup();
if(networkOptions.traceDirectory.present() && networkOptions.slowTaskProfilingEnabled) {
setupSlowTaskProfiler();
if(networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) {
setupRunLoopProfiler();
}
g_network->run();
@ -2233,6 +2233,8 @@ ACTOR Future<Standalone<VectorRef<const char*>>> getAddressesForKeyActor(Key key
// If key >= allKeys.end, then getRange will return a kv-pair with an empty value. This will result in our serverInterfaces vector being empty, which will cause us to return an empty addresses list.
state Key ksKey = keyServersKey(key);
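// decodeKeyServersValue below needs the full server tag mapping to translate tags back into server UIDs.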
state Standalone<RangeResultRef> serverTagResult = wait( getRange(cx, ver, lastLessOrEqual(serverTagKeys.begin), firstGreaterThan(serverTagKeys.end), GetRangeLimits(CLIENT_KNOBS->TOO_MANY), false, info ) );
ASSERT( !serverTagResult.more && serverTagResult.size() < CLIENT_KNOBS->TOO_MANY );
Future<Standalone<RangeResultRef>> futureServerUids = getRange(cx, ver, lastLessOrEqual(ksKey), firstGreaterThan(ksKey), GetRangeLimits(1), false, info);
Standalone<RangeResultRef> serverUids = wait( futureServerUids );
@ -2240,7 +2242,7 @@ ACTOR Future<Standalone<VectorRef<const char*>>> getAddressesForKeyActor(Key key
vector<UID> src;
vector<UID> ignore; // 'ignore' is so named because it is the vector into which we decode the 'dest' servers in the case where this key is being relocated. But 'src' is the canonical location until the move is finished, because it could be cancelled at any time.
decodeKeyServersValue(serverUids[0].value, src, ignore);
decodeKeyServersValue(serverTagResult, serverUids[0].value, src, ignore);
Optional<vector<StorageServerInterface>> serverInterfaces = wait( transactionalGetServerInterfaces(ver, cx, info, src) );
ASSERT( serverInterfaces.present() ); // since this is happening transactionally, /FF/keyServers and /FF/serverList need to be consistent with one another

View File

@ -61,7 +61,7 @@ struct NetworkOptions {
std::string traceFileIdentifier;
Optional<bool> logClientInfo;
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions;
bool slowTaskProfilingEnabled;
bool runLoopProfilingEnabled;
NetworkOptions();
};

View File

@ -460,31 +460,31 @@ struct RestoreSendVersionedMutationsRequest : TimedRequest {
int batchIndex; // version batch index
RestoreAsset asset; // Unique identifier for the current restore asset
Version prevVersion, version; // version is the commitVersion of the mutation vector.
Version msgIndex; // Monotonically increasing index of mutation messages
bool isRangeFile;
MutationsVec mutations; // All mutations at the same version parsed by one loader
SubSequenceVec subs; // Sub-sequence number for mutations
MutationsVec mutations; // Mutations that may be at different versions parsed by one loader
LogMessageVersionVec mVersions; // (version, subversion) of each mutation in mutations field
ReplyPromise<RestoreCommonReply> reply;
RestoreSendVersionedMutationsRequest() = default;
explicit RestoreSendVersionedMutationsRequest(int batchIndex, const RestoreAsset& asset, Version prevVersion,
Version version, bool isRangeFile, MutationsVec mutations,
SubSequenceVec subs)
: batchIndex(batchIndex), asset(asset), prevVersion(prevVersion), version(version), isRangeFile(isRangeFile),
mutations(mutations), subs(subs) {}
explicit RestoreSendVersionedMutationsRequest(int batchIndex, const RestoreAsset& asset, Version msgIndex,
bool isRangeFile, MutationsVec mutations,
LogMessageVersionVec mVersions)
: batchIndex(batchIndex), asset(asset), msgIndex(msgIndex), isRangeFile(isRangeFile), mutations(mutations),
mVersions(mVersions) {}
std::string toString() {
std::stringstream ss;
ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString()
<< " prevVersion:" << prevVersion << " version:" << version << " isRangeFile:" << isRangeFile
<< " mutations.size:" << mutations.size() << " subs.size:" << subs.size();
ss << "VersionBatchIndex:" << batchIndex << "RestoreAsset:" << asset.toString() << " msgIndex:" << msgIndex
<< " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size()
<< " mVersions.size:" << mVersions.size();
return ss.str();
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, batchIndex, asset, prevVersion, version, isRangeFile, mutations, subs, reply);
serializer(ar, batchIndex, asset, msgIndex, isRangeFile, mutations, mVersions, reply);
}
};

View File

@ -21,6 +21,8 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/TDMetric.actor.h"
#include "fdbclient/NativeAPI.actor.h"
const KeyRef systemKeysPrefix = LiteralStringRef("\xff");
const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix);
@ -43,20 +45,67 @@ const Key keyServersKey( const KeyRef& k ) {
const KeyRef keyServersKey( const KeyRef& k, Arena& arena ) {
return k.withPrefix( keyServersPrefix, arena );
}
const Value keyServersValue( const vector<UID>& src, const vector<UID>& dest ) {
const Value keyServersValue( Standalone<RangeResultRef> result, const std::vector<UID>& src, const std::vector<UID>& dest ) {
std::vector<Tag> srcTag;
std::vector<Tag> destTag;
for (const KeyValueRef kv : result) {
UID uid = decodeServerTagKey(kv.key);
if (std::find(src.begin(), src.end(), uid) != src.end()) {
srcTag.push_back( decodeServerTagValue(kv.value) );
}
if (std::find(dest.begin(), dest.end(), uid) != dest.end()) {
destTag.push_back( decodeServerTagValue(kv.value) );
}
}
return keyServersValue(srcTag, destTag);
}
const Value keyServersValue( const std::vector<Tag>& srcTag, const std::vector<Tag>& destTag ) {
// src and dest are expected to be sorted
ASSERT( std::is_sorted(src.begin(), src.end()) && std::is_sorted(dest.begin(), dest.end()) );
BinaryWriter wr((IncludeVersion())); wr << src << dest;
BinaryWriter wr(IncludeVersion()); wr << srcTag << destTag;
return wr.toValue();
}
void decodeKeyServersValue( const ValueRef& value, vector<UID>& src, vector<UID>& dest ) {
if (value.size()) {
BinaryReader rd(value, IncludeVersion());
rd >> src >> dest;
} else {
void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest ) {
if (value.size() == 0) {
src.clear();
dest.clear();
return;
}
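// The value may use either the old encoding (vector<UID> src, dest) or the
// new tag-based encoding (vector<Tag> srcTag, destTag). Peek at the two
// length prefixes assuming the tag-based layout; if the total size does not
// match that layout, decode the old UID-based format directly.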
BinaryReader rd(value, IncludeVersion());
rd.checkpoint();
int srcLen, destLen;
rd >> srcLen;
rd.readBytes(srcLen * sizeof(Tag));
rd >> destLen;
rd.rewind();
if (value.size() != sizeof(ProtocolVersion) + sizeof(int) + srcLen * sizeof(Tag) + sizeof(int) + destLen * sizeof(Tag)) {
rd >> src >> dest;
rd.assertEnd();
return;
}
std::vector<Tag> srcTag, destTag;
rd >> srcTag >> destTag;
src.clear();
dest.clear();
for (const KeyValueRef kv : result) {
Tag tag = decodeServerTagValue(kv.value);
if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) {
src.push_back( decodeServerTagKey(kv.key) );
}
if (std::find(destTag.begin(), destTag.end(), tag) != destTag.end()) {
dest.push_back( decodeServerTagKey(kv.key) );
}
}
std::sort(src.begin(), src.end());
std::sort(dest.begin(), dest.end());
}
const KeyRef conflictingKeysPrefix = LiteralStringRef("/transaction/conflicting_keys/");

View File

@ -28,6 +28,10 @@
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
// Don't warn on constants being defined in this file.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
struct RestoreLoaderInterface;
struct RestoreApplierInterface;
struct RestoreMasterInterface;
@ -39,23 +43,28 @@ extern const KeyRangeRef allKeys; // '' to systemKeys.end
extern const KeyRangeRef specialKeys; // [FF][FF] to [FF][FF][FF][FF]
extern const KeyRef afterAllKeys;
// "\xff/keyServers/[[begin]]" := "[[vector<serverID>, vector<serverID>]]"
// "\xff/keyServers/[[begin]]" := "[[vector<serverID>, vector<serverID>]|[vector<Tag>, vector<Tag>]]"
extern const KeyRangeRef keyServersKeys, keyServersKeyServersKeys;
extern const KeyRef keyServersPrefix, keyServersEnd, keyServersKeyServersKey;
const Key keyServersKey( const KeyRef& k );
const KeyRef keyServersKey( const KeyRef& k, Arena& arena );
const Value keyServersValue(
const vector<UID>& src,
const vector<UID>& dest = vector<UID>() );
void decodeKeyServersValue( const ValueRef& value,
vector<UID>& src, vector<UID>& dest );
Standalone<RangeResultRef> result,
const std::vector<UID>& src,
const std::vector<UID>& dest = std::vector<UID>() );
const Value keyServersValue(
const std::vector<Tag>& srcTag,
const std::vector<Tag>& destTag = std::vector<Tag>());
// `result` must be the full result of getting serverTagKeys
void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest );
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
extern const KeyRangeRef storageCacheKeys;
extern const KeyRef storageCachePrefix;
const Key storageCacheKey( const KeyRef& k );
const Value storageCacheValue( const vector<uint16_t>& serverIndices );
void decodeStorageCacheValue( const ValueRef& value, vector<uint16_t>& serverIndices );
const Value storageCacheValue( const std::vector<uint16_t>& serverIndices );
void decodeStorageCacheValue( const ValueRef& value, std::vector<uint16_t>& serverIndices );
// "\xff/serverKeys/[[serverID]]/[[begin]]" := "" | "1" | "2"
extern const KeyRef serverKeysPrefix;
@ -82,6 +91,7 @@ extern const KeyRef cacheChangePrefix;
const Key cacheChangeKeyFor( uint16_t idx );
uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key );
// "\xff/serverTag/[[serverID]]" = "[[Tag]]"
extern const KeyRangeRef serverTagKeys;
extern const KeyRef serverTagPrefix;
extern const KeyRangeRef serverTagMaxKeys;
@ -366,4 +376,6 @@ std::pair<Key,Version> decodeHealthyZoneValue( ValueRef const& );
// Used to create artificially large txnStateStore instances in testing.
extern const KeyRangeRef testOnlyTxnStateStorePrefixRange;
#pragma clang diagnostic pop
#endif

View File

@ -112,7 +112,9 @@ description is not currently required but encouraged.
<Option name="disable_client_statistics_logging" code="70"
description="Disables logging of client statistics, such as sampled transaction activity." />
<Option name="enable_slow_task_profiling" code="71"
description="Enables debugging feature to perform slow task profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production." />
description="Deprecated" />
<Option name="enable_run_loop_profiling" code="71"
description="Enables debugging feature to perform run loop profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production." />
<Option name="client_buggify_enable" code="80"
description="Enable client buggify - will make requests randomly fail (intended for client testing)" />
<Option name="client_buggify_disable" code="81"

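Client code selecting the profiling option by name must move to the new name; the numeric code stays 71. A hedged usage sketch against the C API; the enum name FDB_NET_OPTION_ENABLE_RUN_LOOP_PROFILING and the API version below are assumptions derived from the option definition above:

#define FDB_API_VERSION 620 // assumed client API version for this era
#include <foundationdb/fdb_c.h>

// Enable run loop profiling before the network is started. The option takes
// no parameter; it requires trace logging and is not recommended in production.
fdb_error_t enableRunLoopProfiling() {
    return fdb_network_set_option(FDB_NET_OPTION_ENABLE_RUN_LOOP_PROFILING, nullptr, 0);
}
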
View File

@ -97,6 +97,7 @@ public:
#endif
static Future<Reference<IAsyncFile>> open( std::string filename, int flags, int mode, void* ignore ) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
ASSERT( flags & OPEN_UNBUFFERED );
if (flags & OPEN_LOCK)
@ -153,6 +154,7 @@ public:
}
static void init( Reference<IEventFD> ev, double ioTimeout ) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
if( !g_network->isSimulated() ) {
ctx.countAIOSubmit.init(LiteralStringRef("AsyncFile.CountAIOSubmit"));
ctx.countAIOCollect.init(LiteralStringRef("AsyncFile.CountAIOCollect"));
@ -578,7 +580,7 @@ private:
static Context ctx;
explicit AsyncFileKAIO(int fd, int flags, std::string const& filename) : fd(fd), flags(flags), filename(filename), failed(false) {
ASSERT( !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO );
if( !g_network->isSimulated() ) {
countFileLogicalWrites.init(LiteralStringRef("AsyncFile.CountFileLogicalWrites"), filename);
countFileLogicalReads.init( LiteralStringRef("AsyncFile.CountFileLogicalReads"), filename);

View File

@ -1360,3 +1360,94 @@ TEST_CASE("/flow/DeterministicRandom/SignedOverflow") {
std::numeric_limits<int64_t>::max() - 1);
return Void();
}
struct Tracker {
int copied;
bool moved;
Tracker(int copied = 0) : copied(copied), moved(false) {}
Tracker(Tracker&& other) : Tracker(other.copied) {
ASSERT(!other.moved);
other.moved = true;
}
Tracker& operator=(Tracker&& other) {
ASSERT(!other.moved);
other.moved = true;
this->moved = false;
this->copied = other.copied;
return *this;
}
Tracker(const Tracker& other) : Tracker(other.copied + 1) { ASSERT(!other.moved); }
Tracker& operator=(const Tracker& other) {
ASSERT(!other.moved);
this->moved = false;
this->copied = other.copied + 1;
return *this;
}
ACTOR static Future<Void> listen(FutureStream<Tracker> stream) {
Tracker t = waitNext(stream);
ASSERT(!t.moved);
ASSERT(t.copied == 0);
return Void();
}
};
TEST_CASE("/flow/flow/PromiseStream/move") {
state PromiseStream<Tracker> stream;
{
// This tests the case when a callback is added before
// a movable value is sent
state Future<Void> listener = Tracker::listen(stream.getFuture());
stream.send(Tracker{});
wait(listener);
}
{
// This tests the case when a callback is added before
// an unmovable value is sent
listener = Tracker::listen(stream.getFuture());
Tracker namedTracker;
stream.send(namedTracker);
wait(listener);
}
{
// This tests the case when no callback is added until
// after a movable value is sent
stream.send(Tracker{});
stream.send(Tracker{});
{
Tracker t = waitNext(stream.getFuture());
ASSERT(!t.moved);
ASSERT(t.copied == 0);
}
choose {
when(Tracker t = waitNext(stream.getFuture())) {
ASSERT(!t.moved);
ASSERT(t.copied == 0);
}
}
}
{
// This tests the case when no callback is added until
// after an unmovable value is sent
Tracker namedTracker1;
Tracker namedTracker2;
stream.send(namedTracker1);
stream.send(namedTracker2);
{
Tracker t = waitNext(stream.getFuture());
ASSERT(!t.moved);
// must copy onto queue
ASSERT(t.copied == 1);
}
choose {
when(Tracker t = waitNext(stream.getFuture())) {
ASSERT(!t.moved);
// must copy onto queue
ASSERT(t.copied == 1);
}
}
}
return Void();
}

View File

@ -93,6 +93,7 @@ Net2FileSystem::Net2FileSystem(double ioTimeout, std::string fileSystemPath)
{
Net2AsyncFile::init();
#ifdef __linux__
if (!FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO)
AsyncFileKAIO::init( Reference<IEventFD>(N2::ASIOReactor::getEventFD()), ioTimeout );
if (fileSystemPath.empty()) {

View File

@ -253,6 +253,15 @@ public:
else
queue->send(value);
}
void send(T&& value) const {
if (queue->isRemoteEndpoint()) {
FlowTransport::transport().sendUnreliable(SerializeSource<T>(std::move(value)), getEndpoint(), true);
}
else
queue->send(std::move(value));
}
/*void sendError(const Error& error) const {
ASSERT( !queue->isRemoteEndpoint() );
queue->sendError(error);

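The new send(T&&) overload lets rvalue arguments be moved all the way into the local queue rather than copied, which is what the PromiseStream/move test above asserts. A minimal standalone sketch of the overload dispatch, with a stand-in Payload type:

#include <iostream>
#include <queue>
#include <utility>

struct Payload {
    int copies = 0;
    Payload() = default;
    Payload(const Payload& o) : copies(o.copies + 1) {} // count each copy
    Payload(Payload&&) = default;                       // moves are free
};

struct Stream {
    std::queue<Payload> q;
    void send(const Payload& p) { q.push(p); }            // lvalues: one copy onto the queue
    void send(Payload&& p) { q.push(std::move(p)); }      // rvalues: moved, zero copies
};

int main() {
    Stream stream;
    Payload named;
    stream.send(named);      // binds the const& overload
    stream.send(Payload{});  // binds the && overload
    std::cout << stream.q.front().copies << "\n"; // 1: copied onto the queue
    stream.q.pop();
    std::cout << stream.q.front().copies << "\n"; // 0: moved onto the queue
}
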
View File

@ -61,7 +61,10 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef<MutationRe
KeyRef end = keyInfo->rangeContaining(k).end();
KeyRangeRef insertRange(k,end);
vector<UID> src, dest;
decodeKeyServersValue(m.param2, src, dest);
// txnStateStore is always an in-memory KVS, and must always be recovered before
// applyMetadataMutations is called, so a wait here should never be needed.
Future<Standalone<RangeResultRef>> fResult = txnStateStore->readRange(serverTagKeys);
decodeKeyServersValue(fResult.get(), m.param2, src, dest);
ASSERT(storageCache);
ServerCacheInfo info;

View File

@ -45,17 +45,21 @@ void BackupProgress::addBackupStatus(const WorkerBackupStatus& status) {
}
void BackupProgress::updateTagVersions(std::map<Tag, Version>* tagVersions, std::set<Tag>* tags,
const std::map<Tag, Version>& progress, Version endVersion, LogEpoch epoch) {
const std::map<Tag, Version>& progress, Version endVersion,
Version adjustedBeginVersion, LogEpoch epoch) {
for (const auto& [tag, savedVersion] : progress) {
// If the tag is not in "tags", the old epoch had more tags than the
// new epoch; just ignore such tags here.
auto n = tags->erase(tag);
if (n > 0 && savedVersion < endVersion - 1) {
tagVersions->insert({ tag, savedVersion + 1 });
const Version beginVersion =
(savedVersion + 1 > adjustedBeginVersion) ? (savedVersion + 1) : adjustedBeginVersion;
tagVersions->insert({ tag, beginVersion });
TraceEvent("BackupVersionRange", dbgid)
.detail("OldEpoch", epoch)
.detail("Tag", tag.toString())
.detail("BeginVersion", savedVersion + 1)
.detail("AdjustedBeginVersion", beginVersion)
.detail("EndVersion", endVersion);
}
}
@ -66,12 +70,20 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
if (!backupStartedValue.present()) return toRecruit; // No active backups
Version lastEnd = invalidVersion;
for (const auto& [epoch, info] : epochInfos) {
std::set<Tag> tags = enumerateLogRouterTags(info.logRouterTags);
std::map<Tag, Version> tagVersions;
// Sometimes, an epoch's begin version is lower than the previous epoch's
// end version. In this case, adjust the epoch's begin version to be the
// same as the previous epoch's end version.
Version adjustedBeginVersion = lastEnd > info.epochBegin ? lastEnd : info.epochBegin;
lastEnd = info.epochEnd;
auto progressIt = progress.lower_bound(epoch);
if (progressIt != progress.end() && progressIt->first == epoch) {
updateTagVersions(&tagVersions, &tags, progressIt->second, info.epochEnd, epoch);
updateTagVersions(&tagVersions, &tags, progressIt->second, info.epochEnd, adjustedBeginVersion, epoch);
} else {
auto rit = std::find_if(
progress.rbegin(), progress.rend(),
@ -90,17 +102,18 @@ std::map<std::tuple<LogEpoch, Version, int>, std::map<Tag, Version>> BackupProgr
// The logRouterTags are the same
// ASSERT(info.logRouterTags == epochTags[rit->first]);
updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, epoch);
updateTagVersions(&tagVersions, &tags, rit->second, info.epochEnd, adjustedBeginVersion, epoch);
}
}
}
for (const Tag tag : tags) { // tags without progress data
tagVersions.insert({ tag, info.epochBegin });
tagVersions.insert({ tag, adjustedBeginVersion });
TraceEvent("BackupVersionRange", dbgid)
.detail("OldEpoch", epoch)
.detail("Tag", tag.toString())
.detail("BeginVersion", info.epochBegin)
.detail("AdjustedBeginVersion", adjustedBeginVersion)
.detail("EndVersion", info.epochEnd);
}
if (!tagVersions.empty()) {

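A small worked example of the adjustment above, with hypothetical versions: if the previous epoch ended at 100 but the current epoch claims to begin at 90, the begin version is clamped to 100, and a tag whose saved progress is 95 resumes from 100 rather than 96:

#include <algorithm>
#include <cassert>
#include <cstdint>

using Version = int64_t;

int main() {
    Version lastEnd = 100, epochBegin = 90, savedVersion = 95;
    Version adjustedBeginVersion = std::max(lastEnd, epochBegin);            // 100
    Version beginVersion = std::max(savedVersion + 1, adjustedBeginVersion); // 100, not 96
    assert(adjustedBeginVersion == 100 && beginVersion == 100);
}
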
View File

@ -81,7 +81,8 @@ private:
// For each tag in progress whose saved version is smaller than endVersion - 1,
// add {tag, max(savedVersion + 1, adjustedBeginVersion)} to tagVersions and remove the tag from "tags".
void updateTagVersions(std::map<Tag, Version>* tagVersions, std::set<Tag>* tags,
const std::map<Tag, Version>& progress, Version endVersion, LogEpoch epoch);
const std::map<Tag, Version>& progress, Version endVersion, Version adjustedBeginVersion,
LogEpoch epoch);
const UID dbgid;

View File

@ -179,7 +179,9 @@ struct BackupData {
config.startedBackupWorkers().set(tr, workers.get());
}
for (auto p : workers.get()) {
TraceEvent("BackupWorkerDebug", self->myId).detail("Epoch", p.first).detail("TagID", p.second);
TraceEvent("BackupWorkerDebugTag", self->myId)
.detail("Epoch", p.first)
.detail("TagID", p.second);
}
wait(tr->commit());

View File

@ -1326,7 +1326,7 @@ public:
Future<Void> outstandingRequestChecker;
Future<Void> outstandingRemoteRequestChecker;
AsyncTrigger updateDBInfo;
std::vector<Endpoint> updateDBInfoEndpoints;
std::set<Endpoint> updateDBInfoEndpoints;
std::set<Endpoint> removedDBInfoEndpoints;
DBInfo db;
@ -1732,7 +1732,7 @@ ACTOR Future<Void> workerAvailabilityWatch( WorkerInterface worker, ProcessClass
? Never()
: waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME);
cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.stableAddress()) );
cluster->updateDBInfoEndpoints.push_back(worker.updateServerDBInfo.getEndpoint());
cluster->updateDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->updateDBInfo.trigger();
// This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch fails for the worker.
wait(delay(0));
@ -2395,12 +2395,12 @@ ACTOR Future<Void> statusServer(FutureStream< StatusRequest> requests,
// Get status but trap errors to send back to client.
vector<WorkerDetails> workers;
std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> workerIssues;
std::vector<ProcessIssues> workerIssues;
for(auto& it : self->id_worker) {
workers.push_back(it.second.details);
if(it.second.issues.size()) {
workerIssues.push_back(std::make_pair(it.second.details.interf.address(), it.second.issues));
workerIssues.push_back(ProcessIssues(it.second.details.interf.address(), it.second.issues));
}
}
@ -3032,30 +3032,26 @@ ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
when(wait(dbInfoChange)) {}
}
UpdateServerDBInfoRequest req;
if(dbInfoChange.isReady()) {
self->updateDBInfoEndpoints.clear();
for(auto &it : self->id_worker) {
self->updateDBInfoEndpoints.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
req.broadcastInfo.push_back(it.second.details.interf.updateServerDBInfo.getEndpoint());
}
} else {
uniquify(self->updateDBInfoEndpoints);
for(int i = 0; i < self->updateDBInfoEndpoints.size(); i++) {
if(self->removedDBInfoEndpoints.count(self->updateDBInfoEndpoints[i])) {
self->updateDBInfoEndpoints[i] = self->updateDBInfoEndpoints.back();
self->updateDBInfoEndpoints.pop_back();
}
for(auto it : self->removedDBInfoEndpoints) {
self->updateDBInfoEndpoints.erase(it);
}
req.broadcastInfo = std::vector<Endpoint>(self->updateDBInfoEndpoints.begin(), self->updateDBInfoEndpoints.end());
}
self->updateDBInfoEndpoints.clear();
self->removedDBInfoEndpoints.clear();
dbInfoChange = self->db.serverInfo->onChange();
updateDBInfo = self->updateDBInfo.onTrigger();
UpdateServerDBInfoRequest req;
req.serializedDbInfo = BinaryWriter::toValue(self->db.serverInfo->get(), AssumeVersion(currentProtocolVersion));
req.broadcastInfo = self->updateDBInfoEndpoints;
self->updateDBInfoEndpoints.clear();
TraceEvent("DBInfoStartBroadcast", self->id);
choose {
when(std::vector<Endpoint> notUpdated = wait( broadcastDBInfoRequest(req, SERVER_KNOBS->DBINFO_SEND_AMOUNT, Optional<Endpoint>(), false) )) {
@ -3063,8 +3059,8 @@ ACTOR Future<Void> dbInfoUpdater( ClusterControllerData* self ) {
for(auto &it : notUpdated) {
TraceEvent("DBInfoNotUpdated", self->id).detail("Addr", it.getPrimaryAddress());
}
self->updateDBInfoEndpoints.insert(self->updateDBInfoEndpoints.end(), notUpdated.begin(), notUpdated.end());
if(notUpdated.size()) {
self->updateDBInfoEndpoints.insert(notUpdated.begin(), notUpdated.end());
self->updateDBInfo.trigger();
}
}
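
Switching updateDBInfoEndpoints from a vector to a std::set turns the removal pass into a set erase and deduplicates endpoints that are re-inserted when a broadcast reports them as not updated. A minimal sketch of that retry shape, with Endpoint as a stand-in type:

#include <set>
#include <string>
#include <vector>

using Endpoint = std::string; // stand-in for the real Endpoint type

struct Broadcaster {
    std::set<Endpoint> pending; // endpoints that still need the latest DBInfo
    std::set<Endpoint> removed; // endpoints of workers that went away

    std::vector<Endpoint> buildRequest() {
        for (const Endpoint& e : removed) pending.erase(e); // drop dead workers
        removed.clear();
        std::vector<Endpoint> targets(pending.begin(), pending.end());
        pending.clear(); // this broadcast now owns these targets
        return targets;
    }

    void onBroadcastDone(const std::vector<Endpoint>& notUpdated) {
        // Re-queue failures; the set absorbs duplicates from concurrent triggers.
        pending.insert(notUpdated.begin(), notUpdated.end());
    }
};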

View File

@ -180,10 +180,10 @@ class WorkPool : public IThreadPool, public ReferenceCounted<WorkPool<Threadlike
ACTOR Future<Void> stopOnError( WorkPool* w ) {
try {
wait( w->getError() );
ASSERT(false);
} catch (Error& e) {
w->error = e;
w->stop(e);
}
w->stop();
return Void();
}
@ -230,12 +230,14 @@ public:
} else
pool->queueLock.leave();
}
virtual Future<Void> stop() {
if (error.code() == invalid_error_code) error = success();
virtual Future<Void> stop(Error const& e) {
if (error.code() == invalid_error_code) {
error = e;
}
pool->queueLock.enter();
TraceEvent("WorkPool_Stop").detail("Workers", pool->workers.size()).detail("Idle", pool->idle.size())
.detail("Work", pool->work.size());
.detail("Work", pool->work.size()).error(e, true);
for (uint32_t i=0; i<pool->work.size(); i++)
pool->work[i]->cancel(); // What if cancel() does something to this?

View File

@ -474,6 +474,8 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution( Dat
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
wait(checkMoveKeysLockReadOnly(&tr, moveKeysLock));
state Standalone<RangeResultRef> UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
Standalone<RangeResultRef> keyServers = wait(krmGetRanges(&tr, keyServersPrefix, KeyRangeRef(beginKey, allKeys.end), SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES));
succeeded = true;
@ -482,7 +484,7 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution( Dat
// for each range
for(int i = 0; i < keyServers.size() - 1; i++) {
DDShardInfo info( keyServers[i].key );
decodeKeyServersValue( keyServers[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, keyServers[i].value, src, dest );
if(remoteDcIds.size()) {
auto srcIter = team_cache.find(src);
if(srcIter == team_cache.end()) {

View File

@ -537,7 +537,7 @@ struct DDQueueData {
// FIXME: is the merge case needed
if( input.priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD ) {
wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution )) ) );
wait( delay( 0.5, TaskPriority::DataDistributionVeryLow ) );
} else {
wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) );
}
@ -546,6 +546,8 @@ struct DDQueueData {
servers.clear();
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
try {
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
Standalone<RangeResultRef> keyServersEntries = wait(
tr.getRange( lastLessOrEqual( keyServersKey( input.keys.begin ) ),
firstGreaterOrEqual( keyServersKey( input.keys.end ) ), SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS ) );
@ -553,7 +555,7 @@ struct DDQueueData {
if(keyServersEntries.size() < SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS) {
for( int shard = 0; shard < keyServersEntries.size(); shard++ ) {
vector<UID> src, dest;
decodeKeyServersValue( keyServersEntries[shard].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, keyServersEntries[shard].value, src, dest );
ASSERT( src.size() );
for( int i = 0; i < src.size(); i++ ) {
servers.insert( src[i] );

View File

@ -367,14 +367,14 @@ ACTOR Future<Void> changeSizes( DataDistributionTracker* self, KeyRange keys, in
struct HasBeenTrueFor : ReferenceCounted<HasBeenTrueFor> {
explicit HasBeenTrueFor( Optional<ShardMetrics> value ) {
if(value.present()) {
trigger = delayJittered(std::max(0.0, SERVER_KNOBS->DD_MERGE_COALESCE_DELAY + value.get().lastLowBandwidthStartTime - now()), decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture();
trigger = delayJittered(std::max(0.0, SERVER_KNOBS->DD_MERGE_COALESCE_DELAY + value.get().lastLowBandwidthStartTime - now()), TaskPriority::DataDistributionLow ) || cleared.getFuture();
}
}
Future<Void> set() {
if( !trigger.isValid() ) {
cleared = Promise<Void>();
trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture();
trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, TaskPriority::DataDistributionLow ) || cleared.getFuture();
}
return trigger;
}

View File

@ -26,6 +26,7 @@
#include "fdbclient/Notified.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "fdbserver/DeltaTree.h"
#define OP_DISK_OVERHEAD (sizeof(OpHeader) + 1)
@ -268,7 +269,8 @@ private:
OpSnapshotEnd,
OpSnapshotAbort, // terminate an in progress snapshot in order to start a full snapshot
OpCommit, // only in log, not in queue
OpRollback // only in log, not in queue
OpRollback, // only in log, not in queue
OpSnapshotItemDelta
};
struct OpRef {
@ -344,8 +346,7 @@ private:
int64_t overheadWriteBytes;
NotifiedVersion notifiedCommittedWriteBytes;
Key recoveredSnapshotKey; // After recovery, the next key in the currently uncompleted snapshot
IDiskQueue::location
currentSnapshotEnd; // The end of the most recently completed snapshot (this snapshot cannot be discarded)
IDiskQueue::location currentSnapshotEnd; // The end of the most recently completed snapshot (this snapshot cannot be discarded)
IDiskQueue::location previousSnapshotEnd; // The end of the second most recently completed snapshot (on commit, this
// snapshot can be discarded)
PromiseStream<Future<Void>> addActor;
@ -443,6 +444,7 @@ private:
state OpQueue recoveryQueue;
state OpHeader h;
state Standalone<StringRef> lastSnapshotKey;
TraceEvent("KVSMemRecoveryStarted", self->id)
.detail("SnapshotEndLocation", uncommittedSnapshotEnd);
@ -485,7 +487,7 @@ private:
StringRef p1 = data.substr(0, h.len1);
StringRef p2 = data.substr(h.len1, h.len2);
if (h.op == OpSnapshotItem) { // snapshot data item
if (h.op == OpSnapshotItem || h.op == OpSnapshotItemDelta) { // snapshot data item
/*if (p1 < uncommittedNextKey) {
TraceEvent(SevError, "RecSnapshotBack", self->id)
.detail("NextKey", uncommittedNextKey)
@ -493,11 +495,27 @@ private:
.detail("Nextlocation", self->log->getNextReadLocation());
}
ASSERT( p1 >= uncommittedNextKey );*/
if(h.op == OpSnapshotItemDelta) {
ASSERT(p1.size() > 1);
// Get number of bytes borrowed from previous item key
int borrowed = *(uint8_t *)p1.begin();
ASSERT(borrowed <= lastSnapshotKey.size());
// Trim p1 to just the suffix
StringRef suffix = p1.substr(1);
// Allocate a new string in data arena to hold prefix + suffix
Arena &dataArena = *(Arena *)&data.arena();
p1 = makeString(borrowed + suffix.size(), dataArena);
// Copy the prefix into the new reconstituted key
memcpy(mutateString(p1), lastSnapshotKey.begin(), borrowed);
// Copy the suffix into the new reconstituted key
memcpy(mutateString(p1) + borrowed, suffix.begin(), suffix.size());
}
if( p1 >= uncommittedNextKey )
recoveryQueue.clear( KeyRangeRef(uncommittedNextKey, p1), &uncommittedNextKey.arena() ); //FIXME: Not sure what this line is for, is it necessary?
recoveryQueue.set( KeyValueRef(p1, p2), &data.arena() );
uncommittedNextKey = keyAfter(p1);
++dbgSnapshotItemCount;
lastSnapshotKey = Key(p1, data.arena());
} else if (h.op == OpSnapshotEnd || h.op == OpSnapshotAbort) { // snapshot complete
TraceEvent("RecSnapshotEnd", self->id)
.detail("NextKey", uncommittedNextKey)
@ -511,6 +529,7 @@ private:
}
uncommittedNextKey = Key();
lastSnapshotKey = Key();
++dbgSnapshotEndCount;
} else if (h.op == OpSet) { // set mutation
recoveryQueue.set( KeyValueRef(p1,p2), &data.arena() );
@ -629,6 +648,12 @@ private:
state int snapItems = 0;
state uint64_t snapshotBytes = 0;
// Snapshot keys will be alternately written to two preallocated buffers.
// This allows consecutive snapshot keys to be compared for delta compression while only copying each key's bytes once.
state Key lastSnapshotKeyA = makeString(CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT);
state Key lastSnapshotKeyB = makeString(CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT);
state bool lastSnapshotKeyUsingA = true;
TraceEvent("KVSMemStartingSnapshot", self->id).detail("StartKey", nextKey);
loop {
@ -652,7 +677,19 @@ private:
.detail("LastOperationWasASnapshot", nextKey == Key() && !nextKeyAfter);
lastDiff = diff;
// Since notifiedCommittedWriteBytes is only set() once per commit, before logging the commit operation, when
// this line is reached it is certain that there are no snapshot items in this commit yet. Since this commit
// could be the first thing read during recovery, we can't write a delta yet.
bool useDelta = false;
// Write snapshot items until the wait above would block because we've used up all of the byte budget
loop {
if (next == self->data.end()) {
// After a snapshot end is logged, recovery may not see the last snapshot item logged before it, so the
// next snapshot item logged cannot be a delta.
useDelta = false;
auto thisSnapshotEnd = self->log_op(OpSnapshotEnd, StringRef(), StringRef());
//TraceEvent("SnapshotEnd", self->id)
// .detail("LastKey", lastKey.present() ? lastKey.get() : LiteralStringRef("<none>"))
@ -670,22 +707,88 @@ private:
if (++self->snapshotCount == 2) {
self->replaceContent = false;
}
snapItems = 0;
snapshotBytes = 0;
snapshotTotalWrittenBytes += OP_DISK_OVERHEAD;
// If we're not stopping now, reset next
if(snapshotTotalWrittenBytes < self->notifiedCommittedWriteBytes.get()) {
next = self->data.begin();
}
else {
// Otherwise, save state for continuing after the next wait and stop
nextKey = Key();
nextKeyAfter = false;
snapItems = 0;
break;
}
snapshotBytes = 0;
snapshotTotalWrittenBytes += OP_DISK_OVERHEAD;
} else {
StringRef tempKey = next.getKey(self->reserved_buffer);
// destKey is whichever of the two last key buffers we should write to next.
Key &destKey = lastSnapshotKeyUsingA ? lastSnapshotKeyA : lastSnapshotKeyB;
// Get the key, using destKey as a temporary buffer if needed.
KeyRef tempKey = next.getKey(mutateString(destKey));
int opKeySize = tempKey.size();
// If tempKey did not use the start of destKey, then copy tempKey into destKey.
// It's technically possible for the source and dest to overlap but with the current container implementations that will not happen.
if(tempKey.begin() != destKey.begin()) {
memcpy(mutateString(destKey), tempKey.begin(), tempKey.size());
}
// Now, tempKey's bytes definitely exist in memory at destKey.begin() so update destKey's contents to be a proper KeyRef of the key.
// This intentionally leaves the Arena alone and doesn't copy anything into it.
destKey.contents() = KeyRef(destKey.begin(), tempKey.size());
// Get the common prefix between this key and the previous one, or 0 if there was no previous one.
int commonPrefix;
if(useDelta) {
commonPrefix = commonPrefixLength(lastSnapshotKeyA, lastSnapshotKeyB);
}
else {
commonPrefix = 0;
useDelta = true;
}
// If the common prefix is longer than 1 byte, write a delta item. It isn't worth doing for 0 or 1 bytes; it would merely add decode overhead (string copying).
if(commonPrefix > 1) {
// Cap the common prefix length to 255. Sorry, ridiculously long keys!
commonPrefix = std::min<int>(commonPrefix, std::numeric_limits<uint8_t>::max());
// We're going to temporarily write a 1-byte integer just before the key suffix to create the log op key and log it, then restore that byte.
uint8_t &prefixLength = mutateString(destKey)[commonPrefix - 1];
uint8_t backupByte = prefixLength;
prefixLength = commonPrefix;
opKeySize = opKeySize - commonPrefix + 1;
KeyRef opKey(&prefixLength, opKeySize);
self->log_op(OpSnapshotItemDelta, opKey, next.getValue());
// Restore the overwritten byte
prefixLength = backupByte;
}
else {
self->log_op(OpSnapshotItem, tempKey, next.getValue());
nextKey = tempKey;
nextKeyAfter = true;
}
snapItems++;
uint64_t opBytes = tempKey.size() + next.getValue().size() + OP_DISK_OVERHEAD;
uint64_t opBytes = opKeySize + next.getValue().size() + OP_DISK_OVERHEAD;
snapshotBytes += opBytes;
snapshotTotalWrittenBytes += opBytes;
lastSnapshotKeyUsingA = !lastSnapshotKeyUsingA;
// If we're not stopping now, increment next
if(snapshotTotalWrittenBytes < self->notifiedCommittedWriteBytes.get()) {
++next;
}
else {
// Otherwise, save state for continuing after the next wait and stop
nextKey = destKey;
nextKeyAfter = true;
break;
}
}
}
}
}
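
A standalone roundtrip of the OpSnapshotItemDelta key layout used above, under the same assumptions: the op key is one length byte (the number of bytes borrowed from the previous snapshot key, capped at 255) followed by the key's remaining suffix:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>

// Encode key as [borrowed][suffix], borrowing as much of prev as possible (max 255).
// (The real code falls back to a plain OpSnapshotItem when the shared prefix is <= 1.)
static std::string encodeDelta(const std::string& prev, const std::string& key) {
    size_t n = std::min(prev.size(), key.size());
    size_t common = 0;
    while (common < n && prev[common] == key[common]) ++common;
    common = std::min<size_t>(common, 255);
    std::string op;
    op.push_back(static_cast<char>(static_cast<uint8_t>(common)));
    op.append(key, common, std::string::npos);
    return op;
}

// Decode by gluing the borrowed prefix of prev onto the stored suffix.
static std::string decodeDelta(const std::string& prev, const std::string& op) {
    uint8_t borrowed = static_cast<uint8_t>(op[0]);
    return prev.substr(0, borrowed) + op.substr(1);
}

int main() {
    std::string prev = "apple/0001", key = "apple/0002";
    std::string op = encodeDelta(prev, key);
    assert(op.size() == 2); // 9 shared bytes ("apple/000") collapse to one length byte
    assert(decodeDelta(prev, op) == key);
}
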

View File

@ -1858,13 +1858,15 @@ private:
ACTOR static Future<Void> stopOnError( KeyValueStoreSQLite* self ) {
try {
wait( self->readThreads->getError() || self->writeThread->getError() );
ASSERT(false);
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
self->readThreads->stop(e);
self->writeThread->stop(e);
}
self->readThreads->stop();
self->writeThread->stop();
return Void();
}

View File

@ -201,7 +201,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0;
init( STORAGE_METRICS_RANDOM_DELAY, 0.2 );
init( AVAILABLE_SPACE_RATIO_CUTOFF, 0.05 );
init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10);
init( DESIRED_TEAMS_PER_SERVER, 5 ); DESIRED_TEAMS_PER_SERVER = deterministicRandom()->randomInt(1, 10);
init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER );
init( DD_SHARD_SIZE_GRANULARITY, 5000000 );
init( DD_SHARD_SIZE_GRANULARITY_SIM, 500000 ); if( randomize && BUGGIFY ) DD_SHARD_SIZE_GRANULARITY_SIM = 0;
@ -307,6 +307,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, 0.001 );
init( START_TRANSACTION_MAX_TRANSACTIONS_TO_START, 100000 );
init( START_TRANSACTION_MAX_REQUESTS_TO_START, 10000 );
init( START_TRANSACTION_RATE_WINDOW, 2.0 );
init( START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET, 10.0 );
init( START_TRANSACTION_MAX_QUEUE_SIZE, 1e6 );
init( KEY_LOCATION_MAX_QUEUE_SIZE, 1e6 );
@ -326,7 +328,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( COMMIT_TRANSACTION_BATCH_BYTES_SCALE_BASE, 100000 );
init( COMMIT_TRANSACTION_BATCH_BYTES_SCALE_POWER, 0.0 );
init( TRANSACTION_BUDGET_TIME, 0.050 ); if( randomize && BUGGIFY ) TRANSACTION_BUDGET_TIME = 0.0;
init( RESOLVER_COALESCE_TIME, 1.0 );
init( BUGGIFIED_ROW_LIMIT, APPLY_MUTATION_BYTES ); if( randomize && BUGGIFY ) BUGGIFIED_ROW_LIMIT = deterministicRandom()->randomInt(3, 30);
init( PROXY_SPIN_DELAY, 0.01 );
@ -360,7 +361,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( PROVISIONAL_START_DELAY, 1.0 );
init( PROVISIONAL_MAX_DELAY, 60.0 );
init( PROVISIONAL_DELAY_GROWTH, 1.5 );
init( SECONDS_BEFORE_RECRUIT_BACKUP_WORKER, 4.0 );
init( SECONDS_BEFORE_RECRUIT_BACKUP_WORKER, 4.0 ); if( randomize && BUGGIFY ) SECONDS_BEFORE_RECRUIT_BACKUP_WORKER = deterministicRandom()->random01() * 8;
// Resolver
init( SAMPLE_OFFSET_PER_KEY, 100 );
@ -584,6 +585,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_HEARTBEAT_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_DELAY = deterministicRandom()->random01() * 120 + 2; }
init( FASTRESTORE_HEARTBEAT_MAX_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_MAX_DELAY = FASTRESTORE_HEARTBEAT_DELAY * 10; }
init( FASTRESTORE_APPLIER_FETCH_KEYS_SIZE, 100 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLIER_FETCH_KEYS_SIZE = deterministicRandom()->random01() * 10240 + 1; }
init( FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES, 1.0 * 1024.0 * 1024.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 + 1; }
// clang-format on

View File

@ -248,6 +248,8 @@ public:
double START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL;
double START_TRANSACTION_MAX_TRANSACTIONS_TO_START;
int START_TRANSACTION_MAX_REQUESTS_TO_START;
double START_TRANSACTION_RATE_WINDOW;
double START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET;
int START_TRANSACTION_MAX_QUEUE_SIZE;
int KEY_LOCATION_MAX_QUEUE_SIZE;
@ -265,7 +267,6 @@ public:
double COMMIT_BATCHES_MEM_FRACTION_OF_TOTAL;
double COMMIT_BATCHES_MEM_TO_TOTAL_MEM_SCALE_FACTOR;
double TRANSACTION_BUDGET_TIME;
double RESOLVER_COALESCE_TIME;
int BUGGIFIED_ROW_LIMIT;
double PROXY_SPIN_DELAY;
@ -522,6 +523,7 @@ public:
int64_t FASTRESTORE_HEARTBEAT_DELAY; // interval for master to ping loaders and appliers
int64_t FASTRESTORE_HEARTBEAT_MAX_DELAY; // master claim a node is down if no heart beat from the node for this delay
int64_t FASTRESTORE_APPLIER_FETCH_KEYS_SIZE; // number of keys to fetch in a txn on applier
int64_t FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES; // desired size of mutation message sent from loader to appliers
ServerKnobs();
void initialize(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false);

View File

@ -120,8 +120,82 @@ struct ProxyStats {
}
};
ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, double* outTransactionRate,
double* outBatchTransactionRate, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) {
struct TransactionRateInfo {
double rate;
double limit;
double budget;
bool disabled;
Smoother smoothRate;
Smoother smoothReleased;
TransactionRateInfo(double rate) : rate(rate), limit(0), budget(0), disabled(true), smoothRate(SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW),
smoothReleased(SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW) {}
void reset() {
// Determine the number of transactions that this proxy is allowed to release
// Roughly speaking, this is done by computing the number of transactions over some historical window that we could
// have started but didn't, and making that our limit. More precisely, we track a smoothed rate limit and release rate,
// the difference of which is the rate of additional transactions that we could have released based on that window.
// Then we multiply by the window size to get a number of transactions.
//
// Limit can be negative in the event that we are releasing more transactions than we are allowed (due to the use of
// our budget or because of higher priority transactions).
double releaseRate = smoothRate.smoothTotal() - smoothReleased.smoothRate();
limit = SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW * releaseRate;
}
bool canStart(int64_t numAlreadyStarted, int64_t count) {
return numAlreadyStarted + count <= std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
void updateBudget(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the window that elapsed.
//
// We may have exceeded our limit due to the budget or because of higher priority transactions, in which case this
// delta will be negative. The delta can also be negative in the event that our limit was negative, which can happen
// if we had already started more transactions in our window than our rate would have allowed.
//
// This budget has the property that when the budget is required to start transactions (because batches are big),
// the sum limit+budget will increase linearly from 0 to the batch size over time and decrease by the batch size
// upon starting a batch. In other words, this works equivalently to a model where we linearly accumulate budget over
// time in the case that our batches are too big to take advantage of the window based limits.
budget = std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward
// If we did keep accumulating budget, then our responsiveness to changes in workflow could be compromised
if(queueEmptyAtPriority) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
}
smoothReleased.addDelta(numStartedAtPriority);
}
void disable() {
disabled = true;
rate = 0;
smoothRate.reset(0);
}
void setRate(double rate) {
ASSERT(rate >= 0 && rate != std::numeric_limits<double>::infinity() && !isnan(rate));
this->rate = rate;
if(disabled) {
smoothRate.reset(rate);
disabled = false;
}
else {
smoothRate.setTotal(rate);
}
}
};
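
A self-contained sketch of the window-based limit documented in reset() above, with a toy exponential smoother standing in for flow's Smoother: the limit is the gap between the smoothed allowed rate and the smoothed release rate, scaled by the window, and unused limit accrues to a bounded budget:

#include <algorithm>
#include <cstdio>

// Toy exponential smoother: smoothedTotal tracks the (possibly set) total,
// smoothedRate tracks d(total)/dt. Not flow's Smoother, just the same idea.
struct ToySmoother {
    double window, total = 0, smoothedTotal = 0, lastTotal = 0, smoothedRate = 0;
    explicit ToySmoother(double window) : window(window) {}
    void addDelta(double d) { total += d; }
    void setTotal(double t) { total = t; }
    void tick(double dt) {
        double a = dt / (dt + window);
        smoothedTotal += a * (total - smoothedTotal);
        smoothedRate += a * ((total - lastTotal) / dt - smoothedRate);
        lastTotal = total;
    }
};

int main() {
    const double window = 2.0;    // stand-in for START_TRANSACTION_RATE_WINDOW
    ToySmoother allowed(window);  // stores the granted rate as its "total"
    ToySmoother released(window); // accumulates released transaction counts
    allowed.setTotal(1000.0);     // ratekeeper grants 1000 transactions/sec
    double budget = 0.0;
    for (int step = 1; step <= 50; ++step) {
        const double dt = 0.1;
        allowed.tick(dt);
        released.tick(dt);
        // limit = window * (smoothed allowed rate - smoothed release rate)
        double limit = window * (allowed.smoothedTotal - released.smoothedRate);
        double started = std::max(0.0, std::min(limit + budget, 120.0)); // batch cap
        released.addDelta(started);
        budget = std::max(0.0, budget + dt * (limit - started) / window);
        if (step % 10 == 0)
            std::printf("t=%.1fs limit=%.0f started=%.0f budget=%.1f\n",
                        step * dt, limit, started, budget);
    }
}
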
ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64_t* inTransactionCount, int64_t* inBatchTransactionCount, TransactionRateInfo *transactionRateInfo,
TransactionRateInfo *batchTransactionRateInfo, GetHealthMetricsReply* healthMetricsReply, GetHealthMetricsReply* detailedHealthMetricsReply) {
state Future<Void> nextRequestTimer = Never();
state Future<Void> leaseTimeout = Never();
state Future<GetRateInfoReply> reply = Never();
@ -150,8 +224,9 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
}
when ( GetRateInfoReply rep = wait(reply) ) {
reply = Never();
*outTransactionRate = rep.transactionRate;
*outBatchTransactionRate = rep.batchTransactionRate;
transactionRateInfo->setRate(rep.transactionRate);
batchTransactionRateInfo->setRate(rep.batchTransactionRate);
//TraceEvent("MasterProxyRate", myID).detail("Rate", rep.transactionRate).detail("BatchRate", rep.batchTransactionRate).detail("Lease", rep.leaseDuration).detail("ReleasedTransactions", *inTransactionCount - lastTC);
lastTC = *inTransactionCount;
leaseTimeout = delay(rep.leaseDuration);
@ -163,35 +238,15 @@ ACTOR Future<Void> getRate(UID myID, Reference<AsyncVar<ServerDBInfo>> db, int64
}
}
when ( wait( leaseTimeout ) ) {
*outTransactionRate = 0;
*outBatchTransactionRate = 0;
//TraceEvent("MasterProxyRate", myID).detail("Rate", 0.0).detail("BatchRate", 0.0).detail("Lease", "Expired");
transactionRateInfo->disable();
batchTransactionRateInfo->disable();
TraceEvent(SevWarn, "MasterProxyRateLeaseExpired", myID).suppressFor(5.0);
//TraceEvent("MasterProxyRate", myID).detail("Rate", 0.0).detail("BatchRate", 0.0).detail("Lease", 0);
leaseTimeout = Never();
}
}
}
struct TransactionRateInfo {
double rate;
double limit;
TransactionRateInfo(double rate) : rate(rate), limit(0) {}
void reset(double elapsed) {
limit = std::min(0.0, limit) + rate * elapsed; // Adjust the limit based on the full elapsed interval in order to properly erase a deficit
limit = std::min(limit, rate * SERVER_KNOBS->START_TRANSACTION_BATCH_INTERVAL_MAX); // Don't allow the rate to exceed what would be allowed in the maximum batch interval
limit = std::min(limit, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
bool canStart(int64_t numAlreadyStarted) {
return numAlreadyStarted < limit;
}
void updateBudget(int64_t numStarted) {
limit -= numStarted;
}
};
ACTOR Future<Void> queueTransactionStartRequests(
Reference<AsyncVar<ServerDBInfo>> db,
Deque<GetReadVersionRequest> *systemQueue,
@ -531,7 +586,7 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
}
if((batchBytes + bytes > CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT || req.firstInBatch()) && batch.size()) {
out.send({ batch, batchBytes });
out.send({ std::move(batch), batchBytes });
lastBatch = now();
timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher);
batch = std::vector<CommitTransactionRequest>();
@ -1354,7 +1409,7 @@ ACTOR static Future<Void> transactionStarter(
state vector<MasterProxyInterface> otherProxies;
state PromiseStream<double> replyTimes;
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo.rate, &batchRateInfo.rate, healthMetricsReply, detailedHealthMetricsReply));
addActor.send(getRate(proxy.id(), db, &transactionCount, &batchTransactionCount, &normalRateInfo, &batchRateInfo, healthMetricsReply, detailedHealthMetricsReply));
addActor.send(queueTransactionStartRequests(db, &systemQueue, &defaultQueue, &batchQueue, proxy.getConsistentReadVersion.getFuture(),
GRVTimer, &lastGRVTime, &GRVBatchTime, replyTimes.getFuture(),
&commitData->stats, &batchRateInfo));
@ -1380,8 +1435,8 @@ ACTOR static Future<Void> transactionStarter(
if(elapsed == 0) elapsed = 1e-15; // resolve a possible indeterminant multiplication with infinite transaction rate
normalRateInfo.reset(elapsed);
batchRateInfo.reset(elapsed);
normalRateInfo.reset();
batchRateInfo.reset();
int transactionsStarted[2] = {0,0};
int systemTransactionsStarted[2] = {0,0};
@ -1408,11 +1463,10 @@ ACTOR static Future<Void> transactionStarter(
auto& req = transactionQueue->front();
int tc = req.transactionCount;
if (req.priority() < GetReadVersionRequest::PRIORITY_DEFAULT &&
!batchRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1])) {
if(req.priority() < GetReadVersionRequest::PRIORITY_DEFAULT && !batchRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1], tc)) {
break;
} else if (req.priority() < GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE &&
!normalRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1])) {
}
else if(req.priority() < GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE && !normalRateInfo.canStart(transactionsStarted[0] + transactionsStarted[1], tc)) {
break;
}
@ -1450,11 +1504,15 @@ ACTOR static Future<Void> transactionStarter(
.detail("TransactionBudget", transactionBudget)
.detail("BatchTransactionBudget", batchTransactionBudget);*/
transactionCount += transactionsStarted[0] + transactionsStarted[1];
batchTransactionCount += batchPriTransactionsStarted[0] + batchPriTransactionsStarted[1];
int systemTotalStarted = systemTransactionsStarted[0] + systemTransactionsStarted[1];
int normalTotalStarted = defaultPriTransactionsStarted[0] + defaultPriTransactionsStarted[1];
int batchTotalStarted = batchPriTransactionsStarted[0] + batchPriTransactionsStarted[1];
normalRateInfo.updateBudget(transactionsStarted[0] + transactionsStarted[1]);
batchRateInfo.updateBudget(transactionsStarted[0] + transactionsStarted[1]);
transactionCount += transactionsStarted[0] + transactionsStarted[1];
batchTransactionCount += batchTotalStarted;
normalRateInfo.updateBudget(systemTotalStarted + normalTotalStarted, systemQueue.empty() && defaultQueue.empty(), elapsed);
batchRateInfo.updateBudget(systemTotalStarted + normalTotalStarted + batchTotalStarted, systemQueue.empty() && defaultQueue.empty() && batchQueue.empty(), elapsed);
if (debugID.present()) {
g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.masterProxyServerCore.Broadcast");
@ -1935,6 +1993,7 @@ ACTOR Future<Void> masterProxyServerCore(
state KeyRange txnKeys = allKeys;
loop {
wait(yield());
Standalone<RangeResultRef> UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get();
Standalone<RangeResultRef> data = commitData.txnStateStore->readRange(txnKeys, SERVER_KNOBS->BUGGIFIED_ROW_LIMIT, SERVER_KNOBS->APPLY_MUTATION_BYTES).get();
if(!data.size()) break;
((KeyRangeRef&)txnKeys) = KeyRangeRef( keyAfter(data.back().key, txnKeys.arena()), txnKeys.end );
@ -1947,7 +2006,7 @@ ACTOR Future<Void> masterProxyServerCore(
if( kv.key.startsWith(keyServersPrefix) ) {
KeyRef k = kv.key.removePrefix(keyServersPrefix);
if(k != allKeys.end) {
decodeKeyServersValue(kv.value, src, dest);
decodeKeyServersValue(UIDtoTagMap, kv.value, src, dest);
info.tags.clear();
info.src_info.clear();
info.dest_info.clear();

View File

@ -211,13 +211,15 @@ ACTOR Future<vector<UID>> addReadWriteDestinations(KeyRangeRef shard, vector<Sto
}
ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> shards, Transaction* tr, int desiredHealthy, int maxServers) {
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
vector<Future<Optional<Value>>> serverListEntries;
std::set<UID> fetching;
for(int i = 0; i < shards.size() - 1; ++i) {
vector<UID> src;
vector<UID> dest;
decodeKeyServersValue( shards[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, shards[i].value, src, dest );
for(int s=0; s<src.size(); s++) {
if(!fetching.count(src[s])) {
@ -251,7 +253,7 @@ ACTOR Future<vector<vector<UID>>> additionalSources(Standalone<RangeResultRef> s
vector<StorageServerInterface> srcInterfs;
vector<StorageServerInterface> destInterfs;
decodeKeyServersValue( shards[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, shards[i].value, src, dest );
for(int s=0; s<src.size(); s++) {
srcInterfs.push_back( ssiMap[src[s]] );
@ -356,6 +358,8 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
// printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str());
//Check that enough servers for each shard are in the correct state
state Standalone<RangeResultRef> UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
vector<vector<UID>> addAsSource = wait(additionalSources(old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER*servers.size()));
// For each intersecting range, update keyServers[range] dest to be servers and clear existing dest servers from serverKeys
@ -363,7 +367,7 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
KeyRangeRef rangeIntersectKeys( old[i].key, old[i+1].key );
vector<UID> src;
vector<UID> dest;
decodeKeyServersValue( old[i].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, old[i].value, src, dest );
// TraceEvent("StartMoveKeysOldRange", relocationIntervalId)
// .detail("KeyBegin", rangeIntersectKeys.begin.toString())
@ -378,7 +382,7 @@ ACTOR Future<Void> startMoveKeys( Database occ, KeyRange keys, vector<UID> serve
uniquify(src);
//Update dest servers for this range to be equal to servers
krmSetPreviouslyEmptyRange( &tr, keyServersPrefix, rangeIntersectKeys, keyServersValue(src, servers), old[i+1].value );
krmSetPreviouslyEmptyRange( &tr, keyServersPrefix, rangeIntersectKeys, keyServersValue(UIDtoTagMap, src, servers), old[i+1].value );
//Track old destination servers. They may be removed from serverKeys soon, since they are about to be overwritten in keyServers
for(auto s = dest.begin(); s != dest.end(); ++s) {
@ -555,6 +559,8 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
wait( checkMoveKeysLock(&tr, lock) );
state KeyRange currentKeys = KeyRangeRef(begin, keys.end);
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
state Standalone<RangeResultRef> keyServers = wait( krmGetRanges( &tr, keyServersPrefix, currentKeys, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES ) );
//Determine the last processed key (which will be the beginning for the next iteration)
@ -575,7 +581,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
//Iterate through the beginning of keyServers until we find one that hasn't already been processed
int currentIndex;
for(currentIndex = 0; currentIndex < keyServers.size() - 1 && alreadyMoved; currentIndex++) {
decodeKeyServersValue( keyServers[currentIndex].value, src, dest );
decodeKeyServersValue( UIDtoTagMap, keyServers[currentIndex].value, src, dest );
std::set<UID> srcSet;
for(int s = 0; s < src.size(); s++) {
@ -633,7 +639,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
//Process the rest of the key servers
for(; currentIndex < keyServers.size() - 1; currentIndex++) {
vector<UID> src2, dest2;
decodeKeyServersValue( keyServers[currentIndex].value, src2, dest2 );
decodeKeyServersValue( UIDtoTagMap, keyServers[currentIndex].value, src2, dest2 );
std::set<UID> srcSet;
for(int s = 0; s < src2.size(); s++)
@ -718,7 +724,7 @@ ACTOR Future<Void> finishMoveKeys( Database occ, KeyRange keys, vector<UID> dest
if( count == dest.size() ) {
// update keyServers, serverKeys
// SOMEDAY: Doing these in parallel is safe because none of them overlap or touch (one per server)
wait( krmSetRangeCoalescing( &tr, keyServersPrefix, currentKeys, keys, keyServersValue( dest ) ) );
wait( krmSetRangeCoalescing( &tr, keyServersPrefix, currentKeys, keys, keyServersValue( UIDtoTagMap, dest ) ) );
std::set<UID>::iterator asi = allServers.begin();
std::vector<Future<Void>> actors;
@ -989,6 +995,8 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
// Get all values of keyServers and remove serverID from every occurrence
// Very inefficient going over every entry in keyServers
// No shortcut because keyServers and serverKeys are not guaranteed same shard boundaries
state Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
state Standalone<RangeResultRef> keyServers =
wait(krmGetRanges(&tr, keyServersPrefix, KeyRangeRef(begin, allKeys.end),
SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES));
@ -997,7 +1005,7 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
auto it = keyServers[i];
vector<UID> src;
vector<UID> dest;
decodeKeyServersValue(it.value, src, dest);
decodeKeyServersValue(UIDtoTagMap, it.value, src, dest);
// The failed server is not present
if (std::find(src.begin(), src.end(), serverID) == src.end() &&
@ -1013,7 +1021,7 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
.detail("Key", it.key)
.detail("ValueSrc", describe(src))
.detail("ValueDest", describe(dest));
tr.set(keyServersKey(it.key), keyServersValue(src, dest));
tr.set(keyServersKey(it.key), keyServersValue(UIDtoTagMap, src, dest));
}
// Set entire range for our serverID in serverKeys keyspace to false to signal erasure
@ -1095,13 +1103,13 @@ void seedShardServers(
tr.set(arena, serverListKeyFor(servers[s].id()), serverListValue(servers[s]));
}
std::vector<UID> serverIds;
std::vector<Tag> serverTags;
for(int i=0;i<servers.size();i++)
serverIds.push_back(servers[i].id());
serverTags.push_back(server_tag[servers[i].id()]);
// We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change to a specific
// key (keyServersKeyServersKey)
krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue( serverIds ), Value() );
krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue( serverTags ), Value() );
for(int s=0; s<servers.size(); s++)
krmSetPreviouslyEmptyRange( tr, arena, serverKeysPrefixFor( servers[s].id() ), allKeys, serverKeysTrue, serverKeysFalse );

View File

@ -110,36 +110,29 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
state Reference<ApplierBatchData> batchData = self->batch[req.batchIndex];
// Assume: processedFileState[req.asset] will not be erased while the actor is active.
// Note: Inserting new items into processedFileState will not invalidate the reference.
state NotifiedVersion& curFilePos = batchData->processedFileState[req.asset];
state NotifiedVersion& curMsgIndex = batchData->processedFileState[req.asset];
TraceEvent(SevDebug, "FastRestoreApplierPhaseReceiveMutations", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("ProcessedFileVersion", curFilePos.get())
.detail("RestoreAssetMesssageIndex", curMsgIndex.get())
.detail("Request", req.toString())
.detail("CurrentMemory", getSystemStatistics().processMemory)
.detail("PreviousVersionBatchState", batchData->vbState.get());
wait(isSchedulable(self, req.batchIndex, __FUNCTION__));
wait(curFilePos.whenAtLeast(req.prevVersion));
wait(curMsgIndex.whenAtLeast(req.msgIndex - 1));
batchData->vbState = ApplierVersionBatchState::RECEIVE_MUTATIONS;
state bool isDuplicated = true;
if (curFilePos.get() == req.prevVersion) {
if (curMsgIndex.get() == req.msgIndex - 1) {
isDuplicated = false;
const Version commitVersion = req.version;
uint16_t numVersionStampedKV = 0;
// Sanity check: mutations in a range file are in [beginVersion, endVersion);
// mutations in a log file are in [beginVersion, endVersion], both endpoints inclusive.
ASSERT(commitVersion >= req.asset.beginVersion);
// Loader sends the endVersion to ensure all useful versions are sent
ASSERT(commitVersion <= req.asset.endVersion);
ASSERT(req.mutations.size() == req.subs.size());
ASSERT(req.mutations.size() == req.mVersions.size());
for (int mIndex = 0; mIndex < req.mutations.size(); mIndex++) {
const MutationRef& mutation = req.mutations[mIndex];
const LogMessageVersion mutationVersion(commitVersion, req.subs[mIndex]);
const LogMessageVersion mutationVersion(req.mVersions[mIndex]);
TraceEvent(SevFRMutationInfo, "FastRestoreApplierPhaseReceiveMutations", self->id())
.detail("RestoreAsset", req.asset.toString())
.detail("Version", mutationVersion.toString())
@ -150,26 +143,23 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
batchData->counters.receivedMutations += 1;
batchData->counters.receivedAtomicOps += isAtomicOp((MutationRef::Type)mutation.type) ? 1 : 0;
// Sanity check
ASSERT_WE_THINK(req.asset.isInVersionRange(mutationVersion.version));
ASSERT_WE_THINK(req.asset.isInKeyRange(mutation));
// Note: Log and range mutations may be delivered out of order. Can we handle it?
if (mutation.type == MutationRef::SetVersionstampedKey ||
mutation.type == MutationRef::SetVersionstampedValue) {
ASSERT(false); // No version stamp mutations in backup logs
batchData->addVersionStampedKV(mutation, mutationVersion, numVersionStampedKV);
numVersionStampedKV++;
} else {
batchData->addMutation(mutation, mutationVersion);
ASSERT(mutation.type != MutationRef::SetVersionstampedKey &&
mutation.type != MutationRef::SetVersionstampedValue);
}
}
curFilePos.set(req.version);
curMsgIndex.set(req.msgIndex);
}
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
TraceEvent(SevDebug, "FastRestoreApplierPhaseReceiveMutationsDone", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("ProcessedFileVersion", curFilePos.get())
.detail("ProcessedMessageIndex", curMsgIndex.get())
.detail("Request", req.toString());
return Void();
}
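
The handler above gives exactly-once, in-order application keyed by a per-asset message index: wait until msgIndex - 1 has been processed, apply only when this message is the next expected one, and otherwise report it as a duplicate. A minimal sketch of that shape outside flow, with a condition_variable standing in for NotifiedVersion (assumes each request runs on its own thread):

#include <condition_variable>
#include <cstdint>
#include <mutex>

// Block until the previous message index has been applied, then apply this
// message exactly once; later redeliveries are answered as duplicates.
struct OrderedApplier {
    std::mutex m;
    std::condition_variable cv;
    int64_t processed = 0; // highest message index applied; indices start at 1

    template <class Apply>
    bool handle(int64_t msgIndex, Apply apply) { // returns isDuplicated
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return processed >= msgIndex - 1; });
        if (processed != msgIndex - 1)
            return true; // already applied by an earlier delivery
        apply(); // apply this message's mutations
        processed = msgIndex;
        cv.notify_all();
        return false;
    }
};
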

View File

@ -107,8 +107,9 @@ struct StagingKey {
// TODO: Add SevError here
TraceEvent("SameVersion")
.detail("Version", version.toString())
.detail("Mutation", m.toString())
.detail("NewVersion", newVersion.toString());
.detail("NewVersion", newVersion.toString())
.detail("OldMutation", it->second.toString())
.detail("NewMutation", m.toString());
ASSERT(it->second.type == m.type && it->second.param1 == m.param1 && it->second.param2 == m.param2);
}
}
@ -282,26 +283,6 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
}
}
void addVersionStampedKV(MutationRef m, LogMessageVersion ver, uint16_t numVersionStampedKV) {
if (m.type == MutationRef::SetVersionstampedKey) {
// Assume transactionNumber = 0 does not affect result
TraceEvent(SevDebug, "FastRestoreApplierAddMutation")
.detail("MutationType", typeString[m.type])
.detail("FakedTransactionNumber", numVersionStampedKV);
transformVersionstampMutation(m, &MutationRef::param1, ver.version, numVersionStampedKV);
addMutation(m, ver);
} else if (m.type == MutationRef::SetVersionstampedValue) {
// Assume transactionNumber = 0 does not affect result
TraceEvent(SevDebug, "FastRestoreApplierAddMutation")
.detail("MutationType", typeString[m.type])
.detail("FakedTransactionNumber", numVersionStampedKV);
transformVersionstampMutation(m, &MutationRef::param2, ver.version, numVersionStampedKV);
addMutation(m, ver);
} else {
ASSERT(false);
}
}
// Return true if all staging keys have been precomputed
bool allKeysPrecomputed() {
for (auto& stagingKey : stagingKeys) {

View File

@ -343,7 +343,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
return results;
} catch (Error& e) {
TraceEvent(SevWarn, "FileRestoreCorruptRangeFileBlock")
TraceEvent(SevError, "FileRestoreCorruptRangeFileBlock")
.error(e)
.detail("Filename", file->getFilename())
.detail("BlockOffset", offset)
@ -388,7 +388,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeLogFileBlock(Reference<IA
return results;
} catch (Error& e) {
TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock")
TraceEvent(SevError, "FileRestoreCorruptLogFileBlock")
.error(e)
.detail("Filename", file->getFilename())
.detail("BlockOffset", offset)

View File

@ -413,8 +413,9 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
state int kvCount = 0;
state int splitMutationIndex = 0;
state std::vector<std::pair<UID, RestoreSendVersionedMutationsRequest>> requests;
state Version prevVersion = 0; // startVersion
state Version msgIndex = 1; // Monotonically increased index for send message, must start at 1
state std::vector<UID> applierIDs = getApplierIDs(*pRangeToApplier);
state double msgSize = 0; // size of mutations in the message
TraceEvent("FastRestoreLoaderSendMutationToApplier")
.detail("IsRangeFile", isRangeFile)
@ -439,11 +440,11 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
// applierMutationsBuffer is the mutation vector to be sent to each applier
// applierMutationsSize is buffered mutation vector size for each applier
state std::map<UID, MutationsVec> applierMutationsBuffer;
state std::map<UID, SubSequenceVec> applierSubsBuffer;
state std::map<UID, LogMessageVersionVec> applierVersionsBuffer;
state std::map<UID, double> applierMutationsSize;
for (auto& applierID : applierIDs) {
applierMutationsBuffer[applierID] = MutationsVec();
applierSubsBuffer[applierID] = SubSequenceVec();
applierVersionsBuffer[applierID] = LogMessageVersionVec();
applierMutationsSize[applierID] = 0.0;
}
for (kvOp = kvOps.begin(); kvOp != kvOps.end(); kvOp++) {
@ -458,7 +459,6 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
Standalone<VectorRef<UID>> nodeIDs;
// Using a vector of mutations adds overhead, and range mutations should be rare;
// we handle range mutations and key mutations differently to avoid memory copies
// WARNING: The splitMutation() may have bugs
splitMutation(pRangeToApplier, kvm, mvector.arena(), mvector.contents(), nodeIDs.arena(),
nodeIDs.contents());
ASSERT(mvector.size() == nodeIDs.size());
@ -475,16 +475,15 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
for (splitMutationIndex = 0; splitMutationIndex < mvector.size(); splitMutationIndex++) {
MutationRef mutation = mvector[splitMutationIndex];
UID applierID = nodeIDs[splitMutationIndex];
// printf("SPLITTED MUTATION: %d: mutation:%s applierID:%s\n", splitMutationIndex,
// mutation.toString().c_str(), applierID.toString().c_str());
if (debugMutation("RestoreLoader", commitVersion.version, mutation)) {
TraceEvent("SplittedMutation")
.detail("Version", commitVersion.toString())
.detail("Mutation", mutation.toString());
}
applierMutationsBuffer[applierID].push_back_deep(applierMutationsBuffer[applierID].arena(), mutation);
applierSubsBuffer[applierID].push_back(applierSubsBuffer[applierID].arena(), commitVersion.sub);
applierVersionsBuffer[applierID].push_back(applierVersionsBuffer[applierID].arena(), commitVersion);
applierMutationsSize[applierID] += mutation.expectedSize();
msgSize += mutation.expectedSize();
kvCount++;
}
@ -502,8 +501,9 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
.detail("Mutation", kvm.toString());
}
applierMutationsBuffer[applierID].push_back_deep(applierMutationsBuffer[applierID].arena(), kvm);
applierSubsBuffer[applierID].push_back(applierSubsBuffer[applierID].arena(), commitVersion.sub);
applierVersionsBuffer[applierID].push_back(applierVersionsBuffer[applierID].arena(), commitVersion);
applierMutationsSize[applierID] += kvm.expectedSize();
msgSize += kvm.expectedSize();
}
} // Mutations at the same LogMessageVersion
@ -511,26 +511,27 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
// changing the version comparison below.
auto next = std::next(kvOp, 1);
if (next == kvOps.end() || commitVersion.version < next->first.version) {
// if (next == kvOps.end() || msgSize >= SERVER_KNOBS->FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES) {
// TODO: Sanity-check that each asset has been received exactly once!
// Send the mutations to appliers for each version
for (const UID& applierID : applierIDs) {
requests.emplace_back(applierID, RestoreSendVersionedMutationsRequest(
batchIndex, asset, prevVersion, commitVersion.version, isRangeFile,
applierMutationsBuffer[applierID], applierSubsBuffer[applierID]));
requests.emplace_back(applierID,
RestoreSendVersionedMutationsRequest(batchIndex, asset, msgIndex, isRangeFile,
applierMutationsBuffer[applierID],
applierVersionsBuffer[applierID]));
}
TraceEvent(SevDebug, "FastRestoreLoaderSendMutationToApplier")
.detail("PrevVersion", prevVersion)
.detail("CommitVersion", commitVersion.toString())
.detail("MessageIndex", msgIndex)
.detail("RestoreAsset", asset.toString())
.detail("Requests", requests.size());
ASSERT(prevVersion < commitVersion.version);
prevVersion = commitVersion.version;
wait(sendBatchRequests(&RestoreApplierInterface::sendMutationVector, *pApplierInterfaces, requests,
TaskPriority::RestoreLoaderSendMutations));
msgIndex++;
msgSize = 0;
requests.clear();
for (auto& applierID : applierIDs) {
applierMutationsBuffer[applierID] = MutationsVec();
applierSubsBuffer[applierID] = SubSequenceVec();
applierVersionsBuffer[applierID] = LogMessageVersionVec();
applierMutationsSize[applierID] = 0.0;
}
}
@ -540,7 +541,6 @@ ACTOR Future<Void> sendMutationsToApplier(VersionedMutationsMap* pkvOps, int bat
return Void();
}
// TODO: Add a unit test for this function
void splitMutation(std::map<Key, UID>* pRangeToApplier, MutationRef m, Arena& mvector_arena,
VectorRef<MutationRef>& mvector, Arena& nodeIDs_arena, VectorRef<UID>& nodeIDs) {
TraceEvent(SevWarn, "FastRestoreSplitMutation").detail("Mutation", m.toString());

View File

@ -617,8 +617,7 @@ ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequest
ACTOR static Future<Version> collectBackupFiles(Reference<IBackupContainer> bc, std::vector<RestoreFileFR>* rangeFiles,
std::vector<RestoreFileFR>* logFiles, Database cx,
RestoreRequest request) {
state bool partitioned = wait(bc->isPartitionedBackup());
state BackupDescription desc = wait(partitioned ? bc->describePartitionedBackup() : bc->describeBackup());
state BackupDescription desc = wait(bc->describeBackup());
// Convert version to real time for operators to read the BackupDescription desc.
wait(desc.resolveVersionTimes(cx));
@ -634,8 +633,7 @@ ACTOR static Future<Version> collectBackupFiles(Reference<IBackupContainer> bc,
std::cout << "Restore to version: " << request.targetVersion << "\nBackupDesc: \n" << desc.toString() << "\n\n";
}
Optional<RestorableFileSet> restorable = wait(partitioned ? bc->getPartitionedRestoreSet(request.targetVersion)
: bc->getRestoreSet(request.targetVersion));
Optional<RestorableFileSet> restorable = wait(bc->getRestoreSet(request.targetVersion));
if (!restorable.present()) {
TraceEvent(SevWarn, "FastRestoreMasterPhaseCollectBackupFiles").detail("NotRestorable", request.targetVersion);

View File

@ -39,7 +39,7 @@
#define SevFRMutationInfo SevInfo
using MutationsVec = Standalone<VectorRef<MutationRef>>;
using SubSequenceVec = Standalone<VectorRef<uint32_t>>;
using LogMessageVersionVec = Standalone<VectorRef<LogMessageVersion>>;
enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier };
BINARY_SERIALIZABLE(RestoreRole);
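
For readers new to the element type introduced here: a LogMessageVersion pairs a commit version with a sub-sequence number, and mutations are ordered lexicographically on that pair. A minimal sketch of the ordering (illustrative, not the real class):

#include <cstdint>
#include <tuple>

struct LogMessageVersionSketch {
    int64_t version; // commit version
    uint32_t sub;    // sub-sequence within the version
    bool operator<(const LogMessageVersionSketch& r) const {
        return std::tie(version, sub) < std::tie(r.version, r.sub);
    }
};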

View File

@ -1966,14 +1966,14 @@ static std::string getIssueDescription(std::string name) {
}
static std::map<std::string, std::vector<JsonBuilderObject>> getProcessIssuesAsMessages(
std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> const& issues) {
std::vector<ProcessIssues> const& issues) {
std::map<std::string, std::vector<JsonBuilderObject>> issuesMap;
try {
for (auto processIssues : issues) {
for (auto issue : processIssues.second) {
for (auto issue : processIssues.issues) {
std::string issueStr = issue.toString();
issuesMap[processIssues.first.toString()].push_back(
issuesMap[processIssues.address.toString()].push_back(
JsonString::makeMessage(issueStr.c_str(), getIssueDescription(issueStr).c_str()));
}
}
@ -2163,7 +2163,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
Reference<AsyncVar<ServerDBInfo>> db,
Database cx,
vector<WorkerDetails> workers,
std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> workerIssues,
std::vector<ProcessIssues> workerIssues,
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* clientStatus,
ServerCoordinators coordinators,
std::vector<NetworkAddress> incompatibleConnections,

View File

@ -27,7 +27,14 @@
#include "fdbserver/MasterInterface.h"
#include "fdbclient/ClusterInterface.h"
Future<StatusReply> clusterGetStatus( Reference<AsyncVar<struct ServerDBInfo>> const& db, Database const& cx, vector<WorkerDetails> const& workers, std::vector<std::pair<NetworkAddress, Standalone<VectorRef<StringRef>>>> const& workerIssues,
struct ProcessIssues {
NetworkAddress address;
Standalone<VectorRef<StringRef>> issues;
ProcessIssues(NetworkAddress address, Standalone<VectorRef<StringRef>> issues) : address(address), issues(issues) {}
};
Future<StatusReply> clusterGetStatus( Reference<AsyncVar<struct ServerDBInfo>> const& db, Database const& cx, vector<WorkerDetails> const& workers, std::vector<ProcessIssues> const& workerIssues,
std::map<NetworkAddress, std::pair<double, OpenDatabaseRequest>>* const& clientStatus, ServerCoordinators const& coordinators, std::vector<NetworkAddress> const& incompatibleConnections, Version const& datacenterVersionDifference );
#endif
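
The gain over the old std::pair is purely readability at the call sites: named members instead of .first/.second. A tiny self-contained illustration with stand-in types (not the real NetworkAddress or Standalone<VectorRef<StringRef>>):

#include <string>
#include <vector>

struct ProcessIssuesSketch {
    std::string address;             // previously pair.first
    std::vector<std::string> issues; // previously pair.second
};

int countIssues(const std::vector<ProcessIssuesSketch>& all) {
    int n = 0;
    for (const auto& p : all)
        for (const auto& issue : p.issues)
            n += !issue.empty(); // p.address and issue are now self-describing
    return n;
}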

View File

@ -120,7 +120,7 @@ struct ClusterControllerFullInterface {
RequestStream< struct RegisterWorkerRequest > registerWorker;
RequestStream< struct GetWorkersRequest > getWorkers;
RequestStream< struct RegisterMasterRequest > registerMaster;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo;
RequestStream< struct GetServerDBInfoRequest > getServerDBInfo; // only used by testers; the cluster controller will send the serverDBInfo to workers
UID id() const { return clientInterface.id(); }
bool operator == (ClusterControllerFullInterface const& r) const { return id() == r.id(); }

View File

@ -1872,7 +1872,7 @@ int main(int argc, char* argv[]) {
} else { // Call fdbd roles in conventional way
ASSERT(opts.connectionFile);
setupSlowTaskProfiler();
setupRunLoopProfiler();
auto dataFolder = opts.dataFolder;
if (!dataFolder.size())
@ -1898,7 +1898,7 @@ int main(int argc, char* argv[]) {
opts.localities));
g_network->run();
} else if (role == ConsistencyCheck) {
setupSlowTaskProfiler();
setupRunLoopProfiler();
auto m = startSystemMonitor(opts.dataFolder, opts.zoneId, opts.zoneId);
f = stopAfter(runTests(opts.connectionFile, TEST_TYPE_CONSISTENCY_CHECK, TEST_HERE, 1, opts.testFile,

View File

@ -1048,7 +1048,7 @@ ACTOR Future<Void> workerServer(
ServerDBInfo localInfo = BinaryReader::fromStringRef<ServerDBInfo>(req.serializedDbInfo, AssumeVersion(currentProtocolVersion));
localInfo.myLocality = locality;
if(ccInterface->get().present() && localInfo.infoGeneration < dbInfo->get().infoGeneration && dbInfo->get().clusterInterface == ccInterface->get().get()) {
if(localInfo.infoGeneration < dbInfo->get().infoGeneration && localInfo.clusterInterface == dbInfo->get().clusterInterface) {
std::vector<Endpoint> rep = req.broadcastInfo;
rep.push_back(interf.updateServerDBInfo.getEndpoint());
req.reply.send(rep);
@ -1057,7 +1057,7 @@ ACTOR Future<Void> workerServer(
if(!ccInterface->get().present() || localInfo.clusterInterface != ccInterface->get().get()) {
notUpdated = interf.updateServerDBInfo.getEndpoint();
}
if(ccInterface->get().present() && localInfo.clusterInterface == ccInterface->get().get() && (localInfo.infoGeneration > dbInfo->get().infoGeneration || dbInfo->get().clusterInterface != ccInterface->get().get())) {
else if(localInfo.infoGeneration > dbInfo->get().infoGeneration || dbInfo->get().clusterInterface != ccInterface->get().get()) {
TraceEvent("GotServerDBInfoChange").detail("ChangeID", localInfo.id).detail("MasterID", localInfo.master.id())
.detail("RatekeeperID", localInfo.ratekeeper.present() ? localInfo.ratekeeper.get().id() : UID())

View File

@ -21,6 +21,7 @@
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
@ -213,9 +214,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
state bool restorable = false;
if (lastBackupContainer) {
state Future<BackupDescription> fdesc = self->usePartitionedLogs
? lastBackupContainer->describePartitionedBackup()
: lastBackupContainer->describeBackup();
state Future<BackupDescription> fdesc = lastBackupContainer->describeBackup();
wait(ready(fdesc));
if(!fdesc.isError()) {
@ -423,6 +422,11 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
// wait(attemptDirtyRestore(self, cx, &backupAgent, StringRef(lastBackupContainer->getURL()),
// randomID));
}
// We must ensure no backup workers are running; otherwise, the DB clear below
// can be picked up by backup workers and applied during restore.
wait(success(changeConfig(cx, "backup_worker_enabled:=0", true)));
// Clear DB before restore
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
for (auto& kvrange : self->backupRanges) tr->clear(kvrange);
@ -436,14 +440,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
.detail("BackupTag", printable(self->backupTag));
auto container = IBackupContainer::openContainer(lastBackupContainer->getURL());
BackupDescription desc = wait(self->usePartitionedLogs ? container->describePartitionedBackup()
: container->describeBackup());
TraceEvent("BAFRW_Restore", randomID)
.detail("LastBackupContainer", lastBackupContainer->getURL())
.detail("MinRestorableVersion", desc.minRestorableVersion.get())
.detail("MaxRestorableVersion", desc.maxRestorableVersion.get())
.detail("ContiguousLogEnd", desc.contiguousLogEnd.get());
BackupDescription desc = wait(container->describeBackup());
ASSERT(self->usePartitionedLogs == desc.partitioned);
state Version targetVersion = -1;
if (desc.maxRestorableVersion.present()) {
@ -463,6 +461,13 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
}
}
TraceEvent("BAFRW_Restore", randomID)
.detail("LastBackupContainer", lastBackupContainer->getURL())
.detail("MinRestorableVersion", desc.minRestorableVersion.get())
.detail("MaxRestorableVersion", desc.maxRestorableVersion.get())
.detail("ContiguousLogEnd", desc.contiguousLogEnd.get())
.detail("TargetVersion", targetVersion);
state std::vector<Future<Version>> restores;
state std::vector<Standalone<StringRef>> restoreTags;

View File

@ -667,7 +667,9 @@ struct ConsistencyCheckWorkload : TestWorkload
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state int bytesReadInRange = 0;
decodeKeyServersValue(keyLocations[shard].value, sourceStorageServers, destStorageServers);
Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
decodeKeyServersValue(UIDtoTagMap, keyLocations[shard].value, sourceStorageServers, destStorageServers);
//If the destStorageServers is non-empty, then this shard is being relocated
state bool isRelocating = destStorageServers.size() > 0;

View File

@ -36,7 +36,7 @@ struct SlowTaskWorkload : TestWorkload {
}
virtual Future<Void> start(Database const& cx) {
setupSlowTaskProfiler();
setupRunLoopProfiler();
return go();
}

View File

@ -87,7 +87,6 @@ set(FLOW_SRCS
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h)
add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})
target_include_directories(flow SYSTEM PUBLIC ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(flow PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
if (NOT APPLE AND NOT WIN32)
set (FLOW_LIBS ${FLOW_LIBS} rt)

View File

@ -78,7 +78,7 @@ class ThreadPool : public IThreadPool, public ReferenceCounted<ThreadPool> {
public:
ThreadPool() : dontstop(ios), mode(Run) {}
~ThreadPool() {}
Future<Void> stop() {
Future<Void> stop(Error const& e = success()) {
if (mode == Shutdown) return Void();
ReferenceCounted<ThreadPool>::addref();
ios.stop(); // doesn't work?

View File

@ -60,7 +60,7 @@ public:
virtual Future<Void> getError() = 0; // asynchronously throws an error if there is an internal error
virtual void addThread( IThreadPoolReceiver* userData ) = 0;
virtual void post( PThreadAction action ) = 0;
virtual Future<Void> stop() = 0;
virtual Future<Void> stop(Error const& e = success()) = 0;
virtual bool isCoro() const { return false; }
virtual void addref() = 0;
virtual void delref() = 0;

View File

@ -48,9 +48,13 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( DISABLE_ASSERTS, 0 );
init( QUEUE_MODEL_SMOOTHING_AMOUNT, 2.0 );
init( SLOWTASK_PROFILING_INTERVAL, 0.125 ); // A value of 0 disables SlowTask profiling
init( RUN_LOOP_PROFILING_INTERVAL, 0.125 ); // A value of 0 disables run loop profiling
init( SLOWTASK_PROFILING_LOG_INTERVAL, 0 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
init( SLOWTASK_PROFILING_MAX_LOG_INTERVAL, 1.0 );
init( SLOWTASK_PROFILING_LOG_BACKOFF, 2.0 );
init( SATURATION_PROFILING_LOG_INTERVAL, 0.5 ); // A value of 0 means use RUN_LOOP_PROFILING_INTERVAL
init( SATURATION_PROFILING_MAX_LOG_INTERVAL, 5.0 );
init( SATURATION_PROFILING_LOG_BACKOFF, 2.0 );
init( RANDOMSEED_RETRY_LIMIT, 4 );
init( FAST_ALLOC_LOGGING_BYTES, 10e6 );
@ -129,6 +133,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( SLOW_LOOP_CUTOFF, 15.0 / 1000.0 );
init( SLOW_LOOP_SAMPLING_RATE, 0.1 );
init( TSC_YIELD_TIME, 1000000 );
init( MIN_LOGGED_PRIORITY_BUSY_FRACTION, 0.05 );
init( CERT_FILE_MAX_SIZE, 5 * 1024 * 1024 );
//Network
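
With the profiling knob defaults above, the slow-task log interval backs off geometrically from RUN_LOOP_PROFILING_INTERVAL (a LOG_INTERVAL of 0 defers to it) by a factor of SLOWTASK_PROFILING_LOG_BACKOFF, capped at SLOWTASK_PROFILING_MAX_LOG_INTERVAL. A small worked example of the resulting sequence:

#include <algorithm>
#include <cstdio>

int main() {
    double interval = 0.125; // RUN_LOOP_PROFILING_INTERVAL
    for (int i = 0; i < 5; ++i) {
        std::printf("%g ", interval); // prints: 0.125 0.25 0.5 1 1
        interval = std::min(1.0 /*MAX_LOG_INTERVAL*/, 2.0 /*LOG_BACKOFF*/ * interval);
    }
    return 0;
}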

View File

@ -69,10 +69,14 @@ public:
double HUGE_ARENA_LOGGING_BYTES;
double HUGE_ARENA_LOGGING_INTERVAL;
//slow task profiling
double SLOWTASK_PROFILING_INTERVAL;
//run loop profiling
double RUN_LOOP_PROFILING_INTERVAL;
double SLOWTASK_PROFILING_LOG_INTERVAL;
double SLOWTASK_PROFILING_MAX_LOG_INTERVAL;
double SLOWTASK_PROFILING_LOG_BACKOFF;
double SATURATION_PROFILING_LOG_INTERVAL;
double SATURATION_PROFILING_MAX_LOG_INTERVAL;
double SATURATION_PROFILING_LOG_BACKOFF;
//connectionMonitor
double CONNECTION_MONITOR_LOOP_TIME;
@ -147,6 +151,7 @@ public:
double SLOW_LOOP_SAMPLING_RATE;
int64_t TSC_YIELD_TIME;
int64_t REACTOR_FLAGS;
double MIN_LOGGED_PRIORITY_BUSY_FRACTION;
int CERT_FILE_MAX_SIZE;
//Network

View File

@ -57,7 +57,8 @@ using namespace boost::asio::ip;
#if defined(__linux__)
#include <execinfo.h>
std::atomic<int64_t> net2liveness(0);
std::atomic<int64_t> net2RunLoopIterations(0);
std::atomic<int64_t> net2RunLoopSleeps(0);
volatile size_t net2backtraces_max = 10000;
volatile void** volatile net2backtraces = NULL;
@ -171,7 +172,7 @@ public:
INetworkConnections *network; // initially this, but can be changed
int64_t tsc_begin, tsc_end;
int64_t tscBegin, tscEnd;
double taskBegin;
TaskPriority currentTaskID;
uint64_t tasksIssued;
@ -182,7 +183,7 @@ public:
uint64_t numYields;
TaskPriority lastMinTaskID;
NetworkMetrics::PriorityStats* lastPriorityStats;
std::priority_queue<OrderedTask, std::vector<OrderedTask>> ready;
ThreadSafeQueue<OrderedTask> threadReady;
@ -195,9 +196,9 @@ public:
std::priority_queue<DelayedTask, std::vector<DelayedTask>> timers;
void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority);
bool check_yield(TaskPriority taskId, bool isRunLoop);
bool check_yield(TaskPriority taskId, int64_t tscNow);
void processThreadReady();
void trackMinPriority( TaskPriority minTaskID, double now );
void trackAtPriority( TaskPriority priority, double now );
void stopImmediately() {
stopped=true; decltype(ready) _1; ready.swap(_1); decltype(timers) _2; timers.swap(_2);
}
@ -221,7 +222,7 @@ public:
Int64MetricHandle countYieldCalls;
Int64MetricHandle countYieldCallsTrue;
Int64MetricHandle countASIOEvents;
Int64MetricHandle countSlowTaskSignals;
Int64MetricHandle countRunLoopProfilingSignals;
Int64MetricHandle countTLSPolicyFailures;
Int64MetricHandle priorityMetric;
DoubleMetricHandle countLaunchTime;
@ -863,9 +864,9 @@ Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics)
stopped(false),
tasksIssued(0),
// Until run() is called, yield() will always yield
tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield),
lastMinTaskID(TaskPriority::Zero),
tscBegin(0), tscEnd(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield),
numYields(0),
lastPriorityStats(nullptr),
tlsInitialized(false),
tlsConfig(tlsConfig)
#ifndef TLS_DISABLED
@ -887,13 +888,7 @@ Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics)
setGlobal(INetwork::enEventFD, (flowGlobalType) N2::ASIOReactor::newEventFD(reactor));
#endif
int priBins[] = { 1, 2050, 3050, 4050, 4950, 5050, 7050, 8050, 10050 };
static_assert( sizeof(priBins) == sizeof(int)*NetworkMetrics::PRIORITY_BINS, "Fix priority bins");
for(int i=0; i<NetworkMetrics::PRIORITY_BINS; i++)
networkInfo.metrics.priorityBins[i] = static_cast<TaskPriority>(priBins[i]);
updateNow();
}
#ifndef TLS_DISABLED
@ -1009,7 +1004,7 @@ void Net2::initMetrics() {
countYieldCalls.init(LiteralStringRef("Net2.CountYieldCalls"));
countASIOEvents.init(LiteralStringRef("Net2.CountASIOEvents"));
countYieldCallsTrue.init(LiteralStringRef("Net2.CountYieldCallsTrue"));
countSlowTaskSignals.init(LiteralStringRef("Net2.CountSlowTaskSignals"));
countRunLoopProfilingSignals.init(LiteralStringRef("Net2.CountRunLoopProfilingSignals"));
countTLSPolicyFailures.init(LiteralStringRef("Net2.CountTLSPolicyFailures"));
priorityMetric.init(LiteralStringRef("Net2.Priority"));
awakeMetric.init(LiteralStringRef("Net2.Awake"));
@ -1047,13 +1042,14 @@ void Net2::run() {
++countRunLoop;
if (runFunc) {
tsc_begin = __rdtsc();
tscBegin = __rdtsc();
taskBegin = nnow;
trackMinPriority(TaskPriority::RunCycleFunction, taskBegin);
trackAtPriority(TaskPriority::RunCycleFunction, taskBegin);
runFunc();
double taskEnd = timer_monotonic();
trackAtPriority(TaskPriority::RunLoop, taskEnd);
countLaunchTime += taskEnd - taskBegin;
checkForSlowTask(tsc_begin, __rdtsc(), taskEnd - taskBegin, TaskPriority::RunCycleFunction);
checkForSlowTask(tscBegin, __rdtsc(), taskEnd - taskBegin, TaskPriority::RunCycleFunction);
}
double sleepTime = 0;
@ -1070,7 +1066,12 @@ void Net2::run() {
sleepTime = timers.top().at - sleepStart; // + 500e-6?
}
if (sleepTime > 0) {
trackMinPriority(TaskPriority::Zero, sleepStart);
#if defined(__linux__)
// notify the run loop monitoring thread that we have gone idle
net2RunLoopSleeps.fetch_add(1);
#endif
trackAtPriority(TaskPriority::Zero, sleepStart);
awakeMetric = false;
priorityMetric = 0;
reactor.sleep(sleepTime);
@ -1078,16 +1079,17 @@ void Net2::run() {
}
}
tsc_begin = __rdtsc();
tscBegin = __rdtsc();
taskBegin = timer_monotonic();
trackMinPriority(TaskPriority::ASIOReactor, taskBegin);
trackAtPriority(TaskPriority::ASIOReactor, taskBegin);
reactor.react();
updateNow();
double now = this->currentTime;
trackAtPriority(TaskPriority::RunLoop, now);
countReactTime += now - taskBegin;
checkForSlowTask(tsc_begin, __rdtsc(), now - taskBegin, TaskPriority::ASIOReactor);
checkForSlowTask(tscBegin, __rdtsc(), now - taskBegin, TaskPriority::ASIOReactor);
if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE)
TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow);
@ -1104,8 +1106,8 @@ void Net2::run() {
processThreadReady();
tsc_begin = __rdtsc();
tsc_end = tsc_begin + FLOW_KNOBS->TSC_YIELD_TIME;
tscBegin = __rdtsc();
tscEnd = tscBegin + FLOW_KNOBS->TSC_YIELD_TIME;
taskBegin = timer_monotonic();
numYields = 0;
TaskPriority minTaskID = TaskPriority::Max;
@ -1115,8 +1117,11 @@ void Net2::run() {
while (!ready.empty()) {
++countTasks;
currentTaskID = ready.top().taskID;
if(currentTaskID < minTaskID) {
trackAtPriority(currentTaskID, taskBegin);
minTaskID = currentTaskID;
}
priorityMetric = static_cast<int64_t>(currentTaskID);
minTaskID = std::min(minTaskID, currentTaskID);
Task* task = ready.top().task;
ready.pop();
@ -1128,19 +1133,26 @@ void Net2::run() {
TraceEvent(SevError, "TaskError").error(unknown_error());
}
if (check_yield(TaskPriority::Max, true)) {
int64_t tscNow = __rdtsc(); // keep full TSC precision; check_yield takes int64_t
double newTaskBegin = timer_monotonic();
if (check_yield(TaskPriority::Max, tscNow)) {
checkForSlowTask(tscBegin, tscNow, newTaskBegin - taskBegin, currentTaskID);
FDB_TRACE_PROBE(run_loop_yield);
++countYields;
break;
}
taskBegin = newTaskBegin;
tscBegin = tscNow;
}
trackAtPriority(TaskPriority::RunLoop, taskBegin);
queueSize = ready.size();
FDB_TRACE_PROBE(run_loop_done, queueSize);
trackMinPriority(minTaskID, now);
#if defined(__linux__)
if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
if(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL > 0) {
sigset_t orig_set;
pthread_sigmask(SIG_BLOCK, &sigprof_set, &orig_set);
@ -1148,7 +1160,7 @@ void Net2::run() {
bool was_overflow = net2backtraces_overflow;
int signal_count = net2backtraces_count;
countSlowTaskSignals += signal_count;
countRunLoopProfilingSignals += signal_count;
if (other_offset) {
volatile void** _traces = net2backtraces;
@ -1164,7 +1176,7 @@ void Net2::run() {
pthread_sigmask(SIG_SETMASK, &orig_set, NULL);
if (was_overflow) {
TraceEvent("Net2SlowTaskOverflow")
TraceEvent("Net2RunLoopProfilerOverflow")
.detail("SignalsReceived", signal_count)
.detail("BackTraceHarvested", other_offset != 0);
}
@ -1172,13 +1184,13 @@ void Net2::run() {
size_t iter_offset = 0;
while (iter_offset < other_offset) {
ProfilingSample *ps = (ProfilingSample *)(other_backtraces + iter_offset);
TraceEvent(SevWarn, "Net2SlowTaskTrace").detailf("TraceTime", "%.6f", ps->timestamp).detail("Trace", platform::format_backtrace(ps->frames, ps->length));
TraceEvent(SevWarn, "Net2RunLoopTrace").detailf("TraceTime", "%.6f", ps->timestamp).detail("Trace", platform::format_backtrace(ps->frames, ps->length));
iter_offset += ps->length + 2;
}
}
// to keep the thread liveness check happy
net2liveness.fetch_add(1);
// notify the run loop monitoring thread that we are making progress
net2RunLoopIterations.fetch_add(1);
}
#endif
nnow = timer_monotonic();
@ -1192,24 +1204,43 @@ void Net2::run() {
#endif
}
void Net2::trackMinPriority( TaskPriority minTaskID, double now ) {
if (minTaskID != lastMinTaskID) {
for(int c=0; c<NetworkMetrics::PRIORITY_BINS; c++) {
TaskPriority pri = networkInfo.metrics.priorityBins[c];
if (pri > minTaskID && pri <= lastMinTaskID) { // busy -> idle
networkInfo.metrics.priorityBlocked[c] = false;
networkInfo.metrics.priorityBlockedDuration[c] += now - networkInfo.metrics.windowedPriorityTimer[c];
networkInfo.metrics.priorityMaxBlockedDuration[c] = std::max(networkInfo.metrics.priorityMaxBlockedDuration[c], now - networkInfo.metrics.priorityTimer[c]);
void Net2::trackAtPriority( TaskPriority priority, double now ) {
if (lastPriorityStats == nullptr || priority != lastPriorityStats->priority) {
// Start tracking current priority
auto activeStatsItr = networkInfo.metrics.activeTrackers.try_emplace(priority, priority);
activeStatsItr.first->second.active = true;
activeStatsItr.first->second.windowedTimer = now;
if(lastPriorityStats != nullptr) {
// Stop tracking previous priority
lastPriorityStats->active = false;
lastPriorityStats->duration += now - lastPriorityStats->windowedTimer;
}
if (pri <= minTaskID && pri > lastMinTaskID) { // idle -> busy
networkInfo.metrics.priorityBlocked[c] = true;
networkInfo.metrics.priorityTimer[c] = now;
networkInfo.metrics.windowedPriorityTimer[c] = now;
// Update starvation trackers
TaskPriority lastPriority = (lastPriorityStats == nullptr) ? TaskPriority::Zero : lastPriorityStats->priority;
for(auto& binStats : networkInfo.metrics.starvationTrackers) {
if(binStats.priority > lastPriority && binStats.priority > priority) {
break;
}
// Busy -> idle at binStats.priority
if(binStats.priority > priority && binStats.priority <= lastPriority) {
binStats.active = false;
binStats.duration += now - binStats.windowedTimer;
binStats.maxDuration = std::max(binStats.maxDuration, now - binStats.timer);
}
// Idle -> busy at binStats.priority
else if(binStats.priority <= priority && binStats.priority > lastPriority) {
binStats.active = true;
binStats.timer = now;
binStats.windowedTimer = now;
}
}
lastMinTaskID = minTaskID;
lastPriorityStats = &activeStatsItr.first->second;
}
}
void Net2::processThreadReady() {
@ -1241,7 +1272,8 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, T
slowTaskMetric->log();
double sampleRate = std::min(1.0, (elapsed > warnThreshold) ? 1.0 : elapsed / 10e9);
if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0 && duration > FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL) {
double slowTaskProfilingLogInterval = std::max(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL);
if(slowTaskProfilingLogInterval > 0 && duration > slowTaskProfilingLogInterval) {
sampleRate = 1; // Always include slow task events that could show up in our slow task profiling.
}
@ -1250,12 +1282,8 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, T
}
}
bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) {
if(!isRunLoop && numYields > 0) {
++numYields;
return true;
}
bool Net2::check_yield( TaskPriority taskID, int64_t tscNow ) {
// SOMEDAY: Yield if there are lots of higher priority tasks queued?
if ((g_stackYieldLimit) && ( (intptr_t)&taskID < g_stackYieldLimit )) {
++countYieldBigStack;
return true;
@ -1268,35 +1296,31 @@ bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) {
return true;
}
// SOMEDAY: Yield if there are lots of higher priority tasks queued?
int64_t tsc_now = __rdtsc();
double newTaskBegin = timer_monotonic();
if (tsc_now < tsc_begin) {
if (tscNow < tscBegin) {
return true;
}
if(isRunLoop) {
checkForSlowTask(tsc_begin, tsc_now, newTaskBegin-taskBegin, currentTaskID);
}
if (tsc_now > tsc_end) {
if (tscNow > tscEnd) {
++numYields;
return true;
}
taskBegin = newTaskBegin;
tsc_begin = tsc_now;
return false;
}
bool Net2::check_yield( TaskPriority taskID ) {
return check_yield(taskID, false);
if(numYields > 0) {
++numYields;
return true;
}
return check_yield(taskID, __rdtsc());
}
Future<class Void> Net2::yield( TaskPriority taskID ) {
++countYieldCalls;
if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID;
if (check_yield(taskID, false)) {
if (check_yield(taskID)) {
++countYieldCallsTrue;
return delay(0, taskID);
}
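
The refactored check_yield centralizes the TSC budget: each task slice records tscBegin when it starts and may run until tscBegin + TSC_YIELD_TIME ticks, and a counter that moved backwards (thread migration, power events) also forces a yield. A condensed sketch of that budget logic, assuming x86 with GCC/Clang intrinsics:

#include <cstdint>
#include <x86intrin.h>

struct YieldBudgetSketch {
    int64_t tscBegin = 0, tscEnd = 0;
    void startSlice(int64_t budgetTicks) {
        tscBegin = __rdtsc();
        tscEnd = tscBegin + budgetTicks; // analogous to FLOW_KNOBS->TSC_YIELD_TIME
    }
    bool shouldYield() const {
        int64_t now = __rdtsc();
        // A backwards-moving TSC is unreliable, so yield defensively;
        // otherwise yield once the slice's tick budget is spent.
        return now < tscBegin || now > tscEnd;
    }
};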

View File

@ -2867,7 +2867,8 @@ extern volatile size_t net2backtraces_offset;
extern volatile size_t net2backtraces_max;
extern volatile bool net2backtraces_overflow;
extern volatile int64_t net2backtraces_count;
extern std::atomic<int64_t> net2liveness;
extern std::atomic<int64_t> net2RunLoopIterations;
extern std::atomic<int64_t> net2RunLoopSleeps;
extern void initProfiling();
std::atomic<double> checkThreadTime;
@ -2953,28 +2954,64 @@ void* checkThread(void *arg) {
pthread_t mainThread = *(pthread_t*)arg;
free(arg);
int64_t lastValue = net2liveness.load();
double lastSignal = 0;
double logInterval = FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL;
int64_t lastRunLoopIterations = net2RunLoopIterations.load();
int64_t lastRunLoopSleeps = net2RunLoopSleeps.load();
double lastSlowTaskSignal = 0;
double lastSaturatedSignal = 0;
const double minSlowTaskLogInterval = std::max(FLOW_KNOBS->SLOWTASK_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
const double minSaturationLogInterval = std::max(FLOW_KNOBS->SATURATION_PROFILING_LOG_INTERVAL, FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
double slowTaskLogInterval = minSlowTaskLogInterval;
double saturatedLogInterval = minSaturationLogInterval;
while(true) {
threadSleep(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL);
int64_t currentLiveness = net2liveness.load();
if(lastValue == currentLiveness) {
threadSleep(FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
int64_t currentRunLoopIterations = net2RunLoopIterations.load();
int64_t currentRunLoopSleeps = net2RunLoopSleeps.load();
bool slowTask = lastRunLoopIterations == currentRunLoopIterations;
bool saturated = lastRunLoopSleeps == currentRunLoopSleeps;
if(slowTask) {
double t = timer();
if(lastSignal == 0 || t - lastSignal >= logInterval) {
if(lastSignal > 0) {
logInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * logInterval);
if(lastSlowTaskSignal == 0 || t - lastSlowTaskSignal >= slowTaskLogInterval) {
if(lastSlowTaskSignal > 0) {
slowTaskLogInterval = std::min(FLOW_KNOBS->SLOWTASK_PROFILING_MAX_LOG_INTERVAL, FLOW_KNOBS->SLOWTASK_PROFILING_LOG_BACKOFF * slowTaskLogInterval);
}
lastSignal = t;
checkThreadTime.store(lastSignal);
lastSlowTaskSignal = t;
checkThreadTime.store(lastSlowTaskSignal);
pthread_kill(mainThread, SIGPROF);
}
}
else {
lastValue = currentLiveness;
lastSignal = 0;
logInterval = FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL;
lastSlowTaskSignal = 0;
lastRunLoopIterations = currentRunLoopIterations;
slowTaskLogInterval = minSlowTaskLogInterval;
}
if(saturated) {
double t = timer();
if(lastSaturatedSignal == 0 || t - lastSaturatedSignal >= saturatedLogInterval) {
if(lastSaturatedSignal > 0) {
saturatedLogInterval = std::min(FLOW_KNOBS->SATURATION_PROFILING_MAX_LOG_INTERVAL, FLOW_KNOBS->SATURATION_PROFILING_LOG_BACKOFF * saturatedLogInterval);
}
lastSaturatedSignal = t;
if(!slowTask) {
checkThreadTime.store(lastSaturatedSignal);
pthread_kill(mainThread, SIGPROF);
}
}
}
else {
lastSaturatedSignal = 0;
lastRunLoopSleeps = currentRunLoopSleeps;
saturatedLogInterval = minSaturationLogInterval;
}
}
return NULL;
@ -3000,10 +3037,10 @@ void fdb_probe_actor_exit(const char* name, unsigned long id, int index) {
#endif
void setupSlowTaskProfiler() {
void setupRunLoopProfiler() {
#ifdef __linux__
if (!profileThread && FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) {
TraceEvent("StartingSlowTaskProfilingThread").detail("Interval", FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL);
if (!profileThread && FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL > 0) {
TraceEvent("StartingRunLoopProfilingThread").detail("Interval", FLOW_KNOBS->RUN_LOOP_PROFILING_INTERVAL);
initProfiling();
profileThread = true;
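
The watchdog pattern behind checkThread, in miniature: the run loop bumps an atomic counter each iteration; a monitor thread polls it, and if the counter has not moved, the main thread is stuck in one task, so the monitor signals it to capture a backtrace. A Linux-only sketch omitting the knob plumbing and the log backoff shown above:

#include <atomic>
#include <pthread.h>
#include <signal.h>
#include <unistd.h>

std::atomic<long> progress{0}; // incremented by the monitored loop each iteration

void* watchdog(void* arg) {
    pthread_t mainThread = *static_cast<pthread_t*>(arg);
    long last = progress.load();
    for (;;) {
        usleep(125000); // poll interval, analogous to RUN_LOOP_PROFILING_INTERVAL
        long cur = progress.load();
        if (cur == last)
            pthread_kill(mainThread, SIGPROF); // stuck: sample a backtrace
        last = cur;
    }
    return nullptr;
}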

View File

@ -619,7 +619,7 @@ EXTERNC void flushAndExit(int exitCode);
void platformInit();
void registerCrashHandler();
void setupSlowTaskProfiler();
void setupRunLoopProfiler();
EXTERNC void setProfilingEnabled(int enabled);
// Use _exit() or criticalError(), not exit()

View File

@ -126,7 +126,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
.detail("Yields", netData.countYields - statState->networkState.countYields)
.detail("YieldCalls", netData.countYieldCalls - statState->networkState.countYieldCalls)
.detail("YieldCallsTrue", netData.countYieldCallsTrue - statState->networkState.countYieldCallsTrue)
.detail("SlowTaskSignals", netData.countSlowTaskSignals - statState->networkState.countSlowTaskSignals)
.detail("RunLoopProfilingSignals", netData.countRunLoopProfilingSignals - statState->networkState.countRunLoopProfilingSignals)
.detail("YieldBigStack", netData.countYieldBigStack - statState->networkState.countYieldBigStack)
.detail("RunLoopIterations", netData.countRunLoop - statState->networkState.countRunLoop)
.detail("TimersExecuted", netData.countTimers - statState->networkState.countTimers)
@ -148,17 +148,36 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta
}
}
for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkInfo.metrics.priorityBins[i] != TaskPriority::Zero; i++) {
if(g_network->networkInfo.metrics.priorityBlocked[i]) {
g_network->networkInfo.metrics.priorityBlockedDuration[i] += now() - g_network->networkInfo.metrics.windowedPriorityTimer[i];
g_network->networkInfo.metrics.priorityMaxBlockedDuration[i] = std::max(g_network->networkInfo.metrics.priorityMaxBlockedDuration[i], now() - g_network->networkInfo.metrics.priorityTimer[i]);
g_network->networkInfo.metrics.windowedPriorityTimer[i] = now();
std::map<TaskPriority, double> loggedDurations;
for (auto &itr : g_network->networkInfo.metrics.activeTrackers) {
if(itr.second.active) {
itr.second.duration += now() - itr.second.windowedTimer;
itr.second.windowedTimer = now();
}
n.detail(format("PriorityBusy%d", g_network->networkInfo.metrics.priorityBins[i]).c_str(), std::min(currentStats.elapsed, g_network->networkInfo.metrics.priorityBlockedDuration[i] - statState->networkMetricsState.priorityBlockedDuration[i]));
n.detail(format("PriorityMaxBusy%d", g_network->networkInfo.metrics.priorityBins[i]).c_str(), g_network->networkInfo.metrics.priorityMaxBlockedDuration[i]);
if(itr.second.duration / currentStats.elapsed >= FLOW_KNOBS->MIN_LOGGED_PRIORITY_BUSY_FRACTION) {
loggedDurations[itr.first] = std::min(currentStats.elapsed, itr.second.duration);
}
g_network->networkInfo.metrics.priorityMaxBlockedDuration[i] = 0;
itr.second.duration = 0;
}
for (auto const& itr : loggedDurations) {
n.detail(format("PriorityBusy%d", itr.first).c_str(), itr.second);
}
for (auto &itr : g_network->networkInfo.metrics.starvationTrackers) {
if(itr.active) {
itr.duration += now() - itr.windowedTimer;
itr.maxDuration = std::max(itr.maxDuration, now() - itr.timer);
itr.windowedTimer = now();
}
n.detail(format("PriorityStarvedBelow%d", itr.priority).c_str(), std::min(currentStats.elapsed, itr.duration));
n.detail(format("PriorityMaxStarvedBelow%d", itr.priority).c_str(), itr.maxDuration);
itr.duration = 0;
itr.maxDuration = 0;
}
n.trackLatest("NetworkMetrics");

View File

@ -62,7 +62,7 @@ struct NetworkData {
int64_t countYieldCalls;
int64_t countASIOEvents;
int64_t countYieldCallsTrue;
int64_t countSlowTaskSignals;
int64_t countRunLoopProfilingSignals;
int64_t countFileLogicalWrites;
int64_t countFileLogicalReads;
int64_t countAIOSubmit;
@ -104,7 +104,7 @@ struct NetworkData {
countYieldCalls = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldCalls"));
countASIOEvents = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountASIOEvents"));
countYieldCallsTrue = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountYieldCallsTrue"));
countSlowTaskSignals = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountSlowTaskSignals"));
countRunLoopProfilingSignals = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountRunLoopProfilingSignals"));
countConnEstablished = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnEstablished"));
countConnClosedWithError = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnClosedWithError"));
countConnClosedWithoutError = Int64Metric::getValueOrDefault(LiteralStringRef("Net2.CountConnClosedWithoutError"));

View File

@ -74,7 +74,7 @@ public:
errors.sendError( unknown_error() );
}
}
Future<Void> stop() {
Future<Void> stop(Error const& e) {
return Void();
}
void addref() {
@ -377,11 +377,11 @@ public:
eventBuffer.clear();
}
opened = true;
for(TraceEventFields &fields : eventBuffer) {
annotateEvent(fields);
}
opened = true;
if(preopenOverflowCount > 0) {
TraceEvent(SevWarn, "TraceLogPreopenOverflow").detail("OverflowEventCount", preopenOverflowCount);
preopenOverflowCount = 0;
@ -389,6 +389,9 @@ public:
}
void annotateEvent(TraceEventFields& fields) {
MutexHolder holder(mutex);
if (!opened || fields.isAnnotated())
return;
if(localAddress.present()) {
fields.addField("Machine", formatIpPort(localAddress.get().ip, localAddress.get().port));
}
@ -399,14 +402,13 @@ public:
if(r.rolesString.size() > 0) {
fields.addField("Roles", r.rolesString);
}
fields.setAnnotated();
}
void writeEvent(TraceEventFields fields, std::string trackLatestKey, bool trackError) {
MutexHolder hold(mutex);
if(opened) {
annotateEvent(fields);
}
if(!trackLatestKey.empty()) {
fields.addField("TrackLatestType", "Original");
@ -418,6 +420,7 @@ public:
}
// FIXME: What if we are using way too much memory for buffer?
ASSERT(!isOpen() || fields.isAnnotated());
eventBuffer.push_back(fields);
bufferLength += fields.sizeBytes();
@ -1189,23 +1192,33 @@ TraceInterval& TraceInterval::begin() {
return *this;
}
bool TraceBatch::dumpImmediately() {
return (g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP);
}
void TraceBatch::addEvent( const char *name, uint64_t id, const char *location ) {
eventBatch.push_back( EventInfo(TraceEvent::getCurrentTime(), name, id, location));
if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
auto& eventInfo = eventBatch.emplace_back(EventInfo(TraceEvent::getCurrentTime(), name, id, location));
if (dumpImmediately())
dump();
else
g_traceLog.annotateEvent(eventInfo.fields);
}
void TraceBatch::addAttach( const char *name, uint64_t id, uint64_t to ) {
attachBatch.push_back( AttachInfo(TraceEvent::getCurrentTime(), name, id, to));
if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
auto& attachInfo = attachBatch.emplace_back(AttachInfo(TraceEvent::getCurrentTime(), name, id, to));
if (dumpImmediately())
dump();
else
g_traceLog.annotateEvent(attachInfo.fields);
}
void TraceBatch::addBuggify( int activated, int line, std::string file ) {
if( g_network ) {
buggifyBatch.push_back( BuggifyInfo(TraceEvent::getCurrentTime(), activated, line, file));
if( g_network->isSimulated() || FLOW_KNOBS->AUTOMATIC_TRACE_DUMP )
auto& buggifyInfo = buggifyBatch.emplace_back(BuggifyInfo(TraceEvent::getCurrentTime(), activated, line, file));
if (dumpImmediately())
dump();
else
g_traceLog.annotateEvent(buggifyInfo.fields);
} else {
buggifyBatch.push_back(BuggifyInfo(0, activated, line, file));
}
@ -1272,7 +1285,7 @@ TraceBatch::BuggifyInfo::BuggifyInfo(double time, int activated, int line, std::
fields.addField("Line", format("%d", line));
}
TraceEventFields::TraceEventFields() : bytes(0) {}
TraceEventFields::TraceEventFields() : bytes(0), annotated(false) {}
void TraceEventFields::addField(const std::string& key, const std::string& value) {
bytes += key.size() + value.size();
@ -1300,6 +1313,14 @@ TraceEventFields::FieldIterator TraceEventFields::end() const {
return fields.cend();
}
bool TraceEventFields::isAnnotated() const {
return annotated;
}
void TraceEventFields::setAnnotated() {
annotated = true;
}
const TraceEventFields::Field &TraceEventFields::operator[] (int index) const {
ASSERT(index >= 0 && index < size());
return fields.at(index);
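
The annotated flag enforces an annotate-once invariant: batched events that are not dumped immediately get their Machine/Roles fields at creation time, so they reflect the moment the event happened, and annotateEvent() becomes idempotent on the write path. A minimal sketch of idempotent annotation with illustrative names:

#include <string>
#include <utility>
#include <vector>

struct FieldsSketch {
    std::vector<std::pair<std::string, std::string>> fields;
    bool annotated = false;
};

// Safe to call both when the event is buffered and when it is finally written.
void annotateOnce(FieldsSketch& f, const std::string& machine) {
    if (f.annotated) return;
    f.fields.emplace_back("Machine", machine);
    f.annotated = true;
}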

View File

@ -71,6 +71,8 @@ public:
size_t sizeBytes() const;
FieldIterator begin() const;
FieldIterator end() const;
bool isAnnotated() const;
void setAnnotated();
void addField(const std::string& key, const std::string& value);
void addField(std::string&& key, std::string&& value);
@ -95,6 +97,7 @@ public:
private:
FieldContainer fields;
size_t bytes;
bool annotated;
};
template <class Archive>
@ -144,6 +147,7 @@ private:
std::vector<EventInfo> eventBatch;
std::vector<AttachInfo> attachBatch;
std::vector<BuggifyInfo> buggifyBatch;
static bool dumpImmediately();
};
struct DynamicEventMetric;

View File

@ -813,7 +813,7 @@ namespace actorcompiler
returnType = "void",
formalParameters = new string[] {
ch.CallbackTypeInStateClass + "*",
ch.Stmt.wait.result.type + " value"
ch.Stmt.wait.result.type + " const& value"
},
endIsUnreachable = true
};

View File

@ -586,7 +586,7 @@ struct NotifiedQueue : private SingleCallback<T>, FastAllocated<NotifiedQueue<T>
if (error.isValid()) throw error;
throw internal_error();
}
auto copy = queue.front();
auto copy = std::move(queue.front());
queue.pop();
return copy;
}
@ -908,6 +908,9 @@ public:
void send(const T& value) const {
queue->send(value);
}
void send(T&& value) const {
queue->send(std::move(value));
}
void sendError(const Error& error) const {
queue->sendError(error);
}
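
Both changes above are move-semantics optimizations: popping moves the stored element out instead of deep-copying it, and the new rvalue send() overload avoids a copy on the way in. A self-contained sketch of the pattern on a plain std::queue:

#include <queue>
#include <utility>

template <class T>
struct MiniQueue {
    std::queue<T> q;
    void send(const T& v) { q.push(v); }       // copies, as before
    void send(T&& v) { q.push(std::move(v)); } // new: moves instead
    T pop() {
        T front = std::move(q.front()); // steal the stored value
        q.pop();
        return front;
    }
};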

View File

@ -167,6 +167,8 @@ Future<Reference<IConnection>> INetworkConnections::connect( std::string host, s
});
}
const std::vector<int> NetworkMetrics::starvationBins = { 1, 3500, 7000, 7500, 8500, 8900, 10500 };
TEST_CASE("/flow/network/ipaddress") {
ASSERT(NetworkAddress::parse("[::1]:4800").toString() == "[::1]:4800");

View File

@ -35,6 +35,7 @@
enum class TaskPriority {
Max = 1000000,
RunLoop = 30000,
ASIOReactor = 20001,
RunCycleFunction = 20000,
FlushTrace = 10500,
@ -84,7 +85,9 @@ enum class TaskPriority {
MoveKeys = 3550,
DataDistributionLaunch = 3530,
Ratekeeper = 3510,
DataDistribution = 3500,
DataDistribution = 3502,
DataDistributionLow = 3501,
DataDistributionVeryLow = 3500,
DiskWrite = 3010,
UpdateStorage = 3000,
CompactCache = 2900,
@ -322,18 +325,31 @@ struct NetworkMetrics {
enum { SLOW_EVENT_BINS = 16 };
uint64_t countSlowEvents[SLOW_EVENT_BINS] = {};
enum { PRIORITY_BINS = 9 };
TaskPriority priorityBins[PRIORITY_BINS] = {};
bool priorityBlocked[PRIORITY_BINS] = {};
double priorityBlockedDuration[PRIORITY_BINS] = {};
double priorityMaxBlockedDuration[PRIORITY_BINS] = {};
double priorityTimer[PRIORITY_BINS] = {};
double windowedPriorityTimer[PRIORITY_BINS] = {};
double secSquaredSubmit = 0;
double secSquaredDiskStall = 0;
NetworkMetrics() {}
struct PriorityStats {
TaskPriority priority;
bool active = false;
double duration = 0;
double timer = 0;
double windowedTimer = 0;
double maxDuration = 0;
PriorityStats(TaskPriority priority) : priority(priority) {}
};
std::unordered_map<TaskPriority, struct PriorityStats> activeTrackers;
std::vector<struct PriorityStats> starvationTrackers;
static const std::vector<int> starvationBins;
NetworkMetrics() {
for(int priority : starvationBins) {
starvationTrackers.emplace_back(static_cast<TaskPriority>(priority));
}
}
};
struct BoundedFlowLock;