Merge remote-tracking branch 'origin' into remove-boost-dependencies
commit 9862fec764
@@ -22,6 +22,9 @@ if (RocksDB_FOUND)
-DWITH_SNAPPY=OFF
-DWITH_ZLIB=OFF
-DWITH_ZSTD=OFF
-DWITH_TSAN=${USE_TSAN}
-DWITH_ASAN=${USE_ASAN}
-DWITH_UBSAN=${USE_UBSAN}
-DROCKSDB_BUILD_SHARED=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=True
BUILD_BYPRODUCTS <BINARY_DIR>/librocksdb.a
@@ -49,6 +52,9 @@ else()
-DWITH_SNAPPY=OFF
-DWITH_ZLIB=OFF
-DWITH_ZSTD=OFF
-DWITH_TSAN=${USE_TSAN}
-DWITH_ASAN=${USE_ASAN}
-DWITH_UBSAN=${USE_UBSAN}
-DROCKSDB_BUILD_SHARED=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=True
BUILD_BYPRODUCTS <BINARY_DIR>/librocksdb.a
@@ -16,3 +16,5 @@ The following documents give detailed descriptions of the API for each language:
Go API <https://godoc.org/github.com/apple/foundationdb/bindings/go/src/fdb>
api-c
api-error-codes
special-keys
global-configuration
@@ -112,7 +112,7 @@ For recommendations on appropriate values for process types in large clusters, s
perpetual storage wiggle
^^^^^^^^^^^^^^^^^^^^^^^^

Set the value speed (a.k.a., the number of processes that the Data Distributor should wiggle at a time). Currently, only 0 and 1 are supported. The value 0 means to disable the perpetual storage wiggle.
Set the value speed (a.k.a., the number of processes that the Data Distributor should wiggle at a time). Currently, only 0 and 1 are supported. The value 0 means to disable the perpetual storage wiggle. For more details, see :ref:`perpetual-storage-wiggle`.

consistencycheck
----------------
@@ -757,240 +757,6 @@ If you only need to detect the *fact* of a change, and your response doesn't dep
.. _developer-guide-peformance-considerations:


Special keys
============

Keys starting with the bytes ``\xff\xff`` are called "special" keys, and they are materialized when read. :doc:`\\xff\\xff/status/json <mr-status>` is an example of a special key.
As of api version 630, additional features have been exposed as special keys and are available to read as ranges instead of just individual keys. Additionally, the special keys are now organized into "modules".

Read-only modules
-----------------

A module is loosely defined as a key range in the special key space where a user can expect similar behavior from reading any key in that range.
By default, users will see a ``special_keys_no_module_found`` error if they read from a range not contained in a module.
The error indicates the read would always return an empty set of keys if it proceeded. This could be caused by a typo in the keys to read.
Users will also (by default) see a ``special_keys_cross_module_read`` error if their read spans a module boundary.
The error is to save the user from the surprise of seeing the behavior of multiple modules in the same read.
Users may opt out of these restrictions by setting the ``special_key_space_relaxed`` transaction option.

Each special key that existed before api version 630 is its own module. These are

#. ``\xff\xff/cluster_file_path`` See :ref:`cluster file client access <cluster-file-client-access>`
#. ``\xff\xff/status/json`` See :doc:`Machine-readable status <mr-status>`

Prior to api version 630, it was also possible to read a range starting at
``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli,
but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``.

Api version 630 includes two new modules with prefixes
``\xff\xff/transaction/`` (information about the current transaction), and
``\xff\xff/metrics/`` (various metrics, not transactional).
Transaction module
------------------

Reads from the transaction module generally do not require an rpc and only inspect in-memory state for the current transaction.

There are three sets of keys exposed by the transaction module, and each set uses the same encoding, so let's first describe that encoding.

Let's say we have a set of keys represented as intervals of the form ``begin1 <= k < end1 && begin2 <= k < end2 && ...``.
It could be the case that some of the intervals overlap, e.g. if ``begin1 <= begin2 < end1``, or are adjacent, e.g. if ``end1 == begin2``.
If we merge all overlapping/adjacent intervals then sort, we end up with a canonical representation of this set of keys.

We encode this canonical set as ordered key value pairs like this::

    <namespace><begin1> -> "1"
    <namespace><end1> -> "0"
    <namespace><begin2> -> "1"
    <namespace><end2> -> "0"
    ...

Python example::

    >>> tr = db.create_transaction()
    >>> tr.add_read_conflict_key('foo')
    >>> tr.add_read_conflict_range('bar/', 'bar0')
    >>> for k, v in tr.get_range_startswith('\xff\xff/transaction/read_conflict_range/'):
    ...     print(k, v)
    ...
    ('\xff\xff/transaction/read_conflict_range/bar/', '1')
    ('\xff\xff/transaction/read_conflict_range/bar0', '0')
    ('\xff\xff/transaction/read_conflict_range/foo', '1')
    ('\xff\xff/transaction/read_conflict_range/foo\x00', '0')
For read-your-writes transactions, this canonical encoding of conflict ranges
is already available in memory, and so requesting small ranges is
correspondingly cheaper than large ranges.

For transactions with read-your-writes disabled, this canonical encoding is computed on
every read, so you're paying the full cost in CPU time whether or not you
request a small range.

The namespaces for sets of keys are

#. ``\xff\xff/transaction/read_conflict_range/`` This is the set of keys that will be used for read conflict detection. If another transaction writes to any of these keys after this transaction's read version, then this transaction won't commit.
#. ``\xff\xff/transaction/write_conflict_range/`` This is the set of keys that will be used for write conflict detection. Keys in this range may cause other transactions which read these keys to abort if this transaction commits.
#. ``\xff\xff/transaction/conflicting_keys/`` If this transaction failed due to a conflict, it must be the case that some transaction attempted [#conflicting_keys]_ to commit with a write conflict range that intersects this transaction's read conflict range. This is the subset of your read conflict range that actually intersected a write conflict from another transaction.

Caveats
~~~~~~~

#. ``\xff\xff/transaction/read_conflict_range/`` The conflict range for a read is sometimes not known until that read completes (e.g. range reads with limits, key selectors). When you read from these special keys, the returned future first blocks until all pending reads are complete so it can give an accurate response.
#. ``\xff\xff/transaction/write_conflict_range/`` The conflict range for a ``set_versionstamped_key`` atomic op is not known until commit time. You'll get an approximate range (the actual range will be a subset of the approximate range) until the precise range is known.
#. ``\xff\xff/transaction/conflicting_keys/`` Since using this feature costs server (i.e., commit proxy and resolver) resources, it's disabled by default. You must opt in by setting the ``report_conflicting_keys`` transaction option.
Metrics module
--------------

Reads in the metrics module are not transactional and may require rpcs to complete.

``\xff\xff/metrics/data_distribution_stats/<begin>`` represents stats about the shard that begins at ``<begin>``

>>> for k, v in db.get_range_startswith('\xff\xff/metrics/data_distribution_stats/', limit=3):
...     print(k, v)
...
('\xff\xff/metrics/data_distribution_stats/', '{"shard_bytes":3828000}')
('\xff\xff/metrics/data_distribution_stats/mako00079', '{"shard_bytes":2013000}')
('\xff\xff/metrics/data_distribution_stats/mako00126', '{"shard_bytes":3201000}')

========================= ======== ===============
**Field**                 **Type** **Description**
------------------------- -------- ---------------
shard_bytes               number   An estimate of the sum of kv sizes for this shard.
========================= ======== ===============
Keys starting with ``\xff\xff/metrics/health/`` represent stats about the health of the cluster, suitable for application-level throttling.
Some of this information is also available in ``\xff\xff/status/json``, but these keys are significantly cheaper (in terms of server resources) to read.

>>> for k, v in db.get_range_startswith('\xff\xff/metrics/health/'):
...     print(k, v)
...
('\xff\xff/metrics/health/aggregate', '{"batch_limited":false,"limiting_storage_durability_lag":5000000,"limiting_storage_queue":1000,"tps_limit":483988.66315011407,"worst_storage_durability_lag":5000001,"worst_storage_queue":2036,"worst_log_queue":300}')
('\xff\xff/metrics/health/log/e639a9ad0373367784cc550c615c469b', '{"log_queue":300}')
('\xff\xff/metrics/health/storage/ab2ce4caf743c9c1ae57063629c6678a', '{"cpu_usage":2.398696781487125,"disk_usage":0.059995917598039405,"storage_durability_lag":5000001,"storage_queue":2036}')

``\xff\xff/metrics/health/aggregate``

Aggregate stats about cluster health. Reading this key alone is slightly cheaper than reading any of the per-process keys.

=================================== ======== ===============
**Field**                           **Type** **Description**
----------------------------------- -------- ---------------
batch_limited                       boolean  Whether or not the cluster is limiting batch priority transactions
limiting_storage_durability_lag     number   storage_durability_lag that ratekeeper is using to determine throttling (see the description for storage_durability_lag)
limiting_storage_queue              number   storage_queue that ratekeeper is using to determine throttling (see the description for storage_queue)
tps_limit                           number   The rate at which normal priority transactions are allowed to start
worst_storage_durability_lag        number   See the description for storage_durability_lag
worst_storage_queue                 number   See the description for storage_queue
worst_log_queue                     number   See the description for log_queue
=================================== ======== ===============
``\xff\xff/metrics/health/log/<id>``

Stats about the health of a particular transaction log process

========================= ======== ===============
**Field**                 **Type** **Description**
------------------------- -------- ---------------
log_queue                 number   The number of bytes of mutations that need to be stored in memory on this transaction log process
========================= ======== ===============

``\xff\xff/metrics/health/storage/<id>``

Stats about the health of a particular storage process

========================== ======== ===============
**Field**                  **Type** **Description**
-------------------------- -------- ---------------
cpu_usage                  number   The cpu percentage used by this storage process
disk_usage                 number   The disk IO percentage used by this storage process
storage_durability_lag     number   The difference between the newest version and the durable version on this storage process. On a lightly loaded cluster this will stay just above 5000000 [#max_read_transaction_life_versions]_.
storage_queue              number   The number of bytes of mutations that need to be stored in memory on this storage process
========================== ======== ===============

Caveats
~~~~~~~

#. ``\xff\xff/metrics/health/`` These keys may return data that's several seconds old, and the data may not be available for a brief period during recovery. This will be indicated by the keys being absent.
Read/write modules
------------------

As of api version 700, some modules in the special key space allow writes as
well as reads. In these modules, a user can expect that mutations (i.e. sets,
clears, etc) do not have side-effects outside of the current transaction
until commit is called (the same is true for writes to the normal key space).
A user can also expect the effects on commit to be atomic. Reads to
special keys may require reading system keys (whose format is an implementation
detail), and for those reads appropriate read conflict ranges are added on
the underlying system keys.

Writes to read/write modules in the special key space are disabled by
default. Use the ``special_key_space_enable_writes`` transaction option to
enable them [#special_key_space_enable_writes]_.


.. _special-key-space-management-module:

Management module
~~~~~~~~~~~~~~~~~

The management module is for temporary cluster configuration changes. For
example, in order to safely remove a process from the cluster, one can add an
exclusion to the ``\xff\xff/management/excluded/`` key prefix that matches
that process, and wait for necessary data to be moved away.
#. ``\xff\xff/management/excluded/<exclusion>`` Read/write. Indicates that the cluster should move data away from processes matching ``<exclusion>``, so that they can be safely removed. See :ref:`removing machines from a cluster <removing-machines-from-a-cluster>` for documentation for the corresponding fdbcli command.
#. ``\xff\xff/management/failed/<exclusion>`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster <removing-machines-from-a-cluster>` for documentation for the corresponding fdbcli command.
#. ``\xff\xff/management/in_progress_exclusion/<address>`` Read-only. Indicates that the process matching ``<address>`` matches an exclusion, but still has necessary data and can't yet be safely removed.
#. ``\xff\xff/management/options/excluded/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded/<exclusion>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
#. ``\xff\xff/management/options/failed/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed/<exclusion>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
#. ``\xff\xff/management/min_required_commit_version`` Read/write. Changing this key will change the corresponding system key ``\xff/minRequiredCommitVersion = [[Version]]``. The value of this special key is the literal text of the underlying ``Version``, which is ``int64_t``. If you set the key to a value that cannot be parsed as ``int64_t``, ``special_keys_api_failure`` will be thrown. In addition, the given ``Version`` should be larger than the current read version and smaller than the upper bound (``2**63-1-version_per_second*3600*24*365*1000``). Otherwise, ``special_keys_api_failure`` is thrown. For more details, see the help text of the ``fdbcli`` command ``advanceversion``.
#. ``\xff\xff/management/profiling/<client_txn_sample_rate|client_txn_size_limit>`` Read/write. Changing these two keys will change the corresponding system keys ``\xff\x02/fdbClientInfo/<client_txn_sample_rate|client_txn_size_limit>``, respectively. The value of ``\xff\xff/management/client_txn_sample_rate`` is a literal text of ``double``, and the value of ``\xff\xff/management/client_txn_size_limit`` is a literal text of ``int64_t``. A special value ``default`` can be set to or read from these two keys, indicating that client profiling is disabled. In addition, ``clear`` in this range is not allowed. For more details, see the help text of the ``fdbcli`` command ``profile client``.
#. ``\xff\xff/management/maintenance/<zone_id> := <seconds>`` Read/write. Setting or clearing a key in this range will change the corresponding system key ``\xff\x02/healthyZone``. The value is a literal text of a non-negative ``double`` which represents the remaining time for the zone to be in maintenance. Committing with an invalid value will throw ``special_keys_api_failure``. Only one zone is allowed to be in maintenance at the same time. Setting a new key in the range will override the old one and the transaction will throw the ``special_keys_api_failure`` error if more than one zone is given. For more details, see the help text of the ``fdbcli`` command ``maintenance``.
   In addition, a special key ``\xff\xff/management/maintenance/IgnoreSSFailures`` in the range, if set, will disable data distribution for storage server failures.
   It does the same thing as the fdbcli command ``datadistribution disable ssfailure``.
   Maintenance mode cannot be used until the key is cleared, which is the same as the fdbcli command ``datadistribution enable ssfailure``.
   While the key is set, any commit that tries to set a key in the range will fail with the ``special_keys_api_failure`` error.
#. ``\xff\xff/management/data_distribution/<mode|rebalance_ignored>`` Read/write. Changing these two keys will change the two corresponding system keys ``\xff/dataDistributionMode`` and ``\xff\x02/rebalanceDDIgnored``. The value of ``\xff\xff/management/data_distribution/mode`` is a literal text of ``0`` (disable) or ``1`` (enable). Transactions committed with invalid values will throw ``special_keys_api_failure``. The value of ``\xff\xff/management/data_distribution/rebalance_ignored`` is empty. If present, it means data distribution is disabled for rebalance. Any transaction committed with a non-empty value for this key will throw ``special_keys_api_failure``. For more details, see the help text of the ``fdbcli`` command ``datadistribution``.
#. ``\xff\xff/management/consistency_check_suspended`` Read/write. Setting or reading this key will set or read the underlying system key ``\xff\x02/ConsistencyCheck/Suspend``. The value of this special key is unused; thus, if present, it will be empty. In particular, if the key exists, then the consistency check is suspended. For more details, see the help text of the ``fdbcli`` command ``consistencycheck``.
#. ``\xff\xff/management/db_locked`` Read/write. A single key that can be read and modified. Setting the key will lock the database and clearing the key will unlock it. If the database is already locked, then the commit will fail with the ``special_keys_api_failure`` error. For more details, see the help text of the ``fdbcli`` commands ``lock`` and ``unlock``.
#. ``\xff\xff/management/auto_coordinators`` Read-only. A single key, if read, will return a set of processes which is able to satisfy the current redundancy level and serve as new coordinators. The return value is formatted as a comma delimited string of network addresses of coordinators, i.e. ``<ip:port>,<ip:port>,...,<ip:port>``.

An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or
an ip address and port (e.g. ``127.0.0.1:4500``). If no port is specified,
then all processes on that host match the exclusion.
Configuration module
~~~~~~~~~~~~~~~~~~~~

The configuration module is for changing the cluster configuration.
For example, you can change a process type or update coordinators by manipulating related special keys through transactions.

#. ``\xff\xff/configuration/process/class_type/<address> := <class_type>`` Read/write. Reading keys in the range will retrieve processes' class types. Setting keys in the range will update processes' class types. The process matching ``<address>`` will be assigned to the given class type if the commit is successful. The valid class types are ``storage``, ``transaction``, ``resolution``, etc. A full list of class types can be found via the ``fdbcli`` command ``help setclass``. Clearing keys is forbidden in the range. Instead, you can set the type as ``default``, which will clear the assigned class type if one exists. For more details, see the help text of the ``fdbcli`` command ``setclass``.
#. ``\xff\xff/configuration/process/class_source/<address> := <class_source>`` Read-only. Reading keys in the range will retrieve processes' class sources. The class source is one of ``command_line``, ``configure_auto``, ``set_class`` and ``invalid``, indicating the source that the process's class type comes from.
#. ``\xff\xff/configuration/coordinators/processes := <ip:port>,<ip:port>,...,<ip:port>`` Read/write. A single key, if read, will return a comma delimited string of coordinators' network addresses. Thus to provide a new set of coordinators, set the key with a correctly formatted string of new coordinators' network addresses. As there's always the need to have coordinators, clear on the key is forbidden and a transaction will fail with the ``special_keys_api_failure`` error if the clear is committed. For more details, see the help text of the ``fdbcli`` command ``coordinators``.
#. ``\xff\xff/configuration/coordinators/cluster_description := <new_description>`` Read/write. A single key, if read, will return the cluster description. Thus modifying the key will update the cluster description. The new description needs to match ``[A-Za-z0-9_]+``, otherwise, the ``special_keys_api_failure`` error will be thrown. In addition, clear on the key is meaningless thus forbidden. For more details, see the help text of the ``fdbcli`` command ``coordinators``.

The ``<address>`` here is the network address of the corresponding process. Thus the general form is ``ip:port``.
Error message module
~~~~~~~~~~~~~~~~~~~~

Each module that is written to validates the transaction before committing; a
validation failure is indicated by a ``special_keys_api_failure`` error.
More detailed information about why this validation failed can be accessed through the ``\xff\xff/error_message`` key, whose value is a json document with the following schema.

========================== ======== ===============
**Field**                  **Type** **Description**
-------------------------- -------- ---------------
retriable                  boolean  Whether or not this operation might succeed if retried
command                    string   The fdbcli command corresponding to this operation
message                    string   Help text explaining the reason this operation failed
========================== ======== ===============

Performance considerations
==========================
@@ -1189,7 +955,3 @@ The trickiest errors are non-retryable errors. ``Transaction.on_error`` will ret
If you see one of those errors, the best course of action is to fail the client.

At first glance this looks very similar to a ``commit_unknown_result``. However, these errors lack the one guarantee ``commit_unknown_result`` still gives to the user: if the commit has already been sent to the database, the transaction could get committed at a later point in time. This means that if you retry the transaction, your new transaction might race with the old transaction. While this technically doesn't violate any consistency guarantees, abandoning a transaction means that there are no causality guarantees.

.. [#conflicting_keys] In practice, the transaction probably committed successfully. However, if you're running multiple resolvers then it's possible for a transaction to cause another to abort even if it doesn't commit successfully.
.. [#max_read_transaction_life_versions] The number 5000000 comes from the server knob MAX_READ_TRANSACTION_LIFE_VERSIONS.
.. [#special_key_space_enable_writes] Enabling this option enables other transaction options, such as ``ACCESS_SYSTEM_KEYS``. This may change in the future.
@@ -0,0 +1,130 @@
.. _global-configuration:
.. default-domain:: cpp
.. highlight:: cpp

====================
Global Configuration
====================

The global configuration framework is an eventually consistent configuration
mechanism to efficiently make runtime changes to all clients and servers. It
works by broadcasting updates made to the global configuration key space,
relying on individual machines to store existing configuration in memory.

The global configuration framework provides a key-value interface to all
processes and clients in a FoundationDB cluster.

The global configuration framework is internal to FoundationDB and clients will
usually have no need to interact with it. The API is provided here for
reference.

Reading data
------------

The global configuration framework is exposed through the
``GlobalConfig::globalConfig()`` static function. There are separate ways to
read a value, depending on whether it is an object or a primitive.
.. function:: template<class T> const T get(KeyRef name, T defaultVal)

   Returns the value associated with ``name`` stored in global configuration,
   or ``defaultVal`` if no key matching ``name`` exists. This templated
   function is enabled only when the ``std::is_arithmetic<T>`` specialization
   returns true.

   .. code-block:: cpp

      auto& config = GlobalConfig::globalConfig();
      double value = config.get<double>("path/to/key", 1.0);

.. function:: const Reference<ConfigValue> get(KeyRef name)

   Returns the value associated with ``name`` stored in global configuration.

   .. code-block:: cpp

      auto& config = GlobalConfig::globalConfig();
      auto configValue = config.get("path/to/key");

      // Check if the value exists
      ASSERT(configValue->value.has_value());
      // Cast to the correct type
      auto str = std::any_cast<StringRef>(configValue->value);

.. function:: const std::map<KeyRef, Reference<ConfigValue>> get(KeyRangeRef range)

   Returns all values in the specified range.

.. type:: ConfigValue

   Holds a global configuration value and the arena where it lives. ::

      struct ConfigValue : ReferenceCounted<ConfigValue> {
          Arena arena;
          std::any value;
      };

   ``arena``
      The arena where the value (and the associated key) lives in memory.

   ``value``
      The stored value.
Writing data
------------

Writing data to global configuration is done using transactions written to the
special key space range ``\xff\xff/global_config/ - \xff\xff/global_config/0``.
Values must always be encoded according to the :ref:`api-python-tuple-layer`.

.. code-block:: cpp

   // In GlobalConfig.actor.h
   extern const KeyRef myGlobalConfigKey;
   // In GlobalConfig.actor.cpp
   const KeyRef myGlobalConfigKey = LiteralStringRef("config/key");

   // When you want to set the value..
   Tuple value = Tuple().appendDouble(1.5);

   FDBTransaction* tr = ...;
   tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
   tr->set(GlobalConfig::prefixedKey(myGlobalConfigKey), value.pack());
   // commit transaction

The client is responsible for avoiding conflicts with other global
configuration keys. For most uses, it is recommended to create a new key space.
For example, an application that wants to write configuration data should use
the ``global_config/config/`` namespace, instead of storing keys in the top
level ``global_config/`` key space.
Clearing data
^^^^^^^^^^^^^

Data can be removed from global configuration using standard transaction
semantics. Submit a clear or clear range to the appropriate global
configuration keys in the special key space to clear data.
Watching data
-------------

Global configuration provides functionality to watch for changes.

.. function:: Future<Void> onInitialized()

   Returns a ``Future`` which will be triggered when global configuration has
   been successfully initialized and populated with data.

.. function:: Future<Void> onChange()

   Returns a ``Future`` which will be triggered when any key-value pair in
   global configuration changes.

.. function:: void trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn)

   Registers a callback to be called when the value for global configuration
   key ``key`` is changed. The callback function takes a single argument, an
   optional which will be populated with the updated value when ``key`` is
   changed, or an empty optional if the value was cleared. If the value is an
   allocated object, its memory remains in control of global configuration.
@@ -28,6 +28,8 @@ Ready to operate an externally accessible FoundationDB cluster? You'll find what
* :doc:`tss` gives an overview of the Testing Storage Server feature of FoundationDB, which allows you to safely run an untrusted storage engine in a production cluster.

* :doc:`perpetual-storage-wiggle` gives an overview of the Perpetual Storage Wiggle feature and how to use it.

.. toctree::
   :maxdepth: 2
   :titlesonly:
@@ -44,3 +46,4 @@ Ready to operate an externally accessible FoundationDB cluster? You'll find what
platforms
transaction-tagging
tss
perpetual-storage-wiggle
@@ -0,0 +1,56 @@
.. _perpetual-storage-wiggle:

############################
Perpetual Storage Wiggle
############################

.. include:: guide-common.rst.inc

This document covers the concept and usage of the perpetual storage wiggle.

.. _perpetual-storage-wiggle-introduction:

Summary
============
Perpetual storage wiggle is a feature that forces the data distributor to constantly build new storage teams when the cluster is healthy. At a high level, the process is as follows:

Order storage servers by process id. For each storage server n:

a. Exclude storage server n.

b. Wait until all data has been moved off the storage server.

c. Include storage server n.

Go to step a to wiggle the next storage process with a different process id.

With a perpetual wiggle, storage migrations will be much less impactful. The wiggler will detect the healthy status based on healthy teams, available disk space and the number of unhealthy relocations. It will pause the wiggle until the cluster is healthy again.

Configuration
=============

You can configure the perpetual storage wiggle via the FDB :ref:`command line interface <command-line-interface>`.

Example commands
----------------

Enable perpetual storage wiggle: ``configure perpetual_storage_wiggle=1``.

Disable perpetual storage wiggle on the cluster: ``configure perpetual_storage_wiggle=0``.
Monitor
=======

The ``status`` command in the FDB :ref:`command line interface <command-line-interface>` will show the current perpetual_storage_wiggle value.
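The value can also be read programmatically from the machine-readable status. A minimal Python sketch, assuming the knob is exposed under the ``configuration`` section of status json (the exact field name is an assumption)::

    import json
    import fdb

    fdb.api_version(700)
    db = fdb.open()

    status = json.loads(db[b'\xff\xff/status/json'])
    # Field name assumed; inspect status['cluster']['configuration'] on your cluster.
    print(status['cluster']['configuration'].get('perpetual_storage_wiggle'))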
Trace Events
----------------------
``PerpetualStorageWiggleOpen`` shows up when you switch on the perpetual storage wiggle, while ``PerpetualStorageWiggleClose`` appears when you turn it off.

The ``PerpetualStorageWiggleStart`` event means the wiggler has started wiggling one process; it also contains the process id of the wiggling process and how many healthy teams there are now. It is worth noting the fields ``ExtraHealthyTeamCount``, which indicates how many healthy teams are needed to restart a paused wiggler, and ``HealthyTeamCount``. If ``ExtraHealthyTeamCount`` keeps being larger than ``HealthyTeamCount``, you may need to add more storage servers.

The ``PerpetualStorageWigglePause`` event shows up when the wiggler pauses because it detects that the cluster is unhealthy.

The ``PerpetualStorageWiggleFinish`` event indicates the wiggle is done on the current process.

In the ``MovingData`` event, the field ``PriorityStorageWiggle`` shows how many relocations are in the queue because of the storage wiggle.
@@ -11,7 +11,7 @@ Features
--------

* Added a new API in all bindings that can be used to get a list of split points that will split the given range into (roughly) equally sized chunks. `(PR #3394) <https://github.com/apple/foundationdb/pull/3394>`_
* Added support for writing backup files directly to Azure blob storage. This is not yet performance tested on large-scale clusters. `(PR #3961) <https://github.com/apple/foundationdb/pull/3961>`_
* Added the Testing Storage Server (TSS), which allows FoundationDB to run an "untrusted" storage engine with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance. `(PR #4556) <https://github.com/apple/foundationdb/pull/4556>`_ `(PR #4892) <https://github.com/apple/foundationdb/pull/4892>`_ `(PR #4895) <https://github.com/apple/foundationdb/pull/4895>`_ `(PR #4934) <https://github.com/apple/foundationdb/pull/4934>`_ `(PR #4949) <https://github.com/apple/foundationdb/pull/4949>`_ `(PR #4965) <https://github.com/apple/foundationdb/pull/4965>`_
* Added the Testing Storage Server (TSS), which allows FoundationDB to run an "untrusted" storage engine with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance. `(Documentation) <https://github.com/apple/foundationdb/blob/master/documentation/sphinx/source/tss.rst>`_ `(PR #4556) <https://github.com/apple/foundationdb/pull/4556>`_

Performance
-----------
@@ -0,0 +1,260 @@
.. _special-keys:

============
Special Keys
============

Keys starting with the bytes ``\xff\xff`` are called "special" keys, and they are materialized when read. :doc:`\\xff\\xff/status/json <mr-status>` is an example of a special key.
As of api version 630, additional features have been exposed as special keys and are available to read as ranges instead of just individual keys. Additionally, the special keys are now organized into "modules".

Read-only modules
=================

A module is loosely defined as a key range in the special key space where a user can expect similar behavior from reading any key in that range.
By default, users will see a ``special_keys_no_module_found`` error if they read from a range not contained in a module.
The error indicates the read would always return an empty set of keys if it proceeded. This could be caused by a typo in the keys to read.
Users will also (by default) see a ``special_keys_cross_module_read`` error if their read spans a module boundary.
The error is to save the user from the surprise of seeing the behavior of multiple modules in the same read.
Users may opt out of these restrictions by setting the ``special_key_space_relaxed`` transaction option.
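For orientation, here is a minimal Python sketch of the opt-out, in the style of the examples below. The accessor ``set_special_key_space_relaxed`` assumes the Python bindings' usual ``tr.options.set_<option>()`` naming for transaction options; treat the exact name as an assumption. ::

    >>> tr = db.create_transaction()
    >>> # Without the option, a range read that spans module boundaries
    >>> # raises special_keys_cross_module_read.
    >>> tr.options.set_special_key_space_relaxed()  # assumed accessor name
    >>> for k, v in tr.get_range('\xff\xff/metrics/', '\xff\xff/transaction0'):
    ...     print(k, v)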
Each special key that existed before api version 630 is its own module. These are

#. ``\xff\xff/cluster_file_path`` See :ref:`cluster file client access <cluster-file-client-access>`
#. ``\xff\xff/status/json`` See :doc:`Machine-readable status <mr-status>`

Prior to api version 630, it was also possible to read a range starting at
``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli,
but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``.

Api version 630 includes two new modules with prefixes
``\xff\xff/transaction/`` (information about the current transaction), and
``\xff\xff/metrics/`` (various metrics, not transactional).
Transaction module
------------------

Reads from the transaction module generally do not require an rpc and only inspect in-memory state for the current transaction.

There are three sets of keys exposed by the transaction module, and each set uses the same encoding, so let's first describe that encoding.

Let's say we have a set of keys represented as intervals of the form ``begin1 <= k < end1 && begin2 <= k < end2 && ...``.
It could be the case that some of the intervals overlap, e.g. if ``begin1 <= begin2 < end1``, or are adjacent, e.g. if ``end1 == begin2``.
If we merge all overlapping/adjacent intervals then sort, we end up with a canonical representation of this set of keys.

We encode this canonical set as ordered key value pairs like this::

    <namespace><begin1> -> "1"
    <namespace><end1> -> "0"
    <namespace><begin2> -> "1"
    <namespace><end2> -> "0"
    ...
Python example::

    >>> tr = db.create_transaction()
    >>> tr.add_read_conflict_key('foo')
    >>> tr.add_read_conflict_range('bar/', 'bar0')
    >>> for k, v in tr.get_range_startswith('\xff\xff/transaction/read_conflict_range/'):
    ...     print(k, v)
    ...
    ('\xff\xff/transaction/read_conflict_range/bar/', '1')
    ('\xff\xff/transaction/read_conflict_range/bar0', '0')
    ('\xff\xff/transaction/read_conflict_range/foo', '1')
    ('\xff\xff/transaction/read_conflict_range/foo\x00', '0')
For read-your-writes transactions, this canonical encoding of conflict ranges
is already available in memory, and so requesting small ranges is
correspondingly cheaper than large ranges.

For transactions with read-your-writes disabled, this canonical encoding is computed on
every read, so you're paying the full cost in CPU time whether or not you
request a small range.

The namespaces for sets of keys are

#. ``\xff\xff/transaction/read_conflict_range/`` This is the set of keys that will be used for read conflict detection. If another transaction writes to any of these keys after this transaction's read version, then this transaction won't commit.
#. ``\xff\xff/transaction/write_conflict_range/`` This is the set of keys that will be used for write conflict detection. Keys in this range may cause other transactions which read these keys to abort if this transaction commits.
#. ``\xff\xff/transaction/conflicting_keys/`` If this transaction failed due to a conflict, it must be the case that some transaction attempted [#conflicting_keys]_ to commit with a write conflict range that intersects this transaction's read conflict range. This is the subset of your read conflict range that actually intersected a write conflict from another transaction.
Caveats
~~~~~~~

#. ``\xff\xff/transaction/read_conflict_range/`` The conflict range for a read is sometimes not known until that read completes (e.g. range reads with limits, key selectors). When you read from these special keys, the returned future first blocks until all pending reads are complete so it can give an accurate response.
#. ``\xff\xff/transaction/write_conflict_range/`` The conflict range for a ``set_versionstamped_key`` atomic op is not known until commit time. You'll get an approximate range (the actual range will be a subset of the approximate range) until the precise range is known.
#. ``\xff\xff/transaction/conflicting_keys/`` Since using this feature costs server (i.e., commit proxy and resolver) resources, it's disabled by default. You must opt in by setting the ``report_conflicting_keys`` transaction option.
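As a sketch of how the third caveat plays out in practice (the option accessor again follows the bindings' naming convention and is an assumption)::

    >>> tr = db.create_transaction()
    >>> tr.options.set_report_conflicting_keys()  # opt in; assumed accessor name
    >>> v = tr[b'some_key']
    >>> # ... another transaction writes 'some_key' and commits first, so our
    >>> # commit fails with not_committed; before retrying, inspect:
    >>> for k, _ in tr.get_range_startswith('\xff\xff/transaction/conflicting_keys/'):
    ...     print(k)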
Metrics module
--------------

Reads in the metrics module are not transactional and may require rpcs to complete.

``\xff\xff/metrics/data_distribution_stats/<begin>`` represents stats about the shard that begins at ``<begin>``

>>> for k, v in db.get_range_startswith('\xff\xff/metrics/data_distribution_stats/', limit=3):
...     print(k, v)
...
('\xff\xff/metrics/data_distribution_stats/', '{"shard_bytes":3828000}')
('\xff\xff/metrics/data_distribution_stats/mako00079', '{"shard_bytes":2013000}')
('\xff\xff/metrics/data_distribution_stats/mako00126', '{"shard_bytes":3201000}')
========================= ======== ===============
**Field**                 **Type** **Description**
------------------------- -------- ---------------
shard_bytes               number   An estimate of the sum of kv sizes for this shard.
========================= ======== ===============

Keys starting with ``\xff\xff/metrics/health/`` represent stats about the health of the cluster, suitable for application-level throttling.
Some of this information is also available in ``\xff\xff/status/json``, but these keys are significantly cheaper (in terms of server resources) to read.

>>> for k, v in db.get_range_startswith('\xff\xff/metrics/health/'):
...     print(k, v)
...
('\xff\xff/metrics/health/aggregate', '{"batch_limited":false,"limiting_storage_durability_lag":5000000,"limiting_storage_queue":1000,"tps_limit":483988.66315011407,"worst_storage_durability_lag":5000001,"worst_storage_queue":2036,"worst_log_queue":300}')
('\xff\xff/metrics/health/log/e639a9ad0373367784cc550c615c469b', '{"log_queue":300}')
('\xff\xff/metrics/health/storage/ab2ce4caf743c9c1ae57063629c6678a', '{"cpu_usage":2.398696781487125,"disk_usage":0.059995917598039405,"storage_durability_lag":5000001,"storage_queue":2036}')
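These keys lend themselves to application-level throttling; a minimal Python sketch (the helper function is hypothetical, not part of the bindings)::

    import json

    def cluster_allows_batch_work(db):
        # Hypothetical helper: back off batch-priority work while the
        # cluster is throttling batch priority transactions.
        v = db.get(b'\xff\xff/metrics/health/aggregate')
        if v is None:
            # The key may be absent for a brief period during recovery.
            return False
        return not json.loads(v)['batch_limited']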
``\xff\xff/metrics/health/aggregate``

Aggregate stats about cluster health. Reading this key alone is slightly cheaper than reading any of the per-process keys.

=================================== ======== ===============
**Field**                           **Type** **Description**
----------------------------------- -------- ---------------
batch_limited                       boolean  Whether or not the cluster is limiting batch priority transactions
limiting_storage_durability_lag     number   storage_durability_lag that ratekeeper is using to determine throttling (see the description for storage_durability_lag)
limiting_storage_queue              number   storage_queue that ratekeeper is using to determine throttling (see the description for storage_queue)
tps_limit                           number   The rate at which normal priority transactions are allowed to start
worst_storage_durability_lag        number   See the description for storage_durability_lag
worst_storage_queue                 number   See the description for storage_queue
worst_log_queue                     number   See the description for log_queue
=================================== ======== ===============
``\xff\xff/metrics/health/log/<id>``

Stats about the health of a particular transaction log process

========================= ======== ===============
**Field**                 **Type** **Description**
------------------------- -------- ---------------
log_queue                 number   The number of bytes of mutations that need to be stored in memory on this transaction log process
========================= ======== ===============

``\xff\xff/metrics/health/storage/<id>``

Stats about the health of a particular storage process

========================== ======== ===============
**Field**                  **Type** **Description**
-------------------------- -------- ---------------
cpu_usage                  number   The cpu percentage used by this storage process
disk_usage                 number   The disk IO percentage used by this storage process
storage_durability_lag     number   The difference between the newest version and the durable version on this storage process. On a lightly loaded cluster this will stay just above 5000000 [#max_read_transaction_life_versions]_.
storage_queue              number   The number of bytes of mutations that need to be stored in memory on this storage process
========================== ======== ===============

Caveats
~~~~~~~

#. ``\xff\xff/metrics/health/`` These keys may return data that's several seconds old, and the data may not be available for a brief period during recovery. This will be indicated by the keys being absent.
Read/write modules
==================

As of api version 700, some modules in the special key space allow writes as
well as reads. In these modules, a user can expect that mutations (i.e. sets,
clears, etc) do not have side-effects outside of the current transaction
until commit is called (the same is true for writes to the normal key space).
A user can also expect the effects on commit to be atomic. Reads to
special keys may require reading system keys (whose format is an implementation
detail), and for those reads appropriate read conflict ranges are added on
the underlying system keys.

Writes to read/write modules in the special key space are disabled by
default. Use the ``special_key_space_enable_writes`` transaction option to
enable them [#special_key_space_enable_writes]_.
.. _special-key-space-management-module:

Management module
-----------------

The management module is for temporary cluster configuration changes. For
example, in order to safely remove a process from the cluster, one can add an
exclusion to the ``\xff\xff/management/excluded/`` key prefix that matches
that process, and wait for necessary data to be moved away (a Python sketch
of this flow follows the list below).
#. ``\xff\xff/management/excluded/<exclusion>`` Read/write. Indicates that the cluster should move data away from processes matching ``<exclusion>``, so that they can be safely removed. See :ref:`removing machines from a cluster <removing-machines-from-a-cluster>` for documentation for the corresponding fdbcli command.
|
||||
#. ``\xff\xff/management/failed/<exclusion>`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster <removing-machines-from-a-cluster>` for documentation for the corresponding fdbcli command.
|
||||
#. ``\xff\xff/management/in_progress_exclusion/<address>`` Read-only. Indicates that the process matching ``<address>`` matches an exclusion, but still has necessary data and can't yet be safely removed.
|
||||
#. ``\xff\xff/management/options/excluded/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded/<exclusion>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
|
||||
#. ``\xff\xff/management/options/failed/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed/<exclusion>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
|
||||
#. ``\xff\xff/management/min_required_commit_version`` Read/write. Changing this key will change the corresponding system key ``\xff/minRequiredCommitVersion = [[Version]]``. The value of this special key is the literal text of the underlying ``Version``, which is ``int64_t``. If you set the key with a value failed to be parsed as ``int64_t``, ``special_keys_api_failure`` will be thrown. In addition, the given ``Version`` should be larger than the current read version and smaller than the upper bound(``2**63-1-version_per_second*3600*24*365*1000``). Otherwise, ``special_keys_api_failure`` is thrown. For more details, see help text of ``fdbcli`` command ``advanceversion``.
|
||||
#. ``\xff\xff/management/profiling/<client_txn_sample_rate|client_txn_size_limit>`` Read/write. Changing these two keys will change the corresponding system keys ``\xff\x02/fdbClientInfo/<client_txn_sample_rate|client_txn_size_limit>``, respectively. The value of ``\xff\xff/management/client_txn_sample_rate`` is a literal text of ``double``, and the value of ``\xff\xff/management/client_txn_size_limit`` is a literal text of ``int64_t``. A special value ``default`` can be set to or read from these two keys, representing the client profiling is disabled. In addition, ``clear`` in this range is not allowed. For more details, see help text of ``fdbcli`` command ``profile client``.
|
||||
#. ``\xff\xff/management/maintenance/<zone_id> := <seconds>`` Read/write. Set/clear a key in this range will change the corresponding system key ``\xff\x02/healthyZone``. The value is a literal text of a non-negative ``double`` which represents the remaining time for the zone to be in maintenance. Commiting with an invalid value will throw ``special_keys_api_failure``. Only one zone is allowed to be in maintenance at the same time. Setting a new key in the range will override the old one and the transaction will throw ``special_keys_api_failure`` error if more than one zone is given. For more details, see help text of ``fdbcli`` command ``maintenance``.
|
||||
In addition, a special key ``\xff\xff/management/maintenance/IgnoreSSFailures`` in the range, if set, will disable datadistribution for storage server failures.
|
||||
It is doing the same thing as the fdbcli command ``datadistribution disable ssfailure``.
|
||||
Maintenance mode will be unable to use until the key is cleared, which is the same as the fdbcli command ``datadistribution enable ssfailure``.
|
||||
While the key is set, any commit that tries to set a key in the range will fail with the ``special_keys_api_failure`` error.
|
||||
#. ``\xff\xff/management/data_distribution/<mode|rebalance_ignored>`` Read/write. Changing these two keys will change the two corresponding system keys ``\xff/dataDistributionMode`` and ``\xff\x02/rebalanceDDIgnored``. The value of ``\xff\xff/management/data_distribution/mode`` is a literal text of ``0`` (disable) or ``1`` (enable). Transactions committed with invalid values will throw ``special_keys_api_failure`` . The value of ``\xff\xff/management/data_distribution/rebalance_ignored`` is empty. If present, it means data distribution is disabled for rebalance. Any transaction committed with non-empty value for this key will throw ``special_keys_api_failure``. For more details, see help text of ``fdbcli`` command ``datadistribution``.
|
||||
#. ``\xff\xff/management/consistency_check_suspended`` Read/write. Set or read this key will set or read the underlying system key ``\xff\x02/ConsistencyCheck/Suspend``. The value of this special key is unused thus if present, will be empty. In particular, if the key exists, then consistency is suspended. For more details, see help text of ``fdbcli`` command ``consistencycheck``.
|
||||
#. ``\xff\xff/management/db_locked`` Read/write. A single key that can be read and modified. Set the key will lock the database and clear the key will unlock. If the database is already locked, then the commit will fail with the ``special_keys_api_failure`` error. For more details, see help text of ``fdbcli`` command ``lock`` and ``unlock``.
|
||||
#. ``\xff\xff/management/auto_coordinators`` Read-only. A single key, if read, will return a set of processes which is able to satisfy the current redundency level and serve as new coordinators. The return value is formatted as a comma delimited string of network addresses of coordinators, i.e. ``<ip:port>,<ip:port>,...,<ip:port>``.
|
||||
|
||||
An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or
|
||||
an ip address and port (e.g. ``127.0.0.1:4500``). If no port is specified,
|
||||
then all processes on that host match the exclusion.
|
||||
|
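Here is the sketch promised above: excluding a process through the management module and polling ``in_progress_exclusion`` until its data has been moved away. The write-enable accessor name is assumed from the bindings' option-naming convention. ::

    import time

    @fdb.transactional
    def exclude(tr, exclusion):
        tr.options.set_special_key_space_enable_writes()  # assumed accessor name
        tr[b'\xff\xff/management/excluded/' + exclusion] = b''

    @fdb.transactional
    def in_progress_exclusions(tr):
        prefix = b'\xff\xff/management/in_progress_exclusion/'
        return [k[len(prefix):] for k, _ in tr.get_range_startswith(prefix)]

    exclude(db, b'127.0.0.1:4500')
    while b'127.0.0.1:4500' in in_progress_exclusions(db):
        time.sleep(5)  # necessary data is still being moved away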
Configuration module
--------------------

The configuration module is for changing the cluster configuration.
For example, you can change a process type or update coordinators by manipulating related special keys through transactions.

#. ``\xff\xff/configuration/process/class_type/<address> := <class_type>`` Read/write. Reading keys in the range will retrieve processes' class types. Setting keys in the range will update processes' class types. The process matching ``<address>`` will be assigned to the given class type if the commit is successful. The valid class types are ``storage``, ``transaction``, ``resolution``, etc. A full list of class types can be found via the ``fdbcli`` command ``help setclass``. Clearing keys is forbidden in the range. Instead, you can set the type as ``default``, which will clear the assigned class type if one exists. For more details, see the help text of the ``fdbcli`` command ``setclass``.
#. ``\xff\xff/configuration/process/class_source/<address> := <class_source>`` Read-only. Reading keys in the range will retrieve processes' class sources. The class source is one of ``command_line``, ``configure_auto``, ``set_class`` and ``invalid``, indicating the source that the process's class type comes from.
#. ``\xff\xff/configuration/coordinators/processes := <ip:port>,<ip:port>,...,<ip:port>`` Read/write. A single key, if read, will return a comma delimited string of coordinators' network addresses. Thus to provide a new set of coordinators, set the key with a correctly formatted string of new coordinators' network addresses. As there's always the need to have coordinators, clear on the key is forbidden and a transaction will fail with the ``special_keys_api_failure`` error if the clear is committed. For more details, see the help text of the ``fdbcli`` command ``coordinators``.
#. ``\xff\xff/configuration/coordinators/cluster_description := <new_description>`` Read/write. A single key, if read, will return the cluster description. Thus modifying the key will update the cluster description. The new description needs to match ``[A-Za-z0-9_]+``, otherwise, the ``special_keys_api_failure`` error will be thrown. In addition, clear on the key is meaningless thus forbidden. For more details, see the help text of the ``fdbcli`` command ``coordinators``.

The ``<address>`` here is the network address of the corresponding process. Thus the general form is ``ip:port``.
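A short Python sketch of updating a process class through this module (the write-enable accessor is assumed, as above)::

    @fdb.transactional
    def set_class(tr, address, class_type):
        tr.options.set_special_key_space_enable_writes()  # assumed accessor name
        tr[b'\xff\xff/configuration/process/class_type/' + address] = class_type

    set_class(db, b'127.0.0.1:4500', b'storage')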
Error message module
--------------------

Each module that is written to validates the transaction before committing; a
validation failure is indicated by a ``special_keys_api_failure`` error.
More detailed information about why this validation failed can be accessed through the ``\xff\xff/error_message`` key, whose value is a json document with the following schema.

========================== ======== ===============
**Field**                  **Type** **Description**
-------------------------- -------- ---------------
retriable                  boolean  Whether or not this operation might succeed if retried
command                    string   The fdbcli command corresponding to this operation
message                    string   Help text explaining the reason this operation failed
========================== ======== ===============
Global configuration module
---------------------------

The global configuration module provides an interface to read and write values
to :doc:`global-configuration`. In general, clients should not read and write
the global configuration special key space keys directly, but should instead
use the global configuration functions.

#. ``\xff\xff/global_config/<key> := <value>`` Read/write. Reading keys in the range will return a tuple decoded string representation of the value for the given key. Writing a value will update all processes in the cluster with the new key-value pair. Values must be written using the :ref:`api-python-tuple-layer`.
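
As an illustration only, a direct write through the special key space would look roughly like the following flow C++ fragment inside an existing transaction; the key name is hypothetical, and real code should prefer the global configuration functions mentioned above:

.. code-block:: cpp

   tr.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
   // Values must be tuple-encoded; here a single string element is packed.
   Tuple value;
   value.append(LiteralStringRef("on"));
   tr.set(LiteralStringRef("\xff\xff/global_config/my_app/feature_flag"), value.pack());
   wait(tr.commit());
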
Tracing module
--------------

The tracing module provides read and write access to a transaction's tracing
data. Every transaction contains a unique identifier which follows the
transaction through the system. By providing access to set this identifier,
clients can connect FoundationDB transactions to outside events.

#. ``\xff\xff/tracing/transaction_id := <transaction_id>`` Read/write. A 64-bit integer transaction ID which follows the transaction as it moves through FoundationDB. All transactions are assigned a random transaction ID on creation, and this key can be read to surface the randomly generated ID. Alternatively, set this key to provide a custom identifier. When setting this key, provide a string in the form of a 64-bit integer, which will be automatically converted to the appropriate type.
#. ``\xff\xff/tracing/token := <tracing_enabled>`` Read/write. Set to true/false to enable or disable tracing for the transaction, respectively. If read, returns a 64-bit integer set to 0 if tracing has been disabled, or a random 64-bit integer otherwise (this integer's value has no meaning to the client other than to determine whether the transaction will be traced).
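
A hedged sketch of connecting a transaction to an external tracing system via these keys (flow C++; whether writes here additionally require the special-key-space write option should be verified against your client version):

.. code-block:: cpp

   // Tag the transaction with an externally chosen 64-bit ID, passed as a string.
   tr.set(LiteralStringRef("\xff\xff/tracing/transaction_id"), LiteralStringRef("1234567890"));
   // Later, surface the ID actually attached to the transaction.
   Optional<Value> id = wait(tr.get(LiteralStringRef("\xff\xff/tracing/transaction_id")));
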
.. [#conflicting_keys] In practice, the transaction probably committed successfully. However, if you're running multiple resolvers then it's possible for a transaction to cause another to abort even if it doesn't commit successfully.
.. [#max_read_transaction_life_versions] The number 5000000 comes from the server knob MAX_READ_TRANSACTION_LIFE_VERSIONS.
.. [#special_key_space_enable_writes] Enabling this option enables other transaction options, such as ``ACCESS_SYSTEM_KEYS``. This may change in the future.

@@ -1,23 +1,23 @@
/*
 * tutorial.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "flow/flow.h"
#include "flow/Platform.h"
@@ -100,10 +100,11 @@ struct EchoServerInterface {
    RequestStream<struct GetInterfaceRequest> getInterface;
    RequestStream<struct EchoRequest> echo;
    RequestStream<struct ReverseRequest> reverse;
    RequestStream<struct StreamRequest> stream;

    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, echo, reverse, stream);
    }
};

@@ -141,19 +142,60 @@ struct ReverseRequest {
    }
};

struct StreamReply : ReplyPromiseStreamReply {
    constexpr static FileIdentifier file_identifier = 440804;

    int index = 0;
    StreamReply() = default;
    explicit StreamReply(int index) : index(index) {}

    size_t expectedSize() const { return 2e6; }

    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, index);
    }
};

struct StreamRequest {
    constexpr static FileIdentifier file_identifier = 5410805;
    ReplyPromiseStream<StreamReply> reply;

    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, reply);
    }
};

uint64_t tokenCounter = 1;

ACTOR Future<Void> echoServer() {
    state EchoServerInterface echoServer;
    echoServer.getInterface.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint);
    loop {
        try {
            choose {
                when(GetInterfaceRequest req = waitNext(echoServer.getInterface.getFuture())) {
                    req.reply.send(echoServer);
                }
                when(EchoRequest req = waitNext(echoServer.echo.getFuture())) { req.reply.send(req.message); }
                when(ReverseRequest req = waitNext(echoServer.reverse.getFuture())) {
                    req.reply.send(std::string(req.message.rbegin(), req.message.rend()));
                }
                when(state StreamRequest req = waitNext(echoServer.stream.getFuture())) {
                    state int i = 0;
                    for (; i < 100; ++i) {
                        wait(req.reply.onReady());
                        std::cout << "Send " << i << std::endl;
                        req.reply.send(StreamReply{ i });
                    }
                    req.reply.sendError(end_of_stream());
                }
            }
        } catch (Error& e) {
            if (e.code() != error_code_operation_obsolete) {
                fprintf(stderr, "Error: %s\n", e.what());
                throw e;
            }
        }
    }
}

@@ -172,6 +214,18 @@ ACTOR Future<Void> echoClient() {
    reverseRequest.message = "Hello World";
    std::string reverseString = wait(server.reverse.getReply(reverseRequest));
    std::cout << format("Sent %s to reverse, received %s\n", "Hello World", reverseString.c_str());

    state ReplyPromiseStream<StreamReply> stream = server.stream.getReplyStream(StreamRequest{});
    state int j = 0;
    try {
        loop {
            StreamReply rep = waitNext(stream.getFuture());
            std::cout << "Rep: " << rep.index << std::endl;
            ASSERT(rep.index == j++);
        }
    } catch (Error& e) {
        ASSERT(e.code() == error_code_end_of_stream || e.code() == error_code_connection_failed);
    }
    return Void();
}

@@ -347,6 +401,68 @@ ACTOR Future<Void> multipleClients() {

std::string clusterFile = "fdb.cluster";

ACTOR Future<Void> logThroughput(int64_t* v, Key* next) {
    loop {
        state int64_t last = *v;
        wait(delay(1));
        printf("throughput: %ld bytes/s, next: %s\n", *v - last, printable(*next).c_str());
    }
}

ACTOR Future<Void> fdbClientStream() {
    state Database db = Database::createDatabase(clusterFile, 300);
    state Transaction tx(db);
    state Key next;
    state int64_t bytes = 0;
    state Future<Void> logFuture = logThroughput(&bytes, &next);
    loop {
        state PromiseStream<Standalone<RangeResultRef>> results;
        try {
            state Future<Void> stream = tx.getRangeStream(results,
                                                          KeySelector(firstGreaterOrEqual(next), next.arena()),
                                                          KeySelector(firstGreaterOrEqual(normalKeys.end)),
                                                          GetRangeLimits());
            loop {
                Standalone<RangeResultRef> range = waitNext(results.getFuture());
                if (range.size()) {
                    bytes += range.expectedSize();
                    next = keyAfter(range.back().key);
                }
            }
        } catch (Error& e) {
            if (e.code() == error_code_end_of_stream) {
                break;
            }
            wait(tx.onError(e));
        }
    }
    return Void();
}

ACTOR Future<Void> fdbClientGetRange() {
    state Database db = Database::createDatabase(clusterFile, 300);
    state Transaction tx(db);
    state Key next;
    state int64_t bytes = 0;
    state Future<Void> logFuture = logThroughput(&bytes, &next);
    loop {
        try {
            Standalone<RangeResultRef> range =
                wait(tx.getRange(KeySelector(firstGreaterOrEqual(next), next.arena()),
                                 KeySelector(firstGreaterOrEqual(normalKeys.end)),
                                 GetRangeLimits(GetRangeLimits::ROW_LIMIT_UNLIMITED, CLIENT_KNOBS->REPLY_BYTE_LIMIT)));
            bytes += range.expectedSize();
            if (!range.more) {
                break;
            }
            next = keyAfter(range.back().key);
        } catch (Error& e) {
            wait(tx.onError(e));
        }
    }
    return Void();
}

ACTOR Future<Void> fdbClient() {
    wait(delay(30));
    state Database db = Database::createDatabase(clusterFile, 300);

@@ -403,6 +519,8 @@ std::unordered_map<std::string, std::function<Future<Void>()>> actors = {
    { "kvStoreServer", &kvStoreServer }, // ./tutorial -p 6666 kvStoreServer
    { "kvSimpleClient", &kvSimpleClient }, // ./tutorial -s 127.0.0.1:6666 kvSimpleClient
    { "multipleClients", &multipleClients }, // ./tutorial -s 127.0.0.1:6666 multipleClients
    { "fdbClientStream", &fdbClientStream }, // ./tutorial -C $CLUSTER_FILE_PATH fdbClientStream
    { "fdbClientGetRange", &fdbClientGetRange }, // ./tutorial -C $CLUSTER_FILE_PATH fdbClientGetRange
    { "fdbClient", &fdbClient }, // ./tutorial -C $CLUSTER_FILE_PATH fdbClient
    { "fdbStatusStresser", &fdbStatusStresser }
}; // ./tutorial -C $CLUSTER_FILE_PATH fdbStatusStresser

@@ -22,6 +22,7 @@ set(FDBCLIENT_SRCS
    ClientLogEvents.h
    ClientWorkerInterface.h
    ClusterInterface.h
    CommitProxyInterface.h
    CommitTransaction.h
    ConfigKnobs.cpp
    ConfigKnobs.h

@@ -57,7 +58,6 @@ set(FDBCLIENT_SRCS
    IKnobCollection.h
    ManagementAPI.actor.cpp
    ManagementAPI.actor.h
    MonitorLeader.actor.cpp
    MonitorLeader.h
    MultiVersionAssignmentVars.h

@@ -67,6 +67,8 @@ set(FDBCLIENT_SRCS
    NativeAPI.actor.cpp
    NativeAPI.actor.h
    Notified.h
    ParallelStream.actor.cpp
    ParallelStream.actor.h
    PaxosConfigTransaction.actor.cpp
    PaxosConfigTransaction.h
    SimpleConfigTransaction.actor.cpp

@@ -88,6 +90,8 @@ set(FDBCLIENT_SRCS
    ServerKnobs.h
    SimpleConfigTransaction.h
    SnapshotCache.h
    SpecialKeySpace.actor.cpp
    SpecialKeySpace.actor.h
    Status.h
    StatusClient.actor.cpp
    StatusClient.h

@@ -95,6 +95,8 @@ void ClientKnobs::initialize(Randomize _randomize) {
    init( DETAILED_HEALTH_METRICS_MAX_STALENESS, 5.0 );
    init( MID_SHARD_SIZE_MAX_STALENESS, 10.0 );
    init( TAG_ENCODE_KEY_SERVERS, false ); if( randomize && BUGGIFY ) TAG_ENCODE_KEY_SERVERS = true;
    init( RANGESTREAM_FRAGMENT_SIZE, 1e6 );
    init( RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT, 20 );
    init( QUARANTINE_TSS_ON_MISMATCH, true ); if( randomize && BUGGIFY ) QUARANTINE_TSS_ON_MISMATCH = false; // if true, a tss mismatch will put the offending tss in quarantine. If false, it will just be killed

    // KeyRangeMap

@@ -90,6 +90,8 @@ public:
    double DETAILED_HEALTH_METRICS_MAX_STALENESS;
    double MID_SHARD_SIZE_MAX_STALENESS;
    bool TAG_ENCODE_KEY_SERVERS;
    int64_t RANGESTREAM_FRAGMENT_SIZE;
    int RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT;
    bool QUARANTINE_TSS_ON_MISMATCH;

    // KeyRangeMap

@@ -351,6 +351,7 @@ public:
    Counter transactionGetKeyRequests;
    Counter transactionGetValueRequests;
    Counter transactionGetRangeRequests;
    Counter transactionGetRangeStreamRequests;
    Counter transactionWatchRequests;
    Counter transactionGetAddressesForKeyRequests;
    Counter transactionBytesRead;

@@ -669,7 +669,6 @@ struct RangeResultRef : VectorRef<KeyValueRef> {
        " readToBegin:" + std::to_string(readToBegin) + " readThroughEnd:" + std::to_string(readThroughEnd);
    }
};
using RangeResult = Standalone<RangeResultRef>;

template <>
struct Traceable<RangeResultRef> : std::true_type {

@@ -39,6 +39,8 @@ class IKnobCollection {
    static std::unique_ptr<IKnobCollection> globalKnobCollection;

public:
    virtual ~IKnobCollection() = default;

    enum class Type {
        CLIENT,
        SERVER,

@@ -46,6 +46,7 @@
#include "fdbclient/MonitorLeader.h"
#include "fdbclient/MutationList.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/ParallelStream.actor.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"

@@ -1090,7 +1091,8 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
    transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
    transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc),
    transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc),
    transactionGetRangeRequests("GetRangeRequests", cc),
    transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc),
    transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc),
    transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc),
    transactionCommittedMutations("CommittedMutations", cc),

@@ -1324,7 +1326,8 @@ DatabaseContext::DatabaseContext(const Error& err)
    transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc),
    transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc),
    transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc),
    transactionGetRangeRequests("GetRangeRequests", cc),
    transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc),
    transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc),
    transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc),
    transactionCommittedMutations("CommittedMutations", cc),

@@ -3477,6 +3480,324 @@ ACTOR Future<RangeResult> getRange(Database cx,
    }
}

// Streams all of the KV pairs in a target key range into a ParallelStream fragment
ACTOR Future<Void> getRangeStreamFragment(ParallelStream<RangeResult>::Fragment* results,
                                          Database cx,
                                          Reference<TransactionLogInfo> trLogInfo,
                                          Version version,
                                          KeyRange keys,
                                          GetRangeLimits limits,
                                          bool snapshot,
                                          bool reverse,
                                          TransactionInfo info,
                                          TagSet tags,
                                          SpanID spanContext) {
    loop {
        state vector<pair<KeyRange, Reference<LocationInfo>>> locations = wait(getKeyRangeLocations(
            cx, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, &StorageServerInterface::getKeyValuesStream, info));
        ASSERT(locations.size());
        state int shard = 0;
        loop {
            const KeyRange& range = locations[shard].first;

            state GetKeyValuesStreamRequest req;
            req.version = version;
            req.begin = firstGreaterOrEqual(range.begin);
            req.end = firstGreaterOrEqual(range.end);
            req.spanContext = spanContext;
            req.limit = reverse ? -CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT;
            req.limitBytes = std::numeric_limits<int>::max();

            ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);

            // FIXME: buggify byte limits on internal functions that use them, instead of globally
            req.tags = cx->sampleReadTags() ? tags : Optional<TagSet>();
            req.debugID = info.debugID;

            try {
                if (info.debugID.present()) {
                    g_traceBatch.addEvent(
                        "TransactionDebug", info.debugID.get().first(), "NativeAPI.RangeStream.Before");
                }
                ++cx->transactionPhysicalReads;
                state GetKeyValuesStreamReply rep;

                if (locations[shard].second->size() == 0) {
                    wait(cx->connectionFileChanged());
                    results->sendError(transaction_too_old());
                    return Void();
                }

                state int useIdx = -1;

                loop {
                    // FIXME: create a load balance function for this code so future users of reply streams do not have
                    // to duplicate this code
                    int count = 0;
                    for (int i = 0; i < locations[shard].second->size(); i++) {
                        if (!IFailureMonitor::failureMonitor()
                                 .getState(locations[shard]
                                               .second->get(i, &StorageServerInterface::getKeyValuesStream)
                                               .getEndpoint())
                                 .failed) {
                            if (deterministicRandom()->random01() <= 1.0 / ++count) {
                                useIdx = i;
                            }
                        }
                    }

                    if (useIdx >= 0) {
                        break;
                    }

                    vector<Future<Void>> ok(locations[shard].second->size());
                    for (int i = 0; i < ok.size(); i++) {
                        ok[i] = IFailureMonitor::failureMonitor().onStateEqual(
                            locations[shard].second->get(i, &StorageServerInterface::getKeyValuesStream).getEndpoint(),
                            FailureStatus(false));
                    }

                    // Making this SevWarn means a lot of clutter
                    if (now() - g_network->networkInfo.newestAlternativesFailure > 1 ||
                        deterministicRandom()->random01() < 0.01) {
                        TraceEvent("AllAlternativesFailed")
                            .detail("Alternatives", locations[shard].second->description());
                    }

                    wait(allAlternativesFailedDelay(quorum(ok, 1)));
                }

                state ReplyPromiseStream<GetKeyValuesStreamReply> replyStream =
                    locations[shard]
                        .second->get(useIdx, &StorageServerInterface::getKeyValuesStream)
                        .getReplyStream(req);
                state bool breakAgain = false;
                loop {
                    wait(results->onEmpty());
                    try {
                        choose {
                            when(wait(cx->connectionFileChanged())) {
                                results->sendError(transaction_too_old());
                                return Void();
                            }

                            when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { rep = _rep; }
                        }
                        ++cx->transactionPhysicalReadsCompleted;
                    } catch (Error& e) {
                        ++cx->transactionPhysicalReadsCompleted;
                        if (e.code() == error_code_broken_promise) {
                            throw connection_failed();
                        }
                        if (e.code() != error_code_end_of_stream) {
                            throw;
                        }
                        rep = GetKeyValuesStreamReply();
                    }
                    if (info.debugID.present())
                        g_traceBatch.addEvent(
                            "TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After");
                    RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena);

                    int64_t bytes = 0;
                    for (const KeyValueRef& kv : output) {
                        bytes += kv.key.size() + kv.value.size();
                    }

                    cx->transactionBytesRead += bytes;
                    cx->transactionKeysRead += output.size();

                    // If the reply says there is more but we know that we finished the shard, then fix rep.more
                    if (reverse && output.more && rep.data.size() > 0 &&
                        output[output.size() - 1].key == locations[shard].first.begin) {
                        output.more = false;
                    }

                    if (output.more) {
                        if (!rep.data.size()) {
                            TraceEvent(SevError, "GetRangeStreamError")
                                .detail("Reason", "More data indicated but no rows present")
                                .detail("LimitBytes", limits.bytes)
                                .detail("LimitRows", limits.rows)
                                .detail("OutputSize", output.size())
                                .detail("OutputBytes", output.expectedSize())
                                .detail("BlockSize", rep.data.size())
                                .detail("BlockBytes", rep.data.expectedSize());
                            ASSERT(false);
                        }
                        TEST(true); // GetKeyValuesStreamReply.more in getRangeStream
                        // Make next request to the same shard with a beginning key just after the last key returned
                        if (reverse)
                            locations[shard].first =
                                KeyRangeRef(locations[shard].first.begin, output[output.size() - 1].key);
                        else
                            locations[shard].first =
                                KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].first.end);
                    }

                    if (locations[shard].first.empty()) {
                        output.more = false;
                    }

                    if (!output.more) {
                        const KeyRange& range = locations[shard].first;
                        if (shard == locations.size() - 1) {
                            KeyRef begin = reverse ? keys.begin : range.end;
                            KeyRef end = reverse ? range.begin : keys.end;

                            if (begin >= end) {
                                if (range.begin == allKeys.begin) {
                                    output.readToBegin = true;
                                }
                                if (range.end == allKeys.end) {
                                    output.readThroughEnd = true;
                                }
                                output.arena().dependsOn(keys.arena());
                                output.readThrough = reverse ? keys.begin : keys.end;
                                results->send(std::move(output));
                                results->finish();
                                return Void();
                            }
                            keys = KeyRangeRef(begin, end);
                            breakAgain = true;
                        } else {
                            ++shard;
                        }
                        output.arena().dependsOn(range.arena());
                        output.readThrough = reverse ? range.begin : range.end;
                        results->send(std::move(output));
                        break;
                    }

                    ASSERT(output.size());
                    if (keys.begin == allKeys.begin && !reverse) {
                        output.readToBegin = true;
                    }
                    if (keys.end == allKeys.end && reverse) {
                        output.readThroughEnd = true;
                    }
                    results->send(std::move(output));
                }
                if (breakAgain) {
                    break;
                }
            } catch (Error& e) {
                if (e.code() == error_code_actor_cancelled) {
                    throw;
                }
                if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
                    e.code() == error_code_connection_failed) {
                    const KeyRangeRef& range = locations[shard].first;

                    if (reverse)
                        keys = KeyRangeRef(keys.begin, range.end);
                    else
                        keys = KeyRangeRef(range.begin, keys.end);

                    cx->invalidateCache(keys);
                    wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
                    break;
                } else {
                    results->sendError(e);
                    return Void();
                }
            }
        }
    }
}

ACTOR Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(Database cx, KeyRange keys, int64_t chunkSize);

static KeyRange intersect(KeyRangeRef lhs, KeyRangeRef rhs) {
    return KeyRange(KeyRangeRef(std::max(lhs.begin, rhs.begin), std::min(lhs.end, rhs.end)));
}

// Divides the requested key range into 1MB fragments, creates range streams for each fragment, and merges the results
// so the client gets them in order
ACTOR Future<Void> getRangeStream(PromiseStream<RangeResult> _results,
                                  Database cx,
                                  Reference<TransactionLogInfo> trLogInfo,
                                  Future<Version> fVersion,
                                  KeySelector begin,
                                  KeySelector end,
                                  GetRangeLimits limits,
                                  Promise<std::pair<Key, Key>> conflictRange,
                                  bool snapshot,
                                  bool reverse,
                                  TransactionInfo info,
                                  TagSet tags) {

    state ParallelStream<RangeResult> results(_results, CLIENT_KNOBS->RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT);

    // FIXME: better handling to disable row limits
    ASSERT(!limits.hasRowLimit());
    state Span span("NAPI:getRangeStream"_loc, info.spanID);

    state Version version = wait(fVersion);
    cx->validateVersion(version);

    Future<Key> fb = resolveKey(cx, begin, version, info, tags);
    state Future<Key> fe = resolveKey(cx, end, version, info, tags);

    state Key b = wait(fb);
    state Key e = wait(fe);

    if (!snapshot) {
        // FIXME: this conflict range is too large, and should be updated continuously as results are returned
        conflictRange.send(std::make_pair(std::min(b, Key(begin.getKey(), begin.arena())),
                                          std::max(e, Key(end.getKey(), end.arena()))));
    }

    if (b >= e) {
        wait(results.finish());
        return Void();
    }

    // if e is allKeys.end, we have read through the end of the database
    // if b is allKeys.begin, we have either read through the beginning of the database,
    // or allKeys.begin exists in the database and will be part of the conflict range anyways

    state std::vector<Future<Void>> outstandingRequests;
    while (b < e) {
        state pair<KeyRange, Reference<LocationInfo>> ssi =
            wait(getKeyLocation(cx, reverse ? e : b, &StorageServerInterface::getKeyValuesStream, info, reverse));
        state KeyRange shardIntersection = intersect(ssi.first, KeyRangeRef(b, e));
        state Standalone<VectorRef<KeyRef>> splitPoints =
            wait(getRangeSplitPoints(cx, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE));
        state std::vector<KeyRange> toSend;
        // state std::vector<Future<std::list<KeyRangeRef>::iterator>> outstandingRequests;

        if (!splitPoints.empty()) {
            toSend.push_back(KeyRange(KeyRangeRef(shardIntersection.begin, splitPoints.front()), splitPoints.arena()));
            for (int i = 0; i < splitPoints.size() - 1; ++i) {
                toSend.push_back(KeyRange(KeyRangeRef(splitPoints[i], splitPoints[i + 1]), splitPoints.arena()));
            }
            toSend.push_back(KeyRange(KeyRangeRef(splitPoints.back(), shardIntersection.end), splitPoints.arena()));
        } else {
            toSend.push_back(KeyRange(KeyRangeRef(shardIntersection.begin, shardIntersection.end)));
        }

        state int idx = 0;
        state int useIdx = 0;
        for (; idx < toSend.size(); ++idx) {
            useIdx = reverse ? toSend.size() - idx - 1 : idx;
            if (toSend[useIdx].empty()) {
                continue;
            }
            ParallelStream<RangeResult>::Fragment* fragment = wait(results.createFragment());
            outstandingRequests.push_back(getRangeStreamFragment(
                fragment, cx, trLogInfo, version, toSend[useIdx], limits, snapshot, reverse, info, tags, span.context));
        }
        if (reverse) {
            e = shardIntersection.begin;
        } else {
            b = shardIntersection.end;
        }
    }
    wait(waitForAll(outstandingRequests) && results.finish());
    return Void();
}

Future<RangeResult> getRange(Database const& cx,
                             Future<Version> const& fVersion,
                             KeySelector const& begin,

@@ -3832,6 +4153,67 @@ Future<RangeResult> Transaction::getRange(const KeySelector& begin,
    return getRange(begin, end, GetRangeLimits(limit), snapshot, reverse);
}

// A method for streaming data from the storage server that is more efficient than getRange when reading large amounts
// of data
Future<Void> Transaction::getRangeStream(const PromiseStream<RangeResult>& results,
                                         const KeySelector& begin,
                                         const KeySelector& end,
                                         GetRangeLimits limits,
                                         bool snapshot,
                                         bool reverse) {
    ++cx->transactionLogicalReads;
    ++cx->transactionGetRangeStreamRequests;

    // FIXME: limits are not implemented yet, and this code has not been tested with reverse=true
    ASSERT(!limits.hasByteLimit() && !limits.hasRowLimit() && !reverse);

    KeySelector b = begin;
    if (b.orEqual) {
        TEST(true); // Native stream begin orEqual==true
        b.removeOrEqual(b.arena());
    }

    KeySelector e = end;
    if (e.orEqual) {
        TEST(true); // Native stream end orEqual==true
        e.removeOrEqual(e.arena());
    }

    if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
        TEST(true); // Native stream range inverted
        results.sendError(end_of_stream());
        return Void();
    }

    Promise<std::pair<Key, Key>> conflictRange;
    if (!snapshot) {
        extraConflictRanges.push_back(conflictRange.getFuture());
    }

    return forwardErrors(::getRangeStream(results,
                                          cx,
                                          trLogInfo,
                                          getReadVersion(),
                                          b,
                                          e,
                                          limits,
                                          conflictRange,
                                          snapshot,
                                          reverse,
                                          info,
                                          options.readTags),
                         results);
}

Future<Void> Transaction::getRangeStream(const PromiseStream<RangeResult>& results,
                                         const KeySelector& begin,
                                         const KeySelector& end,
                                         int limit,
                                         bool snapshot,
                                         bool reverse) {
    return getRangeStream(results, begin, end, GetRangeLimits(limit), snapshot, reverse);
}

void Transaction::addReadConflictRange(KeyRangeRef const& keys) {
    ASSERT(!keys.empty());

@@ -5548,7 +5930,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> getRangeSplitPoints(Database cx, Key
    state vector<pair<KeyRange, Reference<LocationInfo>>> locations =
        wait(getKeyRangeLocations(cx,
                                  keys,
                                  CLIENT_KNOBS->TOO_MANY,
                                  false,
                                  &StorageServerInterface::getRangeSplitPoints,
                                  TransactionInfo(TaskPriority::DataDistribution, span.context)));

@@ -283,6 +283,45 @@ public:
                            reverse);
    }

    // A method for streaming data from the storage server that is more efficient than getRange when reading large
    // amounts of data
    [[nodiscard]] Future<Void> getRangeStream(const PromiseStream<Standalone<RangeResultRef>>& results,
                                              const KeySelector& begin,
                                              const KeySelector& end,
                                              int limit,
                                              bool snapshot = false,
                                              bool reverse = false);
    [[nodiscard]] Future<Void> getRangeStream(const PromiseStream<Standalone<RangeResultRef>>& results,
                                              const KeySelector& begin,
                                              const KeySelector& end,
                                              GetRangeLimits limits,
                                              bool snapshot = false,
                                              bool reverse = false);
    [[nodiscard]] Future<Void> getRangeStream(const PromiseStream<Standalone<RangeResultRef>>& results,
                                              const KeyRange& keys,
                                              int limit,
                                              bool snapshot = false,
                                              bool reverse = false) {
        return getRangeStream(results,
                              KeySelector(firstGreaterOrEqual(keys.begin), keys.arena()),
                              KeySelector(firstGreaterOrEqual(keys.end), keys.arena()),
                              limit,
                              snapshot,
                              reverse);
    }
    [[nodiscard]] Future<Void> getRangeStream(const PromiseStream<Standalone<RangeResultRef>>& results,
                                              const KeyRange& keys,
                                              GetRangeLimits limits,
                                              bool snapshot = false,
                                              bool reverse = false) {
        return getRangeStream(results,
                              KeySelector(firstGreaterOrEqual(keys.begin), keys.arena()),
                              KeySelector(firstGreaterOrEqual(keys.end), keys.arena()),
                              limits,
                              snapshot,
                              reverse);
    }

    [[nodiscard]] Future<Standalone<VectorRef<const char*>>> getAddressesForKey(const Key& key);

    void enableCheckWrites();

@@ -0,0 +1,80 @@
/*
 * ParallelStreamCorrectness.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vector>

#include "fdbclient/ParallelStream.actor.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.

namespace ParallelStreamTest {

struct TestValue {
    int x;
    TestValue(int x) : x(x) {}
    int expectedSize() const { return sizeof(int); }
};

ACTOR static Future<Void> produce(ParallelStream<ParallelStreamTest::TestValue>::Fragment* fragment,
                                  ParallelStreamTest::TestValue value) {
    wait(delay(deterministicRandom()->random01()));
    fragment->send(value);
    wait(delay(deterministicRandom()->random01()));
    fragment->finish();
    return Void();
}

ACTOR static Future<Void> consume(FutureStream<ParallelStreamTest::TestValue> stream, int expected) {
    state int next = 0; // initialized explicitly; the expected values start at 0
    try {
        loop {
            ParallelStreamTest::TestValue value = waitNext(stream);
            ASSERT(value.x == next++);
        }
    } catch (Error& e) {
        ASSERT(e.code() == error_code_end_of_stream);
        ASSERT(next == expected);
        return Void();
    }
}

} // namespace ParallelStreamTest

TEST_CASE("/fdbclient/ParallelStream") {
    state PromiseStream<ParallelStreamTest::TestValue> results;
    state size_t bufferLimit = deterministicRandom()->randomInt(0, 21);
    state size_t numProducers = deterministicRandom()->randomInt(1, 1001);
    state ParallelStream<ParallelStreamTest::TestValue> parallelStream(results, bufferLimit);
    state Future<Void> consumer = ParallelStreamTest::consume(results.getFuture(), numProducers);
    state std::vector<Future<Void>> producers;
    TraceEvent("StartingParallelStreamTest")
        .detail("BufferLimit", bufferLimit)
        .detail("NumProducers", numProducers);
    state int i = 0;
    for (; i < numProducers; ++i) {
        ParallelStream<ParallelStreamTest::TestValue>::Fragment* fragment = wait(parallelStream.createFragment());
        producers.push_back(ParallelStreamTest::produce(fragment, ParallelStreamTest::TestValue(i)));
    }
    wait(parallelStream.finish());
    wait(consumer);
    return Void();
}

void forceLinkParallelStreamTests() {}

@@ -0,0 +1,131 @@
/*
 * ParallelStream.actor.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_PARALLEL_STREAM_ACTOR_G_H)
#define FDBCLIENT_PARALLEL_STREAM_ACTOR_G_H
#include "fdbclient/ParallelStream.actor.g.h"
#elif !defined(FDBCLIENT_PARALLEL_STREAM_ACTOR_H)
#define FDBCLIENT_PARALLEL_STREAM_ACTOR_H

#include "flow/genericactors.actor.h"
#include "flow/actorcompiler.h" // must be last include

// ParallelStream is used to fetch data from multiple streams in parallel and then merge them back into a single stream
// in order.
template <class T>
class ParallelStream {
    Reference<BoundedFlowLock> semaphore;
    struct FragmentConstructorTag {
        explicit FragmentConstructorTag() = default;
    };

public:
    // A Fragment is a single stream whose results get merged back into the main output stream
    class Fragment : public ReferenceCounted<Fragment> {
        Reference<BoundedFlowLock> semaphore;
        PromiseStream<T> stream;
        BoundedFlowLock::Releaser releaser;
        friend class ParallelStream;

    public:
        Fragment(Reference<BoundedFlowLock> semaphore, int64_t permitNumber, FragmentConstructorTag)
          : semaphore(semaphore), releaser(semaphore.getPtr(), permitNumber) {}
        template <class U>
        void send(U&& value) {
            stream.send(std::forward<U>(value));
        }
        void sendError(Error e) { stream.sendError(e); }
        void finish() {
            releaser.release(); // Release before destruction to free up pending fragments
            stream.sendError(end_of_stream());
        }
        Future<Void> onEmpty() { return stream.onEmpty(); }
    };

private:
    PromiseStream<Reference<Fragment>> fragments;
    size_t fragmentsProcessed{ 0 };
    PromiseStream<T> results;
    Future<Void> flusher;

public:
    // A background actor which takes results from the oldest fragment and sends them to the main output stream
    ACTOR static Future<Void> flushToClient(ParallelStream<T>* self) {
        state const int messagesBetweenYields = 1000;
        state int messagesSinceYield = 0;
        try {
            loop {
                state Reference<Fragment> fragment = waitNext(self->fragments.getFuture());
                loop {
                    try {
                        wait(self->results.onEmpty());
                        T value = waitNext(fragment->stream.getFuture());
                        self->results.send(value);
                        if (++messagesSinceYield == messagesBetweenYields) {
                            wait(yield());
                            messagesSinceYield = 0;
                        }
                    } catch (Error& e) {
                        if (e.code() == error_code_end_of_stream) {
                            fragment.clear();
                            break;
                        } else {
                            throw e;
                        }
                    }
                }
            }
        } catch (Error& e) {
            if (e.code() == error_code_actor_cancelled) {
                throw;
            }
            self->results.sendError(e);
            return Void();
        }
    }

    ParallelStream(PromiseStream<T> results, size_t bufferLimit) : results(results) {
        semaphore = makeReference<BoundedFlowLock>(1, bufferLimit);
        flusher = flushToClient(this);
    }

    // Creates a fragment to be merged into the main output stream
    ACTOR static Future<Fragment*> createFragmentImpl(ParallelStream<T>* self) {
        int64_t permitNumber = wait(self->semaphore->take());
        auto fragment = makeReference<Fragment>(self->semaphore, permitNumber, FragmentConstructorTag());
        self->fragments.send(fragment);
        return fragment.getPtr();
    }

    Future<Fragment*> createFragment() { return createFragmentImpl(this); }

    Future<Void> finish() {
        fragments.sendError(end_of_stream());
        return flusher;
    }
};

#include "flow/unactorcompiler.h"

#endif

@@ -210,6 +210,7 @@ void ServerKnobs::initialize(Randomize _randomize, ClientKnobs* clientKnobs, IsS
    init( ALL_DATA_REMOVED_DELAY, 1.0 );
    init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
    init( CHECK_TEAM_DELAY, 30.0 );
    init( PERPETUAL_WIGGLE_DELAY, 50.0 );
    init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL );
    init( BEST_TEAM_MAX_TEAM_TRIES, 10 );
    init( BEST_TEAM_OPTION_COUNT, 4 );

@@ -577,11 +578,14 @@ void ServerKnobs::initialize(Randomize _randomize, ClientKnobs* clientKnobs, IsS
    init( FUTURE_VERSION_DELAY, 1.0 );
    init( STORAGE_LIMIT_BYTES, 500000 );
    init( BUGGIFY_LIMIT_BYTES, 1000 );
    init( FETCH_USING_STREAMING, true ); if( randomize && BUGGIFY ) FETCH_USING_STREAMING = false; // Determines if fetch keys uses streaming reads
    init( FETCH_BLOCK_BYTES, 2e6 );
    init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6;
    init( FETCH_KEYS_PARALLELISM, 2 );
    init( FETCH_KEYS_LOWER_PRIORITY, 0 );
    init( BUGGIFY_BLOCK_BYTES, 10000 );
    init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
    init( STORAGE_FETCH_BYTES, 2500000 ); if( randomize && BUGGIFY ) STORAGE_FETCH_BYTES = 500000;
    init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 );
    init( STORAGE_DURABILITY_LAG_MIN_RATE, 0.1 );
    init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0;

@@ -608,6 +612,7 @@ void ServerKnobs::initialize(Randomize _randomize, ClientKnobs* clientKnobs, IsS
    init( DD_METRICS_REPORT_INTERVAL, 30.0 );
    init( FETCH_KEYS_TOO_LONG_TIME_CRITERIA, 300.0 );
    init( MAX_STORAGE_COMMIT_TIME, 120.0 ); // The max fsync stall time on the storage server and tlog before marking a disk as failed
    init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;

    // Wait Failure
    init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;

@@ -160,6 +160,7 @@ public:
    double ALL_DATA_REMOVED_DELAY;
    double INITIAL_FAILURE_REACTION_DELAY;
    double CHECK_TEAM_DELAY;
    double PERPETUAL_WIGGLE_DELAY;
    double LOG_ON_COMPLETION_DELAY;
    int BEST_TEAM_MAX_TEAM_TRIES;
    int BEST_TEAM_OPTION_COUNT;

@@ -508,13 +509,16 @@ public:
    double FUTURE_VERSION_DELAY;
    int STORAGE_LIMIT_BYTES;
    int BUGGIFY_LIMIT_BYTES;
    bool FETCH_USING_STREAMING;
    int FETCH_BLOCK_BYTES;
    int FETCH_KEYS_PARALLELISM_BYTES;
    int FETCH_KEYS_PARALLELISM;
    int FETCH_KEYS_LOWER_PRIORITY;
    int BUGGIFY_BLOCK_BYTES;
    double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
    double STORAGE_DURABILITY_LAG_MIN_RATE;
    int STORAGE_COMMIT_BYTES;
    int STORAGE_FETCH_BYTES;
    double STORAGE_COMMIT_INTERVAL;
    double UPDATE_SHARD_VERSION_INTERVAL;
    int BYTE_SAMPLING_FACTOR;

@@ -539,6 +543,7 @@ public:
    double DD_METRICS_REPORT_INTERVAL;
    double FETCH_KEYS_TOO_LONG_TIME_CRITERIA;
    double MAX_STORAGE_COMMIT_TIME;
    int64_t RANGESTREAM_LIMIT_BYTES;

    // Wait Failure
    int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;

@@ -76,6 +76,7 @@ struct StorageServerInterface {
    RequestStream<struct WatchValueRequest> watchValue;
    RequestStream<struct ReadHotSubRangeRequest> getReadHotRanges;
    RequestStream<struct SplitRangeRequest> getRangeSplitPoints;
    RequestStream<struct GetKeyValuesStreamRequest> getKeyValuesStream;

    explicit StorageServerInterface(UID uid) : uniqueID(uid) {}
    StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {}

@@ -116,6 +117,8 @@ struct StorageServerInterface {
                RequestStream<struct ReadHotSubRangeRequest>(getValue.getEndpoint().getAdjustedEndpoint(11));
            getRangeSplitPoints =
                RequestStream<struct SplitRangeRequest>(getValue.getEndpoint().getAdjustedEndpoint(12));
            getKeyValuesStream =
                RequestStream<struct GetKeyValuesStreamRequest>(getValue.getEndpoint().getAdjustedEndpoint(13));
        }
    } else {
        ASSERT(Ar::isDeserializing);

@@ -157,6 +160,7 @@ struct StorageServerInterface {
        streams.push_back(watchValue.getReceiver());
        streams.push_back(getReadHotRanges.getReceiver());
        streams.push_back(getRangeSplitPoints.getReceiver());
        streams.push_back(getKeyValuesStream.getReceiver(TaskPriority::LoadBalancedEndpoint));
        FlowTransport::transport().addEndpoints(streams);
    }
};

@@ -293,6 +297,45 @@ struct GetKeyValuesRequest : TimedRequest {
    }
};

struct GetKeyValuesStreamReply : public ReplyPromiseStreamReply {
    constexpr static FileIdentifier file_identifier = 1783066;
    Arena arena;
    VectorRef<KeyValueRef, VecSerStrategy::String> data;
    Version version; // useful when latestVersion was requested
    bool more;
    bool cached = false;

    GetKeyValuesStreamReply() : version(invalidVersion), more(false), cached(false) {}
    GetKeyValuesStreamReply(GetKeyValuesReply r)
      : arena(r.arena), data(r.data), version(r.version), more(r.more), cached(r.cached) {}

    int expectedSize() const { return sizeof(GetKeyValuesStreamReply) + data.expectedSize(); }

    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, data, version, more, cached, arena);
    }
};

struct GetKeyValuesStreamRequest {
    constexpr static FileIdentifier file_identifier = 6795746;
    SpanID spanContext;
    Arena arena;
    KeySelectorRef begin, end;
    Version version; // or latestVersion
    int limit, limitBytes;
    bool isFetchKeys;
    Optional<TagSet> tags;
    Optional<UID> debugID;
    ReplyPromiseStream<GetKeyValuesStreamReply> reply;

    GetKeyValuesStreamRequest() : isFetchKeys(false) {}
    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, begin, end, version, limit, limitBytes, isFetchKeys, tags, debugID, reply, spanContext, arena);
    }
};

struct GetKeyReply : public LoadBalancedReply {
    constexpr static FileIdentifier file_identifier = 11226513;
    KeySelector sel;

@@ -14,6 +14,7 @@ set(FDBRPC_SRCS
    genericactors.actor.cpp
    HealthMonitor.actor.cpp
    IAsyncFile.actor.cpp
    LoadBalance.actor.cpp
    LoadBalance.actor.h
    Locality.cpp
    Net2FileSystem.cpp

@@ -69,7 +69,7 @@ Future<Void> IFailureMonitor::onFailedFor(Endpoint const& endpoint, double susta
    return waitForContinuousFailure(this, endpoint, sustainedFailureDuration, slope);
}

SimpleFailureMonitor::SimpleFailureMonitor() {
    // Mark ourselves as available in FailureMonitor
    const auto& localAddresses = FlowTransport::transport().getLocalAddresses();
    addressStatus[localAddresses.address] = FailureStatus(false);

@@ -126,13 +126,20 @@ void SimpleFailureMonitor::endpointNotFound(Endpoint const& endpoint) {
        .suppressFor(1.0)
        .detail("Address", endpoint.getPrimaryAddress())
        .detail("Token", endpoint.token);
    if (endpoint.getPrimaryAddress().isPublic()) {
        if (failedEndpoints.size() > 100000) {
            TraceEvent(SevWarnAlways, "TooManyFailedEndpoints").suppressFor(1.0);
            failedEndpoints.clear();
        }
        failedEndpoints.insert(endpoint);
    }
    endpointKnownFailed.trigger(endpoint);
}

void SimpleFailureMonitor::notifyDisconnect(NetworkAddress const& address) {
    //TraceEvent("NotifyDisconnect").detail("Address", address);
    endpointKnownFailed.triggerRange(Endpoint({ address }, UID()), Endpoint({ address }, UID(-1, -1)));
    disconnectTriggers.trigger(address);
}

Future<Void> SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoint) {

@@ -149,6 +156,10 @@ Future<Void> SimpleFailureMonitor::onDisconnectOrFailure(Endpoint const& endpoin
    return endpointKnownFailed.onChange(endpoint);
}

Future<Void> SimpleFailureMonitor::onDisconnect(NetworkAddress const& address) {
    return disconnectTriggers.onChange(address);
}

Future<Void> SimpleFailureMonitor::onStateChanged(Endpoint const& endpoint) {
    // Wait on endpointKnownFailed if it is false, to pick up both endpointNotFound errors (which set it to true)
    // and changes to addressStatus (which trigger a range). Don't wait on endpointKnownFailed if it is true, because

@@ -98,9 +98,12 @@ public:
    // The next time the known status for the endpoint changes, returns the new status.
    virtual Future<Void> onStateChanged(Endpoint const& endpoint) = 0;

    // Returns when onFailed(endpoint) || transport().onDisconnect( endpoint.getPrimaryAddress() )
    virtual Future<Void> onDisconnectOrFailure(Endpoint const& endpoint) = 0;

    // Returns when transport().onDisconnect( address )
    virtual Future<Void> onDisconnect(NetworkAddress const& address) = 0;

    // Returns true if the endpoint is failed but the address of the endpoint is not failed.
    virtual bool onlyEndpointFailed(Endpoint const& endpoint) const = 0;

@@ -147,6 +150,7 @@ public:
    FailureStatus getState(Endpoint const& endpoint) const override;
    FailureStatus getState(NetworkAddress const& address) const override;
    Future<Void> onDisconnectOrFailure(Endpoint const& endpoint) override;
    Future<Void> onDisconnect(NetworkAddress const& address) override;
    bool onlyEndpointFailed(Endpoint const& endpoint) const override;
    bool permanentlyFailed(Endpoint const& endpoint) const override;

@@ -155,6 +159,7 @@ public:
private:
    std::unordered_map<NetworkAddress, FailureStatus> addressStatus;
    YieldedAsyncMap<Endpoint, bool> endpointKnownFailed;
    YieldedAsyncMap<NetworkAddress, bool> disconnectTriggers;
    std::unordered_set<Endpoint> failedEndpoints;

    friend class OnStateChangedActorActor;

@@ -1581,26 +1581,18 @@ TEST_CASE("/flow/flow/FlowMutex") {
            }
            error = e;

            // Some actors can still be running, waiting while locked or unlocked,
            // but all should become ready, some with errors.
            state int i;
            if (verbose) {
                printf("Waiting for completions. Future end states:\n");
            }
            for (i = 0; i < tests.size(); ++i) {
                ErrorOr<Void> f = wait(errorOr(tests[i]));
                if (verbose) {
                    printf(" %d: %s\n", i, f.isError() ? f.getError().what() : "done");
                }
            }
        }

        // If an error was caused, one should have been detected.

@@ -199,8 +199,9 @@ struct EndpointNotFoundReceiver final : NetworkMessageReceiver {

    void receive(ArenaObjectReader& reader) override {
        // Remote machine tells us it doesn't have endpoint e
        UID token;
        reader.deserialize(token);
        Endpoint e = FlowTransport::transport().loadedEndpoint(token);
        IFailureMonitor::failureMonitor().endpointNotFound(e);
    }
};

@@ -624,7 +625,6 @@ ACTOR Future<Void> connectionKeeper(Reference<Peer> self,
                IFailureMonitor::failureMonitor().getState(self->destination).isAvailable() ? "OK"
                                                                                            : "FAILED");

        try {
            choose {
                when(Reference<IConnection> _conn =

@@ -957,13 +957,13 @@ ACTOR static void deliver(TransportData* self,
    if (destination.token.first() != -1) {
        if (self->isLocalAddress(destination.getPrimaryAddress())) {
            sendLocal(self,
                      SerializeSource<UID>(destination.token),
                      Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND));
        } else {
            Reference<Peer> peer = self->getOrOpenPeer(destination.getPrimaryAddress());
            sendPacket(self,
                       peer,
                       SerializeSource<UID>(destination.token),
                       Endpoint(destination.addresses, WLTOKEN_ENDPOINT_NOT_FOUND),
                       false);
        }

@@ -1476,7 +1476,7 @@ Endpoint FlowTransport::loadedEndpoint(const UID& token) {
}

void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
    if (!isStream || !endpoint.getPrimaryAddress().isValid() || !endpoint.getPrimaryAddress().isPublic())
        return;

    Reference<Peer> peer = self->getOrOpenPeer(endpoint.getPrimaryAddress());

@@ -1488,7 +1488,7 @@ void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) {
}

void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream) {
    if (!isStream || !endpoint.getPrimaryAddress().isValid() || !endpoint.getPrimaryAddress().isPublic())
        return;
    Reference<Peer> peer = self->getPeer(endpoint.getPrimaryAddress());
    if (peer) {

@@ -1723,4 +1723,4 @@ void FlowTransport::createInstance(bool isClient, uint64_t transportId) {

HealthMonitor* FlowTransport::healthMonitor() {
    return &self->healthMonitor;
}

@@ -64,7 +64,7 @@ public:

    NetworkAddress getStableAddress() const { return addresses.getTLSAddress(); }

    Endpoint getAdjustedEndpoint(uint32_t index) const {
        uint32_t newIndex = token.second();
        newIndex += index;
        return Endpoint(

@ -0,0 +1,52 @@
/*
 * LoadBalance.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "flow/flow.h"
#include "flow/actorcompiler.h" // This must be the last #include.

// Throwing all_alternatives_failed will cause the client to issue a GetKeyLocationRequest to the proxy, so this actor
// attempts to limit the number of these errors thrown by a single client to prevent it from saturating the proxies
// with these requests
ACTOR Future<Void> allAlternativesFailedDelay(Future<Void> okFuture) {
	if (now() - g_network->networkInfo.newestAlternativesFailure > FLOW_KNOBS->ALTERNATIVES_FAILURE_RESET_TIME) {
		g_network->networkInfo.oldestAlternativesFailure = now();
	}

	double delay = FLOW_KNOBS->ALTERNATIVES_FAILURE_MIN_DELAY;
	if (now() - g_network->networkInfo.lastAlternativesFailureSkipDelay > FLOW_KNOBS->ALTERNATIVES_FAILURE_SKIP_DELAY) {
		g_network->networkInfo.lastAlternativesFailureSkipDelay = now();
	} else {
		double elapsed = now() - g_network->networkInfo.oldestAlternativesFailure;
		delay = std::max(delay,
		                 std::min(elapsed * FLOW_KNOBS->ALTERNATIVES_FAILURE_DELAY_RATIO,
		                          FLOW_KNOBS->ALTERNATIVES_FAILURE_MAX_DELAY));
		delay = std::max(delay,
		                 std::min(elapsed * FLOW_KNOBS->ALTERNATIVES_FAILURE_SLOW_DELAY_RATIO,
		                          FLOW_KNOBS->ALTERNATIVES_FAILURE_SLOW_MAX_DELAY));
	}

	g_network->networkInfo.newestAlternativesFailure = now();

	choose {
		when(wait(okFuture)) {}
		when(wait(::delayJittered(delay))) { throw all_alternatives_failed(); }
	}
	return Void();
}
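For context on the backoff above: the delay ramps linearly with how long the alternatives have been failing, clamped by a fast cap and a slow cap. A minimal standalone sketch of that arithmetic, with made-up knob values standing in for FLOW_KNOBS->ALTERNATIVES_FAILURE_* (assumptions, not FoundationDB's real defaults):

#include <algorithm>
#include <cstdio>

// Placeholder knob values, assumptions for illustration only.
constexpr double kMinDelay = 0.01, kMaxDelay = 1.0, kDelayRatio = 0.2;
constexpr double kSlowMaxDelay = 30.0, kSlowDelayRatio = 0.001;

// Mirrors the two clamped ramps in allAlternativesFailedDelay: the delay grows with the
// elapsed failure time, bounded first by the fast cap, then extended by the slow ramp.
double backoffDelay(double elapsedSinceOldestFailure) {
	double delay = kMinDelay;
	delay = std::max(delay, std::min(elapsedSinceOldestFailure * kDelayRatio, kMaxDelay));
	delay = std::max(delay, std::min(elapsedSinceOldestFailure * kSlowDelayRatio, kSlowMaxDelay));
	return delay;
}

int main() {
	for (double elapsed : { 0.0, 1.0, 60.0, 3600.0 })
		std::printf("elapsed=%gs -> delay=%gs\n", elapsed, backoffDelay(elapsed));
}

With these placeholder values, short outages stay at the fast cap (1s here), while a failure lasting an hour would back off to 3.6s via the slow ramp.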
@ -42,6 +42,8 @@

using std::vector;

ACTOR Future<Void> allAlternativesFailedDelay(Future<Void> okFuture);

struct ModelHolder : NonCopyable, public ReferenceCounted<ModelHolder> {
	QueueModel* model;
	bool released;
@ -527,43 +529,17 @@ Future<REPLY_TYPE(Request)> loadBalance(
		                           FailureStatus(false));
		}

+		Future<Void> okFuture = quorum(ok, 1);
+
		if (!alternatives->alwaysFresh()) {
-			if (now() - g_network->networkInfo.newestAlternativesFailure >
-			    FLOW_KNOBS->ALTERNATIVES_FAILURE_RESET_TIME) {
-				g_network->networkInfo.oldestAlternativesFailure = now();
-			}
-
-			double delay = FLOW_KNOBS->ALTERNATIVES_FAILURE_MIN_DELAY;
-			if (now() - g_network->networkInfo.lastAlternativesFailureSkipDelay >
-			    FLOW_KNOBS->ALTERNATIVES_FAILURE_SKIP_DELAY) {
-				g_network->networkInfo.lastAlternativesFailureSkipDelay = now();
-			} else {
-				double elapsed = now() - g_network->networkInfo.oldestAlternativesFailure;
-				delay = std::max(delay,
-				                 std::min(elapsed * FLOW_KNOBS->ALTERNATIVES_FAILURE_DELAY_RATIO,
-				                          FLOW_KNOBS->ALTERNATIVES_FAILURE_MAX_DELAY));
-				delay = std::max(delay,
-				                 std::min(elapsed * FLOW_KNOBS->ALTERNATIVES_FAILURE_SLOW_DELAY_RATIO,
-				                          FLOW_KNOBS->ALTERNATIVES_FAILURE_SLOW_MAX_DELAY));
-			}
-
			// Making this SevWarn means a lot of clutter
			if (now() - g_network->networkInfo.newestAlternativesFailure > 1 ||
			    deterministicRandom()->random01() < 0.01) {
-				TraceEvent("AllAlternativesFailed")
-				    .detail("Interval", FLOW_KNOBS->CACHE_REFRESH_INTERVAL_WHEN_ALL_ALTERNATIVES_FAILED)
-				    .detail("Alternatives", alternatives->description())
-				    .detail("Delay", delay);
+				TraceEvent("AllAlternativesFailed").detail("Alternatives", alternatives->description());
			}
-
-			g_network->networkInfo.newestAlternativesFailure = now();
-
-			choose {
-				when(wait(quorum(ok, 1))) {}
-				when(wait(::delayJittered(delay))) { throw all_alternatives_failed(); }
-			}
+			wait(allAlternativesFailedDelay(okFuture));
		} else {
-			wait(quorum(ok, 1));
+			wait(okFuture);
		}

		numAttempts = 0; // now that we've got a server back, reset the backoff

341 fdbrpc/fdbrpc.h
@ -79,6 +79,8 @@ struct FlowReceiver : public NetworkMessageReceiver {
		FlowTransport::transport().addWellKnownEndpoint(endpoint, this, taskID);
	}

	const Endpoint& getRawEndpoint() { return endpoint; }

private:
	Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
	Endpoint endpoint;
@ -251,6 +253,319 @@ void setReplyPriority(const ReplyPromise<Reply>& p, TaskPriority taskID) {
	p.getEndpoint(taskID);
}

struct ReplyPromiseStreamReply {
	Optional<UID> acknowledgeToken;
	ReplyPromiseStreamReply() {}
};

struct AcknowledgementReply {
	constexpr static FileIdentifier file_identifier = 1389929;
	int64_t bytes;

	AcknowledgementReply() : bytes(0) {}
	explicit AcknowledgementReply(int64_t bytes) : bytes(bytes) {}

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, bytes);
	}
};

// Registered on the server to receive acknowledgements that the client has received stream data. This prevents the
// server from sending too much data to the client if the client is not consuming it.
struct AcknowledgementReceiver final : FlowReceiver, FastAllocated<AcknowledgementReceiver> {
	using FastAllocated<AcknowledgementReceiver>::operator new;
	using FastAllocated<AcknowledgementReceiver>::operator delete;

	int64_t bytesSent;
	int64_t bytesAcknowledged;
	int64_t bytesLimit;
	Promise<Void> ready;
	Future<Void> failures;

	AcknowledgementReceiver() : bytesSent(0), bytesAcknowledged(0), bytesLimit(0), ready(nullptr) {}
	AcknowledgementReceiver(const Endpoint& remoteEndpoint)
	  : FlowReceiver(remoteEndpoint, false), bytesSent(0), bytesAcknowledged(0), bytesLimit(0), ready(nullptr) {}

	void receive(ArenaObjectReader& reader) override {
		ErrorOr<AcknowledgementReply> message;
		reader.deserialize(message);
		if (message.isError()) {
			// The client will send an operation_obsolete error on the acknowledgement stream when it cancels the
			// ReplyPromiseStream
			if (!ready.isValid()) {
				ready = Promise<Void>();
			}
			ready.sendError(message.getError());
		} else {
			ASSERT(message.get().bytes > bytesAcknowledged);
			bytesAcknowledged = message.get().bytes;
			if (ready.isValid() && bytesSent - bytesAcknowledged < bytesLimit) {
				Promise<Void> hold = ready;
				ready = Promise<Void>(nullptr);
				// Sending to this promise could cause the ready to be replaced, so we need to hold a local copy
				hold.send(Void());
			}
		}
	}
};
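The accounting above amounts to a simple credit window: the server may have at most bytesLimit unacknowledged bytes in flight, and each acknowledgement carries the client's running total. A toy sketch of that bookkeeping outside of flow (the names here are illustrative, not the fdbrpc API):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Toy model of the AcknowledgementReceiver counters: the sender stalls whenever
// bytesSent - bytesAcknowledged reaches bytesLimit.
struct AckWindow {
	int64_t bytesSent = 0, bytesAcknowledged = 0, bytesLimit = 0;
	bool canSend() const { return bytesSent - bytesAcknowledged < bytesLimit; }
	void onSend(int64_t n) { bytesSent += n; }
	void onAck(int64_t totalAcked) {
		assert(totalAcked > bytesAcknowledged); // acks carry a running total, so they must increase
		bytesAcknowledged = totalAcked;
	}
};

int main() {
	AckWindow w;
	w.bytesLimit = 2000000; // roughly the ~2MB window mentioned for getReplyStream below
	w.onSend(1500000);
	std::printf("after 1.5MB sent, canSend=%d\n", w.canSend()); // still inside the window
	w.onSend(1000000);
	std::printf("after 2.5MB sent, canSend=%d\n", w.canSend()); // blocked until an ack arrives
	w.onAck(1500000); // the client consumed the first batch
	std::printf("after ack of 1.5MB, canSend=%d\n", w.canSend());
}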
// A version of NetNotifiedQueue which adds support for acknowledgments.
template <class T>
struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
                                                    FlowReceiver,
                                                    FastAllocated<NetNotifiedQueueWithAcknowledgements<T>> {
	using FastAllocated<NetNotifiedQueueWithAcknowledgements<T>>::operator new;
	using FastAllocated<NetNotifiedQueueWithAcknowledgements<T>>::operator delete;

	AcknowledgementReceiver acknowledgements;
	Endpoint requestStreamEndpoint;
	bool sentError = false;

	NetNotifiedQueueWithAcknowledgements(int futures, int promises) : NotifiedQueue<T>(futures, promises) {}
	NetNotifiedQueueWithAcknowledgements(int futures, int promises, const Endpoint& remoteEndpoint)
	  : NotifiedQueue<T>(futures, promises), FlowReceiver(remoteEndpoint, true) {
		// A ReplyPromiseStream will be terminated on the server side if the network connection with the client breaks
		acknowledgements.failures = tagError<Void>(
		    makeDependent<T>(IFailureMonitor::failureMonitor()).onDisconnect(remoteEndpoint.getPrimaryAddress()),
		    operation_obsolete());
	}

	void destroy() override { delete this; }
	void receive(ArenaObjectReader& reader) override {
		this->addPromiseRef();
		ErrorOr<EnsureTable<T>> message;
		reader.deserialize(message);

		if (message.isError()) {
			if (message.getError().code() == error_code_broken_promise) {
				ASSERT(requestStreamEndpoint.isValid());
				// We will get a broken_promise on the client side only if the ReplyPromiseStream was cancelled without
				// sending an error. In this case the storage server actor must have been cancelled, so future
				// GetKeyValuesStream requests on the same endpoint will fail
				IFailureMonitor::failureMonitor().endpointNotFound(requestStreamEndpoint);
			}
			this->sendError(message.getError());
		} else {
			if (message.get().asUnderlyingType().acknowledgeToken.present()) {
				acknowledgements = AcknowledgementReceiver(
				    FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()));
			}
			if (this->shouldFireImmediately()) {
				// This message is going to be consumed by the client immediately (and therefore will not call pop())
				// so send an ack immediately
				if (acknowledgements.getRawEndpoint().isValid()) {
					acknowledgements.bytesAcknowledged += message.get().asUnderlyingType().expectedSize();
					FlowTransport::transport().sendUnreliable(
					    SerializeSource<ErrorOr<AcknowledgementReply>>(
					        AcknowledgementReply(acknowledgements.bytesAcknowledged)),
					    acknowledgements.getEndpoint(TaskPriority::ReadSocket),
					    false);
				}
			}

			this->send(std::move(message.get().asUnderlyingType()));
		}
		this->delPromiseRef();
	}

	T pop() override {
		T res = this->popImpl();
		// A reply that has been queued up is being consumed, so send an ack to the server
		if (acknowledgements.getRawEndpoint().isValid()) {
			acknowledgements.bytesAcknowledged += res.expectedSize();
			FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<AcknowledgementReply>>(
			                                              AcknowledgementReply(acknowledgements.bytesAcknowledged)),
			                                          acknowledgements.getEndpoint(TaskPriority::ReadSocket),
			                                          false);
		}
		return res;
	}

	~NetNotifiedQueueWithAcknowledgements() {
		if (acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() && !this->hasError()) {
			// Notify the server that a client is not using this ReplyPromiseStream anymore
			FlowTransport::transport().sendUnreliable(
			    SerializeSource<ErrorOr<AcknowledgementReply>>(operation_obsolete()),
			    acknowledgements.getEndpoint(TaskPriority::ReadSocket),
			    false);
		}
		if (isRemoteEndpoint() && !sentError && !acknowledgements.failures.isReady()) {
			// The ReplyPromiseStream was cancelled before sending an error, so the storage server must have died
			FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<EnsureTable<T>>>(broken_promise()),
			                                          getEndpoint(TaskPriority::ReadSocket),
			                                          false);
		}
	}

	bool isStream() const override { return true; }
};
template <class T>
class ReplyPromiseStream {
public:
	// The endpoints of a ReplyPromiseStream must be initialized at Task::ReadSocket, because with lower priorities a
	// delay(0) in FlowTransport deliver can cause out of order delivery.

	// stream.send( request )
	//   Unreliable at most once delivery: Delivers request unless there is a connection failure (zero or one times)

	template <class U>
	void send(U&& value) const {
		if (queue->isRemoteEndpoint()) {
			if (!queue->acknowledgements.getRawEndpoint().isValid()) {
				value.acknowledgeToken = queue->acknowledgements.getEndpoint(TaskPriority::ReadSocket).token;
			}
			queue->acknowledgements.bytesSent += value.expectedSize();
			FlowTransport::transport().sendUnreliable(
			    SerializeSource<ErrorOr<EnsureTable<T>>>(value), getEndpoint(), false);
		} else {
			queue->send(std::forward<U>(value));
		}
	}

	template <class E>
	void sendError(const E& exc) const {
		if (queue->isRemoteEndpoint() && !queue->sentError) {
			queue->sentError = true;
			FlowTransport::transport().sendUnreliable(
			    SerializeSource<ErrorOr<EnsureTable<T>>>(exc), getEndpoint(), false);
		} else {
			queue->sendError(exc);
			if (errors && errors->canBeSet()) {
				errors->sendError(exc);
			}
		}
	}

	FutureStream<T> getFuture() const {
		queue->addFutureRef();
		return FutureStream<T>(queue);
	}
	ReplyPromiseStream() : queue(new NetNotifiedQueueWithAcknowledgements<T>(0, 1)), errors(new SAV<Void>(0, 1)) {}
	ReplyPromiseStream(const ReplyPromiseStream& rhs) : queue(rhs.queue), errors(rhs.errors) {
		queue->addPromiseRef();
		if (errors) {
			errors->addPromiseRef();
		}
	}
	ReplyPromiseStream(ReplyPromiseStream&& rhs) noexcept : queue(rhs.queue), errors(rhs.errors) {
		rhs.queue = nullptr;
		rhs.errors = nullptr;
	}
	explicit ReplyPromiseStream(const Endpoint& endpoint)
	  : queue(new NetNotifiedQueueWithAcknowledgements<T>(0, 1, endpoint)), errors(nullptr) {}

	// Used by endStreamOnDisconnect to detect when all references to the ReplyPromiseStream have been dropped
	Future<Void> getErrorFutureAndDelPromiseRef() {
		ASSERT(errors && errors->getPromiseReferenceCount() > 1);
		errors->addFutureRef();
		errors->delPromiseRef();
		Future<Void> res(errors);
		errors = nullptr;
		return res;
	}

	void setRequestStreamEndpoint(const Endpoint& endpoint) { queue->requestStreamEndpoint = endpoint; }

	~ReplyPromiseStream() {
		if (queue)
			queue->delPromiseRef();
		if (errors)
			errors->delPromiseRef();
	}

	const Endpoint& getEndpoint() const { return queue->getEndpoint(TaskPriority::ReadSocket); }

	bool operator==(const ReplyPromiseStream<T>& rhs) const { return queue == rhs.queue; }
	bool isEmpty() const { return !queue->isReady(); }
	uint32_t size() const { return queue->size(); }

	// Must be called on the server before sending results on the stream to ratelimit the amount of data outstanding to
	// the client
	Future<Void> onReady() {
		ASSERT(queue->acknowledgements.bytesLimit > 0);
		if (queue->acknowledgements.failures.isError()) {
			return queue->acknowledgements.failures.getError();
		}
		if (queue->acknowledgements.ready.isValid() && queue->acknowledgements.ready.isSet()) {
			return queue->acknowledgements.ready.getFuture().getError();
		}
		if (queue->acknowledgements.bytesSent - queue->acknowledgements.bytesAcknowledged <
		    queue->acknowledgements.bytesLimit) {
			return Void();
		}
		if (!queue->acknowledgements.ready.isValid()) {
			queue->acknowledgements.ready = Promise<Void>();
		}
		return queue->acknowledgements.ready.getFuture() || queue->acknowledgements.failures;
	}

	// Must be called on the server before using a ReplyPromiseStream to limit the amount of outstanding bytes to the
	// client
	void setByteLimit(int64_t byteLimit) { queue->acknowledgements.bytesLimit = byteLimit; }

	void operator=(const ReplyPromiseStream& rhs) {
		rhs.queue->addPromiseRef();
		if (queue)
			queue->delPromiseRef();
		queue = rhs.queue;
		if (rhs.errors)
			rhs.errors->addPromiseRef();
		if (errors)
			errors->delPromiseRef();
		errors = rhs.errors;
	}
	void operator=(ReplyPromiseStream&& rhs) noexcept {
		if (queue != rhs.queue) {
			if (queue)
				queue->delPromiseRef();
			queue = rhs.queue;
			rhs.queue = 0;
		}
		if (errors != rhs.errors) {
			if (errors)
				errors->delPromiseRef();
			errors = rhs.errors;
			rhs.errors = 0;
		}
	}

private:
	NetNotifiedQueueWithAcknowledgements<T>* queue;
	SAV<Void>* errors;
};
template <class Ar, class T>
void save(Ar& ar, const ReplyPromiseStream<T>& value) {
	auto const& ep = value.getEndpoint().token;
	ar << ep;
}

template <class Ar, class T>
void load(Ar& ar, ReplyPromiseStream<T>& value) {
	UID token;
	ar >> token;
	Endpoint endpoint = FlowTransport::transport().loadedEndpoint(token);
	value = ReplyPromiseStream<T>(endpoint);
}

template <class T>
struct serializable_traits<ReplyPromiseStream<T>> : std::true_type {
	template <class Archiver>
	static void serialize(Archiver& ar, ReplyPromiseStream<T>& p) {
		if constexpr (Archiver::isDeserializing) {
			UID token;
			serializer(ar, token);
			auto endpoint = FlowTransport::transport().loadedEndpoint(token);
			p = ReplyPromiseStream<T>(endpoint);
		} else {
			const auto& ep = p.getEndpoint().token;
			serializer(ar, ep);
		}
	}
};

template <class T>
struct NetNotifiedQueue final : NotifiedQueue<T>, FlowReceiver, FastAllocated<NetNotifiedQueue<T>> {
	using FastAllocated<NetNotifiedQueue<T>>::operator new;
@ -366,6 +681,30 @@ public:
		}
	}

	// stream.getReplyStream( request )
	//   Unreliable at most once delivery.
	//   Registers the request with the remote endpoint which sends back a stream of replies, followed by an
	//   end_of_stream error. If the connection is ever broken the remote endpoint will stop attempting to send
	//   replies. The caller sends acknowledgements to the remote endpoint so that at most 2MB of replies is ever
	//   inflight.

	template <class X>
	ReplyPromiseStream<REPLYSTREAM_TYPE(X)> getReplyStream(const X& value) const {
		if (queue->isRemoteEndpoint()) {
			Future<Void> disc =
			    makeDependent<T>(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint());
			auto& p = getReplyPromiseStream(value);
			Reference<Peer> peer =
			    FlowTransport::transport().sendUnreliable(SerializeSource<T>(value), getEndpoint(), true);
			// FIXME: defer sending the message until we know the connection is established
			endStreamOnDisconnect(disc, p, getEndpoint(), peer);
			return p;
		} else {
			send(value);
			auto& p = getReplyPromiseStream(value);
			return p;
		}
	}

	// stream.getReplyUnlessFailedFor( request, double sustainedFailureDuration, double sustainedFailureSlope )
	//   Reliable at least once delivery: Like getReply, delivers request at least once and returns one of the
	//   replies. However, if
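To see the shape of the consuming side, here is a toy, thread-based analogue of draining a reply stream until the end-of-stream signal (plain C++, not flow; a real fdbrpc client would instead waitNext on the FutureStream and catch the end_of_stream error):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <optional>
#include <queue>
#include <thread>

// Minimal stand-in for a reply stream: a queue that is closed to signal end of stream.
template <class T>
struct ToyStream {
	std::mutex m;
	std::condition_variable cv;
	std::queue<T> q;
	bool closed = false;

	void send(T v) {
		{ std::lock_guard<std::mutex> g(m); q.push(std::move(v)); }
		cv.notify_one();
	}
	void close() {
		{ std::lock_guard<std::mutex> g(m); closed = true; }
		cv.notify_one();
	}
	// Returns nullopt at end of stream, mirroring the end_of_stream error in fdbrpc.
	std::optional<T> next() {
		std::unique_lock<std::mutex> l(m);
		cv.wait(l, [&] { return !q.empty() || closed; });
		if (q.empty()) return std::nullopt;
		T v = std::move(q.front());
		q.pop();
		return v;
	}
};

int main() {
	ToyStream<int> replies;
	std::thread server([&] {
		for (int i = 0; i < 5; ++i) replies.send(i); // the server pushes a bounded batch of replies
		replies.close();                             // then signals end of stream
	});
	while (auto r = replies.next()) std::printf("got reply %d\n", *r);
	server.join();
}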
||||
// stream.getReplyUnlessFailedFor( request, double sustainedFailureDuration, double sustainedFailureSlope )
|
||||
// Reliable at least once delivery: Like getReply, delivers request at least once and returns one of the replies.
|
||||
// However, if
|
||||
|
@ -435,7 +774,7 @@ public:
|
|||
// queue = (NetNotifiedQueue<T>*)0xdeadbeef;
|
||||
}
|
||||
|
||||
Endpoint getEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) const {
|
||||
const Endpoint& getEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) const {
|
||||
return queue->getEndpoint(taskID);
|
||||
}
|
||||
void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) {
|
||||
|
|
|
@ -197,6 +197,25 @@ struct PeerHolder {
	}
};

// Implements getReplyStream; this is a void actor with the same lifetime as the input ReplyPromiseStream.
// Because this actor holds a reference to the stream, normally it would be impossible to know when there are no other
// references. To get around this, there is a SAV inside the stream that has one less promise reference than it should
// (caused by getErrorFutureAndDelPromiseRef()). When that SAV gets a broken promise because no one besides this void
// actor is referencing it, this void actor will get a broken_promise dropping the final reference to the full
// ReplyPromiseStream
ACTOR template <class X>
void endStreamOnDisconnect(Future<Void> signal,
                           ReplyPromiseStream<X> stream,
                           Endpoint endpoint,
                           Reference<Peer> peer = Reference<Peer>()) {
	state PeerHolder holder = PeerHolder(peer);
	stream.setRequestStreamEndpoint(endpoint);
	choose {
		when(wait(signal)) { stream.sendError(connection_failed()); }
		when(wait(stream.getErrorFutureAndDelPromiseRef())) {}
	}
}

// Implements tryGetReply, getReplyUnlessFailedFor
ACTOR template <class X>
Future<ErrorOr<X>> waitValueOrSignal(Future<X> value,
@ -224,7 +243,6 @@ Future<ErrorOr<X>> waitValueOrSignal(Future<X> value,
			                           // receiving the failure signal
			if (e.code() != error_code_broken_promise || signal.isError())
				return ErrorOr<X>(e);
-
			IFailureMonitor::failureMonitor().endpointNotFound(endpoint);
			value = Never();
		}
@ -70,7 +70,6 @@ set(FDBSERVER_SRCS
	OldTLogServer_6_2.actor.cpp
	OnDemandStore.actor.cpp
	OnDemandStore.h
-	Orderer.actor.h
	PaxosConfigConsumer.actor.cpp
	PaxosConfigConsumer.h
	PaxosConfigDatabaseNode.actor.cpp
@ -179,6 +178,7 @@ set(FDBSERVER_SRCS
	workloads/FileSystem.actor.cpp
	workloads/Fuzz.cpp
	workloads/FuzzApiCorrectness.actor.cpp
+	workloads/GetRangeStream.actor.cpp
	workloads/HealthMetricsApi.actor.cpp
	workloads/IncrementalBackup.actor.cpp
	workloads/Increment.actor.cpp
@ -2840,8 +2840,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
			}
			this->wiggle_addresses.push_back(addr);
			this->excludedServers.set(addr, DDTeamCollection::Status::WIGGLING);
-			moveFutures.push_back(
-			    waitForAllDataRemoved(this->cx, info->lastKnownInterface.id(), info->addedVersion, this));
+			moveFutures.push_back(info->onRemoved);
		}
		if (!moveFutures.empty()) {
			this->restartRecruiting.trigger();
@ -3898,29 +3897,27 @@ ACTOR Future<vector<std::pair<StorageServerInterface, ProcessClass>>> getServerL
// to a sorted PID set maintained by the data distributor. If now no storage server exists, the new Process ID is 0.
ACTOR Future<Void> updateNextWigglingStoragePID(DDTeamCollection* teamCollection) {
	state ReadYourWritesTransaction tr(teamCollection->cx);
-	state Value writeValue = LiteralStringRef("0");
+	state Value writeValue;
	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			Optional<Value> value = wait(tr.get(wigglingStorageServerKey));
			if (teamCollection->pid2server_info.empty()) {
-				tr.set(wigglingStorageServerKey, LiteralStringRef("0"));
+				writeValue = LiteralStringRef("");
			} else {
				Value pid = teamCollection->pid2server_info.begin()->first;
				if (value.present()) {
					auto nextIt = teamCollection->pid2server_info.upper_bound(value.get());
					if (nextIt == teamCollection->pid2server_info.end()) {
-						tr.set(wigglingStorageServerKey, pid);
+						writeValue = pid;
					} else {
-						tr.set(wigglingStorageServerKey, nextIt->first);
+						writeValue = nextIt->first;
					}
				} else {
-					tr.set(wigglingStorageServerKey, pid);
+					writeValue = pid;
				}
			}
+			tr.set(wigglingStorageServerKey, writeValue);
			wait(tr.commit());
			break;
		} catch (Error& e) {
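The PID rotation above is a wrap-around scan of a sorted map: advance past the previous cursor, and wrap to the first entry at the end. A standalone sketch of the same selection rule (the names are illustrative; the real code persists the cursor in wigglingStorageServerKey inside a retried transaction):

#include <cstdio>
#include <map>
#include <string>

// Given the previously wiggled process ID, pick the next one in sorted order,
// wrapping to the first entry when the end is reached, as updateNextWigglingStoragePID does.
std::string nextWigglePid(const std::map<std::string, int>& pid2info, const std::string& prev) {
	if (pid2info.empty())
		return ""; // no storage processes: store an empty cursor
	auto next = pid2info.upper_bound(prev);
	return next == pid2info.end() ? pid2info.begin()->first : next->first;
}

int main() {
	std::map<std::string, int> pids{ { "pid-a", 0 }, { "pid-b", 0 }, { "pid-c", 0 } };
	std::string cursor = "pid-b";
	for (int i = 0; i < 4; ++i) {
		cursor = nextWigglePid(pids, cursor);
		std::printf("next wiggle target: %s\n", cursor.c_str()); // c, a, b, c
	}
}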
@ -3939,10 +3936,20 @@ ACTOR Future<Void> updateNextWigglingStoragePID(DDTeamCollection* teamCollection
ACTOR Future<Void> perpetualStorageWiggleIterator(AsyncVar<bool>* stopSignal,
                                                  FutureStream<Void> finishStorageWiggleSignal,
                                                  DDTeamCollection* teamCollection) {
	state int lastFinishTime = now();
	loop {
		choose {
			when(wait(stopSignal->onChange())) {}
-			when(waitNext(finishStorageWiggleSignal)) { wait(updateNextWigglingStoragePID(teamCollection)); }
+			when(waitNext(finishStorageWiggleSignal)) {
+				state bool takeRest = true; // delay to avoid deleting and updating the ServerList too frequently
+				while (takeRest) {
+					wait(delayJittered(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY));
+					// rest until there are enough other teams to place the wiggled data
+					takeRest = teamCollection->server_info.size() <= teamCollection->configuration.storageTeamSize ||
+					           teamCollection->machine_info.size() < teamCollection->configuration.storageTeamSize;
+				}
+				wait(updateNextWigglingStoragePID(teamCollection));
+			}
		}
		if (stopSignal->get()) {
			break;
@ -3976,17 +3983,24 @@ ACTOR Future<std::pair<Future<Void>, Value>> watchPerpetualStoragePIDChange(DDTe
}

// periodically check whether the cluster is healthy if we continue perpetual wiggle
-ACTOR Future<Void> clusterHealthCheckForPerpetualWiggle(DDTeamCollection* self) {
+ACTOR Future<Void> clusterHealthCheckForPerpetualWiggle(DDTeamCollection* self, int* extraTeamCount) {
+	state int pausePenalty = 1;
	loop {
		Promise<int> countp;
		self->getUnhealthyRelocationCount.send(countp);
		int count = wait(countp.getFuture());
		// pause wiggle when
		// a. DDQueue is busy with unhealthy relocation requests
-		// b. no healthy team
+		// b. healthy teams are not enough
		// c. the overall disk space is not enough
-		if (count >= SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD || self->healthyTeamCount == 0 ||
+		if (count >= SERVER_KNOBS->DD_STORAGE_WIGGLE_PAUSE_THRESHOLD || self->healthyTeamCount <= *extraTeamCount ||
		    self->bestTeamStuck) {
+			// if we pause the wiggle for a reason other than a, increase extraTeamCount. This helps avoid
+			// oscillation between the paused and unpaused states.
+			if ((self->healthyTeamCount <= *extraTeamCount || self->bestTeamStuck) && !self->pauseWiggle->get()) {
+				*extraTeamCount = std::min(*extraTeamCount + pausePenalty, (int)self->teams.size());
+				pausePenalty = std::min(pausePenalty * 2, (int)self->teams.size());
+			}
			self->pauseWiggle->set(true);
		} else {
			self->pauseWiggle->set(false);
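The extraTeamCount/pausePenalty pair above is a simple hysteresis: every health-related pause raises the healthy-team bar required to resume, and the penalty doubles on repeated pauses, both capped by the team count. A minimal sketch of that feedback, with an assumed team count for illustration:

#include <algorithm>
#include <cstdio>

int main() {
	const int totalTeams = 32; // stands in for self->teams.size()
	int extraTeamCount = 0;    // how many healthy teams beyond the bar we require before resuming
	int pausePenalty = 1;

	// Simulate three consecutive health-related pauses while the wiggle was running.
	for (int pause = 1; pause <= 3; ++pause) {
		extraTeamCount = std::min(extraTeamCount + pausePenalty, totalTeams);
		pausePenalty = std::min(pausePenalty * 2, totalTeams);
		std::printf("pause %d: extraTeamCount=%d pausePenalty=%d\n", pause, extraTeamCount, pausePenalty);
	}
	// Each completed wiggle relaxes the bar by one, as in the moveFinishFuture handler below.
	extraTeamCount = std::max(0, extraTeamCount - 1);
	std::printf("after a finished wiggle: extraTeamCount=%d\n", extraTeamCount);
}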
@ -4004,9 +4018,9 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncVar<bool>* stopSignal,
                                           const DDEnabledState* ddEnabledState) {
	state Future<Void> watchFuture = Never();
	state Future<Void> moveFinishFuture = Never();
-	state Future<Void> ddQueueCheck = clusterHealthCheckForPerpetualWiggle(self);
+	state int extraTeamCount = 0;
+	state Future<Void> ddQueueCheck = clusterHealthCheckForPerpetualWiggle(self, &extraTeamCount);
	state int movingCount = 0;
-	state vector<UID> excludedServerIds;
	state std::pair<Future<Void>, Value> res = wait(watchPerpetualStoragePIDChange(self));
	ASSERT(!self->wigglingPid.present()); // only single process wiggle is allowed
	self->wigglingPid = Optional<Key>(res.second);
@ -4020,26 +4034,19 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncVar<bool>* stopSignal,
			self->includeStorageServersForWiggle();
			TraceEvent("PerpetualStorageWigglePause", self->distributorId)
			    .detail("ProcessId", pid)
+			    .detail("ExtraHealthyTeamCount", extraTeamCount)
+			    .detail("HealthyTeamCount", self->healthyTeamCount)
			    .detail("StorageCount", movingCount);
		} else {
-			// pre-check whether wiggling chosen servers still satisfy replica requirement
-			excludedServerIds.clear();
-			for (const auto& info : self->pid2server_info[self->wigglingPid.get()]) {
-				excludedServerIds.push_back(info->id);
-			}
-			if (_exclusionSafetyCheck(excludedServerIds, self)) {
-				TEST(true); // start wiggling
-				auto fv = self->excludeStorageServersForWiggle(pid);
-				movingCount = fv.size();
-				moveFinishFuture = waitForAll(fv);
-				TraceEvent("PerpetualStorageWiggleStart", self->distributorId)
-				    .detail("ProcessId", pid)
-				    .detail("StorageCount", movingCount);
-			} else {
-				TEST(true); // skip wiggling current process
-				TraceEvent("PerpetualStorageWiggleSkip", self->distributorId).detail("ProcessId", pid.toString());
-				moveFinishFuture = Void();
-			}
+			TEST(true); // start wiggling
+			auto fv = self->excludeStorageServersForWiggle(pid);
+			movingCount = fv.size();
+			moveFinishFuture = waitForAll(fv);
+			TraceEvent("PerpetualStorageWiggleStart", self->distributorId)
+			    .detail("ProcessId", pid)
+			    .detail("ExtraHealthyTeamCount", extraTeamCount)
+			    .detail("HealthyTeamCount", self->healthyTeamCount)
+			    .detail("StorageCount", movingCount);
		}
	}
@ -4055,9 +4062,9 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncVar<bool>* stopSignal,
			wait(delayJittered(5.0, TaskPriority::DataDistributionLow));
		}
		when(wait(moveFinishFuture)) {
-			TEST(true); // finish wiggling this process
			ASSERT(self->wigglingPid.present());
			StringRef pid = self->wigglingPid.get();
+			TEST(pid != LiteralStringRef("")); // finish wiggling this process

			moveFinishFuture = Never();
			self->includeStorageServersForWiggle();
@ -4068,6 +4075,7 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncVar<bool>* stopSignal,
			self->wigglingPid.reset();
			watchFuture = res.first;
			finishStorageWiggleSignal.send(Void());
+			extraTeamCount = std::max(0, extraTeamCount - 1);
		}
		when(wait(ddQueueCheck || self->pauseWiggle->onChange() || stopSignal->onChange())) {}
	}
@ -4093,7 +4101,7 @@ ACTOR Future<Void> perpetualStorageWiggler(AsyncVar<bool>* stopSignal,
ACTOR Future<Void> monitorPerpetualStorageWiggle(DDTeamCollection* teamCollection,
                                                 const DDEnabledState* ddEnabledState) {
	state int speed = 0;
-	state AsyncVar<bool> stopWiggleSignal(false);
+	state AsyncVar<bool> stopWiggleSignal(true);
	state PromiseStream<Void> finishStorageWiggleSignal;
	state SignalableActorCollection collection;
	teamCollection->pauseWiggle = makeReference<AsyncVar<bool>>(true);
@ -4119,11 +4127,13 @@ ACTOR Future<Void> monitorPerpetualStorageWiggle(DDTeamCollectio
			collection.add(perpetualStorageWiggler(
			    &stopWiggleSignal, finishStorageWiggleSignal, teamCollection, ddEnabledState));
			TraceEvent("PerpetualStorageWiggleOpen", teamCollection->distributorId);
-		} else if (speed == 0 && !stopWiggleSignal.get()) {
-			stopWiggleSignal.set(true);
-			wait(collection.signalAndReset());
+		} else if (speed == 0) {
+			if (!stopWiggleSignal.get()) {
+				stopWiggleSignal.set(true);
+				wait(collection.signalAndReset());
+				teamCollection->pauseWiggle->set(true);
+			}
			TraceEvent("PerpetualStorageWiggleClose", teamCollection->distributorId);
-			teamCollection->pauseWiggle->set(true);
		}
		wait(watchFuture);
		break;
@ -1662,6 +1662,8 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
		                self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM])
		    .detail("PriorityRebalanceOverutilizedTeam",
		            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM])
+		    .detail("PriorityStorageWiggle",
+		            self.priority_relocations[SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE])
		    .detail("PriorityTeamHealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_HEALTHY])
		    .detail("PriorityTeamContainsUndesiredServer",
		            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER])
@ -36,6 +36,8 @@ struct GrvProxyStats {
	Counter txnBatchPriorityStartIn, txnBatchPriorityStartOut;
	Counter txnDefaultPriorityStartIn, txnDefaultPriorityStartOut;
	Counter txnThrottled;
+	Counter updatesFromRatekeeper, leaseTimeouts;
+	int systemGRVQueueSize, defaultGRVQueueSize, batchGRVQueueSize;
	double transactionRateAllowed, batchTransactionRateAllowed;
	double transactionLimit, batchTransactionLimit;
	// how much of the GRV request queue was processed in one attempt to hand out read versions.
@ -89,12 +91,13 @@ struct GrvProxyStats {
	    txnBatchPriorityStartOut("TxnBatchPriorityStartOut", cc),
	    txnDefaultPriorityStartIn("TxnDefaultPriorityStartIn", cc),
	    txnDefaultPriorityStartOut("TxnDefaultPriorityStartOut", cc), txnThrottled("TxnThrottled", cc),
-	    transactionRateAllowed(0), batchTransactionRateAllowed(0), transactionLimit(0), batchTransactionLimit(0),
-	    percentageOfDefaultGRVQueueProcessed(0), percentageOfBatchGRVQueueProcessed(0),
-	    defaultTxnGRVTimeInQueue("DefaultTxnGRVTimeInQueue",
-	                             id,
-	                             SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
-	                             SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
+	    updatesFromRatekeeper("UpdatesFromRatekeeper", cc), leaseTimeouts("LeaseTimeouts", cc), systemGRVQueueSize(0),
+	    defaultGRVQueueSize(0), batchGRVQueueSize(0), transactionRateAllowed(0), batchTransactionRateAllowed(0),
+	    transactionLimit(0), batchTransactionLimit(0), percentageOfDefaultGRVQueueProcessed(0),
+	    percentageOfBatchGRVQueueProcessed(0), defaultTxnGRVTimeInQueue("DefaultTxnGRVTimeInQueue",
+	                                                                   id,
+	                                                                   SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
+	                                                                   SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
	    batchTxnGRVTimeInQueue("BatchTxnGRVTimeInQueue",
	                           id,
	                           SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
@ -109,6 +112,9 @@ struct GrvProxyStats {
	                           SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
	    grvLatencyBands("GRVLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY) {
		// The rate at which the limit (budget) is allowed to grow.
+		specialCounter(cc, "SystemGRVQueueSize", [this]() { return this->systemGRVQueueSize; });
+		specialCounter(cc, "DefaultGRVQueueSize", [this]() { return this->defaultGRVQueueSize; });
+		specialCounter(cc, "BatchGRVQueueSize", [this]() { return this->batchGRVQueueSize; });
		specialCounter(
		    cc, "SystemAndDefaultTxnRateAllowed", [this]() { return int64_t(this->transactionRateAllowed); });
		specialCounter(
@ -267,6 +273,7 @@ ACTOR Future<Void> healthMetricsRequestServer(GrvProxyInterface grvProxy,
	}
}

+// Get transaction rate info from RateKeeper.
ACTOR Future<Void> getRate(UID myID,
                           Reference<AsyncVar<ServerDBInfo>> db,
                           int64_t* inTransactionCount,
@ -320,6 +327,7 @@ ACTOR Future<Void> getRate(UID myID,
			batchTransactionRateInfo->setRate(rep.batchTransactionRate);
			stats->transactionRateAllowed = rep.transactionRate;
			stats->batchTransactionRateAllowed = rep.batchTransactionRate;
+			++stats->updatesFromRatekeeper;
			//TraceEvent("GrvProxyRate", myID).detail("Rate", rep.transactionRate).detail("BatchRate", rep.batchTransactionRate).detail("Lease", rep.leaseDuration).detail("ReleasedTransactions", *inTransactionCount - lastTC);
			lastTC = *inTransactionCount;
			leaseTimeout = delay(rep.leaseDuration);
@ -339,6 +347,7 @@ ACTOR Future<Void> getRate(UID myID,
		when(wait(leaseTimeout)) {
			transactionRateInfo->disable();
			batchTransactionRateInfo->disable();
+			++stats->leaseTimeouts;
			TraceEvent(SevWarn, "GrvProxyRateLeaseExpired", myID).suppressFor(5.0);
			//TraceEvent("GrvProxyRate", myID).detail("Rate", 0.0).detail("BatchRate", 0.0).detail("Lease", 0);
			leaseTimeout = Never();
@ -390,14 +399,17 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>>
		} else if (req.priority == TransactionPriority::DEFAULT) {
			if (!batchQueue->empty()) {
				dropRequestFromQueue(batchQueue, stats);
+				--stats->batchGRVQueueSize;
			} else {
				canBeQueued = false;
			}
		} else {
			if (!batchQueue->empty()) {
				dropRequestFromQueue(batchQueue, stats);
+				--stats->batchGRVQueueSize;
			} else if (!defaultQueue->empty()) {
				dropRequestFromQueue(defaultQueue, stats);
+				--stats->defaultGRVQueueSize;
			} else {
				canBeQueued = false;
			}
@ -427,12 +439,14 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>>
			++stats->txnRequestIn;
			stats->txnStartIn += req.transactionCount;
			stats->txnSystemPriorityStartIn += req.transactionCount;
+			++stats->systemGRVQueueSize;
			systemQueue->push_back(req);
			systemQueue->span.addParent(req.spanContext);
		} else if (req.priority >= TransactionPriority::DEFAULT) {
			++stats->txnRequestIn;
			stats->txnStartIn += req.transactionCount;
			stats->txnDefaultPriorityStartIn += req.transactionCount;
+			++stats->defaultGRVQueueSize;
			defaultQueue->push_back(req);
			defaultQueue->span.addParent(req.spanContext);
		} else {
@ -445,6 +459,7 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>>
			++stats->txnRequestIn;
			stats->txnStartIn += req.transactionCount;
			stats->txnBatchPriorityStartIn += req.transactionCount;
+			++stats->batchGRVQueueSize;
			batchQueue->push_back(req);
			batchQueue->span.addParent(req.spanContext);
		}
@ -791,12 +806,15 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
			double currentTime = g_network->timer();
			if (req.priority >= TransactionPriority::IMMEDIATE) {
				systemTransactionsStarted[req.flags & 1] += tc;
+				--grvProxyData->stats.systemGRVQueueSize;
			} else if (req.priority >= TransactionPriority::DEFAULT) {
				defaultPriTransactionsStarted[req.flags & 1] += tc;
				grvProxyData->stats.defaultTxnGRVTimeInQueue.addMeasurement(currentTime - req.requestTime());
+				--grvProxyData->stats.defaultGRVQueueSize;
			} else {
				batchPriTransactionsStarted[req.flags & 1] += tc;
				grvProxyData->stats.batchTxnGRVTimeInQueue.addMeasurement(currentTime - req.requestTime());
+				--grvProxyData->stats.batchGRVQueueSize;
			}

			start[req.flags & 1].push_back(std::move(req));
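The queue-size gauges threaded through the GRV proxy changes above follow one invariant: increment on enqueue, decrement on drop or start, so the gauge always equals the queue length. A toy sketch of that bookkeeping (plain C++; the request type and names are illustrative, not the GrvProxy types):

#include <cassert>
#include <cstdio>
#include <deque>

struct Req { int txnCount; };

int main() {
	std::deque<Req> batchQueue;
	int batchGRVQueueSize = 0; // the gauge exported via specialCounter in GrvProxyStats

	auto enqueue = [&](Req r) { batchQueue.push_back(r); ++batchGRVQueueSize; };
	auto dequeue = [&]() { batchQueue.pop_front(); --batchGRVQueueSize; };

	enqueue({ 10 });
	enqueue({ 5 });
	dequeue(); // a request is started (or dropped under pressure): the gauge moves with the queue
	assert(batchGRVQueueSize == (int)batchQueue.size());
	std::printf("batchGRVQueueSize=%d\n", batchGRVQueueSize);
}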
@ -28,6 +28,7 @@

struct NetworkTestInterface {
	RequestStream<struct NetworkTestRequest> test;
+	RequestStream<struct NetworkTestStreamingRequest> testStream;
	NetworkTestInterface() {}
	NetworkTestInterface(NetworkAddress remote);
	NetworkTestInterface(INetwork* local);
@ -57,6 +58,29 @@ struct NetworkTestRequest {
	}
};

struct NetworkTestStreamingReply : ReplyPromiseStreamReply {
	constexpr static FileIdentifier file_identifier = 3726830;

	int index = 0;
	NetworkTestStreamingReply() = default;
	explicit NetworkTestStreamingReply(int index) : index(index) {}
	size_t expectedSize() const { return 4e6; /*sizeof(*this);*/ }

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, index);
	}
};

struct NetworkTestStreamingRequest {
	constexpr static FileIdentifier file_identifier = 2794452;
	ReplyPromiseStream<struct NetworkTestStreamingReply> reply;
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, reply);
	}
};

Future<Void> networkTestServer();

Future<Void> networkTestClient(std::string const& testServers);
@ -1,78 +0,0 @@
/*
 * Orderer.actor.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ORDERER_ACTOR_G_H)
#define FDBSERVER_ORDERER_ACTOR_G_H
#include "fdbserver/Orderer.actor.g.h"
#elif !defined(FDBSERVER_ORDERER_ACTOR_H)
#define FDBSERVER_ORDERER_ACTOR_H

#include "fdbclient/Notified.h"
#include "flow/actorcompiler.h" // This must be the last #include.

template <class Seq>
class Orderer {
public:
	explicit Orderer(Seq s) : ready(s), started(false) {}
	void reset(Seq s) {
		ready = NotifiedVersion(s);
		started = false;
	}
	Future<bool> order(Seq s, TaskPriority taskID = TaskPriority::DefaultYield) {
		if (ready.get() < s)
			return waitAndOrder(this, s, taskID);
		else
			return dedup(s);
	}
	void complete(Seq s) {
		ASSERT(s == ready.get() && started);
		started = false;
		ready.set(s + 1);
	}
	Seq getNextSequence() {
		return ready.get();
	} // Returns the next sequence number which has *not* been returned from order()
	Future<Void> whenNextSequenceAtLeast(Seq v) { return ready.whenAtLeast(v); }

private:
	ACTOR static Future<bool> waitAndOrder(Orderer<Seq>* self, Seq s, TaskPriority taskID) {
		wait(self->ready.whenAtLeast(s));
		wait(yield(taskID) || self->shutdown.getFuture());
		return self->dedup(s);
	}
	bool dedup(Seq s) {
		if (s != ready.get() || started)
			return false;
		started = true;
		return true;
	}

	bool started;
	NotifiedVersion ready; // FIXME: Notified<Seq>
	Promise<Void> shutdown; // Never set, only broken on destruction
};

#include "flow/unactorcompiler.h"

#endif
@ -18,18 +18,19 @@
 * limitations under the License.
 */

-#include "flow/ActorCollection.h"
#include "fdbclient/NativeAPI.actor.h"
-#include "fdbserver/ConflictSet.h"
-#include "fdbserver/ResolverInterface.h"
-#include "fdbserver/MasterInterface.h"
-#include "fdbserver/WorkerInterface.actor.h"
-#include "fdbserver/WaitFailure.h"
-#include "fdbserver/Knobs.h"
-#include "fdbserver/ServerDBInfo.h"
-#include "fdbserver/Orderer.actor.h"
-#include "fdbserver/StorageMetrics.h"
#include "fdbclient/Notified.h"
+#include "fdbclient/SystemData.h"
+#include "fdbserver/ConflictSet.h"
+#include "fdbserver/Knobs.h"
+#include "fdbserver/MasterInterface.h"
+#include "fdbserver/ResolverInterface.h"
+#include "fdbserver/ServerDBInfo.h"
+#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/WaitFailure.h"
+#include "fdbserver/WorkerInterface.actor.h"
+#include "flow/ActorCollection.h"

#include "flow/actorcompiler.h" // This must be the last #include.

namespace {
@ -293,7 +293,7 @@ class SimpleConfigDatabaseNodeImpl {
			for (const auto& kv : snapshot) {
				auto configKey =
				    BinaryReader::fromStringRef<ConfigKey>(kv.key.removePrefix(kvKeys.begin), IncludeVersion());
-				if (configKey.configClass.castTo<Key>() == req.configClass) {
+				if (configKey.configClass.template castTo<Key>() == req.configClass) {
					knobSet.insert(configKey.knobName);
				}
			}
@ -301,7 +301,7 @@ class SimpleConfigDatabaseNodeImpl {
			state Standalone<VectorRef<VersionedConfigMutationRef>> mutations =
			    wait(getMutations(self, lastCompactedVersion + 1, req.version));
			for (const auto& versionedMutation : mutations) {
-				if (versionedMutation.mutation.getConfigClass().castTo<Key>() == req.configClass) {
+				if (versionedMutation.mutation.getConfigClass().template castTo<Key>() == req.configClass) {
					if (versionedMutation.mutation.isSet()) {
						knobSet.insert(versionedMutation.mutation.getKnobName());
					} else {
@ -1128,6 +1128,7 @@ ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>>* systemActors,
struct SimulationConfig {
	explicit SimulationConfig(const TestConfig& testConfig);
	int extraDB;
+	bool generateFearless;

	DatabaseConfiguration db;
@ -1135,11 +1136,23 @@ struct SimulationConfig {

	// Simulation layout
	int datacenters;
+	int replication_type;
	int machine_count; // Total, not per DC.
	int processes_per_machine;
	int coordinators;

private:
+	void setRandomConfig();
+	void setSimpleConfig();
+	void setSpecificConfig(const TestConfig& testConfig);
+	void setDatacenters(const TestConfig& testConfig);
+	void setStorageEngine(const TestConfig& testConfig);
+	void setRegions(const TestConfig& testConfig);
+	void setReplicationType(const TestConfig& testConfig);
+	void setMachineCount(const TestConfig& testConfig);
+	void setCoordinators(const TestConfig& testConfig);
+	void setProcessesPerMachine(const TestConfig& testConfig);
+	void setTss(const TestConfig& testConfig);
	void generateNormalConfig(const TestConfig& testConfig);
};
@ -1159,16 +1172,68 @@ void SimulationConfig::set_config(std::string config) {
StringRef StringRefOf(const char* s) {
	return StringRef((uint8_t*)s, strlen(s));
}
-// Generates and sets an appropriate configuration for the database according to
-// the provided testConfig. Some attributes are randomly generated for more coverage
-// of different combinations
-void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
-	set_config("new");
-	// generateMachineTeamTestConfig set up the number of servers per machine and the number of machines such that
-	// if we do not remove the surplus server and machine teams, the simulation test will report error.
-	// This is needed to make sure the number of server (and machine) teams is no larger than the desired number.
-	bool generateMachineTeamTestConfig = BUGGIFY_WITH_PROB(0.1) ? true : false;
-	bool generateFearless =
+
+// Set the randomly generated options of the config. Compiled here to easily observe and trace random options
+void SimulationConfig::setRandomConfig() {
+	if (deterministicRandom()->random01() < 0.25) {
+		db.desiredTLogCount = deterministicRandom()->randomInt(1, 7);
+	}
+	if (deterministicRandom()->random01() < 0.25) {
+		db.commitProxyCount = deterministicRandom()->randomInt(1, 7);
+	}
+	if (deterministicRandom()->random01() < 0.25) {
+		db.grvProxyCount = deterministicRandom()->randomInt(1, 4);
+	}
+	if (deterministicRandom()->random01() < 0.25) {
+		db.resolverCount = deterministicRandom()->randomInt(1, 7);
+	}
+	// TraceEvent("SimulatedConfigRandom")
+	//     .detail("DesiredTLogCount", db.desiredTLogCount)
+	//     .detail("CommitProxyCount", db.commitProxyCount)
+	//     .detail("GRVProxyCount", db.grvProxyCount)
+	//     .detail("ResolverCount", db.resolverCount);
+
+	if (deterministicRandom()->random01() < 0.5) {
+		// TraceEvent("SimulatedConfigRandom").detail("PerpetualWiggle", 0);
+		set_config("perpetual_storage_wiggle=0");
+	} else {
+		// TraceEvent("SimulatedConfigRandom").detail("PerpetualWiggle", 1);
+		set_config("perpetual_storage_wiggle=1");
+	}
+
+	if (deterministicRandom()->random01() < 0.5) {
+		set_config("backup_worker_enabled:=1");
+	}
+}
+
+// Overwrite DB with simple options, used when simpleConfig is true in the TestConfig
+void SimulationConfig::setSimpleConfig() {
+	db.desiredTLogCount = 1;
+	db.commitProxyCount = 1;
+	db.grvProxyCount = 1;
+	db.resolverCount = 1;
+}
+
+// Overwrite previous options with ones specified by TestConfig
+void SimulationConfig::setSpecificConfig(const TestConfig& testConfig) {
+	if (testConfig.desiredTLogCount.present()) {
+		db.desiredTLogCount = testConfig.desiredTLogCount.get();
+	}
+	if (testConfig.commitProxyCount.present()) {
+		db.commitProxyCount = testConfig.commitProxyCount.get();
+	}
+	if (testConfig.grvProxyCount.present()) {
+		db.grvProxyCount = testConfig.grvProxyCount.get();
+	}
+	if (testConfig.resolverCount.present()) {
+		db.resolverCount = testConfig.resolverCount.get();
+	}
+}
+
+// Sets generateFearless and number of dataCenters based on testConfig details
+// The number of datacenters may be overwritten in setRegions
+void SimulationConfig::setDatacenters(const TestConfig& testConfig) {
+	generateFearless =
+	    testConfig.simpleConfig ? false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5);
+	if (testConfig.generateFearless.present()) {
+		// overwrite whatever decision we made before
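A note on why the randomization above is safe to test against: the simulator's RNG is deterministic, so one seed always reproduces the same generated configuration. A small sketch of the same idea using a seeded standard RNG (an analogue for illustration, not FDB's deterministicRandom()):

#include <cstdio>
#include <random>

int main() {
	std::mt19937 rng(42); // a fixed seed makes every run generate the identical "random" config
	std::uniform_real_distribution<double> real01(0.0, 1.0);

	int tlogs = 3, resolvers = 1; // defaults unless the 25% branches fire
	if (real01(rng) < 0.25)
		tlogs = std::uniform_int_distribution<int>(1, 6)(rng); // like randomInt(1, 7)
	if (real01(rng) < 0.25)
		resolvers = std::uniform_int_distribution<int>(1, 6)(rng);
	bool wiggle = real01(rng) < 0.5; // mirrors the 50/50 perpetual_storage_wiggle choice

	std::printf("tlogs=%d resolvers=%d perpetual_storage_wiggle=%d\n", tlogs, resolvers, wiggle ? 1 : 0);
}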
@ -1179,32 +1244,15 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
	    ? 1
	    : (generateFearless ? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6)
	                        : deterministicRandom()->randomInt(1, 4));

+	// Overwrite with specific option if present
	if (testConfig.datacenters.present()) {
		datacenters = testConfig.datacenters.get();
	}
-	if (testConfig.desiredTLogCount.present()) {
-		db.desiredTLogCount = testConfig.desiredTLogCount.get();
-	} else if (deterministicRandom()->random01() < 0.25) {
-		db.desiredTLogCount = deterministicRandom()->randomInt(1, 7);
-	}
-
-	if (testConfig.commitProxyCount.present()) {
-		db.commitProxyCount = testConfig.commitProxyCount.get();
-	} else if (deterministicRandom()->random01() < 0.25) {
-		db.commitProxyCount = deterministicRandom()->randomInt(1, 7);
-	}
-
-	if (testConfig.grvProxyCount.present()) {
-		db.grvProxyCount = testConfig.grvProxyCount.get();
-	} else if (deterministicRandom()->random01() < 0.25) {
-		db.grvProxyCount = deterministicRandom()->randomInt(1, 4);
-	}
-
-	if (testConfig.resolverCount.present()) {
-		db.resolverCount = testConfig.resolverCount.get();
-	} else if (deterministicRandom()->random01() < 0.25) {
-		db.resolverCount = deterministicRandom()->randomInt(1, 7);
-	}
+}

+// Sets storage engine based on testConfig details
+void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
+	int storage_engine_type = deterministicRandom()->randomInt(0, 4);
	if (testConfig.storageEngineType.present()) {
		storage_engine_type = testConfig.storageEngineType.get();
@ -1241,38 +1289,15 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
	default:
		ASSERT(false); // Programmer forgot to adjust cases.
	}
}

-	int tssCount = 0;
-	if (!testConfig.simpleConfig && !testConfig.disableTss && deterministicRandom()->random01() < 0.25) {
-		// 1 or 2 tss
-		tssCount = deterministicRandom()->randomInt(1, 3);
-	}
-
-	// if (deterministicRandom()->random01() < 0.5) {
-	// 	set_config("ssd");
-	// } else {
-	// 	set_config("memory");
-	// }
-	// set_config("memory");
-	// set_config("memory-radixtree-beta");
-
-	if (deterministicRandom()->random01() < 0.5) {
-		set_config("perpetual_storage_wiggle=0");
-	} else {
-		set_config("perpetual_storage_wiggle=1");
-	}
-	// set_config("perpetual_storage_wiggle=1");
-	if (testConfig.simpleConfig) {
-		db.desiredTLogCount = 1;
-		db.commitProxyCount = 1;
-		db.grvProxyCount = 1;
-		db.resolverCount = 1;
-	}
-	int replication_type = testConfig.simpleConfig
-	    ? 1
-	    : (std::max(testConfig.minimumReplication,
-	                datacenters > 4 ? deterministicRandom()->randomInt(1, 3)
-	                                : std::min(deterministicRandom()->randomInt(0, 6), 3)));
+// Sets replication type and TLogSpillType and Version
+void SimulationConfig::setReplicationType(const TestConfig& testConfig) {
+	replication_type = testConfig.simpleConfig
+	    ? 1
+	    : (std::max(testConfig.minimumReplication,
+	                datacenters > 4 ? deterministicRandom()->randomInt(1, 3)
+	                                : std::min(deterministicRandom()->randomInt(0, 6), 3)));
	if (testConfig.config.present()) {
		set_config(testConfig.config.get());
	} else {
@ -1333,211 +1358,213 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
|
|||
if (deterministicRandom()->random01() < 0.5)
|
||||
set_config(format("log_spill:=%d", TLogSpillType::DEFAULT));
|
||||
}
|
||||
|
||||
if (deterministicRandom()->random01() < 0.5) {
|
||||
set_config("backup_worker_enabled:=1");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set the regions of the config, including the primary and remote options
// This will also determine the replication types used for satellite and remote.
void SimulationConfig::setRegions(const TestConfig& testConfig) {
    // The kill region workload relies on the fact that all "0", "2", and "4" are all of the possible primary dcids.
    StatusObject primaryObj;
    StatusObject primaryDcObj;
    primaryDcObj["id"] = "0";
    primaryDcObj["priority"] = 2;
    StatusArray primaryDcArr;
    primaryDcArr.push_back(primaryDcObj);

    StatusObject remoteObj;
    StatusObject remoteDcObj;
    remoteDcObj["id"] = "1";
    remoteDcObj["priority"] = 1;
    StatusArray remoteDcArr;
    remoteDcArr.push_back(remoteDcObj);

    bool needsRemote = generateFearless;
    if (generateFearless) {
        if (datacenters > 4) {
            // FIXME: we cannot use one satellite replication with more than one satellite per region because
            // canKillProcesses does not respect usable_dcs
            int satellite_replication_type = deterministicRandom()->randomInt(0, 3);
            switch (satellite_replication_type) {
            case 0: {
                TEST(true); // Simulated cluster using no satellite redundancy mode (>4 datacenters)
                break;
            }
            case 1: {
                TEST(true); // Simulated cluster using two satellite fast redundancy mode
                primaryObj["satellite_redundancy_mode"] = "two_satellite_fast";
                remoteObj["satellite_redundancy_mode"] = "two_satellite_fast";
                break;
            }
            case 2: {
                TEST(true); // Simulated cluster using two satellite safe redundancy mode
                primaryObj["satellite_redundancy_mode"] = "two_satellite_safe";
                remoteObj["satellite_redundancy_mode"] = "two_satellite_safe";
                break;
            }
            default:
                ASSERT(false); // Programmer forgot to adjust cases.
            }
        } else {
            int satellite_replication_type = deterministicRandom()->randomInt(0, 5);
            switch (satellite_replication_type) {
            case 0: {
                // FIXME: implement
                TEST(true); // Simulated cluster using custom satellite redundancy mode
                break;
            }
            case 1: {
                TEST(true); // Simulated cluster using no satellite redundancy mode (<4 datacenters)
                break;
            }
            case 2: {
                TEST(true); // Simulated cluster using single satellite redundancy mode
                primaryObj["satellite_redundancy_mode"] = "one_satellite_single";
                remoteObj["satellite_redundancy_mode"] = "one_satellite_single";
                break;
            }
            case 3: {
                TEST(true); // Simulated cluster using double satellite redundancy mode
                primaryObj["satellite_redundancy_mode"] = "one_satellite_double";
                remoteObj["satellite_redundancy_mode"] = "one_satellite_double";
                break;
            }
            case 4: {
                TEST(true); // Simulated cluster using triple satellite redundancy mode
                primaryObj["satellite_redundancy_mode"] = "one_satellite_triple";
                remoteObj["satellite_redundancy_mode"] = "one_satellite_triple";
                break;
            }
            default:
                ASSERT(false); // Programmer forgot to adjust cases.
            }
        }

        if (deterministicRandom()->random01() < 0.25)
            primaryObj["satellite_logs"] = deterministicRandom()->randomInt(1, 7);
        if (deterministicRandom()->random01() < 0.25)
            remoteObj["satellite_logs"] = deterministicRandom()->randomInt(1, 7);

        // We cannot run with a remote DC when MAX_READ_TRANSACTION_LIFE_VERSIONS is too small, because the log
        // routers will not be able to keep up.
        if (testConfig.minimumRegions <= 1 &&
            (deterministicRandom()->random01() < 0.25 ||
             SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < SERVER_KNOBS->VERSIONS_PER_SECOND)) {
            TEST(true); // Simulated cluster using one region
            needsRemote = false;
        } else {
            TEST(true); // Simulated cluster using two regions
            db.usableRegions = 2;
        }

        int remote_replication_type = deterministicRandom()->randomInt(0, datacenters > 4 ? 4 : 5);
        switch (remote_replication_type) {
        case 0: {
            // FIXME: implement
            TEST(true); // Simulated cluster using custom remote redundancy mode
            break;
        }
        case 1: {
            TEST(true); // Simulated cluster using default remote redundancy mode
            break;
        }
        case 2: {
            TEST(true); // Simulated cluster using single remote redundancy mode
            set_config("remote_single");
            break;
        }
        case 3: {
            TEST(true); // Simulated cluster using double remote redundancy mode
            set_config("remote_double");
            break;
        }
        case 4: {
            TEST(true); // Simulated cluster using triple remote redundancy mode
            set_config("remote_triple");
            break;
        }
        default:
            ASSERT(false); // Programmer forgot to adjust cases.
        }

        if (deterministicRandom()->random01() < 0.25)
            db.desiredLogRouterCount = deterministicRandom()->randomInt(1, 7);
        if (deterministicRandom()->random01() < 0.25)
            db.remoteDesiredTLogCount = deterministicRandom()->randomInt(1, 7);

        bool useNormalDCsAsSatellites =
            datacenters > 4 && testConfig.minimumRegions < 2 && deterministicRandom()->random01() < 0.3;
        StatusObject primarySatelliteObj;
        primarySatelliteObj["id"] = useNormalDCsAsSatellites ? "1" : "2";
        primarySatelliteObj["priority"] = 1;
        primarySatelliteObj["satellite"] = 1;
        if (deterministicRandom()->random01() < 0.25)
            primarySatelliteObj["satellite_logs"] = deterministicRandom()->randomInt(1, 7);
        primaryDcArr.push_back(primarySatelliteObj);

        StatusObject remoteSatelliteObj;
        remoteSatelliteObj["id"] = useNormalDCsAsSatellites ? "0" : "3";
        remoteSatelliteObj["priority"] = 1;
        remoteSatelliteObj["satellite"] = 1;
        if (deterministicRandom()->random01() < 0.25)
            remoteSatelliteObj["satellite_logs"] = deterministicRandom()->randomInt(1, 7);
        remoteDcArr.push_back(remoteSatelliteObj);

        if (datacenters > 4) {
            StatusObject primarySatelliteObjB;
            primarySatelliteObjB["id"] = useNormalDCsAsSatellites ? "2" : "4";
            primarySatelliteObjB["priority"] = 1;
            primarySatelliteObjB["satellite"] = 1;
            if (deterministicRandom()->random01() < 0.25)
                primarySatelliteObjB["satellite_logs"] = deterministicRandom()->randomInt(1, 7);
            primaryDcArr.push_back(primarySatelliteObjB);

            StatusObject remoteSatelliteObjB;
            remoteSatelliteObjB["id"] = useNormalDCsAsSatellites ? "2" : "5";
            remoteSatelliteObjB["priority"] = 1;
            remoteSatelliteObjB["satellite"] = 1;
            if (deterministicRandom()->random01() < 0.25)
                remoteSatelliteObjB["satellite_logs"] = deterministicRandom()->randomInt(1, 7);
            remoteDcArr.push_back(remoteSatelliteObjB);
        }
        if (useNormalDCsAsSatellites) {
            datacenters = 3;
        }
    }

    primaryObj["datacenters"] = primaryDcArr;
    remoteObj["datacenters"] = remoteDcArr;

    StatusArray regionArr;
    regionArr.push_back(primaryObj);
    if (needsRemote || deterministicRandom()->random01() < 0.5) {
        regionArr.push_back(remoteObj);
    }

    if (needsRemote) {
        g_simulator.originalRegions =
            "regions=" + json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none);

        StatusArray disablePrimary = regionArr;
        disablePrimary[0].get_obj()["datacenters"].get_array()[0].get_obj()["priority"] = -1;
        g_simulator.disablePrimary = "regions=" + json_spirit::write_string(json_spirit::mValue(disablePrimary),
                                                                            json_spirit::Output_options::none);

        StatusArray disableRemote = regionArr;
        disableRemote[1].get_obj()["datacenters"].get_array()[0].get_obj()["priority"] = -1;
        g_simulator.disableRemote = "regions=" + json_spirit::write_string(json_spirit::mValue(disableRemote),
                                                                           json_spirit::Output_options::none);
    } else {
        // In order to generate a starting configuration with the remote disabled, do not apply the region
        // configuration to the DatabaseConfiguration until after creating the starting conf string.
        set_config("regions=" +
                   json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none));
    }
}
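For reference, one possible value of the "regions=" string built above — an illustrative sample of a single random outcome, spelled with the field names used in this function; actual runs vary:

regions=[{"datacenters":[{"id":"0","priority":2},{"id":"2","priority":1,"satellite":1}],
          "satellite_redundancy_mode":"one_satellite_double"},
         {"datacenters":[{"id":"1","priority":1},{"id":"3","priority":1,"satellite":1}]}]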
// Sets the machine count based on the testConfig. May be overwritten later
// if the end result is not a viable config.
void SimulationConfig::setMachineCount(const TestConfig& testConfig) {
    if (testConfig.machineCount.present()) {
        machine_count = testConfig.machineCount.get();
    } else if (generateFearless && testConfig.minimumReplication > 1) {

@@ -1554,6 +1581,10 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
                               ((db.minDatacentersRequired() > 0) ? datacenters : 1) *
                                   std::max(3, db.minZonesRequiredPerDatacenter()));
    machine_count = deterministicRandom()->randomInt(machine_count, std::max(machine_count + 1, extraDB ? 6 : 10));
    // generateMachineTeamTestConfig set up the number of servers per machine and the number of machines such that
    // if we do not remove the surplus server and machine teams, the simulation test will report error.
    // This is needed to make sure the number of server (and machine) teams is no larger than the desired number.
    bool generateMachineTeamTestConfig = BUGGIFY_WITH_PROB(0.1) ? true : false;
    if (generateMachineTeamTestConfig) {
        // When DESIRED_TEAMS_PER_SERVER is set to 1, the desired machine team number is 5
        // while the max possible machine team number is 10.
@@ -1562,7 +1593,11 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
        machine_count = std::max(machine_count, deterministicRandom()->randomInt(5, extraDB ? 6 : 10));
        }
    }
}

// Sets the coordinator count based on the testConfig. May be overwritten later
// if the end result is not a viable config.
void SimulationConfig::setCoordinators(const TestConfig& testConfig) {
    if (testConfig.coordinators.present()) {
        coordinators = testConfig.coordinators.get();
    } else {

@@ -1572,14 +1607,10 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
                   ? deterministicRandom()->randomInt(1, std::max(machine_count, 2))
                   : 1;
    }
}

    if (testConfig.minimumReplication > 1 && datacenters == 3) {
        // low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for
        // logs to recover.
        machine_count = std::max(machine_count, 6);
        coordinators = 3;
    }

// Sets the processes per machine based on the testConfig.
void SimulationConfig::setProcessesPerMachine(const TestConfig& testConfig) {
    if (testConfig.processesPerMachine.present()) {
        processes_per_machine = testConfig.processesPerMachine.get();
    } else if (generateFearless) {
@@ -1587,6 +1618,16 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
    } else {
        processes_per_machine = deterministicRandom()->randomInt(1, (extraDB ? 14 : 28) / machine_count + 2);
    }
}

// Sets the TSS configuration based on the testConfig.
// Also configures the cluster behaviour through setting some flags on the simulator.
void SimulationConfig::setTss(const TestConfig& testConfig) {
    int tssCount = 0;
    if (!testConfig.simpleConfig && !testConfig.disableTss && deterministicRandom()->random01() < 0.25) {
        // 1 or 2 tss
        tssCount = deterministicRandom()->randomInt(1, 3);
    }

    // reduce tss to half of extra non-seed servers that can be recruited in usable regions.
    tssCount =
@@ -1611,6 +1652,43 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
    }
}

// Generates and sets an appropriate configuration for the database according to
// the provided testConfig. Some attributes are randomly generated for more coverage
// of different combinations
void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
    set_config("new");
    // Some of these options will overwrite one another so the ordering is important.
    // This is a bit inefficient but separates the different types of option setting paths for better readability.
    setDatacenters(testConfig);

    // These 3 sets will only change the settings with trivial logic and low coupling with
    // other portions of the configuration. The parameters that are more involved and use
    // complex logic will be found in their respective "set----" methods following after.
    setRandomConfig();
    if (testConfig.simpleConfig) {
        setSimpleConfig();
    }
    setSpecificConfig(testConfig);

    setStorageEngine(testConfig);
    setReplicationType(testConfig);
    if (generateFearless || (datacenters == 2 && deterministicRandom()->random01() < 0.5)) {
        setRegions(testConfig);
    }
    setMachineCount(testConfig);
    setCoordinators(testConfig);

    if (testConfig.minimumReplication > 1 && datacenters == 3) {
        // low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for
        // logs to recover.
        machine_count = std::max(machine_count, 6);
        coordinators = 3;
    }

    setProcessesPerMachine(testConfig);
    setTss(testConfig);
}

// Configures the system according to the given specifications in order to run
// simulation under the correct conditions
void setupSimulatedSystem(vector<Future<Void>>* systemActors,
@@ -92,6 +92,48 @@ ACTOR Future<Void> networkTestServer() {
    }
}

ACTOR Future<Void> networkTestStreamingServer() {
    state NetworkTestInterface interf(g_network);
    state Future<Void> logging = delay(1.0);
    state double lastTime = now();
    state int sent = 0;
    state LatencyStats latency;

    loop {
        try {
            choose {
                when(state NetworkTestStreamingRequest req = waitNext(interf.testStream.getFuture())) {
                    state LatencyStats::sample sample = latency.tick();
                    state int i = 0;
                    for (; i < 100; ++i) {
                        wait(req.reply.onReady());
                        req.reply.send(NetworkTestStreamingReply{ i });
                    }
                    req.reply.sendError(end_of_stream());
                    latency.tock(sample);
                    sent++;
                }
                when(wait(logging)) {
                    auto spd = sent / (now() - lastTime);
                    if (FLOW_KNOBS->NETWORK_TEST_SCRIPT_MODE) {
                        fprintf(stderr, "%f\t%.3f\t%.3f\n", spd, latency.mean() * 1e6, latency.stddev() * 1e6);
                    } else {
                        fprintf(stderr, "responses per second: %f (%f us)\n", spd, latency.mean() * 1e6);
                    }
                    latency.reset();
                    lastTime = now();
                    sent = 0;
                    logging = delay(1.0);
                }
            }
        } catch (Error& e) {
            if (e.code() != error_code_operation_obsolete) {
                throw e;
            }
        }
    }
}

static bool moreRequestsPending(int count) {
    if (count == -1) {
        return false;
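The actor above is the server half of the streaming test. A stripped-down sketch of the core reply pattern it uses — a hypothetical helper, assuming the same flow primitives shown in this diff; onReady() provides backpressure before each send, and end_of_stream() signals normal termination rather than a failure:

ACTOR Future<Void> streamReplies(ReplyPromiseStream<NetworkTestStreamingReply> reply, int n) {
    state int i = 0;
    for (; i < n; ++i) {
        wait(reply.onReady()); // backpressure: wait until the peer has buffer space
        reply.send(NetworkTestStreamingReply{ i });
    }
    reply.sendError(end_of_stream()); // terminate the stream cleanly
    return Void();
}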
@@ -128,6 +170,32 @@ ACTOR Future<Void> testClient(std::vector<NetworkTestInterface> interfs,
    return Void();
}

ACTOR Future<Void> testClientStream(std::vector<NetworkTestInterface> interfs, int* sent, int* completed,
                                    LatencyStats* latency) {
    state std::string request_payload(FLOW_KNOBS->NETWORK_TEST_REQUEST_SIZE, '.');
    state LatencyStats::sample sample;

    while (moreRequestsPending(*sent)) {
        (*sent)++;
        sample = latency->tick();
        state ReplyPromiseStream<NetworkTestStreamingReply> stream =
            interfs[deterministicRandom()->randomInt(0, interfs.size())].testStream.getReplyStream(
                NetworkTestStreamingRequest{});
        state int j = 0;
        try {
            loop {
                NetworkTestStreamingReply rep = waitNext(stream.getFuture());
                ASSERT(rep.index == j++);
            }
        } catch (Error& e) {
            ASSERT(e.code() == error_code_end_of_stream || e.code() == error_code_connection_failed);
        }
        latency->tock(sample);
        (*completed)++;
    }
    return Void();
}

ACTOR Future<Void> logger(int* sent, int* completed, LatencyStats* latency) {
    state double lastTime = now();
    state int logged = 0;
@@ -639,6 +639,8 @@ public:

    FlowLock durableVersionLock;
    FlowLock fetchKeysParallelismLock;
    int64_t fetchKeysBytesBudget;
    AsyncVar<bool> fetchKeysBudgetUsed;
    vector<Promise<FetchInjectionInfo*>> readyFetchKeys;

    int64_t instanceID;

@@ -730,8 +732,8 @@ public:

    struct Counters {
        CounterCollection cc;
        Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, finishedQueries, lowPriorityQueries,
            rowsQueried, bytesQueried, watchQueries, emptyQueries;
        Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, getRangeStreamQueries, finishedQueries,
            lowPriorityQueries, rowsQueried, bytesQueried, watchQueries, emptyQueries;

        // Bytes of the mutations that have been added to the memory of the storage server. When the data is durable
        // and cleared from the memory, we do not subtract it but add it to bytesDurable.

@@ -764,21 +766,22 @@ public:
        Counters(StorageServer* self)
          : cc("StorageServer", self->thisServerID.toString()), getKeyQueries("GetKeyQueries", cc),
            getValueQueries("GetValueQueries", cc), getRangeQueries("GetRangeQueries", cc),
            allQueries("QueryQueue", cc), finishedQueries("FinishedQueries", cc),
            lowPriorityQueries("LowPriorityQueries", cc), rowsQueried("RowsQueried", cc),
            bytesQueried("BytesQueried", cc), watchQueries("WatchQueries", cc), emptyQueries("EmptyQueries", cc),
            bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), bytesFetched("BytesFetched", cc),
            mutationBytes("MutationBytes", cc), sampledBytesCleared("SampledBytesCleared", cc),
            kvFetched("KVFetched", cc), mutations("Mutations", cc), setMutations("SetMutations", cc),
            clearRangeMutations("ClearRangeMutations", cc), atomicMutations("AtomicMutations", cc),
            updateBatches("UpdateBatches", cc), updateVersions("UpdateVersions", cc), loops("Loops", cc),
            fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc),
            fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc),
            readsRejected("ReadsRejected", cc), fetchedVersions("FetchedVersions", cc),
            fetchesFromLogs("FetchesFromLogs", cc), readLatencySample("ReadLatencyMetrics",
                                                                      self->thisServerID,
                                                                      SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
                                                                      SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
            getRangeStreamQueries("GetRangeStreamQueries", cc), allQueries("QueryQueue", cc),
            finishedQueries("FinishedQueries", cc), lowPriorityQueries("LowPriorityQueries", cc),
            rowsQueried("RowsQueried", cc), bytesQueried("BytesQueried", cc), watchQueries("WatchQueries", cc),
            emptyQueries("EmptyQueries", cc), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc),
            bytesFetched("BytesFetched", cc), mutationBytes("MutationBytes", cc),
            sampledBytesCleared("SampledBytesCleared", cc), kvFetched("KVFetched", cc), mutations("Mutations", cc),
            setMutations("SetMutations", cc), clearRangeMutations("ClearRangeMutations", cc),
            atomicMutations("AtomicMutations", cc), updateBatches("UpdateBatches", cc),
            updateVersions("UpdateVersions", cc), loops("Loops", cc), fetchWaitingMS("FetchWaitingMS", cc),
            fetchWaitingCount("FetchWaitingCount", cc), fetchExecutingMS("FetchExecutingMS", cc),
            fetchExecutingCount("FetchExecutingCount", cc), readsRejected("ReadsRejected", cc),
            fetchedVersions("FetchedVersions", cc), fetchesFromLogs("FetchesFromLogs", cc),
            readLatencySample("ReadLatencyMetrics",
                              self->thisServerID,
                              SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
                              SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
            readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) {
            specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; });
            specialCounter(cc, "Version", [self]() { return self->version.get(); });

@@ -789,17 +792,13 @@ public:
            specialCounter(cc, "LocalRate", [self] { return int64_t(self->currentRate() * 100); });

            specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); });

            specialCounter(
                cc, "FetchKeysFetchActive", [self]() { return self->fetchKeysParallelismLock.activePermits(); });
            specialCounter(cc, "FetchKeysWaiting", [self]() { return self->fetchKeysParallelismLock.waiters(); });

            specialCounter(cc, "QueryQueueMax", [self]() { return self->getAndResetMaxQueryQueueSize(); });

            specialCounter(cc, "BytesStored", [self]() { return self->metrics.byteSample.getEstimate(allKeys); });
            specialCounter(cc, "ActiveWatches", [self]() { return self->numWatches; });
            specialCounter(cc, "WatchBytes", [self]() { return self->watchBytes; });

            specialCounter(cc, "KvstoreSizeTotal", [self]() { return std::get<0>(self->storage.getSize()); });
            specialCounter(cc, "KvstoreNodeTotal", [self]() { return std::get<1>(self->storage.getSize()); });
            specialCounter(cc, "KvstoreInlineKey", [self]() { return std::get<2>(self->storage.getSize()); });
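As an aside, the specialCounter calls above register gauges that are sampled only when the CounterCollection is logged. A hedged one-line sketch of the same pattern applied to the byte budget this commit introduces — the counter name here is hypothetical, not part of the commit:

            // Sampled per trace interval, so the gauge costs nothing between logs.
            specialCounter(cc, "FetchKeysBytesBudget", [self]() { return self->fetchKeysBytesBudget; });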
@@ -813,7 +812,8 @@ public:
      : db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0),
        rebootAfterDurableVersion(std::numeric_limits<Version>::max()), durableInProgress(Void()), versionLag(0),
        primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0),
        fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false),
        fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM),
        fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), shuttingDown(false),
        debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0),
        counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), tssInQuarantine(false),
        readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")), behind(false), versionBehind(false),

@@ -1268,12 +1268,13 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
    DEBUG_MUTATION("ShardGetValue",
                   version,
                   MutationRef(MutationRef::DebugKey, req.key, v.present() ? v.get() : LiteralStringRef("<null>")));
    DEBUG_MUTATION(
        "ShardGetPath",
        version,
        MutationRef(MutationRef::DebugKey,
                    req.key,
                    path == 0 ? LiteralStringRef("0") : path == 1 ? LiteralStringRef("1") : LiteralStringRef("2")));
    DEBUG_MUTATION("ShardGetPath",
                   version,
                   MutationRef(MutationRef::DebugKey,
                               req.key,
                               path == 0 ? LiteralStringRef("0")
                               : path == 1 ? LiteralStringRef("1")
                                           : LiteralStringRef("2")));

    /*
    StorageMetrics m;
@@ -2072,6 +2073,186 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
    return Void();
}

ACTOR Future<Void> getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRequest req)
// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large
// selector offset prevents all data from being read in one range read
{
    state Span span("SS:getKeyValuesStream"_loc, { req.spanContext });
    state int64_t resultSize = 0;

    req.reply.setByteLimit(SERVER_KNOBS->RANGESTREAM_LIMIT_BYTES);
    ++data->counters.getRangeStreamQueries;
    ++data->counters.allQueries;
    ++data->readQueueSizeMetric;
    data->maxQueryQueue = std::max<int>(
        data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());

    // Active load balancing runs at a very high priority (to obtain accurate queue lengths)
    // so we need to downgrade here
    if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) {
        wait(delay(0, TaskPriority::FetchKeys));
    } else {
        wait(delay(0, TaskPriority::DefaultEndpoint));
    }

    try {
        if (req.debugID.present())
            g_traceBatch.addEvent(
                "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesStream.Before");
        state Version version = wait(waitForVersion(data, req.version, span.context));

        state uint64_t changeCounter = data->shardChangeCounter;
        // try {
        state KeyRange shard = getShardKeyRange(data, req.begin);

        if (req.debugID.present())
            g_traceBatch.addEvent(
                "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesStream.AfterVersion");
        //.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end);
        //} catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin",
        // req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard",
        //"None").detail("In", "getKeyValues>getShardKeyRange"); throw e; }

        if (!selectorInRange(req.end, shard) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == shard.end)) {
            // TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin",
            // req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin",
            // shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkShardExtents");
            throw wrong_shard_server();
        }

        state int offset1;
        state int offset2;
        state Future<Key> fBegin = req.begin.isFirstGreaterOrEqual()
                                       ? Future<Key>(req.begin.getKey())
                                       : findKey(data, req.begin, version, shard, &offset1, span.context);
        state Future<Key> fEnd = req.end.isFirstGreaterOrEqual()
                                     ? Future<Key>(req.end.getKey())
                                     : findKey(data, req.end, version, shard, &offset2, span.context);
        state Key begin = wait(fBegin);
        state Key end = wait(fEnd);
        if (req.debugID.present())
            g_traceBatch.addEvent(
                "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesStream.AfterKeys");
        //.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey());

        // Offsets of zero indicate begin/end keys in this shard, which obviously means we can answer the query
        // An end offset of 1 is also OK because the end key is exclusive, so if the first key of the next shard is the
        // end the last actual key returned must be from this shard. A begin offset of 1 is also OK because then either
        // begin is past end or equal to end (so the result is definitely empty)
        if ((offset1 && offset1 != 1) || (offset2 && offset2 != 1)) {
            TEST(true); // wrong_shard_server due to offset in rangeStream
            // We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end,
            // and return a clipped range rather than an error (since that is what the NativeAPI.getRange will do anyway
            // via its "slow path"), but we would have to add some flags to the response to encode whether we went off
            // the beginning and the end, since it needs that information.
            //TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2);
            throw wrong_shard_server();
        }

        if (begin >= end) {
            if (req.debugID.present())
                g_traceBatch.addEvent(
                    "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesStream.Send");
            //.detail("Begin",begin).detail("End",end);

            GetKeyValuesStreamReply none;
            none.version = version;
            none.more = false;

            data->checkChangeCounter(changeCounter,
                                     KeyRangeRef(std::min<KeyRef>(req.begin.getKey(), req.end.getKey()),
                                                 std::max<KeyRef>(req.begin.getKey(), req.end.getKey())));
            req.reply.send(none);
            req.reply.sendError(end_of_stream());
        } else {
            loop {
                wait(req.reply.onReady());

                if (version < data->oldestVersion.get()) {
                    throw transaction_too_old();
                }

                state int byteLimit = CLIENT_KNOBS->REPLY_BYTE_LIMIT;
                GetKeyValuesReply _r =
                    wait(readRange(data, version, KeyRangeRef(begin, end), req.limit, &byteLimit, span.context));
                GetKeyValuesStreamReply r(_r);

                if (req.debugID.present())
                    g_traceBatch.addEvent("TransactionDebug",
                                          req.debugID.get().first(),
                                          "storageserver.getKeyValuesStream.AfterReadRange");
                //.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size());
                data->checkChangeCounter(
                    changeCounter,
                    KeyRangeRef(std::min<KeyRef>(begin, std::min<KeyRef>(req.begin.getKey(), req.end.getKey())),
                                std::max<KeyRef>(end, std::max<KeyRef>(req.begin.getKey(), req.end.getKey()))));
                if (EXPENSIVE_VALIDATION) {
                    for (int i = 0; i < r.data.size(); i++)
                        ASSERT(r.data[i].key >= begin && r.data[i].key < end);
                    ASSERT(r.data.size() <= std::abs(req.limit));
                }

                /*for( int i = 0; i < r.data.size(); i++ ) {
                    StorageMetrics m;
                    m.bytesPerKSecond = r.data[i].expectedSize();
                    m.iosPerKSecond = 1; //FIXME: this should be 1/r.data.size(), but we cannot do that because it is an int
                    data->metrics.notify(r.data[i].key, m);
                }*/

                // For performance concerns, the cost of a range read is billed to the start key and end key of the
                // range.
                int64_t totalByteSize = 0;
                for (int i = 0; i < r.data.size(); i++) {
                    totalByteSize += r.data[i].expectedSize();
                }
                if (totalByteSize > 0 && SERVER_KNOBS->READ_SAMPLING_ENABLED) {
                    int64_t bytesReadPerKSecond = std::max(totalByteSize, SERVER_KNOBS->EMPTY_READ_PENALTY) / 2;
                    data->metrics.notifyBytesReadPerKSecond(r.data[0].key, bytesReadPerKSecond);
                    data->metrics.notifyBytesReadPerKSecond(r.data[r.data.size() - 1].key, bytesReadPerKSecond);
                }

                req.reply.send(r);

                data->counters.rowsQueried += r.data.size();
                if (r.data.size() == 0) {
                    ++data->counters.emptyQueries;
                }
                if (!r.more) {
                    req.reply.sendError(end_of_stream());
                    break;
                }
                ASSERT(r.data.size());

                if (req.limit >= 0) {
                    begin = keyAfter(r.data.back().key);
                } else {
                    end = r.data.back().key;
                }

                if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) {
                    wait(delay(0, TaskPriority::FetchKeys));
                } else {
                    wait(delay(0, TaskPriority::DefaultEndpoint));
                }

                data->transactionTagCounter.addRequest(req.tags, resultSize);
            }
        }
    } catch (Error& e) {
        if (e.code() != error_code_operation_obsolete) {
            if (!canReplyWith(e))
                throw;
            req.reply.sendError(e);
        }
    }

    data->transactionTagCounter.addRequest(req.tags, resultSize);
    ++data->counters.finishedQueries;
    --data->readQueueSizeMetric;

    return Void();
}
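getKeyValuesStreamQ is the server half of Transaction::getRangeStream (which fetchKeys uses later in this diff). A hedged client-side sketch of draining it; the actor name and the byte-counting are illustrative only, not part of the commit:

ACTOR Future<Void> sumRangeBytes(Database cx, KeyRange keys, int64_t* total) {
    state Transaction tr(cx);
    state PromiseStream<RangeResult> results;
    state Future<Void> hold = tr.getRangeStream(results, keys, GetRangeLimits(), true);
    try {
        loop {
            RangeResult block = waitNext(results.getFuture()); // one block per server reply
            for (const auto& [k, v] : block)
                *total += k.size() + v.size();
        }
    } catch (Error& e) {
        if (e.code() != error_code_end_of_stream)
            throw; // end_of_stream is the normal terminator, not a failure
    }
    return Void();
}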
ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
    state Span span("SS:getKey"_loc, { req.spanContext });
    state int64_t resultSize = 0;

@@ -2504,72 +2685,6 @@ void coalesceShards(StorageServer* data, KeyRangeRef keys) {
    }
}
ACTOR Future<RangeResult> tryGetRange(Database cx,
                                      Version version,
                                      KeyRangeRef keys,
                                      GetRangeLimits limits,
                                      bool* isTooOld) {
    state Transaction tr(cx);
    state RangeResult output;
    state KeySelectorRef begin = firstGreaterOrEqual(keys.begin);
    state KeySelectorRef end = firstGreaterOrEqual(keys.end);

    if (*isTooOld)
        throw transaction_too_old();

    ASSERT(!cx->switchable);
    tr.setVersion(version);
    tr.info.taskID = TaskPriority::FetchKeys;
    limits.minRows = 0;

    try {
        loop {
            RangeResult rep = wait(tr.getRange(begin, end, limits, true));
            limits.decrement(rep);

            if (limits.isReached() || !rep.more) {
                if (output.size()) {
                    output.arena().dependsOn(rep.arena());
                    output.append(output.arena(), rep.begin(), rep.size());
                    if (limits.isReached() && rep.readThrough.present())
                        output.readThrough = rep.readThrough.get();
                } else {
                    output = rep;
                }

                output.more = limits.isReached();

                return output;
            } else if (rep.readThrough.present()) {
                output.arena().dependsOn(rep.arena());
                if (rep.size()) {
                    output.append(output.arena(), rep.begin(), rep.size());
                    ASSERT(rep.readThrough.get() > rep.end()[-1].key);
                } else {
                    ASSERT(rep.readThrough.get() > keys.begin);
                }
                begin = firstGreaterOrEqual(rep.readThrough.get());
            } else {
                output.arena().dependsOn(rep.arena());
                output.append(output.arena(), rep.begin(), rep.size());
                begin = firstGreaterThan(output.end()[-1].key);
            }
        }
    } catch (Error& e) {
        if (begin.getKey() != keys.begin &&
            (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version ||
             e.code() == error_code_process_behind)) {
            if (e.code() == error_code_transaction_too_old)
                *isTooOld = true;
            output.more = true;
            if (begin.isFirstGreaterOrEqual())
                output.readThrough = begin.getKey();
            return output;
        }
        throw;
    }
}

template <class T>
void addMutation(T& target, Version version, MutationRef const& mutation) {
    target.addMutation(version, mutation);

@@ -2667,13 +2782,46 @@ public:
    }
};

ACTOR Future<Void> tryGetRange(PromiseStream<RangeResult> results, Transaction* tr, KeyRange keys) {
    state KeySelectorRef begin = firstGreaterOrEqual(keys.begin);
    state KeySelectorRef end = firstGreaterOrEqual(keys.end);

    try {
        loop {
            GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, SERVER_KNOBS->FETCH_BLOCK_BYTES);
            limits.minRows = 0;
            state RangeResult rep = wait(tr->getRange(begin, end, limits, true));
            if (!rep.more) {
                rep.readThrough = keys.end;
            }
            results.send(rep);

            if (!rep.more) {
                results.sendError(end_of_stream());
                return Void();
            }

            if (rep.readThrough.present()) {
                begin = firstGreaterOrEqual(rep.readThrough.get());
            } else {
                begin = firstGreaterThan(rep.end()[-1].key);
            }
        }
    } catch (Error& e) {
        if (e.code() == error_code_actor_cancelled) {
            throw;
        }
        results.sendError(e);
        throw;
    }
}
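A hedged usage sketch for the PromiseStream-based tryGetRange just above (the actor name is hypothetical): blocks arrive on the stream until the producer sends end_of_stream, so the consumer loops on waitNext and treats that error as completion.

ACTOR Future<int64_t> countRows(Transaction* tr, KeyRange keys) {
    state PromiseStream<RangeResult> results;
    state Future<Void> reader = tryGetRange(results, tr, keys); // producer
    state int64_t rows = 0;
    try {
        loop {
            RangeResult block = waitNext(results.getFuture());
            rows += block.size();
        }
    } catch (Error& e) {
        if (e.code() != error_code_end_of_stream)
            throw;
    }
    return rows;
}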
ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
    state const UID fetchKeysID = deterministicRandom()->randomUniqueID();
    state TraceInterval interval("FetchKeys");
    state KeyRange keys = shard->keys;
    state Future<Void> warningLogger = logFetchKeysWarning(shard);
    state const double startTime = now();
    state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;
    state FetchKeysMetricReporter metricReporter(fetchKeysID,
                                                 startTime,
                                                 keys,

@@ -2714,8 +2862,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {

        TraceEvent(SevDebug, "FetchKeysVersionSatisfied", data->thisServerID).detail("FKID", interval.pairID);

        wait(data->fetchKeysParallelismLock.take(TaskPriority::DefaultYield, fetchBlockBytes));
        state FlowLock::Releaser holdingFKPL(data->fetchKeysParallelismLock, fetchBlockBytes);
        wait(data->fetchKeysParallelismLock.take(TaskPriority::DefaultYield));
        state FlowLock::Releaser holdingFKPL(data->fetchKeysParallelismLock);

        state double executeStart = now();
        ++data->counters.fetchWaitingCount;

@@ -2727,7 +2875,6 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
        wait(data->durableVersionLock.take());

        shard->phase = AddingShard::Fetching;
        state Version fetchVersion = data->version.get();

        data->durableVersionLock.release();

@@ -2747,107 +2894,78 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
        data->cx->invalidateCache(keys);

        loop {
            state Transaction tr(data->cx);
            state Version fetchVersion = data->version.get();
            while (!shard->updates.empty() && shard->updates[0].version <= fetchVersion)
                shard->updates.pop_front();
            tr.setVersion(fetchVersion);
            tr.info.taskID = TaskPriority::FetchKeys;
            state PromiseStream<RangeResult> results;
            state Future<Void> hold = SERVER_KNOBS->FETCH_USING_STREAMING
                                          ? tr.getRangeStream(results, keys, GetRangeLimits(), true)
                                          : tryGetRange(results, &tr, keys);
            state Key nfk = keys.begin;

            try {
                TEST(true); // Fetching keys for transferred shard
                loop {
                    TEST(true); // Fetching keys for transferred shard
                    while (data->fetchKeysBudgetUsed.get()) {
                        wait(data->fetchKeysBudgetUsed.onChange());
                    }
                    state RangeResult this_block = waitNext(results.getFuture());

                    state RangeResult this_block =
                        wait(tryGetRange(data->cx,
                                         fetchVersion,
                                         keys,
                                         GetRangeLimits(GetRangeLimits::ROW_LIMIT_UNLIMITED, fetchBlockBytes),
                                         &isTooOld));
                    state int expectedBlockSize =
                        (int)this_block.expectedSize() + (8 - (int)sizeof(KeyValueRef)) * this_block.size();

                    int expectedSize = (int)this_block.expectedSize() + (8 - (int)sizeof(KeyValueRef)) * this_block.size();
                    TraceEvent(SevDebug, "FetchKeysBlock", data->thisServerID)
                        .detail("FKID", interval.pairID)
                        .detail("BlockRows", this_block.size())
                        .detail("BlockBytes", expectedBlockSize)
                        .detail("KeyBegin", keys.begin)
                        .detail("KeyEnd", keys.end)
                        .detail("Last", this_block.size() ? this_block.end()[-1].key : std::string())
                        .detail("Version", fetchVersion)
                        .detail("More", this_block.more);
                    DEBUG_KEY_RANGE("fetchRange", fetchVersion, keys);
                    for (auto k = this_block.begin(); k != this_block.end(); ++k)
                        DEBUG_MUTATION("fetch", fetchVersion, MutationRef(MutationRef::SetValue, k->key, k->value));

                    TraceEvent(SevDebug, "FetchKeysBlock", data->thisServerID)
                        .detail("FKID", interval.pairID)
                        .detail("BlockRows", this_block.size())
                        .detail("BlockBytes", expectedSize)
                        .detail("KeyBegin", keys.begin)
                        .detail("KeyEnd", keys.end)
                        .detail("Last", this_block.size() ? this_block.end()[-1].key : std::string())
                        .detail("Version", fetchVersion)
                        .detail("More", this_block.more);
                    DEBUG_KEY_RANGE("fetchRange", fetchVersion, keys);
                    for (auto k = this_block.begin(); k != this_block.end(); ++k)
                        DEBUG_MUTATION("fetch", fetchVersion, MutationRef(MutationRef::SetValue, k->key, k->value));
                    metricReporter.addFetchedBytes(expectedBlockSize, this_block.size());

                    metricReporter.addFetchedBytes(expectedSize, this_block.size());
                    // Write this_block to storage
                    state KeyValueRef* kvItr = this_block.begin();
                    for (; kvItr != this_block.end(); ++kvItr) {
                        data->storage.writeKeyValue(*kvItr);
                        wait(yield());
                    }

                    if (fetchBlockBytes > expectedSize) {
                        holdingFKPL.release(fetchBlockBytes - expectedSize);
                    }
                    kvItr = this_block.begin();
                    for (; kvItr != this_block.end(); ++kvItr) {
                        data->byteSampleApplySet(*kvItr, invalidVersion);
                        wait(yield());
                    }

                    // Wait for permission to proceed
                    // wait( data->fetchKeysStorageWriteLock.take() );
                    // state FlowLock::Releaser holdingFKSWL( data->fetchKeysStorageWriteLock );
                    ASSERT(this_block.readThrough.present() || this_block.size());
                    nfk = this_block.readThrough.present() ? this_block.readThrough.get()
                                                           : keyAfter(this_block.end()[-1].key);
                    this_block = RangeResult();

                    // Write this_block directly to storage, bypassing update() which write to MVCC in memory.
                    state KeyValueRef* kvItr = this_block.begin();
                    for (; kvItr != this_block.end(); ++kvItr) {
                        data->storage.writeKeyValue(*kvItr);
                        wait(yield());
                    }

                    kvItr = this_block.begin();
                    for (; kvItr != this_block.end(); ++kvItr) {
                        data->byteSampleApplySet(*kvItr, invalidVersion);
                        wait(yield());
                    }

                    if (this_block.more) {
                        Key nfk = this_block.readThrough.present() ? this_block.readThrough.get()
                                                                   : keyAfter(this_block.end()[-1].key);
                        if (nfk != keys.end) {
                            std::deque<Standalone<VerUpdateRef>> updatesToSplit = std::move(shard->updates);

                            // This actor finishes committing the keys [keys.begin,nfk) that we already fetched.
                            // The remaining unfetched keys [nfk,keys.end) will become a separate AddingShard with its own
                            // fetchKeys.
                            shard->server->addShard(ShardInfo::addingSplitLeft(KeyRangeRef(keys.begin, nfk), shard));
                            shard->server->addShard(ShardInfo::newAdding(data, KeyRangeRef(nfk, keys.end)));
                            shard = data->shards.rangeContaining(keys.begin).value()->adding.get();
                            warningLogger = logFetchKeysWarning(shard);
                            AddingShard* otherShard = data->shards.rangeContaining(nfk).value()->adding.get();
                            keys = shard->keys;

                            // Split our prior updates. The ones that apply to our new, restricted key range will go back
                            // into shard->updates, and the ones delivered to the new shard will be discarded because it is
                            // in WaitPrevious phase (hasn't chosen a fetchVersion yet). What we are doing here is expensive
                            // and could get more expensive if we started having many more blocks per shard. May need
                            // optimization in the future.
                            std::deque<Standalone<VerUpdateRef>>::iterator u = updatesToSplit.begin();
                            for (; u != updatesToSplit.end(); ++u) {
                                splitMutations(data, data->shards, *u);
                            }

                            TEST(true); // fetchkeys has more
                            TEST(shard->updates.size()); // Shard has updates
                            ASSERT(otherShard->updates.empty());
                    data->fetchKeysBytesBudget -= expectedBlockSize;
                    if (data->fetchKeysBytesBudget <= 0) {
                        data->fetchKeysBudgetUsed.set(true);
                    }
                }

                this_block = RangeResult();

                if (BUGGIFY)
                    wait(delay(1));

                break;
            } catch (Error& e) {
                TraceEvent("FKBlockFail", data->thisServerID)
                    .error(e, true)
                    .suppressFor(1.0)
                    .detail("FKID", interval.pairID);
                if (e.code() == error_code_transaction_too_old) {
                    TEST(true); // A storage server has forgotten the history data we are fetching
                    Version lastFV = fetchVersion;
                    fetchVersion = data->version.get();
                    isTooOld = false;

                    // Throw away deferred updates from before fetchVersion, since we don't need them to use blocks
                    // fetched at that version
                    while (!shard->updates.empty() && shard->updates[0].version <= fetchVersion)
                        shard->updates.pop_front();
                if (e.code() != error_code_end_of_stream && e.code() != error_code_connection_failed &&
                    e.code() != error_code_transaction_too_old && e.code() != error_code_future_version &&
                    e.code() != error_code_process_behind) {
                    throw;
                }
                if (nfk == keys.begin) {
                    TraceEvent("FKBlockFail", data->thisServerID)
                        .error(e, true)
                        .suppressFor(1.0)
                        .detail("FKID", interval.pairID);

                    // FIXME: remove when we no longer support upgrades from 5.X
                    if (debug_getRangeRetries >= 100) {

@@ -2861,17 +2979,40 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
                    TraceEvent(SevWarn, "FetchPast", data->thisServerID)
                        .detail("TotalAttempts", debug_getRangeRetries)
                        .detail("FKID", interval.pairID)
                        .detail("V", lastFV)
                        .detail("N", fetchVersion)
                        .detail("E", data->version.get());
                    }
                } else if (e.code() == error_code_future_version || e.code() == error_code_process_behind) {
                    TEST(true); // fetchKeys got future_version or process_behind, so there must be a huge storage lag
                    // somewhere. Keep trying.
                } else {
                    throw;
                    wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
                    continue;
                }
                wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
                if (nfk < keys.end) {
                    std::deque<Standalone<VerUpdateRef>> updatesToSplit = std::move(shard->updates);

                    // This actor finishes committing the keys [keys.begin,nfk) that we already fetched.
                    // The remaining unfetched keys [nfk,keys.end) will become a separate AddingShard with its own
                    // fetchKeys.
                    shard->server->addShard(ShardInfo::addingSplitLeft(KeyRangeRef(keys.begin, nfk), shard));
                    shard->server->addShard(ShardInfo::newAdding(data, KeyRangeRef(nfk, keys.end)));
                    shard = data->shards.rangeContaining(keys.begin).value()->adding.get();
                    warningLogger = logFetchKeysWarning(shard);
                    AddingShard* otherShard = data->shards.rangeContaining(nfk).value()->adding.get();
                    keys = shard->keys;

                    // Split our prior updates. The ones that apply to our new, restricted key range will go back into
                    // shard->updates, and the ones delivered to the new shard will be discarded because it is in
                    // WaitPrevious phase (hasn't chosen a fetchVersion yet). What we are doing here is expensive and
                    // could get more expensive if we started having many more blocks per shard. May need optimization
                    // in the future.
                    std::deque<Standalone<VerUpdateRef>>::iterator u = updatesToSplit.begin();
                    for (; u != updatesToSplit.end(); ++u) {
                        splitMutations(data, data->shards, *u);
                    }

                    TEST(true); // fetchkeys has more
                    TEST(shard->updates.size()); // Shard has updates
                    ASSERT(otherShard->updates.empty());
                }
                break;
            }
        }
@@ -2891,8 +3032,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
        // being recovered. Instead we wait for the updateStorage loop to commit something (and consequently also what
        // we have written)

        wait(data->durableVersion.whenAtLeast(data->storageVersion() + 1));
        holdingFKPL.release();
        wait(data->durableVersion.whenAtLeast(data->storageVersion() + 1));

        TraceEvent(SevDebug, "FKAfterFinalCommit", data->thisServerID)
            .detail("FKID", interval.pairID)
@@ -3922,8 +4063,11 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
        data->ssDurableVersionUpdateLatencyHistogram->sampleSeconds(now() - beforeSSDurableVersionUpdate);

        //TraceEvent("StorageServerDurable", data->thisServerID).detail("Version", newOldestVersion);

        wait(durableDelay);
        data->fetchKeysBytesBudget = SERVER_KNOBS->STORAGE_FETCH_BYTES;
        data->fetchKeysBudgetUsed.set(false);
        if (!data->fetchKeysBudgetUsed.get()) {
            wait(durableDelay || data->fetchKeysBudgetUsed.onChange());
        }
    }
}
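The hunk above is one half of a producer/consumer handshake: fetchKeys (earlier in this diff) charges each fetched block against fetchKeysBytesBudget and sets fetchKeysBudgetUsed once the budget goes non-positive, while updateStorage refills the budget after making data durable, so fetching is paced by commit progress. A minimal standalone model of the pattern — all names hypothetical, plain threads instead of flow actors, not FDB code:

#include <condition_variable>
#include <mutex>

// Toy model of the fetch-budget handshake: the fetcher parks once the byte
// budget is spent; the committer refills it after each durable commit.
struct FetchBudget {
    std::mutex m;
    std::condition_variable cv;
    long long budget = 0;

    void charge(long long blockBytes) {
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return budget > 0; }); // blocked until the next refill
        budget -= blockBytes; // may go negative, like fetchKeysBytesBudget above
    }
    void refill(long long storageFetchBytes) {
        {
            std::lock_guard<std::mutex> lk(m);
            budget = storageFetchBytes;
        }
        cv.notify_all();
    }
};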
@@ -4651,6 +4795,17 @@ ACTOR Future<Void> serveGetKeyValuesRequests(StorageServer* self, FutureStream<G
    }
}

ACTOR Future<Void> serveGetKeyValuesStreamRequests(StorageServer* self,
                                                   FutureStream<GetKeyValuesStreamRequest> getKeyValuesStream) {
    loop {
        GetKeyValuesStreamRequest req = waitNext(getKeyValuesStream);
        // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade
        // before doing real work
        // FIXME: add readGuard again
        self->actors.add(getKeyValuesStreamQ(self, req));
    }
}

ACTOR Future<Void> serveGetKeyRequests(StorageServer* self, FutureStream<GetKeyRequest> getKey) {
    loop {
        GetKeyRequest req = waitNext(getKey);
@@ -4810,6 +4965,7 @@ ACTOR Future<Void> storageServerCore(StorageServer* self, StorageServerInterface
    self->actors.add(checkBehind(self));
    self->actors.add(serveGetValueRequests(self, ssi.getValue.getFuture()));
    self->actors.add(serveGetKeyValuesRequests(self, ssi.getKeyValues.getFuture()));
    self->actors.add(serveGetKeyValuesStreamRequests(self, ssi.getKeyValuesStream.getFuture()));
    self->actors.add(serveGetKeyRequests(self, ssi.getKey.getFuture()));
    self->actors.add(serveWatchValueRequests(self, ssi.watchValue.getFuture()));
    self->actors.add(traceRole(Role::STORAGE_SERVER, ssi.id()));
@@ -943,6 +943,7 @@ ACTOR Future<Void> storageServerRollbackRebooter(std::set<std::pair<UID, KeyValu
    DUMPTOKEN(recruited.getQueuingMetrics);
    DUMPTOKEN(recruited.getKeyValueStoreType);
    DUMPTOKEN(recruited.watchValue);
    DUMPTOKEN(recruited.getKeyValuesStream);

    prevStorageServer =
        storageServer(store, recruited, db, folder, Promise<Void>(), Reference<ClusterConnectionFile>(nullptr));

@@ -1305,6 +1306,7 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
    DUMPTOKEN(recruited.getQueuingMetrics);
    DUMPTOKEN(recruited.getKeyValueStoreType);
    DUMPTOKEN(recruited.watchValue);
    DUMPTOKEN(recruited.getKeyValuesStream);

    Promise<Void> recovery;
    Future<Void> f = storageServer(kv, recruited, dbInfo, folder, recovery, connFile);

@@ -1712,6 +1714,7 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
    DUMPTOKEN(recruited.getQueuingMetrics);
    DUMPTOKEN(recruited.getKeyValueStoreType);
    DUMPTOKEN(recruited.watchValue);
    DUMPTOKEN(recruited.getKeyValuesStream);
    // printf("Recruited as storageServer\n");

    std::string filename =
@@ -0,0 +1,128 @@
/*
 * GetRangeStream.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

struct GetRangeStream : TestWorkload {
    PerfIntCounter bytesRead;
    bool useGetRange;
    Key begin;
    Key end;
    bool printKVPairs;

    GetRangeStream(WorkloadContext const& wcx) : TestWorkload(wcx), bytesRead("BytesRead") {
        useGetRange = getOption(options, LiteralStringRef("useGetRange"), false);
        begin = getOption(options, LiteralStringRef("begin"), normalKeys.begin);
        end = getOption(options, LiteralStringRef("end"), normalKeys.end);
        printKVPairs = getOption(options, LiteralStringRef("printKVPairs"), false);
    }

    std::string description() const override { return "GetRangeStreamWorkload"; }

    Future<Void> setup(Database const& cx) override { return Void(); }

    Future<Void> start(Database const& cx) override {
        return clientId != 0 ? Void() : useGetRange ? fdbClientGetRange(cx, this) : fdbClientStream(cx, this);
    }

    Future<bool> check(Database const& cx) override { return true; }

    void getMetrics(vector<PerfMetric>& m) override { m.push_back(bytesRead.getMetric()); }

    ACTOR static Future<Void> logThroughput(GetRangeStream* self, Key* next) {
        loop {
            state int64_t last = self->bytesRead.getValue();
            state double before = g_network->now();
            wait(delay(1));
            state double after = g_network->now();
            if (after > before) {
                printf("throughput: %g bytes/s, next: %s\n",
                       (self->bytesRead.getValue() - last) / (after - before),
                       printable(*next).c_str());
            }
        }
    }

    ACTOR static Future<Void> fdbClientGetRange(Database db, GetRangeStream* self) {
        state Transaction tx(db);
        state Key next = self->begin;
        state Future<Void> logFuture = logThroughput(self, &next);
        loop {
            try {
                Standalone<RangeResultRef> range = wait(
                    tx.getRange(KeySelector(firstGreaterOrEqual(next), next.arena()),
                                KeySelector(firstGreaterOrEqual(self->end)),
                                GetRangeLimits(GetRangeLimits::ROW_LIMIT_UNLIMITED, CLIENT_KNOBS->REPLY_BYTE_LIMIT)));
                for (const auto& [k, v] : range) {
                    if (self->printKVPairs) {
                        printf("%s -> %s\n", printable(k).c_str(), printable(v).c_str());
                    }
                    self->bytesRead += k.size() + v.size();
                }
                if (!range.more) {
                    break;
                }
                next = keyAfter(range.back().key);
            } catch (Error& e) {
                wait(tx.onError(e));
            }
        }
        return Void();
    }

    ACTOR static Future<Void> fdbClientStream(Database db, GetRangeStream* self) {
        state Transaction tx(db);
        state Key next = self->begin;
        state Future<Void> logFuture = logThroughput(self, &next);
        loop {
            state PromiseStream<Standalone<RangeResultRef>> results;
            try {
                state Future<Void> stream = tx.getRangeStream(results,
                                                              KeySelector(firstGreaterOrEqual(next), next.arena()),
                                                              KeySelector(firstGreaterOrEqual(self->end)),
                                                              GetRangeLimits());
                loop {
                    Standalone<RangeResultRef> range = waitNext(results.getFuture());
                    for (const auto& [k, v] : range) {
                        if (self->printKVPairs) {
                            printf("%s -> %s\n", printable(k).c_str(), printable(v).c_str());
                        }
                        self->bytesRead += k.size() + v.size();
                    }
                    if (range.size()) {
                        next = keyAfter(range.back().key);
                    }
                }
            } catch (Error& e) {
                if (e.code() == error_code_end_of_stream) {
                    break;
                }
                wait(tx.onError(e));
            }
        }
        return Void();
    }
};

WorkloadFactory<GetRangeStream> GetRangeStreamWorkloadFactory("GetRangeStream");
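The factory line registers the workload under the name "GetRangeStream", so it can be driven from a standard test spec. A hypothetical spec exercising the streaming path (the title and option values are illustrative; the option names come from the getOption calls in the constructor):

    testTitle=GetRangeStreamBandwidth
        testName=GetRangeStream
        useGetRange=false
        printKVPairs=false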
@@ -28,6 +28,7 @@ void forceLinkFlowTests();
void forceLinkVersionedMapTests();
void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
void forceLinkParallelStreamTests();
void forceLinkSimExternalConnectionTests();

struct UnitTestWorkload : TestWorkload {
@@ -61,6 +62,7 @@ struct UnitTestWorkload : TestWorkload {
        forceLinkVersionedMapTests();
        forceLinkMemcpyTests();
        forceLinkMemcpyPerfTests();
        forceLinkParallelStreamTests();
        forceLinkSimExternalConnectionTests();
    }
flow/flow.h
@@ -23,7 +23,7 @@
#pragma once

#pragma warning(disable : 4244 4267) // SOMEDAY: Carefully check for integer overflow issues (e.g. size_t to int
                                     // conversions like this suppresses)
#pragma warning(disable : 4345)
#pragma warning(error : 4239)
@@ -580,88 +580,6 @@ public:
    void fire(T const&) override { ASSERT(false); }
};

template <class T>
struct NotifiedQueue : private SingleCallback<T>, FastAllocated<NotifiedQueue<T>> {
    int promises; // one for each promise (and one for an active actor if this is an actor)
    int futures; // one for each future and one more if there are any callbacks

    // Invariant: SingleCallback<T>::next==this || (queue.empty() && !error.isValid())
    std::queue<T, Deque<T>> queue;
    Error error;

    NotifiedQueue(int futures, int promises) : futures(futures), promises(promises) { SingleCallback<T>::next = this; }

    bool isReady() const { return !queue.empty() || error.isValid(); }
    bool isError() const { return queue.empty() && error.isValid(); } // the *next* thing queued is an error
    uint32_t size() const { return queue.size(); }

    T pop() {
        if (queue.empty()) {
            if (error.isValid())
                throw error;
            throw internal_error();
        }
        auto copy = std::move(queue.front());
        queue.pop();
        return copy;
    }

    template <class U>
    void send(U&& value) {
        if (error.isValid())
            return;

        if (SingleCallback<T>::next != this) {
            SingleCallback<T>::next->fire(std::forward<U>(value));
        } else {
            queue.emplace(std::forward<U>(value));
        }
    }

    void sendError(Error err) {
        if (error.isValid())
            return;

        this->error = err;
        if (SingleCallback<T>::next != this)
            SingleCallback<T>::next->error(err);
    }

    void addPromiseRef() { promises++; }
    void addFutureRef() { futures++; }

    void delPromiseRef() {
        if (!--promises) {
            if (futures) {
                sendError(broken_promise());
            } else
                destroy();
        }
    }
    void delFutureRef() {
        if (!--futures) {
            if (promises)
                cancel();
            else
                destroy();
        }
    }

    int getFutureReferenceCount() const { return futures; }
    int getPromiseReferenceCount() const { return promises; }

    virtual void destroy() { delete this; }
    virtual void cancel() {}

    void addCallbackAndDelFutureRef(SingleCallback<T>* cb) {
        ASSERT(SingleCallback<T>::next == this);
        cb->insert(this);
    }
    void unwait() override { delFutureRef(); }
    void fire(T const&) override { ASSERT(false); }
    void fire(T&&) override { ASSERT(false); }
};

template <class T>
class Promise;
@@ -838,6 +756,113 @@ private:
    SAV<T>* sav;
};

template <class T>
struct NotifiedQueue : private SingleCallback<T>, FastAllocated<NotifiedQueue<T>> {
    int promises; // one for each promise (and one for an active actor if this is an actor)
    int futures; // one for each future and one more if there are any callbacks

    // Invariant: SingleCallback<T>::next==this || (queue.empty() && !error.isValid())
    std::queue<T, Deque<T>> queue;
    Promise<Void> onEmpty;
    Error error;

    NotifiedQueue(int futures, int promises) : futures(futures), promises(promises), onEmpty(nullptr) {
        SingleCallback<T>::next = this;
    }

    bool isReady() const { return !queue.empty() || error.isValid(); }
    bool isError() const { return queue.empty() && error.isValid(); } // the *next* thing queued is an error
    uint32_t size() const { return queue.size(); }

    virtual T pop() {
        if (queue.empty()) {
            if (error.isValid())
                throw error;
            throw internal_error();
        }
        auto copy = std::move(queue.front());
        queue.pop();
        if (onEmpty.isValid() && queue.empty()) {
            Promise<Void> hold = onEmpty;
            onEmpty = Promise<Void>(nullptr);
            hold.send(Void());
        }
        return copy;
    }

    template <class U>
    void send(U&& value) {
        if (error.isValid())
            return;

        if (SingleCallback<T>::next != this) {
            SingleCallback<T>::next->fire(std::forward<U>(value));
        } else {
            queue.emplace(std::forward<U>(value));
        }
    }

    void sendError(Error err) {
        if (error.isValid())
            return;

        this->error = err;
        if (shouldFireImmediately()) {
            SingleCallback<T>::next->error(err);
        }
    }

    void addPromiseRef() { promises++; }
    void addFutureRef() { futures++; }

    void delPromiseRef() {
        if (!--promises) {
            if (futures) {
                sendError(broken_promise());
            } else
                destroy();
        }
    }
    void delFutureRef() {
        if (!--futures) {
            if (promises)
                cancel();
            else
                destroy();
        }
    }

    int getFutureReferenceCount() const { return futures; }
    int getPromiseReferenceCount() const { return promises; }

    virtual void destroy() { delete this; }
    virtual void cancel() {}

    void addCallbackAndDelFutureRef(SingleCallback<T>* cb) {
        ASSERT(SingleCallback<T>::next == this);
        cb->insert(this);
    }
    virtual void unwait() override { delFutureRef(); }
    virtual void fire(T const&) override { ASSERT(false); }
    virtual void fire(T&&) override { ASSERT(false); }

protected:
    T popImpl() {
        if (queue.empty()) {
            if (error.isValid())
                throw error;
            throw internal_error();
        }
        auto copy = std::move(queue.front());
        queue.pop();
        return copy;
    }

    bool hasError() { return error.isValid(); }

    bool shouldFireImmediately() { return SingleCallback<T>::next != this; }
};

template <class T>
class FutureStream {
public:
@@ -893,6 +918,11 @@ decltype(std::declval<Request>().reply) const& getReplyPromise(Request const& r)
    return r.reply;
}

template <class Request>
auto const& getReplyPromiseStream(Request const& r) {
    return r.reply;
}

// Neither of these implementations of REPLY_TYPE() works on both MSVC and g++, so...
#ifdef __GNUG__
#define REPLY_TYPE(RequestType) decltype(getReplyPromise(std::declval<RequestType>()).getFuture().getValue())
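getReplyPromiseStream is the streaming analogue of getReplyPromise: it returns a request's reply member, which for streaming requests is a ReplyPromiseStream rather than a ReplyPromise. A hypothetical request type for illustration (ExampleReply and ExampleStreamRequest are not part of this commit):

    struct ExampleReply {
        int value;
    };
    struct ExampleStreamRequest {
        ReplyPromiseStream<ExampleReply> reply; // returned by getReplyPromiseStream(req)
    };
    // REPLYSTREAM_TYPE(ExampleStreamRequest), defined in the next hunk, names ExampleReply.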
@@ -981,10 +1011,39 @@ public:
    bool operator==(const PromiseStream<T>& rhs) const { return queue == rhs.queue; }
    bool isEmpty() const { return !queue->isReady(); }

    Future<Void> onEmpty() {
        if (isEmpty()) {
            return Void();
        }
        if (!queue->onEmpty.isValid()) {
            queue->onEmpty = Promise<Void>();
        }
        return queue->onEmpty.getFuture();
    }

private:
    NotifiedQueue<T>* queue;
};

// Neither of these implementations of REPLY_TYPE() works on both MSVC and g++, so...
#ifdef __GNUG__
#define REPLYSTREAM_TYPE(RequestType) decltype(getReplyPromiseStream(std::declval<RequestType>()).getFuture().pop())
#else
template <class T>
struct ReplyStreamType {
    // Doing this calculation directly in the return value declaration for PromiseStream<T>::getReply()
    // breaks IntelliSense in VS2010; this is a workaround.
    typedef decltype(std::declval<T>().reply.getFuture().pop()) Type;
};
template <class T>
class ReplyPromiseStream;
template <class T>
struct ReplyStreamType<ReplyPromiseStream<T>> {
    typedef T Type;
};
#define REPLYSTREAM_TYPE(RequestType) typename ReplyStreamType<RequestType>::Type
#endif

// extern int actorCount;

template <class T>
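onEmpty() gives producers a way to wait for the consumer to catch up: the returned future is already set if the stream is empty, and otherwise becomes ready when pop() removes the last queued element (see the NotifiedQueue::pop() change above). A minimal backpressure sketch, with illustrative names:

    ACTOR Future<Void> boundedProducer(PromiseStream<int> out) {
        state int i = 0;
        loop {
            out.send(i++);
            // Block until the consumer has drained everything sent so far.
            wait(out.onEmpty());
        }
    }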
@@ -1305,6 +1305,17 @@ private:
    Promise<Void> lastPromise;
};

ACTOR template <class T, class V>
Future<T> forwardErrors(Future<T> f, PromiseStream<V> output) {
    try {
        T val = wait(f);
        return val;
    } catch (Error& e) {
        output.sendError(e);
        throw;
    }
}

struct FlowLock : NonCopyable, public ReferenceCounted<FlowLock> {
    // FlowLock implements a nonblocking critical section: there can be only a limited number of clients executing code
    // between wait(take()) and release(). Not thread safe. take() returns only when the number of holders of the lock
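forwardErrors is a small adaptor: it waits on a future and, if that future fails, mirrors the error into a PromiseStream so a consumer blocked in waitNext() observes the failure rather than waiting forever; the error is then rethrown to the caller. A sketch of the intended wiring, where producer is a hypothetical actor that fills the stream:

    state PromiseStream<int> results;
    // If producer(results) throws, consumers of results.getFuture()
    // receive the same error via sendError.
    state Future<Void> work = forwardErrors(producer(results), results);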
@@ -369,6 +369,8 @@ public:
    }
    template <class T>
    void serializeBinaryItem(const T& t) {
        static_assert(is_binary_serializable<T>::value,
                      "Object must be binary serializable, see BINARY_SERIALIZABLE macro");
        *(T*)writeBytes(sizeof(T)) = t;
    }
    void* getData() { return data; }
@@ -543,6 +545,8 @@ public:
    }
    template <class T>
    void serializeBinaryItem(const T& t) {
        static_assert(is_binary_serializable<T>::value,
                      "Object must be binary serializable, see BINARY_SERIALIZABLE macro");
        writeBytes(&t, sizeof(T));
    }
@@ -577,6 +581,8 @@ public:

    template <class T>
    void serializeBinaryItem(T& t) {
        static_assert(is_binary_serializable<T>::value,
                      "Object must be binary serializable, see BINARY_SERIALIZABLE macro");
        t = *(T*)(static_cast<Impl*>(this)->readBytes(sizeof(T)));
    }
@@ -808,6 +814,8 @@ struct PacketWriter {
    void serializeBytes(StringRef bytes) { serializeBytes(bytes.begin(), bytes.size()); }
    template <class T>
    void serializeBinaryItem(const T& t) {
        static_assert(is_binary_serializable<T>::value,
                      "Object must be binary serializable, see BINARY_SERIALIZABLE macro");
        if (sizeof(T) <= buffer->bytes_unwritten()) {
            *(T*)(buffer->data() + buffer->bytes_written) = t;
            buffer->bytes_written += sizeof(T);
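Each of these static_asserts gates byte-for-byte serialization on the is_binary_serializable<T> trait, which a plain-old-data type opts into via the BINARY_SERIALIZABLE macro named in the message. A minimal sketch with an illustrative type:

    struct ExamplePod {
        int32_t a;
        double b;
    }; // fixed layout, no pointers: safe to copy byte-for-byte
    BINARY_SERIALIZABLE(ExamplePod); // marks is_binary_serializable<ExamplePod> true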