Merge branch 'master' into transaction-tagging

# Conflicts:
#	fdbclient/DatabaseContext.h
This commit is contained in:
A.J. Beamon 2020-05-09 07:50:29 -07:00
commit 02307ba7b6
86 changed files with 2213 additions and 1028 deletions

View File

@ -449,7 +449,7 @@ FDBFuture* fdb_transaction_get_range_impl(
/* _ITERATOR mode maps to one of the known streaming modes
depending on iteration */
static const int mode_bytes_array[] = {CLIENT_KNOBS->BYTE_LIMIT_UNLIMITED, 256, 1000, 4096, 80000};
const int mode_bytes_array[] = { CLIENT_KNOBS->BYTE_LIMIT_UNLIMITED, 256, 1000, 4096, 80000 };
/* The progression used for FDB_STREAMING_MODE_ITERATOR.
Goes from small -> medium -> large. Then 1.5 * previous until serial. */

View File

@ -88,13 +88,20 @@ func (o NetworkOptions) SetTraceFormat(param string) error {
return o.setOpt(34, []byte(param))
}
// Select clock source for trace files. now (default) or realtime are supported.
// Select clock source for trace files. now (the default) or realtime are supported.
//
// Parameter: Trace clock source
func (o NetworkOptions) SetTraceClockSource(param string) error {
return o.setOpt(35, []byte(param))
}
// Once provided, this string replaces the port/PID portion of the trace log
// file names.
//
// Parameter: The identifier that will be part of all trace file names
func (o NetworkOptions) SetTraceFileIdentifier(param string) error {
	identifier := []byte(param)
	return o.setOpt(36, identifier)
}
// Set internal tuning or debugging knobs
//
// Parameter: knob_name=knob_value
@ -223,11 +230,16 @@ func (o NetworkOptions) SetDisableClientStatisticsLogging() error {
return o.setOpt(70, nil)
}
// Enables debugging feature to perform slow task profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production.
// Deprecated
func (o NetworkOptions) SetEnableSlowTaskProfiling() error {
return o.setOpt(71, nil)
}
// Enables debugging feature to perform run loop profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production.
//
// NOTE(review): this uses option code 71, the same code as the deprecated
// SetEnableSlowTaskProfiling above — presumably a rename of that option;
// confirm against the option definitions (fdb.options).
func (o NetworkOptions) SetEnableRunLoopProfiling() error {
return o.setOpt(71, nil)
}
// Enable client buggify - will make requests randomly fail (intended for client testing)
func (o NetworkOptions) SetClientBuggifyEnable() error {
return o.setOpt(80, nil)
@ -441,6 +453,11 @@ func (o TransactionOptions) SetTransactionLoggingMaxFieldLength(param int64) err
return o.setOpt(405, int64ToBytes(param))
}
// Sets an identifier for server tracing of this transaction. When committed,
// this identifier triggers logging when each part of the transaction authority
// encounters it, which is helpful in diagnosing slowness in misbehaving
// clusters. The identifier is randomly generated. When there is also a
// debug_transaction_identifier, both IDs are logged together.
func (o TransactionOptions) SetServerRequestTracing() error {
return o.setOpt(406, nil)
}
// Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option.
//
// Parameter: value in milliseconds of timeout
@ -499,6 +516,11 @@ func (o TransactionOptions) SetUseProvisionalProxies() error {
return o.setOpt(711, nil)
}
// Enables retrieval of the keys that caused this transaction to conflict with
// other transactions.
func (o TransactionOptions) SetReportConflictingKeys() error {
	const reportConflictingKeysCode = 712
	return o.setOpt(reportConflictingKeysCode, nil)
}
type StreamingMode int
const (
@ -636,15 +658,15 @@ type ErrorPredicate int
const (
// Returns ``true`` if the error indicates the operations in the
// transactions should be retried because of transient error.
// Returns ``true`` if the error indicates the operations in the transactions
// should be retried because of transient error.
ErrorPredicateRetryable ErrorPredicate = 50000
// Returns ``true`` if the error indicates the transaction may have
// succeeded, though not in a way the system can verify.
// Returns ``true`` if the error indicates the transaction may have succeeded,
// though not in a way the system can verify.
ErrorPredicateMaybeCommitted ErrorPredicate = 50001
// Returns ``true`` if the error indicates the transaction has not
// committed, though in a way that can be retried.
// Returns ``true`` if the error indicates the transaction has not committed,
// though in a way that can be retried.
ErrorPredicateRetryableNotCommitted ErrorPredicate = 50002
)

View File

@ -8,6 +8,7 @@ env_set(ALLOC_INSTRUMENTATION OFF BOOL "Instrument alloc")
env_set(WITH_UNDODB OFF BOOL "Use rr or undodb")
env_set(USE_ASAN OFF BOOL "Compile with address sanitizer")
env_set(USE_UBSAN OFF BOOL "Compile with undefined behavior sanitizer")
env_set(USE_TSAN OFF BOOL "Compile with thread sanitizer")
env_set(FDB_RELEASE OFF BOOL "This is a building of a final release")
env_set(USE_CCACHE OFF BOOL "Use ccache for compilation if available")
env_set(RELATIVE_DEBUG_PATHS OFF BOOL "Use relative file paths in debug info")
@ -81,6 +82,7 @@ include(CheckFunctionExists)
set(CMAKE_REQUIRED_INCLUDES stdlib.h malloc.h)
set(CMAKE_REQUIRED_LIBRARIES c)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_C_STANDARD 11)
if(WIN32)
# see: https://docs.microsoft.com/en-us/windows/desktop/WinProg/using-the-windows-headers
@ -164,6 +166,15 @@ else()
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined ${CMAKE_THREAD_LIBS_INIT}")
endif()
if(USE_TSAN)
add_compile_options(
-fsanitize=thread
-DUSE_SANITIZER)
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -fsanitize=thread")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=thread")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread ${CMAKE_THREAD_LIBS_INIT}")
endif()
if(PORTABLE_BINARY)
message(STATUS "Create a more portable binary")
set(CMAKE_MODULE_LINKER_FLAGS "-static-libstdc++ -static-libgcc ${CMAKE_MODULE_LINKER_FLAGS}")

View File

@ -73,6 +73,7 @@ set(FDBCLIENT_SRCS
Tuple.h
VersionedMap.actor.h
VersionedMap.h
VersionedMap.cpp
WriteMap.h
json_spirit/json_spirit_error_position.h
json_spirit/json_spirit_reader_template.h

View File

@ -309,6 +309,9 @@ public:
std::shared_ptr<SpecialKeySpace> specialKeySpace;
std::shared_ptr<ConflictingKeysImpl> cKImpl;
std::shared_ptr<ReadConflictRangeImpl> rCRImpl;
std::shared_ptr<WriteConflictRangeImpl> wCRImpl;
static bool debugUseTags;
static const std::vector<std::string> debugTransactionTagChoices;
};

View File

@ -268,6 +268,10 @@ struct KeyRangeRef {
return KeyRangeRef( begin.withPrefix(prefix), end.withPrefix(prefix) );
}
KeyRangeRef withPrefix(const StringRef& prefix, Arena& arena) const {
return KeyRangeRef(begin.withPrefix(prefix, arena), end.withPrefix(prefix, arena));
}
KeyRangeRef removePrefix( const StringRef& prefix ) const {
return KeyRangeRef( begin.removePrefix(prefix), end.removePrefix(prefix) );
}

View File

@ -3691,8 +3691,7 @@ public:
auto range = backupRanges[restoreIndex];
Standalone<StringRef> restoreTag(backupTag.toString() + "_" + std::to_string(restoreIndex));
// Register the restore request in DB, which will be picked up by the restore worker leader
struct RestoreRequest restoreRequest(restoreIndex, restoreTag, bcUrl, true, targetVersion, true,
range, Key(), Key(), lockDB,
struct RestoreRequest restoreRequest(restoreIndex, restoreTag, bcUrl, targetVersion, range,
deterministicRandom()->randomUniqueID());
tr->set(restoreRequestKeyFor(restoreRequest.index), restoreRequestValue(restoreRequest));
}

View File

@ -40,8 +40,9 @@ struct MasterProxyInterface {
enum { LocationAwareLoadBalance = 1 };
enum { AlwaysFresh = 1 };
LocalityData locality;
Optional<Key> processId;
bool provisional;
Endpoint base;
RequestStream< struct CommitTransactionRequest > commit;
RequestStream< struct GetReadVersionRequest > getConsistentReadVersion; // Returns a version which (1) is committed, and (2) is >= the latest version reported committed (by a commit response) when this request was sent
// (at some point between when this request is sent and when its response is received, the latest version reported committed)
@ -64,17 +65,34 @@ struct MasterProxyInterface {
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, locality, provisional, commit, getConsistentReadVersion, getKeyServersLocations,
waitFailure, getStorageServerRejoinInfo, getRawCommittedVersion,
txnState, getHealthMetrics, proxySnapReq, exclusionSafetyCheckReq);
serializer(ar, processId, provisional, base);
if( Archive::isDeserializing ) {
commit = RequestStream< struct CommitTransactionRequest >( base.getAdjustedEndpoint(0) );
getConsistentReadVersion = RequestStream< struct GetReadVersionRequest >( base.getAdjustedEndpoint(1) );
getKeyServersLocations = RequestStream< struct GetKeyServerLocationsRequest >( base.getAdjustedEndpoint(2) );
getStorageServerRejoinInfo = RequestStream< struct GetStorageServerRejoinInfoRequest >( base.getAdjustedEndpoint(3) );
waitFailure = RequestStream<ReplyPromise<Void>>( base.getAdjustedEndpoint(4) );
getRawCommittedVersion = RequestStream< struct GetRawCommittedVersionRequest >( base.getAdjustedEndpoint(5) );
txnState = RequestStream< struct TxnStateRequest >( base.getAdjustedEndpoint(6) );
getHealthMetrics = RequestStream< struct GetHealthMetricsRequest >( base.getAdjustedEndpoint(7) );
proxySnapReq = RequestStream< struct ProxySnapRequest >( base.getAdjustedEndpoint(8) );
exclusionSafetyCheckReq = RequestStream< struct ExclusionSafetyCheckRequest >( base.getAdjustedEndpoint(9) );
}
}
void initEndpoints() {
getConsistentReadVersion.getEndpoint(TaskPriority::ReadSocket);
getRawCommittedVersion.getEndpoint(TaskPriority::ProxyGetRawCommittedVersion);
commit.getEndpoint(TaskPriority::ReadSocket);
getStorageServerRejoinInfo.getEndpoint(TaskPriority::ProxyStorageRejoin);
getKeyServersLocations.getEndpoint(TaskPriority::ReadSocket); //priority lowered to TaskPriority::DefaultEndpoint on the proxy
std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
streams.push_back(commit.getReceiver(TaskPriority::ReadSocket));
streams.push_back(getConsistentReadVersion.getReceiver(TaskPriority::ReadSocket));
streams.push_back(getKeyServersLocations.getReceiver(TaskPriority::ReadSocket)); //priority lowered to TaskPriority::DefaultEndpoint on the proxy
streams.push_back(getStorageServerRejoinInfo.getReceiver(TaskPriority::ProxyStorageRejoin));
streams.push_back(waitFailure.getReceiver());
streams.push_back(getRawCommittedVersion.getReceiver(TaskPriority::ProxyGetRawCommittedVersion));
streams.push_back(txnState.getReceiver());
streams.push_back(getHealthMetrics.getReceiver());
streams.push_back(proxySnapReq.getReceiver());
streams.push_back(exclusionSafetyCheckReq.getReceiver());
base = FlowTransport::transport().addEndpoints(streams);
}
};

View File

@ -530,7 +530,9 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal),
specialKeySpace(std::make_shared<SpecialKeySpace>(normalKeys.begin, specialKeys.end)),
cKImpl(std::make_shared<ConflictingKeysImpl>(conflictingKeysRange)) {
cKImpl(std::make_shared<ConflictingKeysImpl>(conflictingKeysRange)),
rCRImpl(std::make_shared<ReadConflictRangeImpl>(readConflictRangeKeysRange)),
wCRImpl(std::make_shared<WriteConflictRangeImpl>(writeConflictRangeKeysRange)) {
dbId = deterministicRandom()->randomUniqueID();
connected = clientInfo->get().proxies.size() ? Void() : clientInfo->onChange();
@ -551,6 +553,8 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
clientStatusUpdater.actor = clientStatusUpdateActor(this);
throttleExpirer = recurring([this](){ expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL);
specialKeySpace->registerKeyRange(conflictingKeysRange, cKImpl.get());
specialKeySpace->registerKeyRange(readConflictRangeKeysRange, rCRImpl.get());
specialKeySpace->registerKeyRange(writeConflictRangeKeysRange, wCRImpl.get());
}
DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc),
@ -2489,7 +2493,7 @@ void Transaction::atomicOp(const KeyRef& key, const ValueRef& operand, MutationR
t.mutations.push_back( req.arena, MutationRef( operationType, r.begin, v ) );
if( addConflictRange )
if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
t.write_conflict_ranges.push_back( req.arena, r );
TEST(true); //NativeAPI atomic operation
@ -3116,7 +3120,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
case FDBTransactionOptions::DEBUG_TRANSACTION_IDENTIFIER:
validateOptionValue(value, true);
if (value.get().size() > 100) {
if (value.get().size() > 100 || value.get().size() == 0) {
throw invalid_option_value();
}
@ -3143,7 +3147,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
case FDBTransactionOptions::LOG_TRANSACTION:
validateOptionValue(value, false);
if (trLogInfo) {
if (trLogInfo && !trLogInfo->identifier.empty()) {
trLogInfo->logTo(TransactionLogInfo::TRACE_LOG);
}
else {

View File

@ -306,6 +306,15 @@ public:
TransactionOptions options;
double startTime;
Reference<TransactionLogInfo> trLogInfo;
const vector<Future<std::pair<Key, Key>>>& getExtraReadConflictRanges() const { return extraConflictRanges; }
Standalone<VectorRef<KeyRangeRef>> readConflictRanges() const {
return Standalone<VectorRef<KeyRangeRef>>(tr.transaction.read_conflict_ranges, tr.arena);
}
Standalone<VectorRef<KeyRangeRef>> writeConflictRanges() const {
return Standalone<VectorRef<KeyRangeRef>>(tr.transaction.write_conflict_ranges, tr.arena);
}
private:
Future<Version> getReadVersion(uint32_t flags);
Database cx;

View File

@ -72,7 +72,7 @@ RYWIterator& RYWIterator::operator++() {
if (end_key_cmp <= 0) ++cache;
if (end_key_cmp >= 0) ++writes;
begin_key_cmp = -end_key_cmp;
end_key_cmp = cache.endKey().cmp(writes.endKey());
end_key_cmp = cache.endKey().compare(writes.endKey());
return *this;
}
@ -80,7 +80,7 @@ RYWIterator& RYWIterator::operator--() {
if (begin_key_cmp >= 0) --cache;
if (begin_key_cmp <= 0) --writes;
end_key_cmp = -begin_key_cmp;
begin_key_cmp = cache.beginKey().cmp(writes.beginKey());
begin_key_cmp = cache.beginKey().compare(writes.beginKey());
return *this;
}
@ -117,8 +117,8 @@ void RYWIterator::dbg() {
}
void RYWIterator::updateCmp() {
begin_key_cmp = cache.beginKey().cmp(writes.beginKey());
end_key_cmp = cache.endKey().cmp(writes.endKey());
begin_key_cmp = cache.beginKey().compare(writes.beginKey());
end_key_cmp = cache.endKey().compare(writes.endKey());
}
void testESR() {
@ -157,13 +157,13 @@ void testESR() {
printf("Error: '%s' cmp '%s' = %d\n", printable(ssrs[i]).c_str(), printable(ssrs[j]).c_str(), c2);
return;
}
/*
int c = ssrs[i] < ssrs[j] ? -1 : ssrs[i] == ssrs[j] ? 0 : 1;
int c2 = srs[i].cmp(srs[j]);
int c2 = srs[i].compare(srs[j]);
if ( c != (0<c2)-(c2<0) ) {
printf("Error: '%s' cmp '%s' = %d\n", printable(ssrs[i]).c_str(), printable(ssrs[j]).c_str(), c2);
return;
printf("Error: '%s' cmp '%s' = %d\n", printable(ssrs[i]).c_str(), printable(ssrs[j]).c_str(), c2);
return;
}*/
/*
@ -413,8 +413,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
it.skip(allKeys.begin);
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -423,8 +423,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -434,8 +434,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -444,8 +444,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -455,8 +455,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("\xff\xff")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -486,8 +486,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
it.skip(allKeys.begin);
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -496,8 +496,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -507,8 +507,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp123")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp123")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());
@ -517,8 +517,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp123")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp123")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(it.is_conflict_range());
ASSERT(it.is_operation());
@ -528,8 +528,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") {
++it;
ASSERT(it.beginKey() < allKeys.end);
ASSERT(it.beginKey().cmp(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(it.endKey().cmp(LiteralStringRef("\xff\xff")) == 0);
ASSERT(it.beginKey().compare(LiteralStringRef("stamp123\x00")) == 0);
ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0);
ASSERT(!it.is_cleared_range());
ASSERT(!it.is_conflict_range());
ASSERT(!it.is_operation());

View File

@ -1040,6 +1040,18 @@ public:
wait( ryw->resetPromise.getFuture() || ready );
if( ryw->options.readYourWritesDisabled ) {
// Stash away conflict ranges to read after commit
ryw->nativeReadRanges = ryw->tr.readConflictRanges();
ryw->nativeWriteRanges = ryw->tr.writeConflictRanges();
for (const auto& f : ryw->tr.getExtraReadConflictRanges()) {
if (f.isReady() && f.get().first < f.get().second)
ryw->nativeReadRanges.push_back(
ryw->nativeReadRanges.arena(),
KeyRangeRef(f.get().first, f.get().second)
.withPrefix(readConflictRangeKeysRange.begin, ryw->nativeReadRanges.arena()));
}
if (ryw->resetPromise.isSet())
throw ryw->resetPromise.getFuture().getError();
wait( ryw->resetPromise.getFuture() || ryw->tr.commit() );
@ -1132,7 +1144,7 @@ public:
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx)
: cache(&arena), writes(&arena), tr(cx), retries(0), approximateSize(0), creationTime(now()), commitStarted(false),
options(tr), deferredError(cx->deferredError) {
options(tr), deferredError(cx->deferredError), versionStampFuture(tr.getVersionstamp()) {
std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(),
std::back_inserter(persistentOptions));
applyPersistentOptions();
@ -1290,7 +1302,7 @@ Future< Standalone<RangeResultRef> > ReadYourWritesTransaction::getRange(
}
// special key space are only allowed to query if both begin and end are in \xff\xff, \xff\xff\xff
if (specialKeys.contains(begin.getKey()) && specialKeys.contains(end.getKey()))
if (specialKeys.contains(begin.getKey()) && end.getKey() <= specialKeys.end)
return getDatabase()->specialKeySpace->getRange(Reference<ReadYourWritesTransaction>::addRef(this), begin, end,
limits, reverse);
@ -1545,6 +1557,104 @@ void ReadYourWritesTransaction::getWriteConflicts( KeyRangeMap<bool> *result ) {
}
}
Standalone<RangeResultRef> ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRangeRef kr) {
ASSERT(readConflictRangeKeysRange.contains(kr));
ASSERT(!tr.options.checkWritesEnabled)
Standalone<RangeResultRef> result;
if (!options.readYourWritesDisabled) {
kr = kr.removePrefix(readConflictRangeKeysRange.begin);
auto iter = readConflicts.rangeContainingKeyBefore(kr.begin);
if (iter->begin() == allKeys.begin && !iter->value()) {
++iter; // Conventionally '' is missing from the result range if it's not part of a read conflict
}
for (; iter->begin() < kr.end; ++iter) {
if (kr.begin <= iter->begin() && iter->begin() < kr.end) {
result.push_back(result.arena(),
KeyValueRef(iter->begin().withPrefix(readConflictRangeKeysRange.begin, result.arena()),
iter->value() ? LiteralStringRef("1") : LiteralStringRef("0")));
}
}
} else {
CoalescedKeyRefRangeMap<ValueRef> readConflicts{ LiteralStringRef("0"), specialKeys.end };
for (const auto& range : tr.readConflictRanges())
readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& range : nativeReadRanges)
readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& f : tr.getExtraReadConflictRanges()) {
if (f.isReady() && f.get().first < f.get().second)
readConflicts.insert(KeyRangeRef(f.get().first, f.get().second)
.withPrefix(readConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
auto beginIter = readConflicts.rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin) ++beginIter;
for (auto it = beginIter; it->begin() < kr.end; ++it) {
result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
}
}
return result;
}
Standalone<RangeResultRef> ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRangeRef kr) {
ASSERT(writeConflictRangeKeysRange.contains(kr));
Standalone<RangeResultRef> result;
// Memory owned by result
CoalescedKeyRefRangeMap<ValueRef> writeConflicts{ LiteralStringRef("0"), specialKeys.end };
if (!options.readYourWritesDisabled) {
KeyRangeRef strippedWriteRangePrefix = kr.removePrefix(writeConflictRangeKeysRange.begin);
WriteMap::iterator it(&writes);
it.skip(strippedWriteRangePrefix.begin);
if (it.beginKey() > allKeys.begin) --it;
for (; it.beginKey() < strippedWriteRangePrefix.end; ++it) {
if (it.is_conflict_range())
writeConflicts.insert(
KeyRangeRef(it.beginKey().toArena(result.arena()), it.endKey().toArena(result.arena()))
.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
} else {
for (const auto& range : tr.writeConflictRanges())
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
for (const auto& range : nativeWriteRanges)
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
for (const auto& k : versionStampKeys) {
KeyRange range;
if (versionStampFuture.isValid() && versionStampFuture.isReady() && !versionStampFuture.isError()) {
const auto& stamp = versionStampFuture.get();
StringRef key(range.arena(), k); // Copy
ASSERT(k.size() >= 4);
int32_t pos;
memcpy(&pos, k.end() - sizeof(int32_t), sizeof(int32_t));
pos = littleEndian32(pos);
ASSERT(pos >= 0 && pos + stamp.size() <= key.size());
memcpy(mutateString(key) + pos, stamp.begin(), stamp.size());
*(mutateString(key) + key.size() - 4) = '\x00';
// singleKeyRange, but share begin and end's memory
range = KeyRangeRef(key.substr(0, key.size() - 4), key.substr(0, key.size() - 3));
} else {
range = getVersionstampKeyRange(result.arena(), k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
}
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()),
LiteralStringRef("1"));
}
auto beginIter = writeConflicts.rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin) ++beginIter;
for (auto it = beginIter; it->begin() < kr.end; ++it) {
result.push_back(result.arena(), KeyValueRef(it->begin(), it->value()));
}
return result;
}
void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& operand, uint32_t operationType ) {
bool addWriteConflict = !options.getAndResetWriteConflictDisabled();
@ -1593,6 +1703,8 @@ void ReadYourWritesTransaction::atomicOp( const KeyRef& key, const ValueRef& ope
TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled
// this does validation of the key and needs to be performed before the readYourWritesDisabled path
KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey());
versionStampKeys.push_back(arena, k);
addWriteConflict = false;
if(!options.readYourWritesDisabled) {
writeRangeToNativeTransaction(range);
writes.addUnmodifiedAndUnreadableRange(range);
@ -1911,6 +2023,9 @@ void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) BOOST_N
cache.arena = &arena;
writes.arena = &arena;
persistentOptions = std::move(r.persistentOptions);
nativeReadRanges = std::move(r.nativeReadRanges);
nativeWriteRanges = std::move(r.nativeWriteRanges);
versionStampKeys = std::move(r.versionStampKeys);
}
ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) BOOST_NOEXCEPT :
@ -1935,6 +2050,9 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&&
watchMap = std::move( r.watchMap );
r.resetPromise = Promise<Void>();
persistentOptions = std::move(r.persistentOptions);
nativeReadRanges = std::move(r.nativeReadRanges);
nativeWriteRanges = std::move(r.nativeWriteRanges);
versionStampKeys = std::move(r.versionStampKeys);
}
Future<Void> ReadYourWritesTransaction::onError(Error const& e) {
@ -1969,6 +2087,9 @@ void ReadYourWritesTransaction::resetRyow() {
cache = SnapshotCache(&arena);
writes = WriteMap(&arena);
readConflicts = CoalescedKeyRefRangeMap<bool>();
versionStampKeys = VectorRef<KeyRef>();
nativeReadRanges = Standalone<VectorRef<KeyRangeRef>>();
nativeWriteRanges = Standalone<VectorRef<KeyRangeRef>>();
watchMap.clear();
reading = AndFuture();
approximateSize = 0;
@ -1999,6 +2120,7 @@ void ReadYourWritesTransaction::reset() {
options.reset(tr);
transactionDebugInfo.clear();
tr.fullReset();
versionStampFuture = tr.getVersionstamp();
std::copy(tr.getDatabase().getTransactionDefaults().begin(), tr.getDatabase().getTransactionDefaults().end(), std::back_inserter(persistentOptions));
resetRyow();
}

View File

@ -119,7 +119,10 @@ public:
void reset();
void debugTransaction(UID dID) { tr.debugTransaction(dID); }
Future<Void> debug_onIdle() { return reading; }
Future<Void> debug_onIdle() { return reading; }
// Wait for all reads that are currently pending to complete
Future<Void> pendingReads() { return resetPromise.getFuture() || reading; }
// Used by ThreadSafeTransaction for exceptions thrown in void methods
Error deferredError;
@ -135,6 +138,12 @@ public:
const TransactionInfo& getTransactionInfo() const {
return tr.info;
}
// Read from the special key space readConflictRangeKeysRange
Standalone<RangeResultRef> getReadConflictRangeIntersecting(KeyRangeRef kr);
// Read from the special key space writeConflictRangeKeysRange
Standalone<RangeResultRef> getWriteConflictRangeIntersecting(KeyRangeRef kr);
private:
friend class RYWImpl;
@ -152,6 +161,14 @@ private:
double creationTime;
bool commitStarted;
// For reading conflict ranges from the special key space
VectorRef<KeyRef> versionStampKeys;
Future<Standalone<StringRef>> versionStampFuture;
Standalone<VectorRef<KeyRangeRef>>
nativeReadRanges; // Used to read conflict ranges after committing an ryw disabled transaction
Standalone<VectorRef<KeyRangeRef>>
nativeWriteRanges; // Used to read conflict ranges after committing an ryw disabled transaction
Reference<TransactionDebugInfo> transactionDebugInfo;
void resetTimeout();

View File

@ -540,42 +540,27 @@ struct RestoreRequest {
int index;
Key tagName;
Key url;
bool waitForComplete;
Version targetVersion;
bool verbose;
KeyRange range;
Key addPrefix;
Key removePrefix;
bool lockDB;
UID randomUid;
std::vector<int> restoreRequests;
// Key restoreTag;
ReplyPromise<struct RestoreCommonReply> reply;
RestoreRequest() = default;
explicit RestoreRequest(const int index, const Key& tagName, const Key& url, bool waitForComplete,
Version targetVersion, bool verbose, const KeyRange& range, const Key& addPrefix,
const Key& removePrefix, bool lockDB, const UID& randomUid)
: index(index), tagName(tagName), url(url), waitForComplete(waitForComplete), targetVersion(targetVersion),
verbose(verbose), range(range), addPrefix(addPrefix), removePrefix(removePrefix), lockDB(lockDB),
randomUid(randomUid) {}
explicit RestoreRequest(const int index, const Key& tagName, const Key& url, Version targetVersion,
const KeyRange& range, const UID& randomUid)
: index(index), tagName(tagName), url(url), targetVersion(targetVersion), range(range), randomUid(randomUid) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, index, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix,
lockDB, randomUid, restoreRequests, reply);
serializer(ar, index, tagName, url, targetVersion, range, randomUid, reply);
}
std::string toString() const {
std::stringstream ss;
ss << "index:" << std::to_string(index) << " tagName:" << tagName.contents().toString()
<< " url:" << url.contents().toString() << " waitForComplete:" << std::to_string(waitForComplete)
<< " targetVersion:" << std::to_string(targetVersion) << " verbose:" << std::to_string(verbose)
<< " range:" << range.toString() << " addPrefix:" << addPrefix.contents().toString()
<< " removePrefix:" << removePrefix.contents().toString() << " lockDB:" << std::to_string(lockDB)
<< " randomUid:" << randomUid.toString();
<< " url:" << url.contents().toString() << " targetVersion:" << std::to_string(targetVersion)
<< " range:" << range.toString() << " randomUid:" << randomUid.toString();
return ss.str();
}
};

View File

@ -71,7 +71,7 @@ struct ExtStringRef {
int size() const { return base.size() + extra_zero_bytes; }
int cmp(ExtStringRef const& rhs) const {
int compare(ExtStringRef const& rhs) const {
int cbl = std::min(base.size(), rhs.base.size());
if (cbl > 0) {
int c = memcmp(base.begin(), rhs.base.begin(), cbl);
@ -82,7 +82,7 @@ struct ExtStringRef {
if (base[i]) return 1;
for(int i=cbl; i<rhs.base.size(); i++)
if (rhs.base[i]) return -1;
return size() - rhs.size();
return ::compare(size(), rhs.size());
}
bool startsWith( const ExtStringRef& s ) const {
@ -114,13 +114,21 @@ private:
int extra_zero_bytes;
};
inline bool operator == (const ExtStringRef& lhs, const ExtStringRef& rhs ) {
return lhs.size() == rhs.size() && !lhs.cmp(rhs);
return lhs.size() == rhs.size() && !lhs.compare(rhs);
}
inline bool operator != (const ExtStringRef& lhs, const ExtStringRef& rhs ) { return !(lhs==rhs); }
inline bool operator < ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)<0; }
inline bool operator > ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)>0; }
inline bool operator <= ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)<=0; }
inline bool operator >= ( const ExtStringRef& lhs, const ExtStringRef& rhs ) { return lhs.cmp(rhs)>=0; }
inline bool operator<(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) < 0;
}
inline bool operator>(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) > 0;
}
inline bool operator<=(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) <= 0;
}
inline bool operator>=(const ExtStringRef& lhs, const ExtStringRef& rhs) {
return lhs.compare(rhs) >= 0;
}
template<>
struct Traceable<ExtStringRef> : std::true_type {
@ -152,25 +160,10 @@ private:
{
values.push_back( arena, kv );
}
int compare(Entry const& r) const { return ::compare(beginKey, r.beginKey); }
bool operator < (Entry const& r) const {
return beginKey < r.beginKey;
}
bool operator < (StringRef const& r) const {
return beginKey < r;
}
bool operator <= (Entry const& r) const {
return beginKey <= r.beginKey;
}
bool operator <= (StringRef const& r) const {
return beginKey <= r;
}
bool operator == (Entry const& r) const {
return beginKey == r.beginKey;
}
bool operator == (StringRef const& r) const {
return beginKey == r;
}
int segments() const { return 2*(values.size()+1); }
};

View File

@ -243,6 +243,26 @@ Future<Optional<Value>> SpecialKeySpace::get(Reference<ReadYourWritesTransaction
return getActor(this, ryw, key);
}
ReadConflictRangeImpl::ReadConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
ACTOR static Future<Standalone<RangeResultRef>> getReadConflictRangeImpl(Reference<ReadYourWritesTransaction> ryw,
KeyRange kr) {
wait(ryw->pendingReads());
return ryw->getReadConflictRangeIntersecting(kr);
}
Future<Standalone<RangeResultRef>> ReadConflictRangeImpl::getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const {
return getReadConflictRangeImpl(ryw, kr);
}
WriteConflictRangeImpl::WriteConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
Future<Standalone<RangeResultRef>> WriteConflictRangeImpl::getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const {
return ryw->getWriteConflictRangeIntersecting(kr);
}
ConflictingKeysImpl::ConflictingKeysImpl(KeyRangeRef kr) : SpecialKeyRangeBaseImpl(kr) {}
Future<Standalone<RangeResultRef>> ConflictingKeysImpl::getRange(Reference<ReadYourWritesTransaction> ryw,

View File

@ -95,5 +95,19 @@ public:
KeyRangeRef kr) const override;
};
class ReadConflictRangeImpl : public SpecialKeyRangeBaseImpl {
public:
explicit ReadConflictRangeImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override;
};
class WriteConflictRangeImpl : public SpecialKeyRangeBaseImpl {
public:
explicit WriteConflictRangeImpl(KeyRangeRef kr);
Future<Standalone<RangeResultRef>> getRange(Reference<ReadYourWritesTransaction> ryw,
KeyRangeRef kr) const override;
};
#include "flow/unactorcompiler.h"
#endif

View File

@ -54,6 +54,7 @@ struct StorageServerInterface {
LocalityData locality;
UID uniqueID;
Endpoint base;
RequestStream<struct GetValueRequest> getValue;
RequestStream<struct GetKeyRequest> getKey;
@ -65,13 +66,13 @@ struct StorageServerInterface {
RequestStream<struct GetShardStateRequest> getShardState;
RequestStream<struct WaitMetricsRequest> waitMetrics;
RequestStream<struct SplitMetricsRequest> splitMetrics;
RequestStream<struct ReadHotSubRangeRequest> getReadHotRanges;
RequestStream<struct GetStorageMetricsRequest> getStorageMetrics;
RequestStream<ReplyPromise<Void>> waitFailure;
RequestStream<struct StorageQueuingMetricsRequest> getQueuingMetrics;
RequestStream<ReplyPromise<KeyValueStoreType>> getKeyValueStoreType;
RequestStream<struct WatchValueRequest> watchValue;
RequestStream<struct ReadHotSubRangeRequest> getReadHotRanges;
explicit StorageServerInterface(UID uid) : uniqueID( uid ) {}
StorageServerInterface() : uniqueID( deterministicRandom()->randomUniqueID() ) {}
@ -85,22 +86,50 @@ struct StorageServerInterface {
// StorageServerInterface is persisted in the database and in the tLog's data structures, so changes here have to be
// versioned carefully!
if constexpr (!is_fb_function<Ar>) {
serializer(ar, uniqueID, locality, getValue, getKey, getKeyValues, getShardState, waitMetrics, splitMetrics,
getReadHotRanges, getStorageMetrics, waitFailure, getQueuingMetrics, getKeyValueStoreType);
if (ar.protocolVersion().hasWatches()) serializer(ar, watchValue);
if (ar.protocolVersion().hasSmallEndpoints()) {
serializer(ar, uniqueID, locality, base);
if( Ar::isDeserializing ) {
getValue = RequestStream<struct GetValueRequest>( base.getAdjustedEndpoint(0) );
getKey = RequestStream<struct GetKeyRequest>( base.getAdjustedEndpoint(1) );
getKeyValues = RequestStream<struct GetKeyValuesRequest>( base.getAdjustedEndpoint(2) );
getShardState = RequestStream<struct GetShardStateRequest>( base.getAdjustedEndpoint(3) );
waitMetrics = RequestStream<struct WaitMetricsRequest>( base.getAdjustedEndpoint(4) );
splitMetrics = RequestStream<struct SplitMetricsRequest>( base.getAdjustedEndpoint(5) );
getStorageMetrics = RequestStream<struct GetStorageMetricsRequest>( base.getAdjustedEndpoint(6) );
waitFailure = RequestStream<ReplyPromise<Void>>( base.getAdjustedEndpoint(7) );
getQueuingMetrics = RequestStream<struct StorageQueuingMetricsRequest>( base.getAdjustedEndpoint(8) );
getKeyValueStoreType = RequestStream<ReplyPromise<KeyValueStoreType>>( base.getAdjustedEndpoint(9) );
watchValue = RequestStream<struct WatchValueRequest>( base.getAdjustedEndpoint(10) );
getReadHotRanges = RequestStream<struct ReadHotSubRangeRequest>( base.getAdjustedEndpoint(11) );
}
} else {
serializer(ar, uniqueID, locality, getValue, getKey, getKeyValues, getShardState, waitMetrics, splitMetrics,
getReadHotRanges, getStorageMetrics, waitFailure, getQueuingMetrics, getKeyValueStoreType,
watchValue);
ASSERT(Ar::isDeserializing);
if constexpr (is_fb_function<Ar>) {
ASSERT(false);
}
serializer(ar, uniqueID, locality, getValue, getKey, getKeyValues, getShardState, waitMetrics,
splitMetrics, getStorageMetrics, waitFailure, getQueuingMetrics, getKeyValueStoreType);
if (ar.protocolVersion().hasWatches()) serializer(ar, watchValue);
base = getValue.getEndpoint();
}
}
bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; }
bool operator < (StorageServerInterface const& s) const { return uniqueID < s.uniqueID; }
void initEndpoints() {
getValue.getEndpoint( TaskPriority::LoadBalancedEndpoint );
getKey.getEndpoint( TaskPriority::LoadBalancedEndpoint );
getKeyValues.getEndpoint( TaskPriority::LoadBalancedEndpoint );
std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
streams.push_back(getValue.getReceiver(TaskPriority::LoadBalancedEndpoint));
streams.push_back(getKey.getReceiver(TaskPriority::LoadBalancedEndpoint));
streams.push_back(getKeyValues.getReceiver(TaskPriority::LoadBalancedEndpoint));
streams.push_back(getShardState.getReceiver());
streams.push_back(waitMetrics.getReceiver());
streams.push_back(splitMetrics.getReceiver());
streams.push_back(getStorageMetrics.getReceiver());
streams.push_back(waitFailure.getReceiver());
streams.push_back(getQueuingMetrics.getReceiver());
streams.push_back(getKeyValueStoreType.getReceiver());
streams.push_back(watchValue.getReceiver());
streams.push_back(getReadHotRanges.getReceiver());
base = FlowTransport::transport().addEndpoints(streams);
}
};

View File

@ -49,16 +49,31 @@ const Value keyServersValue( Standalone<RangeResultRef> result, const std::vecto
std::vector<Tag> srcTag;
std::vector<Tag> destTag;
bool foundOldLocality = false;
for (const KeyValueRef kv : result) {
UID uid = decodeServerTagKey(kv.key);
if (std::find(src.begin(), src.end(), uid) != src.end()) {
srcTag.push_back( decodeServerTagValue(kv.value) );
if(srcTag.back().locality == tagLocalityUpgraded) {
foundOldLocality = true;
break;
}
}
if (std::find(dest.begin(), dest.end(), uid) != dest.end()) {
destTag.push_back( decodeServerTagValue(kv.value) );
if(destTag.back().locality == tagLocalityUpgraded) {
foundOldLocality = true;
break;
}
}
}
if(foundOldLocality || src.size() != srcTag.size() || dest.size() != destTag.size()) {
ASSERT_WE_THINK(foundOldLocality);
BinaryWriter wr(IncludeVersion()); wr << src << dest;
return wr.toValue();
}
return keyServersValue(srcTag, destTag);
}
const Value keyServersValue( const std::vector<Tag>& srcTag, const std::vector<Tag>& destTag ) {
@ -68,7 +83,7 @@ const Value keyServersValue( const std::vector<Tag>& srcTag, const std::vector<T
}
void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest ) {
std::vector<UID>& src, std::vector<UID>& dest, bool missingIsError ) {
if (value.size() == 0) {
src.clear();
dest.clear();
@ -106,13 +121,37 @@ void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& v
}
std::sort(src.begin(), src.end());
std::sort(dest.begin(), dest.end());
if(missingIsError && (src.size() != srcTag.size() || dest.size() != destTag.size())) {
TraceEvent(SevError, "AttemptedToDecodeMissingTag");
for (const KeyValueRef kv : result) {
Tag tag = decodeServerTagValue(kv.value);
UID serverID = decodeServerTagKey(kv.key);
TraceEvent("TagUIDMap").detail("Tag", tag.toString()).detail("UID", serverID.toString());
}
for(auto& it : srcTag) {
TraceEvent("SrcTag").detail("Tag", it.toString());
}
for(auto& it : destTag) {
TraceEvent("DestTag").detail("Tag", it.toString());
}
ASSERT(false);
}
}
const KeyRangeRef conflictingKeysRange = KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"),
LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff"));
const KeyRangeRef conflictingKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"),
LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff\xff"));
const ValueRef conflictingKeysTrue = LiteralStringRef("1");
const ValueRef conflictingKeysFalse = LiteralStringRef("0");
const KeyRangeRef readConflictRangeKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/read_conflict_range/"),
LiteralStringRef("\xff\xff/transaction/read_conflict_range/\xff\xff"));
const KeyRangeRef writeConflictRangeKeysRange =
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"),
LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff"));
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
const KeyRangeRef storageCacheKeys( LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0") );
const KeyRef storageCachePrefix = storageCacheKeys.begin;

View File

@ -58,7 +58,7 @@ const Value keyServersValue(
const std::vector<Tag>& destTag = std::vector<Tag>());
// `result` must be the full result of getting serverTagKeys
void decodeKeyServersValue( Standalone<RangeResultRef> result, const ValueRef& value,
std::vector<UID>& src, std::vector<UID>& dest );
std::vector<UID>& src, std::vector<UID>& dest, bool missingIsError = true );
// "\xff/storageCache/[[begin]]" := "[[vector<uint16_t>]]"
extern const KeyRangeRef storageCacheKeys;
@ -77,6 +77,8 @@ bool serverHasKey( ValueRef storedValue );
extern const KeyRangeRef conflictingKeysRange;
extern const ValueRef conflictingKeysTrue, conflictingKeysFalse;
extern const KeyRangeRef writeConflictRangeKeysRange;
extern const KeyRangeRef readConflictRangeKeysRange;
extern const KeyRef cacheKeysPrefix;

View File

@ -0,0 +1,56 @@
#include "fdbclient/VersionedMap.h"
#include "flow/TreeBenchmark.h"
#include "flow/UnitTest.h"
template <typename K>
struct VersionedMapHarness {
using map = VersionedMap<K, int>;
using key_type = K;
struct result {
typename map::iterator it;
result(typename map::iterator it) : it(it) {}
result& operator++() {
++it;
return *this;
}
const K& operator*() const { return it.key(); }
const K& operator->() const { return it.key(); }
bool operator==(result const& k) const { return it == k.it; }
};
map s;
void insert(K const& k) { s.insert(k, 1); }
result find(K const& k) const { return result(s.atLatest().find(k)); }
result not_found() const { return result(s.atLatest().end()); }
result begin() const { return result(s.atLatest().begin()); }
result end() const { return result(s.atLatest().end()); }
result lower_bound(K const& k) const { return result(s.atLatest().lower_bound(k)); }
result upper_bound(K const& k) const { return result(s.atLatest().upper_bound(k)); }
void erase(K const& k) { s.erase(k); }
};
TEST_CASE("performance/map/int/VersionedMap") {
VersionedMapHarness<int> tree;
treeBenchmark(tree, *randomInt);
return Void();
}
TEST_CASE("performance/map/StringRef/VersionedMap") {
Arena arena;
VersionedMapHarness<StringRef> tree;
treeBenchmark(tree, [&arena]() { return randomStr(arena); });
return Void();
}
void forceLinkVersionedMapTests() {}

View File

@ -67,7 +67,63 @@ namespace PTreeImpl {
PTree(PTree const&);
};
template<class T>
template <class T>
class PTreeFinger {
using PTreeFingerEntry = PTree<T> const*;
// This finger size supports trees with up to exp(96/4.3) ~= 4,964,514,749 entries.
// see also: check().
static constexpr size_t N = 96;
PTreeFingerEntry entries_[N];
size_t size_ = 0;
size_t bound_sz_ = 0;
public:
PTreeFinger() {}
// Explicit copy constructors ensure we copy the live values in entries_.
PTreeFinger(PTreeFinger const& f) { *this = f; }
PTreeFinger(PTreeFinger&& f) { *this = f; }
PTreeFinger& operator=(PTreeFinger const& f) {
size_ = f.size_;
bound_sz_ = f.bound_sz_;
std::copy(f.entries_, f.entries_ + size_, entries_);
return *this;
}
PTreeFinger& operator=(PTreeFinger&& f) {
size_ = std::exchange(f.size_, 0);
bound_sz_ = f.bound_sz_;
std::copy(f.entries_, f.entries_ + size_, entries_);
return *this;
}
size_t size() const { return size_; }
PTree<T> const* back() const { return entries_[size_ - 1]; }
void pop_back() { size_--; }
void clear() { size_ = 0; }
PTree<T> const* operator[](size_t i) const { return entries_[i]; }
void resize(size_t sz) {
size_ = sz;
ASSERT(size_ < N);
}
void push_back(PTree<T> const* node) {
entries_[size_++] = { node };
ASSERT(size_ < N);
}
void push_for_bound(PTree<T> const* node, bool less) {
push_back(node);
bound_sz_ = less ? size_ : bound_sz_;
}
// remove the end of the finger so that the last entry is less than the probe
void trim_to_bound() { size_ = bound_sz_; }
};
template<class T>
static Reference<PTree<T>> update( Reference<PTree<T>> const& node, bool which, Reference<PTree<T>> const& ptr, Version at ) {
if (ptr.getPtr() == node->child(which, at).getPtr()/* && node->replacedVersion <= at*/) {
return node;
@ -109,38 +165,41 @@ namespace PTreeImpl {
template<class T, class X>
bool contains(const Reference<PTree<T>>& p, Version at, const X& x) {
if (!p) return false;
bool less = x < p->data;
if (!less && !(p->data<x)) return true; // x == p->data
int cmp = compare(x, p->data);
bool less = cmp < 0;
if (cmp == 0) return true;
return contains(p->child(!less, at), at, x);
}
template<class T, class X>
void lower_bound(const Reference<PTree<T>>& p, Version at, const X& x, std::vector<const PTree<T>*>& f){
if (!p) {
while (f.size() && !(x < f.back()->data))
f.pop_back();
return;
// TODO: Remove the number of invocations of operator<, and replace with something closer to memcmp.
// and same for upper_bound.
template <class T, class X>
void lower_bound(const Reference<PTree<T>>& p, Version at, const X& x, PTreeFinger<T>& f) {
if (!p) {
f.trim_to_bound();
return;
}
f.push_back(p.getPtr());
bool less = x < p->data;
if (!less && !(p->data<x)) return; // x == p->data
lower_bound(p->child(!less, at), at, x, f);
}
int cmp = compare(x, p->data);
bool less = cmp < 0;
f.push_for_bound(p.getPtr(), less);
if (cmp == 0) return;
lower_bound(p->child(!less, at), at, x, f);
}
template<class T, class X>
void upper_bound(const Reference<PTree<T>>& p, Version at, const X& x, std::vector<const PTree<T>*>& f){
if (!p) {
while (f.size() && !(x < f.back()->data))
f.pop_back();
return;
template <class T, class X>
void upper_bound(const Reference<PTree<T>>& p, Version at, const X& x, PTreeFinger<T>& f) {
if (!p) {
f.trim_to_bound();
return;
}
f.push_back(p.getPtr());
upper_bound(p->child(!(x < p->data), at), at, x, f);
}
template<class T, bool forward>
void move(Version at, std::vector<const PTree<T>*>& f){
ASSERT(f.size());
bool less = x < p->data;
f.push_for_bound(p.getPtr(), less);
upper_bound(p->child(!less, at), at, x, f);
}
template <class T, bool forward>
void move(Version at, PTreeFinger<T>& f) {
ASSERT(f.size());
const PTree<T> *n;
n = f.back();
if (n->child(forward, at)){
@ -155,11 +214,11 @@ namespace PTreeImpl {
f.pop_back();
} while (f.size() && f.back()->child(forward, at).getPtr() == n);
}
}
}
template<class T, bool forward>
int halfMove(Version at, std::vector<const PTree<T>*>& f) {
// Post: f[:return_value] is the finger that would have been returned by move<forward>(at,f), and f[:original_length_of_f] is unmodified
template <class T, bool forward>
int halfMove(Version at, PTreeFinger<T>& f) {
// Post: f[:return_value] is the finger that would have been returned by move<forward>(at,f), and f[:original_length_of_f] is unmodified
ASSERT(f.size());
const PTree<T> *n;
n = f.back();
@ -178,35 +237,35 @@ namespace PTreeImpl {
} while (s && f[s-1]->child(forward, at).getPtr() == n);
return s;
}
}
}
template<class T>
void next(Version at, std::vector<const PTree<T>*>& f){
move<T,true>(at, f);
}
template<class T>
void previous(Version at, std::vector<const PTree<T>*>& f){
move<T,false>(at, f);
}
template <class T>
void next(Version at, PTreeFinger<T>& f) {
move<T,true>(at, f);
}
template<class T>
int halfNext(Version at, std::vector<const PTree<T>*>& f){
return halfMove<T,true>(at, f);
}
template<class T>
int halfPrevious(Version at, std::vector<const PTree<T>*>& f){
return halfMove<T,false>(at, f);
}
template <class T>
void previous(Version at, PTreeFinger<T>& f) {
move<T,false>(at, f);
}
template<class T>
T get(std::vector<const PTree<T>*>& f){
ASSERT(f.size());
template <class T>
int halfNext(Version at, PTreeFinger<T>& f) {
return halfMove<T,true>(at, f);
}
template <class T>
int halfPrevious(Version at, PTreeFinger<T>& f) {
return halfMove<T,false>(at, f);
}
template <class T>
T get(PTreeFinger<T>& f) {
ASSERT(f.size());
return f.back()->data;
}
}
// Modifies p to point to a PTree with x inserted
// Modifies p to point to a PTree with x inserted
template<class T>
void insert(Reference<PTree<T>>& p, Version at, const T& x) {
if (!p){
@ -235,24 +294,24 @@ namespace PTreeImpl {
return lastNode(p->right(at), at);
}
template<class T, bool last>
void firstOrLastFinger(const Reference<PTree<T>>& p, Version at, std::vector<const PTree<T>*>& f) {
if (!p) return;
template <class T, bool last>
void firstOrLastFinger(const Reference<PTree<T>>& p, Version at, PTreeFinger<T>& f) {
if (!p) return;
f.push_back(p.getPtr());
firstOrLastFinger<T, last>(p->child(last, at), at, f);
}
template<class T>
void first(const Reference<PTree<T>>& p, Version at, std::vector<const PTree<T>*>& f) {
return firstOrLastFinger<T, false>(p, at, f);
}
}
template<class T>
void last(const Reference<PTree<T>>& p, Version at, std::vector<const PTree<T>*>& f) {
return firstOrLastFinger<T, true>(p, at, f);
}
template <class T>
void first(const Reference<PTree<T>>& p, Version at, PTreeFinger<T>& f) {
return firstOrLastFinger<T, false>(p, at, f);
}
// modifies p to point to a PTree with the root of p removed
template <class T>
void last(const Reference<PTree<T>>& p, Version at, PTreeFinger<T>& f) {
return firstOrLastFinger<T, true>(p, at, f);
}
// modifies p to point to a PTree with the root of p removed
template<class T>
void removeRoot(Reference<PTree<T>>& p, Version at) {
if (!p->right(at))
@ -272,24 +331,27 @@ namespace PTreeImpl {
template<class T, class X>
void remove(Reference<PTree<T>>& p, Version at, const X& x) {
if (!p) ASSERT(false); // attempt to remove item not present in PTree
if (x < p->data) {
int cmp = compare(x, p->data);
if (cmp < 0) {
Reference<PTree<T>> child = p->child(0, at);
remove(child, at, x);
p = update(p, 0, child, at);
} else if (p->data < x) {
} else if (cmp > 0) {
Reference<PTree<T>> child = p->child(1, at);
remove(child, at, x);
p = update(p, 1, child, at);
} else
} else {
removeRoot(p, at);
}
}
template<class T, class X>
void remove(Reference<PTree<T>>& p, Version at, const X& begin, const X& end) {
if (!p) return;
int beginDir, endDir;
if (begin < p->data) beginDir = -1;
else if (p->data < begin) beginDir = +1;
int beginCmp = compare(begin, p->data);
if (beginCmp < 0) beginDir = -1;
else if (beginCmp > 0) beginDir = +1;
else beginDir = 0;
if (!(p->data < end)) endDir = -1;
else endDir = +1;
@ -364,7 +426,9 @@ namespace PTreeImpl {
if (!right) return left;
Reference<PTree<T>> r = Reference<PTree<T>>(new PTree<T>(lastNode(left, at)->data, at));
ASSERT( r->data < firstNode(right, at)->data);
if (EXPENSIVE_VALIDATION) {
ASSERT( r->data < firstNode(right, at)->data);
}
Reference<PTree<T>> a = left;
remove(a, at, r->data);
@ -513,6 +577,7 @@ class VersionedMap : NonCopyable {
//private:
public:
typedef PTreeImpl::PTree<MapPair<K,std::pair<T,Version>>> PTreeT;
typedef PTreeImpl::PTreeFinger<MapPair<K, std::pair<T, Version>>> PTreeFingerT;
typedef Reference< PTreeT > Tree;
Version oldestVersion, latestVersion;
@ -589,7 +654,7 @@ public:
UNSTOPPABLE_ASSERT(r->first == newOldestVersion);
vector<Tree> toFree;
std::vector<Tree> toFree;
toFree.reserve(10000);
auto newBegin = r;
Tree *lastRoot = nullptr;
@ -679,7 +744,7 @@ public:
friend class VersionedMap<K,T>;
Tree root;
Version at;
vector< PTreeT const* > finger;
PTreeFingerT finger;
};
class ViewAtVersion {

View File

@ -107,18 +107,35 @@ struct WriteMapEntry {
WriteMapEntry( KeyRef const& key, OperationStack && stack, bool following_keys_cleared, bool following_keys_conflict, bool is_conflict, bool following_keys_unreadable, bool is_unreadable ) : key(key), stack(std::move(stack)), following_keys_cleared(following_keys_cleared), following_keys_conflict(following_keys_conflict), is_conflict(is_conflict), following_keys_unreadable(following_keys_unreadable), is_unreadable(is_unreadable) {}
int compare(StringRef const& r) const { return key.compare(r); }
int compare(ExtStringRef const& r) const { return -r.compare(key); }
std::string toString() const { return printable(key); }
};
inline int compare(StringRef const& l, WriteMapEntry const& r) {
return l.compare(r.key);
}
inline int compare(ExtStringRef const& l, WriteMapEntry const& r) {
return l.compare(r.key);
}
inline bool operator < ( const WriteMapEntry& lhs, const WriteMapEntry& rhs ) { return lhs.key < rhs.key; }
inline bool operator < ( const WriteMapEntry& lhs, const StringRef& rhs ) { return lhs.key < rhs; }
inline bool operator < ( const StringRef& lhs, const WriteMapEntry& rhs ) { return lhs < rhs.key; }
inline bool operator < ( const WriteMapEntry& lhs, const ExtStringRef& rhs ) { return rhs.cmp(lhs.key)>0; }
inline bool operator < ( const ExtStringRef& lhs, const WriteMapEntry& rhs ) { return lhs.cmp(rhs.key)<0; }
inline bool operator<(const WriteMapEntry& lhs, const ExtStringRef& rhs) {
return rhs.compare(lhs.key) > 0;
}
inline bool operator<(const ExtStringRef& lhs, const WriteMapEntry& rhs) {
return lhs.compare(rhs.key) < 0;
}
class WriteMap {
private:
typedef PTreeImpl::PTree< WriteMapEntry > PTreeT;
typedef PTreeImpl::PTree<WriteMapEntry> PTreeT;
typedef PTreeImpl::PTreeFinger<WriteMapEntry> PTreeFingerT;
typedef Reference<PTreeT> Tree;
public:
@ -374,7 +391,7 @@ public:
Tree tree;
Version at;
int beginLen, endLen;
vector< PTreeT const* > finger;
PTreeFingerT finger;
bool offset; // false-> the operation stack at entry(); true-> the following cleared or unmodified range
};

View File

@ -415,7 +415,7 @@ private:
return data.result.get();
}
static volatile int32_t want_poll;
static std::atomic<int32_t> want_poll;
ACTOR static void poll_eio() {
while (eio_poll() == -1)
@ -445,7 +445,7 @@ private:
};
#ifdef FILESYSTEM_IMPL
volatile int32_t AsyncFileEIO::want_poll = 0;
std::atomic<int32_t> AsyncFileEIO::want_poll = 0;
#endif
#include "flow/unactorcompiler.h"

View File

@ -23,7 +23,6 @@
#pragma once
#include "flow/flow.h"
#include "flow/IndexedSet.h"
#include "fdbrpc/FlowTransport.h" // Endpoint
#include <unordered_map>

View File

@ -52,6 +52,7 @@ class EndpointMap : NonCopyable {
public:
EndpointMap();
void insert( NetworkMessageReceiver* r, Endpoint::Token& token, TaskPriority priority );
const Endpoint& insert( NetworkAddressList localAddresses, std::vector<std::pair<FlowReceiver*, TaskPriority>> const& streams );
NetworkMessageReceiver* get( Endpoint::Token const& token );
TaskPriority getPriority( Endpoint::Token const& token );
void remove( Endpoint::Token const& token, NetworkMessageReceiver* r );
@ -96,6 +97,41 @@ void EndpointMap::insert( NetworkMessageReceiver* r, Endpoint::Token& token, Tas
data[index].receiver = r;
}
const Endpoint& EndpointMap::insert( NetworkAddressList localAddresses, std::vector<std::pair<FlowReceiver*, TaskPriority>> const& streams ) {
int adjacentFree = 0;
int adjacentStart = -1;
firstFree = -1;
for(int i = 0; i < data.size(); i++) {
if(data[i].receiver) {
adjacentFree = 0;
} else {
data[i].nextFree = firstFree;
firstFree = i;
if(adjacentStart == -1 && ++adjacentFree == streams.size()) {
adjacentStart = i+1-adjacentFree;
firstFree = data[adjacentStart].nextFree;
}
}
}
if(adjacentStart == -1) {
data.resize( data.size()+streams.size()-adjacentFree );
adjacentStart = data.size()-streams.size();
if(adjacentFree > 0) {
firstFree = data[adjacentStart].nextFree;
}
}
UID base = deterministicRandom()->randomUniqueID();
for(int i=0; i<streams.size(); i++) {
int index = adjacentStart+i;
streams[i].first->setEndpoint( Endpoint( localAddresses, UID( base.first() | TOKEN_STREAM_FLAG, (base.second()&0xffffffff00000000LL) | index) ) );
data[index].token() = Endpoint::Token( base.first() | TOKEN_STREAM_FLAG, (base.second()&0xffffffff00000000LL) | static_cast<uint32_t>(streams[i].second) );
data[index].receiver = (NetworkMessageReceiver*) streams[i].first;
}
return streams[0].first->getEndpoint(TaskPriority::DefaultEndpoint);
}
NetworkMessageReceiver* EndpointMap::get( Endpoint::Token const& token ) {
uint32_t index = token.second();
if ( index < data.size() && data[index].token().first() == token.first() && ((data[index].token().second()&0xffffffff00000000LL)|index)==token.second() )
@ -559,7 +595,7 @@ ACTOR Future<Void> connectionKeeper( Reference<Peer> self,
// Don't immediately mark connection as failed. To stay closed to earlier behaviour of centralized
// failure monitoring, wait until connection stays failed for FLOW_KNOBS->FAILURE_DETECTION_DELAY timeout.
retryConnect = self->destination.isPublic() && e.code() == error_code_connection_failed;
retryConnect = true;
if (e.code() == error_code_connection_failed) {
if (!self->destination.isPublic()) {
// Can't connect back to non-public addresses.
@ -1240,6 +1276,10 @@ void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* rec
self->endpoints.insert( receiver, endpoint.token, taskID );
}
const Endpoint& FlowTransport::addEndpoints( std::vector<std::pair<FlowReceiver*, TaskPriority>> const& streams ) {
return self->endpoints.insert( self->localAddresses, streams );
}
void FlowTransport::removeEndpoint( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) {
self->endpoints.remove(endpoint.token, receiver);
}

View File

@ -65,6 +65,12 @@ public:
return addresses.getTLSAddress();
}
Endpoint getAdjustedEndpoint( uint32_t index ) {
uint32_t newIndex = token.second();
newIndex += index;
return Endpoint( addresses, UID(token.first(), (token.second()&0xffffffff00000000LL) | newIndex) );
}
bool operator == (Endpoint const& r) const {
return getPrimaryAddress() == r.getPrimaryAddress() && token == r.token;
}
@ -180,6 +186,8 @@ public:
void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID );
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
const Endpoint& addEndpoints( std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams );
void removeEndpoint( const Endpoint&, NetworkMessageReceiver* );
// The given local endpoint no longer delivers messages to the given receiver or uses resources

View File

@ -922,6 +922,14 @@ void filterLocalityDataForPolicy(const std::set<std::string>& keys, LocalityData
}
}
void filterLocalityDataForPolicyDcAndProcess(Reference<IReplicationPolicy> policy, LocalityData* ld) {
if (!policy) return;
std::set<std::string> keys = policy->attributeKeys();
keys.insert(LocalityData::keyDcId.toString());
keys.insert(LocalityData::keyProcessId.toString());
filterLocalityDataForPolicy(policy->attributeKeys(), ld);
}
void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, LocalityData* ld) {
if (!policy) return;
filterLocalityDataForPolicy(policy->attributeKeys(), ld);

View File

@ -85,6 +85,7 @@ extern bool validateAllCombinations(
bool bCheckIfValid = true);
/// Remove all pieces of locality information from the LocalityData that will not be used when validating the policy.
void filterLocalityDataForPolicyDcAndProcess(Reference<IReplicationPolicy> policy, LocalityData* ld);
void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, LocalityData* ld);
void filterLocalityDataForPolicy(Reference<IReplicationPolicy> policy, std::vector<LocalityData>* vld);

View File

@ -28,7 +28,7 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/networksender.actor.h"
struct FlowReceiver : private NetworkMessageReceiver {
struct FlowReceiver : public NetworkMessageReceiver {
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) {
@ -60,6 +60,12 @@ struct FlowReceiver : private NetworkMessageReceiver {
return endpoint;
}
// Adopt an externally-constructed endpoint as this receiver's local endpoint.
// May only be called once, before any endpoint has been assigned (enforced by
// the ASSERT); marks the receiver as a local endpoint so it is treated as
// message-delivering on this process.
void setEndpoint(Endpoint const& e) {
	ASSERT(!endpoint.isValid());
	m_isLocalEndpoint = true;
	endpoint = e;
}
void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) {
ASSERT(!endpoint.isValid());
m_isLocalEndpoint = true;
@ -392,6 +398,10 @@ public:
bool isEmpty() const { return !queue->isReady(); }
uint32_t size() const { return queue->size(); }
// Expose the underlying queue as a FlowReceiver paired with the TaskPriority
// its messages should be delivered at. Used to register many streams at once
// (e.g. via FlowTransport::addEndpoints) instead of one endpoint at a time.
std::pair<FlowReceiver*, TaskPriority> getReceiver( TaskPriority taskID = TaskPriority::DefaultEndpoint ) {
	// static_cast replaces the old C-style cast: NetNotifiedQueue<T> derives
	// from FlowReceiver (struct inheritance is public), so this is a checked
	// derived-to-base pointer conversion rather than an unchecked cast.
	return std::make_pair(static_cast<FlowReceiver*>(queue), taskID);
}
private:
NetNotifiedQueue<T>* queue;
};

View File

@ -366,19 +366,19 @@ tvdiff (struct timeval *tv1, struct timeval *tv2)
+ ((tv2->tv_usec - tv1->tv_usec) >> 10);
}
static unsigned int started, idle, wanted = 4;
static _Atomic(unsigned int) started, idle, wanted = 4;
static void (*want_poll_cb) (void);
static void (*done_poll_cb) (void);
static unsigned int max_poll_time; /* reslock */
static unsigned int max_poll_reqs; /* reslock */
static unsigned int nreqs; /* reqlock */
static unsigned int nready; /* reqlock */
static unsigned int npending; /* reqlock */
static unsigned int max_idle = 4; /* maximum number of threads that can idle indefinitely */
static unsigned int idle_timeout = 10; /* number of seconds after which an idle thread exits */
static _Atomic(unsigned int) max_poll_time; /* reslock */
static _Atomic(unsigned int) max_poll_reqs; /* reslock */
static _Atomic(unsigned int) nreqs; /* reqlock */
static _Atomic(unsigned int) nready; /* reqlock */
static _Atomic(unsigned int) npending; /* reqlock */
static _Atomic(unsigned int) max_idle = 4; /* maximum number of threads that can idle indefinitely */
static _Atomic(unsigned int) idle_timeout = 10; /* number of seconds after which an idle thread exits */
static xmutex_t wrklock;
static xmutex_t reslock;
@ -435,9 +435,7 @@ static unsigned int
etp_nreqs (void)
{
int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = nreqs;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -446,9 +444,7 @@ etp_nready (void)
{
unsigned int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = nready;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -458,9 +454,7 @@ etp_npending (void)
{
unsigned int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = npending;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -470,9 +464,7 @@ etp_nthreads (void)
{
unsigned int retval;
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
retval = started;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
return retval;
}
@ -744,33 +736,25 @@ etp_submit (ETP_REQ *req)
static void ecb_cold
etp_set_max_poll_time (double nseconds)
{
if (WORDACCESS_UNSAFE) X_LOCK (reslock);
max_poll_time = nseconds * EIO_TICKS;
if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
}
static void ecb_cold
etp_set_max_poll_reqs (unsigned int maxreqs)
{
if (WORDACCESS_UNSAFE) X_LOCK (reslock);
max_poll_reqs = maxreqs;
if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
}
static void ecb_cold
etp_set_max_idle (unsigned int nthreads)
{
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
max_idle = nthreads;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
}
static void ecb_cold
etp_set_idle_timeout (unsigned int seconds)
{
if (WORDACCESS_UNSAFE) X_LOCK (reqlock);
idle_timeout = seconds;
if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
}
static void ecb_cold

View File

@ -1,18 +1,6 @@
#ifndef XTHREAD_H_
#define XTHREAD_H_
/* whether word reads are potentially non-atomic.
* this is conservative, likely most arches this runs
* on have atomic word read/writes.
*/
#ifndef WORDACCESS_UNSAFE
# if __i386 || __x86_64
# define WORDACCESS_UNSAFE 0
# else
# define WORDACCESS_UNSAFE 1
# endif
#endif
/////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32

View File

@ -1054,7 +1054,7 @@ public:
m->machine = &machine;
machine.processes.push_back(m);
currentlyRebootingProcesses.erase(addresses.address);
m->excluded = g_simulator.isExcluded(addresses.address);
m->excluded = g_simulator.isExcluded(NetworkAddress(ip, port, true, false));
m->cleared = g_simulator.isCleared(addresses.address);
m->setGlobal(enTDMetrics, (flowGlobalType) &m->tdmetrics);

View File

@ -1025,7 +1025,7 @@ public:
for( auto& logSet : dbi.logSystemConfig.tLogs ) {
for( auto& it : logSet.tLogs ) {
auto tlogWorker = id_worker.find(it.interf().locality.processId());
auto tlogWorker = id_worker.find(it.interf().filteredLocality.processId());
if ( tlogWorker == id_worker.end() )
return false;
if ( tlogWorker->second.priorityInfo.isExcluded )
@ -1042,7 +1042,7 @@ public:
}
for( auto& it : logSet.logRouters ) {
auto tlogWorker = id_worker.find(it.interf().locality.processId());
auto tlogWorker = id_worker.find(it.interf().filteredLocality.processId());
if ( tlogWorker == id_worker.end() )
return false;
if ( tlogWorker->second.priorityInfo.isExcluded )
@ -1067,7 +1067,7 @@ public:
// Get proxy classes
std::vector<WorkerDetails> proxyClasses;
for(auto& it : dbi.client.proxies ) {
auto proxyWorker = id_worker.find(it.locality.processId());
auto proxyWorker = id_worker.find(it.processId);
if ( proxyWorker == id_worker.end() )
return false;
if ( proxyWorker->second.priorityInfo.isExcluded )
@ -1260,11 +1260,11 @@ public:
auto& dbInfo = db.serverInfo->get();
for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) {
for (const auto& tlog: tlogset.tLogs) {
if (tlog.present() && tlog.interf().locality.processId() == processId) return true;
if (tlog.present() && tlog.interf().filteredLocality.processId() == processId) return true;
}
}
for (const MasterProxyInterface& interf : dbInfo.client.proxies) {
if (interf.locality.processId() == processId) return true;
if (interf.processId == processId) return true;
}
for (const ResolverInterface& interf: dbInfo.resolvers) {
if (interf.locality.processId() == processId) return true;
@ -1291,13 +1291,13 @@ public:
for (const auto& tlogset : dbInfo.logSystemConfig.tLogs) {
for (const auto& tlog: tlogset.tLogs) {
if (tlog.present()) {
idUsed[tlog.interf().locality.processId()]++;
idUsed[tlog.interf().filteredLocality.processId()]++;
}
}
}
for (const MasterProxyInterface& interf : dbInfo.client.proxies) {
ASSERT(interf.locality.processId().present());
idUsed[interf.locality.processId()]++;
ASSERT(interf.processId.present());
idUsed[interf.processId]++;
}
for (const ResolverInterface& interf: dbInfo.resolvers) {
ASSERT(interf.locality.processId().present());

View File

@ -367,9 +367,10 @@ public:
const T* upperBound() const { return upper; }
DeltaTree* tree;
private:
Arena arena;
DeltaTree* tree;
DecodedNode* root;
const T* lower;
const T* upper;
@ -470,7 +471,8 @@ public:
newNode->prev = prev;
newNode->next = next;
ASSERT(deltaSize == k.writeDelta(raw->delta(tree->largeNodes), *base, commonPrefix));
int written = k.writeDelta(raw->delta(tree->largeNodes), *base, commonPrefix);
ASSERT(deltaSize == written);
raw->delta(tree->largeNodes).setPrefixSource(basePrev);
// Initialize node's item from the delta (instead of copying into arena) to avoid unnecessary arena space

View File

@ -77,65 +77,98 @@ ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> par
return 0;
}
#else
ACTOR Future<int> spawnProcess(std::string binPath, std::vector<std::string> paramList, double maxWaitTime, bool isSync, double maxSimDelayTime)
pid_t fork_child(const std::string& path,
std::vector<char*>& paramList)
{
state std::string argsString;
for (auto const& elem : paramList) {
argsString += elem + ",";
pid_t pid = fork();
if (pid == -1) {
return -1;
}
TraceEvent("SpawnProcess").detail("Cmd", binPath).detail("Args", argsString);
if (pid == 0) {
execv(const_cast<char*>(path.c_str()), &paramList[0]);
_exit(EXIT_FAILURE);
}
return pid;
}
state int err = 0;
state double runTime = 0;
state boost::process::child c(binPath, boost::process::args(paramList),
boost::process::std_err > boost::process::null);
// for async calls in simulator, always delay by a deterministic amount of time and do the call
// synchronously, otherwise the predictability of the simulator breaks
ACTOR Future<int> spawnProcess(std::string path, std::vector<std::string> args, double maxWaitTime, bool isSync, double maxSimDelayTime)
{
// for async calls in simulator, always delay by a deterministic amount of time and then
// do the call synchronously, otherwise the predictability of the simulator breaks
if (!isSync && g_network->isSimulated()) {
double snapDelay = std::max(maxSimDelayTime - 1, 0.0);
// add some randomness
snapDelay += deterministicRandom()->random01();
TraceEvent("SnapDelaySpawnProcess")
.detail("SnapDelay", snapDelay);
.detail("SnapDelay", snapDelay);
wait(delay(snapDelay));
}
if (!isSync && !g_network->isSimulated()) {
while (c.running() && runTime <= maxWaitTime) {
wait(delay(0.1));
runTime += 0.1;
}
} else {
if (g_network->isSimulated()) {
// to keep the simulator deterministic, wait till the process exits,
// hence giving a large wait time
c.wait_for(std::chrono::hours(24));
ASSERT(!c.running());
} else {
int maxWaitTimeInt = static_cast<int>(maxWaitTime + 1.0);
c.wait_for(std::chrono::seconds(maxWaitTimeInt));
}
std::vector<char*> paramList;
for (int i = 0; i < args.size(); i++) {
paramList.push_back(const_cast<char*>(args[i].c_str()));
}
paramList.push_back(nullptr);
state std::string allArgs;
for (int i = 0; i < args.size(); i++) {
allArgs += args[i];
}
if (c.running()) {
TraceEvent(SevWarnAlways, "ChildTermination")
.detail("Cmd", binPath)
.detail("Args", argsString);
c.terminate();
err = -1;
if (!c.wait_for(std::chrono::seconds(1))) {
TraceEvent(SevWarnAlways, "SpawnProcessFailedToExit")
.detail("Cmd", binPath)
.detail("Args", argsString);
state pid_t pid = fork_child(path, paramList);
if (pid == -1) {
TraceEvent(SevWarnAlways, "SpawnProcess: Command failed to spawn")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
} else if (pid > 0) {
state int status = -1;
state double runTime = 0;
while (true) {
if (runTime > maxWaitTime) {
// timing out
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed, timeout")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
}
int err = waitpid(pid, &status, WNOHANG);
if (err < 0) {
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return -1;
} else if (err == 0) {
// child process has not completed yet
if (isSync || g_network->isSimulated()) {
// synchronously sleep
threadSleep(0.1);
} else {
// yield for other actors to run
wait(delay(0.1));
}
runTime += 0.1;
} else {
// child process completed
if (!(WIFEXITED(status) && WEXITSTATUS(status) == 0)) {
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
TraceEvent("SpawnProcess : Command status")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : 0);
return 0;
}
}
} else {
err = c.exit_code();
}
TraceEvent("SpawnProcess")
.detail("Cmd", binPath)
.detail("Error", err);
return err;
return -1;
}
#endif
@ -148,6 +181,7 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
// get bin path
auto snapBin = execArg->getBinaryPath();
std::vector<std::string> paramList;
paramList.push_back(snapBin.toString());
// get user passed arguments
auto listArgs = execArg->getBinaryArgs();
for (auto elem : listArgs) {
@ -174,6 +208,7 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
folderTo = folder + "-snap-" + uidStr.toString() + "-" + role;
std::vector<std::string> paramList;
std::string mkdirBin = "/bin/mkdir";
paramList.push_back(mkdirBin);
paramList.push_back(folderTo);
cmdErr = spawnProcess(mkdirBin, paramList, maxWaitTime, false /*isSync*/, maxSimDelayTime);
wait(success(cmdErr));
@ -181,6 +216,7 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
if (err == 0) {
std::vector<std::string> paramList;
std::string cpBin = "/bin/cp";
paramList.push_back(cpBin);
paramList.push_back("-a");
paramList.push_back(folderFrom);
paramList.push_back(folderTo);

View File

@ -27,33 +27,6 @@
#include "flow/flow.h"
#include "fdbclient/FDBTypes.h"
#define REDWOOD_DEBUG 0
#define debug_printf_stream stdout
#define debug_printf_always(...) \
{ \
fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \
fprintf(debug_printf_stream, __VA_ARGS__); \
fflush(debug_printf_stream); \
}
#define debug_printf_noop(...)
#if defined(NO_INTELLISENSE)
#if REDWOOD_DEBUG
#define debug_printf debug_printf_always
#else
#define debug_printf debug_printf_noop
#endif
#else
// To get error-checking on debug_printf statements in IDE
#define debug_printf printf
#endif
#define BEACON debug_printf_always("HERE\n")
#define TRACE \
debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str());
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)

View File

@ -425,7 +425,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( POLICY_RATING_TESTS, 200 ); if( randomize && BUGGIFY ) POLICY_RATING_TESTS = 20;
init( POLICY_GENERATIONS, 100 ); if( randomize && BUGGIFY ) POLICY_GENERATIONS = 10;
init( DBINFO_SEND_AMOUNT, 2 );
init( DBINFO_SEND_AMOUNT, 5 );
init( DBINFO_BATCH_DELAY, 0.1 );
//Move Keys
@ -619,6 +619,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES, 1.0 * 1024.0 * 1024.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES = deterministicRandom()->random01() * 10.0 * 1024.0 * 1024.0 + 1; }
init( FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE, false ); if( randomize && BUGGIFY ) { FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE = deterministicRandom()->random01() < 0.5 ? true : false; }
init( FASTRESTORE_REQBATCH_PARALLEL, 50 ); if( randomize && BUGGIFY ) { FASTRESTORE_REQBATCH_PARALLEL = deterministicRandom()->random01() * 100 + 1; }
init( FASTRESTORE_REQBATCH_LOG, false ); if( randomize && BUGGIFY ) { FASTRESTORE_REQBATCH_LOG = deterministicRandom()->random01() < 0.2 ? true : false; }
init( FASTRESTORE_TXN_CLEAR_MAX, 1000 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_CLEAR_MAX = deterministicRandom()->random01() * 100 + 1; }
init( FASTRESTORE_TXN_RETRY_MAX, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_TXN_RETRY_MAX = deterministicRandom()->random01() * 100 + 1; }
// clang-format on

View File

@ -551,6 +551,9 @@ public:
int64_t FASTRESTORE_LOADER_SEND_MUTATION_MSG_BYTES; // desired size of mutation message sent from loader to appliers
bool FASTRESTORE_GET_RANGE_VERSIONS_EXPENSIVE; // parse each range file to get (range, version) it has?
int64_t FASTRESTORE_REQBATCH_PARALLEL; // number of requests to wait on for getBatchReplies()
bool FASTRESTORE_REQBATCH_LOG; // verbose log information for getReplyBatches
int FASTRESTORE_TXN_CLEAR_MAX; // threshold to start tracking each clear op in a txn
int FASTRESTORE_TXN_RETRY_MAX; // threshold to start output error on too many retries
ServerKnobs();
void initialize(bool randomize = false, ClientKnobs* clientKnobs = NULL, bool isSimulated = false);

View File

@ -529,7 +529,7 @@ ACTOR Future<Void> logRouterCore(
loop choose {
when( wait( dbInfoChange ) ) {
dbInfoChange = db->onChange();
logRouterData.allowPops = db->get().recoveryState == RecoveryState::FULLY_RECOVERED;
logRouterData.allowPops = db->get().recoveryState == RecoveryState::FULLY_RECOVERED && db->get().recoveryCount >= req.recoveryCount;
logRouterData.logSystem->set(ILogSystem::fromServerDBInfo( logRouterData.dbgid, db->get(), true ));
}
when( TLogPeekRequest req = waitNext( interf.peekMessages.getFuture() ) ) {

View File

@ -233,7 +233,7 @@ struct LogSystemConfig {
if(!tLogs[i].isLocal) {
for( int j = 0; j < tLogs[i].tLogs.size(); j++ ) {
if( tLogs[i].tLogs[j].present() ) {
return tLogs[i].tLogs[j].interf().locality.dcId();
return tLogs[i].tLogs[j].interf().filteredLocality.dcId();
}
}
}
@ -277,7 +277,7 @@ struct LogSystemConfig {
for( auto& tLogSet : tLogs ) {
for( auto& tLog : tLogSet.tLogs ) {
if( tLogSet.locality >= 0 ) {
if( tLog.present() && tLog.interf().locality.dcId() == dcId ) {
if( tLog.present() && tLog.interf().filteredLocality.dcId() == dcId ) {
matchingLocalities[tLogSet.locality]++;
} else {
allLocalities[tLogSet.locality]++;
@ -290,7 +290,7 @@ struct LogSystemConfig {
for( auto& tLogSet : oldLog.tLogs ) {
for( auto& tLog : tLogSet.tLogs ) {
if( tLogSet.locality >= 0 ) {
if( tLog.present() && tLog.interf().locality.dcId() == dcId ) {
if( tLog.present() && tLog.interf().filteredLocality.dcId() == dcId ) {
matchingLocalities[tLogSet.locality]++;
} else {
allLocalities[tLogSet.locality]++;

View File

@ -33,6 +33,7 @@ typedef uint64_t DBRecoveryCount;
struct MasterInterface {
constexpr static FileIdentifier file_identifier = 5979145;
LocalityData locality;
Endpoint base;
RequestStream< ReplyPromise<Void> > waitFailure;
RequestStream< struct TLogRejoinRequest > tlogRejoin; // sent by tlog (whether or not rebooted) to communicate with a new master
RequestStream< struct ChangeCoordinatorsRequest > changeCoordinators;
@ -48,12 +49,24 @@ struct MasterInterface {
if constexpr (!is_fb_function<Archive>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, locality, waitFailure, tlogRejoin, changeCoordinators, getCommitVersion, notifyBackupWorkerDone);
serializer(ar, locality, base);
if( Archive::isDeserializing ) {
waitFailure = RequestStream< ReplyPromise<Void> >( base.getAdjustedEndpoint(0) );
tlogRejoin = RequestStream< struct TLogRejoinRequest >( base.getAdjustedEndpoint(1) );
changeCoordinators = RequestStream< struct ChangeCoordinatorsRequest >( base.getAdjustedEndpoint(2) );
getCommitVersion = RequestStream< struct GetCommitVersionRequest >( base.getAdjustedEndpoint(3) );
notifyBackupWorkerDone = RequestStream<struct BackupWorkerDoneRequest>( base.getAdjustedEndpoint(4) );
}
}
void initEndpoints() {
getCommitVersion.getEndpoint( TaskPriority::GetConsistentReadVersion );
tlogRejoin.getEndpoint( TaskPriority::MasterTLogRejoin );
std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
streams.push_back(waitFailure.getReceiver());
streams.push_back(tlogRejoin.getReceiver(TaskPriority::MasterTLogRejoin));
streams.push_back(changeCoordinators.getReceiver());
streams.push_back(getCommitVersion.getReceiver(TaskPriority::GetConsistentReadVersion));
streams.push_back(notifyBackupWorkerDone.getReceiver());
base = FlowTransport::transport().addEndpoints(streams);
}
};

View File

@ -1690,10 +1690,12 @@ ACTOR static Future<Void> rejoinServer( MasterProxyInterface proxy, ProxyCommitD
rep.history.push_back(std::make_pair(decodeServerTagHistoryKey(history[i].key), decodeServerTagValue(history[i].value)));
}
auto localityKey = commitData->txnStateStore->readValue(tagLocalityListKeyFor(req.dcId)).get();
rep.newLocality = false;
if( localityKey.present() ) {
rep.newLocality = false;
int8_t locality = decodeTagLocalityListValue(localityKey.get());
if(locality != rep.tag.locality) {
if(rep.tag.locality != tagLocalityUpgraded && locality != rep.tag.locality) {
TraceEvent(SevWarnAlways, "SSRejoinedWithChangedLocality").detail("Tag", rep.tag.toString()).detail("DcId", req.dcId).detail("NewLocality", locality);
} else if(locality != rep.tag.locality) {
uint16_t tagId = 0;
std::vector<uint16_t> usedTags;
auto tagKeys = commitData->txnStateStore->readRange(serverTagKeys).get();
@ -1722,6 +1724,8 @@ ACTOR static Future<Void> rejoinServer( MasterProxyInterface proxy, ProxyCommitD
}
rep.newTag = Tag(locality, tagId);
}
} else if(rep.tag.locality != tagLocalityUpgraded) {
TraceEvent(SevWarnAlways, "SSRejoinedWithUnknownLocality").detail("Tag", rep.tag.toString()).detail("DcId", req.dcId);
} else {
rep.newLocality = true;
int8_t maxTagLocality = -1;

View File

@ -47,8 +47,8 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
state Future<Void> exitRole = Never();
state Future<Void> updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
actors.add(traceProcessMetrics(self, "Applier"));
actors.add(traceRoleVersionBatchProgress(self, "Applier"));
actors.add(traceProcessMetrics(self, "RestoreApplier"));
actors.add(traceRoleVersionBatchProgress(self, "RestoreApplier"));
loop {
state std::string requestTypeStr = "[Init]";
@ -113,6 +113,7 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
state NotifiedVersion& curMsgIndex = batchData->processedFileState[req.asset];
TraceEvent(SevInfo, "FastRestoreApplierPhaseReceiveMutations", self->id())
.suppressFor(1.0)
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("RestoreAssetMesssageIndex", curMsgIndex.get())
@ -157,6 +158,7 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
TraceEvent(SevInfo, "FastRestoreApplierPhaseReceiveMutationsDone", self->id())
.suppressFor(1.0)
.detail("BatchIndex", req.batchIndex)
.detail("RestoreAsset", req.asset.toString())
.detail("ProcessedMessageIndex", curMsgIndex.get())
@ -165,8 +167,16 @@ ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendVersionedMu
}
// Clear all ranges in input ranges
ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRangeRef>> ranges, Database cx) {
ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRangeRef>> ranges, double delayTime,
Database cx, UID applierID, int batchIndex) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state int retries = 0;
state double numOps = 0;
wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
TraceEvent("FastRestoreApplierClearRangeMutationsStart", applierID)
.detail("BatchIndex", batchIndex)
.detail("Ranges", ranges.size())
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
@ -176,10 +186,25 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
debugFRMutation("FastRestoreApplierApplyClearRangeMutation", 0,
MutationRef(MutationRef::ClearRange, range.begin, range.end));
tr->clear(range);
++numOps;
if (numOps >= SERVER_KNOBS->FASTRESTORE_TXN_CLEAR_MAX) {
TraceEvent(SevWarnAlways, "FastRestoreApplierClearRangeMutationsTooManyClearsInTxn")
.suppressFor(1.0)
.detail("Clears", numOps)
.detail("Ranges", ranges.size())
.detail("Range", range.toString());
}
}
wait(tr->commit());
break;
} catch (Error& e) {
retries++;
if (retries > SERVER_KNOBS->FASTRESTORE_TXN_RETRY_MAX) {
TraceEvent(SevWarnAlways, "RestoreApplierApplyClearRangeMutationsStuck", applierID)
.detail("BatchIndex", batchIndex)
.detail("ClearRanges", ranges.size())
.error(e);
}
wait(tr->onError(e));
}
}
@ -188,13 +213,17 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
// Get keys in incompleteStagingKeys and precompute the stagingKey which is stored in batchData->stagingKeys
ACTOR static Future<Void> getAndComputeStagingKeys(
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys, Database cx, UID applierID) {
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys, double delayTime, Database cx,
UID applierID, int batchIndex) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state std::vector<Future<Optional<Value>>> fValues;
state int retries = 0;
wait(delay(delayTime + deterministicRandom()->random01() * delayTime));
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysStart", applierID)
.detail("GetKeys", incompleteStagingKeys.size());
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.detail("DelayTime", delayTime);
loop {
try {
tr->reset();
@ -207,11 +236,12 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
break;
} catch (Error& e) {
if (retries++ > 10) {
TraceEvent(SevError, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck")
TraceEvent(SevError, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck", applierID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size())
.error(e);
break;
}
wait(tr->onError(e));
fValues.clear();
}
@ -220,31 +250,31 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
ASSERT(fValues.size() == incompleteStagingKeys.size());
int i = 0;
for (auto& key : incompleteStagingKeys) {
if (!fValues[i].get().present()) {
TraceEvent(SevDebug, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
if (!fValues[i].get().present()) { // Debug info to understand which key does not exist in DB
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB", applierID)
.detail("BatchIndex", batchIndex)
.detail("Key", key.first)
.detail("Reason", "Not found in DB")
.detail("PendingMutations", key.second->second.pendingMutations.size())
.detail("StagingKeyType", (int)key.second->second.type);
for (auto& vm : key.second->second.pendingMutations) {
TraceEvent(SevDebug, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysNoBaseValueInDB")
.detail("PendingMutationVersion", vm.first.toString())
.detail("PendingMutation", vm.second.toString());
}
key.second->second.precomputeResult("GetAndComputeStagingKeysNoBaseValueInDB");
i++;
continue;
key.second->second.precomputeResult("GetAndComputeStagingKeysNoBaseValueInDB", applierID, batchIndex);
} else {
// The key's version ideally should be the most recently committed version.
// But as long as it is > 1 and less than the start version of the version batch, it is the same result.
MutationRef m(MutationRef::SetValue, key.first, fValues[i].get().get());
key.second->second.add(m, LogMessageVersion(1));
key.second->second.precomputeResult("GetAndComputeStagingKeys");
i++;
key.second->second.precomputeResult("GetAndComputeStagingKeys", applierID, batchIndex);
}
i++;
}
TraceEvent("FastRestoreApplierGetAndComputeStagingKeysDone", applierID)
.detail("BatchIndex", batchIndex)
.detail("GetKeys", incompleteStagingKeys.size());
return Void();
@ -253,43 +283,44 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData> batchData, UID applierID,
int64_t batchIndex, Database cx) {
// Apply range mutations (i.e., clearRange) to database cx
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResultStart", applierID)
.detail("BatchIndex", batchIndex)
.detail("Step", "Applying clear range mutations to DB")
.detail("ClearRanges", batchData->stagingKeyRanges.size());
state std::vector<Future<Void>> fClearRanges;
std::vector<Standalone<VectorRef<KeyRangeRef>>> clearBuf;
clearBuf.push_back(Standalone<VectorRef<KeyRangeRef>>());
Standalone<VectorRef<KeyRangeRef>> clearRanges = clearBuf.back();
Standalone<VectorRef<KeyRangeRef>> clearRanges;
double curTxnSize = 0;
double delayTime = 0;
for (auto& rangeMutation : batchData->stagingKeyRanges) {
KeyRangeRef range(rangeMutation.mutation.param1, rangeMutation.mutation.param2);
debugFRMutation("FastRestoreApplierPrecomputeMutationsResultClearRange", rangeMutation.version.version,
MutationRef(MutationRef::ClearRange, range.begin, range.end));
clearRanges.push_back(clearRanges.arena(), range);
clearRanges.push_back_deep(clearRanges.arena(), range);
curTxnSize += range.expectedSize();
if (curTxnSize >= SERVER_KNOBS->FASTRESTORE_TXN_BATCH_MAX_BYTES) {
fClearRanges.push_back(applyClearRangeMutations(clearRanges, cx));
clearBuf.push_back(Standalone<VectorRef<KeyRangeRef>>());
clearRanges = clearBuf.back();
fClearRanges.push_back(applyClearRangeMutations(clearRanges, delayTime, cx, applierID, batchIndex));
delayTime += 0.1;
clearRanges = Standalone<VectorRef<KeyRangeRef>>();
curTxnSize = 0;
}
}
if (curTxnSize > 0) {
fClearRanges.push_back(applyClearRangeMutations(clearRanges, cx));
fClearRanges.push_back(applyClearRangeMutations(clearRanges, delayTime, cx, applierID, batchIndex));
}
// Apply range mutations (i.e., clearRange) to stagingKeyRanges
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
.detail("BatchIndex", batchIndex)
.detail("Step", "Applying clear range mutations to staging keys")
.detail("ClearRanges", batchData->stagingKeyRanges.size());
.detail("ClearRanges", batchData->stagingKeyRanges.size())
.detail("FutureClearRanges", fClearRanges.size());
for (auto& rangeMutation : batchData->stagingKeyRanges) {
ASSERT(rangeMutation.mutation.param1 <= rangeMutation.mutation.param2);
std::map<Key, StagingKey>::iterator lb = batchData->stagingKeys.lower_bound(rangeMutation.mutation.param1);
std::map<Key, StagingKey>::iterator ub = batchData->stagingKeys.lower_bound(rangeMutation.mutation.param2);
while (lb != ub) {
if (lb->first >= rangeMutation.mutation.param2) {
TraceEvent(SevError, "FastRestoreApplerPhasePrecomputeMutationsResult_IncorrectUpperBound")
TraceEvent(SevError, "FastRestoreApplerPhasePrecomputeMutationsResultIncorrectUpperBound")
.detail("Key", lb->first)
.detail("ClearRangeUpperBound", rangeMutation.mutation.param2)
.detail("UsedUpperBound", ub->first);
@ -301,6 +332,10 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
lb++;
}
}
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
.detail("BatchIndex", batchIndex)
.detail("Step", "Wait on applying clear range mutations to DB")
.detail("FutureClearRanges", fClearRanges.size());
wait(waitForAll(fClearRanges));
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
@ -313,6 +348,7 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
std::map<Key, std::map<Key, StagingKey>::iterator> incompleteStagingKeys;
std::map<Key, StagingKey>::iterator stagingKeyIter = batchData->stagingKeys.begin();
int numKeysInBatch = 0;
double delayTime = 0; // Start transactions at different times to avoid overwhelming FDB.
for (; stagingKeyIter != batchData->stagingKeys.end(); stagingKeyIter++) {
if (!stagingKeyIter->second.hasBaseValue()) {
incompleteStagingKeys.emplace(stagingKeyIter->first, stagingKeyIter);
@ -320,13 +356,16 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
numKeysInBatch++;
}
if (numKeysInBatch == SERVER_KNOBS->FASTRESTORE_APPLIER_FETCH_KEYS_SIZE) {
fGetAndComputeKeys.push_back(getAndComputeStagingKeys(incompleteStagingKeys, cx, applierID));
fGetAndComputeKeys.push_back(
getAndComputeStagingKeys(incompleteStagingKeys, delayTime, cx, applierID, batchIndex));
delayTime += 0.1;
numKeysInBatch = 0;
incompleteStagingKeys.clear();
}
}
if (numKeysInBatch > 0) {
fGetAndComputeKeys.push_back(getAndComputeStagingKeys(incompleteStagingKeys, cx, applierID));
fGetAndComputeKeys.push_back(
getAndComputeStagingKeys(incompleteStagingKeys, delayTime, cx, applierID, batchIndex));
}
TraceEvent("FastRestoreApplerPhasePrecomputeMutationsResult", applierID)
@ -337,7 +376,7 @@ ACTOR static Future<Void> precomputeMutationsResult(Reference<ApplierBatchData>
for (stagingKeyIter = batchData->stagingKeys.begin(); stagingKeyIter != batchData->stagingKeys.end();
stagingKeyIter++) {
if (stagingKeyIter->second.hasBaseValue()) {
stagingKeyIter->second.precomputeResult("HasBaseValue");
stagingKeyIter->second.precomputeResult("HasBaseValue", applierID, batchIndex);
}
}
@ -420,7 +459,7 @@ ACTOR static Future<Void> applyStagingKeys(Reference<ApplierBatchData> batchData
std::map<Key, StagingKey>::iterator cur = begin;
double txnSize = 0;
std::vector<Future<Void>> fBatches;
TraceEvent("FastRestoreApplerPhaseApplyStagingKeys", applierID)
TraceEvent("FastRestoreApplerPhaseApplyStagingKeysStart", applierID)
.detail("BatchIndex", batchIndex)
.detail("StagingKeys", batchData->stagingKeys.size());
while (cur != batchData->stagingKeys.end()) {
@ -458,23 +497,29 @@ ACTOR Future<Void> writeMutationsToDB(UID applierID, int64_t batchIndex, Referen
ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req, Reference<RestoreApplierData> self,
Database cx) {
TraceEvent("FastRestoreApplierPhaseHandleApplyToDBStart", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get());
// Ensure batch (i-1) is applied before batch i
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
state bool isDuplicated = true;
Reference<ApplierBatchData> batchData = self->batch[req.batchIndex];
TraceEvent("FastRestoreApplierPhaseHandleApplyToDB", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("HasStarted", batchData->dbApplier.present())
.detail("PreviousVersionBatchState", batchData->vbState.get());
batchData->vbState = ApplierVersionBatchState::WRITE_TO_DB;
if (self->finishedBatch.get() == req.batchIndex - 1) {
Reference<ApplierBatchData> batchData = self->batch[req.batchIndex];
TraceEvent("FastRestoreApplierPhaseHandleApplyToDBRunning", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("HasStarted", batchData->dbApplier.present())
.detail("WroteToDBDone", batchData->dbApplier.present() ? batchData->dbApplier.get().isReady() : 0)
.detail("PreviousVersionBatchState", batchData->vbState.get());
ASSERT(batchData.isValid());
if (!batchData->dbApplier.present()) {
isDuplicated = false;
batchData->dbApplier = Never();
batchData->dbApplier = writeMutationsToDB(self->id(), req.batchIndex, batchData, cx);
batchData->vbState = ApplierVersionBatchState::WRITE_TO_DB;
}
ASSERT(batchData->dbApplier.present());
@ -485,14 +530,22 @@ ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req,
// Avoid setting finishedBatch when finishedBatch > req.batchIndex
if (self->finishedBatch.get() == req.batchIndex - 1) {
self->finishedBatch.set(req.batchIndex);
self->batch[req.batchIndex]->vbState = ApplierVersionBatchState::DONE;
// Free memory for the version batch
self->batch.erase(req.batchIndex);
if (self->delayedActors > 0) {
self->checkMemory.trigger();
}
}
}
if (self->delayedActors > 0) {
self->checkMemory.trigger();
}
req.reply.send(RestoreCommonReply(self->id(), isDuplicated));
TraceEvent("FastRestoreApplierPhaseHandleApplyToDBDone", self->id())
.detail("BatchIndex", req.batchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("IsDuplicated", isDuplicated);
return Void();
}

View File

@ -117,8 +117,9 @@ struct StagingKey {
// Precompute the final value of the key.
// TODO: Look at the last LogMessageVersion, if it set or clear, we can ignore the rest of versions.
void precomputeResult(const char* context) {
TraceEvent(SevDebug, "FastRestoreApplierPrecomputeResult")
void precomputeResult(const char* context, UID applierID, int batchIndex) {
TraceEvent(SevDebug, "FastRestoreApplierPrecomputeResult", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("Version", version.toString())
.detail("Key", key)
@ -136,7 +137,9 @@ struct StagingKey {
MutationRef m = lb->second;
if (m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) {
if (std::tie(type, key, val) != std::tie(m.type, m.param1, m.param2)) {
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnhandledSituation")
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnhandledSituation", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("BufferedType", getTypeString(type))
.detail("PendingType", getTypeString(m.type))
.detail("BufferedVal", val.toString())
@ -167,11 +170,15 @@ struct StagingKey {
type = MutationRef::SetValue; // Precomputed result should be set to DB.
} else if (mutation.type == MutationRef::SetValue || mutation.type == MutationRef::ClearRange) {
type = MutationRef::SetValue; // Precomputed result should be set to DB.
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnexpectedSet")
TraceEvent(SevError, "FastRestoreApplierPrecomputeResultUnexpectedSet", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("MutationType", getTypeString(mutation.type))
.detail("Version", lb->first.toString());
} else {
TraceEvent(SevWarnAlways, "FastRestoreApplierPrecomputeResultSkipUnexpectedBackupMutation")
TraceEvent(SevWarnAlways, "FastRestoreApplierPrecomputeResultSkipUnexpectedBackupMutation", applierID)
.detail("BatchIndex", batchIndex)
.detail("Context", context)
.detail("MutationType", getTypeString(mutation.type))
.detail("Version", lb->first.toString());
}
@ -218,7 +225,8 @@ public:
static const int INIT = 1;
static const int RECEIVE_MUTATIONS = 2;
static const int WRITE_TO_DB = 3;
static const int INVALID = 4;
static const int DONE = 4;
static const int INVALID = 5;
explicit ApplierVersionBatchState(int newState) {
vbState = newState;

View File

@ -281,18 +281,23 @@ Future<Void> getBatchReplies(RequestStream<Request> Interface::*channel, std::ma
ongoingReplies.clear();
ongoingRepliesIndex.clear();
for (int i = 0; i < cmdReplies.size(); ++i) {
// TraceEvent(SevDebug, "FastRestoreGetBatchReplies")
// .detail("Requests", requests.size())
// .detail("OutstandingReplies", oustandingReplies)
// .detail("ReplyIndex", i)
// .detail("ReplyReady", cmdReplies[i].isReady())
// .detail("RequestNode", requests[i].first)
// .detail("Request", requests[i].second.toString());
if (SERVER_KNOBS->FASTRESTORE_REQBATCH_LOG) {
TraceEvent(SevInfo, "FastRestoreGetBatchReplies")
.suppressFor(1.0)
.detail("Requests", requests.size())
.detail("OutstandingReplies", oustandingReplies)
.detail("ReplyIndex", i)
.detail("ReplyIsReady", cmdReplies[i].isReady())
.detail("ReplyIsError", cmdReplies[i].isError())
.detail("RequestNode", requests[i].first)
.detail("Request", requests[i].second.toString());
}
if (!cmdReplies[i].isReady()) { // still wait for reply
ongoingReplies.push_back(cmdReplies[i]);
ongoingRepliesIndex.push_back(i);
}
}
ASSERT(ongoingReplies.size() == oustandingReplies);
if (ongoingReplies.empty()) {
break;
} else {
@ -356,7 +361,7 @@ Future<Void> getBatchReplies(RequestStream<Request> Interface::*channel, std::ma
// fprintf(stdout, "sendBatchRequests Error code:%d, error message:%s\n", e.code(), e.what());
TraceEvent(SevWarn, "FastRestoreSendBatchRequests").error(e);
for (auto& request : requests) {
TraceEvent(SevWarn, "FastRestoreLoader")
TraceEvent(SevWarn, "FastRestoreSendBatchRequests")
.detail("SendBatchRequests", requests.size())
.detail("RequestID", request.first)
.detail("Request", request.second.toString());

View File

@ -67,7 +67,7 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no
state Future<Void> exitRole = Never();
state Future<Void> updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
actors.add(traceProcessMetrics(self, "Loader"));
actors.add(traceProcessMetrics(self, "RestoreLoader"));
loop {
state std::string requestTypeStr = "[Init]";
@ -336,6 +336,8 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
.detail("NotProcessed", !paramExist)
.detail("Processed", isReady)
.detail("CurrentMemory", getSystemStatistics().processMemory);
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
wait(isSchedulable(self, req.batchIndex, __FUNCTION__));
@ -376,6 +378,8 @@ ACTOR Future<Void> handleSendMutationsRequest(RestoreSendMutationsToAppliersRequ
.detail("BatchIndex", req.batchIndex)
.detail("UseRangeFile", req.useRangeFile)
.detail("LoaderSendStatus", batchStatus->toString());
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
// Ensure each file is sent exactly once by using batchStatus->sendAllLogs and batchStatus->sendAllRanges
if (!req.useRangeFile) {
@ -945,6 +949,9 @@ ACTOR Future<Void> handleFinishVersionBatchRequest(RestoreVersionBatchRequest re
wait(self->finishedBatch.whenAtLeast(req.batchIndex - 1));
if (self->finishedBatch.get() == req.batchIndex - 1) {
self->finishedBatch.set(req.batchIndex);
// Clean up batchData
self->batch.erase(req.batchIndex);
self->status.erase(req.batchIndex);
}
if (self->delayedActors > 0) {
self->checkMemory.trigger();

View File

@ -177,6 +177,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
void resetPerRestoreRequest() {
batch.clear();
status.clear();
finishedBatch = NotifiedVersion(0);
}
void initBackupContainer(Key url) {

View File

@ -81,6 +81,7 @@ ACTOR Future<Void> startRestoreMaster(Reference<RestoreWorkerData> masterWorker,
actors.add(updateHeartbeatTime(self));
actors.add(checkRolesLiveness(self));
actors.add(traceProcessMetrics(self, "RestoreMaster"));
wait(startProcessRestoreRequests(self, cx));
} catch (Error& e) {
@ -315,7 +316,8 @@ ACTOR static Future<Version> processRestoreRequest(Reference<RestoreMasterData>
TraceEvent("FastRestoreMasterDispatchVersionBatches")
.detail("BatchIndex", batchIndex)
.detail("BatchSize", versionBatch->size)
.detail("RunningVersionBatches", self->runningVersionBatches.get());
.detail("RunningVersionBatches", self->runningVersionBatches.get())
.detail("VersionBatches", versionBatches.size());
self->batch[batchIndex] = Reference<MasterBatchData>(new MasterBatchData());
self->batchStatus[batchIndex] = Reference<MasterBatchStatus>(new MasterBatchStatus());
fBatches.push_back(distributeWorkloadPerVersionBatch(self, batchIndex, cx, request, *versionBatch));
@ -402,6 +404,7 @@ ACTOR static Future<Void> loadFilesOnLoaders(Reference<MasterBatchData> batchDat
++paramIdx;
}
TraceEvent(files->size() != paramIdx ? SevError : SevInfo, "FastRestoreMasterPhaseLoadFiles")
.detail("BatchIndex", batchIndex)
.detail("Files", files->size())
.detail("LoadParams", paramIdx);
@ -561,6 +564,9 @@ ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMas
void splitKeyRangeForAppliers(Reference<MasterBatchData> batchData,
std::map<UID, RestoreApplierInterface> appliersInterf, int batchIndex) {
ASSERT(batchData->samplesSize >= 0);
// Sanity check: samples should not be used after freed
ASSERT((batchData->samplesSize > 0 && !batchData->samples.empty()) ||
batchData->samplesSize == 0 && batchData->samples.empty());
int numAppliers = appliersInterf.size();
double slotSize = std::max(batchData->samplesSize / numAppliers, 1.0);
double cumulativeSize = slotSize;
@ -619,6 +625,7 @@ void splitKeyRangeForAppliers(Reference<MasterBatchData> batchData,
.detail("BatchIndex", batchIndex)
.detail("SamplingSize", batchData->samplesSize)
.detail("SlotSize", slotSize);
batchData->samples.clear();
}
ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequests(Database cx) {

View File

@ -57,6 +57,9 @@ ACTOR Future<Void> handleInitVersionBatchRequest(RestoreVersionBatchRequest req,
.detail("BatchIndex", req.batchIndex)
.detail("Role", getRoleStr(self->role))
.detail("VersionBatchNotifiedVersion", self->versionBatchId.get());
// Loader destroy batchData once the batch finishes and self->finishedBatch.set(req.batchIndex);
ASSERT(self->finishedBatch.get() < req.batchIndex);
// batchId is continuous. (req.batchIndex-1) is the id of the just finished batch.
wait(self->versionBatchId.whenAtLeast(req.batchIndex - 1));
@ -110,7 +113,8 @@ ACTOR Future<Void> isSchedulable(Reference<RestoreRoleData> self, int actorBatch
}
if (memory < memoryThresholdBytes || self->finishedBatch.get() + 1 == actorBatchIndex) {
if (memory >= memoryThresholdBytes) {
TraceEvent(SevWarn, "FastRestoreMemoryUsageAboveThreshold")
TraceEvent(SevWarn, "FastRestoreMemoryUsageAboveThreshold", self->id())
.detail("Role", getRoleStr(self->role))
.detail("BatchIndex", actorBatchIndex)
.detail("FinishedBatch", self->finishedBatch.get())
.detail("Actor", name)
@ -119,10 +123,12 @@ ACTOR Future<Void> isSchedulable(Reference<RestoreRoleData> self, int actorBatch
self->delayedActors--;
break;
} else {
TraceEvent(SevDebug, "FastRestoreMemoryUsageAboveThresholdWait")
TraceEvent(SevInfo, "FastRestoreMemoryUsageAboveThresholdWait", self->id())
.detail("Role", getRoleStr(self->role))
.detail("BatchIndex", actorBatchIndex)
.detail("Actor", name)
.detail("CurrentMemory", memory);
// TODO: Set FASTRESTORE_WAIT_FOR_MEMORY_LATENCY to a large value. It should be able to avoided
wait(delay(SERVER_KNOBS->FASTRESTORE_WAIT_FOR_MEMORY_LATENCY) || self->checkMemory.onTrigger());
}
}

View File

@ -104,8 +104,6 @@ public:
NotifiedVersion versionBatchId; // The index of the version batch that has been initialized and put into pipeline
NotifiedVersion finishedBatch; // The highest batch index all appliers have applied mutations
bool versionBatchStart = false;
RestoreRoleData() : role(RestoreRole::Invalid), cpuUsage(0.0), memory(0.0), residentMemory(0.0), delayedActors(0){};
virtual ~RestoreRoleData() = default;

View File

@ -33,9 +33,11 @@ struct TLogInterface {
enum { LocationAwareLoadBalance = 1 };
enum { AlwaysFresh = 1 };
LocalityData locality;
LocalityData filteredLocality;
UID uniqueID;
UID sharedTLogID;
Endpoint base;
RequestStream< struct TLogPeekRequest > peekMessages;
RequestStream< struct TLogPopRequest > popMessages;
@ -50,21 +52,30 @@ struct TLogInterface {
RequestStream< struct TLogSnapRequest> snapRequest;
TLogInterface() {}
explicit TLogInterface(const LocalityData& locality) : uniqueID( deterministicRandom()->randomUniqueID() ), locality(locality) { sharedTLogID = uniqueID; }
TLogInterface(UID sharedTLogID, const LocalityData& locality) : uniqueID( deterministicRandom()->randomUniqueID() ), sharedTLogID(sharedTLogID), locality(locality) {}
TLogInterface(UID uniqueID, UID sharedTLogID, const LocalityData& locality) : uniqueID(uniqueID), sharedTLogID(sharedTLogID), locality(locality) {}
explicit TLogInterface(const LocalityData& locality) : uniqueID( deterministicRandom()->randomUniqueID() ), filteredLocality(locality) { sharedTLogID = uniqueID; }
TLogInterface(UID sharedTLogID, const LocalityData& locality) : uniqueID( deterministicRandom()->randomUniqueID() ), sharedTLogID(sharedTLogID), filteredLocality(locality) {}
TLogInterface(UID uniqueID, UID sharedTLogID, const LocalityData& locality) : uniqueID(uniqueID), sharedTLogID(sharedTLogID), filteredLocality(locality) {}
UID id() const { return uniqueID; }
UID getSharedTLogID() const { return sharedTLogID; }
std::string toString() const { return id().shortString(); }
bool operator == ( TLogInterface const& r ) const { return id() == r.id(); }
NetworkAddress address() const { return peekMessages.getEndpoint().getPrimaryAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return peekMessages.getEndpoint().addresses.secondaryAddress; }
void initEndpoints() {
getQueuingMetrics.getEndpoint( TaskPriority::TLogQueuingMetrics );
popMessages.getEndpoint( TaskPriority::TLogPop );
peekMessages.getEndpoint( TaskPriority::TLogPeek );
confirmRunning.getEndpoint( TaskPriority::TLogConfirmRunning );
commit.getEndpoint( TaskPriority::TLogCommit );
std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
streams.push_back(peekMessages.getReceiver(TaskPriority::TLogPeek));
streams.push_back(popMessages.getReceiver(TaskPriority::TLogPop));
streams.push_back(commit.getReceiver(TaskPriority::TLogCommit));
streams.push_back(lock.getReceiver());
streams.push_back(getQueuingMetrics.getReceiver(TaskPriority::TLogQueuingMetrics));
streams.push_back(confirmRunning.getReceiver(TaskPriority::TLogConfirmRunning));
streams.push_back(waitFailure.getReceiver());
streams.push_back(recoveryFinished.getReceiver());
streams.push_back(disablePopRequest.getReceiver());
streams.push_back(enablePopRequest.getReceiver());
streams.push_back(snapRequest.getReceiver());
base = FlowTransport::transport().addEndpoints(streams);
}
template <class Ar>
@ -72,9 +83,20 @@ struct TLogInterface {
if constexpr (!is_fb_function<Ar>) {
ASSERT(ar.isDeserializing || uniqueID != UID());
}
serializer(ar, uniqueID, sharedTLogID, locality, peekMessages, popMessages
, commit, lock, getQueuingMetrics, confirmRunning, waitFailure, recoveryFinished
, disablePopRequest, enablePopRequest, snapRequest);
serializer(ar, uniqueID, sharedTLogID, filteredLocality, base);
if( Ar::isDeserializing ) {
peekMessages = RequestStream< struct TLogPeekRequest >( base.getAdjustedEndpoint(0) );
popMessages = RequestStream< struct TLogPopRequest >( base.getAdjustedEndpoint(1) );
commit = RequestStream< struct TLogCommitRequest >( base.getAdjustedEndpoint(2) );
lock = RequestStream< ReplyPromise< struct TLogLockResult > >( base.getAdjustedEndpoint(3) );
getQueuingMetrics = RequestStream< struct TLogQueuingMetricsRequest >( base.getAdjustedEndpoint(4) );
confirmRunning = RequestStream< struct TLogConfirmRunningRequest >( base.getAdjustedEndpoint(5) );
waitFailure = RequestStream< ReplyPromise<Void> >( base.getAdjustedEndpoint(6) );
recoveryFinished = RequestStream< struct TLogRecoveryFinishedRequest >( base.getAdjustedEndpoint(7) );
disablePopRequest = RequestStream< struct TLogDisablePopRequest >( base.getAdjustedEndpoint(8) );
enablePopRequest = RequestStream< struct TLogEnablePopRequest >( base.getAdjustedEndpoint(9) );
snapRequest = RequestStream< struct TLogSnapRequest >( base.getAdjustedEndpoint(10) );
}
}
};

View File

@ -356,7 +356,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
newState.tLogs.emplace_back(*t);
newState.tLogs.back().tLogLocalities.clear();
for (const auto& log : t->logServers) {
newState.tLogs.back().tLogLocalities.push_back(log->get().interf().locality);
newState.tLogs.back().tLogLocalities.push_back(log->get().interf().filteredLocality);
}
}
}
@ -1677,7 +1677,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
// trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their TLogInterfaces
state std::vector<LogLockInfo> lockResults;
state std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> allLogServers;
state std::vector<std::pair<Reference<AsyncVar<OptionalInterface<TLogInterface>>>,Reference<IReplicationPolicy>>> allLogServers;
state std::vector<Reference<LogSet>> logServers;
state std::vector<OldLogData> oldLogData;
state std::vector<std::vector<Reference<AsyncVar<bool>>>> logFailed;
@ -1686,8 +1686,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
for (const CoreTLogSet& coreSet : prevState.tLogs) {
logServers.emplace_back(new LogSet(coreSet));
std::vector<Reference<AsyncVar<bool>>> failed;
for (const auto& logVar : logServers.back()->logServers) {
allLogServers.push_back(logVar);
allLogServers.push_back(std::make_pair(logVar,coreSet.tLogPolicy));
failed.emplace_back(new AsyncVar<bool>());
failureTrackers.push_back(monitorLog(logVar, failed.back()));
}
@ -1698,7 +1699,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
oldLogData.emplace_back(oldTlogData);
for (const auto& logSet : oldLogData.back().tLogs) {
allLogServers.insert(allLogServers.end(), logSet->logServers.begin(), logSet->logServers.end());
for (const auto& logVar : logSet->logServers) {
allLogServers.push_back(std::make_pair(logVar,logSet->tLogPolicy));
}
}
}
state Future<Void> rejoins = trackRejoins( dbgid, allLogServers, rejoinRequests );
@ -2458,7 +2461,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
return logSystem;
}
ACTOR static Future<Void> trackRejoins( UID dbgid, std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers, FutureStream< struct TLogRejoinRequest > rejoinRequests ) {
ACTOR static Future<Void> trackRejoins( UID dbgid, std::vector<std::pair<Reference<AsyncVar<OptionalInterface<TLogInterface>>>,Reference<IReplicationPolicy>>> logServers, FutureStream< struct TLogRejoinRequest > rejoinRequests ) {
state std::map<UID, ReplyPromise<TLogRejoinReply>> lastReply;
try {
@ -2466,15 +2469,18 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
TLogRejoinRequest req = waitNext( rejoinRequests );
int pos = -1;
for( int i = 0; i < logServers.size(); i++ ) {
if( logServers[i]->get().id() == req.myInterface.id() ) {
if( logServers[i].first->get().id() == req.myInterface.id() ) {
pos = i;
break;
}
}
if ( pos != -1 ) {
TraceEvent("TLogJoinedMe", dbgid).detail("TLog", req.myInterface.id()).detail("Address", req.myInterface.commit.getEndpoint().getPrimaryAddress().toString());
if( !logServers[pos]->get().present() || req.myInterface.commit.getEndpoint() != logServers[pos]->get().interf().commit.getEndpoint())
logServers[pos]->setUnconditional( OptionalInterface<TLogInterface>(req.myInterface) );
if( !logServers[pos].first->get().present() || req.myInterface.commit.getEndpoint() != logServers[pos].first->get().interf().commit.getEndpoint()) {
TLogInterface interf = req.myInterface;
filterLocalityDataForPolicyDcAndProcess(logServers[pos].second, &interf.filteredLocality);
logServers[pos].first->setUnconditional( OptionalInterface<TLogInterface>(interf) );
}
lastReply[req.myInterface.id()].send(TLogRejoinReply{ false });
lastReply[req.myInterface.id()] = req.reply;
}

File diff suppressed because it is too large Load Diff

View File

@ -1866,7 +1866,7 @@ int main(int argc, char* argv[]) {
vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
actors.push_back(restoreWorker(opts.connectionFile, opts.localities, dataFolder));
f = stopAfter(waitForAll(actors));
printf("Fast restore worker exits\n");
printf("Fast restore worker started\n");
g_network->run();
printf("g_network->run() done\n");
} else { // Call fdbd roles in conventional way

View File

@ -501,7 +501,6 @@ ACTOR Future<Standalone<CommitTransactionRef>> provisionalMaster( Reference<Mast
// Register a fake master proxy (to be provided right here) to make ourselves available to clients
parent->provisionalProxies = vector<MasterProxyInterface>(1);
parent->provisionalProxies[0].provisional = true;
parent->provisionalProxies[0].locality = parent->myInterface.locality;
parent->provisionalProxies[0].initEndpoints();
state Future<Void> waitFailure = waitFailureServer(parent->provisionalProxies[0].waitFailure.getFuture());
parent->registrationTrigger.trigger();

View File

@ -818,7 +818,6 @@ ACTOR Future<Void> monitorTraceLogIssues(Reference<AsyncVar<std::set<std::string
state bool pingTimeout = false;
loop {
wait(delay(SERVER_KNOBS->TRACE_LOG_FLUSH_FAILURE_CHECK_INTERVAL_SECONDS));
TraceEvent("CrashDebugPingActionSetupInWorker");
Future<Void> pingAck = pingTraceLogWriterThread();
try {
wait(timeoutError(pingAck, SERVER_KNOBS->TRACE_LOG_PING_TIMEOUT_SECONDS));
@ -1280,7 +1279,7 @@ ACTOR Future<Void> workerServer(
}
when( InitializeMasterProxyRequest req = waitNext(interf.masterProxy.getFuture()) ) {
MasterProxyInterface recruited;
recruited.locality = locality;
recruited.processId = locality.processId();
recruited.provisional = false;
recruited.initEndpoints();

View File

@ -670,7 +670,7 @@ struct ConsistencyCheckWorkload : TestWorkload
Standalone<RangeResultRef> UIDtoTagMap = wait( tr.getRange( serverTagKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY );
decodeKeyServersValue(UIDtoTagMap, keyLocations[shard].value, sourceStorageServers, destStorageServers);
decodeKeyServersValue(UIDtoTagMap, keyLocations[shard].value, sourceStorageServers, destStorageServers, false);
//If the destStorageServers is non-empty, then this shard is being relocated
state bool isRelocating = destStorageServers.size() > 0;
@ -1482,8 +1482,8 @@ struct ConsistencyCheckWorkload : TestWorkload
TraceEvent("ConsistencyCheck_LogRouterNotInNonExcludedWorkers").detail("Id", logRouter.id());
return false;
}
if (logRouter.interf().locality.dcId() != expectedRemoteDcId) {
TraceEvent("ConsistencyCheck_LogRouterNotBestDC").detail("expectedDC", getOptionalString(expectedRemoteDcId)).detail("ActualDC", getOptionalString(logRouter.interf().locality.dcId()));
if (logRouter.interf().filteredLocality.dcId() != expectedRemoteDcId) {
TraceEvent("ConsistencyCheck_LogRouterNotBestDC").detail("expectedDC", getOptionalString(expectedRemoteDcId)).detail("ActualDC", getOptionalString(logRouter.interf().filteredLocality.dcId()));
return false;
}
}

View File

@ -652,7 +652,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
}
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && keysel2.getKey() <= specialKeys.end;
contract = {
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0) ),
@ -685,7 +685,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
keysel2 = makeKeySel();
limits = makeRangeLimits();
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && specialKeys.contains(keysel2.getKey());
bool isSpecialKeyRange = specialKeys.contains(keysel1.getKey()) && keysel2.getKey() <= specialKeys.end;
contract = {
std::make_pair( error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf( !limits.isReached() && !limits.isValid()) ),
@ -729,7 +729,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
limit = deterministicRandom()->randomInt(0, INT_MAX)+1;
}
bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
bool isSpecialKeyRange = specialKeys.contains(key1) && key2 <= specialKeys.end;
contract = {
std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),
@ -764,7 +764,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload {
key2 = makeKey();
limits = makeRangeLimits();
bool isSpecialKeyRange = specialKeys.contains(key1) && specialKeys.contains(key2);
bool isSpecialKeyRange = specialKeys.contains(key1) && key2 <= specialKeys.end;
contract = {
std::make_pair( error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2) ),

View File

@ -42,7 +42,7 @@ public:
struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
int actorCount, minKeysPerRange, maxKeysPerRange, rangeCount, keyBytes, valBytes;
int actorCount, minKeysPerRange, maxKeysPerRange, rangeCount, keyBytes, valBytes, conflictRangeSizeFactor;
double testDuration, absoluteRandomProb, transactionsPerSecond;
PerfIntCounter wrongResults, keysCount;
Reference<ReadYourWritesTransaction> ryw; // used to store all populated data
@ -60,6 +60,9 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 100.0);
actorCount = getOption(options, LiteralStringRef("actorCount"), 1);
absoluteRandomProb = getOption(options, LiteralStringRef("absoluteRandomProb"), 0.5);
// Controls the relative size of read/write conflict ranges and the number of random getranges
conflictRangeSizeFactor = getOption(options, LiteralStringRef("conflictRangeSizeFactor"), 10);
ASSERT(conflictRangeSizeFactor >= 1);
}
virtual std::string description() { return "SpecialKeySpaceCorrectness"; }
@ -72,6 +75,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
double getCheckTimeout() override { return std::numeric_limits<double>::max(); }
Future<Void> _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
cx->specialKeySpace = std::make_shared<SpecialKeySpace>();
if (self->clientId == 0) {
self->ryw = Reference(new ReadYourWritesTransaction(cx));
self->ryw->setVersion(100);
@ -97,7 +101,11 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
return Void();
}
ACTOR Future<Void> _start(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
if (self->clientId == 0) wait(timeout(self->getRangeCallActor(cx, self), self->testDuration, Void()));
if (self->clientId == 0) {
wait(timeout(self->getRangeCallActor(cx, self) && testConflictRanges(cx, /*read*/ true, self) &&
testConflictRanges(cx, /*read*/ false, self),
self->testDuration, Void()));
}
return Void();
}
@ -161,6 +169,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
.detail("TestValue", printable(res2[i].value));
return false;
}
TEST(true); // Special key space keys equal
}
return true;
}
@ -201,6 +210,131 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
return GetRangeLimits(rowLimits, byteLimits);
}
ACTOR static Future<Void> testConflictRanges(Database cx_, bool read, SpecialKeySpaceCorrectnessWorkload* self) {
state StringRef prefix = read ? readConflictRangeKeysRange.begin : writeConflictRangeKeysRange.begin;
TEST(read); // test read conflict range special key implementation
TEST(!read); // test write conflict range special key implementation
// Get a default special key range instance
Database cx = cx_->clone();
state Reference<ReadYourWritesTransaction> tx = Reference(new ReadYourWritesTransaction(cx));
state Reference<ReadYourWritesTransaction> referenceTx = Reference(new ReadYourWritesTransaction(cx));
state bool ryw = deterministicRandom()->coinflip();
if (!ryw) {
tx->setOption(FDBTransactionOptions::READ_YOUR_WRITES_DISABLE);
}
referenceTx->setVersion(100); // Prevent this from doing a GRV or committing
referenceTx->clear(normalKeys);
referenceTx->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
int numKeys = deterministicRandom()->randomInt(1, self->conflictRangeSizeFactor) * 4;
state std::vector<std::string> keys; // Must all be distinct
keys.resize(numKeys);
int lastKey = 0;
for (auto& key : keys) {
key = std::to_string(lastKey++);
}
if (deterministicRandom()->coinflip()) {
// Include beginning of keyspace
keys.push_back("");
}
if (deterministicRandom()->coinflip()) {
// Include end of keyspace
keys.push_back("\xff");
}
std::mt19937 g(deterministicRandom()->randomUInt32());
std::shuffle(keys.begin(), keys.end(), g);
// First half of the keys will be ranges, the other keys will mix in some read boundaries that aren't range
// boundaries
std::sort(keys.begin(), keys.begin() + keys.size() / 2);
for (auto iter = keys.begin(); iter + 1 < keys.begin() + keys.size() / 2; iter += 2) {
Standalone<KeyRangeRef> range = KeyRangeRef(*iter, *(iter + 1));
if (read) {
tx->addReadConflictRange(range);
// Add it twice so that we can observe the de-duplication that should get done
tx->addReadConflictRange(range);
} else {
tx->addWriteConflictRange(range);
tx->addWriteConflictRange(range);
}
// TODO test that fails if we don't wait on tx->pendingReads()
referenceTx->set(range.begin, LiteralStringRef("1"));
referenceTx->set(range.end, LiteralStringRef("0"));
}
if (!read && deterministicRandom()->coinflip()) {
try {
wait(tx->commit());
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) throw;
return Void();
}
TEST(true); // Read write conflict range of committed transaction
}
for (int i = 0; i < self->conflictRangeSizeFactor; ++i) {
GetRangeLimits limit;
KeySelector begin;
KeySelector end;
loop {
begin = firstGreaterOrEqual(deterministicRandom()->randomChoice(keys));
end = firstGreaterOrEqual(deterministicRandom()->randomChoice(keys));
if (begin.getKey() <= end.getKey()) break;
}
bool reverse = deterministicRandom()->coinflip();
auto correctResultFuture = referenceTx->getRange(begin, end, limit, false, reverse);
ASSERT(correctResultFuture.isReady());
begin.setKey(begin.getKey().withPrefix(prefix, begin.arena()));
end.setKey(end.getKey().withPrefix(prefix, begin.arena()));
auto testResultFuture = tx->getRange(begin, end, limit, false, reverse);
ASSERT(testResultFuture.isReady());
auto correct_iter = correctResultFuture.get().begin();
auto test_iter = testResultFuture.get().begin();
bool had_error = false;
while (correct_iter != correctResultFuture.get().end() && test_iter != testResultFuture.get().end()) {
if (correct_iter->key != test_iter->key.removePrefix(prefix) ||
correct_iter->value != test_iter->value) {
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Mismatched keys")
.detail("ConflictType", read ? "read" : "write")
.detail("CorrectKey", correct_iter->key)
.detail("TestKey", test_iter->key)
.detail("CorrectValue", correct_iter->value)
.detail("TestValue", test_iter->value)
.detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Ryw", ryw);
had_error = true;
}
++correct_iter;
++test_iter;
}
while (correct_iter != correctResultFuture.get().end()) {
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Extra correct key")
.detail("ConflictType", read ? "read" : "write")
.detail("CorrectKey", correct_iter->key)
.detail("CorrectValue", correct_iter->value)
.detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Ryw", ryw);
++correct_iter;
had_error = true;
}
while (test_iter != testResultFuture.get().end()) {
TraceEvent(SevError, "TestFailure")
.detail("Reason", "Extra test key")
.detail("ConflictType", read ? "read" : "write")
.detail("TestKey", test_iter->key)
.detail("TestValue", test_iter->value)
.detail("Begin", begin.toString())
.detail("End", end.toString())
.detail("Ryw", ryw);
++test_iter;
had_error = true;
}
if (had_error) break;
}
return Void();
}
};
WorkloadFactory<SpecialKeySpaceCorrectnessWorkload> SpecialKeySpaceCorrectnessFactory("SpecialKeySpaceCorrectness");

View File

@ -91,7 +91,7 @@ struct TPCCMetrics {
++failedCounter;
}
TraceEvent("TransactionComplete")
.detail("Type", txnType)
.detail("TransactionType", txnType)
.detail("Latency", responseTime)
.detail("Begin", txnStartTime)
.detail("End", txnStartTime + responseTime)

View File

@ -25,6 +25,7 @@
void forceLinkIndexedSetTests();
void forceLinkDequeTests();
void forceLinkFlowTests();
void forceLinkVersionedMapTests();
struct UnitTestWorkload : TestWorkload {
bool enabled;
@ -43,6 +44,7 @@ struct UnitTestWorkload : TestWorkload {
forceLinkIndexedSetTests();
forceLinkDequeTests();
forceLinkFlowTests();
forceLinkVersionedMapTests();
}
virtual std::string description() { return "UnitTests"; }

View File

@ -530,11 +530,12 @@ public:
int expectedSize() const { return size(); }
int compare(StringRef const& other) const {
if (std::min(size(), other.size()) > 0) {
int c = memcmp(begin(), other.begin(), std::min(size(), other.size()));
size_t minSize = std::min(size(), other.size());
if (minSize != 0) {
int c = memcmp(begin(), other.begin(), minSize);
if (c != 0) return c;
}
return size() - other.size();
return ::compare(size(), other.size());
}
// Removes bytes from begin up to and including the sep string, returns StringRef of the part before sep

View File

@ -61,6 +61,7 @@ set(FLOW_SRCS
ThreadSafeQueue.h
Trace.cpp
Trace.h
TreeBenchmark.h
UnitTest.cpp
UnitTest.h
XmlTraceLogFormatter.cpp

View File

@ -41,11 +41,22 @@ struct KeyValueMapPair {
KeyValueMapPair(KeyRef key, ValueRef value)
: arena(key.expectedSize() + value.expectedSize()), key(arena, key), value(arena, value) {}
int compare(KeyValueMapPair const& r) const { return ::compare(key, r.key); }
template <class CompatibleWithKey>
int compare(CompatibleWithKey const& r) const {
return ::compare(key, r);
}
bool operator<(KeyValueMapPair const& r) const { return key < r.key; }
bool operator==(KeyValueMapPair const& r) const { return key == r.key; }
bool operator!=(KeyValueMapPair const& r) const { return key != r.key; }
};
template <class CompatibleWithKey>
int compare(CompatibleWithKey const& l, KeyValueMapPair const& r) {
return ::compare(l, r.key);
}
template <class CompatibleWithKey>
bool operator<(KeyValueMapPair const& l, CompatibleWithKey const& r) {
return l.key < r;

View File

@ -34,6 +34,32 @@
#endif
#include <functional>
// Until we move to C++20, we'll need something to take the place of operator<=>.
// This is as good a place as any, I guess.
// Three-way comparison for integral types: returns a negative, zero, or
// positive int according to whether l is less than, equal to, or greater
// than r. Stands in for operator<=> until the codebase moves to C++20.
template <typename T>
typename std::enable_if<std::is_integral<T>::value, int>::type compare(T l, T r) {
	// Branchless: each relational result is 0 or 1, so the difference is
	// -1, 0, or 1. GCC also emits branchless code for the equivalent
	// ternary chain (l < r ? -1 : l == r ? 0 : 1), but this form performs
	// slightly better in benchmarks as of this writing.
	return static_cast<int>(l > r) - static_cast<int>(l < r);
}
// Fallback three-way comparison for non-integral types: defers to the left
// operand's member compare (e.g. StringRef::compare, UID::compare below),
// which is expected to return negative/zero/positive.
template <typename T, typename U>
typename std::enable_if<!std::is_integral<T>::value, int>::type compare(T const& l, U const& r) {
	return l.compare(r);
}
// Lexicographic three-way comparison for std::pair: the first components
// decide the order unless they compare equal, in which case the second
// components break the tie.
template <class K, class V>
int compare(std::pair<K, V> const& l, std::pair<K, V> const& r) {
	const int firstCmp = compare(l.first, r.first);
	return firstCmp != 0 ? firstCmp : compare(l.second, r.second);
}
class UID {
uint64_t part[2];
public:
@ -44,6 +70,12 @@ public:
std::string shortString() const;
bool isValid() const { return part[0] || part[1]; }
	// Three-way comparison: orders UIDs by the high 64-bit word first,
	// breaking ties with the low word. Returns negative/zero/positive
	// (see the free ::compare helpers defined earlier in this file).
	int compare(const UID& r) const {
		if (int cmp = ::compare(part[0], r.part[0])) {
			return cmp;
		}
		return ::compare(part[1], r.part[1]);
	}
bool operator == ( const UID& r ) const { return part[0]==r.part[0] && part[1]==r.part[1]; }
bool operator != ( const UID& r ) const { return part[0]!=r.part[0] || part[1]!=r.part[1]; }
bool operator < ( const UID& r ) const { return part[0] < r.part[0] || (part[0] == r.part[0] && part[1] < r.part[1]); }

View File

@ -92,12 +92,16 @@ public:
void send( T const& t ) { // Can be called safely from another thread. Call send or sendError at most once.
Promise<Void> signal;
tagAndForward( &promise, t, signal.getFuture() );
g_network->onMainThread( std::move(signal), incrementPriorityIfEven( g_network->getCurrentTask() ) );
g_network->onMainThread(std::move(signal), g_network->isOnMainThread()
? incrementPriorityIfEven(g_network->getCurrentTask())
: TaskPriority::DefaultOnMainThread);
}
void sendError( Error const& e ) { // Can be called safely from another thread. Call send or sendError at most once.
Promise<Void> signal;
tagAndForwardError( &promise, e, signal.getFuture() );
g_network->onMainThread( std::move(signal), incrementPriorityIfEven( g_network->getCurrentTask() ) );
g_network->onMainThread(std::move(signal), g_network->isOnMainThread()
? incrementPriorityIfEven(g_network->getCurrentTask())
: TaskPriority::DefaultOnMainThread);
}
private:
Promise<T> promise;

View File

@ -31,8 +31,8 @@
#include <cstring>
#include <deque>
#include <random>
#include "flow/TreeBenchmark.h"
#include "flow/UnitTest.h"
template <class Node>
int ISGetHeight(Node* n){
if (!n) return 0;
@ -137,7 +137,123 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
return Void();
}
/*TEST_CASE("/flow/IndexedSet/performance") {
// Randomized stress test: applies a random mix of inserts and erases to an
// IndexedSet<int, int> (every element carries metric weight 3), then erases a
// random key range and verifies that element counts, range sums, and tree
// balance all remain consistent.
TEST_CASE("/flow/IndexedSet/random ops") {
	for (int t = 0; t < 100; t++) {
		IndexedSet<int, int> is;
		// Number of random mutation operations for this round.
		int rr = deterministicRandom()->randomInt(0, 600) * deterministicRandom()->randomInt(0, 600);
		for (int n = 0; n < rr; n++) {
			// Erase with probability proportional to the current total metric
			// (keeps the set size roughly bounded); otherwise insert.
			if (deterministicRandom()->random01() < (double)is.sumTo(is.end()) / rr * 2)
				is.erase(is.lower_bound(deterministicRandom()->randomInt(0, 10000000)));
			else
				is.insert(deterministicRandom()->randomInt(0, 10000000), 3);
		}
		// Pick a random half-open key range [b, e) to range-erase.
		int b = deterministicRandom()->randomInt(0, 10000000);
		// int e = b + deterministicRandom()->randomInt(0, 10);
		int e = deterministicRandom()->randomInt(0, 10000000);
		if (e < b) std::swap(b, e);
		auto ib = is.lower_bound(b);
		auto ie = is.lower_bound(e);
		// Each element was inserted with metric 3, so metric sums divided by 3
		// give element counts.
		int original_count = is.sumTo(is.end()) / 3;
		int original_incount = is.sumRange(ib, ie) / 3;
		// printf("\n#%d Erasing %d of %d items\n", t, original_incount, original_count);
		is.erase(ib, ie);
		is.testonly_assertBalanced();

		// Walk the surviving elements, counting any that should have been erased.
		int count = 0, incount = 0;
		for (auto i : is) {
			++count;
			if (i >= b && i < e) {
				// printf("Remaining item: %d (%d - %d)\n", i, b, e);
				incount++;
			}
		}
		// printf("%d items remain, totalling %d\n", count, is.sumTo(is.end()));
		// printf("%d items remain in erased range\n", incount);

		// Nothing in [b, e) survived; the count dropped by exactly the erased
		// amount; and the total metric still matches 3 per element.
		ASSERT(incount == 0);
		ASSERT(count == original_count - original_incount);
		ASSERT(is.sumTo(is.end()) == count * 3);
	}
	return Void();
}
// Basic correctness of Map keyed by std::string: subscript insertion, find on
// hit and miss, and lower/upper bound lookups.
TEST_CASE("/flow/IndexedSet/strings") {
	Map<std::string, int> myMap;
	std::map<std::string, int> aMap;
	myMap["Hello"] = 1;
	myMap["Planet"] = 5;
	// NOTE(review): aMap is populated as a mirror but never checked below —
	// presumably a leftover from an earlier version of this test.
	for (auto i = myMap.begin(); i != myMap.end(); ++i) aMap[i->key] = i->value;
	ASSERT(myMap.find(std::string("Hello"))->value == 1);
	ASSERT(myMap.find(std::string("World")) == myMap.end());
	ASSERT(myMap["Hello"] == 1);
	// upper_bound("A") is the smallest key > "A" ("Hello");
	// lower_bound("M") is the smallest key >= "M" ("Planet").
	auto a = myMap.upper_bound("A")->key;
	auto x = myMap.lower_bound("M")->key;
	ASSERT((a + x) == (std::string) "HelloPlanet");
	return Void();
}
// Adapter giving IndexedSet<K, int> the uniform tree interface expected by
// treeBenchmark (flow/TreeBenchmark.h): insert/find/erase plus bound lookups,
// with the IndexedSet iterator serving directly as the result type.
template <typename K>
struct IndexedSetHarness {
	using map = IndexedSet<K, int>;
	using result = typename map::iterator;
	using key_type = K;

	map s;

	// Every key is inserted with metric weight 1.
	void insert(K const& k) { s.insert(K(k), 1); }
	result find(K const& k) const { return s.find(k); }
	result not_found() const { return s.end(); }
	result begin() const { return s.begin(); }
	result end() const { return s.end(); }
	result lower_bound(K const& k) const { return s.lower_bound(k); }
	result upper_bound(K const& k) const { return s.upper_bound(k); }
	void erase(K const& k) { s.erase(k); }
};
// Benchmarks comparing IndexedSet against std::map via the shared
// treeBenchmark driver, over random StringRef keys and random int keys.
// These print Kop/s figures; they assert nothing about timing.
TEST_CASE("performance/map/StringRef/IndexedSet") {
	Arena arena;
	IndexedSetHarness<StringRef> is;
	treeBenchmark(is, [&arena]() { return randomStr(arena); });
	return Void();
}

TEST_CASE("performance/map/StringRef/StdMap") {
	Arena arena;
	MapHarness<StringRef> is;
	treeBenchmark(is, [&arena]() { return randomStr(arena); });
	return Void();
}

TEST_CASE("performance/map/int/IndexedSet") {
	IndexedSetHarness<int> is;
	treeBenchmark(is, &randomInt);
	return Void();
}

TEST_CASE("performance/map/int/StdMap") {
	MapHarness<int> is;
	treeBenchmark(is, &randomInt);
	return Void();
}
TEST_CASE("performance/flow/IndexedSet/integers") {
std::mt19937_64 urng(deterministicRandom()->randomUInt32());
std::vector<int> x;
for (int i = 0; i<1000000; i++)
x.push_back(deterministicRandom()->randomInt(0, 10000000));
@ -151,7 +267,6 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
double end = timer();
double kps = x.size() / 1000.0 / (end - start);
printf("%0.1f Kinsert/sec\n", kps);
ASSERT(kps >= 500); //< Or something?
start = timer();
for (int i = 0; i<x.size(); i++)
@ -159,7 +274,6 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
end = timer();
kps = x.size() / 1000.0 / (end - start);
printf("%0.1f Kfind/sec\n", kps);
ASSERT(kps >= 500);
{
//std::set<int> ss;
@ -194,7 +308,7 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
is.testonly_assertBalanced();
std::random_shuffle(x.begin(), x.end());
std::shuffle(x.begin(), x.end(), urng);
start = timer();
for (int i = 0; i<x.size(); i++) {
is.erase(x[i]);
@ -204,87 +318,41 @@ TEST_CASE("/flow/IndexedSet/erase 400k of 1M") {
printf("%0.1f Kerase/sec\n", x.size() / 1000.0 / (end - start));
is.testonly_assertBalanced();
for (int i = 0; i<x.size() / 2; i++)
for (int i = 0; i < x.size() / 2; i++) {
ASSERT(is.find(x[i]) == is.end());
}*/
TEST_CASE("/flow/IndexedSet/random ops") {
for (int t = 0; t<100; t++) {
IndexedSet<int, int> is;
int rr = deterministicRandom()->randomInt(0, 600) * deterministicRandom()->randomInt(0, 600);
for (int n = 0; n<rr; n++) {
if (deterministicRandom()->random01() < (double)is.sumTo(is.end()) / rr * 2)
is.erase(is.lower_bound(deterministicRandom()->randomInt(0, 10000000)));
else
is.insert(deterministicRandom()->randomInt(0, 10000000), 3);
}
int b = deterministicRandom()->randomInt(0, 10000000);
//int e = b + deterministicRandom()->randomInt(0, 10);
int e = deterministicRandom()->randomInt(0, 10000000);
if (e<b) std::swap(b, e);
auto ib = is.lower_bound(b);
auto ie = is.lower_bound(e);
int original_count = is.sumTo(is.end())/3;
int original_incount = is.sumRange(ib, ie)/3;
//printf("\n#%d Erasing %d of %d items\n", t, original_incount, original_count);
is.erase(ib, ie);
is.testonly_assertBalanced();
int count = 0, incount = 0;
for (auto i : is) {
++count;
if (i >= b && i < e) {
//printf("Remaining item: %d (%d - %d)\n", i, b, e);
incount++;
}
}
//printf("%d items remain, totalling %d\n", count, is.sumTo(is.end()));
//printf("%d items remain in erased range\n", incount);
ASSERT(incount == 0);
ASSERT(count == original_count - original_incount);
ASSERT(is.sumTo(is.end()) == count*3);
}
return Void();
}
TEST_CASE("/flow/IndexedSet/strings") {
TEST_CASE("performance/flow/IndexedSet/strings") {
constexpr size_t count = 1000000;
Map< std::string, int > myMap;
std::map< std::string, int > aMap;
myMap["Hello"] = 1;
myMap["Planet"] = 5;
for (auto i = myMap.begin(); i != myMap.end(); ++i)
aMap[i->key] = i->value;
double start, end;
int tt = 0;
ASSERT(myMap.find("Hello")->value == 1);
ASSERT(myMap.find("World") == myMap.end());
ASSERT(myMap["Hello"] == 1);
std::string const hello{ "Hello" };
myMap[hello] = 1;
aMap["Hello"] = 1;
auto a = myMap.upper_bound("A")->key;
auto x = myMap.lower_bound("M")->key;
start = timer();
ASSERT((a + x) == (std::string)"HelloPlanet");
for (size_t i = 0; i < count; i++) {
tt += myMap.find(hello)->value;
}
end = timer();
/* This was a performance test:
ASSERT(tt == count);
double start = timer();
volatile int tt=0;
for(int i=0; i<1000000; i++)
tt += myMap.find( "Hello" )->value;
double end = timer();
printf("%0.1f Map.KfindStr/sec\n", 1000000/1000.0/(end-start));
printf("%0.1f Map.KfindStr/sec\n", count / 1000.0 / (end - start));
start = timer();
for(int i=0; i<1000000; i++)
aMap.find( "Hello" );
end = timer();
printf("%0.1f std::map.KfindStr/sec\n", 1000000/1000.0/(end-start));
*/
start = timer();
for (size_t i = 0; i < count; i++) {
aMap.find(hello);
}
end = timer();
printf("%0.1f std::map.KfindStr/sec\n", count / 1000.0 / (end - start));
return Void();
}
@ -340,6 +408,7 @@ TEST_CASE("/flow/IndexedSet/data constructor and destructor calls match") {
~Counter() { count--; }
Counter(const Counter& r) :value(r.value) { count++; }
void operator=(const Counter& r) { value = r.value; }
int compare(const Counter& r) const { return ::compare(value, r.value); }
bool operator<(const Counter& r) const { return value < r.value; }
};
IndexedSet<Counter, NoMetric> mySet;

View File

@ -22,6 +22,7 @@
#define FLOW_INDEXEDSET_H
#pragma once
#include "flow/Arena.h"
#include "flow/Platform.h"
#include "flow/FastAlloc.h"
#include "flow/Trace.h"
@ -199,7 +200,7 @@ private:
Node *root;
Metric eraseHalf( Node* start, Node* end, int eraseDir, int& heightDelta, std::vector<Node*>& toFree );
Metric eraseHalf(Node* start, Node* end, int eraseDir, int& heightDelta, std::vector<Node*>& toFree);
void erase( iterator begin, iterator end, std::vector<Node*>& toFree );
void replacePointer( Node* oldNode, Node* newNode ) {
@ -252,6 +253,11 @@ public:
MapPair(MapPair&& r) BOOST_NOEXCEPT : key(std::move(r.key)), value(std::move(r.value)) {}
void operator=(MapPair&& r) BOOST_NOEXCEPT { key = std::move(r.key); value = std::move(r.value); }
int compare(MapPair<Key, Value> const& r) const { return ::compare(key, r.key); }
template <class CompatibleWithKey>
int compare(CompatibleWithKey const& r) const {
return ::compare(key, r);
}
bool operator<(MapPair<Key,Value> const& r) const { return key < r.key; }
bool operator<=(MapPair<Key,Value> const& r) const { return key <= r.key; }
bool operator==(MapPair<Key,Value> const& r) const { return key == r.key; }
@ -260,6 +266,11 @@ public:
//private: MapPair( const MapPair& );
};
template <class Key, class Value, class CompatibleWithKey>
inline int compare(CompatibleWithKey const& l, MapPair<Key, Value> const& r) {
return compare(l, r.key);
}
template <class Key, class Value>
inline MapPair<typename std::decay<Key>::type, typename std::decay<Value>::type> mapPair(Key&& key, Value&& value) { return MapPair<typename std::decay<Key>::type, typename std::decay<Value>::type>(std::forward<Key>(key), std::forward<Value>(value)); }
@ -614,8 +625,8 @@ typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data,
int d; // direction
// traverse to find insert point
while (true){
d = t->data < data;
if (!d && !(data < t->data)) { // t->data == data
int cmp = compare(data, t->data);
if (cmp == 0) {
Node *returnNode = t;
if(replaceExisting) {
t->data = std::forward<T_>(data);
@ -633,6 +644,7 @@ typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::insert(T_&& data,
return returnNode;
}
d = cmp > 0;
Node *nextT = t->child[d];
if (!nextT) break;
t = nextT;
@ -689,7 +701,7 @@ int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVec
int d = 1; // direction
if(blockStart == NULL || (blockEnd != NULL && data >= blockEnd->data)) {
blockEnd = NULL;
if (root == NULL){
if (root == NULL) {
root = new Node(std::move(data), metric);
num_inserted++;
blockStart = root;
@ -699,11 +711,12 @@ int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVec
Node *t = root;
// traverse to find insert point
bool foundNode = false;
while (true){
d = t->data < data;
if (!d)
while (true) {
int cmp = compare(data, t->data);
d = cmp > 0;
if (d == 0)
blockEnd = t;
if (!d && !(data < t->data)) { // t->data == data
if (cmp == 0) {
Node *returnNode = t;
if(replaceExisting) {
num_inserted++;
@ -784,7 +797,8 @@ int IndexedSet<T,Metric>::insert(const std::vector<std::pair<T,Metric>>& dataVec
}
template <class T, class Metric>
Metric IndexedSet<T,Metric>::eraseHalf( Node* start, Node* end, int eraseDir, int& heightDelta, std::vector<Node*>& toFree ) {
Metric IndexedSet<T, Metric>::eraseHalf(Node* start, Node* end, int eraseDir, int& heightDelta,
std::vector<Node*>& toFree) {
// Removes all nodes between start (inclusive) and end (exclusive) from the set, where start is equal to end or one of its descendants
// eraseDir 1 means erase the right half (nodes > at) of the left subtree of end. eraseDir 0 means the left half of the right subtree
// toFree is extended with the roots of completely removed subtrees
@ -860,7 +874,7 @@ void IndexedSet<T,Metric>::erase( typename IndexedSet<T,Metric>::iterator begin,
// Removes all nodes in the set between first and last, inclusive.
// toFree is extended with the roots of completely removed subtrees.
ASSERT(!end.i || (begin.i && *begin <= *end));
ASSERT(!end.i || (begin.i && (::compare(*begin, *end) <= 0)));
if(begin == end)
return;
@ -876,8 +890,8 @@ void IndexedSet<T,Metric>::erase( typename IndexedSet<T,Metric>::iterator begin,
// Erase all matching nodes that descend from subRoot, by first erasing descendants of subRoot->child[0] and then erasing the descendants of subRoot->child[1]
// subRoot is not removed from the tree at this time
metricDelta = metricDelta + eraseHalf( first, subRoot, 1, leftHeightDelta, toFree );
metricDelta = metricDelta + eraseHalf( last, subRoot, 0, rightHeightDelta, toFree );
metricDelta = metricDelta + eraseHalf(first, subRoot, 1, leftHeightDelta, toFree);
metricDelta = metricDelta + eraseHalf(last, subRoot, 0, rightHeightDelta, toFree);
// Change in the height of subRoot due to past activity, before subRoot is rebalanced. subRoot->balance already reflects changes in height to its children.
int heightDelta = leftHeightDelta + rightHeightDelta;
@ -995,10 +1009,9 @@ template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::find(const Key &key) const {
Node* t = root;
while (t){
int d = t->data < key;
if (!d && !(key < t->data)) // t->data == key
return iterator(t);
t = t->child[d];
int cmp = compare(key, t->data);
if (cmp == 0) return iterator(t);
t = t->child[cmp > 0];
}
return end();
}
@ -1009,14 +1022,15 @@ template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::lower_bound(const Key &key) const {
Node* t = root;
if (!t) return iterator();
bool less;
while (true) {
Node *n = t->child[ t->data < key ];
less = t->data < key;
Node* n = t->child[less];
if (!n) break;
t = n;
}
if (t->data < key)
moveIterator<1>(t);
if (less) moveIterator<1>(t);
return iterator(t);
}
@ -1027,14 +1041,15 @@ template <class Key>
typename IndexedSet<T,Metric>::iterator IndexedSet<T,Metric>::upper_bound(const Key &key) const {
Node* t = root;
if (!t) return iterator();
bool not_less;
while (true) {
Node *n = t->child[ !(key < t->data) ];
not_less = !(key < t->data);
Node* n = t->child[not_less];
if (!n) break;
t = n;
}
if (!(key < t->data))
moveIterator<1>(t);
if (not_less) moveIterator<1>(t);
return iterator(t);
}

View File

@ -91,6 +91,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B063000000LL, UnifiedTLogSpilling);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, BackupWorker);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, ReportConflictingKeys);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, SmallEndpoints);
};
// These impact both communications and the deserialization of certain database and IKeyValueStore keys.

View File

@ -22,7 +22,7 @@
int64_t dl_iterate_phdr_calls = 0;
#ifdef __linux__
#if defined(__linux__) && !defined(USE_SANITIZER)
#include <link.h>
#include <mutex>

View File

@ -28,7 +28,6 @@
#define FLOW_TDMETRIC_ACTOR_H
#include "flow/flow.h"
#include "flow/IndexedSet.h"
#include "flow/network.h"
#include "flow/Knobs.h"
#include "flow/genericactors.actor.h"
@ -56,9 +55,21 @@ struct MetricNameRef {
int expectedSize() const {
return type.expectedSize() + name.expectedSize();
}
inline int compare(MetricNameRef const& r) const {
int cmp;
if ((cmp = type.compare(r.type))) {
return cmp;
}
if ((cmp = name.compare(r.name))) {
return cmp;
}
return id.compare(r.id);
}
};
extern std::string reduceFilename(std::string const &filename);
inline bool operator < (const MetricNameRef& l, const MetricNameRef& r ) {
int cmp = l.type.compare(r.type);
if(cmp == 0) {

View File

@ -197,12 +197,8 @@ public:
};
void blockUntilReady() {
if(isReadyUnsafe()) {
ThreadSpinLockHolder holder(mutex);
ASSERT(isReadyUnsafe());
}
else {
BlockCallback cb( *this );
if (!isReady()) {
BlockCallback cb(*this);
}
}

View File

@ -22,6 +22,8 @@
#define FLOW_THREADPRIMITIVES_H
#pragma once
#include <atomic>
#include "flow/Error.h"
#include "flow/Trace.h"
@ -45,7 +47,7 @@
class ThreadSpinLock {
public:
// #ifdef _WIN32
ThreadSpinLock(bool initiallyLocked=false) : isLocked(initiallyLocked) {
ThreadSpinLock() {
#if VALGRIND
ANNOTATE_RWLOCK_CREATE(this);
#endif
@ -56,31 +58,26 @@ public:
#endif
}
void enter() {
while (interlockedCompareExchange(&isLocked, 1, 0) == 1)
_mm_pause();
while (isLocked.test_and_set(std::memory_order_acquire)) _mm_pause();
#if VALGRIND
ANNOTATE_RWLOCK_ACQUIRED(this, true);
#endif
}
void leave() {
#if defined(__linux__)
__sync_synchronize();
#endif
isLocked = 0;
#if defined(__linux__)
__sync_synchronize();
#endif
isLocked.clear(std::memory_order_release);
#if VALGRIND
ANNOTATE_RWLOCK_RELEASED(this, true);
#endif
}
void assertNotEntered() {
ASSERT( !isLocked );
ASSERT(!isLocked.test_and_set(std::memory_order_acquire));
isLocked.clear(std::memory_order_release);
}
private:
ThreadSpinLock(const ThreadSpinLock&);
void operator=(const ThreadSpinLock&);
volatile int32_t isLocked;
std::atomic_flag isLocked = ATOMIC_FLAG_INIT;
};
class ThreadSpinLockHolder {

View File

@ -1053,7 +1053,7 @@ TraceEvent& TraceEvent::suppressFor( double duration, bool logSuppressedEventCou
}
}
else {
TraceEvent(SevWarnAlways, "SuppressionFromNonNetworkThread").detail("Type", type);
TraceEvent(SevWarnAlways, "SuppressionFromNonNetworkThread").detail("Event", type);
detail("__InvalidSuppression__", ""); // Choosing a detail name that is unlikely to collide with other names
}
}

126
flow/TreeBenchmark.h Normal file
View File

@ -0,0 +1,126 @@
/*
 * TreeBenchmark.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2020-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLOW_TREEBENCHMARK_H
#define FLOW_TREEBENCHMARK_H
#pragma once
#include "flow/flow.h"
#include <random>
// RAII timer for a benchmark phase: records the start time at construction
// and, on destruction, prints "<name>: <rate> Kop/s" for opCount operations.
struct opTimer {
	double start = timer();
	char const* name;
	int opCount;

	opTimer(char const* name, int opCount) : name(name), opCount(opCount) {}

	~opTimer() { printf("%s: %0.1f Kop/s\n", name, (opCount / 1000.0) / (timer() - start)); }
};
// Applies f to every element of container t, timing the whole pass with an
// opTimer (one "op" per element) and printing the rate under the given name.
template <typename F, typename T>
void timedRun(char const* name, T& t, F f) {
	opTimer timer(name, t.size());
	for (auto& i : t) {
		f(i);
	}
}
// Adapter giving std::map<K, int> the uniform tree interface expected by
// treeBenchmark: insert/find/erase plus bound lookups. The nested result
// type wraps a map iterator and exposes only the key, so the driver can
// treat set-like and map-like trees identically.
template <typename K>
struct MapHarness {
	using map = std::map<K, int>;
	using key_type = K;

	// Key-only view of a map const_iterator.
	struct result {
		typename map::const_iterator it;

		result(typename map::const_iterator it) : it(it) {}

		result& operator++() {
			it++;
			return *this;
		}

		const K& operator*() const { return (*it).first; }
		// operator-> must yield a pointer for member access to compile; the
		// previous version returned const K&, which made any use of '->'
		// ill-formed (it only built because the member was never instantiated).
		const K* operator->() const { return &it->first; }
		bool operator==(result const& k) const { return it == k.it; }
		// Provided alongside operator== so result supports the '!=' checks
		// the benchmark driver performs (e.g. find(k) != not_found()).
		bool operator!=(result const& k) const { return it != k.it; }
	};

	map s;

	// Every key maps to a dummy value of 1.
	void insert(K const& k) { s.insert(std::pair<K, int>(k, 1)); }
	result find(K const& k) const { return result(s.find(k)); }
	result not_found() const { return result(s.end()); }
	result begin() const { return result(s.begin()); }
	result end() const { return result(s.end()); }
	result lower_bound(K const& k) const { return result(s.lower_bound(k)); }
	result upper_bound(K const& k) const { return result(s.upper_bound(k)); }
	void erase(K const& k) { s.erase(k); }
};
// Generic benchmark driver: generates one million keys with generateKey, then
// times insert, find, lower_bound, upper_bound, an in-order scan, sorted
// finds, and randomized erase against any tree exposing the harness interface
// (key_type, insert/find/erase, bounds, begin/end/not_found).
template <typename T, typename F>
void treeBenchmark(T& tree, F generateKey) {
	// Shuffle seed drawn from the deterministic RNG so runs are reproducible.
	std::mt19937_64 urng(deterministicRandom()->randomUInt32());

	using key = typename T::key_type;

	int keyCount = 1000000;

	std::vector<key> keys;
	for (int i = 0; i < keyCount; i++) {
		keys.push_back(generateKey());
	}

	timedRun("insert", keys, [&tree](key const& k) { tree.insert(k); });
	timedRun("find", keys, [&tree](key const& k) { ASSERT(tree.find(k) != tree.not_found()); });

	timedRun("lower_bound", keys, [&tree](key const & k) { ASSERT(tree.lower_bound(k) != tree.not_found()); });
	timedRun("upper_bound", keys, [&tree](key const & k) { tree.upper_bound(k); });

	// Sort and deduplicate so the scan below can be checked against a known
	// in-order sequence of distinct keys.
	std::sort(keys.begin(), keys.end());
	keys.resize(std::unique(keys.begin(), keys.end()) - keys.begin());

	auto iter = tree.lower_bound(*keys.begin());
	timedRun("scan", keys, [&tree, &iter](key const& k) {
		ASSERT(k == *iter);
		++iter;
	});
	ASSERT(iter == tree.end());

	timedRun("find (sorted)", keys, [&tree](key const& k) { ASSERT(tree.find(k) != tree.end()); });

	// Erase every key in random order; the tree must end up empty.
	std::shuffle(keys.begin(), keys.end(), urng);
	timedRun("erase", keys, [&tree](key const& k) { tree.erase(k); });
	ASSERT(tree.begin() == tree.end());
}
// Key generator for the StringRef benchmarks: a random 100-character
// alphanumeric string allocated in the given arena.
static inline StringRef randomStr(Arena& arena) {
	size_t keySz = 100;
	return StringRef(arena, deterministicRandom()->randomAlphaNumeric(keySz));
}

// Key generator for the int benchmarks: uniform over [0, INT32_MAX).
static inline int randomInt() {
	return deterministicRandom()->randomInt(0, INT32_MAX);
}
#endif // FLOW_TREEBENCHMARK_H

View File

@ -2926,7 +2926,7 @@ static class VDSOInitHelper {
/* Each function is empty and called (via a macro) only in debug mode.
The arguments are captured by dynamic tools at runtime. */
#if DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 && !defined(__native_client__)
#if DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 && !defined(__native_client__) && !__has_feature(thread_sanitizer)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>

View File

@ -170,17 +170,17 @@ if(WITH_PYTHON)
TEST_FILES restarting/StorefrontTestRestart-1.txt
restarting/StorefrontTestRestart-2.txt)
add_fdb_test(
TEST_FILES SnapTestAttrition-1.txt
SnapTestAttrition-2.txt IGNORE)
TEST_FILES restarting/from_6.2.0/SnapTestAttrition-1.txt
restarting/from_6.2.0/SnapTestAttrition-2.txt)
add_fdb_test(
TEST_FILES SnapTestSimpleRestart-1.txt
SnapTestSimpleRestart-2.txt IGNORE)
TEST_FILES restarting/from_6.2.0/SnapTestSimpleRestart-1.txt
restarting/from_6.2.0/SnapTestSimpleRestart-2.txt)
add_fdb_test(
TEST_FILES SnapTestRestart-1.txt
SnapTestRestart-2.txt IGNORE)
TEST_FILES restarting/from_6.2.0/SnapTestRestart-1.txt
restarting/from_6.2.0/SnapTestRestart-2.txt)
add_fdb_test(
TEST_FILES SnapCycleRestart-1.txt
SnapCycleRestart-2.txt IGNORE)
TEST_FILES restarting/from_6.2.0/SnapCycleRestart-1.txt
restarting/from_6.2.0/SnapCycleRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt
restarting/from_5.1.7/DrUpgradeRestart-2.txt)