/*
* DataDistributionQueue.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/actorcompiler.h"
#include "flow/ActorCollection.h"
#include "fdbrpc/sim_validation.h"
#include "fdbclient/SystemData.h"
#include "DataDistribution.h"
#include "fdbclient/DatabaseContext.h"
#include "MoveKeys.h"
#include "Knobs.h"
#include "fdbrpc/simulator.h"
#define WORK_FULL_UTILIZATION 10000 // This is not a knob; it is a fixed point scaling factor!
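// A RelocateData describes one queued or in-flight shard relocation: the key range to move, the
// relocation priority, the source servers currently holding the range, and the "work factor"
// charged against each source server while the move is running. Ordering (operator>) is by
// priority, then by earliest start time, with randomId breaking ties.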
struct RelocateData {
KeyRange keys;
int priority;
double startTime;
UID randomId;
int workFactor;
std::vector<UID> src;
bool wantsNewServers;
TraceInterval interval;
RelocateData() : startTime(-1), priority(-1), workFactor(0), wantsNewServers(false), interval("QueuedRelocation") {}
RelocateData( RelocateShard const& rs ) : keys(rs.keys), priority(rs.priority), startTime(now()), randomId(g_random->randomUniqueID()), workFactor(0),
wantsNewServers(
rs.priority == PRIORITY_REBALANCE_SHARD ||
rs.priority == PRIORITY_REBALANCE_OVERUTILIZED_TEAM ||
rs.priority == PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
rs.priority == PRIORITY_SPLIT_SHARD ), interval("QueuedRelocation") {}
bool operator> (const RelocateData& rhs) const {
return priority != rhs.priority ? priority > rhs.priority : ( startTime != rhs.startTime ? startTime < rhs.startTime : randomId > rhs.randomId );
}
bool operator== (const RelocateData& rhs) const {
return priority == rhs.priority && keys == rhs.keys && startTime == rhs.startTime && workFactor == rhs.workFactor && src == rhs.src && wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId;
}
bool changesBoundaries() {
return priority == PRIORITY_MERGE_SHARD ||
priority == PRIORITY_SPLIT_SHARD ||
priority == PRIORITY_RECOVER_MOVE;
}
};
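// Busyness tracks how much relocation work a storage server is already doing, bucketed by
// priority band (priority / 100). addWork() charges a relocation's work factor to its own band
// and every band below it, so ledger[b] is the total work from relocations at band b or higher.
// canLaunch() therefore only counts equal-or-higher-priority work against a new relocation,
// which lets high-priority moves proceed even when a server is saturated with low-priority work.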
struct Busyness {
vector<int> ledger;
Busyness() : ledger( 10, 0 ) {}
bool canLaunch( int prio, int work ) {
ASSERT( prio > 0 && prio < 1000 );
return ledger[ prio / 100 ] <= WORK_FULL_UTILIZATION - work; // allow for rounding errors in double division
}
void addWork( int prio, int work ) {
ASSERT( prio > 0 && prio < 1000 );
for( int i = 0; i <= (prio / 100); i++ )
ledger[i] += work;
}
void removeWork( int prio, int work ) {
addWork( prio, -work );
}
std::string toString() {
std::string result;
for(int i = 1; i < ledger.size();) {
int j = i+1;
while(j < ledger.size() && ledger[i] == ledger[j])
j++;
if(i != 1)
result += ", ";
result += i+1 == j ? format("%03d", i*100) : format("%03d/%03d", i*100, (j-1)*100);
result += format("=%1.02f", (float)ledger[i] / WORK_FULL_UTILIZATION);
i = j;
}
return result;
}
};
// find the "workFactor" for this, were it launched now
int getWorkFactor( RelocateData const& relocation ) {
// Avoid the divide by 0!
ASSERT( relocation.src.size() );
if( relocation.priority >= PRIORITY_TEAM_1_LEFT )
return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
else if( relocation.priority >= PRIORITY_TEAM_2_LEFT )
return WORK_FULL_UTILIZATION / 2 / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
else // for now, assume that a relocation at any lower priority still has a full team available for the work
return WORK_FULL_UTILIZATION / relocation.src.size() / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
}
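// Illustrative example (not from the source; assumes a knob value for the sake of the arithmetic):
// if RELOCATION_PARALLELISM_PER_SOURCE_SERVER were 2, a PRIORITY_TEAM_1_LEFT relocation would have
// workFactor 10000/2 = 5000, so Busyness::canLaunch() would allow at most two such relocations per
// source server before that priority band reaches WORK_FULL_UTILIZATION.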
// return true if servers are not too busy to launch the relocation
bool canLaunch( RelocateData & relocation, int teamSize, std::map<UID, Busyness> & busymap,
std::vector<RelocateData> cancellableRelocations ) {
// assert this has not already been launched
ASSERT( relocation.workFactor == 0 );
ASSERT( relocation.src.size() != 0 );
// find the "workFactor" for this, were it launched now
int workFactor = getWorkFactor( relocation );
int neededServers = std::max( 1, (int)relocation.src.size() - teamSize + 1 );
// see if each of the SS can launch this task
for( int i = 0; i < relocation.src.size(); i++ ) {
// For each source server for this relocation, copy and modify its busyness to reflect work that WOULD be cancelled
auto busyCopy = busymap[ relocation.src[i] ];
for( int j = 0; j < cancellableRelocations.size(); j++ ) {
auto& servers = cancellableRelocations[j].src;
if( std::count( servers.begin(), servers.end(), relocation.src[i] ) )
busyCopy.removeWork( cancellableRelocations[j].priority, cancellableRelocations[j].workFactor );
}
// Use this modified busyness to check if this relocation could be launched
if( busyCopy.canLaunch( relocation.priority, workFactor ) ) {
--neededServers;
if( neededServers == 0 )
return true;
}
}
return false;
}
// update busyness for each server
void launch( RelocateData & relocation, std::map<UID, Busyness> & busymap ) {
// reaching this point means we can launch, so charge this relocation's work factor to every source server
relocation.workFactor = getWorkFactor( relocation );
for( int i = 0; i < relocation.src.size(); i++ )
busymap[ relocation.src[i] ].addWork( relocation.priority, relocation.workFactor );
}
void complete( RelocateData const& relocation, std::map<UID, Busyness> & busymap ) {
ASSERT( relocation.workFactor > 0 );
for( int i = 0; i < relocation.src.size(); i++ )
busymap[ relocation.src[i] ].removeWork( relocation.priority, relocation.workFactor );
}
Future<Void> dataDistributionRelocator( struct DDQueueData* const& self, RelocateData const& rd );
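// DDQueueData is the shared state of the data distribution queue. Queued relocations live in
// three structures that must stay consistent (see validate()): queueMap, a key range map of all
// queued requests; fetchingSourcesQueue, requests still waiting on getSourceServersForRange();
// and queue, a per-source-server set of requests whose sources are known. Launched relocations
// move to the inFlight key range map and inFlightActors, and their work is charged to busymap
// until the data transfer completes.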
struct DDQueueData {
MasterInterface mi;
MoveKeysLock lock;
Database cx;
TeamCollectionInterface teamCollection;
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
PromiseStream<Promise<int64_t>> getAverageShardBytes;
FlowLock startMoveKeysParallelismLock;
FlowLock finishMoveKeysParallelismLock;
int activeRelocations;
int queuedRelocations;
int bytesWritten;
std::map<int, int> priority_relocations;
int teamSize;
int durableStorageQuorum;
std::map<UID, Busyness> busymap;
KeyRangeMap< RelocateData > queueMap;
std::set<RelocateData, std::greater<RelocateData>> fetchingSourcesQueue;
std::set<RelocateData, std::greater<RelocateData>> fetchKeysComplete;
KeyRangeActorMap getSourceActors;
std::map<UID, std::set<RelocateData, std::greater<RelocateData>>> queue;
KeyRangeMap< RelocateData > inFlight;
KeyRangeActorMap inFlightActors;
Promise<Void> error;
PromiseStream<RelocateData> dataTransferComplete;
PromiseStream<RelocateData> relocationComplete;
PromiseStream<RelocateData> fetchSourceServersComplete;
PromiseStream<RelocateShard> input;
PromiseStream<GetMetricsRequest> getShardMetrics;
double* lastLimited;
DDQueueData( MasterInterface mi, MoveKeysLock lock, Database cx, TeamCollectionInterface teamCollection,
Reference<ShardsAffectedByTeamFailure> sABTF, PromiseStream<Promise<int64_t>> getAverageShardBytes,
int teamSize, int durableStorageQuorum, PromiseStream<RelocateShard> input,
PromiseStream<GetMetricsRequest> getShardMetrics, double* lastLimited ) :
activeRelocations( 0 ), queuedRelocations( 0 ), bytesWritten ( 0 ), teamCollection( teamCollection ),
shardsAffectedByTeamFailure( sABTF ), getAverageShardBytes( getAverageShardBytes ), mi( mi ), lock( lock ),
cx( cx ), teamSize( teamSize ), durableStorageQuorum( durableStorageQuorum ), input( input ),
getShardMetrics( getShardMetrics ), startMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ),
finishMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), lastLimited(lastLimited) {}
void validate() {
if( EXPENSIVE_VALIDATION ) {
for( auto it = fetchingSourcesQueue.begin(); it != fetchingSourcesQueue.end(); ++it ) {
// relocates in the fetching queue do not have src servers yet.
if( it->src.size() )
TraceEvent(SevError, "DDQueueValidateError1").detail("Problem", "relocates in the fetching queue do not have src servers yet");
// relocates in the fetching queue do not have a work factor yet.
if( it->workFactor != 0.0 )
TraceEvent(SevError, "DDQueueValidateError2").detail("Problem", "relocates in the fetching queue do not have a work factor yet");
// relocates in the fetching queue are in the queueMap.
auto range = queueMap.rangeContaining( it->keys.begin );
if( range.value() != *it || range.range() != it->keys )
TraceEvent(SevError, "DDQueueValidateError3").detail("Problem", "relocates in the fetching queue are in the queueMap");
}
/*
for( auto it = queue.begin(); it != queue.end(); ++it ) {
for( auto rdit = it->second.begin(); rdit != it->second.end(); ++rdit ) {
// relocates in the queue are in the queueMap exactly.
auto range = queueMap.rangeContaining( rdit->keys.begin );
if( range.value() != *rdit || range.range() != rdit->keys )
TraceEvent(SevError, "DDQueueValidateError4").detail("Problem", "relocates in the queue are in the queueMap exactly")
.detail("RangeBegin", printable(range.range().begin))
.detail("RangeEnd", printable(range.range().end))
.detail("RelocateBegin2", printable(range.value().keys.begin))
.detail("RelocateEnd2", printable(range.value().keys.end))
.detail("RelocateStart", range.value().startTime)
.detail("MapStart", rdit->startTime)
.detail("RelocateWork", range.value().workFactor)
.detail("MapWork", rdit->workFactor)
.detail("RelocateSrc", range.value().src.size())
.detail("MapSrc", rdit->src.size())
.detail("RelocatePrio", range.value().priority)
.detail("MapPrio", rdit->priority);
// relocates in the queue have src servers
if( !rdit->src.size() )
TraceEvent(SevError, "DDQueueValidateError5").detail("Problem", "relocates in the queue have src servers");
// relocates in the queue do not have a work factor yet.
if( rdit->workFactor != 0.0 )
TraceEvent(SevError, "DDQueueValidateError6").detail("Problem", "relocates in the queue do not have a work factor yet");
bool contains = false;
for( int i = 0; i < rdit->src.size(); i++ ) {
if( rdit->src[i] == it->first ) {
contains = true;
break;
}
}
if( !contains )
TraceEvent(SevError, "DDQueueValidateError7").detail("Problem", "queued relocate data does not include ss under which its filed");
}
}*/
auto inFlightRanges = inFlight.ranges();
for( auto it = inFlightRanges.begin(); it != inFlightRanges.end(); ++it ) {
for( int i = 0; i < it->value().src.size(); i++ ) {
// each server in the inFlight map is in the busymap
if( !busymap.count( it->value().src[i] ) )
TraceEvent(SevError, "DDQueueValidateError8").detail("Problem", "each server in the inFlight map is in the busymap");
// relocate data that is inFlight is not also in the queue
if( queue[it->value().src[i]].count( it->value() ) )
TraceEvent(SevError, "DDQueueValidateError9").detail("Problem", "relocate data that is inFlight is not also in the queue");
}
// in flight relocates have source servers
if( it->value().startTime != -1 && !it->value().src.size() )
TraceEvent(SevError, "DDQueueValidateError10").detail("Problem", "in flight relocates have source servers");
if( inFlightActors.liveActorAt( it->range().begin ) ) {
// the key range in the inFlight map matches the key range in the RelocateData message
if( it->value().keys != it->range() )
TraceEvent(SevError, "DDQueueValidateError11").detail("Problem", "the key range in the inFlight map matches the key range in the RelocateData message");
}
}
for( auto it = busymap.begin(); it != busymap.end(); ++it ) {
for( int i = 0; i < it->second.ledger.size() - 1; i++ ) {
if( it->second.ledger[i] < it->second.ledger[i+1] )
TraceEvent(SevError, "DDQueueValidateError12").detail("Problem", "ascending ledger problem")
.detail("ledgerLevel", i).detail("ledgerValueA", it->second.ledger[i]).detail("ledgerValueB", it->second.ledger[i+1]);
if( it->second.ledger[i] < 0.0 )
TraceEvent(SevError, "DDQueueValidateError13").detail("Problem", "negative ascending problem")
.detail("ledgerLevel", i).detail("ledgerValue", it->second.ledger[i]);
}
}
std::set<RelocateData, std::greater<RelocateData>> queuedRelocationsMatch;
for(auto it = queue.begin(); it != queue.end(); ++it)
queuedRelocationsMatch.insert( it->second.begin(), it->second.end() );
ASSERT( queuedRelocations == queuedRelocationsMatch.size() + fetchingSourcesQueue.size() );
int testActive = 0;
for(auto it = priority_relocations.begin(); it != priority_relocations.end(); ++it )
testActive += it->second;
ASSERT( activeRelocations + queuedRelocations == testActive );
}
}
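// Reads the keyServers entries covering the requested range to find its current source servers,
// then reports the request (with src filled in) on the output stream. If the range covers too
// many shards (DD_QUEUE_MAX_KEY_SERVERS or more keyServers entries), it falls back to treating
// every storage server in the server list as a source.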
ACTOR Future<Void> getSourceServersForRange( Database cx, MasterInterface mi, RelocateData input, PromiseStream<RelocateData> output ) {
state std::set<UID> servers;
state Transaction tr(cx);
// FIXME: is the merge case needed?
if( input.priority == PRIORITY_MERGE_SHARD ) {
Void _ = wait( delay( 0.5, TaskDataDistribution - 2 ) );
} else {
Void _ = wait( delay( 0.0001, TaskDataDistributionLaunch ) );
}
loop {
servers.clear();
tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE );
try {
Standalone<RangeResultRef> keyServersEntries = wait(
tr.getRange( lastLessOrEqual( keyServersKey( input.keys.begin ) ),
firstGreaterOrEqual( keyServersKey( input.keys.end ) ), SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS ) );
if(keyServersEntries.size() < SERVER_KNOBS->DD_QUEUE_MAX_KEY_SERVERS) {
for( int shard = 0; shard < keyServersEntries.size(); shard++ ) {
vector<UID> src, dest;
decodeKeyServersValue( keyServersEntries[shard].value, src, dest );
ASSERT( src.size() );
for( int i = 0; i < src.size(); i++ )
servers.insert( src[i] );
}
ASSERT(servers.size() > 0);
}
//If keyServersEntries is large, then just assume we are using all storage servers
else {
Standalone<RangeResultRef> serverList = wait( tr.getRange( serverListKeys, CLIENT_KNOBS->TOO_MANY ) );
ASSERT( !serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY );
for(auto s = serverList.begin(); s != serverList.end(); ++s)
servers.insert(decodeServerListValue( s->value ).id());
ASSERT(servers.size() > 0);
}
break;
} catch( Error& e ) {
Void _ = wait( tr.onError(e) );
}
}
input.src = std::vector<UID>( servers.begin(), servers.end() );
output.send( input );
return Void();
}
//This function cannot handle relocation requests which split a shard into three pieces
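// Inserts a new relocation request into the queue structures: queued requests fully contained in
// the new range are cancelled (their relocation intent and source servers are carried over),
// requests that only partially overlap the new range are re-keyed to their truncated ranges, and
// getSourceServersForRange() is (re-)launched for any piece that does not yet know its sources.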
void queueRelocation( RelocateData rd, std::set<UID> &serversToLaunchFrom ) {
// Update sabtf for changes from DDTracker
if( rd.changesBoundaries() )
shardsAffectedByTeamFailure->defineShard( rd.keys );
//TraceEvent("QueueRelocationBegin").detail("Begin", printable(rd.keys.begin)).detail("End", printable(rd.keys.end));
// remove all items from both queues that are fully contained in the new relocation (i.e. will be overwritten)
auto ranges = queueMap.intersectingRanges( rd.keys );
for(auto r = ranges.begin(); r != ranges.end(); ++r ) {
RelocateData& rrs = r->value();
auto fetchingSourcesItr = fetchingSourcesQueue.find(rrs);
bool foundActiveFetching = fetchingSourcesItr != fetchingSourcesQueue.end();
std::set<RelocateData, std::greater<RelocateData>>* firstQueue;
std::set<RelocateData, std::greater<RelocateData>>::iterator firstRelocationItr;
bool foundActiveRelocation = false;
if( !foundActiveFetching && rrs.src.size() ) {
firstQueue = &queue[rrs.src[0]];
firstRelocationItr = firstQueue->find( rrs );
foundActiveRelocation = firstRelocationItr != firstQueue->end();
}
// If we are about to cancel or modify a queued job that wants data relocation,
// make sure we preserve that job's relocation intent in the request we are queuing up
if( foundActiveFetching || foundActiveRelocation ) {
rd.wantsNewServers |= rrs.wantsNewServers;
rd.startTime = std::min( rd.startTime, rrs.startTime );
if( rrs.priority >= PRIORITY_TEAM_UNHEALTHY && rd.changesBoundaries() )
rd.priority = std::max( rd.priority, rrs.priority );
}
if( rd.keys.contains( rrs.keys ) ) {
if(foundActiveFetching)
fetchingSourcesQueue.erase( fetchingSourcesItr );
else if(foundActiveRelocation) {
firstQueue->erase( firstRelocationItr );
for( int i = 1; i < rrs.src.size(); i++ )
queue[rrs.src[i]].erase( rrs );
}
}
if( foundActiveFetching || foundActiveRelocation ) {
serversToLaunchFrom.insert( rrs.src.begin(), rrs.src.end() );
/*TraceEvent(rrs.interval.end(), mi.id()).detail("Result","Cancelled")
.detail("WasFetching", foundActiveFetching).detail("Contained", rd.keys.contains( rrs.keys ));*/
queuedRelocations--;
priority_relocations[ rrs.priority ]--;
}
}
// determine the final state of the relocations map
auto affectedQueuedItems = queueMap.getAffectedRangesAfterInsertion( rd.keys, rd );
// put the new request into the global map of requests (modifies the ranges already present)
queueMap.insert( rd.keys, rd );
// cancel all the getSourceServers actors that intersect the new range that we will be getting
getSourceActors.cancel( KeyRangeRef( affectedQueuedItems.front().begin, affectedQueuedItems.back().end ) );
// update fetchingSourcesQueue and the per-server queue based on truncated ranges after insertion, (re-)launch getSourceServers
auto queueMapItr = queueMap.rangeContaining(affectedQueuedItems[0].begin);
for(int r = 0; r < affectedQueuedItems.size(); ++r, ++queueMapItr) {
//ASSERT(queueMapItr->value() == queueMap.rangeContaining(affectedQueuedItems[r].begin)->value());
RelocateData& rrs = queueMapItr->value();
if( rrs.src.size() == 0 && ( rrs.keys == rd.keys || fetchingSourcesQueue.erase(rrs) > 0 ) ) {
rrs.keys = affectedQueuedItems[r];
rrs.interval = TraceInterval("QueuedRelocation");
/*TraceEvent(rrs.interval.begin(), mi.id());
.detail("KeyBegin", printable(rrs.keys.begin)).detail("KeyEnd", printable(rrs.keys.end))
.detail("Priority", rrs.priority).detail("WantsNewServers", rrs.wantsNewServers);*/
queuedRelocations++;
priority_relocations[rrs.priority]++;
fetchingSourcesQueue.insert( rrs );
getSourceActors.insert( rrs.keys, getSourceServersForRange( cx, mi, rrs, fetchSourceServersComplete ) );
} else {
RelocateData newData( rrs );
newData.keys = affectedQueuedItems[r];
ASSERT( rrs.src.size() || rrs.startTime == -1 );
bool foundActiveRelocation = false;
for( int i = 0; i < rrs.src.size(); i++ ) {
auto& serverQueue = queue[rrs.src[i]];
if( serverQueue.erase(rrs) > 0 ) {
if( !foundActiveRelocation ) {
newData.interval = TraceInterval("QueuedRelocation");
/*TraceEvent(newData.interval.begin(), mi.id());
.detail("KeyBegin", printable(newData.keys.begin)).detail("KeyEnd", printable(newData.keys.end))
.detail("Priority", newData.priority).detail("WantsNewServers", newData.wantsNewServers);*/
queuedRelocations++;
priority_relocations[newData.priority]++;
foundActiveRelocation = true;
}
serverQueue.insert( newData );
}
else
break;
}
// We update the keys of a relocation even if it is "dead" since it helps validate()
rrs.keys = affectedQueuedItems[r];
rrs.interval = newData.interval;
}
}
/*TraceEvent("ReceivedRelocateShard", mi.id())
.detail("KeyBegin", printable(rd.keys.begin))
.detail("KeyEnd", printable(rd.keys.end))
.detail("Priority", rd.priority)
.detail("AffectedRanges", affectedQueuedItems.size()); */
}
void completeSourceFetch( RelocateData results ) {
ASSERT( fetchingSourcesQueue.count( results ) );
//logRelocation( results, "GotSourceServers" );
fetchingSourcesQueue.erase( results );
queueMap.insert( results.keys, results );
for( int i = 0; i < results.src.size(); i++ ) {
queue[results.src[i]].insert( results );
}
}
void logRelocation( RelocateData rd, const char *title ) {
std::string busyString;
for(int i = 0; i < rd.src.size() && i < teamSize * 2; i++)
busyString += describe(rd.src[i]) + " - (" + busymap[ rd.src[i] ].toString() + "); ";
TraceEvent(title, mi.id())
.detail("KeyBegin", printable(rd.keys.begin))
.detail("KeyEnd", printable(rd.keys.end))
.detail("Priority", rd.priority)
.detail("WorkFactor", rd.workFactor)
.detail("SourceServerCount", rd.src.size())
.detail("SourceServers", describe(rd.src, teamSize * 2))
.detail("SourceBusyness", busyString);
}
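// The three launchQueuedWork overloads below gather candidate relocations (by key range, by
// source server, or from a single request) and funnel them into
// launchQueuedWork(std::set<RelocateData>), which does the actual launching.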
void launchQueuedWork( KeyRange keys ) {
//combine all queued work in the key range and check to see if there is anything to launch
std::set<RelocateData, std::greater<RelocateData>> combined;
auto f = queueMap.intersectingRanges( keys );
for(auto it = f.begin(); it != f.end(); ++it) {
if( it->value().src.size() && queue[it->value().src[0]].count( it->value() ) )
combined.insert( it->value() );
}
launchQueuedWork( combined );
}
void launchQueuedWork( std::set<UID> serversToLaunchFrom ) {
//combine all work from the source servers to see if there is anything new to launch
std::set<RelocateData, std::greater<RelocateData>> combined;
for( auto id : serversToLaunchFrom ) {
auto& queuedWork = queue[id];
auto it = queuedWork.begin();
for( int j = 0; j < teamSize && it != queuedWork.end(); j++) {
combined.insert( *it );
++it;
}
}
launchQueuedWork( combined );
}
void launchQueuedWork( RelocateData launchData ) {
//check a single RelocateData to see if it can be launched
std::set<RelocateData, std::greater<RelocateData>> combined;
combined.insert( launchData );
launchQueuedWork( combined );
}
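// Core launch loop: for each candidate (highest priority first), skip it if it overlaps, without
// fully covering, an equal-or-higher-priority in-flight move that has already fetched its keys
// (unless the candidate is at PRIORITY_TEAM_UNHEALTHY or above); check that enough source servers
// have spare capacity once the in-flight work this move would cancel is discounted; then cancel
// the superseded in-flight actors, charge busymap, and start a dataDistributionRelocator for each
// resulting key range.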
void launchQueuedWork( std::set<RelocateData, std::greater<RelocateData>> combined ) {
int startedHere = 0;
double startTime = now();
// kick off relocators from items in the queue as need be
std::set<RelocateData, std::greater<RelocateData>>::iterator it = combined.begin();
for(; it != combined.end(); it++ ) {
RelocateData rd( *it );
bool overlappingInFlight = false;
auto intersectingInFlight = inFlight.intersectingRanges( rd.keys );
for(auto it = intersectingInFlight.begin(); it != intersectingInFlight.end(); ++it) {
if( fetchKeysComplete.count( it->value() ) &&
inFlightActors.liveActorAt( it->range().begin ) &&
!rd.keys.contains( it->range() ) &&
it->value().priority >= rd.priority &&
rd.priority < PRIORITY_TEAM_UNHEALTHY ) {
/*TraceEvent("OverlappingInFlight", mi.id())
.detail("KeyBegin", printable(it->value().keys.begin))
.detail("KeyEnd", printable(it->value().keys.end))
.detail("Priority", it->value().priority); */
overlappingInFlight = true;
break;
}
}
if( overlappingInFlight ) {
//logRelocation( rd, "SkippingOverlappingInFlight" );
continue;
}
// Because the busyness of a server is decreased when a superseding relocation is issued, we
// need to consider what the busyness of a server WOULD be if the in-flight relocations that
// this one would cancel were no longer charged against it
auto containedRanges = inFlight.containedRanges( rd.keys );
std::vector<RelocateData> cancellableRelocations;
for(auto it = containedRanges.begin(); it != containedRanges.end(); ++it) {
if( inFlightActors.liveActorAt( it->range().begin ) ) {
cancellableRelocations.push_back( it->value() );
}
}
// SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the queue
// FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY
if( !canLaunch( rd, teamSize, busymap, cancellableRelocations ) ) {
//logRelocation( rd, "SkippingQueuedRelocation" );
continue;
}
//logRelocation( rd, "LaunchingRelocation" );
//TraceEvent(rd.interval.end(), mi.id()).detail("Result","Success");
queuedRelocations--;
priority_relocations[rd.priority]--;
// now we are launching: remove this entry from the queue of all the src servers
for( int i = 0; i < rd.src.size(); i++ ) {
ASSERT( queue[rd.src[i]].erase(rd) );
}
// If we are about to cancel or modify an in-flight job that wants data relocation,
// make sure we preserve that job's relocation intent in the job we launch
auto f = inFlight.intersectingRanges( rd.keys );
for(auto it = f.begin(); it != f.end(); ++it) {
if( inFlightActors.liveActorAt( it->range().begin ) ) {
rd.wantsNewServers |= it->value().wantsNewServers;
}
}
startedHere++;
// update both inFlightActors and inFlight key range maps, cancelling deleted RelocateShards
vector<KeyRange> ranges;
inFlightActors.getRangesAffectedByInsertion( rd.keys, ranges );
inFlightActors.cancel( KeyRangeRef( ranges.front().begin, ranges.back().end ) );
inFlight.insert( rd.keys, rd );
for(int r=0; r<ranges.size(); r++) {
RelocateData& rrs = inFlight.rangeContaining(ranges[r].begin)->value();
rrs.keys = ranges[r];
launch( rrs, busymap );
activeRelocations++;
priority_relocations[ rrs.priority ]++;
inFlightActors.insert( rrs.keys, dataDistributionRelocator( this, rrs ) );
}
//logRelocation( rd, "LaunchedRelocation" );
}
if( now() - startTime > .001 && g_random->random01()<0.001 )
TraceEvent(SevWarnAlways, "LaunchingQueueSlowx1000").detail("elapsed", now() - startTime );
/*if( startedHere > 0 ) {
TraceEvent("StartedDDRelocators", mi.id())
.detail("QueueSize", queuedRelocations)
.detail("StartedHere", startedHere)
.detail("ActiveRelocations", activeRelocations);
} */
validate();
}
};
extern bool noUnseed;
// This actor relocates the specified keys to a good place.
// The relocator actors live in the inFlightActors key range map.
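// The relocator repeatedly asks the team collection for a destination team (retrying while no
// healthy team is available), moves the shard there with moveKeys(), signals dataTransferComplete
// once the data movement is durable (or the destination team becomes unhealthy), and signals
// relocationComplete when the relocation finishes or fails.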
ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd )
{
state Promise<Void> errorOut( self->error );
state TraceInterval relocateShardInterval("RelocateShard");
state PromiseStream<RelocateData> dataTransferComplete( self->dataTransferComplete );
state PromiseStream<RelocateData> relocationComplete( self->relocationComplete );
state bool signalledTransferComplete = false;
state UID masterId = self->mi.id();
state Reference<IDataDistributionTeam> destination;
try {
TraceEvent(relocateShardInterval.begin(), masterId)
.detail("KeyBegin", printable(rd.keys.begin)).detail("KeyEnd", printable(rd.keys.end))
.detail("Priority", rd.priority).detail("RelocationID", relocateShardInterval.pairID);
state StorageMetrics metrics = wait( brokenPromiseToNever( self->getShardMetrics.getReply( GetMetricsRequest( rd.keys ) ) ) );
ASSERT( rd.src.size() );
loop {
state int stuckCount = 0;
loop {
double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY;
if(rd.priority >= PRIORITY_TEAM_UNHEALTHY) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
if(rd.priority >= PRIORITY_TEAM_1_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
auto req = GetTeamRequest( rd.wantsNewServers, rd.priority == PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty );
req.sources = rd.src;
Optional<Reference<IDataDistributionTeam>> bestTeam = wait( brokenPromiseToNever( self->teamCollection.getTeam.getReply( req ) ) );
if( bestTeam.present() ) {
destination = bestTeam.get();
break;
}
TEST(true); //did not find a healthy destination team on the first attempt
stuckCount++;
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", masterId).detail("Count", stuckCount);
if(stuckCount > 50 && g_network->isSimulated()) { //FIXME: known bug in simulation we are suppressing
int unseed = noUnseed ? 0 : g_random->randomInt(0, 100001);
TraceEvent("ElapsedTime").detail("SimTime", now()).detail("RealTime", 0)
.detail("RandomUnseed", unseed);
flushAndExit(0);
}
Void _ = wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) );
}
ASSERT(destination->isHealthy()); // team failure tracking is edge triggered, so must never put something on an unhealthy team!
self->shardsAffectedByTeamFailure->moveShard( rd.keys, destination->getServerIDs() );
destination->addDataInFlightToTeam( +metrics.bytes );
TraceEvent("RelocateShardHasDestination", masterId)
.detail("PairId", relocateShardInterval.pairID)
.detail("DestinationTeam", destination->getDesc());
state Error error = success();
state Promise<Void> dataMovementComplete;
state Future<Void> doMoveKeys = moveKeys(
self->cx, rd.keys, destination->getServerIDs(), self->lock,
self->durableStorageQuorum, dataMovementComplete,
&self->startMoveKeysParallelismLock,
&self->finishMoveKeysParallelismLock,
relocateShardInterval.pairID );
state Future<Void> pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
try {
loop {
choose {
when( Void _ = wait( doMoveKeys ) ) {
self->fetchKeysComplete.insert( rd );
break;
}
when( Void _ = wait( pollHealth ) ) {
if( !destination->isHealthy() ) {
if( !signalledTransferComplete ) {
signalledTransferComplete = true;
self->dataTransferComplete.send( rd );
}
}
pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch );
}
when( Void _ = wait( signalledTransferComplete ? Never() : dataMovementComplete.getFuture() ) ) {
self->fetchKeysComplete.insert( rd );
if( !signalledTransferComplete ) {
signalledTransferComplete = true;
self->dataTransferComplete.send( rd );
}
}
}
}
} catch( Error& e ) {
error = e;
}
//TraceEvent("RelocateShardFinished", masterId).detail("relocateId", relocateShardInterval.pairID);
if( error.code() != error_code_move_to_removed_server ) {
if( !error.code() ) {
try {
Void _ = wait( destination->updatePhysicalMetrics() ); //prevent a gap between the polling for an increase in physical metrics and decrementing data in flight
} catch( Error& e ) {
error = e;
}
}
destination->addDataInFlightToTeam( -metrics.bytes );
// onFinished.send( rs );
if( !error.code() ) {
TraceEvent(relocateShardInterval.end(), masterId).detail("Result","Success");
if(rd.keys.begin == keyServersPrefix) {
TraceEvent("MovedKeyServerKeys").detail("dest", destination->getDesc()).trackLatest("MovedKeyServers");
}
if( !signalledTransferComplete ) {
signalledTransferComplete = true;
dataTransferComplete.send( rd );
}
self->bytesWritten += metrics.bytes;
relocationComplete.send( rd );
return Void();
} else {
throw error;
}
} else {
TEST(true); // move to removed server
destination->addDataInFlightToTeam( -metrics.bytes );
Void _ = wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskDataDistributionLaunch ) );
}
}
} catch (Error& e) {
TraceEvent(relocateShardInterval.end(), masterId).error(e, true);
if( !signalledTransferComplete )
dataTransferComplete.send( rd );
relocationComplete.send( rd );
if( e.code() != error_code_actor_cancelled )
errorOut.sendError(e);
throw;
}
}
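// Attempts one background rebalance move: picks a random shard held by sourceTeam and, if the
// source team's load exceeds the destination team's by more than three times
// max(MIN_SHARD_BYTES, shard size), queues a relocation of that shard at the given rebalance
// priority. The shard list is re-read after the wait() both because shard boundaries may have
// changed while waiting and because non-state locals do not persist across wait() in flow actors.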
ACTOR Future<bool> rebalanceTeams( DDQueueData* self, int priority, Reference<IDataDistributionTeam> sourceTeam, Reference<IDataDistributionTeam> destTeam ) {
if(g_network->isSimulated() && g_simulator.speedUpSimulation) {
return false;
}
std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor( sourceTeam->getServerIDs() );
if( !shards.size() )
return false;
state KeyRange moveShard = g_random->randomChoice( shards );
StorageMetrics metrics = wait( brokenPromiseToNever( self->getShardMetrics.getReply(GetMetricsRequest(moveShard)) ) );
int64_t sourceBytes = sourceTeam->getLoadBytes(false);
int64_t destBytes = destTeam->getLoadBytes();
if( sourceBytes - destBytes <= 3 * std::max<int64_t>( SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes ) || metrics.bytes == 0 )
return false;
//verify the shard is still in sabtf
std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor( sourceTeam->getServerIDs() );
for( int i = 0; i < shards.size(); i++ ) {
if( moveShard == shards[i] ) {
TraceEvent(priority == PRIORITY_REBALANCE_OVERUTILIZED_TEAM ? "BgDDMountainChopper" : "BgDDValleyFiller", self->mi.id())
.detail("sourceBytes", sourceBytes)
.detail("destBytes", destBytes)
.detail("shardBytes", metrics.bytes)
.detail("sourceTeam", sourceTeam->getDesc())
.detail("destTeam", destTeam->getDesc());
self->input.send( RelocateShard( moveShard, priority ) );
return true;
}
}
return false;
}
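// Background rebalancer that moves data off heavily loaded teams: on each poll it asks for a
// random team with enough free space and for the most heavily loaded team it can find, then tries
// to move one shard from the loaded team to the random team at PRIORITY_REBALANCE_OVERUTILIZED_TEAM.
// The polling interval backs off while the cluster has recently been write-limited (lastLimited).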
ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self ) {
state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL;
state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
loop {
Void _ = wait( delay(checkDelay, TaskDataDistributionLaunch) );
if (self->priority_relocations[PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait( brokenPromiseToNever( self->teamCollection.getTeam.getReply( GetTeamRequest( true, false, true ) ) ) );
if( randomTeam.present() ) {
if( randomTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF ) {
state Optional<Reference<IDataDistributionTeam>> loadedTeam = wait( brokenPromiseToNever( self->teamCollection.getTeam.getReply( GetTeamRequest( true, true, false ) ) ) );
if( loadedTeam.present() ) {
bool moved = wait( rebalanceTeams( self, PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(), randomTeam.get() ) );
if(moved) {
resetCount = 0;
} else {
resetCount++;
}
}
}
}
}
if( now() - (*self->lastLimited) < SERVER_KNOBS->BG_DD_SATURATION_DELAY ) {
checkDelay = std::min(SERVER_KNOBS->BG_DD_MAX_WAIT, checkDelay * SERVER_KNOBS->BG_DD_INCREASE_RATE);
} else {
checkDelay = std::max(SERVER_KNOBS->BG_DD_MIN_WAIT, checkDelay / SERVER_KNOBS->BG_DD_DECREASE_RATE);
}
if(resetCount >= SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT && checkDelay < SERVER_KNOBS->BG_DD_POLLING_INTERVAL) {
checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL;
resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
}
}
}
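// Counterpart of BgDDMountainChopper: moves data onto lightly loaded teams by relocating one
// shard from a random team to the least loaded team (provided it has enough free space) at
// PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, using the same adaptive polling interval.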
ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self ) {
state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL;
state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
loop {
Void _ = wait( delay(checkDelay, TaskDataDistributionLaunch) );
if (self->priority_relocations[PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait( brokenPromiseToNever( self->teamCollection.getTeam.getReply( GetTeamRequest( true, false, false ) ) ) );
if( randomTeam.present() ) {
state Optional<Reference<IDataDistributionTeam>> unloadedTeam = wait( brokenPromiseToNever( self->teamCollection.getTeam.getReply( GetTeamRequest( true, true, true ) ) ) );
if( unloadedTeam.present() ) {
if( unloadedTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF ) {
bool moved = wait( rebalanceTeams( self, PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(), unloadedTeam.get() ) );
if(moved) {
resetCount = 0;
} else {
resetCount++;
}
}
}
}
}
if( now() - (*self->lastLimited) < SERVER_KNOBS->BG_DD_SATURATION_DELAY ) {
checkDelay = std::min(SERVER_KNOBS->BG_DD_MAX_WAIT, checkDelay * SERVER_KNOBS->BG_DD_INCREASE_RATE);
} else {
checkDelay = std::max(SERVER_KNOBS->BG_DD_MIN_WAIT, checkDelay / SERVER_KNOBS->BG_DD_DECREASE_RATE);
}
if(resetCount >= SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT && checkDelay < SERVER_KNOBS->BG_DD_POLLING_INTERVAL) {
checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL;
resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
}
}
}
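// Top-level actor for the data distribution queue: it accepts RelocateShard requests on the input
// stream, queues them, fetches source servers, launches relocators as server capacity allows,
// rebalances in the background, and periodically logs a MovingData trace event.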
ACTOR Future<Void> dataDistributionQueue(
Database cx,
PromiseStream<RelocateShard> input,
PromiseStream<GetMetricsRequest> getShardMetrics,
TeamCollectionInterface teamCollection,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
MasterInterface mi,
int teamSize,
int durableStorageQuorum,
double* lastLimited)
{
state DDQueueData self( mi, lock, cx, teamCollection, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, durableStorageQuorum, input, getShardMetrics, lastLimited );
state std::set<UID> serversToLaunchFrom;
state KeyRange keysToLaunchFrom;
state RelocateData launchData;
state Future<Void> recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL);
state Future<Void> bgDDMountainChopper = BgDDMountainChopper( &self );
state Future<Void> bgDDValleyFiller = BgDDValleyFiller( &self );
state ActorCollectionNoErrors actors;
state PromiseStream<KeyRange> rangesComplete;
state Future<Void> launchQueuedWorkTimeout = Never();
try {
loop {
self.validate();
// For the given servers that caused us to go around the loop, find the next item(s) that can be launched.
if( launchData.startTime != -1 ) {
self.launchQueuedWork( launchData );
launchData = RelocateData();
}
else if( !keysToLaunchFrom.empty() ) {
self.launchQueuedWork( keysToLaunchFrom );
keysToLaunchFrom = KeyRangeRef();
}
ASSERT( launchData.startTime == -1 && keysToLaunchFrom.empty() );
choose {
when ( RelocateShard rs = waitNext( self.input.getFuture() ) ) {
bool wasEmpty = serversToLaunchFrom.empty();
self.queueRelocation( rs, serversToLaunchFrom );
if(wasEmpty && !serversToLaunchFrom.empty())
launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch);
}
when ( Void _ = wait(launchQueuedWorkTimeout) ) {
self.launchQueuedWork( serversToLaunchFrom );
serversToLaunchFrom = std::set<UID>();
launchQueuedWorkTimeout = Never();
}
when ( RelocateData results = waitNext( self.fetchSourceServersComplete.getFuture() ) ) {
self.completeSourceFetch( results );
launchData = results;
}
when ( RelocateData done = waitNext( self.dataTransferComplete.getFuture() ) ) {
complete( done, self.busymap );
if(serversToLaunchFrom.empty() && !done.src.empty())
launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch);
serversToLaunchFrom.insert(done.src.begin(), done.src.end());
}
when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) {
self.activeRelocations--;
self.priority_relocations[ done.priority ]--;
self.fetchKeysComplete.erase( done );
//self.logRelocation( done, "ShardRelocatorDone" );
actors.add( tag( delay(0, TaskDataDistributionLaunch), done.keys, rangesComplete ) );
if( g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60 ) {
TraceEvent(SevWarnAlways, "RelocationDurationTooLong").detail("Duration", now() - done.startTime);
debug_setCheckRelocationDuration(false);
}
}
when ( KeyRange done = waitNext( rangesComplete.getFuture() ) ) {
keysToLaunchFrom = done;
}
when ( Void _ = wait( recordMetrics ) ) {
Promise<int64_t> req;
getAverageShardBytes.send( req );
recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL);
int lowPriorityRelocations = 0, highPriorityRelocations = 0, highestPriorityRelocation = 0;
for( auto it = self.priority_relocations.begin(); it != self.priority_relocations.end(); ++it ) {
if (it->second)
highestPriorityRelocation = std::max(highestPriorityRelocation, it->first);
if( it->first < 200 )
lowPriorityRelocations += it->second;
else
highPriorityRelocations += it->second;
}
TraceEvent("MovingData", mi.id())
.detail( "InFlight", self.activeRelocations )
.detail( "InQueue", self.queuedRelocations )
.detail( "AverageShardSize", req.getFuture().isReady() ? req.getFuture().get() : -1 )
.detail( "LowPriorityRelocations", lowPriorityRelocations )
.detail( "HighPriorityRelocations", highPriorityRelocations )
.detail( "HighestPriority", highestPriorityRelocation )
.detail( "BytesWritten", self.bytesWritten )
.trackLatest( format("%s/MovingData", printable(cx->dbName).c_str() ).c_str() );
}
when ( Void _ = wait( self.error.getFuture() ) ) {} // Propagate errors from dataDistributionRelocator
when ( Void _ = wait( bgDDMountainChopper ) ) {}
when ( Void _ = wait( bgDDValleyFiller ) ) {}
}
}
} catch (Error& e) {
if (e.code() != error_code_broken_promise && // FIXME: Get rid of these broken_promise errors every time we are killed by the master dying
e.code() != error_code_movekeys_conflict)
TraceEvent(SevError, "dataDistributionQueueError", mi.id()).error(e);
throw e;
}
}