foundationdb/fdbserver/DDRelocationQueue.actor.cpp

/*
 * DataDistributionQueue.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <limits>
#include <numeric>
#include <vector>

#include "flow/ActorCollection.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
#include "flow/Util.h"
#include "fdbrpc/sim_validation.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/DDSharedContext.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/DDTxnProcessor.h"
#include "flow/DebugTrace.h"
#include "flow/actorcompiler.h" // This must be the last #include.

// Fixed-point scaling factor used by the Busyness ledgers below. This is NOT a knob: a ledger value of
// WORK_FULL_UTILIZATION corresponds to a utilization of 1.0.
#define WORK_FULL_UTILIZATION 10000

using ITeamRef = Reference<IDataDistributionTeam>;
using SrcDestTeamPair = std::pair<ITeamRef, ITeamRef>;

// Returns true iff `reason` is REBALANCE_UNDERUTILIZED_TEAM or REBALANCE_OVERUTILIZED_TEAM.
inline bool isDataMovementForDiskBalancing(DataMovementReason reason) {
	if (reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM) {
		return true;
	}
	return reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM;
}

// Returns true iff `reason` is REBALANCE_READ_OVERUTIL_TEAM or REBALANCE_READ_UNDERUTIL_TEAM.
inline bool isDataMovementForReadBalancing(DataMovementReason reason) {
	if (reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM) {
		return true;
	}
	return reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
}

// Returns true iff `reason` is one of the two "over-utilized" rebalance reasons
// (REBALANCE_OVERUTILIZED_TEAM or REBALANCE_READ_OVERUTIL_TEAM).
inline bool isDataMovementForMountainChopper(DataMovementReason reason) {
	if (reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM) {
		return true;
	}
	return reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM;
}

// FIXME: Always use DataMovementReason to invoke these functions.
// Returns true iff `priority` equals one of the two "under-utilized" rebalance priority knobs.
inline bool isValleyFillerPriority(int priority) {
	if (priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM) {
		return true;
	}
	return priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
}

// Returns true iff `reason` is one of the two "under-utilized" rebalance reasons
// (REBALANCE_UNDERUTILIZED_TEAM or REBALANCE_READ_UNDERUTIL_TEAM).
inline bool isDataMovementForValleyFiller(DataMovementReason reason) {
	if (reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM) {
		return true;
	}
	return reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
}

typedef std::map<DataMovementReason, int> DmReasonPriorityMapping;
typedef std::map<int, DataMovementReason> PriorityDmReasonMapping;
std::pair<const DmReasonPriorityMapping*, const PriorityDmReasonMapping*> buildPriorityMappings() {
	static DmReasonPriorityMapping reasonPriority{
		{ DataMovementReason::INVALID, -1 },
		{ DataMovementReason::RECOVER_MOVE, SERVER_KNOBS->PRIORITY_RECOVER_MOVE },
		{ DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM },
		{ DataMovementReason::REBALANCE_OVERUTILIZED_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM },
		{ DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM },
		{ DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM },
		{ DataMovementReason::PERPETUAL_STORAGE_WIGGLE, SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE },
		{ DataMovementReason::TEAM_HEALTHY, SERVER_KNOBS->PRIORITY_TEAM_HEALTHY },
		{ DataMovementReason::TEAM_CONTAINS_UNDESIRED_SERVER, SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER },
		{ DataMovementReason::TEAM_REDUNDANT, SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT },
		{ DataMovementReason::MERGE_SHARD, SERVER_KNOBS->PRIORITY_MERGE_SHARD },
		{ DataMovementReason::POPULATE_REGION, SERVER_KNOBS->PRIORITY_POPULATE_REGION },
		{ DataMovementReason::TEAM_UNHEALTHY, SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY },
		{ DataMovementReason::TEAM_2_LEFT, SERVER_KNOBS->PRIORITY_TEAM_2_LEFT },
		{ DataMovementReason::TEAM_1_LEFT, SERVER_KNOBS->PRIORITY_TEAM_1_LEFT },
		{ DataMovementReason::TEAM_FAILED, SERVER_KNOBS->PRIORITY_TEAM_FAILED },
		{ DataMovementReason::TEAM_0_LEFT, SERVER_KNOBS->PRIORITY_TEAM_0_LEFT },
		{ DataMovementReason::SPLIT_SHARD, SERVER_KNOBS->PRIORITY_SPLIT_SHARD },
		{ DataMovementReason::ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD,
		  SERVER_KNOBS->PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD }
	};

	static PriorityDmReasonMapping priorityReason;
	if (priorityReason.empty()) { // only build once
		for (const auto& [r, p] : reasonPriority) {
			priorityReason[p] = r;
		}
		// Don't allow 2 priorities value being the same.
		if (priorityReason.size() != reasonPriority.size()) {
			TraceEvent(SevError, "DuplicateDataMovementPriority").log();
			ASSERT(false);
		}
	}

	return std::make_pair(&reasonPriority, &priorityReason);
}

// Looks up the priority registered for `reason` in the table from buildPriorityMappings().
// Uses std::map::at(), so an unregistered reason throws std::out_of_range.
int dataMovementPriority(DataMovementReason reason) {
	const DmReasonPriorityMapping* reasonToPriority = buildPriorityMappings().first;
	return reasonToPriority->at(reason);
}

// Looks up the DataMovementReason registered for `priority` in the inverse table from buildPriorityMappings().
// Uses std::map::at(), so an unregistered priority throws std::out_of_range.
DataMovementReason priorityToDataMovementReason(int priority) {
	const PriorityDmReasonMapping* priorityToReason = buildPriorityMappings().second;
	return priorityToReason->at(priority);
}

// Bookkeeping record for one relocation of the key range `keys`, usually built from a RelocateShard request.
struct RelocateData {
	KeyRange keys;
	int priority;
	int boundaryPriority; // initialised to the request priority if it is a split/merge priority, else -1
	int healthPriority; // initialised to the request priority if it is a health priority, else -1
	RelocateReason reason;

	double startTime;
	UID randomId; // inherit from RelocateShard.traceId
	UID dataMoveId;
	int workFactor; // 0 until launch() assigns the source work factor
	std::vector<UID> src;
	std::vector<UID> completeSources;
	std::vector<UID> completeDests;
	bool wantsNewServers;
	bool cancellable;
	TraceInterval interval;
	std::shared_ptr<DataMove> dataMove;

	// Builds an "empty" record: all priorities and the start time are -1 and the record is not cancellable.
	RelocateData()
	  : priority(-1), boundaryPriority(-1), healthPriority(-1), reason(RelocateReason::OTHER), startTime(-1),
	    dataMoveId(anonymousShardId), workFactor(0), wantsNewServers(false), cancellable(false),
	    interval("QueuedRelocation") {}

	// Builds a record from a relocation request. The start time is now(); the trace id of the request is reused
	// as randomId when it is valid, otherwise a fresh random id is drawn. When the request carries a DataMove,
	// its recorded source servers become the initial `src`.
	explicit RelocateData(RelocateShard const& rs)
	  : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1),
	    healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), reason(rs.reason), startTime(now()),
	    randomId(rs.traceId.isValid() ? rs.traceId : deterministicRandom()->randomUniqueID()),
	    dataMoveId(rs.dataMoveId), workFactor(0),
	    wantsNewServers(isDataMovementForMountainChopper(rs.moveReason) ||
	                    isDataMovementForValleyFiller(rs.moveReason) ||
	                    rs.moveReason == DataMovementReason::SPLIT_SHARD ||
	                    rs.moveReason == DataMovementReason::TEAM_REDUNDANT),
	    cancellable(true), interval("QueuedRelocation", randomId), dataMove(rs.dataMove) {
		if (dataMove == nullptr) {
			return;
		}
		// `src` is still empty at this point, so assigning equals appending.
		src.assign(dataMove->meta.src.begin(), dataMove->meta.src.end());
	}

	// True iff `priority` equals one of the team-health related priority knobs listed below.
	static bool isHealthPriority(int priority) {
		const int healthPriorities[] = { SERVER_KNOBS->PRIORITY_POPULATE_REGION,
			                             SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY,
			                             SERVER_KNOBS->PRIORITY_TEAM_2_LEFT,
			                             SERVER_KNOBS->PRIORITY_TEAM_1_LEFT,
			                             SERVER_KNOBS->PRIORITY_TEAM_0_LEFT,
			                             SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT,
			                             SERVER_KNOBS->PRIORITY_TEAM_HEALTHY,
			                             SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER,
			                             SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE };
		for (int candidate : healthPriorities) {
			if (priority == candidate) {
				return true;
			}
		}
		return false;
	}

	// True iff `priority` is the split-shard or the merge-shard priority.
	static bool isBoundaryPriority(int priority) {
		if (priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD) {
			return true;
		}
		return priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD;
	}

	// A record that carries a DataMove is treated as a restored data move.
	bool isRestore() const { return this->dataMove != nullptr; }

	// Ordering used by the std::greater-sorted sets: higher priority first, then earlier startTime, then larger
	// randomId as the final tie-breaker.
	bool operator>(const RelocateData& rhs) const {
		if (priority != rhs.priority) {
			return priority > rhs.priority;
		}
		if (startTime != rhs.startTime) {
			return startTime < rhs.startTime;
		}
		return randomId > rhs.randomId;
	}

	// Field-wise equality over the fields compared below (dataMoveId, completeDests, cancellable, interval and
	// dataMove are not part of the comparison).
	bool operator==(const RelocateData& rhs) const {
		const bool samePriorities = priority == rhs.priority && boundaryPriority == rhs.boundaryPriority &&
		                            healthPriority == rhs.healthPriority;
		if (!samePriorities) {
			return false;
		}
		const bool sameIdentity = reason == rhs.reason && keys == rhs.keys && startTime == rhs.startTime &&
		                          randomId == rhs.randomId;
		if (!sameIdentity) {
			return false;
		}
		return workFactor == rhs.workFactor && src == rhs.src && completeSources == rhs.completeSources &&
		       wantsNewServers == rhs.wantsNewServers;
	}
	bool operator!=(const RelocateData& rhs) const { return !(*this == rhs); }
};

// Presents several IDataDistributionTeam objects (added through addTeam()) behind the single-team interface.
// Mutating calls are forwarded to every child team (addServers() only to the first one); queries combine the
// children's answers by summing, taking the min/max, or requiring any/all, depending on the query.
class ParallelTCInfo final : public ReferenceCounted<ParallelTCInfo>, public IDataDistributionTeam {
	std::vector<Reference<IDataDistributionTeam>> teams;
	// Per-instance scratch buffer that backs the reference returned by getServerIDs(). It is mutable because
	// getServerIDs() is const but has to rebuild the concatenated list on each call.
	mutable std::vector<UID> tempServerIDs;

	// Adds up func(team) over all child teams (integer results only).
	int64_t sum(std::function<int64_t(IDataDistributionTeam const&)> func) const {
		int64_t result = 0;
		for (const auto& team : teams) {
			result += func(*team);
		}
		return result;
	}

	// Concatenates the vectors returned by func(team) for all child teams, in team order.
	template <class T>
	std::vector<T> collect(std::function<std::vector<T>(IDataDistributionTeam const&)> func) const {
		std::vector<T> result;

		for (const auto& team : teams) {
			std::vector<T> newItems = func(*team);
			result.insert(result.end(), newItems.begin(), newItems.end());
		}
		return result;
	}

	// True iff func(team) holds for at least one child team (false when there are no teams).
	bool any(std::function<bool(IDataDistributionTeam const&)> func) const {
		for (const auto& team : teams) {
			if (func(*team)) {
				return true;
			}
		}
		return false;
	}

public:
	ParallelTCInfo() = default;
	explicit ParallelTCInfo(ParallelTCInfo const& info) : teams(info.teams), tempServerIDs(info.tempServerIDs){};

	void addTeam(Reference<IDataDistributionTeam> team) { teams.push_back(team); }

	void clear() { teams.clear(); }

	// True iff func(team) holds for every child team (true when there are no teams).
	bool all(std::function<bool(IDataDistributionTeam const&)> func) const {
		return !any([func](IDataDistributionTeam const& team) { return !func(team); });
	}

	std::vector<StorageServerInterface> getLastKnownServerInterfaces() const override {
		return collect<StorageServerInterface>(
		    [](IDataDistributionTeam const& team) { return team.getLastKnownServerInterfaces(); });
	}

	// Total number of servers over all child teams.
	int size() const override {
		int totalSize = 0;
		for (const auto& team : teams) {
			totalSize += team->size();
		}
		return totalSize;
	}

	// Returns the server IDs of all child teams, concatenated in team order.
	// The returned reference points into this object's scratch buffer: it is overwritten by the next call to
	// getServerIDs() on the same object and must not outlive the object.
	// The buffer is deliberately a member rather than a function-local static: a static would be shared by
	// every ParallelTCInfo, so a call on one instance would clobber the result handed out by another, and a
	// child team that is itself a ParallelTCInfo would make the insert() below copy a vector into itself.
	std::vector<UID> const& getServerIDs() const override {
		tempServerIDs.clear();
		for (const auto& team : teams) {
			std::vector<UID> const& childIDs = team->getServerIDs();
			tempServerIDs.insert(tempServerIDs.end(), childIDs.begin(), childIDs.end());
		}
		return tempServerIDs;
	}

	void addDataInFlightToTeam(int64_t delta) override {
		for (auto& team : teams) {
			team->addDataInFlightToTeam(delta);
		}
	}

	void addReadInFlightToTeam(int64_t delta) override {
		for (auto& team : teams) {
			team->addReadInFlightToTeam(delta);
		}
	}

	int64_t getDataInFlightToTeam() const override {
		return sum([](IDataDistributionTeam const& team) { return team.getDataInFlightToTeam(); });
	}

	int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const override {
		return sum([includeInFlight, inflightPenalty](IDataDistributionTeam const& team) {
			return team.getLoadBytes(includeInFlight, inflightPenalty);
		});
	}

	int64_t getReadInFlightToTeam() const override {
		return sum([](IDataDistributionTeam const& team) { return team.getReadInFlightToTeam(); });
	}

	// Sum of the child teams' read bandwidth.
	// Accumulated in double on purpose: routing this through sum() would convert every team's bandwidth to
	// int64_t and silently drop its fractional part.
	double getLoadReadBandwidth(bool includeInFlight = true, double inflightPenalty = 1.0) const override {
		double total = 0.0;
		for (const auto& team : teams) {
			total += team->getLoadReadBandwidth(includeInFlight, inflightPenalty);
		}
		return total;
	}

	// Minimum over all child teams; int64_t max when there are no teams.
	int64_t getMinAvailableSpace(bool includeInFlight = true) const override {
		int64_t result = std::numeric_limits<int64_t>::max();
		for (const auto& team : teams) {
			result = std::min(result, team->getMinAvailableSpace(includeInFlight));
		}
		return result;
	}

	// Minimum over all child teams; double max when there are no teams.
	double getMinAvailableSpaceRatio(bool includeInFlight = true) const override {
		double result = std::numeric_limits<double>::max();
		for (const auto& team : teams) {
			result = std::min(result, team->getMinAvailableSpaceRatio(includeInFlight));
		}
		return result;
	}

	bool hasHealthyAvailableSpace(double minRatio) const override {
		return all([minRatio](IDataDistributionTeam const& team) { return team.hasHealthyAvailableSpace(minRatio); });
	}

	// Starts a metrics update on every child team and returns a future that waits for all of them.
	Future<Void> updateStorageMetrics() override {
		std::vector<Future<Void>> futures;

		for (auto& team : teams) {
			futures.push_back(team->updateStorageMetrics());
		}
		return waitForAll(futures);
	}

	bool isOptimal() const override {
		return all([](IDataDistributionTeam const& team) { return team.isOptimal(); });
	}

	bool isWrongConfiguration() const override {
		return any([](IDataDistributionTeam const& team) { return team.isWrongConfiguration(); });
	}
	void setWrongConfiguration(bool wrongConfiguration) override {
		for (auto& team : teams) {
			team->setWrongConfiguration(wrongConfiguration);
		}
	}

	bool isHealthy() const override {
		return all([](IDataDistributionTeam const& team) { return team.isHealthy(); });
	}

	void setHealthy(bool h) override {
		for (auto& team : teams) {
			team->setHealthy(h);
		}
	}

	// Highest priority among the child teams, but never below 0.
	int getPriority() const override {
		int priority = 0;
		for (const auto& team : teams) {
			priority = std::max(priority, team->getPriority());
		}
		return priority;
	}

	void setPriority(int p) override {
		for (auto& team : teams) {
			team->setPriority(p);
		}
	}
	void addref() const override { ReferenceCounted<ParallelTCInfo>::addref(); }
	void delref() const override { ReferenceCounted<ParallelTCInfo>::delref(); }

	// Only the first child team receives the new servers; at least one team must have been added.
	void addServers(const std::vector<UID>& servers) override {
		ASSERT(!teams.empty());
		teams[0]->addServers(servers);
	}

	// Child team IDs joined with ", ".
	std::string getTeamID() const override {
		std::string id;
		for (int i = 0; i < teams.size(); i++) {
			auto const& team = teams[i];
			id += (i == teams.size() - 1) ? team->getTeamID() : format("%s, ", team->getTeamID().c_str());
		}
		return id;
	}
};

// Per-server ledger of relocation work, bucketed by priority band (band = priority / 100, ten bands).
// addWork() charges a relocation to its own band and to every lower band, so ledger[b] is the total work of all
// recorded relocations whose band is >= b. Work is expressed in WORK_FULL_UTILIZATION fixed-point units.
struct Busyness {
	std::vector<int> ledger;

	Busyness() : ledger(10, 0) {}

	// True iff adding `work` at priority `prio` keeps that band's ledger at or below WORK_FULL_UTILIZATION.
	// `prio` must be in (0, 1000).
	bool canLaunch(int prio, int work) const {
		ASSERT(prio > 0 && prio < 1000);
		return ledger[prio / 100] <= WORK_FULL_UTILIZATION - work; // allow for rounding errors in double division
	}

	// Charges `work` to the band of `prio` and to every band below it. `prio` must be in (0, 1000).
	void addWork(int prio, int work) {
		ASSERT(prio > 0 && prio < 1000);
		const int topBand = prio / 100;
		for (int band = 0; band <= topBand; band++)
			ledger[band] += work;
	}

	// Inverse of addWork().
	void removeWork(int prio, int work) { addWork(prio, -work); }

	// Renders bands 1..9 as a comma separated list, collapsing runs of consecutive bands with equal ledger
	// values into a single "lo/hi" entry. Does not modify the ledger, hence const.
	std::string toString() const {
		std::string result;
		const int bandCount = static_cast<int>(ledger.size());
		for (int i = 1; i < bandCount;) {
			// [i, j) is the maximal run of bands sharing ledger[i].
			int j = i + 1;
			while (j < bandCount && ledger[i] == ledger[j])
				j++;
			if (i != 1)
				result += ", ";
			result += i + 1 == j ? format("%03d", i * 100) : format("%03d/%03d", i * 100, (j - 1) * 100);
			result +=
			    format("=%1.02f (%d/%d)", (float)ledger[i] / WORK_FULL_UTILIZATION, ledger[i], WORK_FULL_UTILIZATION);
			i = j;
		}
		return result;
	}
};

// Computes the source-side "workFactor" this relocation would get if it were launched now.
// The full-utilization budget is divided by the number of source servers assumed to share the work and then by
// RELOCATION_PARALLELISM_PER_SOURCE_SERVER.
int getSrcWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) {
	const bool singleSourceLeft = relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
	                              relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT;

	int sharingServers;
	if (singleSourceLeft) {
		sharingServers = 1;
	} else if (relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) {
		sharingServers = 2;
	} else {
		// for now we assume that any message at a lower priority can best be assumed to have a full team left
		// for work
		sharingServers = singleRegionTeamSize;
	}
	return WORK_FULL_UTILIZATION / sharingServers / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
}

// Work charged to each destination server of a relocation: the work of moving a shard is spread evenly across
// destination servers, so this only depends on RELOCATION_PARALLELISM_PER_DEST_SERVER.
int getDestWorkFactor() {
	const auto parallelismPerDest = SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER;
	return WORK_FULL_UTILIZATION / parallelismPerDest;
}

// Data movement's resource control on the source side: do not overload the servers used for the RelocateData.
// Returns true if enough source servers are idle enough to launch `relocation`, assuming every relocation in
// `cancellableRelocations` would be cancelled first (their work is subtracted from a copy of the busyness).
// Note that looking up a source server in `busymap` creates an empty entry for it if none exists yet.
bool canLaunchSrc(RelocateData& relocation,
                  int teamSize,
                  int singleRegionTeamSize,
                  std::map<UID, Busyness>& busymap,
                  std::vector<RelocateData> cancellableRelocations) {
	// The relocation must not have been launched yet and must already know its source servers.
	ASSERT(relocation.workFactor == 0);
	ASSERT(relocation.src.size() != 0);
	ASSERT(teamSize >= singleRegionTeamSize);

	// The "workFactor" this relocation would get, were it launched now.
	const int workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize);

	// Number of source servers that must be able to take the work.
	int serversStillNeeded = SERVER_KNOBS->USE_OLD_NEEDED_SERVERS
	                             ? std::max(1, (int)relocation.src.size() - teamSize + 1)
	                             : std::min<int>(relocation.src.size(), teamSize - singleRegionTeamSize + 1);

	for (const UID& sourceId : relocation.src) {
		// Work on a copy so that the real busymap keeps reflecting work that has not been cancelled yet.
		Busyness projected = busymap[sourceId];
		for (const RelocateData& cancellable : cancellableRelocations) {
			const bool sharesSource = std::count(cancellable.src.begin(), cancellable.src.end(), sourceId) > 0;
			if (sharesSource) {
				projected.removeWork(cancellable.priority, cancellable.workFactor);
			}
		}

		if (!projected.canLaunch(relocation.priority, workFactor)) {
			continue;
		}
		if (--serversStillNeeded == 0) {
			return true;
		}
	}
	return false;
}

// Destination-side resource control. candidateTeams is a vector containing one team per datacenter, the team(s)
// DD is planning on moving the shard to. Returns false as soon as one destination server cannot take the
// destination work factor at `priority`; returns true unconditionally when the per-destination parallelism knob
// is not positive.
bool canLaunchDest(const std::vector<std::pair<Reference<IDataDistributionTeam>, bool>>& candidateTeams,
                   int priority,
                   std::map<UID, Busyness>& busymapDest) {
	// fail switch if this is causing issues
	const bool checkDisabled = SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER <= 0;
	if (checkDisabled) {
		return true;
	}

	const int workFactor = getDestWorkFactor();
	for (const auto& candidate : candidateTeams) {
		const auto& team = candidate.first;
		for (UID id : team->getServerIDs()) {
			const bool hasCapacity = busymapDest[id].canLaunch(priority, workFactor);
			if (!hasCapacity) {
				return false;
			}
		}
	}
	return true;
}

// Marks `relocation` as launched: stores its source work factor and charges that work to the busyness of every
// source server. Callers are expected to have checked beforehand that the launch is allowed.
void launch(RelocateData& relocation, std::map<UID, Busyness>& busymap, int singleRegionTeamSize) {
	relocation.workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize);
	for (const UID& sourceId : relocation.src) {
		busymap[sourceId].addWork(relocation.priority, relocation.workFactor);
	}
}

// Records every server of the candidate teams as a destination of `relocation` (in completeDests) and charges
// the destination work factor to each of them. completeDests must still be empty when this is called.
void launchDest(RelocateData& relocation,
                const std::vector<std::pair<Reference<IDataDistributionTeam>, bool>>& candidateTeams,
                std::map<UID, Busyness>& destBusymap) {
	ASSERT(relocation.completeDests.empty());

	const int workPerDest = getDestWorkFactor();
	for (const auto& candidate : candidateTeams) {
		const auto& team = candidate.first;
		for (UID id : team->getServerIDs()) {
			relocation.completeDests.push_back(id);
			destBusymap[id].addWork(relocation.priority, workPerDest);
		}
	}
}
// Releases the destination work that launchDest() charged for `relocation`, one server in completeDests at a
// time.
void completeDest(RelocateData const& relocation, std::map<UID, Busyness>& destBusymap) {
	const int workPerDest = getDestWorkFactor();
	for (const UID& destId : relocation.completeDests) {
		destBusymap[destId].removeWork(relocation.priority, workPerDest);
	}
}

// Releases all work charged for a launched relocation: first the source work on every source server, then the
// destination work via completeDest(). The relocation must have been launched (workFactor > 0).
void complete(RelocateData const& relocation, std::map<UID, Busyness>& busymap, std::map<UID, Busyness>& destBusymap) {
	ASSERT(relocation.workFactor > 0);

	for (const UID& sourceId : relocation.src) {
		busymap[sourceId].removeWork(relocation.priority, relocation.workFactor);
	}

	completeDest(relocation, destBusymap);
}

// Cancels in-flight data moves intersecting with range.
ACTOR Future<Void> cancelDataMove(struct DDQueue* self, KeyRange range, const DDEnabledState* ddEnabledState);

ACTOR Future<Void> dataDistributionRelocator(struct DDQueue* self,
                                             RelocateData rd,
                                             Future<Void> prevCleanup,
                                             const DDEnabledState* ddEnabledState);

struct DDQueue : public IDDRelocationQueue {
	// Pairs a data move ID with a future named `cancel` (presumably the pending cancellation of that move —
	// confirm against cancelDataMove()).
	struct DDDataMove {
		UID id;
		Future<Void> cancel;

		DDDataMove() = default;
		explicit DDDataMove(UID id) : id(id) {}

		// A default-constructed entry is valid only if a default UID is.
		bool isValid() const { return id.isValid(); }
	};

	struct ServerCounter {
		enum CountType : uint8_t { ProposedSource = 0, QueuedSource, LaunchedSource, LaunchedDest, __COUNT };

	private:
		typedef std::array<int, (int)__COUNT> Item; // one for each CountType
		typedef std::array<Item, RelocateReason::typeCount()> ReasonItem; // one for each RelocateReason

		std::unordered_map<UID, ReasonItem> counter;

		std::string toString(const Item& item) const {
			return format("%d %d %d %d", item[0], item[1], item[2], item[3]);
		}

		void traceReasonItem(TraceEvent* event, const ReasonItem& item) const {
			for (int i = 0; i < item.size(); ++i) {
				if (std::accumulate(item[i].cbegin(), item[i].cend(), 0) > 0) {
					// "PQSD" corresponding to CounterType
					event->detail(RelocateReason(i).toString() + "PQSD", toString(item[i]));
				}
			}
		}

		bool countNonZero(const ReasonItem& item, CountType type) const {
			return std::any_of(item.cbegin(), item.cend(), [type](const Item& item) { return item[(int)type] > 0; });
		}

		void increase(const UID& id, RelocateReason reason, CountType type) {
			int idx = (int)(reason);
			// if (idx < 0 || idx >= RelocateReason::typeCount()) {
			// 	TraceEvent(SevWarnAlways, "ServerCounterDebug").detail("Reason", reason.toString());
			// }
			ASSERT(idx >= 0 && idx < RelocateReason::typeCount());
			counter[id][idx][(int)type] += 1;
		}

		void summarizeLaunchedServers(decltype(counter.cbegin()) begin,
		                              decltype(counter.cend()) end,
		                              TraceEvent* event) const {
			if (begin == end)
				return;

			std::string execSrc, execDest;
			for (; begin != end; ++begin) {
				if (countNonZero(begin->second, LaunchedSource)) {
					execSrc += begin->first.shortString() + ",";
				}
				if (countNonZero(begin->second, LaunchedDest)) {
					execDest += begin->first.shortString() + ",";
				}
			}
			event->detail("RemainedLaunchedSources", execSrc).detail("RemainedLaunchedDestinations", execDest);
		}

	public:
		void clear() { counter.clear(); }

		int get(const UID& id, RelocateReason reason, CountType type) const {
			return counter.at(id)[(int)reason][(int)type];
		}

		void increaseForTeam(const std::vector<UID>& ids, RelocateReason reason, CountType type) {
			for (auto& id : ids) {
				increase(id, reason, type);
			}
		}

		void traceAll(const UID& debugId = UID()) const {
			auto it = counter.cbegin();
			int count = 0;
			for (; count < SERVER_KNOBS->DD_QUEUE_COUNTER_MAX_LOG && it != counter.cend(); ++count, ++it) {
				TraceEvent event("DDQueueServerCounter", debugId);
				event.detail("ServerId", it->first);
				traceReasonItem(&event, it->second);
			}

			if (it != counter.cend()) {
				TraceEvent e(SevWarn, "DDQueueServerCounterTooMany", debugId);
				e.detail("Servers", size());
				if (SERVER_KNOBS->DD_QUEUE_COUNTER_SUMMARIZE) {
					summarizeLaunchedServers(it, counter.cend(), &e);
					return;
				}
			}
		}

		size_t size() const { return counter.size(); }

		// for random test
		static CountType randomCountType() {
			int i = deterministicRandom()->randomInt(0, (int)__COUNT);
			return (CountType)i;
		}
	};

	ActorCollectionNoErrors noErrorActors; // has to be the last one to be destroyed because other Actors may use it.
	UID distributorId;
	MoveKeysLock lock;
	Database cx;
	Reference<IDDTxnProcessor> txnProcessor;

	std::vector<TeamCollectionInterface> teamCollections;
	Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
	Reference<PhysicalShardCollection> physicalShardCollection;
	PromiseStream<Promise<int64_t>> getAverageShardBytes;

	FlowLock startMoveKeysParallelismLock;
	FlowLock finishMoveKeysParallelismLock;
	FlowLock cleanUpDataMoveParallelismLock;
	Reference<FlowLock> fetchSourceLock;

	int activeRelocations;
	int queuedRelocations;
	int64_t bytesWritten;
	int teamSize;
	int singleRegionTeamSize;

	std::map<UID, Busyness> busymap; // UID is serverID
	std::map<UID, Busyness> destBusymap; // UID is serverID

	KeyRangeMap<RelocateData> queueMap;
	std::set<RelocateData, std::greater<RelocateData>> fetchingSourcesQueue;
	std::set<RelocateData, std::greater<RelocateData>> fetchKeysComplete;
	KeyRangeActorMap getSourceActors;
	std::map<UID, std::set<RelocateData, std::greater<RelocateData>>>
	    queue; // Key UID is serverID, value is the serverID's set of RelocateData to relocate
	// The last time one server was selected as source team for read rebalance reason. We want to throttle read
	// rebalance on time bases because the read workload sample update has delay after the previous moving
	std::map<UID, double> lastAsSource;
	ServerCounter serverCounter;

	KeyRangeMap<RelocateData> inFlight;
	// Tracks all actors that relocate the specified keys to a good place; Key: keyRange; Value: actor
	KeyRangeActorMap inFlightActors;
	KeyRangeMap<DDDataMove> dataMoves;

	Promise<Void> error;
	PromiseStream<RelocateData> dataTransferComplete;
	PromiseStream<RelocateData> relocationComplete;
	PromiseStream<RelocateData> fetchSourceServersComplete; // find source SSs for a relocate range

	PromiseStream<RelocateShard> output;
	FutureStream<RelocateShard> input;
	PromiseStream<GetMetricsRequest> getShardMetrics;
	PromiseStream<GetTopKMetricsRequest> getTopKMetrics;

	double lastInterval;
	int suppressIntervals;

	Reference<AsyncVar<bool>> rawProcessingUnhealthy; // many operations will remove relocations before adding a new
	                                                  // one, so delay a small time before settling on a new number.
	Reference<AsyncVar<bool>> rawProcessingWiggle;

	std::map<int, int> priority_relocations;
	int unhealthyRelocations;

	Reference<EventCacheHolder> movedKeyServersEventHolder;

	int moveReusePhysicalShard;
	int moveCreateNewPhysicalShard;
	enum RetryFindDstReason {
		None = 0,
		RemoteBestTeamNotReady,
		PrimaryNoHealthyTeam,
		RemoteNoHealthyTeam,
		RemoteTeamIsFull,
		RemoteTeamIsNotHealthy,
		NoAvailablePhysicalShard,
		NumberOfTypes,
	};
	std::vector<int> retryFindDstReasonCount;

	// Accounts for a relocation that starts: bumps unhealthyRelocations (and raises rawProcessingUnhealthy) for
	// the health priorities listed below, raises rawProcessingWiggle for the perpetual-wiggle priority, and
	// increments the per-priority relocation count.
	void startRelocation(int priority, int healthPriority) {
		// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
		// we must count it into unhealthyRelocations; because team removers relies on unhealthyRelocations to
		// ensure a team remover will not start before the previous one finishes removing a team and move away data
		// NOTE: split and merge shard have higher priority. If they have to wait for unhealthyRelocations = 0,
		// deadlock may happen: split/merge shard waits for unhealthyRelocations, while blocks team_redundant.
		const int unhealthyPriorities[] = { SERVER_KNOBS->PRIORITY_POPULATE_REGION,
			                                SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY,
			                                SERVER_KNOBS->PRIORITY_TEAM_2_LEFT,
			                                SERVER_KNOBS->PRIORITY_TEAM_1_LEFT,
			                                SERVER_KNOBS->PRIORITY_TEAM_0_LEFT,
			                                SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT };
		bool countsAsUnhealthy = false;
		for (int candidate : unhealthyPriorities) {
			if (healthPriority == candidate) {
				countsAsUnhealthy = true;
				break;
			}
		}

		if (countsAsUnhealthy) {
			unhealthyRelocations++;
			rawProcessingUnhealthy->set(true);
		}

		const bool isWiggle = healthPriority == SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE;
		if (isWiggle) {
			rawProcessingWiggle->set(true);
		}

		priority_relocations[priority]++;
	}
	// Counterpart of startRelocation(): drops unhealthyRelocations for the same set of health priorities (and
	// clears rawProcessingUnhealthy once it reaches zero), decrements the per-priority relocation count, and
	// clears rawProcessingWiggle when no relocation is counted under the perpetual-wiggle priority.
	void finishRelocation(int priority, int healthPriority) {
		const int unhealthyPriorities[] = { SERVER_KNOBS->PRIORITY_POPULATE_REGION,
			                                SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY,
			                                SERVER_KNOBS->PRIORITY_TEAM_2_LEFT,
			                                SERVER_KNOBS->PRIORITY_TEAM_1_LEFT,
			                                SERVER_KNOBS->PRIORITY_TEAM_0_LEFT,
			                                SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT };
		bool countedAsUnhealthy = false;
		for (int candidate : unhealthyPriorities) {
			if (healthPriority == candidate) {
				countedAsUnhealthy = true;
				break;
			}
		}

		if (countedAsUnhealthy) {
			unhealthyRelocations--;
			ASSERT(unhealthyRelocations >= 0);
			if (unhealthyRelocations == 0) {
				rawProcessingUnhealthy->set(false);
			}
		}

		priority_relocations[priority]--;

		// operator[] creates a zero entry for the wiggle priority if none exists yet.
		const int wiggleRelocations = priority_relocations[SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE];
		if (wiggleRelocations == 0) {
			rawProcessingWiggle->set(false);
		}
	}

	// Builds a relocation queue for distributor `mid`.
	// - `db` is kept as txnProcessor and also supplies the Database handle `cx` (via db->context()).
	// - The three move-keys FlowLocks are all sized by DD_MOVE_KEYS_PARALLELISM; fetchSourceLock is sized by
	//   DD_FETCH_SOURCE_PARALLELISM.
	// - All counters start at 0, both rawProcessing* flags start as false, and retryFindDstReasonCount gets one
	//   zeroed slot per RetryFindDstReason value.
	// Members that do not appear in the initializer list are default-constructed.
	DDQueue(UID mid,
	        MoveKeysLock lock,
	        Reference<IDDTxnProcessor> db,
	        std::vector<TeamCollectionInterface> teamCollections,
	        Reference<ShardsAffectedByTeamFailure> sABTF,
	        Reference<PhysicalShardCollection> physicalShardCollection,
	        PromiseStream<Promise<int64_t>> getAverageShardBytes,
	        int teamSize,
	        int singleRegionTeamSize,
	        PromiseStream<RelocateShard> output,
	        FutureStream<RelocateShard> input,
	        PromiseStream<GetMetricsRequest> getShardMetrics,
	        PromiseStream<GetTopKMetricsRequest> getTopKMetrics)
	  : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(db->context()), txnProcessor(db),
	    teamCollections(teamCollections), shardsAffectedByTeamFailure(sABTF),
	    physicalShardCollection(physicalShardCollection), getAverageShardBytes(getAverageShardBytes),
	    startMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
	    finishMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
	    cleanUpDataMoveParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
	    fetchSourceLock(new FlowLock(SERVER_KNOBS->DD_FETCH_SOURCE_PARALLELISM)), activeRelocations(0),
	    queuedRelocations(0), bytesWritten(0), teamSize(teamSize), singleRegionTeamSize(singleRegionTeamSize),
	    output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
	    suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
	    rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
	    movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
	    moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) {
	}
	// NOTE(review): the scalar members (activeRelocations, teamSize, lastInterval, ...) are declared without
	// in-class initializers, so this defaulted constructor appears to leave them uninitialized unless the object
	// is value-initialized — confirm how default-constructed queues are used.
	DDQueue() = default;

	// Sanity-checks the queue's bookkeeping structures against each other. The body only runs when
	// EXPENSIVE_VALIDATION is set. Inconsistencies found by the per-entry checks are reported as SevError trace
	// events (DDQueueValidateError1 .. DDQueueValidateError17); the two counter checks at the end are ASSERTs.
	void validate() {
		if (EXPENSIVE_VALIDATION) {
			// --- fetchingSourcesQueue: entries still waiting for their source servers ---
			for (auto it = fetchingSourcesQueue.begin(); it != fetchingSourcesQueue.end(); ++it) {
				// relocates in the fetching queue do not have src servers yet.
				if (it->src.size())
					TraceEvent(SevError, "DDQueueValidateError1")
					    .detail("Problem", "relocates in the fetching queue do not have src servers yet");

				// relocates in the fetching queue do not have a work factor yet.
				if (it->workFactor != 0.0)
					TraceEvent(SevError, "DDQueueValidateError2")
					    .detail("Problem", "relocates in the fetching queue do not have a work factor yet");

				// relocates in the fetching queue are in the queueMap.
				auto range = queueMap.rangeContaining(it->keys.begin);
				if (range.value() != *it || range.range() != it->keys)
					TraceEvent(SevError, "DDQueueValidateError3")
					    .detail("Problem", "relocates in the fetching queue are in the queueMap");
			}

			// Disabled checks (errors 4-7) for the per-server queue; kept for reference.
			/*
			for( auto it = queue.begin(); it != queue.end(); ++it ) {
			    for( auto rdit = it->second.begin(); rdit != it->second.end(); ++rdit ) {
			        // relocates in the queue are in the queueMap exactly.
			        auto range = queueMap.rangeContaining( rdit->keys.begin );
			        if( range.value() != *rdit || range.range() != rdit->keys )
			            TraceEvent(SevError, "DDQueueValidateError4").detail("Problem", "relocates in the queue are in the queueMap exactly")
			            .detail("RangeBegin", range.range().begin)
			            .detail("RangeEnd", range.range().end)
			            .detail("RelocateBegin2", range.value().keys.begin)
			            .detail("RelocateEnd2", range.value().keys.end)
			            .detail("RelocateStart", range.value().startTime)
			            .detail("MapStart", rdit->startTime)
			            .detail("RelocateWork", range.value().workFactor)
			            .detail("MapWork", rdit->workFactor)
			            .detail("RelocateSrc", range.value().src.size())
			            .detail("MapSrc", rdit->src.size())
			            .detail("RelocatePrio", range.value().priority)
			            .detail("MapPrio", rdit->priority);

			        // relocates in the queue have src servers
			        if( !rdit->src.size() )
			            TraceEvent(SevError, "DDQueueValidateError5").detail("Problem", "relocates in the queue have src servers");

			        // relocates in the queue do not have a work factor yet.
			        if( rdit->workFactor != 0.0 )
			            TraceEvent(SevError, "DDQueueValidateError6").detail("Problem", "relocates in the queue do not have a work factor yet");

			        bool contains = false;
			        for( int i = 0; i < rdit->src.size(); i++ ) {
			            if( rdit->src[i] == it->first ) {
			                contains = true;
			                break;
			            }
			        }
			        if( !contains )
			            TraceEvent(SevError, "DDQueueValidateError7").detail("Problem", "queued relocate data does not include ss under which its filed");
			    }
			}*/

			// --- inFlight: every range of the in-flight map ---
			auto inFlightRanges = inFlight.ranges();
			for (auto it = inFlightRanges.begin(); it != inFlightRanges.end(); ++it) {
				for (int i = 0; i < it->value().src.size(); i++) {
					// each server in the inFlight map is in the busymap
					if (!busymap.count(it->value().src[i]))
						TraceEvent(SevError, "DDQueueValidateError8")
						    .detail("Problem", "each server in the inFlight map is in the busymap");

					// relocate data that is inFlight is not also in the queue
					if (queue[it->value().src[i]].count(it->value()))
						TraceEvent(SevError, "DDQueueValidateError9")
						    .detail("Problem", "relocate data that is inFlight is not also in the queue");
				}

				for (int i = 0; i < it->value().completeDests.size(); i++) {
					// each server in the inFlight map is in the dest busymap
					if (!destBusymap.count(it->value().completeDests[i]))
						TraceEvent(SevError, "DDQueueValidateError10")
						    .detail("Problem", "each server in the inFlight map is in the destBusymap");
				}

				// in flight relocates have source servers
				// (entries whose startTime is -1 are exempt from this check)
				if (it->value().startTime != -1 && !it->value().src.size())
					TraceEvent(SevError, "DDQueueValidateError11")
					    .detail("Problem", "in flight relocates have source servers");

				if (inFlightActors.liveActorAt(it->range().begin)) {
					// the key range in the inFlight map matches the key range in the RelocateData message
					if (it->value().keys != it->range())
						TraceEvent(SevError, "DDQueueValidateError12")
						    .detail(
						        "Problem",
						        "the key range in the inFlight map matches the key range in the RelocateData message");
				} else if (it->value().cancellable) {
					// no live actor covers this range, so it must not still be marked cancellable
					TraceEvent(SevError, "DDQueueValidateError13")
					    .detail("Problem", "key range is cancellable but not in flight!")
					    .detail("Range", it->range());
				}
			}

			// --- busymap: each ledger must be non-increasing and non-negative ---
			// NOTE(review): the loop bound `ledger.size() - 1` means the last ledger slot is never tested for
			// negativity, and would wrap around if size() is unsigned and a ledger were empty — confirm that
			// ledgers are never empty.
			for (auto it = busymap.begin(); it != busymap.end(); ++it) {
				for (int i = 0; i < it->second.ledger.size() - 1; i++) {
					if (it->second.ledger[i] < it->second.ledger[i + 1])
						TraceEvent(SevError, "DDQueueValidateError14")
						    .detail("Problem", "ascending ledger problem")
						    .detail("LedgerLevel", i)
						    .detail("LedgerValueA", it->second.ledger[i])
						    .detail("LedgerValueB", it->second.ledger[i + 1]);
					if (it->second.ledger[i] < 0.0)
						TraceEvent(SevError, "DDQueueValidateError15")
						    .detail("Problem", "negative ascending problem")
						    .detail("LedgerLevel", i)
						    .detail("LedgerValue", it->second.ledger[i]);
				}
			}

			// --- destBusymap: same two ledger checks as for busymap above ---
			for (auto it = destBusymap.begin(); it != destBusymap.end(); ++it) {
				for (int i = 0; i < it->second.ledger.size() - 1; i++) {
					if (it->second.ledger[i] < it->second.ledger[i + 1])
						TraceEvent(SevError, "DDQueueValidateError16")
						    .detail("Problem", "ascending ledger problem")
						    .detail("LedgerLevel", i)
						    .detail("LedgerValueA", it->second.ledger[i])
						    .detail("LedgerValueB", it->second.ledger[i + 1]);
					if (it->second.ledger[i] < 0.0)
						TraceEvent(SevError, "DDQueueValidateError17")
						    .detail("Problem", "negative ascending problem")
						    .detail("LedgerLevel", i)
						    .detail("LedgerValue", it->second.ledger[i]);
				}
			}

			// queuedRelocations must equal the number of distinct relocations across all per-server queues
			// (the set de-duplicates entries filed under several servers) plus those still fetching sources.
			std::set<RelocateData, std::greater<RelocateData>> queuedRelocationsMatch;
			for (auto it = queue.begin(); it != queue.end(); ++it)
				queuedRelocationsMatch.insert(it->second.begin(), it->second.end());
			ASSERT(queuedRelocations == queuedRelocationsMatch.size() + fetchingSourcesQueue.size());

			// The per-priority counters must sum to the number of active plus queued relocations.
			int testActive = 0;
			for (auto it = priority_relocations.begin(); it != priority_relocations.end(); ++it)
				testActive += it->second;
			ASSERT(activeRelocations + queuedRelocations == testActive);
		}
	}

	// Actor that looks up the source servers for `input.keys` and reports the completed request on `output`.
	//
	// self:      queue whose txnProcessor performs the lookup.
	// input:     the relocation to fill in; taken by value, so the caller's copy is not modified.
	// output:    stream that receives `input` once its src and completeSources have been filled in.
	// fetchLock: lock taken (at DataDistributionLaunch priority) before the lookup is issued.
	ACTOR static Future<Void> getSourceServersForRange(DDQueue* self,
	                                                   RelocateData input,
	                                                   PromiseStream<RelocateData> output,
	                                                   Reference<FlowLock> fetchLock) {

		// Merge-priority requests wait 0.5s at very low task priority; everything else waits only 0.0001s.
		// FIXME: is the merge case needed
		if (input.priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD) {
			wait(delay(0.5, TaskPriority::DataDistributionVeryLow));
		} else {
			wait(delay(0.0001, TaskPriority::DataDistributionLaunch));
		}

		wait(fetchLock->take(TaskPriority::DataDistributionLaunch));
		// NOTE(review): presumably the Releaser gives the lock back when this actor finishes or is cancelled —
		// confirm against FlowLock::Releaser.
		state FlowLock::Releaser releaser(*fetchLock);

		IDDTxnProcessor::SourceServers res = wait(self->txnProcessor->getSourceServersForRange(input.keys));
		input.src = std::move(res.srcServers);
		input.completeSources = std::move(res.completeSources);
		output.send(input);
		return Void();
	}

	// Queues the relocation request `rs`.
	//
	// Phase 1: every entry of queueMap intersecting rs.keys that is still queued (either in fetchingSourcesQueue or
	//          in the per-server queue of its first source server) is counted as finished. Its wantsNewServers,
	//          startTime and health/boundary priorities are merged into the new request, and its source servers
	//          are added to `serversToLaunchFrom`. Entries fully contained in rs.keys are also removed from the
	//          queues they sit in.
	// Phase 2: the new request is inserted into queueMap, the getSourceServers actors over the affected span are
	//          cancelled, and each affected (possibly truncated) range is re-registered: either as a fresh
	//          source-server fetch, or by re-filing the truncated entry in its per-server queues.
	//
	// serversToLaunchFrom: output set; receives the source servers of the queued relocations touched in phase 1.
	//
	// This function cannot handle relocation requests which split a shard into three pieces
	void queueRelocation(RelocateShard rs, std::set<UID>& serversToLaunchFrom) {
		//TraceEvent("QueueRelocationBegin").detail("Begin", rd.keys.begin).detail("End", rd.keys.end);

		// remove all items from both queues that are fully contained in the new relocation (i.e. will be overwritten)
		RelocateData rd(rs);
		// Remember whether the NEW request itself carries a health / boundary priority; if it does, the
		// corresponding priority of an overlapped request is not merged into it below.
		bool hasHealthPriority = RelocateData::isHealthPriority(rd.priority);
		bool hasBoundaryPriority = RelocateData::isBoundaryPriority(rd.priority);

		auto ranges = queueMap.intersectingRanges(rd.keys);
		for (auto r = ranges.begin(); r != ranges.end(); ++r) {
			RelocateData& rrs = r->value();

			auto fetchingSourcesItr = fetchingSourcesQueue.find(rrs);
			bool foundActiveFetching = fetchingSourcesItr != fetchingSourcesQueue.end();
			// firstQueue / firstRelocationItr are only assigned, and only read, when foundActiveRelocation
			// ends up true.
			std::set<RelocateData, std::greater<RelocateData>>* firstQueue;
			std::set<RelocateData, std::greater<RelocateData>>::iterator firstRelocationItr;
			bool foundActiveRelocation = false;

			if (!foundActiveFetching && rrs.src.size()) {
				// Probing the queue of the first source server is used as the test for "still queued".
				firstQueue = &queue[rrs.src[0]];
				firstRelocationItr = firstQueue->find(rrs);
				foundActiveRelocation = firstRelocationItr != firstQueue->end();
			}

			// If there is a queued job that wants data relocation which we are about to cancel/modify,
			//  make sure that we keep the relocation intent for the job that we queue up
			if (foundActiveFetching || foundActiveRelocation) {
				rd.wantsNewServers |= rrs.wantsNewServers;
				rd.startTime = std::min(rd.startTime, rrs.startTime);
				if (!hasHealthPriority) {
					rd.healthPriority = std::max(rd.healthPriority, rrs.healthPriority);
				}
				if (!hasBoundaryPriority) {
					rd.boundaryPriority = std::max(rd.boundaryPriority, rrs.boundaryPriority);
				}
				rd.priority = std::max(rd.priority, std::max(rd.boundaryPriority, rd.healthPriority));
			}

			// Only entries completely covered by the new request are removed here; partially covered ones are
			// truncated and re-filed in the second loop below.
			if (rd.keys.contains(rrs.keys)) {
				if (foundActiveFetching)
					fetchingSourcesQueue.erase(fetchingSourcesItr);
				else if (foundActiveRelocation) {
					firstQueue->erase(firstRelocationItr);
					for (int i = 1; i < rrs.src.size(); i++)
						queue[rrs.src[i]].erase(rrs);
				}
			}

			// Every queued entry that was found is un-counted here, whether or not it was removed above.
			if (foundActiveFetching || foundActiveRelocation) {
				serversToLaunchFrom.insert(rrs.src.begin(), rrs.src.end());
				/*TraceEvent(rrs.interval.end(), mi.id()).detail("Result","Cancelled")
				    .detail("WasFetching", foundActiveFetching).detail("Contained", rd.keys.contains( rrs.keys ));*/
				queuedRelocations--;
				TraceEvent(SevVerbose, "QueuedRelocationsChanged")
				    .detail("DataMoveID", rrs.dataMoveId)
				    .detail("RandomID", rrs.randomId)
				    .detail("Total", queuedRelocations);
				finishRelocation(rrs.priority, rrs.healthPriority);
			}
		}

		// determine the final state of the relocations map
		auto affectedQueuedItems = queueMap.getAffectedRangesAfterInsertion(rd.keys, rd);

		// put the new request into the global map of requests (modifies the ranges already present)
		queueMap.insert(rd.keys, rd);

		// cancel all the getSourceServers actors that intersect the new range that we will be getting
		getSourceActors.cancel(KeyRangeRef(affectedQueuedItems.front().begin, affectedQueuedItems.back().end));

		// update fetchingSourcesQueue and the per-server queue based on truncated ranges after insertion, (re-)launch
		// getSourceServers
		auto queueMapItr = queueMap.rangeContaining(affectedQueuedItems[0].begin);
		for (int r = 0; r < affectedQueuedItems.size(); ++r, ++queueMapItr) {
			// ASSERT(queueMapItr->value() == queueMap.rangeContaining(affectedQueuedItems[r].begin)->value());
			RelocateData& rrs = queueMapItr->value();

			// Entry without source servers that is either the new request itself or was still in
			// fetchingSourcesQueue (the erase() inside the condition removes the stale entry): (re)start the
			// source-server fetch for its possibly truncated range.
			if (rrs.src.size() == 0 && (rrs.keys == rd.keys || fetchingSourcesQueue.erase(rrs) > 0)) {
				rrs.keys = affectedQueuedItems[r];
				rrs.interval = TraceInterval("QueuedRelocation", rrs.randomId); // inherit the old randomId

				DebugRelocationTraceEvent(rrs.interval.begin(), distributorId)
				    .detail("KeyBegin", rrs.keys.begin)
				    .detail("KeyEnd", rrs.keys.end)
				    .detail("Priority", rrs.priority)
				    .detail("WantsNewServers", rrs.wantsNewServers);

				queuedRelocations++;
				TraceEvent(SevVerbose, "QueuedRelocationsChanged")
				    .detail("DataMoveID", rrs.dataMoveId)
				    .detail("RandomID", rrs.randomId)
				    .detail("Total", queuedRelocations);
				startRelocation(rrs.priority, rrs.healthPriority);

				fetchingSourcesQueue.insert(rrs);
				getSourceActors.insert(
				    rrs.keys, getSourceServersForRange(this, rrs, fetchSourceServersComplete, fetchSourceLock));
			} else {
				// Entry that already has source servers (or is "dead"): replace it in every per-server queue
				// that still holds it with a copy carrying the truncated key range.
				RelocateData newData(rrs);
				newData.keys = affectedQueuedItems[r];
				ASSERT(rrs.src.size() || rrs.startTime == -1);

				bool foundActiveRelocation = false;
				for (int i = 0; i < rrs.src.size(); i++) {
					auto& serverQueue = queue[rrs.src[i]];

					if (serverQueue.erase(rrs) > 0) {
						// Count and trace the re-queued relocation once, on the first server queue holding it.
						if (!foundActiveRelocation) {
							newData.interval =
							    TraceInterval("QueuedRelocation", rrs.randomId); // inherit the old randomId

							DebugRelocationTraceEvent(newData.interval.begin(), distributorId)
							    .detail("KeyBegin", newData.keys.begin)
							    .detail("KeyEnd", newData.keys.end)
							    .detail("Priority", newData.priority)
							    .detail("WantsNewServers", newData.wantsNewServers);

							queuedRelocations++;
							TraceEvent(SevVerbose, "QueuedRelocationsChanged")
							    .detail("DataMoveID", newData.dataMoveId)
							    .detail("RandomID", newData.randomId)
							    .detail("Total", queuedRelocations);
							startRelocation(newData.priority, newData.healthPriority);
							foundActiveRelocation = true;
						}

						serverQueue.insert(newData);
					} else
						// Stop at the first source server whose queue does not hold the entry.
						break;
				}

				// We update the keys of a relocation even if it is "dead" since it helps validate()
				rrs.keys = affectedQueuedItems[r];
				rrs.interval = newData.interval;
			}
		}

		DebugRelocationTraceEvent("ReceivedRelocateShard", distributorId)
		    .detail("KeyBegin", rd.keys.begin)
		    .detail("KeyEnd", rd.keys.end)
		    .detail("Priority", rd.priority)
		    .detail("AffectedRanges", affectedQueuedItems.size());
	}

	// Handles a finished source-server lookup. `results` must currently be in fetchingSourcesQueue; it is taken
	// out of that queue, written into queueMap over its key range, and filed in the per-server queue of each of
	// its source servers. The source servers' last-used time and queued-source counter are updated as well.
	void completeSourceFetch(const RelocateData& results) {
		ASSERT(fetchingSourcesQueue.count(results));

		// logRelocation( results, "GotSourceServers" );

		fetchingSourcesQueue.erase(results);
		queueMap.insert(results.keys, results);

		// File the relocation under every source server.
		for (const auto& sourceServer : results.src) {
			queue[sourceServer].insert(results);
		}

		updateLastAsSource(results.src);
		serverCounter.increaseForTeam(results.src, results.reason, ServerCounter::CountType::QueuedSource);
	}

	// Traces a relocation under the event name `title`: key range, priority, work factor, the number of source
	// servers, and (for at most teamSize * 2 of them) the source servers' ids and busymap entries.
	void logRelocation(const RelocateData& rd, const char* title) {
		// Upper bound on how many source servers are spelled out in the event.
		const auto maxListed = teamSize * 2;

		std::string sourceBusyness;
		for (int idx = 0; idx < maxListed && idx < rd.src.size(); ++idx) {
			const auto& serverId = rd.src[idx];
			sourceBusyness += describe(serverId) + " - (" + busymap[serverId].toString() + "); ";
		}

		TraceEvent(title, distributorId)
		    .detail("KeyBegin", rd.keys.begin)
		    .detail("KeyEnd", rd.keys.end)
		    .detail("Priority", rd.priority)
		    .detail("WorkFactor", rd.workFactor)
		    .detail("SourceServerCount", rd.src.size())
		    .detail("SourceServers", describe(rd.src, maxListed))
		    .detail("SourceBusyness", sourceBusyness);
	}

	// Gathers every relocation in queueMap that intersects `keys` and is still filed in the per-server queue of
	// its first source server, then tries to launch the gathered set.
	void launchQueuedWork(KeyRange keys, const DDEnabledState* ddEnabledState) {
		std::set<RelocateData, std::greater<RelocateData>> launchable;

		auto overlapping = queueMap.intersectingRanges(keys);
		for (auto rangeIt = overlapping.begin(); rangeIt != overlapping.end(); ++rangeIt) {
			const RelocateData& queued = rangeIt->value();

			// Without source servers the relocation cannot be in any per-server queue.
			if (queued.src.size() == 0) {
				continue;
			}
			if (queue[queued.src[0]].count(queued)) {
				launchable.insert(queued);
			}
		}

		launchQueuedWork(launchable, ddEnabledState);
	}

	void launchQueuedWork(const std::set<UID>& serversToLaunchFrom, const DDEnabledState* ddEnabledState) {
		// combine all work from the source servers to see if there is anything new to launch
		std::set<RelocateData, std::greater<RelocateData>> combined;
		for (auto id : serversToLaunchFrom) {
			auto& queuedWork = queue[id];
			auto it = queuedWork.begin();
			for (int j = 0; j < teamSize && it != queuedWork.end(); j++) {
				combined.insert(*it);
				++it;
			}
		}
		launchQueuedWork(combined, ddEnabledState);
	}

	// Convenience overload: tries to launch one relocation by wrapping it in a single-element set.
	void launchQueuedWork(RelocateData launchData, const DDEnabledState* ddEnabledState) {
		std::set<RelocateData, std::greater<RelocateData>> single{ launchData };
		launchQueuedWork(single, ddEnabledState);
	}

	// Tries to launch every relocation in `combined`, visiting them in the set's (std::greater) order.
	//
	// For each candidate rd:
	//   1. It is skipped if a qualifying overlapping in-flight relocation exists (see the condition below).
	//   2. Unless rd is a restore, it is skipped if canLaunchSrc() rejects it for its source servers.
	//   3. Otherwise it is removed from the queued bookkeeping, the in-flight actors over the affected span are
	//      cancelled, rd is inserted into inFlight, and a dataDistributionRelocator actor is started for every
	//      resulting range.
	// validate() is called once at the end.
	void launchQueuedWork(std::set<RelocateData, std::greater<RelocateData>> combined,
	                      const DDEnabledState* ddEnabledState) {
		int startedHere = 0;
		double startTime = now();
		// kick off relocators from items in the queue as need be
		std::set<RelocateData, std::greater<RelocateData>>::iterator it = combined.begin();
		for (; it != combined.end(); it++) {
			// Work on a private copy; fields such as wantsNewServers are adjusted below.
			RelocateData rd(*it);

			// Check if there is an inflight shard that is overlapped with the queued relocateShard (rd)
			bool overlappingInFlight = false;
			auto intersectingInFlight = inFlight.intersectingRanges(rd.keys);
			// (this inner `it` shadows the outer set iterator)
			for (auto it = intersectingInFlight.begin(); it != intersectingInFlight.end(); ++it) {
				// rd is held back when the overlapping in-flight entry is in fetchKeysComplete, has a live
				// actor, is not fully contained in rd.keys, has priority >= rd.priority, and rd's own health
				// priority is below PRIORITY_TEAM_UNHEALTHY.
				if (fetchKeysComplete.count(it->value()) && inFlightActors.liveActorAt(it->range().begin) &&
				    !rd.keys.contains(it->range()) && it->value().priority >= rd.priority &&
				    rd.healthPriority < SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) {

					DebugRelocationTraceEvent("OverlappingInFlight", distributorId)
					    .detail("KeyBegin", it->value().keys.begin)
					    .detail("KeyEnd", it->value().keys.end)
					    .detail("Priority", it->value().priority);

					overlappingInFlight = true;
					break;
				}
			}

			if (overlappingInFlight) {
				ASSERT(!rd.isRestore());
				// logRelocation( rd, "SkippingOverlappingInFlight" );
				continue;
			}

			// Because the busyness of a server is decreased when a superseding relocation is issued, we
			//  need to consider what the busyness of a server WOULD be if the cancellable in-flight relocations
			//  contained in rd.keys were cancelled. (NOTE(review): the original sentence was cut off after "if";
			//  this ending is inferred from the list handed to canLaunchSrc() below — confirm.)
			auto containedRanges = inFlight.containedRanges(rd.keys);
			std::vector<RelocateData> cancellableRelocations;
			for (auto it = containedRanges.begin(); it != containedRanges.end(); ++it) {
				if (it.value().cancellable) {
					cancellableRelocations.push_back(it->value());
				}
			}

			// Data movement avoids overloading source servers in moving data.
			// SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the
			// queue
			// FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY
			if (!rd.isRestore() && !canLaunchSrc(rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations)) {
				// logRelocation( rd, "SkippingQueuedRelocation" );
				continue;
			}

			// From now on, the source servers for the RelocateData rd have enough resource to move the data away,
			// because they do not have too much inflight data movement.

			// logRelocation( rd, "LaunchingRelocation" );
			DebugRelocationTraceEvent(rd.interval.end(), distributorId).detail("Result", "Success");

			// Restores skip the queued-relocation accounting and the per-server queue removal.
			if (!rd.isRestore()) {
				queuedRelocations--;
				TraceEvent(SevVerbose, "QueuedRelocationsChanged")
				    .detail("DataMoveID", rd.dataMoveId)
				    .detail("RandomID", rd.randomId)
				    .detail("Total", queuedRelocations);
				finishRelocation(rd.priority, rd.healthPriority);

				// now we are launching: remove this entry from the queue of all the src servers
				for (int i = 0; i < rd.src.size(); i++) {
					ASSERT(queue[rd.src[i]].erase(rd));
				}
			}

			// With SHARD_ENCODE_LOCATION_METADATA, data moves recorded over rd.keys are cancelled first; the
			// resulting future is handed to every relocator started below.
			Future<Void> fCleanup =
			    SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA ? cancelDataMove(this, rd.keys, ddEnabledState) : Void();

			// If there is a job in flight that wants data relocation which we are about to cancel/modify,
			//     make sure that we keep the relocation intent for the job that we launch
			auto f = inFlight.intersectingRanges(rd.keys);
			for (auto it = f.begin(); it != f.end(); ++it) {
				if (inFlightActors.liveActorAt(it->range().begin)) {
					rd.wantsNewServers |= it->value().wantsNewServers;
				}
			}
			startedHere++;

			// update both inFlightActors and inFlight key range maps, cancelling deleted RelocateShards
			std::vector<KeyRange> ranges;
			inFlightActors.getRangesAffectedByInsertion(rd.keys, ranges);
			inFlightActors.cancel(KeyRangeRef(ranges.front().begin, ranges.back().end));
			inFlight.insert(rd.keys, rd);
			for (int r = 0; r < ranges.size(); r++) {
				RelocateData& rrs = inFlight.rangeContaining(ranges[r].begin)->value();
				rrs.keys = ranges[r];
				// Choose the data move id for this range: a restore keeps the id stored in its dataMove
				// metadata; everything else drops its dataMove and gets an id according to the knobs.
				if (rd.keys == ranges[r] && rd.isRestore()) {
					ASSERT(rd.dataMove != nullptr);
					ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
					rrs.dataMoveId = rd.dataMove->meta.id;
				} else {
					ASSERT_WE_THINK(!rd.isRestore()); // Restored data move should not overlap.
					// TODO(psm): The shard id is determined by DD.
					rrs.dataMove.reset();
					if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
						if (SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
							rrs.dataMoveId = UID();
						} else {
							rrs.dataMoveId = deterministicRandom()->randomUniqueID();
						}
					} else {
						rrs.dataMoveId = anonymousShardId;
					}
				}

				launch(rrs, busymap, singleRegionTeamSize);
				activeRelocations++;
				TraceEvent(SevVerbose, "InFlightRelocationChange")
				    .detail("Launch", rrs.dataMoveId)
				    .detail("Total", activeRelocations);
				startRelocation(rrs.priority, rrs.healthPriority);
				// Start the actor that relocates data in the rrs.keys
				inFlightActors.insert(rrs.keys, dataDistributionRelocator(this, rrs, fCleanup, ddEnabledState));
			}

			// logRelocation( rd, "LaunchedRelocation" );
		}
		// Sampled slow-path warning: only when this call took more than 1ms, and then only with probability
		// 0.001 (hence the "x1000" in the event name).
		if (now() - startTime > .001 && deterministicRandom()->random01() < 0.001)
			TraceEvent(SevWarnAlways, "LaunchingQueueSlowx1000").detail("Elapsed", now() - startTime);

		/*if( startedHere > 0 ) {
		    TraceEvent("StartedDDRelocators", distributorId)
		        .detail("QueueSize", queuedRelocations)
		        .detail("StartedHere", startedHere)
		        .detail("ActiveRelocations", activeRelocations);
		} */

		validate();
	}

	// Returns the largest priority in priority_relocations whose counter is positive, or 0 when there is none
	// (or when every such priority is below 0).
	int getHighestPriorityRelocation() const {
		int best = 0;
		for (auto entry = priority_relocations.begin(); entry != priority_relocations.end(); ++entry) {
			// Priorities with no outstanding relocations do not count.
			if (entry->second <= 0) {
				continue;
			}
			if (best < entry->first) {
				best = entry->first;
			}
		}
		return best;
	}

	// return true if the servers are throttled as source for read rebalance
	bool timeThrottle(const std::vector<UID>& ids) const {
		return std::any_of(ids.begin(), ids.end(), [this](const UID& id) {
			if (this->lastAsSource.count(id)) {
				return (now() - this->lastAsSource.at(id)) * SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM <
				       SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL;
			}
			return false;
		});
	}

	void updateLastAsSource(const std::vector<UID>& ids, double t = now()) {
		for (auto& id : ids)
			lastAsSource[id] = t;
	}

	// Schedules cancellation of a data move.
	//
	// dataMoveId:     id of the data move to clean up.
	// range:          key range under which the cancellation is recorded in dataMoves.
	// ddEnabledState: forwarded unchanged to cleanUpDataMove().
	//
	// If any range of dataMoves intersecting `range` already holds a valid data move, a SevError
	// DDEnqueueCancelledDataMoveConflict event is traced and nothing is enqueued. Otherwise cleanUpDataMove() is
	// started right away, its future is stored as the new entry's `cancel`, and the entry is inserted into
	// dataMoves over `range`.
	//
	// Must not be called when txnProcessor is mocked (asserted).
	void enqueueCancelledDataMove(UID dataMoveId, KeyRange range, const DDEnabledState* ddEnabledState) {
		ASSERT(!txnProcessor->isMocked()); // the mock implementation currently doesn't support data move

		// Refuse to overwrite a data move that is already tracked anywhere inside the range.
		auto f = this->dataMoves.intersectingRanges(range);
		for (auto it = f.begin(); it != f.end(); ++it) {
			if (it->value().isValid()) {
				TraceEvent(SevError, "DDEnqueueCancelledDataMoveConflict", this->distributorId)
				    .detail("DataMoveID", dataMoveId)
				    .detail("CancelledRange", range)
				    .detail("ConflictingDataMoveID", it->value().id)
				    .detail("ConflictingRange", KeyRangeRef(it->range().begin, it->range().end));
				return;
			}
		}

		DDQueue::DDDataMove dataMove(dataMoveId);
		dataMove.cancel = cleanUpDataMove(
		    this->cx, dataMoveId, this->lock, &this->cleanUpDataMoveParallelismLock, range, ddEnabledState);
		this->dataMoves.insert(range, dataMove);
		TraceEvent(SevInfo, "DDEnqueuedCancelledDataMove", this->distributorId)
		    .detail("DataMoveID", dataMoveId)
		    .detail("Range", range);
	}

	// Returns the future produced by recurring() for a callback that traces all of serverCounter and then clears
	// it, using DD_QUEUE_COUNTER_REFRESH_INTERVAL as the interval.
	Future<Void> periodicalRefreshCounter() {
		return recurring(
		    [this]() {
			    serverCounter.traceAll(distributorId);
			    serverCounter.clear();
		    },
		    SERVER_KNOBS->DD_QUEUE_COUNTER_REFRESH_INTERVAL);
	}

	int getUnhealthyRelocationCount() override { return unhealthyRelocations; }

	// Declaration only; the body is not part of this class definition.
	// Yields a SrcDestTeamPair, i.e. a std::pair of two ITeamRef.
	// NOTE(review): presumably .first is the source team obtained with srcReq and .second the destination team
	// obtained with destReq, both from the team collection at teamCollectionIndex — confirm against the
	// definition. traceEvent is a raw pointer; whether it may be null is not visible here.
	Future<SrcDestTeamPair> getSrcDestTeams(const int& teamCollectionIndex,
	                                        const GetTeamRequest& srcReq,
	                                        const GetTeamRequest& destReq,
	                                        const int& priority,
	                                        TraceEvent* traceEvent);

	// Declaration only; the body is not part of this class definition.
	// NOTE(review): presumably attempts to move read load from sourceTeam to destTeam for the given moveReason and
	// resolves to whether a move was issued — confirm against the definition.
	Future<bool> rebalanceReadLoad(DataMovementReason moveReason,
	                               Reference<IDataDistributionTeam> sourceTeam,
	                               Reference<IDataDistributionTeam> destTeam,
	                               bool primary,
	                               TraceEvent* traceEvent);

	// Declaration only; the body is not part of this class definition.
	// Unlike rebalanceReadLoad(), both teams are passed as references to const teams.
	// NOTE(review): presumably attempts to move data from sourceTeam to destTeam for the given moveReason and
	// resolves to whether a move was issued — confirm against the definition.
	Future<bool> rebalanceTeams(DataMovementReason moveReason,
	                            Reference<IDataDistributionTeam const> sourceTeam,
	                            Reference<IDataDistributionTeam const> destTeam,
	                            bool primary,
	                            TraceEvent* traceEvent);
};

// Actor that cancels every valid data move recorded in self->dataMoves that intersects `range`, waits for all
// of those cancellations, and then overwrites the affected span of dataMoves with a default-constructed
// DDDataMove.
//
// self:           queue owning the dataMoves map.
// range:          key range whose data moves are cancelled.
// ddEnabledState: forwarded unchanged to cleanUpDataMove().
ACTOR Future<Void> cancelDataMove(struct DDQueue* self, KeyRange range, const DDEnabledState* ddEnabledState) {
	std::vector<Future<Void>> cleanup;
	auto f = self->dataMoves.intersectingRanges(range);
	for (auto it = f.begin(); it != f.end(); ++it) {
		// Ranges without a valid data move have nothing to cancel.
		if (!it->value().isValid()) {
			continue;
		}
		KeyRange keys = KeyRangeRef(it->range().begin, it->range().end);
		TraceEvent(SevInfo, "DDQueueCancelDataMove", self->distributorId)
		    .detail("DataMoveID", it->value().id)
		    .detail("DataMoveRange", keys)
		    .detail("Range", range);
		// Start the cleanup only if none has been stored for this data move yet; otherwise reuse the stored
		// future.
		if (!it->value().cancel.isValid()) {
			it->value().cancel = cleanUpDataMove(
			    self->cx, it->value().id, self->lock, &self->cleanUpDataMoveParallelismLock, keys, ddEnabledState);
		}
		cleanup.push_back(it->value().cancel);
	}
	wait(waitForAll(cleanup));
	// Reset the whole span reported by getAffectedRangesAfterInsertion(), from the first range's begin to the
	// last range's end, which may be wider than `range` itself.
	auto ranges = self->dataMoves.getAffectedRangesAfterInsertion(range);
	if (!ranges.empty()) {
		self->dataMoves.insert(KeyRangeRef(ranges.front().begin, ranges.back().end), DDQueue::DDDataMove());
	}
	return Void();
}

// Builds a flat, space-separated list of the server ids of every team in `bestTeams`, in order. Each id is
// followed by a single space (so a non-empty result ends with a space); the bool of each pair is ignored.
static std::string destServersString(std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> const& bestTeams) {
	std::string joined;

	for (auto teamIt = bestTeams.begin(); teamIt != bestTeams.end(); ++teamIt) {
		const auto& team = teamIt->first;
		for (const auto& serverId : team->getServerIDs()) {
			joined += serverId.toString();
			joined += " ";
		}
	}

	return joined;
}

// This actor relocates the specified keys to a good place.
// The inFlightActor key range map stores the actor for each RelocateData
//
// Parameters:
//   self           - the owning DDQueue; its maps, counters and streams are read and updated throughout.
//   rd             - the relocation to perform (keys, sources, priority, data move id, ...).
//   prevCleanup    - waited on before touching self->dataMoves when SHARD_ENCODE_LOCATION_METADATA is set.
//   ddEnabledState - forwarded to moveKeys() and cancelDataMove().
//
// Outline:
//   1. Fetch the shard's StorageMetrics.
//   2. Ask every team collection for a destination team. This is retried (after DEST_OVERLOADED_DELAY or
//      BEST_TEAM_STUCK_DELAY) until teams were found, at least one is healthy, and canLaunchDest() accepts them.
//      For a restored data move, more than 50 failed attempts throw data_move_dest_team_not_found.
//   3. Register the destinations (shardsAffectedByTeamFailure, in-flight bytes/read load, destBusymap) and
//      start moveKeys(). While it runs, destination health is polled; dataTransferComplete is signalled at
//      most once, either when the destinations become unhealthy or when the data movement completes.
//   4. On success, report completion and return. If moveKeys() failed with move_to_removed_server, undo the
//      in-flight accounting, wait RETRY_RELOCATESHARD_DELAY and start over from step 2. Any other error is
//      rethrown.
//
// On error (outer catch) dataTransferComplete/relocationComplete are still sent so the queue can release the
// relocation, and the error is forwarded to self->error unless it is actor_cancelled or data_move_cancelled.
ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
                                             RelocateData rd,
                                             Future<Void> prevCleanup,
                                             const DDEnabledState* ddEnabledState) {
	state Promise<Void> errorOut(self->error);
	state TraceInterval relocateShardInterval("RelocateShard", rd.randomId);
	state PromiseStream<RelocateData> dataTransferComplete(self->dataTransferComplete);
	state PromiseStream<RelocateData> relocationComplete(self->relocationComplete);
	state bool signalledTransferComplete = false;
	state UID distributorId = self->distributorId;
	state ParallelTCInfo healthyDestinations;

	state bool anyHealthy = false;
	state bool allHealthy = true;
	state bool anyWithSource = false;
	state bool anyDestOverloaded = false;
	state int destOverloadedCount = 0;
	state int stuckCount = 0;
	state std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> bestTeams;
	state double startTime = now();
	state std::vector<UID> destIds;
	state uint64_t debugID = deterministicRandom()->randomUInt64();

	try {
		// Demote the interval events to SevDebug when another interval was logged less than a second ago,
		// and count how many were suppressed this way.
		if (now() - self->lastInterval < 1.0) {
			relocateShardInterval.severity = SevDebug;
			self->suppressIntervals++;
		}

		TraceEvent(relocateShardInterval.begin(), distributorId)
		    .detail("KeyBegin", rd.keys.begin)
		    .detail("KeyEnd", rd.keys.end)
		    .detail("Priority", rd.priority)
		    .detail("SuppressedEventCount", self->suppressIntervals);

		if (relocateShardInterval.severity != SevDebug) {
			self->lastInterval = now();
			self->suppressIntervals = 0;
		}

		if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
			auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
			ASSERT(inFlightRange.range() == rd.keys);
			ASSERT(inFlightRange.value().randomId == rd.randomId);
			ASSERT(inFlightRange.value().dataMoveId == rd.dataMoveId);
			inFlightRange.value().cancellable = false;

			wait(prevCleanup);

			// Report (but do not act on) any other data move still recorded for these keys.
			auto f = self->dataMoves.intersectingRanges(rd.keys);
			for (auto it = f.begin(); it != f.end(); ++it) {
				KeyRangeRef kr(it->range().begin, it->range().end);
				const UID mId = it->value().id;
				if (mId.isValid() && mId != rd.dataMoveId) {
					TraceEvent("DDRelocatorConflictingDataMove", distributorId)
					    .detail("CurrentDataMoveID", rd.dataMoveId)
					    .detail("DataMoveID", mId)
					    .detail("Range", kr);
				}
			}
			if (rd.isRestore() || !SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
				if (SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
					ASSERT(rd.dataMoveId.isValid());
				}
				self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
			}
		}

		state StorageMetrics metrics =
		    wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(rd.keys))));

		state uint64_t physicalShardIDCandidate = UID().first();
		state bool forceToUseNewPhysicalShard = false;

		ASSERT(rd.src.size());
		loop {
			destOverloadedCount = 0;
			stuckCount = 0;
			state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None;
			// state int bestTeamStuckThreshold = 50;
			loop {
				state int tciIndex = 0;
				state bool foundTeams = true;
				state bool bestTeamReady = false;
				anyHealthy = false;
				allHealthy = true;
				anyWithSource = false;
				anyDestOverloaded = false;
				bestTeams.clear();
				// Get team from teamCollections in different DCs and find the best one
				while (tciIndex < self->teamCollections.size()) {
					if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && rd.isRestore()) {
						// A restored data move already knows its destination servers; ask for exactly that team.
						auto req = GetTeamRequest(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest);
						Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
						    brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
						bestTeamReady = fbestTeam.isReady();
						std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
						if (tciIndex > 0 && !bestTeamReady) {
							// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after
							// getting the destination team or we could miss failure notifications for the storage
							// servers in the destination team
							TraceEvent("BestTeamNotReady")
							    .detail("TeamCollectionIndex", tciIndex)
							    .detail("RestoreDataMoveForDest",
							            describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest));
							retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
							foundTeams = false;
							break;
						}
						if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) {
							retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
							                                   : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
							foundTeams = false;
							break;
						}
						anyHealthy = true;
						bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
					} else {
						double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY;
						if (rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
						    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT)
							inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
						if (rd.healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
						    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
						    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT)
							inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;

						auto req = GetTeamRequest(WantNewServers(rd.wantsNewServers),
						                          WantTrueBest(isValleyFillerPriority(rd.priority)),
						                          PreferLowerDiskUtil::True,
						                          TeamMustHaveShards::False,
						                          ForReadBalance(rd.reason == RelocateReason::REBALANCE_READ),
						                          PreferLowerReadUtil::True,
						                          inflightPenalty);

						req.src = rd.src;
						req.completeSources = rd.completeSources;

						if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
						    tciIndex == 1) {
							ASSERT(physicalShardIDCandidate != UID().first() &&
							       physicalShardIDCandidate != anonymousShardId.first());
							Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
							    self->physicalShardCollection->tryGetAvailableRemoteTeamWith(
							        physicalShardIDCandidate, metrics, debugID);
							if (remoteTeamWithPhysicalShard.present()) {
								// Exists a remoteTeam in the mapping that has the physicalShardIDCandidate
								// use the remoteTeam with the physicalShard as the bestTeam
								req = GetTeamRequest(remoteTeamWithPhysicalShard.get().servers);
							}
						}

						// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
						// server that hosts the relocateData. This is possible, for example, in a fearless
						// configuration when the remote DC is just brought up.
						Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
						    brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
						bestTeamReady = fbestTeam.isReady();
						std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
						if (tciIndex > 0 && !bestTeamReady) {
							// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after
							// getting the destination team or we could miss failure notifications for the storage
							// servers in the destination team
							TraceEvent("BestTeamNotReady");
							retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
							foundTeams = false;
							break;
						}
						// If a DC has no healthy team, we stop checking the other DCs until
						// the unhealthy DC is healthy again or is excluded.
						if (!bestTeam.first.present()) {
							retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
							                                   : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
							foundTeams = false;
							break;
						}
						if (!bestTeam.first.get()->isHealthy()) {
							allHealthy = false;
						} else {
							anyHealthy = true;
						}

						if (bestTeam.second) {
							anyWithSource = true;
						}

						if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
							// critical to the correctness of team selection by PhysicalShardCollection
							// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
							// team Thus, tryGetAvailableRemoteTeamWith() may select an almost full remote team In this
							// case, we must re-select a remote team We set foundTeams = false to avoid finishing team
							// selection Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select
							// a remote team
							if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
								// Must be a floating-point value: storing the ratio in a bool would turn every
								// non-zero ratio into 1, and the comparison against the knob below would then only
								// succeed for a ratio of exactly 0.
								double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
								if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
									retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
									foundTeams = false;
									break;
								}
							}
						}

						if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
							bestTeams.emplace_back(bestTeam.first.get(), true);
							// Always set bestTeams[i].second = true to disable optimization in data move between DCs
							// for the correctness of PhysicalShardCollection
							// Currently, enabling the optimization will break the invariant of PhysicalShardCollection
							// Invariant: once a physical shard is created with a specific set of SSes, this SS set will
							// never get changed.
						} else {
							bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
						}

						// get physicalShardIDCandidate
						if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
						    tciIndex == 0) {
							ASSERT(foundTeams);
							ShardsAffectedByTeamFailure::Team primaryTeam =
							    ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
							physicalShardIDCandidate =
							    self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
							        primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
							ASSERT(physicalShardIDCandidate != UID().first() &&
							       physicalShardIDCandidate != anonymousShardId.first());
						}
					}
					tciIndex++;
				}

				// critical to the correctness of team selection by PhysicalShardCollection
				// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary team
				// Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team
				// In this case, we must re-select a remote team
				// We set foundTeams = false to avoid finishing team selection
				// Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select a remote team
				if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
				    bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
					if (!bestTeams[1].first->isHealthy()) {
						retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
						foundTeams = false;
					}
				}

				// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
				// already
				anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);

				if (foundTeams && anyHealthy && !anyDestOverloaded) {
					ASSERT(rd.completeDests.empty());
					break;
				}

				if (anyDestOverloaded) {
					CODE_PROBE(true, "Destination overloaded throttled move");
					destOverloadedCount++;
					TraceEvent(destOverloadedCount > 50 ? SevInfo : SevDebug, "DestSSBusy", distributorId)
					    .suppressFor(1.0)
					    .detail("StuckCount", stuckCount)
					    .detail("DestOverloadedCount", destOverloadedCount)
					    .detail("TeamCollectionId", tciIndex)
					    .detail("AnyDestOverloaded", anyDestOverloaded)
					    .detail("NumOfTeamCollections", self->teamCollections.size())
					    .detail("Servers", destServersString(bestTeams));
					if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
						if (rd.isRestore() && destOverloadedCount > 50) {
							throw data_move_dest_team_not_found();
						}
					}
					wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch));
				} else {
					CODE_PROBE(true, "did not find a healthy destination team on the first attempt");
					stuckCount++;
					TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId)
					    .suppressFor(1.0)
					    .detail("StuckCount", stuckCount)
					    .detail("DestOverloadedCount", destOverloadedCount)
					    .detail("TeamCollectionId", tciIndex)
					    .detail("AnyDestOverloaded", anyDestOverloaded)
					    .detail("NumOfTeamCollections", self->teamCollections.size());
					if (rd.isRestore() && stuckCount > 50) {
						throw data_move_dest_team_not_found();
					}
					wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch));
				}
				// When forceToUseNewPhysicalShard = false, we get paired primary team and remote team
				// However, this may be failed
				// Any retry triggers to use new physicalShard which enters the normal routine
				if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
					forceToUseNewPhysicalShard = true;
				}

				// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
			}

			if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
				if (!rd.isRestore()) {
					// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
					// thus, update the physicalShardIDCandidate to related data structures
					ASSERT(physicalShardIDCandidate != UID().first());
					if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
						self->moveReusePhysicalShard++;
					} else {
						self->moveCreateNewPhysicalShard++;
						if (retryFindDstReason == DDQueue::RetryFindDstReason::None) {
							// When creating a new physical shard, but the reason is none, this can only happen when
							// determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical
							// shard.
							self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
						} else {
							self->retryFindDstReasonCount[retryFindDstReason]++;
						}
					}
					rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
					auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
					inFlightRange.value().dataMoveId = rd.dataMoveId;
					auto f = self->dataMoves.intersectingRanges(rd.keys);
					for (auto it = f.begin(); it != f.end(); ++it) {
						KeyRangeRef kr(it->range().begin, it->range().end);
						const UID mId = it->value().id;
						if (mId.isValid() && mId != rd.dataMoveId) {
							TraceEvent("DDRelocatorConflictingDataMoveAfterGetTeam", distributorId)
							    .detail("CurrentDataMoveID", rd.dataMoveId)
							    .detail("DataMoveID", mId)
							    .detail("Range", kr);
						}
					}
					self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
				}
				ASSERT(rd.dataMoveId.first() != UID().first());
				auto dataMoveRange = self->dataMoves.rangeContaining(rd.keys.begin);
				ASSERT(dataMoveRange.value().id == rd.dataMoveId);
			}

			// set cancellable to false on inFlight's entry for this key range
			auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
			ASSERT(inFlightRange.range() == rd.keys);
			ASSERT(inFlightRange.value().randomId == rd.randomId);
			inFlightRange.value().cancellable = false;

			destIds.clear();
			state std::vector<UID> healthyIds;
			state std::vector<UID> extraIds;
			state std::vector<ShardsAffectedByTeamFailure::Team> destinationTeams;

			for (int i = 0; i < bestTeams.size(); i++) {
				auto& serverIds = bestTeams[i].first->getServerIDs();
				destinationTeams.push_back(ShardsAffectedByTeamFailure::Team(serverIds, i == 0));

				// TODO(psm): Make DataMoveMetaData aware of the two-step data move optimization.
				if (allHealthy && anyWithSource && !bestTeams[i].second) {
					// When all servers in bestTeams[i] do not hold the shard (!bestTeams[i].second), it indicates
					// the bestTeams[i] is in a new DC where data has not been replicated to.
					// To move data (specified in RelocateShard) to bestTeams[i] in the new DC AND reduce data movement
					// across DC, we randomly choose a server in bestTeams[i] as the shard's destination, and
					// move the shard to the randomly chosen server (in the remote DC), which will later
					// propagate its data to the servers in the same team. This saves data movement bandwidth across DC
					int idx = deterministicRandom()->randomInt(0, serverIds.size());
					destIds.push_back(serverIds[idx]);
					healthyIds.push_back(serverIds[idx]);
					for (int j = 0; j < serverIds.size(); j++) {
						if (j != idx) {
							extraIds.push_back(serverIds[j]);
						}
					}
					healthyDestinations.addTeam(bestTeams[i].first);
				} else {
					destIds.insert(destIds.end(), serverIds.begin(), serverIds.end());
					if (bestTeams[i].first->isHealthy()) {
						healthyIds.insert(healthyIds.end(), serverIds.begin(), serverIds.end());
						healthyDestinations.addTeam(bestTeams[i].first);
					}
				}
			}

			// Sanity check
			state int totalIds = 0;
			for (auto& destTeam : destinationTeams) {
				totalIds += destTeam.servers.size();
			}
			if (totalIds != self->teamSize) {
				TraceEvent(SevWarn, "IncorrectDestTeamSize")
				    .suppressFor(1.0)
				    .detail("ExpectedTeamSize", self->teamSize)
				    .detail("DestTeamSize", totalIds);
			}

			if (!rd.isRestore()) {
				self->shardsAffectedByTeamFailure->moveShard(rd.keys, destinationTeams);
			}

			// FIXME: do not add data in flight to servers that were already in the src.
			healthyDestinations.addDataInFlightToTeam(+metrics.bytes);
			healthyDestinations.addReadInFlightToTeam(+metrics.bytesReadPerKSecond);

			launchDest(rd, bestTeams, self->destBusymap);

			if (SERVER_KNOBS->DD_ENABLE_VERBOSE_TRACING) {
				// StorageMetrics is the rd shard's metrics, e.g., bytes and write bandwidth
				TraceEvent(SevInfo, "RelocateShardDecision", distributorId)
				    .detail("PairId", relocateShardInterval.pairID)
				    .detail("Priority", rd.priority)
				    .detail("KeyBegin", rd.keys.begin)
				    .detail("KeyEnd", rd.keys.end)
				    .detail("StorageMetrics", metrics.toString())
				    .detail("SourceServers", describe(rd.src))
				    .detail("DestinationTeam", describe(destIds))
				    .detail("ExtraIds", describe(extraIds));
			} else {
				TraceEvent(relocateShardInterval.severity, "RelocateShardHasDestination", distributorId)
				    .detail("PairId", relocateShardInterval.pairID)
				    .detail("Priority", rd.priority)
				    .detail("KeyBegin", rd.keys.begin)
				    .detail("KeyEnd", rd.keys.end)
				    .detail("SourceServers", describe(rd.src))
				    .detail("DestinationTeam", describe(destIds))
				    .detail("ExtraIds", describe(extraIds));
			}

			self->serverCounter.increaseForTeam(rd.src, rd.reason, DDQueue::ServerCounter::LaunchedSource);
			self->serverCounter.increaseForTeam(destIds, rd.reason, DDQueue::ServerCounter::LaunchedDest);
			self->serverCounter.increaseForTeam(extraIds, rd.reason, DDQueue::ServerCounter::LaunchedDest);

			state Error error = success();
			state Promise<Void> dataMovementComplete;
			// Move keys from source to destination by changing the serverKeyList and keyServerList system keys
			state Future<Void> doMoveKeys =
			    self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId,
			                                                 rd.keys,
			                                                 destIds,
			                                                 healthyIds,
			                                                 self->lock,
			                                                 dataMovementComplete,
			                                                 &self->startMoveKeysParallelismLock,
			                                                 &self->finishMoveKeysParallelismLock,
			                                                 self->teamCollections.size() > 1,
			                                                 relocateShardInterval.pairID,
			                                                 ddEnabledState,
			                                                 CancelConflictingDataMoves::False });
			state Future<Void> pollHealth =
			    signalledTransferComplete ? Never()
			                              : delay(SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch);
			try {
				loop {
					choose {
						when(wait(doMoveKeys)) {
							if (extraIds.size()) {
								// First step of the two-step move is done (one server per new-DC team); now issue a
								// second moveKeys that includes the remaining servers of those teams.
								destIds.insert(destIds.end(), extraIds.begin(), extraIds.end());
								healthyIds.insert(healthyIds.end(), extraIds.begin(), extraIds.end());
								extraIds.clear();
								ASSERT(totalIds == destIds.size()); // Sanity check the destIDs before we move keys
								doMoveKeys =
								    self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId,
								                                                 rd.keys,
								                                                 destIds,
								                                                 healthyIds,
								                                                 self->lock,
								                                                 Promise<Void>(),
								                                                 &self->startMoveKeysParallelismLock,
								                                                 &self->finishMoveKeysParallelismLock,
								                                                 self->teamCollections.size() > 1,
								                                                 relocateShardInterval.pairID,
								                                                 ddEnabledState,
								                                                 CancelConflictingDataMoves::False });
							} else {
								self->fetchKeysComplete.insert(rd);
								if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
									// Clear the dataMoves entry only if it still describes exactly this data move and
									// no cancellation has been started for it.
									auto ranges = self->dataMoves.getAffectedRangesAfterInsertion(rd.keys);
									if (ranges.size() == 1 && static_cast<KeyRange>(ranges[0]) == rd.keys &&
									    ranges[0].value.id == rd.dataMoveId && !ranges[0].value.cancel.isValid()) {
										self->dataMoves.insert(rd.keys, DDQueue::DDDataMove());
										TraceEvent(SevVerbose, "DequeueDataMoveOnSuccess", self->distributorId)
										    .detail("DataMoveID", rd.dataMoveId)
										    .detail("DataMoveRange", rd.keys);
									}
								}
								break;
							}
						}
						when(wait(pollHealth)) {
							if (!healthyDestinations.isHealthy()) {
								if (!signalledTransferComplete) {
									signalledTransferComplete = true;
									self->dataTransferComplete.send(rd);
								}
							}
							pollHealth = signalledTransferComplete ? Never()
							                                       : delay(SERVER_KNOBS->HEALTH_POLL_TIME,
							                                               TaskPriority::DataDistributionLaunch);
						}
						when(wait(signalledTransferComplete ? Never() : dataMovementComplete.getFuture())) {
							self->fetchKeysComplete.insert(rd);
							if (!signalledTransferComplete) {
								signalledTransferComplete = true;
								self->dataTransferComplete.send(rd);
							}
						}
					}
				}
			} catch (Error& e) {
				// Remember the error; it is handled below, outside the catch block.
				error = e;
			}

			//TraceEvent("RelocateShardFinished", distributorId).detail("RelocateId", relocateShardInterval.pairID);

			if (error.code() != error_code_move_to_removed_server) {
				if (!error.code()) {
					try {
						wait(healthyDestinations
						         .updateStorageMetrics()); // prevent a gap between the polling for an increase in
						                                   // storage metrics and decrementing data in flight
					} catch (Error& e) {
						error = e;
					}
				}

				healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
				auto readLoad = metrics.bytesReadPerKSecond;
				// Note: It's equal to trigger([healthyDestinations, readLoad], which is a value capture of
				// healthyDestinations. Have to create a reference to healthyDestinations because in ACTOR the state
				// variable is actually a member variable, I can't write trigger([healthyDestinations, readLoad]
				// directly.
				auto& destinationRef = healthyDestinations;
				self->noErrorActors.add(
				    trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); },
				            delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL)));

				// onFinished.send( rs );
				if (!error.code()) {
					TraceEvent(relocateShardInterval.end(), distributorId)
					    .detail("Duration", now() - startTime)
					    .detail("Result", "Success");
					if (now() - startTime > 600) {
						TraceEvent(SevWarnAlways, "RelocateShardTooLong")
						    .detail("Duration", now() - startTime)
						    .detail("Dest", describe(destIds))
						    .detail("Src", describe(rd.src));
					}
					if (rd.keys.begin == keyServersPrefix) {
						TraceEvent("MovedKeyServerKeys")
						    .detail("Dest", describe(destIds))
						    .trackLatest(self->movedKeyServersEventHolder->trackingKey);
					}

					if (!signalledTransferComplete) {
						signalledTransferComplete = true;
						dataTransferComplete.send(rd);
					}

					self->bytesWritten += metrics.bytes;
					self->shardsAffectedByTeamFailure->finishMove(rd.keys);
					relocationComplete.send(rd);

					if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
						// update physical shard collection
						std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
						for (int i = 0; i < bestTeams.size(); i++) {
							auto serverIds = bestTeams[i].first->getServerIDs();
							selectedTeams.push_back(ShardsAffectedByTeamFailure::Team(serverIds, i == 0));
						}
						// The update of PhysicalShardToTeams, PhysicalShardInstances, keyRangePhysicalShardIDMap should
						// be atomic
						self->physicalShardCollection->updatePhysicalShardCollection(
						    rd.keys, rd.isRestore(), selectedTeams, rd.dataMoveId.first(), metrics, debugID);
					}

					return Void();
				} else {
					throw error;
				}
			} else {
				// A destination server was removed while moving: undo the in-flight accounting and the
				// destination busyness, then retry team selection after a delay.
				CODE_PROBE(true, "move to removed server", probe::decoration::rare);
				healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
				auto readLoad = metrics.bytesReadPerKSecond;
				auto& destinationRef = healthyDestinations;
				self->noErrorActors.add(
				    trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); },
				            delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL)));

				completeDest(rd, self->destBusymap);
				rd.completeDests.clear();

				wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch));
			}
		}
	} catch (Error& e) {
		state Error err = e;
		TraceEvent(relocateShardInterval.end(), distributorId)
		    .errorUnsuppressed(err)
		    .detail("Duration", now() - startTime);
		if (now() - startTime > 600) {
			TraceEvent(SevWarnAlways, "RelocateShardTooLong")
			    .errorUnsuppressed(err)
			    .detail("Duration", now() - startTime)
			    .detail("Dest", describe(destIds))
			    .detail("Src", describe(rd.src));
		}
		// Always notify the queue, even on failure, so that it can release this relocation.
		if (!signalledTransferComplete)
			dataTransferComplete.send(rd);

		relocationComplete.send(rd);

		if (err.code() == error_code_data_move_dest_team_not_found) {
			wait(cancelDataMove(self, rd.keys, ddEnabledState));
		}

		// Cancellation is not reported as a queue error.
		if (err.code() != error_code_actor_cancelled && err.code() != error_code_data_move_cancelled) {
			if (errorOut.canBeSet()) {
				errorOut.sendError(err);
			}
		}
		throw err;
	}
}

// Returns the highest cpuUsage found in metrics.storageStats for the servers listed in `ids` (0 when `ids` is
// empty). As soon as a server without an entry is encountered, the result is raised to at least 100.0 and the
// remaining ids are not examined.
inline double getWorstCpu(const HealthMetrics& metrics, const std::vector<UID>& ids) {
	double worst = 0;
	for (const auto& serverId : ids) {
		const auto statsIt = metrics.storageStats.find(serverId);
		if (statsIt == metrics.storageStats.end()) {
			// assume the server is too busy to report its stats
			return std::max(worst, 100.0);
		}
		worst = std::max(worst, statsIt->second.cpuUsage);
	}
	return worst;
}

// Move the shard with the top K highest read density of sourceTeam's to destTeam if sourceTeam has much more read load
// than destTeam
//
// Returns true when a RelocateShard request was sent to self->output, false otherwise. Whenever the move is
// skipped, the reason is attached to *traceEvent as "SkipReason".
ACTOR Future<bool> rebalanceReadLoad(DDQueue* self,
                                     DataMovementReason moveReason,
                                     Reference<IDataDistributionTeam> sourceTeam,
                                     Reference<IDataDistributionTeam> destTeam,
                                     bool primary,
                                     TraceEvent* traceEvent) {
	if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
		traceEvent->detail("CancelingDueToSimulationSpeedup", true);
		return false;
	}

	state std::vector<KeyRange> sourceShards = self->shardsAffectedByTeamFailure->getShardsFor(
	    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
	traceEvent->detail("ShardsInSource", sourceShards.size());

	// With a single (hot) shard left on the source, moving it elsewhere would not reduce the read load problem.
	// TODO: This situation should be solved by split and merge
	if (sourceShards.size() <= 1) {
		traceEvent->detail("SkipReason", "NoShardOnSource");
		return false;
	}

	// Check lastAsSource, at most SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM shards can be moved within a sample
	// period. Sampled metrics take time to catch up after a shard is moved, so the cadence of movement is
	// limited here to avoid churn from decisions based on out-of-date samples.
	if (self->timeThrottle(sourceTeam->getServerIDs())) {
		traceEvent->detail("SkipReason", "SourceTeamThrottle");
		return false;
	}

	// Compare the read bandwidth of both teams.
	auto srcLoad = sourceTeam->getLoadReadBandwidth(false);
	auto destLoad = destTeam->getLoadReadBandwidth();
	traceEvent->detail("SrcReadBandwidth", srcLoad).detail("DestReadBandwidth", destLoad);

	// Skip when the destination already carries at least (1 - READ_REBALANCE_DIFF_FRAC) of the source's load.
	if ((1.0 - SERVER_KNOBS->READ_REBALANCE_DIFF_FRAC) * srcLoad <= destLoad) {
		traceEvent->detail("SkipReason", "TeamTooSimilar");
		return false;
	}

	// Request metrics for the top K shards: 10% of the source's shards, capped by the knob, but at least one.
	int topK = std::max(1, std::min(int(0.1 * sourceShards.size()), SERVER_KNOBS->READ_REBALANCE_SHARD_TOPK));
	state Future<HealthMetrics> healthMetricsFuture = self->txnProcessor->getHealthMetrics(true);
	state GetTopKMetricsRequest topKRequest(sourceShards,
	                                        topK,
	                                        (srcLoad - destLoad) * SERVER_KNOBS->READ_REBALANCE_MAX_SHARD_FRAC,
	                                        srcLoad / sourceShards.size());
	state GetTopKMetricsReply topKReply = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(topKRequest)));
	wait(ready(healthMetricsFuture));

	auto worstSrcCpu = getWorstCpu(healthMetricsFuture.get(), sourceTeam->getServerIDs());
	if (worstSrcCpu < SERVER_KNOBS->READ_REBALANCE_CPU_THRESHOLD) { // 15.0 +- (0.3 * 15) < 20.0
		traceEvent->detail("SkipReason", "LowReadLoad").detail("WorstSrcCpu", worstSrcCpu);
		return false;
	}

	auto& candidates = topKReply.shardMetrics;
	// NOTE: randomize is important here since we don't want to always push the same shard into the queue
	deterministicRandom()->randomShuffle(candidates);
	traceEvent->detail("MinReadLoad", topKReply.minReadLoad).detail("MaxReadLoad", topKReply.maxReadLoad);

	if (candidates.empty()) {
		traceEvent->detail("SkipReason", "NoEligibleShards");
		return false;
	}

	auto& [shard, shardMetrics] = candidates[0];
	traceEvent->detail("ShardReadBandwidth", shardMetrics.bytesReadPerKSecond);

	// Re-read the source team's shards and only queue the move if the chosen shard is still among them.
	sourceShards = self->shardsAffectedByTeamFailure->getShardsFor(
	    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
	for (const auto& currentShard : sourceShards) {
		if (!(shard == currentShard)) {
			continue;
		}

		UID traceId = deterministicRandom()->randomUniqueID();
		self->output.send(RelocateShard(shard, moveReason, RelocateReason::REBALANCE_READ, traceId));
		traceEvent->detail("TraceId", traceId);

		auto sourceServerIds = sourceTeam->getServerIDs();
		self->updateLastAsSource(sourceServerIds);

		self->serverCounter.increaseForTeam(
		    sourceServerIds, RelocateReason::REBALANCE_READ, DDQueue::ServerCounter::ProposedSource);
		return true;
	}

	traceEvent->detail("SkipReason", "ShardNotPresent");
	return false;
}

// Disk rebalancing: samples shards currently assigned to sourceTeam and, if the source holds
// sufficiently more bytes than destTeam, sends a REBALANCE_DISK RelocateShard for the sampled shard.
// Returns true only when a relocation was sent to self->output. Every other exit returns false after
// recording why on traceEvent ("SkipReason", or "CancelingDueToSimulationSpeedup" in simulation).
ACTOR static Future<bool> rebalanceTeams(DDQueue* self,
                                         DataMovementReason moveReason,
                                         Reference<IDataDistributionTeam const> sourceTeam,
                                         Reference<IDataDistributionTeam const> destTeam,
                                         bool primary,
                                         TraceEvent* traceEvent) {
	if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
		traceEvent->detail("CancelingDueToSimulationSpeedup", true);
		return false;
	}

	// Ask for the average shard size through the getAverageShardBytes stream.
	Promise<int64_t> avgBytesPromise;
	self->getAverageShardBytes.send(avgBytesPromise);

	state int64_t averageShardBytes = wait(avgBytesPromise.getFuture());
	state std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
	    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));

	traceEvent->detail("AverageShardBytes", averageShardBytes).detail("ShardsInSource", shards.size());

	if (shards.empty()) {
		traceEvent->detail("SkipReason", "NoShardOnSource");
		return false;
	}

	// Sample up to REBALANCE_MAX_RETRIES random shards, remembering the largest one seen so far.
	// Sampling stops early as soon as the remembered shard is bigger than the average shard.
	state KeyRange moveShard;
	state StorageMetrics metrics;
	state int retries = 0;
	while (retries < SERVER_KNOBS->REBALANCE_MAX_RETRIES) {
		state KeyRange testShard = deterministicRandom()->randomChoice(shards);
		StorageMetrics testMetrics =
		    wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(testShard))));
		bool largerThanBest = testMetrics.bytes > metrics.bytes;
		if (largerThanBest) {
			moveShard = testShard;
			metrics = testMetrics;
		}
		if (largerThanBest && metrics.bytes > averageShardBytes) {
			break;
		}
		++retries;
	}

	int64_t sourceBytes = sourceTeam->getLoadBytes(false);
	int64_t destBytes = destTeam->getLoadBytes();

	// The teams count as "too similar" unless the source exceeds the destination by more than three
	// times max(MIN_SHARD_BYTES, size of the chosen shard).
	int64_t minimumGap = 3 * std::max<int64_t>(SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes);
	bool sourceAndDestTooSimilar = sourceBytes - destBytes <= minimumGap;
	traceEvent->detail("SourceBytes", sourceBytes)
	    .detail("DestBytes", destBytes)
	    .detail("ShardBytes", metrics.bytes)
	    .detail("SourceAndDestTooSimilar", sourceAndDestTooSimilar);

	if (sourceAndDestTooSimilar) {
		traceEvent->detail("SkipReason", "TeamTooSimilar");
		return false;
	}
	if (metrics.bytes == 0) {
		traceEvent->detail("SkipReason", "ShardZeroSize");
		return false;
	}

	// Re-read the source team's shard list: the chosen shard must still be listed for the team.
	shards = self->shardsAffectedByTeamFailure->getShardsFor(
	    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
	bool stillOnSource = false;
	for (const auto& current : shards) {
		if (moveShard == current) {
			stillOnSource = true;
			break;
		}
	}

	if (!stillOnSource) {
		traceEvent->detail("SkipReason", "ShardNotPresent");
		return false;
	}

	UID traceId = deterministicRandom()->randomUniqueID();
	self->output.send(RelocateShard(moveShard, moveReason, RelocateReason::REBALANCE_DISK, traceId));
	traceEvent->detail("TraceId", traceId);

	self->serverCounter.increaseForTeam(
	    sourceTeam->getServerIDs(), RelocateReason::REBALANCE_DISK, DDQueue::ServerCounter::ProposedSource);
	return true;
}

// Requests a destination team (destReq) from team collection `teamCollectionIndex`, and only when one
// is returned requests a source team (srcReq) from the same collection. Each reply is recorded on
// traceEvent ("DestTeam" / "SourceTeam").
// Returns (source, destination) when both teams are present; otherwise a default-constructed pair.
ACTOR Future<SrcDestTeamPair> getSrcDestTeams(DDQueue* self,
                                              int teamCollectionIndex,
                                              GetTeamRequest srcReq,
                                              GetTeamRequest destReq,
                                              int priority,
                                              TraceEvent* traceEvent) {

	state std::pair<Optional<ITeamRef>, bool> destPick =
	    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(destReq)));
	traceEvent->detail(
	    "DestTeam", printable(destPick.first.map<std::string>([](const ITeamRef& t) { return t->getDesc(); })));

	// No destination: the source team is never requested.
	if (!destPick.first.present()) {
		return {};
	}

	state std::pair<Optional<ITeamRef>, bool> srcPick =
	    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(srcReq)));
	traceEvent->detail(
	    "SourceTeam", printable(srcPick.first.map<std::string>([](const ITeamRef& t) { return t->getDesc(); })));

	if (!srcPick.first.present()) {
		return {};
	}

	return std::make_pair(srcPick.first.get(), destPick.first.get());
}

// Member entry point for the file-level getSrcDestTeams actor; this queue is passed as the actor's `self`.
Future<SrcDestTeamPair> DDQueue::getSrcDestTeams(const int& teamCollectionIndex,
                                                 const GetTeamRequest& srcReq,
                                                 const GetTeamRequest& destReq,
                                                 const int& priority,
                                                 TraceEvent* traceEvent) {
	Future<SrcDestTeamPair> teams =
	    ::getSrcDestTeams(this, teamCollectionIndex, srcReq, destReq, priority, traceEvent);
	return teams;
}
// Member entry point for the file-level rebalanceReadLoad actor; this queue is passed as the actor's `self`.
Future<bool> DDQueue::rebalanceReadLoad(DataMovementReason moveReason,
                                        Reference<IDataDistributionTeam> sourceTeam,
                                        Reference<IDataDistributionTeam> destTeam,
                                        bool primary,
                                        TraceEvent* traceEvent) {
	Future<bool> moved = ::rebalanceReadLoad(this, moveReason, sourceTeam, destTeam, primary, traceEvent);
	return moved;
}

// Member entry point for the file-level rebalanceTeams actor; this queue is passed as the actor's `self`.
Future<bool> DDQueue::rebalanceTeams(DataMovementReason moveReason,
                                     Reference<const IDataDistributionTeam> sourceTeam,
                                     Reference<const IDataDistributionTeam> destTeam,
                                     bool primary,
                                     TraceEvent* traceEvent) {
	Future<bool> moved = ::rebalanceTeams(this, moveReason, sourceTeam, destTeam, primary, traceEvent);
	return moved;
}

// Reads the rebalance "DD ignore" key through txnProcessor and reports whether the current
// rebalance round should be skipped.
//   - key absent                  -> false (do not skip)
//   - value is empty or "on"      -> true  (skip)
//   - otherwise the value is decoded as a uint8_t bit mask and the REBALANCE_READ bit
//     (readRebalance == true) or the REBALANCE_DISK bit (readRebalance == false) decides.
ACTOR Future<bool> getSkipRebalanceValue(Reference<IDDTxnProcessor> txnProcessor, bool readRebalance) {
	Optional<Value> ignoreValue = wait(txnProcessor->readRebalanceDDIgnoreKey());

	if (!ignoreValue.present()) {
		return false;
	}

	// NOTE: the special values "" and "on" might have been written by an old version (< 7.2)
	if (ignoreValue.get().size() == 0 || ignoreValue.get() == "on"_sr) {
		return true;
	}

	int ddIgnore = BinaryReader::fromStringRef<uint8_t>(ignoreValue.get(), Unversioned());
	if (readRebalance) {
		return (ddIgnore & DDIgnore::REBALANCE_READ) > 0;
	}
	return (ddIgnore & DDIgnore::REBALANCE_DISK) > 0;
}

// Background rebalancing loop for one team collection and one DataMovementReason.
// Each iteration waits `rebalancePollingInterval`, checks whether rebalancing is currently disabled
// (see getSkipRebalanceValue), and otherwise, if fewer than DD_REBALANCE_PARALLELISM relocations
// are queued at this reason's priority, picks a source/destination team pair and attempts one move via
// rebalanceReadLoad (read reasons) or rebalanceTeams (disk reasons).
// The loop has no normal exit; errors are attached to the trace event and rethrown.
ACTOR Future<Void> BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, DataMovementReason reason) {
	// Number of consecutive attempts (within the parallelism limit) that did not move a shard.
	state int resetCount = 0;
	// Time of the last read of the skip switch; it is re-read at most once per
	// BG_REBALANCE_SWITCH_CHECK_INTERVAL.
	state double lastRead = 0;
	state bool skipCurrentLoop = false;
	state const bool readRebalance = isDataMovementForReadBalancing(reason);
	state const char* eventName = isDataMovementForMountainChopper(reason) ? "BgDDMountainChopper" : "BgDDValleyFiller";
	state int ddPriority = dataMovementPriority(reason);
	// Starts at 0, so the first iteration's delay below is a zero-length delay.
	state double rebalancePollingInterval = 0;

	loop {
		state bool moved = false;
		state Reference<IDataDistributionTeam> sourceTeam;
		state Reference<IDataDistributionTeam> destTeam;
		state GetTeamRequest srcReq;
		state GetTeamRequest destReq;
		state TraceEvent traceEvent(eventName, self->distributorId);
		traceEvent.suppressFor(5.0)
		    .detail("PollingInterval", rebalancePollingInterval)
		    .detail("Rebalance", readRebalance ? "Read" : "Disk");

		// NOTE: the DD throttling relies on DDQueue, so here just trigger the balancer periodically
		wait(delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch));
		try {
			if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) {
				wait(store(skipCurrentLoop, getSkipRebalanceValue(self->txnProcessor, readRebalance)));
				lastRead = now();
			}
			traceEvent.detail("Enabled", !skipCurrentLoop);

			if (skipCurrentLoop) {
				// While disabled, poll no more often than the switch-check interval. The `continue`
				// restarts the loop without reaching the "Moved" detail and log() call below.
				rebalancePollingInterval =
				    std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL);
				continue;
			} else {
				rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL;
			}

			traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]);

			if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
				// The source request asks for the "true best" team for mountain-chopper reasons;
				// the destination request asks for it for the other (valley-filler) reasons.
				bool mcMove = isDataMovementForMountainChopper(reason);
				srcReq = GetTeamRequest(WantNewServers::True,
				                        WantTrueBest(mcMove),
				                        PreferLowerDiskUtil::False,
				                        TeamMustHaveShards::True,
				                        ForReadBalance(readRebalance),
				                        PreferLowerReadUtil::False);
				destReq = GetTeamRequest(WantNewServers::True,
				                         WantTrueBest(!mcMove),
				                         PreferLowerDiskUtil::True,
				                         TeamMustHaveShards::False,
				                         ForReadBalance(readRebalance),
				                         PreferLowerReadUtil::True);
				state Future<SrcDestTeamPair> getTeamFuture =
				    self->getSrcDestTeams(teamCollectionIndex, srcReq, destReq, ddPriority, &traceEvent);
				wait(ready(getTeamFuture));
				sourceTeam = getTeamFuture.get().first;
				destTeam = getTeamFuture.get().second;

				// Team collection index 0 is passed on as `primary == true`.
				// clang-format off
				if (sourceTeam.isValid() && destTeam.isValid()) {
					if (readRebalance) {
						wait(store(moved,self->rebalanceReadLoad( reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
					} else {
						wait(store(moved,self->rebalanceTeams( reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
					}
				}
				// clang-format on
				// Reset the counter after a successful move, otherwise count one more miss.
				moved ? resetCount = 0 : resetCount++;
			}

			traceEvent.detail("ResetCount", resetCount);
		} catch (Error& e) {
			// Log actor_cancelled because it's not legal to suppress an event that's initialized
			traceEvent.errorUnsuppressed(e);
			throw;
		}

		traceEvent.detail("Moved", moved);
		traceEvent.log();
	}
}

// Main actor of the data distribution relocation queue.
// It builds a DDQueue (`self`) from the arguments, starts the background rebalancing actors
// (BgDDLoadRebalance) for every team collection plus a few helper actors, and then runs an event
// loop that:
//   - queues, restores or cancels relocations arriving on self.input,
//   - launches queued work when source fetches, data transfers or whole relocations complete,
//   - periodically emits the "MovingData" trace event (and "PhysicalShardMoveStats" when enabled),
//   - answers getUnhealthyRelocationCount requests,
//   - propagates errors from self.error and from the background futures.
// The loop has no normal exit. Any error is rethrown; it is additionally logged at SevError unless
// its code is broken_promise, movekeys_conflict, data_move_cancelled or data_move_dest_team_not_found.
ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
                                         PromiseStream<RelocateShard> output,
                                         FutureStream<RelocateShard> input,
                                         PromiseStream<GetMetricsRequest> getShardMetrics,
                                         PromiseStream<GetTopKMetricsRequest> getTopKMetrics,
                                         Reference<AsyncVar<bool>> processingUnhealthy,
                                         Reference<AsyncVar<bool>> processingWiggle,
                                         std::vector<TeamCollectionInterface> teamCollections,
                                         Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
                                         Reference<PhysicalShardCollection> physicalShardCollection,
                                         MoveKeysLock lock,
                                         PromiseStream<Promise<int64_t>> getAverageShardBytes,
                                         FutureStream<Promise<int>> getUnhealthyRelocationCount,
                                         UID distributorId,
                                         int teamSize,
                                         int singleRegionTeamSize,
                                         const DDEnabledState* ddEnabledState) {
	state DDQueue self(distributorId,
	                   lock,
	                   db,
	                   teamCollections,
	                   shardsAffectedByTeamFailure,
	                   physicalShardCollection,
	                   getAverageShardBytes,
	                   teamSize,
	                   singleRegionTeamSize,
	                   output,
	                   input,
	                   getShardMetrics,
	                   getTopKMetrics);
	// Pending launch triggers: each is set by one `when` branch below and consumed either at the top of
	// the next loop iteration (launchData, keysToLaunchFrom) or by the launchQueuedWorkTimeout branch
	// (serversToLaunchFrom).
	state std::set<UID> serversToLaunchFrom;
	state KeyRange keysToLaunchFrom;
	state RelocateData launchData;
	state Future<Void> recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL);

	state std::vector<Future<Void>> ddQueueFutures;

	state PromiseStream<KeyRange> rangesComplete;
	state Future<Void> launchQueuedWorkTimeout = Never();

	// Per team collection: two disk rebalancers always, plus two read rebalancers when read sampling
	// is enabled.
	for (int i = 0; i < teamCollections.size(); i++) {
		ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_OVERUTILIZED_TEAM));
		ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM));
		if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
			ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM));
			ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM));
		}
	}
	// NOTE(review): presumably these forward self.rawProcessing* into the caller-provided AsyncVars;
	// delayedAsyncVar is defined elsewhere -- confirm.
	ddQueueFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0));
	ddQueueFutures.push_back(delayedAsyncVar(self.rawProcessingWiggle, processingWiggle, 0));
	ddQueueFutures.push_back(self.periodicalRefreshCounter());

	try {
		loop {
			self.validate();

			// For the given servers that caused us to go around the loop, find the next item(s) that can be
			// launched.
			if (launchData.startTime != -1) {
				// Launch dataDistributionRelocator actor to relocate the launchData
				self.launchQueuedWork(launchData, ddEnabledState);
				launchData = RelocateData();
			} else if (!keysToLaunchFrom.empty()) {
				self.launchQueuedWork(keysToLaunchFrom, ddEnabledState);
				keysToLaunchFrom = KeyRangeRef();
			}

			ASSERT(launchData.startTime == -1 && keysToLaunchFrom.empty());

			choose {
				// New relocation request: restores are launched immediately, cancellations are
				// enqueued as cancelled data moves, everything else is queued.
				when(RelocateShard rs = waitNext(self.input)) {
					if (rs.isRestore()) {
						ASSERT(rs.dataMove != nullptr);
						ASSERT(rs.dataMoveId.isValid());
						self.launchQueuedWork(RelocateData(rs), ddEnabledState);
					} else if (rs.cancelled) {
						self.enqueueCancelledDataMove(rs.dataMoveId, rs.keys, ddEnabledState);
					} else {
						// Arm the launch timer only when serversToLaunchFrom goes from empty to
						// non-empty.
						bool wasEmpty = serversToLaunchFrom.empty();
						self.queueRelocation(rs, serversToLaunchFrom);
						if (wasEmpty && !serversToLaunchFrom.empty())
							launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
					}
				}
				// Launch timer fired: launch work for the accumulated servers, then clear the set and
				// disarm the timer.
				when(wait(launchQueuedWorkTimeout)) {
					self.launchQueuedWork(serversToLaunchFrom, ddEnabledState);
					serversToLaunchFrom = std::set<UID>();
					launchQueuedWorkTimeout = Never();
				}
				when(RelocateData results = waitNext(self.fetchSourceServersComplete.getFuture())) {
					// This when is triggered by queueRelocation() which is triggered by sending self.input
					self.completeSourceFetch(results);
					launchData = results;
				}
				// A data transfer finished: update the busy maps and schedule a launch for its source
				// servers.
				when(RelocateData done = waitNext(self.dataTransferComplete.getFuture())) {
					complete(done, self.busymap, self.destBusymap);
					if (serversToLaunchFrom.empty() && !done.src.empty())
						launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
					serversToLaunchFrom.insert(done.src.begin(), done.src.end());
				}
				// A relocation finished: update the bookkeeping and deliver its key range to
				// rangesComplete after a zero-length delay.
				when(RelocateData done = waitNext(self.relocationComplete.getFuture())) {
					self.activeRelocations--;
					TraceEvent(SevVerbose, "InFlightRelocationChange")
					    .detail("Complete", done.dataMoveId)
					    .detail("IsRestore", done.isRestore())
					    .detail("Total", self.activeRelocations);
					self.finishRelocation(done.priority, done.healthPriority);
					self.fetchKeysComplete.erase(done);
					// self.logRelocation( done, "ShardRelocatorDone" );
					self.noErrorActors.add(
					    tag(delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete));
					// Simulation only: warn once when a relocation took longer than 60 seconds, then
					// turn the check off.
					if (g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60) {
						TraceEvent(SevWarnAlways, "RelocationDurationTooLong")
						    .detail("Duration", now() - done.startTime);
						debug_setCheckRelocationDuration(false);
					}
				}
				when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; }
				// Periodic metrics: re-arm the timer and emit the "MovingData" trace event.
				when(wait(recordMetrics)) {
					// The average shard size is requested but not waited for; -1 is logged below if the
					// reply is not ready by the time the event is built.
					Promise<int64_t> req;
					getAverageShardBytes.send(req);

					recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL, TaskPriority::FlushTrace);

					auto const highestPriorityRelocation = self.getHighestPriorityRelocation();

					TraceEvent("MovingData", distributorId)
					    .detail("InFlight", self.activeRelocations)
					    .detail("InQueue", self.queuedRelocations)
					    .detail("AverageShardSize", req.getFuture().isReady() ? req.getFuture().get() : -1)
					    .detail("UnhealthyRelocations", self.unhealthyRelocations)
					    .detail("HighestPriority", highestPriorityRelocation)
					    .detail("BytesWritten", self.bytesWritten)
					    .detail("PriorityRecoverMove", self.priority_relocations[SERVER_KNOBS->PRIORITY_RECOVER_MOVE])
					    .detail("PriorityRebalanceUnderutilizedTeam",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM])
					    .detail("PriorityRebalanceOverutilizedTeam",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM])
					    .detail("PriorityRebalanceReadUnderutilTeam",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM])
					    .detail("PriorityRebalanceReadOverutilTeam",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM])
					    .detail("PriorityStorageWiggle",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE])
					    .detail("PriorityTeamHealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_HEALTHY])
					    .detail("PriorityTeamContainsUndesiredServer",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER])
					    .detail("PriorityTeamRedundant",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT])
					    .detail("PriorityMergeShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_MERGE_SHARD])
					    .detail("PriorityPopulateRegion",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_POPULATE_REGION])
					    .detail("PriorityTeamUnhealthy",
					            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY])
					    .detail("PriorityTeam2Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_2_LEFT])
					    .detail("PriorityTeam1Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_1_LEFT])
					    .detail("PriorityTeam0Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_0_LEFT])
					    .detail("PrioritySplitShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_SPLIT_SHARD])
					    .trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
					                                // DataDistributor::movingDataEventHolder. The track latest
					                                // key we use here must match the key used in the holder.

					// The physical-shard statistics cover one logging interval: the counters are reset
					// to zero right after being logged.
					if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
						TraceEvent("PhysicalShardMoveStats")
						    .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
						    .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard)
						    .detail("RemoteBestTeamNotReady",
						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady])
						    .detail("PrimaryNoHealthyTeam",
						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam])
						    .detail("RemoteNoHealthyTeam",
						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam])
						    .detail("RemoteTeamIsFull",
						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
						    .detail("RemoteTeamIsNotHealthy",
						            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
						    .detail(
						        "NoAvailablePhysicalShard",
						        self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
						self.moveCreateNewPhysicalShard = 0;
						self.moveReusePhysicalShard = 0;
						for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) {
							self.retryFindDstReasonCount[i] = 0;
						}
					}
				}
				when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
				// Waits on the background actors started above (rebalancers and helpers).
				when(wait(waitForAll(ddQueueFutures))) {}
				when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) {
					r.send(self.getUnhealthyRelocationCount());
				}
			}
		}
	} catch (Error& e) {
		// The error codes listed here are rethrown without a SevError log; anything else is logged
		// first.
		if (e.code() != error_code_broken_promise && // FIXME: Get rid of these broken_promise errors every time we
		                                             // are killed by the master dying
		    e.code() != error_code_movekeys_conflict && e.code() != error_code_data_move_cancelled &&
		    e.code() != error_code_data_move_dest_team_not_found)
			TraceEvent(SevError, "DataDistributionQueueError", distributorId).error(e);
		throw e;
	}
}

// Declaration only: overload of dataDistributionQueue taking a shared context and a Database.
// NOTE(review): its definition is not visible in this section of the file -- confirm where it lives.
ACTOR Future<Void> dataDistributionQueue(Reference<DDSharedContext> context, Database cx);

// Unit test for the DDQueue server counter: runs periodicalRefreshCounter() for 2.5 counter-refresh
// intervals and, after each jittered 2.0 delay, increases the counter for a random three-server team,
// asserting that the count for the team's first server is non-zero afterwards.
TEST_CASE("/DataDistribution/DDQueue/ServerCounterTrace") {
	state double testDuration = 2.5 * SERVER_KNOBS->DD_QUEUE_COUNTER_REFRESH_INTERVAL;
	state DDQueue self;
	state Future<Void> refreshActor = self.periodicalRefreshCounter();
	state Future<Void> testDone = delay(testDuration);
	std::cout << "Start trace counter unit test for " << testDuration << "s ...\n";
	loop choose {
		when(wait(refreshActor)) {}
		when(wait(testDone)) { break; }
		when(wait(delayJittered(2.0))) {
			// Build a team of three servers with random ids.
			std::vector<UID> team(3);
			for (auto& serverId : team) {
				serverId = UID(deterministicRandom()->randomInt(1, 400), 0);
			}
			auto reason = RelocateReason(deterministicRandom()->randomInt(0, RelocateReason::typeCount()));
			auto countType = DDQueue::ServerCounter::randomCountType();
			self.serverCounter.increaseForTeam(team, reason, countType);
			ASSERT(self.serverCounter.get(team[0], reason, countType));
		}
	}
	std::cout << "Finished.";
	return Void();
}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								/*
 								 * DataDistributionQueue.actor.cpp
 								 *
 								 * This source file is part of the FoundationDB open source project
 								 *
-												Update copyright header dates

											
										
										
											2022-03-22 04:36:23 +08:00
+								 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
-												remove trailing whitespace from our copyright headers ; fixed formatting of python setup.py

											
										
										
											2018-02-22 02:25:11 +08:00
+								 *
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								 * Licensed under the Apache License, Version 2.0 (the "License");
 								 * you may not use this file except in compliance with the License.
 								 * You may obtain a copy of the License at
-												remove trailing whitespace from our copyright headers ; fixed formatting of python setup.py

											
										
										
											2018-02-22 02:25:11 +08:00
+								 *
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								 *     http://www.apache.org/licenses/LICENSE-2.0
-												remove trailing whitespace from our copyright headers ; fixed formatting of python setup.py

											
										
										
											2018-02-22 02:25:11 +08:00
+								 *
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								 * Unless required by applicable law or agreed to in writing, software
 								 * distributed under the License is distributed on an "AS IS" BASIS,
 								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								 * See the License for the specific language governing permissions and
 								 * limitations under the License.
 								 */
-												Rewrite all files to have #include actorcompiler.h as the last include.

											
										
										
											2018-08-11 06:18:24 +08:00
+								#include <limits>
-												Cleanup DataDistributionQueue.actor.cpp and storageserver.actor.cpp

											
										
										
											2020-11-04 12:24:39 +08:00
+								#include <numeric>
 								#include <vector>
-												Rewrite all files to have #include actorcompiler.h as the last include.

											
										
										
											2018-08-11 06:18:24 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								#include "flow/ActorCollection.h"
-												fix roll trace event issue for data distribution

Description

Testing

											
										
										
											2021-09-25 01:04:30 +08:00
+								#include "flow/FastRef.h"
 								#include "flow/Trace.h"
-												Stop performing self-moves. (e.g. a = std::move(a))

self-moves are frowned upon in C++, and in our code this generally happens from
calls to swap as part of trying to implement a "unordered erase" function via
swap-to-the-end-and-pop_back.  For convenience, a swapAndPop() function is now
offered that performs this, while disallowing self-moves.

											
										
										
											2018-08-02 09:09:54 +08:00
+								#include "flow/Util.h"
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								#include "fdbrpc/sim_validation.h"
 								#include "fdbclient/SystemData.h"
-												More ide fixes

											
										
										
											2019-03-06 02:29:37 +08:00
+								#include "fdbserver/DataDistribution.actor.h"
-												split DD related headers

											
										
										
											2022-08-17 05:32:55 +08:00
+								#include "fdbserver/DDSharedContext.h"
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								#include "fdbclient/DatabaseContext.h"
-												Use ACTOR forward declarations in MoveKeys

Also MoveKeys.h -> MoveKeys.actor.h

											
										
										
											2019-02-18 10:55:52 +08:00
+								#include "fdbserver/MoveKeys.actor.h"
-												Adjust all includes to be relative to the root.

Remove the use of relative paths.  A header at foo/bar.h could be included by
files under foo/ with "bar.h", but would be included everywhere else as
"foo/bar.h".  Adjust so that every include references such a header with the
latter form.

Signed-off-by: Robert Escriva <rescriva@dropbox.com>

											
										
										
											2018-10-20 01:30:13 +08:00
+								#include "fdbserver/Knobs.h"
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								#include "fdbrpc/simulator.h"
-												add storeTuple and unit test; refactor getSourceServersForRange

											
										
										
											2022-06-10 03:16:12 +08:00
+								#include "fdbserver/DDTxnProcessor.h"
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+								#include "flow/DebugTrace.h"
-												Rewrite all files to have #include actorcompiler.h as the last include.

											
										
										
											2018-08-11 06:18:24 +08:00
+								#include "flow/actorcompiler.h" // This must be the last #include.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 								#define WORK_FULL_UTILIZATION 10000 // This is not a knob; it is a fixed point scaling factor!
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+								typedef Reference<IDataDistributionTeam> ITeamRef;
 								typedef std::pair<ITeamRef, ITeamRef> SrcDestTeamPair;
-												Use enum variables to invoke Priority checking (#7514)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function
											
										
										
											2022-07-15 00:06:56 +08:00
+								inline bool isDataMovementForDiskBalancing(DataMovementReason reason) {
 									return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
 									       reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM;
 								}
 								inline bool isDataMovementForReadBalancing(DataMovementReason reason) {
 									return reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM ||
 									       reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
 								}
 								inline bool isDataMovementForMountainChopper(DataMovementReason reason) {
 									return reason == DataMovementReason::REBALANCE_OVERUTILIZED_TEAM ||
 									       reason == DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM;
 								}
-												Move DD queue code over to using movement-reasons rather than priority (#7614)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function

* Set up RelocateShard in terms of data movement reason instead of priority

* Remove isMountainChopperPriority

* Remove isDiskRebalancePriority

* Fix formatting

* Fix misnamed DataMovementReason::TEAM_HEALTHY

Co-authored-by: Zhongxing Zhang <zhongxing.zhang@snowflake.com>
											
										
										
											2022-07-25 15:50:37 +08:00
+								// FIXME: Always use DataMovementReason to invoke these functions.
-												add new priority in RelocateData

											
										
										
											2022-04-13 07:22:17 +08:00
+								inline bool isValleyFillerPriority(int priority) {
 									return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
 									       priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM;
 								}
-												Use enum variables to invoke Priority checking (#7514)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function
											
										
										
											2022-07-15 00:06:56 +08:00
+								inline bool isDataMovementForValleyFiller(DataMovementReason reason) {
 									return reason == DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM ||
 									       reason == DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM;
 								}
-												remove RelocateShard::INVALID; add priority uniqueness check

											
										
										
											2022-08-09 12:44:45 +08:00
+								typedef std::map<DataMovementReason, int> DmReasonPriorityMapping;
 								typedef std::map<int, DataMovementReason> PriorityDmReasonMapping;
-												fix heap-use-after-free caused by incorrect pair initialization

											
										
										
											2022-08-11 01:47:43 +08:00
+								std::pair<const DmReasonPriorityMapping*, const PriorityDmReasonMapping*> buildPriorityMappings() {
-												remove RelocateShard::INVALID; add priority uniqueness check

											
										
										
											2022-08-09 12:44:45 +08:00
+									static DmReasonPriorityMapping reasonPriority{
 										{ DataMovementReason::INVALID, -1 },
 										{ DataMovementReason::RECOVER_MOVE, SERVER_KNOBS->PRIORITY_RECOVER_MOVE },
 										{ DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM },
 										{ DataMovementReason::REBALANCE_OVERUTILIZED_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM },
 										{ DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM },
 										{ DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM },
 										{ DataMovementReason::PERPETUAL_STORAGE_WIGGLE, SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE },
 										{ DataMovementReason::TEAM_HEALTHY, SERVER_KNOBS->PRIORITY_TEAM_HEALTHY },
 										{ DataMovementReason::TEAM_CONTAINS_UNDESIRED_SERVER, SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER },
 										{ DataMovementReason::TEAM_REDUNDANT, SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT },
 										{ DataMovementReason::MERGE_SHARD, SERVER_KNOBS->PRIORITY_MERGE_SHARD },
 										{ DataMovementReason::POPULATE_REGION, SERVER_KNOBS->PRIORITY_POPULATE_REGION },
 										{ DataMovementReason::TEAM_UNHEALTHY, SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY },
 										{ DataMovementReason::TEAM_2_LEFT, SERVER_KNOBS->PRIORITY_TEAM_2_LEFT },
 										{ DataMovementReason::TEAM_1_LEFT, SERVER_KNOBS->PRIORITY_TEAM_1_LEFT },
 										{ DataMovementReason::TEAM_FAILED, SERVER_KNOBS->PRIORITY_TEAM_FAILED },
 										{ DataMovementReason::TEAM_0_LEFT, SERVER_KNOBS->PRIORITY_TEAM_0_LEFT },
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+										{ DataMovementReason::SPLIT_SHARD, SERVER_KNOBS->PRIORITY_SPLIT_SHARD },
 										{ DataMovementReason::ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD,
 										  SERVER_KNOBS->PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD }
-												remove RelocateShard::INVALID; add priority uniqueness check

											
										
										
											2022-08-09 12:44:45 +08:00
+									};
 									static PriorityDmReasonMapping priorityReason;
 									if (priorityReason.empty()) { // only build once
 										for (const auto& [r, p] : reasonPriority) {
 											priorityReason[p] = r;
 										}
 										// Don't allow 2 priorities value being the same.
 										if (priorityReason.size() != reasonPriority.size()) {
 											TraceEvent(SevError, "DuplicateDataMovementPriority").log();
 											ASSERT(false);
 										}
-												Use enum variables to invoke Priority checking (#7514)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function
											
										
										
											2022-07-15 00:06:56 +08:00
+									}
-												remove RelocateShard::INVALID; add priority uniqueness check

											
										
										
											2022-08-09 12:44:45 +08:00
-												fix heap-use-after-free caused by incorrect pair initialization

											
										
										
											2022-08-11 01:47:43 +08:00
+									return std::make_pair(&reasonPriority, &priorityReason);
-												remove RelocateShard::INVALID; add priority uniqueness check

											
										
										
											2022-08-09 12:44:45 +08:00
+								}
 								int dataMovementPriority(DataMovementReason reason) {
-												fix heap-use-after-free caused by incorrect pair initialization

											
										
										
											2022-08-11 01:47:43 +08:00
+									auto [reasonPriority, _] = buildPriorityMappings();
 									return reasonPriority->at(reason);
-												remove RelocateShard::INVALID; add priority uniqueness check

											
										
										
											2022-08-09 12:44:45 +08:00
+								}
 								DataMovementReason priorityToDataMovementReason(int priority) {
-												fix heap-use-after-free caused by incorrect pair initialization

											
										
										
											2022-08-11 01:47:43 +08:00
+									auto [_, priorityReason] = buildPriorityMappings();
 									return priorityReason->at(priority);
-												Use enum variables to invoke Priority checking (#7514)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function
											
										
										
											2022-07-15 00:06:56 +08:00
+								}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								struct RelocateData {
 									KeyRange keys;
 									int priority;
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									int boundaryPriority;
 									int healthPriority;
-												add relocate reason and set teamSorter in relocator

											
										
										
											2022-03-19 07:39:31 +08:00
+									RelocateReason reason;
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									double startTime;
-												add TraceId; make the TraceId for MountainChopper, ValleyFiller, RelocateShard, QueuedShard consistent

											
										
										
											2022-08-05 07:57:55 +08:00
+									UID randomId; // inherit from RelocateShard.traceId
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									UID dataMoveId;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									int workFactor;
 									std::vector<UID> src;
-												removed a separately configurable storage team size for the remote data center, because it did not make sense
fix: the master did not monitor for the failure of remote logs
stop merge attempts when a data center is failed
fixed a variety of other problems with data distribution when a data center is failed

											
										
										
											2018-02-03 03:46:04 +08:00
+									std::vector<UID> completeSources;
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+									std::vector<UID> completeDests;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									bool wantsNewServers;
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+									bool cancellable;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									TraceInterval interval;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									std::shared_ptr<DataMove> dataMove;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									RelocateData()
-												fix RelocateShard invalid initialized bug (remove RelocationReason::INVALID

											
										
										
											2022-08-09 11:43:35 +08:00
+									  : priority(-1), boundaryPriority(-1), healthPriority(-1), reason(RelocateReason::OTHER), startTime(-1),
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									    dataMoveId(anonymousShardId), workFactor(0), wantsNewServers(false), cancellable(false),
 									    interval("QueuedRelocation") {}
-												added a buggify + minor code cleanup

											
										
										
											2019-10-12 09:31:43 +08:00
+									explicit RelocateData(RelocateShard const& rs)
 									  : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1),
-												add relocate reason and set teamSorter in relocator

											
										
										
											2022-03-19 07:39:31 +08:00
+									    healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), reason(rs.reason), startTime(now()),
-												add TraceId; make the TraceId for MountainChopper, ValleyFiller, RelocateShard, QueuedShard consistent

											
										
										
											2022-08-05 07:57:55 +08:00
+									    randomId(rs.traceId.isValid() ? rs.traceId : deterministicRandom()->randomUniqueID()),
 									    dataMoveId(rs.dataMoveId), workFactor(0), wantsNewServers(isDataMovementForMountainChopper(rs.moveReason) ||
 									                                                              isDataMovementForValleyFiller(rs.moveReason) ||
 									                                                              rs.moveReason == DataMovementReason::SPLIT_SHARD ||
 									                                                              rs.moveReason == DataMovementReason::TEAM_REDUNDANT),
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+									    cancellable(true), interval("QueuedRelocation", randomId), dataMove(rs.dataMove) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										if (dataMove != nullptr) {
 											this->src.insert(this->src.end(), dataMove->meta.src.begin(), dataMove->meta.src.end());
 										}
 									}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									static bool isHealthPriority(int priority) {
-												Make the DD priority associated with populating a remote region lower than machine failures

											
										
										
											2020-03-05 06:07:32 +08:00
+										return priority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
 										       priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || priority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+										       priority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || priority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ||
 										       priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT || priority == SERVER_KNOBS->PRIORITY_TEAM_HEALTHY ||
-												consider wiggling when waitUntilHealthy

											
										
										
											2021-10-15 07:22:47 +08:00
+										       priority == SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER ||
 										       priority == SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE;
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									}
 									static bool isBoundaryPriority(int priority) {
 										return priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD || priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD;
 									}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									bool isRestore() const { return this->dataMove != nullptr; }
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									bool operator>(const RelocateData& rhs) const {
 										return priority != rhs.priority
 										           ? priority > rhs.priority
 										           : (startTime != rhs.startTime ? startTime < rhs.startTime : randomId > rhs.randomId);
 									}
 									bool operator==(const RelocateData& rhs) const {
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+										return priority == rhs.priority && boundaryPriority == rhs.boundaryPriority &&
-												fix teamSorter usage bug

											
										
										
											2022-03-25 04:16:10 +08:00
+										       healthPriority == rhs.healthPriority && reason == rhs.reason && keys == rhs.keys &&
 										       startTime == rhs.startTime && workFactor == rhs.workFactor && src == rhs.src &&
 										       completeSources == rhs.completeSources && wantsNewServers == rhs.wantsNewServers &&
 										       randomId == rhs.randomId;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
-												Remove using namespace std::rel_ops

This causes the following to not compile anymore

\#include <utility>
\#include <vector>

using namespace std::rel_ops;

int main() {
    std::vector<int> xs;
    return xs.rbegin() != xs.rend();
}

See https://godbolt.org/z/s1977n

											
										
										
											2020-07-11 05:37:47 +08:00
+									bool operator!=(const RelocateData& rhs) const { return !(*this == rhs); }
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								};
-												Explicitly seal classes that inherit but aren't inherited from

											
										
										
											2020-10-08 12:58:24 +08:00
+								class ParallelTCInfo final : public ReferenceCounted<ParallelTCInfo>, public IDataDistributionTeam {
-												Cleanup DataDistributionQueue.actor.cpp and storageserver.actor.cpp

											
										
										
											2020-11-04 12:24:39 +08:00
+									std::vector<Reference<IDataDistributionTeam>> teams;
 									std::vector<UID> tempServerIDs;
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									int64_t sum(std::function<int64_t(IDataDistributionTeam const&)> func) const {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										int64_t result = 0;
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (const auto& team : teams) {
 											result += func(*team);
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										}
 										return result;
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									template <class T>
-												Enforce std:: specifier rather than using namespace

											
										
										
											2021-09-17 08:42:34 +08:00
+									std::vector<T> collect(std::function<std::vector<T>(IDataDistributionTeam const&)> func) const {
-												Cleanup DataDistributionQueue.actor.cpp and storageserver.actor.cpp

											
										
										
											2020-11-04 12:24:39 +08:00
+										std::vector<T> result;
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (const auto& team : teams) {
-												Resolve conflicts

											
										
										
											2020-11-25 01:41:36 +08:00
+											std::vector<T> newItems = func(*team);
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+											result.insert(result.end(), newItems.begin(), newItems.end());
 										}
 										return result;
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									bool any(std::function<bool(IDataDistributionTeam const&)> func) const {
 										for (const auto& team : teams) {
 											if (func(*team)) {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+												return true;
 											}
 										}
 										return false;
 									}
-												Add encapsulation to TCTeamInfo and ParallelTCInfo

											
										
										
											2020-07-21 15:18:54 +08:00
+								public:
 									ParallelTCInfo() = default;
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+									explicit ParallelTCInfo(ParallelTCInfo const& info) : teams(info.teams), tempServerIDs(info.tempServerIDs){};
-												Add encapsulation to TCTeamInfo and ParallelTCInfo

											
										
										
											2020-07-21 15:18:54 +08:00
 									void addTeam(Reference<IDataDistributionTeam> team) { teams.push_back(team); }
 									void clear() { teams.clear(); }
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									bool all(std::function<bool(IDataDistributionTeam const&)> func) const {
 										return !any([func](IDataDistributionTeam const& team) { return !func(team); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Resolve conflicts

											
										
										
											2020-11-25 01:41:36 +08:00
+									std::vector<StorageServerInterface> getLastKnownServerInterfaces() const override {
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										return collect<StorageServerInterface>(
 										    [](IDataDistributionTeam const& team) { return team.getLastKnownServerInterfaces(); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									int size() const override {
-												removed a separately configurable storage team size for the remote data center, because it did not make sense
fix: the master did not monitor for the failure of remote logs
stop merge attempts when a data center is failed
fixed a variety of other problems with data distribution when a data center is failed

											
										
										
											2018-02-03 03:46:04 +08:00
+										int totalSize = 0;
 										for (auto it = teams.begin(); it != teams.end(); it++) {
 											totalSize += (*it)->size();
 										}
 										return totalSize;
 									}
-												Resolve conflicts

											
										
										
											2020-11-25 01:41:36 +08:00
+									std::vector<UID> const& getServerIDs() const override {
-												Enforce std:: specifier rather than using namespace

											
										
										
											2021-09-17 08:42:34 +08:00
+										static std::vector<UID> tempServerIDs;
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										tempServerIDs.clear();
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (const auto& team : teams) {
-												Resolve conflicts

											
										
										
											2020-11-25 01:41:36 +08:00
+											std::vector<UID> const& childIDs = team->getServerIDs();
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+											tempServerIDs.insert(tempServerIDs.end(), childIDs.begin(), childIDs.end());
 										}
 										return tempServerIDs;
 									}
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+									void addDataInFlightToTeam(int64_t delta) override {
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (auto& team : teams) {
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+											team->addDataInFlightToTeam(delta);
 										}
 									}
 									void addReadInFlightToTeam(int64_t delta) override {
 										for (auto& team : teams) {
 											team->addReadInFlightToTeam(delta);
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										}
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									int64_t getDataInFlightToTeam() const override {
 										return sum([](IDataDistributionTeam const& team) { return team.getDataInFlightToTeam(); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const override {
 										return sum([includeInFlight, inflightPenalty](IDataDistributionTeam const& team) {
 											return team.getLoadBytes(includeInFlight, inflightPenalty);
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										});
 									}
-												add ReadInFlight

											
										
										
											2022-03-29 05:20:07 +08:00
+									int64_t getReadInFlightToTeam() const override {
 										return sum([](IDataDistributionTeam const& team) { return team.getReadInFlightToTeam(); });
 									}
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+									double getLoadReadBandwidth(bool includeInFlight = true, double inflightPenalty = 1.0) const override {
 										return sum([includeInFlight, inflightPenalty](IDataDistributionTeam const& team) {
 											return team.getLoadReadBandwidth(includeInFlight, inflightPenalty);
-												add ReadInFlight

											
										
										
											2022-03-29 05:20:07 +08:00
+										});
-												fix uninitialized member

											
										
										
											2022-03-01 02:22:32 +08:00
+									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									int64_t getMinAvailableSpace(bool includeInFlight = true) const override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										int64_t result = std::numeric_limits<int64_t>::max();
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (const auto& team : teams) {
 											result = std::min(result, team->getMinAvailableSpace(includeInFlight));
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										}
 										return result;
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									double getMinAvailableSpaceRatio(bool includeInFlight = true) const override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										double result = std::numeric_limits<double>::max();
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (const auto& team : teams) {
 											result = std::min(result, team->getMinAvailableSpaceRatio(includeInFlight));
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										}
 										return result;
 									}
-												Use clang-tidy to automatically fix missing overrides

Use `clang-tidy -p . $file -checks='-*,modernize-use-override' -header-filter='.*' -fix`
to fix missing overrides, and then use git clang-format to reformat just
those changes. This went pretty well for most files.

Formatting the following files went off the rails, so I'm going to
follow up with a commit that's just clang-tidy and no clang-format.

- fdbclient/DatabaseBackupAgent.actor.cpp
- fdbclient/FileBackupAgent.actor.cpp
- fdbserver/OldTLogServer_4_6.actor.cpp
- fdbmonitor/SimpleIni.h
- fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp

											
										
										
											2021-01-26 09:55:43 +08:00
+									bool hasHealthyAvailableSpace(double minRatio) const override {
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										return all([minRatio](IDataDistributionTeam const& team) { return team.hasHealthyAvailableSpace(minRatio); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Use override where applicable in fdbserver

											
										
										
											2020-10-08 09:41:19 +08:00
+									Future<Void> updateStorageMetrics() override {
-												Cleanup DataDistributionQueue.actor.cpp and storageserver.actor.cpp

											
										
										
											2020-11-04 12:24:39 +08:00
+										std::vector<Future<Void>> futures;
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+										for (auto& team : teams) {
 											futures.push_back(team->updateStorageMetrics());
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										}
 										return waitForAll(futures);
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									bool isOptimal() const override {
 										return all([](IDataDistributionTeam const& team) { return team.isOptimal(); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									bool isWrongConfiguration() const override {
 										return any([](IDataDistributionTeam const& team) { return team.isWrongConfiguration(); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									void setWrongConfiguration(bool wrongConfiguration) override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										for (auto it = teams.begin(); it != teams.end(); it++) {
 											(*it)->setWrongConfiguration(wrongConfiguration);
 										}
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									bool isHealthy() const override {
 										return all([](IDataDistributionTeam const& team) { return team.isHealthy(); });
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									void setHealthy(bool h) override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										for (auto it = teams.begin(); it != teams.end(); it++) {
 											(*it)->setHealthy(h);
 										}
 									}
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									int getPriority() const override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										int priority = 0;
 										for (auto it = teams.begin(); it != teams.end(); it++) {
 											priority = std::max(priority, (*it)->getPriority());
 										}
 										return priority;
 									}
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
+									void setPriority(int p) override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										for (auto it = teams.begin(); it != teams.end(); it++) {
 											(*it)->setPriority(p);
 										}
 									}
-												Merge remote-tracking branch 'origin/main' into change-rebalance-teams-signature

											
										
										
											2022-03-19 01:25:41 +08:00
+									void addref() const override { ReferenceCounted<ParallelTCInfo>::addref(); }
 									void delref() const override { ReferenceCounted<ParallelTCInfo>::delref(); }
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
-												Make IDataDistributionTeam const-correct

											
										
										
											2020-07-21 15:08:01 +08:00
 									// Adds `servers` to the first wrapped team only; asserts that at least one team exists.
+									void addServers(const std::vector<UID>& servers) override {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+										ASSERT(!teams.empty());
 										teams[0]->addServers(servers);
 									}
-												Piggy back this PR to polish more TraceEvent by:
- Making it clear that it's tracking machine team info or server team info
- Added ID to both machine team and server team for better trackability
- Attach distributor id to some trace events.

											
										
										
											2020-10-22 02:10:14 +08:00
-												Fix several merge issues

											
										
										
											2020-11-17 06:46:36 +08:00
 									// Builds a combined ID by joining each wrapped team's getTeamID() with ", " (no trailing separator).
 									// Returns an empty string when `teams` is empty.
+									std::string getTeamID() const override {
-												Changed getTeamID() to return a string instead of UID as suggested by reviews.

											
										
										
											2020-10-24 01:06:22 +08:00
+										std::string id;
 										for (int i = 0; i < teams.size(); i++) {
 											auto const& team = teams[i];
 											// The last team is appended as-is; every earlier one is followed by ", ".
 											id += (i == teams.size() - 1) ? team->getTeamID() : format("%s, ", team->getTeamID().c_str());
 										}
 										return id;
 									}
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+								};
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								struct Busyness {
-												Cleanup DataDistributionQueue.actor.cpp and storageserver.actor.cpp

											
										
										
											2020-11-04 12:24:39 +08:00
+									std::vector<int> ledger;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									// Starts with a ledger of 10 counters, all zero. canLaunch()/addWork() index it with prio / 100 and
 									// assert 0 < prio < 1000, so the valid indices are 0..9.
 									Busyness() : ledger(10, 0) {}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
 									// Returns true when the ledger slot for `prio` (prio / 100) still has room for `work` more units, i.e.
 									// adding `work` would not push that slot above WORK_FULL_UTILIZATION. Asserts 0 < prio < 1000.
+									bool canLaunch(int prio, int work) const {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										ASSERT(prio > 0 && prio < 1000);
 										return ledger[prio / 100] <= WORK_FULL_UTILIZATION - work; // allow for rounding errors in double division
 									}
 									// Books `work` units on every ledger slot from 0 up to and including the slot for `prio` (prio / 100).
 									// A negative `work` releases previously booked units. Asserts 0 < prio < 1000.
 									void addWork(int prio, int work) {
 										ASSERT(prio > 0 && prio < 1000);
 										const int topSlot = prio / 100;
 										for (int slot = 0; slot <= topSlot; ++slot) {
 											ledger[slot] += work;
 										}
 									}
 									// Inverse of addWork(): books the negated amount on the same ledger slots.
 									void removeWork(int prio, int work) { addWork(prio, -work); }
 									// Renders the ledger as a comma-separated list. Runs of adjacent slots holding the same value are
 									// collapsed into one "lo/hi" entry (slot index * 100); each entry shows the utilization as a fraction
 									// of WORK_FULL_UTILIZATION followed by the raw "(value/WORK_FULL_UTILIZATION)". Slot 0 is not printed.
 									std::string toString() {
 										std::string result;
 										for (int i = 1; i < ledger.size();) {
 											// j ends one past the last slot whose value equals ledger[i]
 											int j = i + 1;
 											while (j < ledger.size() && ledger[i] == ledger[j])
 												j++;
 											if (i != 1)
 												result += ", ";
 											result += i + 1 == j ? format("%03d", i * 100) : format("%03d/%03d", i * 100, (j - 1) * 100);
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+											result +=
 											    format("=%1.02f (%d/%d)", (float)ledger[i] / WORK_FULL_UTILIZATION, ledger[i], WORK_FULL_UTILIZATION);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											i = j;
 										}
 										return result;
 									}
 								};
 								// find the "workFactor" for this, were it launched now
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
 								// Computes the work units a relocation books on each of its source servers, based on
 								// relocation.healthPriority:
 								//   - PRIORITY_TEAM_1_LEFT or PRIORITY_TEAM_0_LEFT: WORK_FULL_UTILIZATION / parallelism
 								//   - PRIORITY_TEAM_2_LEFT:                         WORK_FULL_UTILIZATION / 2 / parallelism
 								//   - anything else:                                WORK_FULL_UTILIZATION / singleRegionTeamSize / parallelism
 								// where parallelism is SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER. All divisions are integer.
+								int getSrcWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) {
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									if (relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
 									    relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									else if (relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										return WORK_FULL_UTILIZATION / 2 / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
 									else // for now we assume that any message at a lower priority can best be assumed to have a full team left for work
-												fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload

											
										
										
											2020-03-05 06:17:17 +08:00
+										return WORK_FULL_UTILIZATION / singleRegionTeamSize / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+								int getDestWorkFactor() {
 									// Work of moving a shard is even across destination servers
 									return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER;
 								}
 								// Data movement's resource control: Do not overload servers used for the RelocateData
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								// return true if servers are not too busy to launch the relocation
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+								// This ensures source servers will not be overloaded.
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+								bool canLaunchSrc(RelocateData& relocation,
 								                  int teamSize,
 								                  int singleRegionTeamSize,
 								                  std::map<UID, Busyness>& busymap,
 								                  std::vector<RelocateData> cancellableRelocations) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									// assert this has not already been launched
 									ASSERT(relocation.workFactor == 0);
 									ASSERT(relocation.src.size() != 0);
-												fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload

											
										
										
											2020-03-05 06:17:17 +08:00
+									ASSERT(teamSize >= singleRegionTeamSize);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									// find the "workFactor" for this, were it launched now
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+									int workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize);
-												fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload

											
										
										
											2020-03-05 06:17:17 +08:00
+									int neededServers = std::min<int>(relocation.src.size(), teamSize - singleRegionTeamSize + 1);
-												added a knob which reverts the new queue behavior

											
										
										
											2020-03-05 08:23:49 +08:00
+									if (SERVER_KNOBS->USE_OLD_NEEDED_SERVERS) {
 										neededServers = std::max(1, (int)relocation.src.size() - teamSize + 1);
 									}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									// see if each of the SS can launch this task
 									for (int i = 0; i < relocation.src.size(); i++) {
 										// For each source server for this relocation, copy and modify its busyness to reflect work that WOULD be
 										// cancelled
 										auto busyCopy = busymap[relocation.src[i]];
 										for (int j = 0; j < cancellableRelocations.size(); j++) {
 											auto& servers = cancellableRelocations[j].src;
 											if (std::count(servers.begin(), servers.end(), relocation.src[i]))
 												busyCopy.removeWork(cancellableRelocations[j].priority, cancellableRelocations[j].workFactor);
 										}
 										// Use this modified busyness to check if this relocation could be launched
 										if (busyCopy.canLaunch(relocation.priority, workFactor)) {
 											--neededServers;
 											if (neededServers == 0)
 												return true;
 										}
 									}
 									return false;
 								}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+								// candidateTeams is a vector containing one team per datacenter, the team(s) DD is planning on moving the shard to.
 								// Returns true when every server of every candidate team can accept one more destination work unit at
 								// `priority`, or unconditionally when RELOCATION_PARALLELISM_PER_DEST_SERVER is <= 0. Servers missing
 								// from busymapDest get a fresh entry because the map is indexed with operator[].
 								bool canLaunchDest(const std::vector<std::pair<Reference<IDataDistributionTeam>, bool>>& candidateTeams,
 								                   int priority,
 								                   std::map<UID, Busyness>& busymapDest) {
 									// fail switch if this is causing issues
 									if (SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER <= 0) {
 										return true;
 									}
 									int workFactor = getDestWorkFactor();
-												Minor readability improvement (I believe) to DD code; mostly, replacing .first, .second of pairs with better names through bindings)

											
										
										
											2022-05-14 04:03:00 +08:00
+									for (auto& [team, _] : candidateTeams) {
 										for (UID id : team->getServerIDs()) {
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+											if (!busymapDest[id].canLaunch(priority, workFactor)) {
 												return false;
 											}
 										}
 									}
 									return true;
 								}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								// update busyness for each server
-												fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload

											
										
										
											2020-03-05 06:17:17 +08:00
 								// Records the relocation as launched on the source side: stores the source work factor in
 								// relocation.workFactor and books that amount, at relocation.priority, on every server in relocation.src.
+								void launch(RelocateData& relocation, std::map<UID, Busyness>& busymap, int singleRegionTeamSize) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									// if we are here this means that we can launch and should adjust all the work the servers can do
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+									relocation.workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									for (int i = 0; i < relocation.src.size(); i++)
 										busymap[relocation.src[i]].addWork(relocation.priority, relocation.workFactor);
 								}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
 								// Records the relocation as launched on the destination side: every server ID of every candidate team
 								// is appended to relocation.completeDests and charged getDestWorkFactor() units at relocation.priority
 								// in destBusymap. Asserts that relocation.completeDests is empty on entry.
+								void launchDest(RelocateData& relocation,
 								                const std::vector<std::pair<Reference<IDataDistributionTeam>, bool>>& candidateTeams,
 								                std::map<UID, Busyness>& destBusymap) {
 									ASSERT(relocation.completeDests.empty());
 									int destWorkFactor = getDestWorkFactor();
-												Minor readability improvement (I believe) to DD code; mostly, replacing .first, .second of pairs with better names through bindings)

											
										
										
											2022-05-14 04:03:00 +08:00
+									for (auto& [team, _] : candidateTeams) {
 										for (UID id : team->getServerIDs()) {
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+											relocation.completeDests.push_back(id);
 											destBusymap[id].addWork(relocation.priority, destWorkFactor);
 										}
 									}
 								}
-												fix inflight read division; temp destComplete fix; 0.1 constant poll time

											
										
										
											2022-04-21 03:15:40 +08:00
+								void completeDest(RelocateData const& relocation, std::map<UID, Busyness>& destBusymap) {
 									int destWorkFactor = getDestWorkFactor();
 									for (UID id : relocation.completeDests) {
 										destBusymap[id].removeWork(relocation.priority, destWorkFactor);
 									}
 								}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
 								// Releases all work booked for a launched relocation: removes relocation.workFactor from every source
 								// server in busymap, then releases the destination side through completeDest(). Asserts that the
 								// relocation was launched (workFactor > 0).
 								void complete(RelocateData const& relocation, std::map<UID, Busyness>& busymap, std::map<UID, Busyness>& destBusymap) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									ASSERT(relocation.workFactor > 0);
 									for (int i = 0; i < relocation.src.size(); i++)
 										busymap[relocation.src[i]].removeWork(relocation.priority, relocation.workFactor);
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
-												fix inflight read division; temp destComplete fix; 0.1 constant poll time

											
										
										
											2022-04-21 03:15:40 +08:00
+									completeDest(relocation, destBusymap);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+								// Cancels in-flight data moves intersecting with range.
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<Void> cancelDataMove(struct DDQueue* self, KeyRange range, const DDEnabledState* ddEnabledState);
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<Void> dataDistributionRelocator(struct DDQueue* self,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                             RelocateData rd,
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+								                                             Future<Void> prevCleanup,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                             const DDEnabledState* ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												split DD related headers

											
										
										
											2022-08-17 05:32:55 +08:00
+								struct DDQueue : public IDDRelocationQueue {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
 									// Per-range data-move record: the move's ID plus a Future<Void> named `cancel`.
 									// NOTE(review): `cancel` presumably holds the in-progress cancellation of this move — confirm at use sites.
+									struct DDDataMove {
 										DDDataMove() = default;
 										explicit DDDataMove(UID id) : id(id) {}
-												a.fix heap-use-after-free caused by early noErrorsActors destroy

											
										
										
											2022-05-27 06:55:14 +08:00
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
 										// A record is valid exactly when its ID is valid.
+										bool isValid() const { return id.isValid(); }
 										UID id;
 										Future<Void> cancel;
 									};
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+									struct ServerCounter {
-												add enum count; filter out zero reason item

											
										
										
											2022-08-12 02:59:46 +08:00
+										enum CountType : uint8_t { ProposedSource = 0, QueuedSource, LaunchedSource, LaunchedDest, __COUNT };
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
 									private:
-												add enum count; filter out zero reason item

											
										
										
											2022-08-12 02:59:46 +08:00
+										typedef std::array<int, (int)__COUNT> Item; // one for each CountType
-												relocation reason for size_split and write_split

											
										
										
											2022-08-10 07:16:14 +08:00
+										typedef std::array<Item, RelocateReason::typeCount()> ReasonItem; // one for each RelocateReason
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+										std::unordered_map<UID, ReasonItem> counter;
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+										std::string toString(const Item& item) const {
 											return format("%d %d %d %d", item[0], item[1], item[2], item[3]);
 										}
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
 										// Adds one detail per relocate reason to `event`, skipping reasons whose four counts sum to zero or
 										// less. The detail name is the reason's string form followed by "PQSD"; the value is the counts as
 										// rendered by toString(Item).
+										void traceReasonItem(TraceEvent* event, const ReasonItem& item) const {
 											for (int i = 0; i < item.size(); ++i) {
-												add enum count; filter out zero reason item

											
										
										
											2022-08-12 02:59:46 +08:00
+												if (std::accumulate(item[i].cbegin(), item[i].cend(), 0) > 0) {
 													// "PQSD" corresponding to CounterType
-												solve invalid detail name

											
										
										
											2022-08-12 03:28:18 +08:00
+													event->detail(RelocateReason(i).toString() + "PQSD", toString(item[i]));
-												add enum count; filter out zero reason item

											
										
										
											2022-08-12 02:59:46 +08:00
+												}
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+											}
 										}
-												add summarize event

											
										
										
											2022-08-10 09:22:48 +08:00
+										bool countNonZero(const ReasonItem& item, CountType type) const {
 											return std::any_of(item.cbegin(), item.cend(), [type](const Item& item) { return item[(int)type] > 0; });
 										}
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
 										// Adds one to the count for (server id, reason, type). The server's entry is created on first use
 										// because `counter` is indexed with operator[]. Asserts that the reason converts to an index inside
 										// [0, RelocateReason::typeCount()).
+										void increase(const UID& id, RelocateReason reason, CountType type) {
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+											int idx = (int)(reason);
-												compile warning

											
										
										
											2022-08-10 03:32:46 +08:00
+											// if (idx < 0 || idx >= RelocateReason::typeCount()) {
 											// 	TraceEvent(SevWarnAlways, "ServerCounterDebug").detail("Reason", reason.toString());
 											// }
-												fix RelocateShard invalid initialized bug (remove RelocationReason::INVALID

											
										
										
											2022-08-09 11:43:35 +08:00
+											ASSERT(idx >= 0 && idx < RelocateReason::typeCount());
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+											counter[id][idx][(int)type] += 1;
 										}
-												add knob to control summarize

											
										
										
											2022-08-10 14:32:40 +08:00
+										void summarizeLaunchedServers(decltype(counter.cbegin()) begin,
 										                              decltype(counter.cend()) end,
 										                              TraceEvent* event) const {
 											if (begin == end)
 												return;
 											std::string execSrc, execDest;
 											for (; begin != end; ++begin) {
 												if (countNonZero(begin->second, LaunchedSource)) {
 													execSrc += begin->first.shortString() + ",";
 												}
 												if (countNonZero(begin->second, LaunchedDest)) {
 													execDest += begin->first.shortString() + ",";
 												}
 											}
 											event->detail("RemainedLaunchedSources", execSrc).detail("RemainedLaunchedDestinations", execDest);
 										}
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+									public:
 										// Drops the counts of every server.
 										void clear() { counter.clear(); }
 										// Returns the count stored for (server id, reason, type). Uses counter.at(id), so asking for a
 										// server that has no entry throws std::out_of_range instead of creating one.
 										int get(const UID& id, RelocateReason reason, CountType type) const {
 											return counter.at(id)[(int)reason][(int)type];
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+										}
 										// Calls increase(id, reason, type) once for every server ID in `ids`.
 										void increaseForTeam(const std::vector<UID>& ids, RelocateReason reason, CountType type) {
 											for (auto& id : ids) {
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+												increase(id, reason, type);
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+											}
 										}
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
 										// Logs the counters. Emits one "DDQueueServerCounter" event per server (ServerId plus the per-reason
 										// details from traceReasonItem) for at most DD_QUEUE_COUNTER_MAX_LOG servers. If servers remain after
 										// that, emits a single SevWarn "DDQueueServerCounterTooMany" event carrying the total server count
 										// and, when DD_QUEUE_COUNTER_SUMMARIZE is set, a summary of the remaining servers' launched counts.
 										void traceAll(const UID& debugId = UID()) const {
-												add summarize event

											
										
										
											2022-08-10 09:22:48 +08:00
+											auto it = counter.cbegin();
-												Update Document; set log limit

											
										
										
											2022-08-09 01:04:48 +08:00
+											int count = 0;
-												add summarize event

											
										
										
											2022-08-10 09:22:48 +08:00
+											for (; count < SERVER_KNOBS->DD_QUEUE_COUNTER_MAX_LOG && it != counter.cend(); ++count, ++it) {
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+												TraceEvent event("DDQueueServerCounter", debugId);
-												add summarize event

											
										
										
											2022-08-10 09:22:48 +08:00
+												event.detail("ServerId", it->first);
 												traceReasonItem(&event, it->second);
 											}
-												add knob to control summarize

											
										
										
											2022-08-10 14:32:40 +08:00
-												add summarize event

											
										
										
											2022-08-10 09:22:48 +08:00
 											// `it` stops short of the end only when the log limit was reached with servers left over.
+											if (it != counter.cend()) {
-												add knob to control summarize

											
										
										
											2022-08-10 14:32:40 +08:00
+												TraceEvent e(SevWarn, "DDQueueServerCounterTooMany", debugId);
 												e.detail("Servers", size());
 												if (SERVER_KNOBS->DD_QUEUE_COUNTER_SUMMARIZE) {
 													summarizeLaunchedServers(it, counter.cend(), &e);
 													return;
-												Update Document; set log limit

											
										
										
											2022-08-09 01:04:48 +08:00
+												}
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+											}
 										}
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
 										// Number of servers that currently have an entry in the counter.
 										size_t size() const { return counter.size(); }
 										// for random test
 										// Picks a CountType from deterministicRandom()->randomInt(0, __COUNT).
 										// NOTE(review): assumes randomInt's upper bound is exclusive, so __COUNT itself is never returned — confirm.
 										static CountType randomCountType() {
-												add enum count; filter out zero reason item

											
										
										
											2022-08-12 02:59:46 +08:00
+											int i = deterministicRandom()->randomInt(0, (int)__COUNT);
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+											return (CountType)i;
 										}
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+									};
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									ActorCollectionNoErrors noErrorActors; // has to be the last one to be destroyed because other Actors may use it.
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+									UID distributorId;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									MoveKeysLock lock;
 									Database cx;
-												change shared_ptr to Reference

											
										
										
											2022-09-28 02:22:47 +08:00
+									Reference<IDDTxnProcessor> txnProcessor;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									std::vector<TeamCollectionInterface> teamCollections;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+									Reference<PhysicalShardCollection> physicalShardCollection;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									PromiseStream<Promise<int64_t>> getAverageShardBytes;
 									FlowLock startMoveKeysParallelismLock;
 									FlowLock finishMoveKeysParallelismLock;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									FlowLock cleanUpDataMoveParallelismLock;
-												Added a flow lock to prevent too many source server fetches from happening at the same time and running the data distributor out of memory

											
										
										
											2020-07-10 01:38:19 +08:00
+									Reference<FlowLock> fetchSourceLock;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									int activeRelocations;
 									int queuedRelocations;
-												fix: bytesWritten would overflow and go negative

											
										
										
											2018-09-01 03:46:57 +08:00
+									int64_t bytesWritten;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									int teamSize;
-												fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload

											
										
										
											2020-03-05 06:17:17 +08:00
+									int singleRegionTeamSize;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+									std::map<UID, Busyness> busymap; // UID is serverID
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+									std::map<UID, Busyness> destBusymap; // UID is serverID
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									KeyRangeMap<RelocateData> queueMap;
 									std::set<RelocateData, std::greater<RelocateData>> fetchingSourcesQueue;
 									std::set<RelocateData, std::greater<RelocateData>> fetchKeysComplete;
 									KeyRangeActorMap getSourceActors;
-												DD:Add comments to help understand code

Add comments to explain the functionalities of some code.

											
										
										
											2019-07-20 07:22:15 +08:00
+									std::map<UID, std::set<RelocateData, std::greater<RelocateData>>>
 									    queue; // Key UID is serverID, value is the serverID's set of RelocateData to relocate
-												fix top10 shard index bug; add event detail; fix merge conflict

											
										
										
											2022-04-23 05:14:58 +08:00
+									// The last time each server was selected as the source team for a read rebalance. We want to throttle read
 									// rebalancing on a time basis because the read workload sample update is delayed after the previous move
 									std::map<UID, double> lastAsSource;
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+									ServerCounter serverCounter;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									KeyRangeMap<RelocateData> inFlight;
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+									// Track all actors that relocate specified keys to a good place; Key: keyRange; Value: actor
 									KeyRangeActorMap inFlightActors;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									KeyRangeMap<DDDataMove> dataMoves;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									Promise<Void> error;
 									PromiseStream<RelocateData> dataTransferComplete;
 									PromiseStream<RelocateData> relocationComplete;
-												Add a comment to DD

											
										
										
											2020-07-14 01:12:39 +08:00
+									PromiseStream<RelocateData> fetchSourceServersComplete; // find source SSs for a relocate range
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												prevented a slow task when too many shards were sent to the data distribution queue after switching to a fearless deployment

											
										
										
											2018-08-10 03:37:46 +08:00
+									PromiseStream<RelocateShard> output;
 									FutureStream<RelocateShard> input;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									PromiseStream<GetMetricsRequest> getShardMetrics;
-												reset several method use getShardMetrics

											
										
										
											2022-05-04 15:00:03 +08:00
+									PromiseStream<GetTopKMetricsRequest> getTopKMetrics;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												suppressed trace events that are spammy

											
										
										
											2018-02-17 08:01:19 +08:00
+									double lastInterval;
 									int suppressIntervals;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Minor improvement on comments

											
										
										
											2020-07-13 09:30:02 +08:00
+									Reference<AsyncVar<bool>> rawProcessingUnhealthy; // many operations will remove relocations before adding a new
 									                                                  // one, so delay a small time before settling on a new number.
-												consider wiggling when waitUntilHealthy

											
										
										
											2021-10-15 07:22:47 +08:00
+									Reference<AsyncVar<bool>> rawProcessingWiggle;
-												first working version of non-copying recovery working with fearless configurations

											
										
										
											2018-04-09 12:24:05 +08:00
 									std::map<int, int> priority_relocations;
 									int unhealthyRelocations;
-												fix roll trace event issue for data distribution

Description

Testing

											
										
										
											2021-09-25 01:04:30 +08:00
 									Reference<EventCacheHolder> movedKeyServersEventHolder;
-												Add a counter to track physical shard creation through moves

											
										
										
											2022-10-20 13:09:04 +08:00
+									int moveReusePhysicalShard;
 									int moveCreateNewPhysicalShard;
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+									enum RetryFindDstReason {
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+										None = 0,
 										RemoteBestTeamNotReady,
 										PrimaryNoHealthyTeam,
 										RemoteNoHealthyTeam,
 										RemoteTeamIsFull,
 										RemoteTeamIsNotHealthy,
 										NoAvailablePhysicalShard,
 										NumberOfTypes,
 									};
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+									std::vector<int> retryFindDstReasonCount;
-												Add a counter to track physical shard creation through moves

											
										
										
											2022-10-20 13:09:04 +08:00
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									void startRelocation(int priority, int healthPriority) {
-												Count PRIORITY_TEAM_REDUNDANT as count PRIORITY_TEAM_UNHEALTHY

											
										
										
											2019-07-20 09:30:01 +08:00
+										// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
 										// we must count it into unhealthyRelocations; because team removers relies on unhealthyRelocations to
 										// ensure a team remover will not start before the previous one finishes removing a team and move away data
 										// NOTE: split and merge shard have higher priority. If they have to wait for unhealthyRelocations = 0,
 										// deadlock may happen: split/merge shard waits for unhealthyRelocations, while blocks team_redundant.
-												Make the DD priority associated with populating a remote region lower than machine failures

											
										
										
											2020-03-05 06:07:32 +08:00
+										if (healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
-												first working version of non-copying recovery working with fearless configurations

											
										
										
											2018-04-09 12:24:05 +08:00
+											unhealthyRelocations++;
 											rawProcessingUnhealthy->set(true);
 										}
-												consider wiggling when waitUntilHealthy

											
										
										
											2021-10-15 07:22:47 +08:00
+										if (healthPriority == SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE) {
 											rawProcessingWiggle->set(true);
 										}
-												first working version of non-copying recovery working with fearless configurations

											
										
										
											2018-04-09 12:24:05 +08:00
+										priority_relocations[priority]++;
 									}
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+									void finishRelocation(int priority, int healthPriority) {
-												Make the DD priority associated with populating a remote region lower than machine failures

											
										
										
											2020-03-05 06:07:32 +08:00
+										if (healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ||
 										    healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
-												first working version of non-copying recovery working with fearless configurations

											
										
										
											2018-04-09 12:24:05 +08:00
+											unhealthyRelocations--;
 											ASSERT(unhealthyRelocations >= 0);
 											if (unhealthyRelocations == 0) {
 												rawProcessingUnhealthy->set(false);
 											}
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+										}
-												first working version of non-copying recovery working with fearless configurations

											
										
										
											2018-04-09 12:24:05 +08:00
+										priority_relocations[priority]--;
-												consider wiggling when waitUntilHealthy

											
										
										
											2021-10-15 07:22:47 +08:00
+										if (priority_relocations[SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE] == 0) {
 											rawProcessingWiggle->set(false);
 										}
-												first working version of non-copying recovery working with fearless configurations

											
										
										
											2018-04-09 12:24:05 +08:00
+									}
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+									DDQueue(UID mid,
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+									        MoveKeysLock lock,
-												change shared_ptr to Reference

											
										
										
											2022-09-28 02:22:47 +08:00
+									        Reference<IDDTxnProcessor> db,
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+									        std::vector<TeamCollectionInterface> teamCollections,
 									        Reference<ShardsAffectedByTeamFailure> sABTF,
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+									        Reference<PhysicalShardCollection> physicalShardCollection,
-												ServerCounter.traceAll()

											
										
										
											2022-08-06 06:01:49 +08:00
+									        PromiseStream<Promise<int64_t>> getAverageShardBytes,
 									        int teamSize,
 									        int singleRegionTeamSize,
 									        PromiseStream<RelocateShard> output,
 									        FutureStream<RelocateShard> input,
 									        PromiseStream<GetMetricsRequest> getShardMetrics,
 									        PromiseStream<GetTopKMetricsRequest> getTopKMetrics)
-												rename dbProcessor to db; rename getDb() to context()

											
										
										
											2022-09-24 06:20:35 +08:00
+									  : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(db->context()), txnProcessor(db),
-												format code

											
										
										
											2022-08-17 14:37:55 +08:00
+									    teamCollections(teamCollections), shardsAffectedByTeamFailure(sABTF),
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+									    physicalShardCollection(physicalShardCollection), getAverageShardBytes(getAverageShardBytes),
-												Fix more -Wreorder-ctor warnings across many files

											
										
										
											2021-07-25 02:20:51 +08:00
+									    startMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
-												Added a flow lock to prevent too many source server fetches from happening at the same time and running the data distributor out of memory

											
										
										
											2020-07-10 01:38:19 +08:00
+									    finishMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									    cleanUpDataMoveParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM),
-												Fix more -Wreorder-ctor warnings across many files

											
										
										
											2021-07-25 02:20:51 +08:00
+									    fetchSourceLock(new FlowLock(SERVER_KNOBS->DD_FETCH_SOURCE_PARALLELISM)), activeRelocations(0),
 									    queuedRelocations(0), bytesWritten(0), teamSize(teamSize), singleRegionTeamSize(singleRegionTeamSize),
-												Remove last-limited check from DDMountainChopper and DDValleyFiller

											
										
										
											2022-05-31 12:57:34 +08:00
+									    output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
 									    suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
 									    rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
-												Add a counter to track physical shard creation throuogh moves

											
										
										
											2022-10-20 13:09:04 +08:00
+									    movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+									    moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) {
 									}
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+									DDQueue() = default; // NOTE(review): presumably for constructing bare instances in unit tests - confirm; members declared without an initializer (e.g. bytesWritten, teamSize) are not set by this constructor
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									void validate() {
 										if (EXPENSIVE_VALIDATION) {
 											for (auto it = fetchingSourcesQueue.begin(); it != fetchingSourcesQueue.end(); ++it) {
 												// relocates in the fetching queue do not have src servers yet.
 												if (it->src.size())
 													TraceEvent(SevError, "DDQueueValidateError1")
 													    .detail("Problem", "relocates in the fetching queue do not have src servers yet");
 												// relocates in the fetching queue do not have a work factor yet.
 												if (it->workFactor != 0.0)
 													TraceEvent(SevError, "DDQueueValidateError2")
 													    .detail("Problem", "relocates in the fetching queue do not have a work factor yet");
 												// relocates in the fetching queue are in the queueMap.
 												auto range = queueMap.rangeContaining(it->keys.begin);
 												if (range.value() != *it || range.range() != it->keys)
 													TraceEvent(SevError, "DDQueueValidateError3")
 													    .detail("Problem", "relocates in the fetching queue are in the queueMap");
 											}
 											/*
 											for( auto it = queue.begin(); it != queue.end(); ++it ) {
 											    for( auto rdit = it->second.begin(); rdit != it->second.end(); ++rdit ) {
 											        // relocates in the queue are in the queueMap exactly.
 											        auto range = queueMap.rangeContaining( rdit->keys.begin );
 											        if( range.value() != *rdit || range.range() != rdit->keys )
 											            TraceEvent(SevError, "DDQueueValidateError4").detail("Problem", "relocates in the queue are in the queueMap exactly")
-												Remove trace-calls to printable (in non-workloads)

											
										
										
											2019-03-19 06:03:43 +08:00
+											            .detail("RangeBegin", range.range().begin)
 											            .detail("RangeEnd", range.range().end)
 											            .detail("RelocateBegin2", range.value().keys.begin)
 											            .detail("RelocateEnd2", range.value().keys.end)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											            .detail("RelocateStart", range.value().startTime)
 											            .detail("MapStart", rdit->startTime)
 											            .detail("RelocateWork", range.value().workFactor)
 											            .detail("MapWork", rdit->workFactor)
 											            .detail("RelocateSrc", range.value().src.size())
 											            .detail("MapSrc", rdit->src.size())
 											            .detail("RelocatePrio", range.value().priority)
 											            .detail("MapPrio", rdit->priority);
-												apply clang-format to *.c, *.cpp, *.h, *.hpp files

											
										
										
											2021-03-11 02:06:03 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											        // relocates in the queue have src servers
 											        if( !rdit->src.size() )
 											            TraceEvent(SevError, "DDQueueValidateError5").detail("Problem", "relocates in the queue have src servers");
-												apply clang-format to *.c, *.cpp, *.h, *.hpp files

											
										
										
											2021-03-11 02:06:03 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											        // relocates in the queue do not have a work factor yet.
 											        if( rdit->workFactor != 0.0 )
 											            TraceEvent(SevError, "DDQueueValidateError6").detail("Problem", "relocates in the queue do not have a work factor yet");
-												apply clang-format to *.c, *.cpp, *.h, *.hpp files

											
										
										
											2021-03-11 02:06:03 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											        bool contains = false;
 											        for( int i = 0; i < rdit->src.size(); i++ ) {
 											            if( rdit->src[i] == it->first ) {
 											                contains = true;
 											                break;
 											            }
 											        }
 											        if( !contains )
 											            TraceEvent(SevError, "DDQueueValidateError7").detail("Problem", "queued relocate data does not include ss under which its filed");
 											    }
 											}*/
 											auto inFlightRanges = inFlight.ranges();
 											for (auto it = inFlightRanges.begin(); it != inFlightRanges.end(); ++it) {
 												for (int i = 0; i < it->value().src.size(); i++) {
 													// each server in the inFlight map is in the busymap
 													if (!busymap.count(it->value().src[i]))
 														TraceEvent(SevError, "DDQueueValidateError8")
 														    .detail("Problem", "each server in the inFlight map is in the busymap");
 													// relocate data that is inFlight is not also in the queue
 													if (queue[it->value().src[i]].count(it->value()))
 														TraceEvent(SevError, "DDQueueValidateError9")
 														    .detail("Problem", "relocate data that is inFlight is not also in the queue");
 												}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+												for (int i = 0; i < it->value().completeDests.size(); i++) {
 													// each server in the inFlight map is in the dest busymap
 													if (!destBusymap.count(it->value().completeDests[i]))
 														TraceEvent(SevError, "DDQueueValidateError10")
 														    .detail("Problem", "each server in the inFlight map is in the destBusymap");
 												}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												// in flight relocates have source servers
 												if (it->value().startTime != -1 && !it->value().src.size())
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+													TraceEvent(SevError, "DDQueueValidateError11")
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													    .detail("Problem", "in flight relocates have source servers");
 												if (inFlightActors.liveActorAt(it->range().begin)) {
 													// the key range in the inFlight map matches the key range in the RelocateData message
 													if (it->value().keys != it->range())
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+														TraceEvent(SevError, "DDQueueValidateError12")
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+														    .detail(
 														        "Problem",
 														        "the key range in the inFlight map matches the key range in the RelocateData message");
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+												} else if (it->value().cancellable) {
 													TraceEvent(SevError, "DDQueueValidateError13")
 													    .detail("Problem", "key range is cancellable but not in flight!")
 													    .detail("Range", it->range());
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												}
 											}
 											for (auto it = busymap.begin(); it != busymap.end(); ++it) {
 												for (int i = 0; i < it->second.ledger.size() - 1; i++) {
 													if (it->second.ledger[i] < it->second.ledger[i + 1])
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+														TraceEvent(SevError, "DDQueueValidateError14")
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+														    .detail("Problem", "ascending ledger problem")
-												Attempt to normalize trace events:

* Detail names now all start with an uppercase character and contain no underscores. Ideally these should be head-first camel case, though that was harder to check.
* Type names have the same rules, except they allow one underscore (to support a usage pattern Context_Type). The first character after the underscore is also uppercase.
* Use seconds instead of milliseconds in details.

Added a check when events are logged in simulation that logs a message to stderr if the first two rules above aren't followed.

This probably doesn't address every instance of the above problems, but all of the events I was able to hit in simulation pass the check.

											
										
										
											2018-06-09 02:11:08 +08:00
+														    .detail("LedgerLevel", i)
 														    .detail("LedgerValueA", it->second.ledger[i])
 														    .detail("LedgerValueB", it->second.ledger[i + 1]);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													if (it->second.ledger[i] < 0.0)
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+														TraceEvent(SevError, "DDQueueValidateError15")
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+														    .detail("Problem", "negative ascending problem")
 														    .detail("LedgerLevel", i)
 														    .detail("LedgerValue", it->second.ledger[i]);
 												}
 											}
 											for (auto it = destBusymap.begin(); it != destBusymap.end(); ++it) {
 												for (int i = 0; i < it->second.ledger.size() - 1; i++) {
 													if (it->second.ledger[i] < it->second.ledger[i + 1])
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+														TraceEvent(SevError, "DDQueueValidateError16")
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+														    .detail("Problem", "ascending ledger problem")
 														    .detail("LedgerLevel", i)
 														    .detail("LedgerValueA", it->second.ledger[i])
 														    .detail("LedgerValueB", it->second.ledger[i + 1]);
 													if (it->second.ledger[i] < 0.0)
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+														TraceEvent(SevError, "DDQueueValidateError17")
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+														    .detail("Problem", "negative ascending problem")
-												Attempt to normalize trace events:

* Detail names now all start with an uppercase character and contain no underscores. Ideally these should be head-first camel case, though that was harder to check.
* Type names have the same rules, except they allow one underscore (to support a usage pattern Context_Type). The first character after the underscore is also uppercase.
* Use seconds instead of milliseconds in details.

Added a check when events are logged in simulation that logs a message to stderr if the first two rules above aren't followed.

This probably doesn't address every instance of the above problems, but all of the events I was able to hit in simulation pass the check.

											
										
										
											2018-06-09 02:11:08 +08:00
+														    .detail("LedgerLevel", i)
 														    .detail("LedgerValue", it->second.ledger[i]);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												}
 											}
 											std::set<RelocateData, std::greater<RelocateData>> queuedRelocationsMatch;
 											for (auto it = queue.begin(); it != queue.end(); ++it)
 												queuedRelocationsMatch.insert(it->second.begin(), it->second.end());
 											ASSERT(queuedRelocations == queuedRelocationsMatch.size() + fetchingSourcesQueue.size());
 											int testActive = 0;
 											for (auto it = priority_relocations.begin(); it != priority_relocations.end(); ++it)
 												testActive += it->second;
 											ASSERT(activeRelocations + queuedRelocations == testActive);
 										}
 									}
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+									ACTOR static Future<Void> getSourceServersForRange(DDQueue* self,
-												add storeTuple and unit test; refactor getSourceServersForRange

											
										
										
											2022-06-10 03:16:12 +08:00
+									                                                   RelocateData input,
 									                                                   PromiseStream<RelocateData> output,
 									                                                   Reference<FlowLock> fetchLock) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 										// FIXME: is the merge case needed
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+										if (input.priority == SERVER_KNOBS->PRIORITY_MERGE_SHARD) {
-												Add logging to indicate the time spent at each priority that exceeds some minimum busyness threshold

											
										
										
											2020-02-08 06:34:24 +08:00
+											wait(delay(0.5, TaskPriority::DataDistributionVeryLow));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										} else {
-												A giant translation of TaskFooPriority -> TaskPriority::Foo

This is so that APIs that take priorities don't take ints, which are
common and easy to accidentally pass the wrong thing.

											
										
										
											2019-06-25 17:47:35 +08:00
+											wait(delay(0.0001, TaskPriority::DataDistributionLaunch));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										}
-												Added a flow lock to prevent too many source server fetches from happening at the same time and running the data distributor out of memory

											
										
										
											2020-07-10 01:38:19 +08:00
+										wait(fetchLock->take(TaskPriority::DataDistributionLaunch));
 										state FlowLock::Releaser releaser(*fetchLock);
-												use struct instead of tuple

											
										
										
											2022-06-14 02:27:50 +08:00
+										IDDTxnProcessor::SourceServers res = wait(self->txnProcessor->getSourceServersForRange(input.keys));
 										input.src = std::move(res.srcServers);
 										input.completeSources = std::move(res.completeSources);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										output.send(input);
 										return Void();
 									}
 									// This function cannot handle relocation requests which split a shard into three pieces
-												added a buggify + minor code cleanup

											
										
										
											2019-10-12 09:31:43 +08:00
+									void queueRelocation(RelocateShard rs, std::set<UID>& serversToLaunchFrom) {
 										// Registers the relocation request `rs` in the queue.
 										//   rs                  - the incoming request; it is wrapped in a RelocateData (`rd`) below.
 										//   serversToLaunchFrom - out-parameter; receives the source servers of every queued
 										//                         relocation that this request cancels or truncates.
 										// Steps: (1) merge intent from, and un-count, every queued relocation that intersects
 										// rd.keys, erasing the ones fully contained in it; (2) insert rd into queueMap and cancel
 										// the getSourceServers actors over the affected ranges; (3) re-register every affected
 										// range, either by (re)starting a source-server fetch or by re-inserting it, with its
 										// truncated key range, into the per-server queues.
-												Remove trace-calls to printable (in non-workloads)

											
										
										
											2019-03-19 06:03:43 +08:00
+										//TraceEvent("QueueRelocationBegin").detail("Begin", rd.keys.begin).detail("End", rd.keys.end);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 										// remove all items from both queues that are fully contained in the new relocation (i.e. will be overwritten)
-												added a buggify + minor code cleanup

											
										
										
											2019-10-12 09:31:43 +08:00
+										RelocateData rd(rs);
 										// Captured before the loop below may raise rd.priority, so they describe the
 										// priority of the incoming request itself.
 										bool hasHealthPriority = RelocateData::isHealthPriority(rd.priority);
 										bool hasBoundaryPriority = RelocateData::isBoundaryPriority(rd.priority);
-												apply clang-format to *.c, *.cpp, *.h, *.hpp files

											
										
										
											2021-03-11 02:06:03 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										auto ranges = queueMap.intersectingRanges(rd.keys);
 										for (auto r = ranges.begin(); r != ranges.end(); ++r) {
 											RelocateData& rrs = r->value();
 											auto fetchingSourcesItr = fetchingSourcesQueue.find(rrs);
 											bool foundActiveFetching = fetchingSourcesItr != fetchingSourcesQueue.end();
 											std::set<RelocateData, std::greater<RelocateData>>* firstQueue;
 											std::set<RelocateData, std::greater<RelocateData>>::iterator firstRelocationItr;
 											bool foundActiveRelocation = false;
 											// If rrs is not in fetchingSourcesQueue and has source servers, probe the queue of
 											// its first source server to decide whether it is currently queued.
 											if (!foundActiveFetching && rrs.src.size()) {
 												firstQueue = &queue[rrs.src[0]];
 												firstRelocationItr = firstQueue->find(rrs);
 												foundActiveRelocation = firstRelocationItr != firstQueue->end();
 											}
 											// If there is a queued job that wants data relocation which we are about to cancel/modify,
 											//  make sure that we keep the relocation intent for the job that we queue up
 											if (foundActiveFetching || foundActiveRelocation) {
 												rd.wantsNewServers |= rrs.wantsNewServers;
 												rd.startTime = std::min(rd.startTime, rrs.startTime);
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+												if (!hasHealthPriority) {
 													rd.healthPriority = std::max(rd.healthPriority, rrs.healthPriority);
 												}
 												if (!hasBoundaryPriority) {
 													rd.boundaryPriority = std::max(rd.boundaryPriority, rrs.boundaryPriority);
 												}
 												rd.priority = std::max(rd.priority, std::max(rd.boundaryPriority, rd.healthPriority));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
 											// Only relocations fully covered by the new range are erased here; partially
 											// overlapping ones stay in their queues and are re-keyed by the second loop below.
 											if (rd.keys.contains(rrs.keys)) {
 												if (foundActiveFetching)
 													fetchingSourcesQueue.erase(fetchingSourcesItr);
 												else if (foundActiveRelocation) {
 													firstQueue->erase(firstRelocationItr);
 													// src[0]'s entry was erased through the iterator above, so start at index 1.
 													for (int i = 1; i < rrs.src.size(); i++)
 														queue[rrs.src[i]].erase(rrs);
 												}
 											}
 											// Un-count every intersecting queued relocation, contained or not. The second loop
 											// below increments queuedRelocations again for each piece it re-registers.
 											if (foundActiveFetching || foundActiveRelocation) {
 												serversToLaunchFrom.insert(rrs.src.begin(), rrs.src.end());
 												/*TraceEvent(rrs.interval.end(), mi.id()).detail("Result","Cancelled")
 												    .detail("WasFetching", foundActiveFetching).detail("Contained", rd.keys.contains( rrs.keys ));*/
 												queuedRelocations--;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												TraceEvent(SevVerbose, "QueuedRelocationsChanged")
 												    .detail("DataMoveID", rrs.dataMoveId)
 												    .detail("RandomID", rrs.randomId)
 												    .detail("Total", queuedRelocations);
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+												finishRelocation(rrs.priority, rrs.healthPriority);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
 										}
 										// determine the final state of the relocations map
 										auto affectedQueuedItems = queueMap.getAffectedRangesAfterInsertion(rd.keys, rd);
 										// put the new request into the global map of requests (modifies the ranges already present)
 										queueMap.insert(rd.keys, rd);
 										// cancel all the getSourceServers actors that intersect the new range that we will be getting
 										getSourceActors.cancel(KeyRangeRef(affectedQueuedItems.front().begin, affectedQueuedItems.back().end));
 										// update fetchingSourcesQueue and the per-server queue based on truncated ranges after insertion, (re-)launch
 										// getSourceServers
 										// queueMapItr is advanced in lockstep with r, so it is expected to point at the
 										// queueMap entry for affectedQueuedItems[r] (see the commented-out ASSERT below).
 										auto queueMapItr = queueMap.rangeContaining(affectedQueuedItems[0].begin);
 										for (int r = 0; r < affectedQueuedItems.size(); ++r, ++queueMapItr) {
 											// ASSERT(queueMapItr->value() == queueMap.rangeContaining(affectedQueuedItems[r].begin)->value());
 											RelocateData& rrs = queueMapItr->value();
 											// Take the fetch path when the entry has no source servers yet and is either the new
 											// request itself or was still waiting in fetchingSourcesQueue (the erase() call both
 											// tests for and removes that old entry).
 											if (rrs.src.size() == 0 && (rrs.keys == rd.keys || fetchingSourcesQueue.erase(rrs) > 0)) {
 												rrs.keys = affectedQueuedItems[r];
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+												rrs.interval = TraceInterval("QueuedRelocation", rrs.randomId); // inherit the old randomId
 												DebugRelocationTraceEvent(rrs.interval.begin(), distributorId)
 												    .detail("KeyBegin", rrs.keys.begin)
 												    .detail("KeyEnd", rrs.keys.end)
 												    .detail("Priority", rrs.priority)
 												    .detail("WantsNewServers", rrs.wantsNewServers);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 												queuedRelocations++;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												TraceEvent(SevVerbose, "QueuedRelocationsChanged")
 												    .detail("DataMoveID", rrs.dataMoveId)
 												    .detail("RandomID", rrs.randomId)
 												    .detail("Total", queuedRelocations);
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+												startRelocation(rrs.priority, rrs.healthPriority);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 												fetchingSourcesQueue.insert(rrs);
-												add storeTuple and unit test; refactor getSourceServersForRange

											
										
										
											2022-06-10 03:16:12 +08:00
+												getSourceActors.insert(
-												smaller function

											
										
										
											2022-07-19 05:21:50 +08:00
+												    rrs.keys, getSourceServersForRange(this, rrs, fetchSourceServersComplete, fetchSourceLock));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											} else {
 												// Re-key path: newData is a copy of rrs carrying the truncated key range. Each
 												// per-server queue entry for rrs is replaced by newData.
 												RelocateData newData(rrs);
 												newData.keys = affectedQueuedItems[r];
 												ASSERT(rrs.src.size() || rrs.startTime == -1);
 												bool foundActiveRelocation = false;
 												for (int i = 0; i < rrs.src.size(); i++) {
 													auto& serverQueue = queue[rrs.src[i]];
 													if (serverQueue.erase(rrs) > 0) {
 														// Trace and count the re-queued relocation once, on the first server
 														// queue that actually held it.
 														if (!foundActiveRelocation) {
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+															newData.interval =
 															    TraceInterval("QueuedRelocation", rrs.randomId); // inherit the old randomId
 															DebugRelocationTraceEvent(newData.interval.begin(), distributorId)
 															    .detail("KeyBegin", newData.keys.begin)
 															    .detail("KeyEnd", newData.keys.end)
 															    .detail("Priority", newData.priority)
 															    .detail("WantsNewServers", newData.wantsNewServers);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+															queuedRelocations++;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+															TraceEvent(SevVerbose, "QueuedRelocationsChanged")
 															    .detail("DataMoveID", newData.dataMoveId)
 															    .detail("RandomID", newData.randomId)
 															    .detail("Total", queuedRelocations);
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+															startRelocation(newData.priority, newData.healthPriority);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+															foundActiveRelocation = true;
 														}
 														serverQueue.insert(newData);
 													} else
 														// NOTE(review): stops at the first source server whose queue lacks rrs;
 														// presumably a relocation is queued on all of its sources or on none - confirm.
 														break;
 												}
 												// We update the keys of a relocation even if it is "dead" since it helps validate()
 												rrs.keys = affectedQueuedItems[r];
 												rrs.interval = newData.interval;
 											}
 										}
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+										DebugRelocationTraceEvent("ReceivedRelocateShard", distributorId)
 										    .detail("KeyBegin", rd.keys.begin)
 										    .detail("KeyEnd", rd.keys.end)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										    .detail("Priority", rd.priority)
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+										    .detail("AffectedRanges", affectedQueuedItems.size());
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
-												Fix a data movement stuck bug

When moving keys to a team, if one of the servers in the target team died, then
the move can become stuck. This is because the DDTeamCollection waits for all
the data movement of the failed server to be completed. However, in this case,
because the movement has not finished yet, checking the database tells us there
is no key associated with this server and it is safe to go ahead. In reality,
only the in-memory structure knows there is pending movement, i.e., an unfinished
move causes some keys to be attributed to the failed server. Thus, the server
can't be removed yet. Fix by adding a check against the in-memory structure in
waitForAllDataRemoved().

Use const& to optimize a few function parameters.

											
										
										
											2019-02-08 07:31:03 +08:00
+									void completeSourceFetch(const RelocateData& results) {
 										// Moves `results` from the fetching-sources stage into the per-server queues:
 										// it is removed from fetchingSourcesQueue, written into queueMap over results.keys,
 										// and inserted into the queue of every server listed in results.src.
 										// Precondition (asserted): `results` is currently in fetchingSourcesQueue.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										ASSERT(fetchingSourcesQueue.count(results));
 										// logRelocation( results, "GotSourceServers" );
 										fetchingSourcesQueue.erase(results);
 										queueMap.insert(results.keys, results);
 										for (int i = 0; i < results.src.size(); i++) {
 											queue[results.src[i]].insert(results);
 										}
-												change canQueue to timeThrottle()

											
										
										
											2022-04-23 06:26:44 +08:00
+										updateLastAsSource(results.src);
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
 										// Bump the QueuedSource counter for the source team, keyed by the relocation's reason.
+										serverCounter.increaseForTeam(results.src, results.reason, ServerCounter::CountType::QueuedSource);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
-												Fix a data movement stuck bug

When moving keys to a team, if one of the servers in the target team died, then
the move can become stuck. This is because the DDTeamCollection waits for all
the data movement of the failed server to be completed. However, in this case,
because the movement has not finished yet, checking the database tells us there
is no key associated with this server and it is safe to go ahead. In reality,
only the in-memory structure knows there is pending movement, i.e., an unfinished
move causes some keys to be attributed to the failed server. Thus, the server
can't be removed yet. Fix by adding a check against the in-memory structure in
waitForAllDataRemoved().

Use const& to optimize a few function parameters.

											
										
										
											2019-02-08 07:31:03 +08:00
+									void logRelocation(const RelocateData& rd, const char* title) {
 										// Emits a TraceEvent named `title` (tagged with distributorId) that describes `rd`:
 										// its key range, priority, work factor, source-server count and list, and a
 										// "SourceBusyness" string built from busymap.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										std::string busyString;
 										// Only the first teamSize * 2 source servers contribute to the busyness string.
 										for (int i = 0; i < rd.src.size() && i < teamSize * 2; i++)
 											busyString += describe(rd.src[i]) + " - (" + busymap[rd.src[i]].toString() + "); ";
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+										TraceEvent(title, distributorId)
-												Addressed code review comments

											
										
										
											2019-04-06 04:11:50 +08:00
+										    .detail("KeyBegin", rd.keys.begin)
 										    .detail("KeyEnd", rd.keys.end)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										    .detail("Priority", rd.priority)
 										    .detail("WorkFactor", rd.workFactor)
 										    .detail("SourceServerCount", rd.src.size())
 										    // NOTE(review): the second argument to describe() presumably caps the number of
 										    // listed servers, matching the loop bound above - confirm.
 										    .detail("SourceServers", describe(rd.src, teamSize * 2))
 										    .detail("SourceBusyness", busyString);
 									}
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+									void launchQueuedWork(KeyRange keys, const DDEnabledState* ddEnabledState) {
 										// Key-range overload: gathers the queued relocations recorded in queueMap that
 										// intersect `keys` and forwards them, together with ddEnabledState, to the
 										// std::set<RelocateData, ...> overload of launchQueuedWork.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										// combine all queued work in the key range and check to see if there is anything to launch
 										std::set<RelocateData, std::greater<RelocateData>> combined;
 										auto f = queueMap.intersectingRanges(keys);
 										for (auto it = f.begin(); it != f.end(); ++it) {
 											// An entry counts as queued only if it has source servers and is present in the
 											// queue of its first source server.
 											if (it->value().src.size() && queue[it->value().src[0]].count(it->value()))
 												combined.insert(it->value());
 										}
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+										launchQueuedWork(combined, ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+									void launchQueuedWork(const std::set<UID>& serversToLaunchFrom, const DDEnabledState* ddEnabledState) {
 										// Server-set overload: collects candidate relocations from the queues of the given
 										// servers and forwards them, together with ddEnabledState, to the
 										// std::set<RelocateData, ...> overload of launchQueuedWork.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										// combine all work from the source servers to see if there is anything new to launch
 										std::set<RelocateData, std::greater<RelocateData>> combined;
 										for (auto id : serversToLaunchFrom) {
 											auto& queuedWork = queue[id];
 											auto it = queuedWork.begin();
 											// Take at most teamSize entries from the front of this server's queue.
 											// NOTE(review): the front is presumably the highest-priority work under the
 											// queue's std::greater ordering - confirm against RelocateData's comparison.
 											for (int j = 0; j < teamSize && it != queuedWork.end(); j++) {
 												combined.insert(*it);
 												++it;
 											}
 										}
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+										launchQueuedWork(combined, ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+									void launchQueuedWork(RelocateData launchData, const DDEnabledState* ddEnabledState) {
 										// Single-item overload: wraps `launchData` in a one-element set and forwards it,
 										// together with ddEnabledState, to the std::set<RelocateData, ...> overload.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										// check a single RelocateData to see if it can be launched
 										std::set<RelocateData, std::greater<RelocateData>> combined;
 										combined.insert(launchData);
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+										launchQueuedWork(combined, ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+									// For each relocateData rd in the queue, check if there exist inflight relocate data whose keyrange is overlapped
-												Minor improvement on comments

											
										
										
											2020-07-13 09:30:02 +08:00
+									// with rd. If there exist, cancel them by cancelling their actors and reducing the src servers' busyness of those
 									// canceled inflight relocateData. Launch the relocation for the rd.
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+									void launchQueuedWork(std::set<RelocateData, std::greater<RelocateData>> combined,
 									                      const DDEnabledState* ddEnabledState) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										int startedHere = 0;
 										double startTime = now();
 										// kick off relocators from items in the queue as need be
 										std::set<RelocateData, std::greater<RelocateData>>::iterator it = combined.begin();
 										for (; it != combined.end(); it++) {
 											RelocateData rd(*it);
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+											// Check if there is an inflight shard that is overlapped with the queued relocateShard (rd)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											bool overlappingInFlight = false;
 											auto intersectingInFlight = inFlight.intersectingRanges(rd.keys);
 											for (auto it = intersectingInFlight.begin(); it != intersectingInFlight.end(); ++it) {
-												Apply Clang format to PRIORITY_TEAM_REDUNDANT

											
										
										
											2019-07-20 09:32:05 +08:00
+												if (fetchKeysComplete.count(it->value()) && inFlightActors.liveActorAt(it->range().begin) &&
 												    !rd.keys.contains(it->range()) && it->value().priority >= rd.priority &&
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+												    rd.healthPriority < SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) {
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
 													DebugRelocationTraceEvent("OverlappingInFlight", distributorId)
-												Addressed code review comments

											
										
										
											2019-04-06 04:11:50 +08:00
+													    .detail("KeyBegin", it->value().keys.begin)
 													    .detail("KeyEnd", it->value().keys.end)
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+													    .detail("Priority", it->value().priority);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													overlappingInFlight = true;
 													break;
 												}
 											}
 											if (overlappingInFlight) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												ASSERT(!rd.isRestore());
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												// logRelocation( rd, "SkippingOverlappingInFlight" );
 												continue;
 											}
 											// Because the busyness of a server is decreased when a superseding relocation is issued, we
 											//  need to consider what the busyness of a server WOULD be if
 											auto containedRanges = inFlight.containedRanges(rd.keys);
 											std::vector<RelocateData> cancellableRelocations;
 											for (auto it = containedRanges.begin(); it != containedRanges.end(); ++it) {
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+												if (it.value().cancellable) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													cancellableRelocations.push_back(it->value());
 												}
 											}
-												DD:Add comments to help understand code

Add comments to explain the functionalities of some code.

											
										
										
											2019-07-20 07:22:15 +08:00
+											// Data movement avoids overloading source servers in moving data.
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+											// SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the
 											// queue
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											// FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+											if (!rd.isRestore() && !canLaunchSrc(rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations)) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												// logRelocation( rd, "SkippingQueuedRelocation" );
 												continue;
 											}
-												DD:Add comments to help understand code

Add comments to explain the functionalities of some code.

											
										
										
											2019-07-20 07:22:15 +08:00
+											// From now on, the source servers for the RelocateData rd have enough resource to move the data away,
 											// because they do not have too much inflight data movement.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											// logRelocation( rd, "LaunchingRelocation" );
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+											DebugRelocationTraceEvent(rd.interval.end(), distributorId).detail("Result", "Success");
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+											if (!rd.isRestore()) {
 												queuedRelocations--;
 												TraceEvent(SevVerbose, "QueuedRelocationsChanged")
 												    .detail("DataMoveID", rd.dataMoveId)
 												    .detail("RandomID", rd.randomId)
 												    .detail("Total", queuedRelocations);
 												finishRelocation(rd.priority, rd.healthPriority);
 												// now we are launching: remove this entry from the queue of all the src servers
 												for (int i = 0; i < rd.src.size(); i++) {
 													ASSERT(queue[rd.src[i]].erase(rd));
 												}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+											Future<Void> fCleanup =
-												Change SHARD_ENCODE_LOCATION_METADATA to a server knob. (#7770)

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-08-04 04:51:40 +08:00
+											    SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA ? cancelDataMove(this, rd.keys, ddEnabledState) : Void();
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											// If there is a job in flight that wants data relocation which we are about to cancel/modify,
 											//     make sure that we keep the relocation intent for the job that we launch
 											auto f = inFlight.intersectingRanges(rd.keys);
 											for (auto it = f.begin(); it != f.end(); ++it) {
 												if (inFlightActors.liveActorAt(it->range().begin)) {
 													rd.wantsNewServers |= it->value().wantsNewServers;
 												}
 											}
 											startedHere++;
 											// update both inFlightActors and inFlight key range maps, cancelling deleted RelocateShards
-												Cleanup DataDistributionQueue.actor.cpp and storageserver.actor.cpp

											
										
										
											2020-11-04 12:24:39 +08:00
+											std::vector<KeyRange> ranges;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											inFlightActors.getRangesAffectedByInsertion(rd.keys, ranges);
 											inFlightActors.cancel(KeyRangeRef(ranges.front().begin, ranges.back().end));
 											inFlight.insert(rd.keys, rd);
 											for (int r = 0; r < ranges.size(); r++) {
 												RelocateData& rrs = inFlight.rangeContaining(ranges[r].begin)->value();
 												rrs.keys = ranges[r];
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												if (rd.keys == ranges[r] && rd.isRestore()) {
 													ASSERT(rd.dataMove != nullptr);
-												Change SHARD_ENCODE_LOCATION_METADATA to a server knob. (#7770)

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-08-04 04:51:40 +08:00
+													ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+													rrs.dataMoveId = rd.dataMove->meta.id;
 												} else {
 													ASSERT_WE_THINK(!rd.isRestore()); // Restored data move should not overlap.
 													// TODO(psm): The shard id is determined by DD.
 													rrs.dataMove.reset();
-												Change SHARD_ENCODE_LOCATION_METADATA to a server knob. (#7770)

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-08-04 04:51:40 +08:00
+													if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+														if (SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 															rrs.dataMoveId = UID();
 														} else {
 															rrs.dataMoveId = deterministicRandom()->randomUniqueID();
 														}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+													} else {
 														rrs.dataMoveId = anonymousShardId;
 													}
 												}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload

											
										
										
											2020-03-05 06:17:17 +08:00
+												launch(rrs, busymap, singleRegionTeamSize);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												activeRelocations++;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												TraceEvent(SevVerbose, "InFlightRelocationChange")
 												    .detail("Launch", rrs.dataMoveId)
 												    .detail("Total", activeRelocations);
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+												startRelocation(rrs.priority, rrs.healthPriority);
-												StorageEngineSwitch:Graceful switch

When fdbcli change storeType for storage engines,
we switch the store type of storage servers one by one gracefully.
This avoids recruiting multiple storage servers on the same process,
which can cause OOM error.

											
										
										
											2019-08-13 01:08:12 +08:00
+												// Start the actor that relocates data in the rrs.keys
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												inFlightActors.insert(rrs.keys, dataDistributionRelocator(this, rrs, fCleanup, ddEnabledState));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
 											// logRelocation( rd, "LaunchedRelocation" );
 										}
-												Replace g_random and g_nondeterministic_random with functions deterministicRandom() and nondeterministicRandom() that return thread_local random number generators. Delete g_debug_random and trace_random. Allow only deterministicRandom() to be seeded, and require it to be seeded from each thread on which it is used.

											
										
										
											2019-05-11 05:01:52 +08:00
+										if (now() - startTime > .001 && deterministicRandom()->random01() < 0.001)
-												Attempt to normalize trace events:

* Detail names now all start with an uppercase character and contain no underscores. Ideally these should be head-first camel case, though that was harder to check.
* Type names have the same rules, except they allow one underscore (to support a usage pattern Context_Type). The first character after the underscore is also uppercase.
* Use seconds instead of milliseconds in details.

Added a check when events are logged in simulation that logs a message to stderr if the first two rules above aren't followed.

This probably doesn't address every instance of the above problems, but all of the events I was able to hit in simulation pass the check.

											
										
										
											2018-06-09 02:11:08 +08:00
+											TraceEvent(SevWarnAlways, "LaunchingQueueSlowx1000").detail("Elapsed", now() - startTime);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 										/*if( startedHere > 0 ) {
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+										    TraceEvent("StartedDDRelocators", distributorId)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										        .detail("QueueSize", queuedRelocations)
 										        .detail("StartedHere", startedHere)
 										        .detail("ActiveRelocations", activeRelocations);
 										} */
 										validate();
 									}
-												Add MAX_SNAPSHOT_FAULT_TOLERANCE knob

											
										
										
											2022-04-04 13:31:45 +08:00
 									// Walks priority_relocations and reports the largest priority whose count is
 									// positive. Yields 0 when no entry has a positive count.
 									int getHighestPriorityRelocation() const {
 										int best = 0;
 										for (const auto& [prio, pending] : priority_relocations) {
 											if (pending <= 0) {
 												// Nothing counted at this priority; it cannot raise the result.
 												continue;
 											}
 											if (best < prio) {
 												best = prio;
 											}
 										}
 										return best;
 									}
-												add canQueue

											
										
										
											2022-04-21 06:28:03 +08:00
-												change canQueue to timeThrottle()

											
										
										
											2022-04-23 06:26:44 +08:00
+									// return true if the servers are throttled as source for read rebalance
-												add storage metric compare knob; timeThrottle with constant

											
										
										
											2022-04-28 14:37:35 +08:00
+									bool timeThrottle(const std::vector<UID>& ids) const {
 										// True as soon as one id in `ids` has an entry in lastAsSource whose age,
 										// (now() - recorded time) multiplied by READ_REBALANCE_SRC_PARALLELISM, is still
 										// below STORAGE_METRICS_AVERAGE_INTERVAL. Ids without an entry never throttle.
 										// An empty `ids` yields false (std::any_of over an empty range).
 										return std::any_of(ids.begin(), ids.end(), [this](const UID& id) {
-												change canQueue to timeThrottle()

											
										
										
											2022-04-23 06:26:44 +08:00
+											if (this->lastAsSource.count(id)) {
-												change all criteria to knobs

											
										
										
											2022-05-13 07:30:21 +08:00
+												return (now() - this->lastAsSource.at(id)) * SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM <
 												       SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL;
-												move canQueue, 60s each source server, random select portion of shards

											
										
										
											2022-04-21 13:19:56 +08:00
+											}
-												change canQueue to timeThrottle()

											
										
										
											2022-04-23 06:26:44 +08:00
+											return false;
-												add canQueue

											
										
										
											2022-04-21 06:28:03 +08:00
+										});
 									}
-												change canQueue to timeThrottle()

											
										
										
											2022-04-23 06:26:44 +08:00
 									void updateLastAsSource(const std::vector<UID>& ids, double t = now()) {
 										for (auto& id : ids)
 											lastAsSource[id] = t;
 									}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
 									// Schedules cancellation of a data move.
 									// Starts cleanUpDataMove() for `dataMoveId` over `range` and records the move, with that
 									// future as its `cancel` member, in dataMoves under `range`.
 									// If any range in dataMoves intersecting `range` already holds a valid data move, a
 									// SevError DDEnqueueCancelledDataMoveConflict event is logged and nothing is enqueued.
 									void enqueueCancelledDataMove(UID dataMoveId, KeyRange range, const DDEnabledState* ddEnabledState) {
-												DDQueue constructor with ITxnProcessor

											
										
										
											2022-09-22 01:56:22 +08:00
+										ASSERT(!txnProcessor->isMocked()); // the mock implementation currently doesn't support data move
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										std::vector<Future<Void>> cleanup;
 										// NOTE(review): `cleanup` above is never referenced again in this function.
 										auto f = this->dataMoves.intersectingRanges(range);
 										// Refuse to enqueue when an overlapping range already carries a valid data move.
 										for (auto it = f.begin(); it != f.end(); ++it) {
 											if (it->value().isValid()) {
 												TraceEvent(SevError, "DDEnqueueCancelledDataMoveConflict", this->distributorId)
 												    .detail("DataMoveID", dataMoveId)
 												    .detail("CancelledRange", range)
 												    .detail("ConflictingDataMoveID", it->value().id)
 												    .detail("ConflictingRange", KeyRangeRef(it->range().begin, it->range().end));
 												return;
 											}
 										}
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+										DDQueue::DDDataMove dataMove(dataMoveId);
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										dataMove.cancel = cleanUpDataMove(
 										    this->cx, dataMoveId, this->lock, &this->cleanUpDataMoveParallelismLock, range, ddEnabledState);
 										// Record the move (with its in-progress cancel future) for `range`.
 										this->dataMoves.insert(range, dataMove);
 										TraceEvent(SevInfo, "DDEnqueuedCancelledDataMove", this->distributorId)
 										    .detail("DataMoveID", dataMoveId)
 										    .detail("Range", range);
 									}
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
 									// Returns the future produced by recurring() for a callback that traces all of
 									// serverCounter (tagged with distributorId) and then clears it, using the
 									// DD_QUEUE_COUNTER_REFRESH_INTERVAL knob as the interval argument.
 									// NOTE(review): presumably recurring() re-invokes the callback once per interval for as
 									// long as the returned future is held; confirm against its definition.
 									Future<Void> periodicalRefreshCounter() {
 										// The callback captures `this`; it reads and resets this object's serverCounter.
 										auto f = [this]() {
 											serverCounter.traceAll(distributorId);
 											serverCounter.clear();
 										};
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+										return recurring(f, SERVER_KNOBS->DD_QUEUE_COUNTER_REFRESH_INTERVAL);
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+									}
-												split DD related headers

											
										
										
											2022-08-17 05:32:55 +08:00
 									// Reports the current value of the unhealthyRelocations counter.
 									int getUnhealthyRelocationCount() override {
 										return unhealthyRelocations;
 									}
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
 									// The three members below are declarations only; their bodies are not in this part
 									// of the file.

 									// Asynchronously yields a SrcDestTeamPair (a pair of team references) for the team
 									// collection at `teamCollectionIndex`, given one GetTeamRequest per side.
 									// NOTE(review): presumably pair.first is the source team and pair.second the
 									// destination; confirm against the definition.
 									Future<SrcDestTeamPair> getSrcDestTeams(const int& teamCollectionIndex,
 									                                        const GetTeamRequest& srcReq,
 									                                        const GetTeamRequest& destReq,
 									                                        const int& priority,
 									                                        TraceEvent* traceEvent);
 									// Asynchronously yields a bool for a read-load rebalance between the two teams.
 									// NOTE(review): the meaning of the bool (presumably "a move was started") and of
 									// `primary` is not visible here; confirm against the definition.
 									Future<bool> rebalanceReadLoad(DataMovementReason moveReason,
 									                               Reference<IDataDistributionTeam> sourceTeam,
 									                               Reference<IDataDistributionTeam> destTeam,
 									                               bool primary,
 									                               TraceEvent* traceEvent);
 									// Same shape as rebalanceReadLoad, but takes the teams as references to const.
 									// NOTE(review): result semantics are not visible here; confirm against the definition.
 									Future<bool> rebalanceTeams(DataMovementReason moveReason,
 									                            Reference<IDataDistributionTeam const> sourceTeam,
 									                            Reference<IDataDistributionTeam const> destTeam,
 									                            bool primary,
 									                            TraceEvent* traceEvent);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								};
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<Void> cancelDataMove(struct DDQueue* self, KeyRange range, const DDEnabledState* ddEnabledState) {
 									// Cancels every valid data move in self->dataMoves that intersects `range`: starts
 									// cleanUpDataMove() for moves that have no cancel future yet, waits for all cancel
 									// futures, then overwrites the affected span of dataMoves with a default-constructed
 									// DDDataMove.
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									std::vector<Future<Void>> cleanup;
 									auto f = self->dataMoves.intersectingRanges(range);
 									for (auto it = f.begin(); it != f.end(); ++it) {
 										// Entries that hold no valid data move need no cleanup.
 										if (!it->value().isValid()) {
 											continue;
 										}
 										KeyRange keys = KeyRangeRef(it->range().begin, it->range().end);
 										TraceEvent(SevInfo, "DDQueueCancelDataMove", self->distributorId)
 										    .detail("DataMoveID", it->value().id)
 										    .detail("DataMoveRange", keys)
 										    .detail("Range", range);
 										// Reuse a cancel future that was already started instead of launching a second one.
 										if (!it->value().cancel.isValid()) {
 											it->value().cancel = cleanUpDataMove(
 											    self->cx, it->value().id, self->lock, &self->cleanUpDataMoveParallelismLock, keys, ddEnabledState);
 										}
 										cleanup.push_back(it->value().cancel);
 									}
 									wait(waitForAll(cleanup));
 									// The map is queried again after the wait; the reset spans from the beginning of the
 									// first affected range to the end of the last one.
 									auto ranges = self->dataMoves.getAffectedRangesAfterInsertion(range);
 									if (!ranges.empty()) {
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+										self->dataMoves.insert(KeyRangeRef(ranges.front().begin, ranges.back().end), DDQueue::DDDataMove());
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+									}
 									return Void();
 								}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+								static std::string destServersString(std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> const& bestTeams) {
 									// Produces one string containing the id of every server of every team in bestTeams,
 									// in iteration order, each id followed by a single space (so a non-empty result ends
 									// with a space). The bool of each pair is not consulted.
 									std::stringstream joined;
 									for (const auto& teamEntry : bestTeams) {
 										const auto& team = teamEntry.first;
 										for (const auto& serverId : team->getServerIDs()) {
 											joined << serverId.toString();
 											joined << " ";
 										}
 									}
 									return std::move(joined).str();
 								}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+								// This actor relocates the specified keys to a good place.
-												DD:Add comments to help understand code

Add comments to explain the functionalities of some code.

											
										
										
											2019-07-20 07:22:15 +08:00
+								// The inFlightActor key range map stores the actor for each RelocateData
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+								                                             RelocateData rd,
 								                                             Future<Void> prevCleanup,
 								                                             const DDEnabledState* ddEnabledState) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									state Promise<Void> errorOut(self->error);
-												DebugRelocationTraceEvent; TraceInterval randomId:

											
										
										
											2022-08-05 06:28:33 +08:00
+									state TraceInterval relocateShardInterval("RelocateShard", rd.randomId);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									state PromiseStream<RelocateData> dataTransferComplete(self->dataTransferComplete);
 									state PromiseStream<RelocateData> relocationComplete(self->relocationComplete);
 									state bool signalledTransferComplete = false;
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+									state UID distributorId = self->distributorId;
-												removed a separately configurable storage team size for the remote data center, because it did not make sense
fix: the master did not monitor for the failure of remote logs
stop merge attempts when a data center is failed
fixed a variety of other problems with data distribution when a data center is failed

											
										
										
											2018-02-03 03:46:04 +08:00
+									state ParallelTCInfo healthyDestinations;
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
-												removed a separately configurable storage team size for the remote data center, because it did not make sense
fix: the master did not monitor for the failure of remote logs
stop merge attempts when a data center is failed
fixed a variety of other problems with data distribution when a data center is failed

											
										
										
											2018-02-03 03:46:04 +08:00
+									state bool anyHealthy = false;
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+									state bool allHealthy = true;
 									state bool anyWithSource = false;
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+									state bool anyDestOverloaded = false;
 									state int destOverloadedCount = 0;
 									state int stuckCount = 0;
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+									state std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> bestTeams;
-												added additional trace events to warn when different parts of shard relocations take more than 10 minutes

											
										
										
											2019-08-17 05:56:58 +08:00
+									state double startTime = now();
 									state std::vector<UID> destIds;
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+									state uint64_t debugID = deterministicRandom()->randomUInt64();
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									try {
-												suppressed trace events that are spammy

											
										
										
											2018-02-17 08:01:19 +08:00
+										if (now() - self->lastInterval < 1.0) {
 											relocateShardInterval.severity = SevDebug;
 											self->suppressIntervals++;
 										}
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+										TraceEvent(relocateShardInterval.begin(), distributorId)
-												Remove trace-calls to printable (in non-workloads)

											
										
										
											2019-03-19 06:03:43 +08:00
+										    .detail("KeyBegin", rd.keys.begin)
 										    .detail("KeyEnd", rd.keys.end)
-												suppressed trace events that are spammy

											
										
										
											2018-02-17 08:01:19 +08:00
+										    .detail("Priority", rd.priority)
 										    .detail("SuppressedEventCount", self->suppressIntervals);
 										if (relocateShardInterval.severity != SevDebug) {
 											self->lastInterval = now();
 											self->suppressIntervals = 0;
 										}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Change SHARD_ENCODE_LOCATION_METADATA to a server knob. (#7770)

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-08-04 04:51:40 +08:00
+										if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+											auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
 											ASSERT(inFlightRange.range() == rd.keys);
 											ASSERT(inFlightRange.value().randomId == rd.randomId);
 											ASSERT(inFlightRange.value().dataMoveId == rd.dataMoveId);
 											inFlightRange.value().cancellable = false;
 											wait(prevCleanup);
 											auto f = self->dataMoves.intersectingRanges(rd.keys);
 											for (auto it = f.begin(); it != f.end(); ++it) {
 												KeyRangeRef kr(it->range().begin, it->range().end);
 												const UID mId = it->value().id;
 												if (mId.isValid() && mId != rd.dataMoveId) {
 													TraceEvent("DDRelocatorConflictingDataMove", distributorId)
 													    .detail("CurrentDataMoveID", rd.dataMoveId)
 													    .detail("DataMoveID", mId)
 													    .detail("Range", kr);
 												}
 											}
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+											if (rd.isRestore() || !SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 												if (SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 													ASSERT(rd.dataMoveId.isValid());
 												}
 												self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
 											}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										}
-												reset several method use getShardMetrics

											
										
										
											2022-05-04 15:00:03 +08:00
+										state StorageMetrics metrics =
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										    wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(rd.keys))));
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+										state uint64_t physicalShardIDCandidate = UID().first();
 										state bool forceToUseNewPhysicalShard = false;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										ASSERT(rd.src.size());
 										loop {
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+											destOverloadedCount = 0;
 											stuckCount = 0;
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+											state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None;
-												TeamCollection: clang-format

Format the changes with git clang-format.
No functional changes.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-11-22 03:18:26 +08:00
+											// state int bestTeamStuckThreshold = 50;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											loop {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+												state int tciIndex = 0;
 												state bool foundTeams = true;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												state bool bestTeamReady = false;
-												removed a separately configurable storage team size for the remote data center, because it did not make sense
fix: the master did not monitor for the failure of remote logs
stop merge attempts when a data center is failed
fixed a variety of other problems with data distribution when a data center is failed

											
										
										
											2018-02-03 03:46:04 +08:00
+												anyHealthy = false;
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+												allHealthy = true;
 												anyWithSource = false;
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+												anyDestOverloaded = false;
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+												bestTeams.clear();
-												Revert "Revert "Properly set simulation test for perpetual storage wiggle and bug fixing""

This reverts commit ad576e8c2022b1e9ff92ce3adbf5086e317b9353.

											
										
										
											2021-06-12 06:58:05 +08:00
+												// Get team from teamCollections in different DCs and find the best one
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+												while (tciIndex < self->teamCollections.size()) {
-												Change SHARD_ENCODE_LOCATION_METADATA to a server knob. (#7770)

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-08-04 04:51:40 +08:00
+													if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && rd.isRestore()) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+														auto req = GetTeamRequest(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest);
 														Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
 														    brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
 														bestTeamReady = fbestTeam.isReady();
 														std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
 														if (tciIndex > 0 && !bestTeamReady) {
 															// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after
 															// getting the destination team or we could miss failure notifications for the storage
 															// servers in the destination team
 															TraceEvent("BestTeamNotReady")
 															    .detail("TeamCollectionIndex", tciIndex)
 															    .detail("RestoreDataMoveForDest",
 															            describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest));
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+															retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+															foundTeams = false;
 															break;
 														}
 														if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) {
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+															retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
 															                                   : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+															foundTeams = false;
 															break;
 														}
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+														anyHealthy = true;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+														bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
 													} else {
 														double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY;
 														if (rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
 														    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT)
 															inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
 														if (rd.healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
 														    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
 														    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT)
 															inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
 														auto req = GetTeamRequest(WantNewServers(rd.wantsNewServers),
 														                          WantTrueBest(isValleyFillerPriority(rd.priority)),
 														                          PreferLowerDiskUtil::True,
 														                          TeamMustHaveShards::False,
 														                          ForReadBalance(rd.reason == RelocateReason::REBALANCE_READ),
 														                          PreferLowerReadUtil::True,
 														                          inflightPenalty);
 														req.src = rd.src;
 														req.completeSources = rd.completeSources;
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+														if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
 														    tciIndex == 1) {
 															ASSERT(physicalShardIDCandidate != UID().first() &&
 															       physicalShardIDCandidate != anonymousShardId.first());
 															Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
 															    self->physicalShardCollection->tryGetAvailableRemoteTeamWith(
 															        physicalShardIDCandidate, metrics, debugID);
 															if (remoteTeamWithPhysicalShard.present()) {
 																// Exists a remoteTeam in the mapping that has the physicalShardIDCandidate
 																// use the remoteTeam with the physicalShard as the bestTeam
 																req = GetTeamRequest(remoteTeamWithPhysicalShard.get().servers);
 															}
 														}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+														// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
 														// server that hosts the relocateData. This is possible, for example, in a fearless
 														// configuration when the remote DC is just brought up.
 														Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
 														    brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
 														bestTeamReady = fbestTeam.isReady();
 														std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
 														if (tciIndex > 0 && !bestTeamReady) {
 															// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after
 															// getting the destination team or we could miss failure notifications for the storage
 															// servers in the destination team
 															TraceEvent("BestTeamNotReady");
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+															retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+															foundTeams = false;
 															break;
 														}
 														// If a DC has no healthy team, we stop checking the other DCs until
 														// the unhealthy DC is healthy again or is excluded.
 														if (!bestTeam.first.present()) {
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+															retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
 															                                   : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+															foundTeams = false;
 															break;
 														}
 														if (!bestTeam.first.get()->isHealthy()) {
 															allHealthy = false;
 														} else {
 															anyHealthy = true;
 														}
-												fix: the check for if a teamCollection was tracking a source server was unreliable, leading to scenarios where we would temporarily replicate a shard less than teamSize

											
										
										
											2020-06-30 01:02:27 +08:00
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+														if (bestTeam.second) {
 															anyWithSource = true;
 														}
-												Add comment to DDQueue GetTeam

Comments to help understand Evan's PR 3487
that fixes the problem: replication factor could drop unexpectedly in fearless config.

											
										
										
											2020-07-14 08:05:12 +08:00
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+														if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 															// critical to the correctness of team selection by PhysicalShardCollection
 															// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
 															// team Thus, tryGetAvailableRemoteTeamWith() may select an almost full remote team In this
 															// case, we must re-select a remote team We set foundTeams = false to avoid finishing team
 															// selection Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select
 															// a remote team
 															if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
 																bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
 																if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+																	retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+																	foundTeams = false;
 																	break;
 																}
 															}
 														}
 														if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 															bestTeams.emplace_back(bestTeam.first.get(), true);
 															// Always set bestTeams[i].second = true to disable optimization in data move between DCs
 															// for the correctness of PhysicalShardCollection
 															// Currently, enabling the optimization will break the invariant of PhysicalShardCollection
 															// Invariant: once a physical shard is created with a specific set of SSes, this SS set will
 															// never get changed.
 														} else {
 															bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
 														}
 														// get physicalShardIDCandidate
 														if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
 														    tciIndex == 0) {
 															ASSERT(foundTeams);
 															ShardsAffectedByTeamFailure::Team primaryTeam =
 															    ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
 															physicalShardIDCandidate =
 															    self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
 															        primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
 															ASSERT(physicalShardIDCandidate != UID().first() &&
 															       physicalShardIDCandidate != anonymousShardId.first());
 														}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+													}
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+													tciIndex++;
 												}
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
 												// critical to the correctness of team selection by PhysicalShardCollection
 												// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary team
 												// Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team
 												// In this case, we must re-select a remote team
 												// We set foundTeams = false to avoid finishing team selection
 												// Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select a remote team
 												if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
 												    bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
 													if (!bestTeams[1].first->isHealthy()) {
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+														retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+														foundTeams = false;
 													}
 												}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+												// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
 												// already
 												anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);
 												if (foundTeams && anyHealthy && !anyDestOverloaded) {
 													ASSERT(rd.completeDests.empty());
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													break;
 												}
-												TeamCollection: Use machine team to create server team

Current server team collection logic does not consider
the fact that multiple storage servers can run on the same machine.
When multiple machines fail, all servers on the machines will fail, and
the possibility of having one process team fail and lose data is very high.

To reduce the possibility of losing data when multiple machine fails,
we first create machine teams which span across different fault zones;
we then create server teams based on machine teams by
first picking 1 machine team, and then
picking 1 server from each machine in the machine team.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-08-30 05:40:39 +08:00
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+												if (anyDestOverloaded) {
-												Make TEST macros C++ only (#7558)

* proof of concept

* use code-probe instead of test

* code probe working on gcc

* code probe implemented

* renamed TestProbe to CodeProbe

* fixed refactoring typo

* support filtered output

* print probes at end of simulation

* fix missed probes print

* fix deduplication

* Fix refactoring issues

* revert bad refactor

* make sure file paths are relative

* fix more wrong refactor changes
											
										
										
											2022-07-20 04:15:51 +08:00
+													CODE_PROBE(true, "Destination overloaded throttled move");
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+													destOverloadedCount++;
 													TraceEvent(destOverloadedCount > 50 ? SevInfo : SevDebug, "DestSSBusy", distributorId)
 													    .suppressFor(1.0)
 													    .detail("StuckCount", stuckCount)
 													    .detail("DestOverloadedCount", destOverloadedCount)
 													    .detail("TeamCollectionId", tciIndex)
 													    .detail("AnyDestOverloaded", anyDestOverloaded)
 													    .detail("NumOfTeamCollections", self->teamCollections.size())
 													    .detail("Servers", destServersString(bestTeams));
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+													if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 														if (rd.isRestore() && destOverloadedCount > 50) {
 															throw data_move_dest_team_not_found();
 														}
 													}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+													wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch));
 												} else {
-												Make TEST macros C++ only (#7558)

* proof of concept

* use code-probe instead of test

* code probe working on gcc

* code probe implemented

* renamed TestProbe to CodeProbe

* fixed refactoring typo

* support filtered output

* print probes at end of simulation

* fix missed probes print

* fix deduplication

* Fix refactoring issues

* revert bad refactor

* make sure file paths are relative

* fix more wrong refactor changes
											
										
										
											2022-07-20 04:15:51 +08:00
+													CODE_PROBE(true, "did not find a healthy destination team on the first attempt");
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+													stuckCount++;
 													TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId)
 													    .suppressFor(1.0)
 													    .detail("StuckCount", stuckCount)
 													    .detail("DestOverloadedCount", destOverloadedCount)
 													    .detail("TeamCollectionId", tciIndex)
 													    .detail("AnyDestOverloaded", anyDestOverloaded)
 													    .detail("NumOfTeamCollections", self->teamCollections.size());
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+													if (rd.isRestore() && stuckCount > 50) {
 														throw data_move_dest_team_not_found();
 													}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+													wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch));
 												}
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+												// When forceToUseNewPhysicalShard = false, we get paired primary team and remote team
 												// However, this may be failed
 												// Any retry triggers to use new physicalShard which enters the normal routine
 												if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 													forceToUseNewPhysicalShard = true;
 												}
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
 												// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+											if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 												if (!rd.isRestore()) {
 													// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
 													// thus, update the physicalShardIDCandidate to related data structures
 													ASSERT(physicalShardIDCandidate != UID().first());
-												Add a counter to track physical shard creation through moves

											
										
										
											2022-10-20 13:09:04 +08:00
+													if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
 														self->moveReusePhysicalShard++;
 													} else {
 														self->moveCreateNewPhysicalShard++;
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+														if (retryFindDstReason == DDQueue::RetryFindDstReason::None) {
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+															// When creating a new physical shard, but the reason is none, this can only happen when
 															// determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical
 															// shard.
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+															self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+														} else {
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+															self->retryFindDstReasonCount[retryFindDstReason]++;
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+														}
-												Add a counter to track physical shard creation through moves

											
										
										
											2022-10-20 13:09:04 +08:00
+													}
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+													rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
 													auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
 													inFlightRange.value().dataMoveId = rd.dataMoveId;
 													auto f = self->dataMoves.intersectingRanges(rd.keys);
 													for (auto it = f.begin(); it != f.end(); ++it) {
 														KeyRangeRef kr(it->range().begin, it->range().end);
 														const UID mId = it->value().id;
 														if (mId.isValid() && mId != rd.dataMoveId) {
 															TraceEvent("DDRelocatorConflictingDataMoveAfterGetTeam", distributorId)
 															    .detail("CurrentDataMoveID", rd.dataMoveId)
 															    .detail("DataMoveID", mId)
 															    .detail("Range", kr);
 														}
 													}
 													self->dataMoves.insert(rd.keys, DDQueue::DDDataMove(rd.dataMoveId));
 												}
 												ASSERT(rd.dataMoveId.first() != UID().first());
 												auto dataMoveRange = self->dataMoves.rangeContaining(rd.keys.begin);
 												ASSERT(dataMoveRange.value().id == rd.dataMoveId);
 											}
-												Better cancelling logic that reflects whether move has actually started

											
										
										
											2022-02-25 23:33:46 +08:00
+											// set cancellable to false on inFlight's entry for this key range
 											auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
 											ASSERT(inFlightRange.range() == rd.keys);
 											ASSERT(inFlightRange.value().randomId == rd.randomId);
 											inFlightRange.value().cancellable = false;
-												added additional trace events to warn when different parts of shard relocations take more than 10 minutes

											
										
										
											2019-08-17 05:56:58 +08:00
+											destIds.clear();
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+											state std::vector<UID> healthyIds;
 											state std::vector<UID> extraIds;
 											state std::vector<ShardsAffectedByTeamFailure::Team> destinationTeams;
 											for (int i = 0; i < bestTeams.size(); i++) {
 												auto& serverIds = bestTeams[i].first->getServerIDs();
 												destinationTeams.push_back(ShardsAffectedByTeamFailure::Team(serverIds, i == 0));
-												TeamCollection: getTeam may add a new team

getTeam function may add a new team for the GetTeamRequest.
We need to check if the number of teams is larger than the desired team number.

											
										
										
											2019-02-13 06:57:33 +08:00
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												// TODO(psm): Make DataMoveMetaData aware of the two-step data move optimization.
-												TeamCollection: getTeam may add a new team

getTeam function may add a new team for the GetTeamRequest.
We need to check if the number of teams is larger than the desired team number.

											
										
										
											2019-02-13 06:57:33 +08:00
+												if (allHealthy && anyWithSource && !bestTeams[i].second) {
-												Add comment to DDQueue GetTeam

Comments to help understand Evan's PR 3487
that fixes the problem: replication factor could drop unexpectedly in fearless config.

											
										
										
											2020-07-14 08:05:12 +08:00
+													// When all servers in bestTeams[i] do not hold the shard (!bestTeams[i].second), it indicates
 													// the bestTeams[i] is in a new DC where data has not been replicated to.
 													// To move data (specified in RelocateShard) to bestTeams[i] in the new DC AND reduce data movement
 													// across DC, we randomly choose a server in bestTeams[i] as the shard's destination, and
-												TeamCollection: getTeam may add a new team

getTeam function may add a new team for the GetTeamRequest.
We need to check if the number of teams is larger than the desired team number.

											
										
										
											2019-02-13 06:57:33 +08:00
+													// move the shard to the randomly chosen server (in the remote DC), which will later
 													// propogate its data to the servers in the same team. This saves data movement bandwidth across DC
-												Replace g_random and g_nondeterministic_random with functions deterministicRandom() and nondeterministicRandom() that return thread_local random number generators. Delete g_debug_random and trace_random. Allow only deterministicRandom() to be seeded, and require it to be seeded from each thread on which it is used.

											
										
										
											2019-05-11 05:01:52 +08:00
+													int idx = deterministicRandom()->randomInt(0, serverIds.size());
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+													destIds.push_back(serverIds[idx]);
 													healthyIds.push_back(serverIds[idx]);
 													for (int j = 0; j < serverIds.size(); j++) {
 														if (j != idx) {
 															extraIds.push_back(serverIds[j]);
 														}
 													}
 													healthyDestinations.addTeam(bestTeams[i].first);
 												} else {
 													destIds.insert(destIds.end(), serverIds.begin(), serverIds.end());
 													if (bestTeams[i].first->isHealthy()) {
 														healthyIds.insert(healthyIds.end(), serverIds.begin(), serverIds.end());
 														healthyDestinations.addTeam(bestTeams[i].first);
 													}
 												}
 											}
-												TeamCollection: Use machine team to create server team

Current server team collection logic does not consider
the fact that multiple storage servers can run on the same machine.
When multiple machines fail, all servers on the machines will fail, and
the possibility of having one process team fail and lose data is very high.

To reduce the possibility of losing data when multiple machine fails,
we first create machine teams which span across different fault zones;
we then create server teams based on machine teams by
first picking 1 machine team, and then
picking 1 server from each machine in the machine team.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-08-30 05:40:39 +08:00
+											// Sanity check
 											state int totalIds = 0;
-												TeamCollection: clang-format

Format the changes with git clang-format.
No functional changes.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-11-22 03:18:26 +08:00
+											for (auto& destTeam : destinationTeams) {
-												TeamCollection: Use machine team to create server team

Current server team collection logic does not consider
the fact that multiple storage servers can run on the same machine.
When multiple machines fail, all servers on the machines will fail, and
the possibility of having one process team fail and lose data is very high.

To reduce the possibility of losing data when multiple machine fails,
we first create machine teams which span across different fault zones;
we then create server teams based on machine teams by
first picking 1 machine team, and then
picking 1 server from each machine in the machine team.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-08-30 05:40:39 +08:00
+												totalIds += destTeam.servers.size();
 											}
-												TeamCollection: clang-format

Format the changes with git clang-format.
No functional changes.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-11-22 03:18:26 +08:00
+											if (totalIds != self->teamSize) {
 												TraceEvent(SevWarn, "IncorrectDestTeamSize")
 												    .suppressFor(1.0)
 												    .detail("ExpectedTeamSize", self->teamSize)
 												    .detail("DestTeamSize", totalIds);
-												TeamCollection: Use machine team to create server team

Current server team collection logic does not consider
the fact that multiple storage servers can run on the same machine.
When multiple machines fail, all servers on the machines will fail, and
the possibility of having one process team fail and lose data is very high.

To reduce the possibility of losing data when multiple machine fails,
we first create machine teams which span across different fault zones;
we then create server teams based on machine teams by
first picking 1 machine team, and then
picking 1 server from each machine in the machine team.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-08-30 05:40:39 +08:00
+											}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+											if (!rd.isRestore()) {
 												self->shardsAffectedByTeamFailure->moveShard(rd.keys, destinationTeams);
 											}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+											// FIXME: do not add data in flight to servers that were already in the src.
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight subtraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+											healthyDestinations.addDataInFlightToTeam(+metrics.bytes);
 											healthyDestinations.addReadInFlightToTeam(+metrics.bytesReadPerKSecond);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+											launchDest(rd, bestTeams, self->destBusymap);
-												DD:Add trace for detailed relocate shard info

											
										
										
											2020-03-01 05:45:00 +08:00
+											if (SERVER_KNOBS->DD_ENABLE_VERBOSE_TRACING) {
 												// StorageMetrics is the rd shard's metrics, e.g., bytes and write bandwidth
 												TraceEvent(SevInfo, "RelocateShardDecision", distributorId)
 												    .detail("PairId", relocateShardInterval.pairID)
 												    .detail("Priority", rd.priority)
 												    .detail("KeyBegin", rd.keys.begin)
 												    .detail("KeyEnd", rd.keys.end)
 												    .detail("StorageMetrics", metrics.toString())
 												    .detail("SourceServers", describe(rd.src))
 												    .detail("DestinationTeam", describe(destIds))
 												    .detail("ExtraIds", describe(extraIds));
 											} else {
 												TraceEvent(relocateShardInterval.severity, "RelocateShardHasDestination", distributorId)
 												    .detail("PairId", relocateShardInterval.pairID)
-												add ReadInFlight

											
										
										
											2022-03-29 05:20:07 +08:00
+												    .detail("Priority", rd.priority)
-												When dispatching shard relocations, log the source and destination storage servers.

											
										
										
											2020-11-06 08:13:18 +08:00
+												    .detail("KeyBegin", rd.keys.begin)
 												    .detail("KeyEnd", rd.keys.end)
 												    .detail("SourceServers", describe(rd.src))
-												DD:Add trace for detailed relocate shard info

											
										
										
											2020-03-01 05:45:00 +08:00
+												    .detail("DestinationTeam", describe(destIds))
 												    .detail("ExtraIds", describe(extraIds));
 											}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+											self->serverCounter.increaseForTeam(rd.src, rd.reason, DDQueue::ServerCounter::LaunchedSource);
 											self->serverCounter.increaseForTeam(destIds, rd.reason, DDQueue::ServerCounter::LaunchedDest);
 											self->serverCounter.increaseForTeam(extraIds, rd.reason, DDQueue::ServerCounter::LaunchedDest);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											state Error error = success();
 											state Promise<Void> dataMovementComplete;
-												FastRestore:Add debug msg when memory is over threshold

											
										
										
											2020-02-28 10:32:02 +08:00
+											// Move keys from source to destination by changing the serverKeyList and keyServerList system keys
-												Add MoveKeysParams struct; use txnProcessor->moveKeys()

											
										
										
											2022-09-13 06:40:18 +08:00
+											state Future<Void> doMoveKeys =
 											    self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId,
 											                                                 rd.keys,
 											                                                 destIds,
 											                                                 healthyIds,
 											                                                 self->lock,
 											                                                 dataMovementComplete,
 											                                                 &self->startMoveKeysParallelismLock,
 											                                                 &self->finishMoveKeysParallelismLock,
 											                                                 self->teamCollections.size() > 1,
 											                                                 relocateShardInterval.pairID,
 											                                                 ddEnabledState,
 											                                                 CancelConflictingDataMoves::False });
-												A giant translation of TaskFooPriority -> TaskPriority::Foo

This is so that APIs that take priorities don't take ints, which are
common and easy to accidentally pass the wrong thing.

											
										
										
											2019-06-25 17:47:35 +08:00
+											state Future<Void> pollHealth =
 											    signalledTransferComplete ? Never()
 											                              : delay(SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											try {
 												loop {
 													choose {
-												Rewrite all `Void _ = wait(...)` -> `wait(...)`.

This takes advantage of the new actorcompiler functionality to avoid
having duplicate definitions of `Void _` when trying to feed the
un-actorompiled source through clang.

											
										
										
											2018-08-11 04:57:10 +08:00
+														when(wait(doMoveKeys)) {
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+															if (extraIds.size()) {
 																destIds.insert(destIds.end(), extraIds.begin(), extraIds.end());
 																healthyIds.insert(healthyIds.end(), extraIds.begin(), extraIds.end());
 																extraIds.clear();
-												TeamCollection: clang-format

Format the changes with git clang-format.
No functional changes.

Signed-off-by: Meng Xu <meng_xu@apple.com>

											
										
										
											2018-11-22 03:18:26 +08:00
+																ASSERT(totalIds == destIds.size()); // Sanity check the destIDs before we move keys
-												Add MoveKeysParams struct; use txnProcessor->moveKeys()

											
										
										
											2022-09-13 06:40:18 +08:00
+																doMoveKeys =
 																    self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId,
 																                                                 rd.keys,
 																                                                 destIds,
 																                                                 healthyIds,
 																                                                 self->lock,
 																                                                 Promise<Void>(),
 																                                                 &self->startMoveKeysParallelismLock,
 																                                                 &self->finishMoveKeysParallelismLock,
 																                                                 self->teamCollections.size() > 1,
 																                                                 relocateShardInterval.pairID,
 																                                                 ddEnabledState,
 																                                                 CancelConflictingDataMoves::False });
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+															} else {
 																self->fetchKeysComplete.insert(rd);
-												Change SHARD_ENCODE_LOCATION_METADATA to a server knob. (#7770)

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-08-04 04:51:40 +08:00
+																if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+																	auto ranges = self->dataMoves.getAffectedRangesAfterInsertion(rd.keys);
 																	if (ranges.size() == 1 && static_cast<KeyRange>(ranges[0]) == rd.keys &&
 																	    ranges[0].value.id == rd.dataMoveId && !ranges[0].value.cancel.isValid()) {
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+																		self->dataMoves.insert(rd.keys, DDQueue::DDDataMove());
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+																		TraceEvent(SevVerbose, "DequeueDataMoveOnSuccess", self->distributorId)
 																		    .detail("DataMoveID", rd.dataMoveId)
 																		    .detail("DataMoveRange", rd.keys);
 																	}
 																}
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+																break;
 															}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+														}
-												Rewrite all `Void _ = wait(...)` -> `wait(...)`.

This takes advantage of the new actorcompiler functionality to avoid
having duplicate definitions of `Void _` when trying to feed the
un-actorompiled source through clang.

											
										
										
											2018-08-11 04:57:10 +08:00
+														when(wait(pollHealth)) {
-												removed a separately configurable storage team size for the remote data center, because it did not make sense
fix: the master did not monitor for the failure of remote logs
stop merge attempts when a data center is failed
fixed a variety of other problems with data distribution when a data center is failed

											
										
										
											2018-02-03 03:46:04 +08:00
+															if (!healthyDestinations.isHealthy()) {
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+																if (!signalledTransferComplete) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+																	signalledTransferComplete = true;
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+																	self->dataTransferComplete.send(rd);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+																}
 															}
-												A giant translation of TaskFooPriority -> TaskPriority::Foo

This is so that APIs that take priorities don't take ints, which are
common and easy to accidentally pass the wrong thing.

											
										
										
											2019-06-25 17:47:35 +08:00
+															pollHealth = signalledTransferComplete ? Never()
 															                                       : delay(SERVER_KNOBS->HEALTH_POLL_TIME,
 															                                               TaskPriority::DataDistributionLaunch);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+														}
-												Rewrite all `Void _ = wait(...)` -> `wait(...)`.

This takes advantage of the new actorcompiler functionality to avoid
having duplicate definitions of `Void _` when trying to feed the
un-actorompiled source through clang.

											
										
										
											2018-08-11 04:57:10 +08:00
+														when(wait(signalledTransferComplete ? Never() : dataMovementComplete.getFuture())) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+															self->fetchKeysComplete.insert(rd);
 															if (!signalledTransferComplete) {
 																signalledTransferComplete = true;
 																self->dataTransferComplete.send(rd);
 															}
 														}
 													}
 												}
 											} catch (Error& e) {
 												error = e;
 											}
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+											//TraceEvent("RelocateShardFinished", distributorId).detail("RelocateId", relocateShardInterval.pairID);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 											if (error.code() != error_code_move_to_removed_server) {
 												if (!error.code()) {
 													try {
-												Send bytes input rate to DD.

											
										
										
											2019-07-26 07:27:32 +08:00
+														wait(healthyDestinations
 														         .updateStorageMetrics()); // prevent a gap between the polling for an increase in
 														                                   // storage metrics and decrementing data in flight
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													} catch (Error& e) {
 														error = e;
 													}
 												}
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+												healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
 												auto readLoad = metrics.bytesReadPerKSecond;
-												add comment

											
										
										
											2022-05-28 03:14:34 +08:00
+												// Note: It’s equal to trigger([healthyDestinations, readLoad], which is a value capture of
 												// healthyDestinations. Have to create a reference to healthyDestinations because in ACTOR the state
 												// variable is actually a member variable, I can’t write trigger([healthyDestinations, readLoad]
 												// directly.
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+												auto& destinationRef = healthyDestinations;
 												self->noErrorActors.add(
-												fix substraction typo

											
										
										
											2022-04-07 14:03:25 +08:00
+												    trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); },
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+												            delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL)));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 												// onFinished.send( rs );
 												if (!error.code()) {
-												Added a duration to regular relocateShard trace events

											
										
										
											2019-08-17 06:15:36 +08:00
+													TraceEvent(relocateShardInterval.end(), distributorId)
 													    .detail("Duration", now() - startTime)
 													    .detail("Result", "Success");
-												added additional trace events to warn when different parts of shard relocations take more than 10 minutes

											
										
										
											2019-08-17 05:56:58 +08:00
+													if (now() - startTime > 600) {
-												added sources servers to the warning message

											
										
										
											2019-08-22 02:48:29 +08:00
+														TraceEvent(SevWarnAlways, "RelocateShardTooLong")
 														    .detail("Duration", now() - startTime)
 														    .detail("Dest", describe(destIds))
 														    .detail("Src", describe(rd.src));
-												added additional trace events to warn when different parts of shard relocations take more than 10 minutes

											
										
										
											2019-08-17 05:56:58 +08:00
+													}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													if (rd.keys.begin == keyServersPrefix) {
-												when doing data movement where one region has the data and the other doesn’t, first move a single replica to the other region to save WAN bandwidth

											
										
										
											2018-06-20 14:15:30 +08:00
+														TraceEvent("MovedKeyServerKeys")
 														    .detail("Dest", describe(destIds))
-												fix roll trace event issue for data distribution

Description

Testing

											
										
										
											2021-09-25 01:04:30 +08:00
+														    .trackLatest(self->movedKeyServersEventHolder->trackingKey);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													}
 													if (!signalledTransferComplete) {
 														signalledTransferComplete = true;
 														dataTransferComplete.send(rd);
 													}
 													self->bytesWritten += metrics.bytes;
-												fix: if a destination team became unhealthy and then healthy again, it would lower the priority of a move even though the source servers we are moving from are still unhealthy
fix: badTeams were not accounted for when checking priorities

											
										
										
											2018-11-12 04:33:31 +08:00
+													self->shardsAffectedByTeamFailure->finishMove(rd.keys);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													relocationComplete.send(rd);
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
 													if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 														// update physical shard collection
 														std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
 														for (int i = 0; i < bestTeams.size(); i++) {
 															auto serverIds = bestTeams[i].first->getServerIDs();
 															selectedTeams.push_back(ShardsAffectedByTeamFailure::Team(serverIds, i == 0));
 														}
 														// The update of PhysicalShardToTeams, PhysicalShardInstances, keyRangePhysicalShardIDMap should
 														// be atomic
 														self->physicalShardCollection->updatePhysicalShardCollection(
 														    rd.keys, rd.isRestore(), selectedTeams, rd.dataMoveId.first(), metrics, debugID);
 													}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													return Void();
 												} else {
 													throw error;
 												}
 											} else {
-												Add rare code probe decoration

											
										
										
											2022-09-26 06:28:32 +08:00
+												CODE_PROBE(true, "move to removed server", probe::decoration::rare);
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+												healthyDestinations.addDataInFlightToTeam(-metrics.bytes);
 												auto readLoad = metrics.bytesReadPerKSecond;
 												auto& destinationRef = healthyDestinations;
 												self->noErrorActors.add(
-												fix substraction typo

											
										
										
											2022-04-07 14:03:25 +08:00
+												    trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); },
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+												            delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL)));
-												fix inflight read division; temp destComplete fix; 0.1 constant poll time

											
										
										
											2022-04-21 03:15:40 +08:00
-												enable destComplete

											
										
										
											2022-04-21 04:32:04 +08:00
+												completeDest(rd, self->destBusymap);
 												rd.completeDests.clear();
-												fix inflight read division; temp destComplete fix; 0.1 constant poll time

											
										
										
											2022-04-21 03:15:40 +08:00
-												A giant translation of TaskFooPriority -> TaskPriority::Foo

This is so that APIs that take priorities don't take ints, which are
common and easy to accidentally pass the wrong thing.

											
										
										
											2019-06-25 17:47:35 +08:00
+												wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
 										}
 									} catch (Error& e) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										state Error err = e;
-												Enforce that trace event suppression calls happen first when using trace event call chaining. Fix various instances where we weren't following this requirement.

											
										
										
											2022-02-25 04:25:52 +08:00
+										TraceEvent(relocateShardInterval.end(), distributorId)
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										    .errorUnsuppressed(err)
-												Enforce that trace event suppression calls happen first when using trace event call chaining. Fix various instances where we weren't following this requirement.

											
										
										
											2022-02-25 04:25:52 +08:00
+										    .detail("Duration", now() - startTime);
-												added additional trace events to warn when different parts of shard relocations take more than 10 minutes

											
										
										
											2019-08-17 05:56:58 +08:00
+										if (now() - startTime > 600) {
-												added sources servers to the warning message

											
										
										
											2019-08-22 02:48:29 +08:00
+											TraceEvent(SevWarnAlways, "RelocateShardTooLong")
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+											    .errorUnsuppressed(err)
-												added sources servers to the warning message

											
										
										
											2019-08-22 02:48:29 +08:00
+											    .detail("Duration", now() - startTime)
 											    .detail("Dest", describe(destIds))
 											    .detail("Src", describe(rd.src));
-												added additional trace events to warn when different parts of shard relocations take more than 10 minutes

											
										
										
											2019-08-17 05:56:58 +08:00
+										}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										if (!signalledTransferComplete)
 											dataTransferComplete.send(rd);
 										relocationComplete.send(rd);
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										if (err.code() == error_code_data_move_dest_team_not_found) {
 											wait(cancelDataMove(self, rd.keys, ddEnabledState));
 										}
 										if (err.code() != error_code_actor_cancelled && err.code() != error_code_data_move_cancelled) {
-												disable DD with a in-memory flag and use in snapv2

											
										
										
											2019-07-24 07:16:31 +08:00
+											if (errorOut.canBeSet()) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+												errorOut.sendError(err);
-												disable DD with a in-memory flag and use in snapv2

											
										
										
											2019-07-24 07:16:31 +08:00
+											}
 										}
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										throw err;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
 								}
-												fix getMetrics keys bug

											
										
										
											2022-04-25 08:10:58 +08:00
+								inline double getWorstCpu(const HealthMetrics& metrics, const std::vector<UID>& ids) {
-												CPU reading

											
										
										
											2022-03-24 02:18:58 +08:00
+									double cpu = 0;
-												fix getMetrics keys bug

											
										
										
											2022-04-25 08:10:58 +08:00
+									for (auto& id : ids) {
 										if (metrics.storageStats.count(id)) {
 											cpu = std::max(cpu, metrics.storageStats.at(id).cpuUsage);
 										} else {
 											// assume the server is too busy to report its stats
 											cpu = std::max(cpu, 100.0);
-												solve some comments

											
										
										
											2022-05-04 08:21:08 +08:00
+											break;
-												fix getMetrics keys bug

											
										
										
											2022-04-25 08:10:58 +08:00
+										}
-												CPU reading

											
										
										
											2022-03-24 02:18:58 +08:00
+									}
 									return cpu;
 								}
-												change canQueue to timeThrottle()

											
										
										
											2022-04-23 06:26:44 +08:00
 								// Move the shard with the top K highest read density from sourceTeam to destTeam if sourceTeam has much more
 								// read load than destTeam
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<bool> rebalanceReadLoad(DDQueue* self,
-												Move DD queue code over to using movement-reasons rather than priority (#7614)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function

* Set up RelocateShard in terms of data movement reason instead of priority

* Remove isMountainChopperPriority

* Remove isDiskRebalancePriority

* Fix formatting

* Fix misnamed DataMovementReason::TEAM_HEALTHY

Co-authored-by: Zhongxing Zhang <zhongxing.zhang@snowflake.com>
											
										
										
											2022-07-25 15:50:37 +08:00
+								                                     DataMovementReason moveReason,
-												metrics comparator; rebalanceReadLoad()

											
										
										
											2022-02-25 08:41:01 +08:00
+								                                     Reference<IDataDistributionTeam> sourceTeam,
 								                                     Reference<IDataDistributionTeam> destTeam,
 								                                     bool primary,
 								                                     TraceEvent* traceEvent) {
-												Make g_simulator a pointer

											
										
										
											2022-09-15 08:10:49 +08:00
+									if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
-												metrics comparator; rebalanceReadLoad()

											
										
										
											2022-02-25 08:41:01 +08:00
+										traceEvent->detail("CancelingDueToSimulationSpeedup", true);
 										return false;
 									}
 									state std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
 									    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
-												fix top10 shard index bug; add event detail; fix merge conflict

											
										
										
											2022-04-23 05:14:58 +08:00
+									traceEvent->detail("ShardsInSource", shards.size());
 									// For read rebalance if there is just 1 hot shard remained, move this shard to another server won't solve the
 									// problem.
 									// TODO: This situation should be solved by split and merge
-												move canQueue, 60s each source server, random select portion of shards

											
										
										
											2022-04-21 13:19:56 +08:00
+									if (shards.size() <= 1) {
-												fix reference assign bug

											
										
										
											2022-03-03 13:56:03 +08:00
+										traceEvent->detail("SkipReason", "NoShardOnSource");
-												metrics comparator; rebalanceReadLoad()

											
										
										
											2022-02-25 08:41:01 +08:00
+										return false;
-												fix reference assign bug

											
										
										
											2022-03-03 13:56:03 +08:00
+									}
-												move canQueue, 60s each source server, random select portion of shards

											
										
										
											2022-04-21 13:19:56 +08:00
-												solve some review comments

											
										
										
											2022-05-24 02:04:37 +08:00
+									// Check lastAsSource, at most SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM shards can be moved within a sample
 									// period. It takes time for the sampled metrics being updated after a shard is moved, so we should control the
 									// cadence of movement here to avoid moving churn caused by making many decision based on out-of-date sampled
 									// metrics.
-												add storage metric compare knob; timeThrottle with constant

											
										
										
											2022-04-28 14:37:35 +08:00
+									if (self->timeThrottle(sourceTeam->getServerIDs())) {
-												determine timeThrottle and topK dynamically on the shard number

											
										
										
											2022-04-26 07:59:20 +08:00
+										traceEvent->detail("SkipReason", "SourceTeamThrottle");
 										return false;
 									}
-												set max shard bandwidth

											
										
										
											2022-05-07 07:37:12 +08:00
+									// check team difference
 									auto srcLoad = sourceTeam->getLoadReadBandwidth(false), destLoad = destTeam->getLoadReadBandwidth();
 									traceEvent->detail("SrcReadBandwidth", srcLoad).detail("DestReadBandwidth", destLoad);
-												determine timeThrottle and topK dynamically on the shard number

											
										
										
											2022-04-26 07:59:20 +08:00
-												set max shard bandwidth

											
										
										
											2022-05-07 07:37:12 +08:00
+									// read bandwidth difference is less than 30% of src load
-												add knobs; change knobs

											
										
										
											2022-05-18 05:49:27 +08:00
+									if ((1.0 - SERVER_KNOBS->READ_REBALANCE_DIFF_FRAC) * srcLoad <= destLoad) {
-												set max shard bandwidth

											
										
										
											2022-05-07 07:37:12 +08:00
+										traceEvent->detail("SkipReason", "TeamTooSimilar");
 										return false;
 									}
-												topK shard random selection

											
										
										
											2022-04-22 13:37:16 +08:00
+									// randomly choose topK shards
-												Prevent GetTopKMetricsRequest.topK < 1

											
										
										
											2022-09-03 03:00:43 +08:00
+									int topK = std::max(1, std::min(int(0.1 * shards.size()), SERVER_KNOBS->READ_REBALANCE_SHARD_TOPK));
-												DDQueue constructor with ITxnProcessor

											
										
										
											2022-09-22 01:56:22 +08:00
+									state Future<HealthMetrics> healthMetrics = self->txnProcessor->getHealthMetrics(true);
-												change all criteria to knobs

											
										
										
											2022-05-13 07:30:21 +08:00
+									state GetTopKMetricsRequest req(
-												add knobs; change knobs

											
										
										
											2022-05-18 05:49:27 +08:00
+									    shards, topK, (srcLoad - destLoad) * SERVER_KNOBS->READ_REBALANCE_MAX_SHARD_FRAC, srcLoad / shards.size());
-												add more statistics

											
										
										
											2022-05-18 01:19:09 +08:00
+									state GetTopKMetricsReply reply = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(req)));
-												CPU reading

											
										
										
											2022-03-24 02:18:58 +08:00
+									wait(ready(healthMetrics));
-												add more informative trace info

											
										
										
											2022-05-17 12:25:56 +08:00
+									auto cpu = getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs());
 									if (cpu < SERVER_KNOBS->READ_REBALANCE_CPU_THRESHOLD) { // 15.0 +- (0.3 * 15) < 20.0
 										traceEvent->detail("SkipReason", "LowReadLoad").detail("WorstSrcCpu", cpu);
-												CPU reading

											
										
										
											2022-03-24 02:18:58 +08:00
+										return false;
 									}
-												fix inflight read division; temp destComplete fix; 0.1 constant poll time

											
										
										
											2022-04-21 03:15:40 +08:00
-												move keys out of StorageMetrics

											
										
										
											2022-05-28 08:10:01 +08:00
+									auto& metricsList = reply.shardMetrics;
-												add more statistics

											
										
										
											2022-05-18 01:19:09 +08:00
+									// NOTE: randomize is important here since we don't want to always push the same shard into the queue
 									deterministicRandom()->randomShuffle(metricsList);
 									traceEvent->detail("MinReadLoad", reply.minReadLoad).detail("MaxReadLoad", reply.maxReadLoad);
-												add more informative trace info

											
										
										
											2022-05-17 12:25:56 +08:00
-												solve review comments

											
										
										
											2022-07-21 07:09:38 +08:00
+									if (metricsList.empty()) {
-												topK shard random selection

											
										
										
											2022-04-22 13:37:16 +08:00
+										traceEvent->detail("SkipReason", "NoEligibleShards");
 										return false;
 									}
-												solve review comments

											
										
										
											2022-07-21 07:09:38 +08:00
+									auto& [shard, metrics] = metricsList[0];
-												set max shard bandwidth

											
										
										
											2022-05-07 07:37:12 +08:00
+									traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond);
-												topK shard random selection

											
										
										
											2022-04-22 13:37:16 +08:00
+									//  Verify the shard is still in ShardsAffectedByTeamFailure
 									shards = self->shardsAffectedByTeamFailure->getShardsFor(
 									    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
 									for (int i = 0; i < shards.size(); i++) {
-												move keys out of StorageMetrics

											
										
										
											2022-05-28 08:10:01 +08:00
+										if (shard == shards[i]) {
-												add TraceId; make the TraceId for MountainChopper, ValleyFiller, RelocateShard, QueuedShard consistent

											
										
										
											2022-08-05 07:57:55 +08:00
+											UID traceId = deterministicRandom()->randomUniqueID();
 											self->output.send(RelocateShard(shard, moveReason, RelocateReason::REBALANCE_READ, traceId));
 											traceEvent->detail("TraceId", traceId);
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
 											auto serverIds = sourceTeam->getServerIDs();
 											self->updateLastAsSource(serverIds);
 											self->serverCounter.increaseForTeam(
 											    serverIds, RelocateReason::REBALANCE_READ, DDQueue::ServerCounter::ProposedSource);
-												topK shard random selection

											
										
										
											2022-04-22 13:37:16 +08:00
+											return true;
-												metrics comparator; rebalanceReadLoad()

											
										
										
											2022-02-25 08:41:01 +08:00
+										}
-												topK shard random selection

											
										
										
											2022-04-22 13:37:16 +08:00
+									}
 									traceEvent->detail("SkipReason", "ShardNotPresent");
-												metrics comparator; rebalanceReadLoad()

											
										
										
											2022-02-25 08:41:01 +08:00
+									return false;
 								}
-												Provide destTeam parameter to rebalanceTeams again.

There is a wait before the load bytes of the dest team is used, so the
last commit inadvertently caused a behaviour change. Instead, update the
comment for the function, and pass const IDataDistributionTeam
references.

											
										
										
											2022-03-19 01:20:04 +08:00
+								// Move a random shard from sourceTeam if sourceTeam has much more data than provided destTeam
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR static Future<bool> rebalanceTeams(DDQueue* self,
-												Move DD queue code over to using movement-reasons rather than priority (#7614)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function

* Set up RelocateShard in terms of data movement reason instead of priority

* Remove isMountainChopperPriority

* Remove isDiskRebalancePriority

* Fix formatting

* Fix misnamed DataMovementReason::TEAM_HEALTHY

Co-authored-by: Zhongxing Zhang <zhongxing.zhang@snowflake.com>
											
										
										
											2022-07-25 15:50:37 +08:00
+								                                         DataMovementReason moveReason,
-												Provide destTeam parameter to rebalanceTeams again.

There is a wait before the load bytes of the dest team is used, so the
last commit inadvertently caused a behaviour change. Instead, update the
comment for the function, and pass const IDataDistributionTeam
references.

											
										
										
											2022-03-19 01:20:04 +08:00
+								                                         Reference<IDataDistributionTeam const> sourceTeam,
 								                                         Reference<IDataDistributionTeam const> destTeam,
-												Change signature of rebalanceTeams

											
										
										
											2022-03-18 14:27:33 +08:00
+								                                         bool primary,
 								                                         TraceEvent* traceEvent) {
-												Make g_simulator a pointer

											
										
										
											2022-09-15 08:10:49 +08:00
+									if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+										traceEvent->detail("CancelingDueToSimulationSpeedup", true);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										return false;
 									}
-												The mountainChopper and valleyFiller only move larger than average shards, to avoid moving high bandwidth shards which are generally smaller.

											
										
										
											2019-07-29 14:50:42 +08:00
+									Promise<int64_t> req;
 									self->getAverageShardBytes.send(req);
 									state int64_t averageShardBytes = wait(req.getFuture());
 									state std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
 									    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+									traceEvent->detail("AverageShardBytes", averageShardBytes).detail("ShardsInSource", shards.size());
-												fix reference assign bug

											
										
										
											2022-03-03 13:56:03 +08:00
+									if (!shards.size()) {
 										traceEvent->detail("SkipReason", "NoShardOnSource");
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										return false;
-												fix reference assign bug

											
										
										
											2022-03-03 13:56:03 +08:00
+									}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												The mountainChopper and valleyFiller only move larger than average shards, to avoid moving high bandwidth shards which are generally smaller.

											
										
										
											2019-07-29 14:50:42 +08:00
+									state KeyRange moveShard;
 									state StorageMetrics metrics;
 									state int retries = 0;
-												addressed review comments

											
										
										
											2019-07-31 08:04:41 +08:00
+									while (retries < SERVER_KNOBS->REBALANCE_MAX_RETRIES) {
-												The mountainChopper and valleyFiller only move larger than average shards, to avoid moving high bandwidth shards which are generally smaller.

											
										
										
											2019-07-29 14:50:42 +08:00
+										state KeyRange testShard = deterministicRandom()->randomChoice(shards);
-												reset several method use getShardMetrics

											
										
										
											2022-05-04 15:00:03 +08:00
+										StorageMetrics testMetrics =
-												The mountainChopper and valleyFiller only move larger than average shards, to avoid moving high bandwidth shards which are generally smaller.

											
										
										
											2019-07-29 14:50:42 +08:00
+										    wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(testShard))));
-												reset several method use getShardMetrics

											
										
										
											2022-05-04 15:00:03 +08:00
+										if (testMetrics.bytes > metrics.bytes) {
-												The mountainChopper and valleyFiller only move larger than average shards, to avoid moving high bandwidth shards which are generally smaller.

											
										
										
											2019-07-29 14:50:42 +08:00
+											moveShard = testShard;
-												reset several method use getShardMetrics

											
										
										
											2022-05-04 15:00:03 +08:00
+											metrics = testMetrics;
-												addressed review comments

											
										
										
											2019-07-31 08:04:41 +08:00
+											if (metrics.bytes > averageShardBytes) {
 												break;
 											}
-												The mountainChopper and valleyFiller only move larger than average shards, to avoid moving high bandwidth shards which are generally smaller.

											
										
										
											2019-07-29 14:50:42 +08:00
+										}
 										retries++;
 									}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									int64_t sourceBytes = sourceTeam->getLoadBytes(false);
 									int64_t destBytes = destTeam->getLoadBytes();
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
 									bool sourceAndDestTooSimilar =
-												fix inflight read division; temp destComplete fix; 0.1 constant poll time

											
										
										
											2022-04-21 03:15:40 +08:00
+									    sourceBytes - destBytes <= 3 * std::max<int64_t>(SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes);
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+									traceEvent->detail("SourceBytes", sourceBytes)
 									    .detail("DestBytes", destBytes)
 									    .detail("ShardBytes", metrics.bytes)
 									    .detail("SourceAndDestTooSimilar", sourceAndDestTooSimilar);
 									if (sourceAndDestTooSimilar || metrics.bytes == 0) {
-												fix reference assign bug

											
										
										
											2022-03-03 13:56:03 +08:00
+										traceEvent->detail("SkipReason", sourceAndDestTooSimilar ? "TeamTooSimilar" : "ShardZeroSize");
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										return false;
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+									}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Minor improvement on comments

											
										
										
											2020-07-13 09:30:02 +08:00
+									// Verify the shard is still in ShardsAffectedByTeamFailure
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+									shards = self->shardsAffectedByTeamFailure->getShardsFor(
 									    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
 									for (int i = 0; i < shards.size(); i++) {
 										if (moveShard == shards[i]) {
-												add TraceId; make the TraceId for MountainChopper, ValleyFiller, RelocateShard, QueuedShard consistent

											
										
										
											2022-08-05 07:57:55 +08:00
+											UID traceId = deterministicRandom()->randomUniqueID();
 											self->output.send(RelocateShard(moveShard, moveReason, RelocateReason::REBALANCE_DISK, traceId));
 											traceEvent->detail("TraceId", traceId);
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
 											self->serverCounter.increaseForTeam(
 											    sourceTeam->getServerIDs(), RelocateReason::REBALANCE_DISK, DDQueue::ServerCounter::ProposedSource);
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+											return true;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										}
 									}
-												fix reference assign bug

											
										
										
											2022-03-03 13:56:03 +08:00
+									traceEvent->detail("SkipReason", "ShardNotPresent");
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									return false;
 								}
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<SrcDestTeamPair> getSrcDestTeams(DDQueue* self,
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+								                                              int teamCollectionIndex,
 								                                              GetTeamRequest srcReq,
 								                                              GetTeamRequest destReq,
 								                                              int priority,
 								                                              TraceEvent* traceEvent) {
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+									state std::pair<Optional<ITeamRef>, bool> randomTeam =
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+									    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(destReq)));
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+									traceEvent->detail(
 									    "DestTeam", printable(randomTeam.first.map<std::string>([](const ITeamRef& team) { return team->getDesc(); })));
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
 									if (randomTeam.first.present()) {
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+										state std::pair<Optional<ITeamRef>, bool> loadedTeam =
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+										    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(srcReq)));
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+										traceEvent->detail("SourceTeam", printable(loadedTeam.first.map<std::string>([](const ITeamRef& team) {
 											                   return team->getDesc();
 										                   })));
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+										if (loadedTeam.first.present()) {
 											return std::make_pair(loadedTeam.first.get(), randomTeam.first.get());
 										}
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+									}
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+									return {};
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+								}
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
// DDQueue member entry points for the rebalancing actors. Each one passes `this` plus its own
// arguments to the file-scope function of the same name (selected with the leading `::`) and hands
// back the future that call produces.

// Forwards to ::getSrcDestTeams.
Future<SrcDestTeamPair> DDQueue::getSrcDestTeams(const int& teamCollectionIndex,
                                                 const GetTeamRequest& srcReq,
                                                 const GetTeamRequest& destReq,
                                                 const int& priority,
                                                 TraceEvent* traceEvent) {
	Future<SrcDestTeamPair> teams =
	    ::getSrcDestTeams(this, teamCollectionIndex, srcReq, destReq, priority, traceEvent);
	return teams;
}

// Forwards to ::rebalanceReadLoad.
Future<bool> DDQueue::rebalanceReadLoad(DataMovementReason moveReason,
                                        Reference<IDataDistributionTeam> sourceTeam,
                                        Reference<IDataDistributionTeam> destTeam,
                                        bool primary,
                                        TraceEvent* traceEvent) {
	Future<bool> moved = ::rebalanceReadLoad(this, moveReason, sourceTeam, destTeam, primary, traceEvent);
	return moved;
}

// Forwards to ::rebalanceTeams.
Future<bool> DDQueue::rebalanceTeams(DataMovementReason moveReason,
                                     Reference<const IDataDistributionTeam> sourceTeam,
                                     Reference<const IDataDistributionTeam> destTeam,
                                     bool primary,
                                     TraceEvent* traceEvent) {
	Future<bool> moved = ::rebalanceTeams(this, moveReason, sourceTeam, destTeam, primary, traceEvent);
	return moved;
}
-												change shared_ptr to Reference

											
										
										
											2022-09-28 02:22:47 +08:00
+								ACTOR Future<bool> getSkipRebalanceValue(Reference<IDDTxnProcessor> txnProcessor, bool readRebalance) {
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									Optional<Value> val = wait(txnProcessor->readRebalanceDDIgnoreKey());
-												rename dbProcessor to db; readability improvement

											
										
										
											2022-09-23 08:11:07 +08:00
+									if (!val.present())
 										return false;
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									bool skipCurrentLoop = false;
-												rename dbProcessor to db; readability improvement

											
										
										
											2022-09-23 08:11:07 +08:00
+									// NOTE: check special value "" and "on" might written in old version < 7.2
 									if (val.get().size() > 0 && val.get() != "on"_sr) {
 										int ddIgnore = BinaryReader::fromStringRef<uint8_t>(val.get(), Unversioned());
 										if (readRebalance) {
 											skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_READ) > 0;
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+										} else {
-												rename dbProcessor to db; readability improvement

											
										
										
											2022-09-23 08:11:07 +08:00
+											skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0;
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+										}
-												rename dbProcessor to db; readability improvement

											
										
										
											2022-09-23 08:11:07 +08:00
+									} else {
 										skipCurrentLoop = true;
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									}
-												rename dbProcessor to db; readability improvement

											
										
										
											2022-09-23 08:11:07 +08:00
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									return skipCurrentLoop;
 								}
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+								ACTOR Future<Void> BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, DataMovementReason reason) {
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									state int resetCount = 0;
-												Address review comments. 50K correctness with no failures.

											
										
										
											2019-07-31 11:20:02 +08:00
+									state double lastRead = 0;
-												Address review comments

											
										
										
											2019-07-25 06:32:52 +08:00
+									state bool skipCurrentLoop = false;
-												Use enum variables to invoke Priority checking (#7514)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function
											
										
										
											2022-07-15 00:06:56 +08:00
+									state const bool readRebalance = isDataMovementForReadBalancing(reason);
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									state const char* eventName = isDataMovementForMountainChopper(reason) ? "BgDDMountainChopper" : "BgDDValleyFiller";
-												Use enum variables to invoke Priority checking (#7514)

* Use enum variables to invoke Priority checking

* add an explicit isDataMovementForReadBalancing function
											
										
										
											2022-07-15 00:06:56 +08:00
+									state int ddPriority = dataMovementPriority(reason);
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+									state double rebalancePollingInterval = 0;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									loop {
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
+										state bool moved = false;
-												fix uninitialized member

											
										
										
											2022-03-01 02:22:32 +08:00
+										state Reference<IDataDistributionTeam> sourceTeam;
 										state Reference<IDataDistributionTeam> destTeam;
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+										state GetTeamRequest srcReq;
 										state GetTeamRequest destReq;
-												add new priority in RelocateData

											
										
										
											2022-04-13 07:22:17 +08:00
+										state TraceEvent traceEvent(eventName, self->distributorId);
-												remove polling interval; uncomment suppressFor

											
										
										
											2022-05-23 14:35:39 +08:00
+										traceEvent.suppressFor(5.0)
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+										    .detail("PollingInterval", rebalancePollingInterval)
-												add new priority in RelocateData

											
										
										
											2022-04-13 07:22:17 +08:00
+										    .detail("Rebalance", readRebalance ? "Read" : "Disk");
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+										// NOTE: the DD throttling relies on DDQueue, so here just trigger the balancer periodically
 										wait(delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch));
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
+										try {
 											if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) {
 												wait(store(skipCurrentLoop, getSkipRebalanceValue(self->txnProcessor, readRebalance)));
 												lastRead = now();
 											}
 											traceEvent.detail("Enabled", !skipCurrentLoop);
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
+											if (skipCurrentLoop) {
 												rebalancePollingInterval =
 												    std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL);
 												continue;
 											} else {
 												rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL;
 											}
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
-												refactor datadistribution command; try dual-mode code

											
										
										
											2022-04-07 13:10:23 +08:00
+											traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]);
-												add new priority in RelocateData

											
										
										
											2022-04-13 07:22:17 +08:00
 											if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
-												Update fdbserver/DataDistributionQueue.actor.cpp
											
										
										
											2022-07-30 01:22:21 +08:00
+												bool mcMove = isDataMovementForMountainChopper(reason);
-												remove duplicate code

											
										
										
											2022-07-30 01:04:14 +08:00
+												srcReq = GetTeamRequest(WantNewServers::True,
 												                        WantTrueBest(mcMove),
 												                        PreferLowerDiskUtil::False,
 												                        TeamMustHaveShards::True,
 												                        ForReadBalance(readRebalance),
 												                        PreferLowerReadUtil::False);
 												destReq = GetTeamRequest(WantNewServers::True,
 												                         WantTrueBest(!mcMove),
 												                         PreferLowerDiskUtil::True,
 												                         TeamMustHaveShards::False,
 												                         ForReadBalance(readRebalance),
 												                         PreferLowerReadUtil::True);
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+												state Future<SrcDestTeamPair> getTeamFuture =
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
+												    self->getSrcDestTeams(teamCollectionIndex, srcReq, destReq, ddPriority, &traceEvent);
-												change reference pointer

											
										
										
											2022-05-23 15:12:48 +08:00
+												wait(ready(getTeamFuture));
 												sourceTeam = getTeamFuture.get().first;
 												destTeam = getTeamFuture.get().second;
-												refactor GetTeamRequest

											
										
										
											2022-05-05 08:42:49 +08:00
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+												// clang-format off
 												if (sourceTeam.isValid() && destTeam.isValid()) {
-												refactor datadistribution command; try dual-mode code

											
										
										
											2022-04-07 13:10:23 +08:00
+													if (readRebalance) {
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
+														wait(store(moved,self->rebalanceReadLoad( reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+													} else {
-												Actor to DDQueue methods

											
										
										
											2022-09-22 08:57:40 +08:00
+														wait(store(moved,self->rebalanceTeams( reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													}
 												}
-												move canQueue, 60s each source server, random select portion of shards

											
										
										
											2022-04-21 13:19:56 +08:00
+												// clang-format on
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+												moved ? resetCount = 0 : resetCount++;
-												- Addressed review commends
- Added test for the storage server failure disable switch

											
										
										
											2019-07-17 06:12:18 +08:00
+											}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												merge readaware-better

											
										
										
											2022-04-12 08:09:39 +08:00
+											traceEvent.detail("ResetCount", resetCount);
 										} catch (Error& e) {
 											// Log actor_cancelled because it's not legal to suppress an event that's initialized
 											traceEvent.errorUnsuppressed(e);
-												fix busy loop with correct error handling in valley filler

											
										
										
											2022-09-22 05:58:34 +08:00
+											throw;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										}
-												Add more logging to valley filler and mountain chopper

											
										
										
											2020-02-22 02:55:14 +08:00
 										traceEvent.detail("Moved", moved);
-												Remove unused parameter. Don't put check for g_network presence in ASSERT_WE_THINK.

											
										
										
											2020-02-22 08:28:03 +08:00
+										traceEvent.log();
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									}
 								}
-												change shared_ptr to Reference

											
										
										
											2022-09-28 02:22:47 +08:00
+								ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                         PromiseStream<RelocateShard> output,
 								                                         FutureStream<RelocateShard> input,
 								                                         PromiseStream<GetMetricsRequest> getShardMetrics,
-												reset several method use getShardMetrics

											
										
										
											2022-05-04 15:00:03 +08:00
+								                                         PromiseStream<GetTopKMetricsRequest> getTopKMetrics,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                         Reference<AsyncVar<bool>> processingUnhealthy,
-												consider wiggling when waitUntilHealthy

											
										
										
											2021-10-15 07:22:47 +08:00
+								                                         Reference<AsyncVar<bool>> processingWiggle,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                         std::vector<TeamCollectionInterface> teamCollections,
 								                                         Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+								                                         Reference<PhysicalShardCollection> physicalShardCollection,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                         MoveKeysLock lock,
 								                                         PromiseStream<Promise<int64_t>> getAverageShardBytes,
-												Restrict write access to getUnhealthyRelocationCount

											
										
										
											2022-04-04 14:47:54 +08:00
+								                                         FutureStream<Promise<int>> getUnhealthyRelocationCount,
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+								                                         UID distributorId,
 								                                         int teamSize,
 								                                         int singleRegionTeamSize,
 								                                         const DDEnabledState* ddEnabledState) {
-												rename DDQueue; add ServerCounter

											
										
										
											2022-08-06 03:01:11 +08:00
+									state DDQueue self(distributorId,
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+									                   lock,
-												rename dbProcessor to db; readability improvement

											
										
										
											2022-09-23 08:11:07 +08:00
+									                   db,
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+									                   teamCollections,
 									                   shardsAffectedByTeamFailure,
-												dd physical shard core (#7703)


											
										
										
											2022-08-20 02:47:00 +08:00
+									                   physicalShardCollection,
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+									                   getAverageShardBytes,
 									                   teamSize,
 									                   singleRegionTeamSize,
 									                   output,
 									                   input,
 									                   getShardMetrics,
 									                   getTopKMetrics);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									state std::set<UID> serversToLaunchFrom;
 									state KeyRange keysToLaunchFrom;
 									state RelocateData launchData;
 									state Future<Void> recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL);
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+									state std::vector<Future<Void>> ddQueueFutures;
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
 									state PromiseStream<KeyRange> rangesComplete;
 									state Future<Void> launchQueuedWorkTimeout = Never();
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									for (int i = 0; i < teamCollections.size(); i++) {
-												replace BgDDMountainChopper and BgDDValleyFiller with BgDDLoadRebalance

											
										
										
											2022-09-22 06:11:04 +08:00
+										ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_OVERUTILIZED_TEAM));
 										ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM));
-												update unittests

											
										
										
											2022-04-17 13:51:55 +08:00
+										if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+											ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM));
 											ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM));
-												add new priority in RelocateData

											
										
										
											2022-04-13 07:22:17 +08:00
+										}
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
+									}
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+									ddQueueFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0));
 									ddQueueFutures.push_back(delayedAsyncVar(self.rawProcessingWiggle, processingWiggle, 0));
 									ddQueueFutures.push_back(self.periodicalRefreshCounter());
-												data distribution tracks teams for each data center separately

											
										
										
											2017-10-11 01:36:33 +08:00
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+									try {
 										loop {
 											self.validate();
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+											// For the given servers that caused us to go around the loop, find the next item(s) that can be
 											// launched.
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											if (launchData.startTime != -1) {
-												DD:Add comments to help understand code

Add comments to explain the functionalities of some code.

											
										
										
											2019-07-20 07:22:15 +08:00
+												// Launch dataDistributionRelocator actor to relocate the launchData
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+												self.launchQueuedWork(launchData, ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												launchData = RelocateData();
 											} else if (!keysToLaunchFrom.empty()) {
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+												self.launchQueuedWork(keysToLaunchFrom, ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												keysToLaunchFrom = KeyRangeRef();
 											}
 											ASSERT(launchData.startTime == -1 && keysToLaunchFrom.empty());
 											choose {
-												prevented a slow task when too many shards were sent to the data distribution queue after switching to a fearless deployment

											
										
										
											2018-08-10 03:37:46 +08:00
+												when(RelocateShard rs = waitNext(self.input)) {
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+													if (rs.isRestore()) {
 														ASSERT(rs.dataMove != nullptr);
 														ASSERT(rs.dataMoveId.isValid());
 														self.launchQueuedWork(RelocateData(rs), ddEnabledState);
 													} else if (rs.cancelled) {
 														self.enqueueCancelledDataMove(rs.dataMoveId, rs.keys, ddEnabledState);
 													} else {
 														bool wasEmpty = serversToLaunchFrom.empty();
 														self.queueRelocation(rs, serversToLaunchFrom);
 														if (wasEmpty && !serversToLaunchFrom.empty())
 															launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
 													}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												}
-												Rewrite all `Void _ = wait(...)` -> `wait(...)`.

This takes advantage of the new actorcompiler functionality to avoid
having duplicate definitions of `Void _` when trying to feed the
un-actorompiled source through clang.

											
										
										
											2018-08-11 04:57:10 +08:00
+												when(wait(launchQueuedWorkTimeout)) {
-												Remove global ddEnabled flag

											
										
										
											2020-09-28 06:26:50 +08:00
+													self.launchQueuedWork(serversToLaunchFrom, ddEnabledState);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													serversToLaunchFrom = std::set<UID>();
 													launchQueuedWorkTimeout = Never();
 												}
 												when(RelocateData results = waitNext(self.fetchSourceServersComplete.getFuture())) {
-												DD:Add comments to help understand code

Add comments to explain the functionalities of some code.

											
										
										
											2019-07-20 07:22:15 +08:00
+													// This when is triggered by queueRelocation() which is triggered by sending self.input
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													self.completeSourceFetch(results);
 													launchData = results;
 												}
 												when(RelocateData done = waitNext(self.dataTransferComplete.getFuture())) {
-												Limiting DD Moves by destination SS.

											
										
										
											2021-12-14 02:13:34 +08:00
+													complete(done, self.busymap, self.destBusymap);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													if (serversToLaunchFrom.empty() && !done.src.empty())
-												A giant translation of TaskFooPriority -> TaskPriority::Foo

This is so that APIs that take priorities don't take ints, which are
common and easy to accidentally pass the wrong thing.

											
										
										
											2019-06-25 17:47:35 +08:00
+														launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													serversToLaunchFrom.insert(done.src.begin(), done.src.end());
 												}
 												when(RelocateData done = waitNext(self.relocationComplete.getFuture())) {
 													self.activeRelocations--;
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+													TraceEvent(SevVerbose, "InFlightRelocationChange")
 													    .detail("Complete", done.dataMoveId)
 													    .detail("IsRestore", done.isRestore())
 													    .detail("Total", self.activeRelocations);
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+													self.finishRelocation(done.priority, done.healthPriority);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													self.fetchKeysComplete.erase(done);
 													// self.logRelocation( done, "ShardRelocatorDone" );
-												enable sim skip; add readInFlight methods and inflight Penalty; add delayed inflight substraction:

											
										
										
											2022-04-01 00:57:00 +08:00
+													self.noErrorActors.add(
 													    tag(delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete));
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													if (g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60) {
 														TraceEvent(SevWarnAlways, "RelocationDurationTooLong")
 														    .detail("Duration", now() - done.startTime);
 														debug_setCheckRelocationDuration(false);
 													}
 												}
-												format code

											
										
										
											2022-08-10 05:02:57 +08:00
+												when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; }
-												Rewrite all `Void _ = wait(...)` -> `wait(...)`.

This takes advantage of the new actorcompiler functionality to avoid
having duplicate definitions of `Void _` when trying to feed the
un-actorompiled source through clang.

											
										
										
											2018-08-11 04:57:10 +08:00
+												when(wait(recordMetrics)) {
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													Promise<int64_t> req;
 													getAverageShardBytes.send(req);
-												added additional logging in data distribution

											
										
										
											2020-03-14 06:19:33 +08:00
+													recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL, TaskPriority::FlushTrace);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Add MAX_SNAPSHOT_FAULT_TOLERANCE knob

											
										
										
											2022-04-04 13:31:45 +08:00
+													auto const highestPriorityRelocation = self.getHighestPriorityRelocation();
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+													TraceEvent("MovingData", distributorId)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													    .detail("InFlight", self.activeRelocations)
 													    .detail("InQueue", self.queuedRelocations)
 													    .detail("AverageShardSize", req.getFuture().isReady() ? req.getFuture().get() : -1)
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+													    .detail("UnhealthyRelocations", self.unhealthyRelocations)
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+													    .detail("HighestPriority", highestPriorityRelocation)
 													    .detail("BytesWritten", self.bytesWritten)
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+													    .detail("PriorityRecoverMove", self.priority_relocations[SERVER_KNOBS->PRIORITY_RECOVER_MOVE])
 													    .detail("PriorityRebalanceUnderutilizedTeam",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM])
-												fixed typo

											
										
										
											2019-12-19 08:57:39 +08:00
+													    .detail("PriorityRebalanceOverutilizedTeam",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM])
-												refactor datadistribution command; try dual-mode code

											
										
										
											2022-04-07 13:10:23 +08:00
+													    .detail("PriorityRebalanceReadUnderutilTeam",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM])
 													    .detail("PriorityRebalanceReadOverutilTeam",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM])
-												wait remove

											
										
										
											2021-06-21 13:18:19 +08:00
+													    .detail("PriorityStorageWiggle",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE])
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+													    .detail("PriorityTeamHealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_HEALTHY])
 													    .detail("PriorityTeamContainsUndesiredServer",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER])
 													    .detail("PriorityTeamRedundant",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT])
 													    .detail("PriorityMergeShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_MERGE_SHARD])
-												Make the DD priority associated with populating a remote region lower than machine failures

											
										
										
											2020-03-05 06:07:32 +08:00
+													    .detail("PriorityPopulateRegion",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_POPULATE_REGION])
-												Raised the data distribution priority of splitting shards above restoring fault tolerance to avoid hot write shards

											
										
										
											2019-10-12 08:50:43 +08:00
+													    .detail("PriorityTeamUnhealthy",
 													            self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY])
 													    .detail("PriorityTeam2Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_2_LEFT])
 													    .detail("PriorityTeam1Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_1_LEFT])
 													    .detail("PriorityTeam0Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_0_LEFT])
 													    .detail("PrioritySplitShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_SPLIT_SHARD])
-												fix roll trace event issue for data distribution(master)

Description

Testing

											
										
										
											2021-09-25 03:46:51 +08:00
+													    .trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
-												move takeMoveKeysLock to DDTxnProcessor

											
										
										
											2022-07-09 05:11:31 +08:00
+													                                // DataDistributor::movingDataEventHolder. The track latest
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+													                                // key we use here must match the key used in the holder.
-												Add a counter to track physical shard creation throuogh moves

											
										
										
											2022-10-20 13:09:04 +08:00
 													if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
 														TraceEvent("PhysicalShardMoveStats")
 														    .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+														    .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard)
 														    .detail("RemoteBestTeamNotReady",
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+														            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady])
 														    .detail("PrimaryNoHealthyTeam",
 														            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam])
 														    .detail("RemoteNoHealthyTeam",
 														            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam])
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+														    .detail("RemoteTeamIsFull",
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+														            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+														    .detail("RemoteTeamIsNotHealthy",
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+														            self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
 														    .detail(
 														        "NoAvailablePhysicalShard",
 														        self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
-												Add a counter to track physical shard creation throuogh moves

											
										
										
											2022-10-20 13:09:04 +08:00
+														self.moveCreateNewPhysicalShard = 0;
 														self.moveReusePhysicalShard = 0;
-												Rename NewPhysicalShardReason to RetryFindDstReason

											
										
										
											2022-10-25 01:39:32 +08:00
+														for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) {
 															self.retryFindDstReasonCount[i] = 0;
-												Count the detailed reason for new physical shard creation during data move

											
										
										
											2022-10-23 11:48:58 +08:00
+														}
-												Add a counter to track physical shard creation throuogh moves

											
										
										
											2022-10-20 13:09:04 +08:00
+													}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+												}
-												Rewrite all `Void _ = wait(...)` -> `wait(...)`.

This takes advantage of the new actorcompiler functionality to avoid
having duplicate definitions of `Void _` when trying to feed the
un-actorompiled source through clang.

											
										
										
											2018-08-11 04:57:10 +08:00
+												when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
-												ddqueue.periodicalRefreshCounter()

											
										
										
											2022-08-06 06:26:34 +08:00
+												when(wait(waitForAll(ddQueueFutures))) {}
-												split DD related headers

											
										
										
											2022-08-17 05:32:55 +08:00
+												when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) {
 													r.send(self.getUnhealthyRelocationCount());
 												}
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+											}
 										}
 									} catch (Error& e) {
-												temporary change special key for data distributor

											
										
										
											2022-02-26 03:01:23 +08:00
+										if (e.code() != error_code_broken_promise && // FIXME: Get rid of these broken_promise errors every time we
 										                                             // are killed by the master dying
-												Shard based move (#6981)

* Shard based move.

* Clean up.

* Clear results on retry in getInitialDataDistribution.

* Remove assertion on SHARD_ENCODE_LOCATION_METADATA for compatibility.

* Resolved comments.

Co-authored-by: He Liu <heliu@apple.com>
											
										
										
											2022-07-08 11:49:16 +08:00
+										    e.code() != error_code_movekeys_conflict && e.code() != error_code_data_move_cancelled &&
 										    e.code() != error_code_data_move_dest_team_not_found)
-												Add a new DataDistributor role.

Let cluster controller to start a new data distributor role by sending a
message to a chosen worker.
Change MasterInterface usage in DataDistribution to masterId

Add DataDistributor rejoin handling.

This allows the data distributor to tell the new cluster controller of its
existence so that the controller doesn't spawn a new one. I.e., there should
be only ONE data distributor in the cluster.

If DataDistributor (DD) doesn't join in a while, then ClusterController (CC) tries
to recruit one as DD. CC also monitors DD and restarts one if it failed.

The Proxy is also monitoring the DD. If DD failed, the Proxy will ask CC for
the new DD.

Add GetRecoveryInfo RPC to master server, which is called by data distributor
to obtain the recovery Transaction version from the master server.

											
										
										
											2018-12-14 05:31:37 +08:00
+											TraceEvent(SevError, "DataDistributionQueueError", distributorId).error(e);
-												Initial repository commit

											
										
										
											2017-05-26 04:48:44 +08:00
+										throw e;
 									}
 								}
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
-												split DD related headers

											
										
										
											2022-08-17 05:32:55 +08:00
+								ACTOR Future<Void> dataDistributionQueue(Reference<DDSharedContext> context, Database cx);
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+								TEST_CASE("/DataDistribution/DDQueue/ServerCounterTrace") {
-												add knob to control summarize

											
										
										
											2022-08-10 14:32:40 +08:00
+									state double duration = 2.5 * SERVER_KNOBS->DD_QUEUE_COUNTER_REFRESH_INTERVAL;
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+									state DDQueue self;
 									state Future<Void> counterFuture = self.periodicalRefreshCounter();
 									state Future<Void> finishFuture = delay(duration);
 									std::cout << "Start trace counter unit test for " << duration << "s ...\n";
 									loop choose {
 										when(wait(counterFuture)) {}
-												format code

											
										
										
											2022-08-10 05:02:57 +08:00
+										when(wait(finishFuture)) { break; }
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+										when(wait(delayJittered(2.0))) {
 											std::vector<UID> team(3);
 											for (int i = 0; i < team.size(); ++i) {
-												add summarize event

											
										
										
											2022-08-10 09:22:48 +08:00
+												team[i] = UID(deterministicRandom()->randomInt(1, 400), 0);
-												add unit test

											
										
										
											2022-08-06 14:57:52 +08:00
+											}
 											auto reason = RelocateReason(deterministicRandom()->randomInt(0, RelocateReason::typeCount()));
 											auto countType = DDQueue::ServerCounter::randomCountType();
 											self.serverCounter.increaseForTeam(team, reason, countType);
 											ASSERT(self.serverCounter.get(team[0], reason, countType));
 										}
 									}
 									std::cout << "Finished.";
 									return Void();
-												Make g_simulator a pointer

											
										
										
											2022-09-15 08:10:49 +08:00
+								}