/*
 * DataDistribution.h
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include "fdbclient/NativeAPI.h"
|
|
#include "ClusterRecruitmentInterface.h"
|
|
#include "MoveKeys.h"
|
|
#include "LogSystem.h"
|
|
|
|
struct RelocateShard {
|
|
KeyRange keys;
|
|
int priority;
|
|
|
|
RelocateShard() : priority(0) {}
|
|
RelocateShard( KeyRange const& keys, int priority ) : keys(keys), priority(priority) {}
|
|
};
|
|
|
|
// Higher priorities are executed first
// Priority/100 is the "priority group"/"superpriority".  Priority inversion
//   is possible within but not between priority groups; fewer priority groups
//   mean better worst case time bounds
enum {
	// Priority group 1 (100-199)
	PRIORITY_REBALANCE_SHARD = 100,
	PRIORITY_RECOVER_MOVE = 110,
	PRIORITY_REBALANCE_UNDERUTILIZED_TEAM = 120,
	PRIORITY_REBALANCE_OVERUTILIZED_TEAM = 121,
	PRIORITY_TEAM_HEALTHY = 140,
	PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER = 150,

	// Priority group 2 (200-299)
	PRIORITY_MERGE_SHARD = 240,
	PRIORITY_SPLIT_SHARD = 250,

	// Priority group 8 (800-899)
	PRIORITY_TEAM_UNHEALTHY = 800,
	PRIORITY_TEAM_2_LEFT = 809,

	// Priority group 9 (900-999)
	PRIORITY_TEAM_1_LEFT = 900,

	PRIORITY_TEAM_0_LEFT = 999
};
|
|
|
|
// Integer codes SOME_SHARED / NONE_SHARED.
// NOTE(review): their consumers are not visible in this header; presumably they
// classify how many servers two teams have in common -- confirm against the users.
enum {
	SOME_SHARED = 2,
	NONE_SHARED = 3
};
|
|
|
|
struct IDataDistributionTeam {
|
|
virtual vector<StorageServerInterface> getLastKnownServerInterfaces() = 0;
|
|
virtual int size() = 0;
|
|
virtual vector<UID> const& getServerIDs() = 0;
|
|
virtual void addDataInFlightToTeam( int64_t delta ) = 0;
|
|
virtual int64_t getDataInFlightToTeam() = 0;
|
|
virtual int64_t getLoadBytes( bool includeInFlight = true, double inflightPenalty = 1.0 ) = 0;
|
|
virtual int64_t getMinFreeSpace( bool includeInFlight = true ) = 0;
|
|
virtual double getMinFreeSpaceRatio( bool includeInFlight = true ) = 0;
|
|
virtual bool hasHealthyFreeSpace() = 0;
|
|
virtual Future<Void> updatePhysicalMetrics() = 0;
|
|
virtual void addref() = 0;
|
|
virtual void delref() = 0;
|
|
virtual bool isHealthy() = 0;
|
|
virtual void setHealthy(bool) = 0;
|
|
virtual int getPriority() = 0;
|
|
virtual void setPriority(int) = 0;
|
|
virtual bool isOptimal() = 0;
|
|
virtual bool isWrongConfiguration() = 0;
|
|
virtual void setWrongConfiguration(bool) = 0;
|
|
virtual void addServers(const vector<UID> &servers) = 0;
|
|
|
|
std::string getDesc() {
|
|
const auto& servers = getLastKnownServerInterfaces();
|
|
std::string s = format("Size %d; ", servers.size());
|
|
for(int i=0; i<servers.size(); i++) {
|
|
if (i) s += ", ";
|
|
s += servers[i].address().toString() + " " + servers[i].id().shortString();
|
|
}
|
|
return s;
|
|
}
|
|
};
|
|
|
|
struct GetTeamRequest {
|
|
bool wantsNewServers;
|
|
bool wantsTrueBest;
|
|
bool preferLowerUtilization;
|
|
double inflightPenalty;
|
|
std::vector<UID> sources;
|
|
std::vector<UID> completeSources;
|
|
Promise< Optional< Reference<IDataDistributionTeam> > > reply;
|
|
|
|
GetTeamRequest() {}
|
|
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
|
|
};
|
|
|
|
struct GetMetricsRequest {
|
|
KeyRange keys;
|
|
Promise< StorageMetrics > reply;
|
|
|
|
GetMetricsRequest() {}
|
|
GetMetricsRequest( KeyRange const& keys ) : keys(keys) {}
|
|
};
|
|
|
|
struct TeamCollectionInterface {
|
|
PromiseStream< GetTeamRequest > getTeam;
|
|
};
|
|
|
|
class ShardsAffectedByTeamFailure : public ReferenceCounted<ShardsAffectedByTeamFailure> {
public:
	ShardsAffectedByTeamFailure() {}

	// A set of servers plus a flag saying whether the team is a primary team.
	struct Team {
		vector<UID> servers; // sorted
		bool primary;

		Team() : primary(true) {}
		Team(vector<UID> const& servers, bool primary) : servers(servers), primary(primary) {}

		// Orders by server list first, then by the primary flag.
		bool operator < ( const Team& r ) const {
			if( servers == r.servers ) return primary < r.primary;
			return servers < r.servers;
		}
		bool operator == ( const Team& r ) const {
			return servers == r.servers && primary == r.primary;
		}
	};

	// This tracks the data distribution on the data distribution server so that teamTrackers can
	//   relocate the right shards when a team is degraded.

	// The following are important to make sure that failure responses don't revert splits or merges:
	//   - The shard boundaries in the two data structures reflect "queued" RelocateShard requests
	//       (i.e. reflects the desired set of shards being tracked by dataDistributionTracker,
	//       rather than the status quo).  These boundaries are modified in defineShard and the content
	//       of what servers correspond to each shard is a copy or union of the shards already there
	//   - The teams associated with each shard reflect either the sources for non-moving shards
	//       or the destination team for in-flight shards (the change is atomic with respect to team selection).
	//       moveShard() changes the servers associated with a shard and will never adjust the shard
	//       boundaries. If a move is received for a shard that has been redefined (the exact shard is
	//       no longer in the map), the servers will be set for all contained shards and added to all
	//       intersecting shards.

	int getNumberOfShards( UID ssID );
	vector<KeyRange> getShardsFor( Team team );

	//The first element of the pair is either the source for non-moving shards or the destination team for in-flight shards
	//The second element of the pair is all previous sources for in-flight shards
	std::pair<vector<Team>,vector<Team>> getTeamsFor( KeyRangeRef keys );

	void defineShard( KeyRangeRef keys );
	void moveShard( KeyRangeRef keys, std::vector<Team> destinationTeam );
	void finishMove( KeyRangeRef keys );
	void check();
private:
	// Strict weak ordering for team_shards: by team, then by the beginning of the key range.
	struct OrderByTeamKey {
		bool operator()( const std::pair<Team,KeyRange>& lhs, const std::pair<Team,KeyRange>& rhs ) const {
			if (lhs.first < rhs.first) return true;
			// Team only defines operator< and operator==; express "greater than"
			// with swapped operands so this does not depend on std::rel_ops being
			// visible at the point of use.
			if (rhs.first < lhs.first) return false;
			return lhs.second.begin < rhs.second.begin;
		}
	};

	KeyRangeMap< std::pair<vector<Team>,vector<Team>> > shard_teams;	// A shard can be affected by the failure of multiple teams if it is a queued merge, or when usable_regions > 1
	std::set< std::pair<Team,KeyRange>, OrderByTeamKey > team_shards;
	std::map< UID, int > storageServerShards;

	void erase(Team team, KeyRange const& range);
	void insert(Team team, KeyRange const& range);
};
|
|
|
|
// DDShardInfo is so named to avoid link-time name collision with ShardInfo within the StorageServer
|
|
struct DDShardInfo {
|
|
Key key;
|
|
vector<UID> primarySrc;
|
|
vector<UID> remoteSrc;
|
|
vector<UID> primaryDest;
|
|
vector<UID> remoteDest;
|
|
bool hasDest;
|
|
|
|
explicit DDShardInfo(Key key) : key(key), hasDest(false) {}
|
|
};
|
|
|
|
// Reference-counted bundle of a mode value, the known servers with their
// process classes, the primary and remote team sets, and a list of DDShardInfo.
// NOTE(review): `mode` has no initializer here; whoever constructs this object
// is expected to set it -- confirm at the construction site.
struct InitialDataDistribution : ReferenceCounted<InitialDataDistribution> {
	int mode;
	vector<std::pair<StorageServerInterface, ProcessClass>> allServers;
	std::set<vector<UID>> primaryTeams;
	std::set<vector<UID>> remoteTeams;
	vector<DDShardInfo> shards;
};
|
|
|
|
// Declaration only; the definition lives outside this header.
// Every argument is taken by const reference and the result is a Future<Void>.
Future<Void> dataDistribution(
	Reference<AsyncVar<struct ServerDBInfo>> const& db,
	MasterInterface const& mi,
	DatabaseConfiguration const& configuration,
	PromiseStream< std::pair<UID, Optional<StorageServerInterface>> > const& serverChanges,
	Reference<ILogSystem> const& logSystem,
	Version const& recoveryCommitVersion,
	std::vector<Optional<Key>> const& primaryDcId,
	std::vector<Optional<Key>> const& remoteDcIds,
	double* const& lastLimited,
	Future<Void> const& remoteRecovered);
|
|
|
|
// Declaration only; the definition lives outside this header.
// Every argument is taken by const reference and the result is a Future<Void>.
Future<Void> dataDistributionTracker(
	Reference<InitialDataDistribution> const& initData,
	Database const& cx,
	PromiseStream<RelocateShard> const& output,
	Reference<ShardsAffectedByTeamFailure> const& shardsAffectedByTeamFailure,
	PromiseStream<GetMetricsRequest> const& getShardMetrics,
	FutureStream<Promise<int64_t>> const& getAverageShardBytes,
	Promise<Void> const& readyToStart,
	Reference<AsyncVar<bool>> const& zeroHealthyTeams,
	UID const& masterId);
|
|
|
|
// Declaration only; the definition lives outside this header.
// Every argument is taken by const reference and the result is a Future<Void>.
Future<Void> dataDistributionQueue(
	Database const& cx,
	PromiseStream<RelocateShard> const& output,
	FutureStream<RelocateShard> const& input,
	PromiseStream<GetMetricsRequest> const& getShardMetrics,
	Reference<AsyncVar<bool>> const& processingUnhealthy,
	vector<TeamCollectionInterface> const& teamCollection,
	Reference<ShardsAffectedByTeamFailure> const& shardsAffectedByTeamFailure,
	MoveKeysLock const& lock,
	PromiseStream<Promise<int64_t>> const& getAverageShardBytes,
	MasterInterface const& mi,
	int const& teamSize,
	double* const& lastLimited,
	Version const& recoveryVersion);
|
|
|
|
//Holds the permitted size and IO Bounds for a shard
|
|
struct ShardSizeBounds {
|
|
StorageMetrics max;
|
|
StorageMetrics min;
|
|
StorageMetrics permittedError;
|
|
|
|
bool operator == ( ShardSizeBounds const& rhs ) const {
|
|
return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
|
|
}
|
|
};
|
|
|
|
//Gets the permitted size and IO bounds for a shard
|
|
ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);
|
|
|
|
//Determines the maximum shard size based on the size of the database
// (declaration only; the definition lives outside this header)
int64_t getMaxShardSize( double dbSizeEstimate );
|