Merge pull request #2703 from ajbeamon/fix-stuck-dd-rebalancing
Fix issue with rebalancing data movement doing no work
This commit is contained in:
commit
9e84fa965d
|
@ -17,6 +17,8 @@ Fixes
|
|||
* Status could not label more than 5 processes as proxies. `(PR #2653) <https://github.com/apple/foundationdb/pull/2653>`_.
|
||||
* The ``TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER``, ``TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS``, ``TR_FLAG_DISABLE_SERVER_TEAM_REMOVER``, and ``BUGGIFY_ALL_COORDINATION`` knobs could not be set at runtime. `(PR #2661) <https://github.com/apple/foundationdb/pull/2661>`_.
|
||||
* Backup container filename parsing unnecessarily consulted the local filesystem, which caused errors when permission was denied. `(PR #2693) <https://github.com/apple/foundationdb/pull/2693>`_.
|
||||
* Rebalancing data movement could stop doing work even though the data in the cluster was not well balanced. `(PR #2703) <https://github.com/apple/foundationdb/pull/2703>`_.
|
||||
|
||||
|
||||
6.2.15
|
||||
======
|
||||
|
|
|
@ -293,8 +293,8 @@ public:
|
|||
return minRatio;
|
||||
}
|
||||
|
||||
virtual bool hasHealthyFreeSpace() {
|
||||
return getMinFreeSpaceRatio() > SERVER_KNOBS->MIN_FREE_SPACE_RATIO && getMinFreeSpace() > SERVER_KNOBS->MIN_FREE_SPACE;
|
||||
virtual bool hasHealthyFreeSpace(double minRatio, int64_t minFreeSpace) {
|
||||
return (minRatio == 0 || getMinFreeSpaceRatio() > minRatio) && (minFreeSpace == std::numeric_limits<int64_t>::min() || getMinFreeSpace() > minFreeSpace);
|
||||
}
|
||||
|
||||
virtual Future<Void> updateStorageMetrics() {
|
||||
|
@ -758,6 +758,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
std::vector<Reference<IDataDistributionTeam>> randomTeams;
|
||||
const std::set<UID> completeSources(req.completeSources.begin(), req.completeSources.end());
|
||||
|
||||
// Note: this block does not apply any filters from the request
|
||||
if( !req.wantsNewServers ) {
|
||||
for( int i = 0; i < req.completeSources.size(); i++ ) {
|
||||
if( !self->server_info.count( req.completeSources[i] ) ) {
|
||||
|
@ -784,7 +785,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
if( req.wantsTrueBest ) {
|
||||
ASSERT( !bestOption.present() );
|
||||
for( int i = 0; i < self->teams.size(); i++ ) {
|
||||
if( self->teams[i]->isHealthy() && (!req.preferLowerUtilization || self->teams[i]->hasHealthyFreeSpace()) ) {
|
||||
if (self->teams[i]->isHealthy() &&
|
||||
self->teams[i]->hasHealthyFreeSpace(req.minFreeSpaceRatio, req.preferLowerUtilization ? SERVER_KNOBS->MIN_FREE_SPACE : std::numeric_limits<int64_t>::min()) &&
|
||||
(!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(self->teams[i]->getServerIDs(), self->primary)).size() > 0))
|
||||
{
|
||||
int64_t loadBytes = self->teams[i]->getLoadBytes(true, req.inflightPenalty);
|
||||
if( !bestOption.present() || ( req.preferLowerUtilization && loadBytes < bestLoadBytes ) || ( !req.preferLowerUtilization && loadBytes > bestLoadBytes ) ) {
|
||||
bestLoadBytes = loadBytes;
|
||||
|
@ -798,7 +802,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) {
|
||||
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
|
||||
|
||||
bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
|
||||
bool ok = dest->isHealthy() &&
|
||||
dest->hasHealthyFreeSpace(req.minFreeSpaceRatio, req.preferLowerUtilization ? SERVER_KNOBS->MIN_FREE_SPACE : std::numeric_limits<int64_t>::min()) &&
|
||||
(!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(dest->getServerIDs(), self->primary)).size() > 0);
|
||||
|
||||
for(int i=0; ok && i<randomTeams.size(); i++) {
|
||||
if (randomTeams[i]->getServerIDs() == dest->getServerIDs()) {
|
||||
ok = false;
|
||||
|
@ -823,6 +830,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
|
||||
// Note: req.completeSources can be empty and all servers (and server teams) can be unhealthy.
|
||||
// We will get stuck at this! This only happens when a DC fails. No need to consider it right now.
|
||||
// Note: this block does not apply any filters from the request
|
||||
if(!bestOption.present() && self->zeroHealthyTeams->get()) {
|
||||
//Attempt to find the unhealthy source server team and return it
|
||||
for( int i = 0; i < req.completeSources.size(); i++ ) {
|
||||
|
|
|
@ -47,7 +47,7 @@ struct IDataDistributionTeam {
|
|||
virtual int64_t getLoadBytes( bool includeInFlight = true, double inflightPenalty = 1.0 ) = 0;
|
||||
virtual int64_t getMinFreeSpace( bool includeInFlight = true ) = 0;
|
||||
virtual double getMinFreeSpaceRatio( bool includeInFlight = true ) = 0;
|
||||
virtual bool hasHealthyFreeSpace() = 0;
|
||||
virtual bool hasHealthyFreeSpace( double minRatio, int64_t minFreeSpace ) = 0;
|
||||
virtual Future<Void> updateStorageMetrics() = 0;
|
||||
virtual void addref() = 0;
|
||||
virtual void delref() = 0;
|
||||
|
@ -75,12 +75,16 @@ struct GetTeamRequest {
|
|||
bool wantsNewServers;
|
||||
bool wantsTrueBest;
|
||||
bool preferLowerUtilization;
|
||||
bool teamMustHaveShards;
|
||||
double minFreeSpaceRatio;
|
||||
double inflightPenalty;
|
||||
std::vector<UID> completeSources;
|
||||
Promise< Optional< Reference<IDataDistributionTeam> > > reply;
|
||||
|
||||
GetTeamRequest() {}
|
||||
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
|
||||
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, bool teamMustHaveShards, double minFreeSpaceRatio = 0.0, double inflightPenalty = 1.0 )
|
||||
: wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), teamMustHaveShards( teamMustHaveShards ),
|
||||
minFreeSpaceRatio( minFreeSpaceRatio ), inflightPenalty( inflightPenalty ) {}
|
||||
};
|
||||
|
||||
struct GetMetricsRequest {
|
||||
|
|
|
@ -186,9 +186,9 @@ public:
|
|||
return result;
|
||||
}
|
||||
|
||||
virtual bool hasHealthyFreeSpace() {
|
||||
return all([](Reference<IDataDistributionTeam> team) {
|
||||
return team->hasHealthyFreeSpace();
|
||||
virtual bool hasHealthyFreeSpace(double minRatio, int64_t minFreeSpace) {
|
||||
return all([minRatio, minFreeSpace](Reference<IDataDistributionTeam> team) {
|
||||
return team->hasHealthyFreeSpace(minRatio, minFreeSpace);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -929,7 +929,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
|
|||
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
|
||||
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
|
||||
|
||||
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty);
|
||||
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, SERVER_KNOBS->MIN_FREE_SPACE_RATIO, inflightPenalty);
|
||||
req.completeSources = rd.completeSources;
|
||||
Optional<Reference<IDataDistributionTeam>> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
|
||||
// If a DC has no healthy team, we stop checking the other DCs until
|
||||
|
@ -1213,21 +1213,19 @@ ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self, int teamCollectionInd
|
|||
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] <
|
||||
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
|
||||
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
|
||||
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true))));
|
||||
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true, false, SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF))));
|
||||
if (randomTeam.present()) {
|
||||
if (randomTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
|
||||
state Optional<Reference<IDataDistributionTeam>> loadedTeam =
|
||||
wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
|
||||
GetTeamRequest(true, true, false))));
|
||||
if (loadedTeam.present()) {
|
||||
bool moved =
|
||||
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
|
||||
randomTeam.get(), teamCollectionIndex == 0));
|
||||
if (moved) {
|
||||
resetCount = 0;
|
||||
} else {
|
||||
resetCount++;
|
||||
}
|
||||
state Optional<Reference<IDataDistributionTeam>> loadedTeam =
|
||||
wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
|
||||
GetTeamRequest(true, true, false, true))));
|
||||
if (loadedTeam.present()) {
|
||||
bool moved =
|
||||
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
|
||||
randomTeam.get(), teamCollectionIndex == 0));
|
||||
if (moved) {
|
||||
resetCount = 0;
|
||||
} else {
|
||||
resetCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1282,20 +1280,18 @@ ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex)
|
|||
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] <
|
||||
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
|
||||
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
|
||||
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false))));
|
||||
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false, true))));
|
||||
if (randomTeam.present()) {
|
||||
state Optional<Reference<IDataDistributionTeam>> unloadedTeam = wait(brokenPromiseToNever(
|
||||
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, true, true))));
|
||||
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, true, true, false, SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF))));
|
||||
if (unloadedTeam.present()) {
|
||||
if (unloadedTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
|
||||
bool moved =
|
||||
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
|
||||
unloadedTeam.get(), teamCollectionIndex == 0));
|
||||
if (moved) {
|
||||
resetCount = 0;
|
||||
} else {
|
||||
resetCount++;
|
||||
}
|
||||
bool moved =
|
||||
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
|
||||
unloadedTeam.get(), teamCollectionIndex == 0));
|
||||
if (moved) {
|
||||
resetCount = 0;
|
||||
} else {
|
||||
resetCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue