Merge pull request #2703 from ajbeamon/fix-stuck-dd-rebalancing

Fix issue with rebalancing data movement doing no work
This commit is contained in:
A.J. Beamon 2020-02-20 15:56:04 -08:00 committed by GitHub
commit 9e84fa965d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 45 additions and 35 deletions

View File

@ -17,6 +17,8 @@ Fixes
* Status could not label more than 5 processes as proxies. `(PR #2653) <https://github.com/apple/foundationdb/pull/2653>`_.
* The ``TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER``, ``TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS``, ``TR_FLAG_DISABLE_SERVER_TEAM_REMOVER``, and ``BUGGIFY_ALL_COORDINATION`` knobs could not be set at runtime. `(PR #2661) <https://github.com/apple/foundationdb/pull/2661>`_.
* Backup container filename parsing was unnecessarily consulting the local filesystem which will error when permission is denied. `(PR #2693) <https://github.com/apple/foundationdb/pull/2693>`_.
* Rebalancing data movement could stop doing work even though the data in the cluster was not well balanced. `(PR #2703) <https://github.com/apple/foundationdb/pull/2703>`_.
6.2.15
======

View File

@ -293,8 +293,8 @@ public:
return minRatio;
}
virtual bool hasHealthyFreeSpace() {
return getMinFreeSpaceRatio() > SERVER_KNOBS->MIN_FREE_SPACE_RATIO && getMinFreeSpace() > SERVER_KNOBS->MIN_FREE_SPACE;
virtual bool hasHealthyFreeSpace(double minRatio, int64_t minFreeSpace) {
return (minRatio == 0 || getMinFreeSpaceRatio() > minRatio) && (minFreeSpace == std::numeric_limits<int64_t>::min() || getMinFreeSpace() > minFreeSpace);
}
virtual Future<Void> updateStorageMetrics() {
@ -758,6 +758,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
std::vector<Reference<IDataDistributionTeam>> randomTeams;
const std::set<UID> completeSources(req.completeSources.begin(), req.completeSources.end());
// Note: this block does not apply any filters from the request
if( !req.wantsNewServers ) {
for( int i = 0; i < req.completeSources.size(); i++ ) {
if( !self->server_info.count( req.completeSources[i] ) ) {
@ -784,7 +785,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
if( req.wantsTrueBest ) {
ASSERT( !bestOption.present() );
for( int i = 0; i < self->teams.size(); i++ ) {
if( self->teams[i]->isHealthy() && (!req.preferLowerUtilization || self->teams[i]->hasHealthyFreeSpace()) ) {
if (self->teams[i]->isHealthy() &&
self->teams[i]->hasHealthyFreeSpace(req.minFreeSpaceRatio, req.preferLowerUtilization ? SERVER_KNOBS->MIN_FREE_SPACE : std::numeric_limits<int64_t>::min()) &&
(!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(self->teams[i]->getServerIDs(), self->primary)).size() > 0))
{
int64_t loadBytes = self->teams[i]->getLoadBytes(true, req.inflightPenalty);
if( !bestOption.present() || ( req.preferLowerUtilization && loadBytes < bestLoadBytes ) || ( !req.preferLowerUtilization && loadBytes > bestLoadBytes ) ) {
bestLoadBytes = loadBytes;
@ -798,7 +802,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) {
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
bool ok = dest->isHealthy() &&
dest->hasHealthyFreeSpace(req.minFreeSpaceRatio, req.preferLowerUtilization ? SERVER_KNOBS->MIN_FREE_SPACE : std::numeric_limits<int64_t>::min()) &&
(!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(dest->getServerIDs(), self->primary)).size() > 0);
for(int i=0; ok && i<randomTeams.size(); i++) {
if (randomTeams[i]->getServerIDs() == dest->getServerIDs()) {
ok = false;
@ -823,6 +830,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// Note: req.completeSources can be empty and all servers (and server teams) can be unhealthy.
// We will get stuck at this! This only happens when a DC fails. No need to consider it right now.
// Note: this block does not apply any filters from the request
if(!bestOption.present() && self->zeroHealthyTeams->get()) {
//Attempt to find the unhealthy source server team and return it
for( int i = 0; i < req.completeSources.size(); i++ ) {

View File

@ -47,7 +47,7 @@ struct IDataDistributionTeam {
virtual int64_t getLoadBytes( bool includeInFlight = true, double inflightPenalty = 1.0 ) = 0;
virtual int64_t getMinFreeSpace( bool includeInFlight = true ) = 0;
virtual double getMinFreeSpaceRatio( bool includeInFlight = true ) = 0;
virtual bool hasHealthyFreeSpace() = 0;
virtual bool hasHealthyFreeSpace( double minRatio, int64_t minFreeSpace ) = 0;
virtual Future<Void> updateStorageMetrics() = 0;
virtual void addref() = 0;
virtual void delref() = 0;
@ -75,12 +75,16 @@ struct GetTeamRequest {
bool wantsNewServers;
bool wantsTrueBest;
bool preferLowerUtilization;
bool teamMustHaveShards;
double minFreeSpaceRatio;
double inflightPenalty;
std::vector<UID> completeSources;
Promise< Optional< Reference<IDataDistributionTeam> > > reply;
GetTeamRequest() {}
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, bool teamMustHaveShards, double minFreeSpaceRatio = 0.0, double inflightPenalty = 1.0 )
: wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), teamMustHaveShards( teamMustHaveShards ),
minFreeSpaceRatio( minFreeSpaceRatio ), inflightPenalty( inflightPenalty ) {}
};
struct GetMetricsRequest {

View File

@ -186,9 +186,9 @@ public:
return result;
}
virtual bool hasHealthyFreeSpace() {
return all([](Reference<IDataDistributionTeam> team) {
return team->hasHealthyFreeSpace();
virtual bool hasHealthyFreeSpace(double minRatio, int64_t minFreeSpace) {
return all([minRatio, minFreeSpace](Reference<IDataDistributionTeam> team) {
return team->hasHealthyFreeSpace(minRatio, minFreeSpace);
});
}
@ -929,7 +929,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty);
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, SERVER_KNOBS->MIN_FREE_SPACE_RATIO, inflightPenalty);
req.completeSources = rd.completeSources;
Optional<Reference<IDataDistributionTeam>> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
// If a DC has no healthy team, we stop checking the other DCs until
@ -1213,21 +1213,19 @@ ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self, int teamCollectionInd
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] <
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true))));
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true, false, SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF))));
if (randomTeam.present()) {
if (randomTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
state Optional<Reference<IDataDistributionTeam>> loadedTeam =
wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
GetTeamRequest(true, true, false))));
if (loadedTeam.present()) {
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
randomTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
state Optional<Reference<IDataDistributionTeam>> loadedTeam =
wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
GetTeamRequest(true, true, false, true))));
if (loadedTeam.present()) {
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
randomTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
}
}
@ -1282,20 +1280,18 @@ ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex)
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] <
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false))));
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false, true))));
if (randomTeam.present()) {
state Optional<Reference<IDataDistributionTeam>> unloadedTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, true, true))));
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, true, true, false, SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF))));
if (unloadedTeam.present()) {
if (unloadedTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
unloadedTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
unloadedTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
}
}