Add new criteria to data distribution's (DD's) GetTeamRequest that allow the requestor to require that shards be assigned to the chosen team and that the team have a minimum free space ratio. This avoids scenarios where the team chosen while processing the request is later rejected by the requestor, causing rebalancing movements to get stuck.

This commit is contained in:
A.J. Beamon 2020-02-20 09:32:00 -08:00
parent 8cfc032929
commit c164acb88d
3 changed files with 46 additions and 30 deletions

View File

@ -773,7 +773,10 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
break;
}
}
if(found && teamList[j]->isHealthy()) {
if(found && teamList[j]->isHealthy() &&
(!req.requiresAssignedData || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(teamList[j]->getServerIDs(), self->primary)).size() > 0) &&
teamList[j]->getMinFreeSpaceRatio() >= req.minFreeSpaceRatio)
{
req.reply.send( teamList[j] );
return Void();
}
@ -784,7 +787,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
if( req.wantsTrueBest ) {
ASSERT( !bestOption.present() );
for( int i = 0; i < self->teams.size(); i++ ) {
if( self->teams[i]->isHealthy() && (!req.preferLowerUtilization || self->teams[i]->hasHealthyFreeSpace()) ) {
if (self->teams[i]->isHealthy() &&
(!req.preferLowerUtilization || self->teams[i]->hasHealthyFreeSpace()) &&
(!req.requiresAssignedData || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(self->teams[i]->getServerIDs(), self->primary)) .size() > 0) &&
self->teams[i]->getMinFreeSpaceRatio() >= req.minFreeSpaceRatio)
{
int64_t loadBytes = self->teams[i]->getLoadBytes(true, req.inflightPenalty);
if( !bestOption.present() || ( req.preferLowerUtilization && loadBytes < bestLoadBytes ) || ( !req.preferLowerUtilization && loadBytes > bestLoadBytes ) ) {
bestLoadBytes = loadBytes;
@ -798,7 +805,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
while( randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES ) {
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyFreeSpace());
bool ok = dest->isHealthy() &&
(!req.preferLowerUtilization || dest->hasHealthyFreeSpace()) &&
(!req.requiresAssignedData || self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(dest->getServerIDs(), self->primary)).size() > 0) &&
dest->getMinFreeSpaceRatio() >= req.minFreeSpaceRatio;
for(int i=0; ok && i<randomTeams.size(); i++) {
if (randomTeams[i]->getServerIDs() == dest->getServerIDs()) {
ok = false;
@ -833,6 +844,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
for( int j = 0; j < teamList.size(); j++ ) {
bool found = true;
auto serverIDs = teamList[j]->getServerIDs();
if((req.requiresAssignedData && self->shardsAffectedByTeamFailure->getShardsFor(ShardsAffectedByTeamFailure::Team(serverIDs, self->primary)).size() == 0) ||
teamList[j]->getMinFreeSpaceRatio() < req.minFreeSpaceRatio)
{
continue;
}
for( int k = 0; k < teamList[j]->size(); k++ ) {
if( !completeSources.count( serverIDs[k] ) ) {
found = false;

View File

@ -75,12 +75,16 @@ struct GetTeamRequest {
bool wantsNewServers;
bool wantsTrueBest;
bool preferLowerUtilization;
bool requiresAssignedData;
double minFreeSpaceRatio;
double inflightPenalty;
std::vector<UID> completeSources;
Promise< Optional< Reference<IDataDistributionTeam> > > reply;
GetTeamRequest() {}
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, double inflightPenalty = 1.0 ) : wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), inflightPenalty( inflightPenalty ) {}
GetTeamRequest( bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, bool requiresAssignedData, double minFreeSpaceRatio = 0.0, double inflightPenalty = 1.0 )
: wantsNewServers( wantsNewServers ), wantsTrueBest( wantsTrueBest ), preferLowerUtilization( preferLowerUtilization ), requiresAssignedData(requiresAssignedData),
minFreeSpaceRatio( minFreeSpaceRatio ), inflightPenalty( inflightPenalty ) {}
};
struct GetMetricsRequest {

View File

@ -929,7 +929,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, inflightPenalty);
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, 0.0, inflightPenalty);
req.completeSources = rd.completeSources;
Optional<Reference<IDataDistributionTeam>> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
// If a DC has no healthy team, we stop checking the other DCs until
@ -1213,21 +1213,19 @@ ACTOR Future<Void> BgDDMountainChopper( DDQueueData* self, int teamCollectionInd
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] <
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true))));
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, true, false, SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF))));
if (randomTeam.present()) {
if (randomTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
state Optional<Reference<IDataDistributionTeam>> loadedTeam =
wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
GetTeamRequest(true, true, false))));
if (loadedTeam.present()) {
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
randomTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
state Optional<Reference<IDataDistributionTeam>> loadedTeam =
wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
GetTeamRequest(true, true, false, true))));
if (loadedTeam.present()) {
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, loadedTeam.get(),
randomTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
}
}
@ -1282,20 +1280,18 @@ ACTOR Future<Void> BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex)
if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] <
SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
state Optional<Reference<IDataDistributionTeam>> randomTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false))));
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, false, false, true))));
if (randomTeam.present()) {
state Optional<Reference<IDataDistributionTeam>> unloadedTeam = wait(brokenPromiseToNever(
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, true, true))));
self->teamCollections[teamCollectionIndex].getTeam.getReply(GetTeamRequest(true, true, true, false, SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF))));
if (unloadedTeam.present()) {
if (unloadedTeam.get()->getMinFreeSpaceRatio() > SERVER_KNOBS->FREE_SPACE_RATIO_DD_CUTOFF) {
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
unloadedTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
bool moved =
wait(rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, randomTeam.get(),
unloadedTeam.get(), teamCollectionIndex == 0));
if (moved) {
resetCount = 0;
} else {
resetCount++;
}
}
}