Make the DD priority associated with populating a remote region lower than the priority of machine failures

This commit is contained in:
Evan Tschannen 2020-03-04 14:07:32 -08:00
parent b0062f58d3
commit 6296465e07
5 changed files with 22 additions and 14 deletions

View File

@ -2870,7 +2870,9 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
lastWrongConfiguration = anyWrongConfiguration;
state int lastPriority = team->getPriority();
if( serversLeft < self->configuration.storageTeamSize ) {
if(team->size() == 0) {
team->setPriority( SERVER_KNOBS->PRIORITY_POPULATE_REGION );
} else if( serversLeft < self->configuration.storageTeamSize ) {
if( serversLeft == 0 )
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_0_LEFT );
else if( serversLeft == 1 )
@ -2887,10 +2889,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY );
}
}
else if( anyUndesired )
else if( anyUndesired ) {
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER );
else
} else {
team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_HEALTHY );
}
if(lastPriority != team->getPriority()) {
self->priority_teams[lastPriority]--;

View File

@ -57,7 +57,8 @@ struct RelocateData {
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT), interval("QueuedRelocation") {}
static bool isHealthPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
return priority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ||
priority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
priority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
priority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ||
@ -394,7 +395,7 @@ struct DDQueueData {
// Ensure a team remover will not start before the previous one finishes removing a team and moving its data away.
// NOTE: split and merge shard have higher priority. If they had to wait for unhealthyRelocations = 0,
// a deadlock may happen: split/merge shard waits for unhealthyRelocations to reach 0, while it blocks team_redundant.
if (healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
if (healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
unhealthyRelocations++;
rawProcessingUnhealthy->set(true);
@ -402,7 +403,7 @@ struct DDQueueData {
priority_relocations[priority]++;
}
void finishRelocation(int priority, int healthPriority) {
if (healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
if (healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ||
healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
unhealthyRelocations--;
ASSERT(unhealthyRelocations >= 0);
@ -927,7 +928,7 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
while( tciIndex < self->teamCollections.size() ) {
double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
if(rd.healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, inflightPenalty);
req.completeSources = rd.completeSources;
@ -1497,6 +1498,7 @@ ACTOR Future<Void> dataDistributionQueue(
.detail( "PriorityTeamContainsUndesiredServer", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER] )
.detail( "PriorityTeamRedundant", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT] )
.detail( "PriorityMergeShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_MERGE_SHARD] )
.detail( "PriorityPopulateRegion", self.priority_relocations[SERVER_KNOBS->PRIORITY_POPULATE_REGION] )
.detail( "PriorityTeamUnhealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY] )
.detail( "PriorityTeam2Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_2_LEFT] )
.detail( "PriorityTeam1Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_1_LEFT] )

View File

@ -112,6 +112,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 );
init( PRIORITY_TEAM_REDUNDANT, 200 );
init( PRIORITY_MERGE_SHARD, 340 );
init( PRIORITY_POPULATE_REGION, 600 );
init( PRIORITY_TEAM_UNHEALTHY, 700 );
init( PRIORITY_TEAM_2_LEFT, 709 );
init( PRIORITY_TEAM_1_LEFT, 800 );

View File

@ -117,6 +117,7 @@ public:
int PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER;
int PRIORITY_TEAM_REDUNDANT;
int PRIORITY_MERGE_SHARD;
int PRIORITY_POPULATE_REGION;
int PRIORITY_TEAM_UNHEALTHY;
int PRIORITY_TEAM_2_LEFT;
int PRIORITY_TEAM_1_LEFT;

View File

@ -1430,29 +1430,30 @@ ACTOR static Future<JsonBuilderObject> dataStatusFetcher(WorkerDetails ddWorker,
stateSectionObj["description"] = "No replicas remain of some data";
stateSectionObj["min_replicas_remaining"] = 0;
replicas = 0;
}
else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_1_LEFT) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_1_LEFT) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "healing";
stateSectionObj["description"] = "Only one replica remains of some data";
stateSectionObj["min_replicas_remaining"] = 1;
replicas = 1;
}
else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "healing";
stateSectionObj["description"] = "Only two replicas remain of some data";
stateSectionObj["min_replicas_remaining"] = 2;
replicas = 2;
}
else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) {
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) {
stateSectionObj["healthy"] = false;
stateSectionObj["name"] = "healing";
stateSectionObj["description"] = "Restoring replication factor";
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_POPULATE_REGION) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "healthy_populating_region";
stateSectionObj["description"] = "Populating remote region";
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_MERGE_SHARD) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "healthy_repartitioning";
stateSectionObj["description"] = "Repartitioning.";
stateSectionObj["description"] = "Repartitioning";
} else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) {
stateSectionObj["healthy"] = true;
stateSectionObj["name"] = "optimizing_team_collections";