More aggressively attempt to find teams that do not have low disk space

This commit is contained in:
Evan Tschannen 2020-02-20 16:47:50 -08:00
parent 44f21ca332
commit 819c55556c
5 changed files with 20 additions and 9 deletions

View File

@@ -241,9 +241,14 @@ public:
 		double minAvailableSpaceRatio = getMinAvailableSpaceRatio(includeInFlight);
 		int64_t inFlightBytes = includeInFlight ? getDataInFlightToTeam() / servers.size() : 0;
 		double availableSpaceMultiplier = SERVER_KNOBS->FREE_SPACE_RATIO_CUTOFF / ( std::max( std::min( SERVER_KNOBS->FREE_SPACE_RATIO_CUTOFF, minAvailableSpaceRatio ), 0.000001 ) );
+		if(servers.size()>2) {
+			//make sure in triple replication the penalty is high enough that you will always avoid a team with a member at 20% free space
+			availableSpaceMultiplier = availableSpaceMultiplier * availableSpaceMultiplier;
+		}
-		if(availableSpaceMultiplier > 1 && deterministicRandom()->random01() < 0.001)
-			TraceEvent(SevWarn, "DiskNearCapacity").detail("AvailableSpaceRatio", minAvailableSpaceRatio);
+		if(minAvailableSpaceRatio < SERVER_KNOBS->START_MIN_FREE_SPACE_RATIO) {
+			TraceEvent(SevWarn, "DiskNearCapacity").suppressFor(1.0).detail("AvailableSpaceRatio", minAvailableSpaceRatio);
+		}
 		return (physicalBytes + (inflightPenalty*inFlightBytes)) * availableSpaceMultiplier;
 	}

View File

@@ -929,7 +929,8 @@ ACTOR Future<Void> dataDistributionRelocator( DDQueueData *self, RelocateData rd
 				if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY;
 				if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;
-				auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, SERVER_KNOBS->MIN_FREE_SPACE_RATIO, inflightPenalty);
+				double targetFreeSpaceRatio = std::max(SERVER_KNOBS->START_MIN_FREE_SPACE_RATIO - stuckCount*SERVER_KNOBS->MIN_FREE_SPACE_RATIO_INCREMENT, SERVER_KNOBS->END_MIN_FREE_SPACE_RATIO);
+				auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, targetFreeSpaceRatio, inflightPenalty);
 				req.completeSources = rd.completeSources;
 				Optional<Reference<IDataDistributionTeam>> bestTeam = wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
 				// If a DC has no healthy team, we stop checking the other DCs until

View File

@@ -102,7 +102,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
 	init( BG_DD_DECREASE_RATE, 1.02 );
 	init( BG_DD_SATURATION_DELAY, 1.0 );
 	init( INFLIGHT_PENALTY_HEALTHY, 1.0 );
-	init( INFLIGHT_PENALTY_UNHEALTHY, 10.0 );
+	init( INFLIGHT_PENALTY_UNHEALTHY, 500.0 );
 	init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 );
 	init( PRIORITY_RECOVER_MOVE, 110 );
@@ -184,7 +184,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
 	init( DD_MERGE_COALESCE_DELAY, isSimulated ? 30.0 : 300.0 ); if( randomize && BUGGIFY ) DD_MERGE_COALESCE_DELAY = 0.001;
 	init( STORAGE_METRICS_POLLING_DELAY, 2.0 ); if( randomize && BUGGIFY ) STORAGE_METRICS_POLLING_DELAY = 15.0;
 	init( STORAGE_METRICS_RANDOM_DELAY, 0.2 );
-	init( FREE_SPACE_RATIO_CUTOFF, 0.1 );
+	init( FREE_SPACE_RATIO_CUTOFF, 0.35 );
 	init( FREE_SPACE_RATIO_DD_CUTOFF, 0.2 );
 	init( DESIRED_TEAMS_PER_SERVER, 5 ); if( randomize && BUGGIFY ) DESIRED_TEAMS_PER_SERVER = 1;
 	init( MAX_TEAMS_PER_SERVER, 5*DESIRED_TEAMS_PER_SERVER );
@@ -428,7 +428,9 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
 	init( MAX_TRANSACTIONS_PER_BYTE, 1000 );
 	init( MIN_FREE_SPACE, 1e8 );
-	init( MIN_FREE_SPACE_RATIO, 0.05 );
+	init( START_MIN_FREE_SPACE_RATIO, 0.26 );
+	init( END_MIN_FREE_SPACE_RATIO, 0.05 );
+	init( MIN_FREE_SPACE_RATIO_INCREMENT, 0.03 );
 	init( MAX_TL_SS_VERSION_DIFFERENCE, 1e99 ); // if( randomize && BUGGIFY ) MAX_TL_SS_VERSION_DIFFERENCE = std::max(1.0, 0.25 * VERSIONS_PER_SECOND); // spring starts at half this value //FIXME: this knob causes ratekeeper to clamp on idle cluster in simulation that have a large number of logs
 	init( MAX_TL_SS_VERSION_DIFFERENCE_BATCH, 1e99 );

View File

@@ -151,6 +151,7 @@ public:
 	double STORAGE_METRICS_RANDOM_DELAY;
 	double FREE_SPACE_RATIO_CUTOFF;
 	double FREE_SPACE_RATIO_DD_CUTOFF;
+	double FREE_SPACE_CUTOFF_PENALTY;
 	int DESIRED_TEAMS_PER_SERVER;
 	int MAX_TEAMS_PER_SERVER;
 	int64_t DD_SHARD_SIZE_GRANULARITY;
@@ -367,7 +368,9 @@ public:
 	double MAX_TRANSACTIONS_PER_BYTE;
 	int64_t MIN_FREE_SPACE;
-	double MIN_FREE_SPACE_RATIO;
+	double START_MIN_FREE_SPACE_RATIO;
+	double END_MIN_FREE_SPACE_RATIO;
+	double MIN_FREE_SPACE_RATIO_INCREMENT;
 	double MAX_TL_SS_VERSION_DIFFERENCE; // spring starts at half this value
 	double MAX_TL_SS_VERSION_DIFFERENCE_BATCH;

View File

@@ -390,7 +390,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
 		limitReason_t ssLimitReason = limitReason_t::unlimited;
-		int64_t minFreeSpace = std::max(SERVER_KNOBS->MIN_FREE_SPACE, (int64_t)(SERVER_KNOBS->MIN_FREE_SPACE_RATIO * ss.smoothTotalSpace.smoothTotal()));
+		int64_t minFreeSpace = std::max(SERVER_KNOBS->MIN_FREE_SPACE, (int64_t)(SERVER_KNOBS->END_MIN_FREE_SPACE_RATIO * ss.smoothTotalSpace.smoothTotal()));
 		worstFreeSpaceStorageServer = std::min(worstFreeSpaceStorageServer, (int64_t)ss.smoothFreeSpace.smoothTotal() - minFreeSpace);
@@ -574,7 +574,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
 		limitReason_t tlogLimitReason = limitReason_t::log_server_write_queue;
-		int64_t minFreeSpace = std::max( SERVER_KNOBS->MIN_FREE_SPACE, (int64_t)(SERVER_KNOBS->MIN_FREE_SPACE_RATIO * tl.smoothTotalSpace.smoothTotal()));
+		int64_t minFreeSpace = std::max( SERVER_KNOBS->MIN_FREE_SPACE, (int64_t)(SERVER_KNOBS->END_MIN_FREE_SPACE_RATIO * tl.smoothTotalSpace.smoothTotal()));
 		worstFreeSpaceTLog = std::min(worstFreeSpaceTLog, (int64_t)tl.smoothFreeSpace.smoothTotal() - minFreeSpace);