adjust CPU pivot knobs to hack simulation test
This commit is contained in:
parent
990ad26d8b
commit
5648f827a0
|
@ -169,12 +169,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD, 960 ); if( randomize && BUGGIFY ) PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD = 360; // Set as the lowest priority
|
||||
|
||||
// Data distribution
|
||||
init( AVAILABLE_SPACE_PIVOT_PERCENT, 0.6);
|
||||
init( CPU_PIVOT_PERCENT, 0.8);
|
||||
init( AVAILABLE_SPACE_PIVOT_RATIO, 0.6 );
|
||||
init( CPU_PIVOT_RATIO, 0.9 );
|
||||
// In order to make sure GetTeam has enough eligible destination team:
|
||||
ASSERT_GT(AVAILABLE_SPACE_PIVOT_PERCENT + CPU_PIVOT_PERCENT, 1.0 );
|
||||
init( MAX_DEST_CPU_PERCENT, 98.0 );
|
||||
init( CPU_STABLE_INTERVAL, 300.0 );
|
||||
ASSERT_GT(AVAILABLE_SPACE_PIVOT_RATIO + CPU_PIVOT_RATIO, 1.0 );
|
||||
// In simulation, the CPU percent of every storage server is hard-coded as 100.0%. It is difficult to test pivot CPU in normal simulation. TODO: add mock DD Test case for it.
|
||||
init( MAX_DEST_CPU_PERCENT, isSimulated ? 100.0: 98.0 );
|
||||
init( CPU_STABLE_INTERVAL, isSimulated ? 0.0 : 300.0 );
|
||||
init( DD_TEAM_PIVOT_UPDATE_DELAY, 5.0 );
|
||||
|
||||
init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true;
|
||||
init( ENABLE_DD_PHYSICAL_SHARD, false ); // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true; When true, optimization of data move between DCs is disabled
|
||||
|
@ -755,7 +757,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
init( MIN_AVAILABLE_SPACE_RATIO, 0.05 );
|
||||
init( MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER, 0.01 );
|
||||
init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 );
|
||||
init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 );
|
||||
|
||||
init( MAX_TL_SS_VERSION_DIFFERENCE, 1e99 ); // if( randomize && BUGGIFY ) MAX_TL_SS_VERSION_DIFFERENCE = std::max(1.0, 0.25 * VERSIONS_PER_SECOND); // spring starts at half this value //FIXME: this knob causes ratekeeper to clamp on idle cluster in simulation that have a large number of logs
|
||||
init( MAX_TL_SS_VERSION_DIFFERENCE_BATCH, 1e99 );
|
||||
|
|
|
@ -190,14 +190,17 @@ public:
|
|||
// Data distribution
|
||||
// DD won't move shard to teams that has availableSpaceRatio < max(0.05, AllTeamAvailSpaceRatio[pivot]), where
|
||||
// pivot = pivot percent * team count.
|
||||
double AVAILABLE_SPACE_PIVOT_PERCENT;
|
||||
// DD won't move shard to teams that has CPU >= AllTeamCPU[pivot], where pivot = pivot percent *
|
||||
double AVAILABLE_SPACE_PIVOT_RATIO;
|
||||
// DD won't move shard to teams that has CPU > AllTeamCPU[pivot], where pivot = pivot percent *
|
||||
// team count.
|
||||
double CPU_PIVOT_PERCENT;
|
||||
// DD won't move shard to teams that has CPU >= MAX_DEST_CPU_PERCENT
|
||||
double CPU_PIVOT_RATIO;
|
||||
// DD won't move shard to teams that has CPU > MAX_DEST_CPU_PERCENT
|
||||
double MAX_DEST_CPU_PERCENT;
|
||||
// DD only move shard to teams that has CPU < pivot CPU for enough time
|
||||
// DD only move shard to teams that has CPU <= pivot CPU for enough time
|
||||
double CPU_STABLE_INTERVAL;
|
||||
// The constant interval DD update pivot values for team selection. It should be >=
|
||||
// min(STORAGE_METRICS_POLLING_DELAY,DETAILED_METRIC_UPDATE_RATE) otherwise the pivot won't change;
|
||||
double DD_TEAM_PIVOT_UPDATE_DELAY;
|
||||
|
||||
bool SHARD_ENCODE_LOCATION_METADATA; // If true, location metadata will contain shard ID.
|
||||
bool ENABLE_DD_PHYSICAL_SHARD; // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true.
|
||||
|
@ -664,6 +667,7 @@ public:
|
|||
double SMOOTHING_AMOUNT;
|
||||
double SLOW_SMOOTHING_AMOUNT;
|
||||
double METRIC_UPDATE_RATE;
|
||||
// The interval of detailed HealthMetric is pushed to GRV proxies
|
||||
double DETAILED_METRIC_UPDATE_RATE;
|
||||
double LAST_LIMITED_RATIO;
|
||||
double RATEKEEPER_DEFAULT_LIMIT;
|
||||
|
@ -745,7 +749,6 @@ public:
|
|||
double MIN_AVAILABLE_SPACE_RATIO;
|
||||
double MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER;
|
||||
double TARGET_AVAILABLE_SPACE_RATIO;
|
||||
double AVAILABLE_SPACE_UPDATE_DELAY;
|
||||
|
||||
double MAX_TL_SS_VERSION_DIFFERENCE; // spring starts at half this value
|
||||
double MAX_TL_SS_VERSION_DIFFERENCE_BATCH;
|
||||
|
|
|
@ -339,7 +339,10 @@ public:
|
|||
if (randomTeams.empty() && !self->zeroHealthyTeams->get()) {
|
||||
self->bestTeamKeepStuckCount++;
|
||||
if (g_network->isSimulated()) {
|
||||
TraceEvent(SevWarn, "GetTeamReturnEmpty").detail("HealthyTeams", self->healthyTeamCount);
|
||||
TraceEvent(SevWarn, "GetTeamReturnEmpty")
|
||||
.detail("HealthyTeams", self->healthyTeamCount)
|
||||
.detail("PivotCPU", self->pivotCPU)
|
||||
.detail("PivotDiskSpace", self->pivotAvailableSpaceRatio);
|
||||
}
|
||||
} else {
|
||||
self->bestTeamKeepStuckCount = 0;
|
||||
|
@ -374,10 +377,14 @@ public:
|
|||
return Void();
|
||||
}
|
||||
}
|
||||
// if (!bestOption.present()) {
|
||||
// TraceEvent("GetTeamRequest").detail("Request", req.getDesc());
|
||||
// self->traceAllInfo(true);
|
||||
// }
|
||||
if (!bestOption.present()) {
|
||||
TraceEvent("GetTeamRequestDebug")
|
||||
.detail("Request", req.getDesc())
|
||||
.detail("HealthyTeams", self->healthyTeamCount)
|
||||
.detail("PivotCPU", self->pivotCPU)
|
||||
.detail("PivotDiskSpace", self->pivotAvailableSpaceRatio);
|
||||
self->traceAllInfo(true);
|
||||
}
|
||||
|
||||
req.reply.send(std::make_pair(bestOption, foundSrc));
|
||||
return Void();
|
||||
|
@ -3246,7 +3253,7 @@ public:
|
|||
}; // class DDTeamCollectionImpl
|
||||
|
||||
void DDTeamCollection::updateTeamPivotValues() {
|
||||
if (now() - lastPivotValuesUpdate > SERVER_KNOBS->AVAILABLE_SPACE_UPDATE_DELAY) {
|
||||
if (now() - lastPivotValuesUpdate > SERVER_KNOBS->DD_TEAM_PIVOT_UPDATE_DELAY) {
|
||||
lastPivotValuesUpdate = now();
|
||||
std::vector<double> teamAvailableSpace;
|
||||
std::vector<std::pair<double, int>> teamAverageCPU_index;
|
||||
|
@ -3256,11 +3263,12 @@ void DDTeamCollection::updateTeamPivotValues() {
|
|||
if (teams[i]->isHealthy()) {
|
||||
teamAvailableSpace.push_back(teams[i]->getMinAvailableSpaceRatio());
|
||||
teamAverageCPU_index.emplace_back(teams[i]->getAverageCPU(), i);
|
||||
minTeamAvgCPU = std::min(minTeamAvgCPU, teamAverageCPU_index.back().first);
|
||||
}
|
||||
}
|
||||
|
||||
size_t pivot = teamAvailableSpace.size() * std::min(1.0, SERVER_KNOBS->AVAILABLE_SPACE_PIVOT_PERCENT);
|
||||
size_t cpuPivotIndex = teamAverageCPU_index.size() * std::min(1.0, SERVER_KNOBS->CPU_PIVOT_PERCENT);
|
||||
size_t pivot = teamAvailableSpace.size() * std::min(1.0, SERVER_KNOBS->AVAILABLE_SPACE_PIVOT_RATIO);
|
||||
size_t cpuPivotIndex = teamAverageCPU_index.size() * std::min(1.0, SERVER_KNOBS->CPU_PIVOT_RATIO);
|
||||
if (teamAvailableSpace.size() > 1) {
|
||||
std::nth_element(teamAvailableSpace.begin(), teamAvailableSpace.begin() + pivot, teamAvailableSpace.end());
|
||||
pivotAvailableSpaceRatio =
|
||||
|
@ -3269,15 +3277,11 @@ void DDTeamCollection::updateTeamPivotValues() {
|
|||
|
||||
std::nth_element(
|
||||
teamAverageCPU_index.begin(), teamAverageCPU_index.begin() + cpuPivotIndex, teamAverageCPU_index.end());
|
||||
pivotCPU = std::min(SERVER_KNOBS->MAX_DEST_CPU_PERCENT, teamAverageCPU_index[cpuPivotIndex].first);
|
||||
// set high CPU for teams >= pivot CPU
|
||||
for (int i = cpuPivotIndex; i < teamAverageCPU_index.size(); ++i) {
|
||||
pivotCPU = teamAverageCPU_index[cpuPivotIndex].first;
|
||||
// set high CPU for teams > pivot CPU
|
||||
for (int i = cpuPivotIndex + 1; i < teamAverageCPU_index.size(); ++i) {
|
||||
teams[teamAverageCPU_index[i].second]->setLastHighCPUTime(lastPivotValuesUpdate);
|
||||
}
|
||||
for (int i = cpuPivotIndex - 1; i >= 0 && teamAverageCPU_index[i].first >= pivotCPU; --i) {
|
||||
teams[teamAverageCPU_index[i].second]->setLastHighCPUTime(lastPivotValuesUpdate);
|
||||
}
|
||||
|
||||
} else {
|
||||
pivotAvailableSpaceRatio = SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO;
|
||||
pivotCPU = SERVER_KNOBS->MAX_DEST_CPU_PERCENT;
|
||||
|
@ -3290,6 +3294,13 @@ void DDTeamCollection::updateTeamPivotValues() {
|
|||
.detail("Primary", primary);
|
||||
printDetailedTeamsInfo.trigger();
|
||||
}
|
||||
|
||||
if (pivotCPU > SERVER_KNOBS->MAX_DEST_CPU_PERCENT) {
|
||||
TraceEvent(SevWarnAlways, "DDTeamPivotCPUTooHigh", distributorId)
|
||||
.detail("PivotCPU", pivotCPU)
|
||||
.detail("MinTeamAvgCPU", minTeamAvgCPU)
|
||||
.detail("Primary", primary);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5768,7 +5779,7 @@ public:
|
|||
state GetTeamRequest req(TeamSelect::WANT_COMPLETE_SRCS,
|
||||
PreferLowerDiskUtil::True,
|
||||
TeamMustHaveShards::False,
|
||||
PreferLowerReadUtil::True);
|
||||
PreferLowerReadUtil::False);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
@ -5822,7 +5833,7 @@ public:
|
|||
state GetTeamRequest req(TeamSelect::WANT_COMPLETE_SRCS,
|
||||
PreferLowerDiskUtil::True,
|
||||
TeamMustHaveShards::False,
|
||||
PreferLowerReadUtil::True);
|
||||
PreferLowerReadUtil::False);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
@ -5874,7 +5885,7 @@ public:
|
|||
state GetTeamRequest req(TeamSelect::WANT_TRUE_BEST,
|
||||
PreferLowerDiskUtil::True,
|
||||
TeamMustHaveShards::False,
|
||||
PreferLowerReadUtil::True);
|
||||
PreferLowerReadUtil::False);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
@ -5978,7 +5989,7 @@ public:
|
|||
state GetTeamRequest req(TeamSelect::WANT_TRUE_BEST,
|
||||
PreferLowerDiskUtil::True,
|
||||
TeamMustHaveShards::False,
|
||||
PreferLowerReadUtil::True);
|
||||
PreferLowerReadUtil::False);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
@ -6036,7 +6047,7 @@ public:
|
|||
state GetTeamRequest req(TeamSelect::WANT_TRUE_BEST,
|
||||
PreferLowerDiskUtil::True,
|
||||
TeamMustHaveShards::False,
|
||||
PreferLowerReadUtil::True);
|
||||
PreferLowerReadUtil::False);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
@ -6099,13 +6110,15 @@ public:
|
|||
std::set<UID> expectedServers{ UID(4, 0) };
|
||||
std::set<UID> expectedServersHigh{ UID(5, 0) };
|
||||
|
||||
ASSERT(resTeam.present() && resTeamHigh.present());
|
||||
ASSERT(resTeam.present());
|
||||
ASSERT(resTeamHigh.present());
|
||||
auto servers = resTeam.get()->getServerIDs(), serversHigh = resTeamHigh.get()->getServerIDs();
|
||||
const std::set<UID> selectedServers(servers.begin(), servers.end()),
|
||||
selectedServersHigh(serversHigh.begin(), serversHigh.end());
|
||||
// for (auto id : selectedServers)
|
||||
// std::cout << id.toString() << std::endl;
|
||||
ASSERT(expectedServers == selectedServers && expectedServersHigh == selectedServersHigh);
|
||||
ASSERT(expectedServers == selectedServers);
|
||||
ASSERT(expectedServersHigh == selectedServersHigh);
|
||||
|
||||
resTeam.get()->addReadInFlightToTeam(50);
|
||||
req.reply.reset();
|
||||
|
@ -6152,7 +6165,7 @@ public:
|
|||
state GetTeamRequest req(TeamSelect::WANT_TRUE_BEST,
|
||||
PreferLowerDiskUtil::True,
|
||||
TeamMustHaveShards::False,
|
||||
PreferLowerReadUtil::True);
|
||||
PreferLowerReadUtil::False);
|
||||
req.completeSources = completeSources;
|
||||
|
||||
wait(collection->getTeam(req));
|
||||
|
@ -6167,6 +6180,8 @@ public:
|
|||
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR static Future<Void> GetTeam_CpuUtilSelection() { return Void(); }
|
||||
};
|
||||
|
||||
TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") {
|
||||
|
@ -6228,6 +6243,7 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") {
|
|||
wait(DDTeamCollectionUnitTest::GetTeam_ServerUtilizationNearCutoff());
|
||||
return Void();
|
||||
}
|
||||
|
||||
TEST_CASE("/DataDistribution/GetTeam/TrueBestLeastReadBandwidth") {
|
||||
wait(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth());
|
||||
return Void();
|
||||
|
|
|
@ -423,8 +423,8 @@ double TCTeamInfo::getAverageCPU() const {
|
|||
size++;
|
||||
}
|
||||
}
|
||||
// If every storage server hasn't gotten their CPU updated, we assume they are too busy to respond so return 101;
|
||||
return size == 0 ? 101.0 : sum / size;
|
||||
// If every storage server hasn't gotten their CPU updated, we assume they are too busy to respond so return 100.0;
|
||||
return size == 0 ? 100.0 : sum / size;
|
||||
}
|
||||
|
||||
int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const {
|
||||
|
|
|
@ -281,7 +281,8 @@ protected:
|
|||
Future<bool> clearHealthyZoneFuture;
|
||||
double pivotAvailableSpaceRatio;
|
||||
double lastPivotValuesUpdate;
|
||||
double pivotCPU;
|
||||
double pivotCPU = 0.0;
|
||||
double minTeamAvgCPU = 101.0;
|
||||
|
||||
int lowestUtilizationTeam;
|
||||
int highestUtilizationTeam;
|
||||
|
|
|
@ -225,7 +225,8 @@ public:
|
|||
void setLastHighCPUTime(double time) override { lastHighCPUTime = time; }
|
||||
|
||||
bool hasLowCpuFor(double cpuThreshold, double duration) const override {
|
||||
return getAverageCPU() < cpuThreshold && now() - lastHighCPUTime >= duration;
|
||||
return getAverageCPU() <= std::min(cpuThreshold, SERVER_KNOBS->MAX_DEST_CPU_PERCENT) &&
|
||||
now() - lastHighCPUTime >= duration;
|
||||
}
|
||||
|
||||
int64_t getReadInFlightToTeam() const override;
|
||||
|
|
Loading…
Reference in New Issue