knobified the durability lag rate controls

This commit is contained in:
Evan Tschannen 2019-07-11 18:34:19 -07:00 committed by Alex Miller
parent c5fb5494f5
commit 1a18c859c7
7 changed files with 33 additions and 18 deletions

View File

@ -230,7 +230,8 @@
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."
@ -249,7 +250,8 @@
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."

View File

@ -128,4 +128,5 @@ min_free_space Running out of space (approaching 100MB limi
min_free_space_ratio Running out of space (approaching 5% limit).
log_server_min_free_space Log server running out of space (approaching 100MB limit).
log_server_min_free_space_ratio Log server running out of space (approaching 5% limit).
storage_server_durability_lag Storage server durable version falling behind.
=================================== ====================================================

View File

@ -250,7 +250,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."
@ -269,7 +270,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"storage_server_min_free_space",
"storage_server_min_free_space_ratio",
"log_server_min_free_space",
"log_server_min_free_space_ratio"
"log_server_min_free_space_ratio",
"storage_server_durability_lag"
]
},
"description":"The database is not being saturated by the workload."

View File

@ -396,6 +396,13 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MAX_TL_SS_VERSION_DIFFERENCE_BATCH, 1e99 );
init( MAX_MACHINES_FALLING_BEHIND, 1 );
init( MAX_TPS_HISTORY_SAMPLES, 600 );
init( NEEDED_TPS_HISTORY_SAMPLES, 200 );
init( TARGET_DURABILITY_LAG_VERSIONS, 200e6 );
init( TARGET_DURABILITY_LAG_VERSIONS_BATCH, 100e6 );
init( INITIAL_DURABILITY_LAG_MULTIPLIER, 1.02 );
init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 );
//Storage Metrics
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz!

View File

@ -332,6 +332,13 @@ public:
double MAX_TL_SS_VERSION_DIFFERENCE_BATCH;
int MAX_MACHINES_FALLING_BEHIND;
int MAX_TPS_HISTORY_SAMPLES;
int NEEDED_TPS_HISTORY_SAMPLES;
int64_t TARGET_DURABILITY_LAG_VERSIONS;
int64_t TARGET_DURABILITY_LAG_VERSIONS_BATCH;
double INITIAL_DURABILITY_LAG_MULTIPLIER;
double DURABILITY_LAG_REDUCTION_RATE;
//Storage Metrics
double STORAGE_METRICS_AVERAGE_INTERVAL;
double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;

View File

@ -75,7 +75,7 @@ const char* limitReasonDesc[] = {
"Storage server running out of space (approaching 5% limit).",
"Log server running out of space (approaching 100MB limit).",
"Log server running out of space (approaching 5% limit).",
"Storage server is overwhelmed by read workload"
"Storage server durable version falling behind."
};
static_assert(sizeof(limitReasonDesc) / sizeof(limitReasonDesc[0]) == limitReason_t_end, "limitReasonDesc table size");
@ -128,10 +128,11 @@ struct RatekeeperLimits {
int64_t logTargetBytes;
int64_t logSpringBytes;
double maxVersionDifference;
int64_t durabilityLagTargetVersions;
std::string context;
RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference) :
RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference, int64_t durabilityLagTargetVersions) :
tpsLimit(std::numeric_limits<double>::infinity()),
tpsLimitMetric(StringRef("Ratekeeper.TPSLimit" + context)),
reasonMetric(StringRef("Ratekeeper.Reason" + context)),
@ -140,6 +141,7 @@ struct RatekeeperLimits {
logTargetBytes(logTargetBytes),
logSpringBytes(logSpringBytes),
maxVersionDifference(maxVersionDifference),
durabilityLagTargetVersions(durabilityLagTargetVersions),
context(context)
{}
};
@ -177,8 +179,8 @@ struct RatekeeperData {
RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")),
lastWarning(0),
normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE),
batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH),
normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS),
batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH),
durabilityLagLimit(std::numeric_limits<double>::infinity()), lastDurabilityLag(0)
{}
};
@ -350,7 +352,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
// SOMEDAY: Remove the max( 1.0, ... ) since the below calculations _should_ be able to recover back up from this value
actualTps = std::max( std::max( 1.0, actualTps ), self->smoothTotalDurableBytes.smoothRate() / CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT );
if(self->actualTpsHistory.size() > 600) {
if(self->actualTpsHistory.size() > SERVER_KNOBS->MAX_TPS_HISTORY_SAMPLES) {
self->actualTpsHistory.pop_front();
}
self->actualTpsHistory.push_back(actualTps);
@ -495,16 +497,16 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
}
limitingStorageDurabilityLagStorageServer = -1*ss->first;
if(limitingStorageDurabilityLagStorageServer > 200e6) {
if(limitingStorageDurabilityLagStorageServer > limits->durabilityLagTargetVersions && self->actualTpsHistory.size() > SERVER_KNOBS->NEEDED_TPS_HISTORY_SAMPLES) {
if(self->durabilityLagLimit == std::numeric_limits<double>::infinity()) {
double maxTps = 0;
for(int i = 0; i < self->actualTpsHistory.size(); i++) {
maxTps = std::max(maxTps, self->actualTpsHistory[i]);
}
self->durabilityLagLimit = 1.02*maxTps;
self->durabilityLagLimit = SERVER_KNOBS->INITIAL_DURABILITY_LAG_MULTIPLIER*maxTps;
}
if( limitingStorageDurabilityLagStorageServer > self->lastDurabilityLag ) {
self->durabilityLagLimit = 0.9999*self->durabilityLagLimit;
self->durabilityLagLimit = SERVER_KNOBS->DURABILITY_LAG_REDUCTION_RATE*self->durabilityLagLimit;
}
if(self->durabilityLagLimit < limits->tpsLimit) {
limits->tpsLimit = self->durabilityLagLimit;

View File

@ -1,6 +0,0 @@
testTitle=UnitTests
testName=UnitTests
startDelay=0
useDB=false
maxTestCases=0
testsMatching=!/redwood/correctness/btree