Converted the hard-coded durability lag rate controls into server knobs
This commit is contained in:
parent
c5fb5494f5
commit
1a18c859c7
|
@ -230,7 +230,8 @@
|
|||
"storage_server_min_free_space",
|
||||
"storage_server_min_free_space_ratio",
|
||||
"log_server_min_free_space",
|
||||
"log_server_min_free_space_ratio"
|
||||
"log_server_min_free_space_ratio",
|
||||
"storage_server_durability_lag"
|
||||
]
|
||||
},
|
||||
"description":"The database is not being saturated by the workload."
|
||||
|
@ -249,7 +250,8 @@
|
|||
"storage_server_min_free_space",
|
||||
"storage_server_min_free_space_ratio",
|
||||
"log_server_min_free_space",
|
||||
"log_server_min_free_space_ratio"
|
||||
"log_server_min_free_space_ratio",
|
||||
"storage_server_durability_lag"
|
||||
]
|
||||
},
|
||||
"description":"The database is not being saturated by the workload."
|
||||
|
|
|
@ -128,4 +128,5 @@ min_free_space Running out of space (approaching 100MB limi
|
|||
min_free_space_ratio Running out of space (approaching 5% limit).
|
||||
log_server_min_free_space Log server running out of space (approaching 100MB limit).
|
||||
log_server_min_free_space_ratio Log server running out of space (approaching 5% limit).
|
||||
storage_server_durability_lag Storage server durable version falling behind.
|
||||
=================================== ====================================================
|
||||
|
|
|
@ -250,7 +250,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
|
|||
"storage_server_min_free_space",
|
||||
"storage_server_min_free_space_ratio",
|
||||
"log_server_min_free_space",
|
||||
"log_server_min_free_space_ratio"
|
||||
"log_server_min_free_space_ratio",
|
||||
"storage_server_durability_lag"
|
||||
]
|
||||
},
|
||||
"description":"The database is not being saturated by the workload."
|
||||
|
@ -269,7 +270,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
|
|||
"storage_server_min_free_space",
|
||||
"storage_server_min_free_space_ratio",
|
||||
"log_server_min_free_space",
|
||||
"log_server_min_free_space_ratio"
|
||||
"log_server_min_free_space_ratio",
|
||||
"storage_server_durability_lag"
|
||||
]
|
||||
},
|
||||
"description":"The database is not being saturated by the workload."
|
||||
|
|
|
@ -396,6 +396,13 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( MAX_TL_SS_VERSION_DIFFERENCE_BATCH, 1e99 );
|
||||
init( MAX_MACHINES_FALLING_BEHIND, 1 );
|
||||
|
||||
init( MAX_TPS_HISTORY_SAMPLES, 600 );
|
||||
init( NEEDED_TPS_HISTORY_SAMPLES, 200 );
|
||||
init( TARGET_DURABILITY_LAG_VERSIONS, 200e6 );
|
||||
init( TARGET_DURABILITY_LAG_VERSIONS_BATCH, 100e6 );
|
||||
init( INITIAL_DURABILITY_LAG_MULTIPLIER, 1.02 );
|
||||
init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 );
|
||||
|
||||
//Storage Metrics
|
||||
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
|
||||
init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz!
|
||||
|
|
|
@ -332,6 +332,13 @@ public:
|
|||
double MAX_TL_SS_VERSION_DIFFERENCE_BATCH;
|
||||
int MAX_MACHINES_FALLING_BEHIND;
|
||||
|
||||
int MAX_TPS_HISTORY_SAMPLES;
|
||||
int NEEDED_TPS_HISTORY_SAMPLES;
|
||||
int64_t TARGET_DURABILITY_LAG_VERSIONS;
|
||||
int64_t TARGET_DURABILITY_LAG_VERSIONS_BATCH;
|
||||
double INITIAL_DURABILITY_LAG_MULTIPLIER;
|
||||
double DURABILITY_LAG_REDUCTION_RATE;
|
||||
|
||||
//Storage Metrics
|
||||
double STORAGE_METRICS_AVERAGE_INTERVAL;
|
||||
double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
|
||||
|
|
|
@ -75,7 +75,7 @@ const char* limitReasonDesc[] = {
|
|||
"Storage server running out of space (approaching 5% limit).",
|
||||
"Log server running out of space (approaching 100MB limit).",
|
||||
"Log server running out of space (approaching 5% limit).",
|
||||
"Storage server is overwhelmed by read workload"
|
||||
"Storage server durable version falling behind."
|
||||
};
|
||||
|
||||
static_assert(sizeof(limitReasonDesc) / sizeof(limitReasonDesc[0]) == limitReason_t_end, "limitReasonDesc table size");
|
||||
|
@ -128,10 +128,11 @@ struct RatekeeperLimits {
|
|||
int64_t logTargetBytes;
|
||||
int64_t logSpringBytes;
|
||||
double maxVersionDifference;
|
||||
int64_t durabilityLagTargetVersions;
|
||||
|
||||
std::string context;
|
||||
|
||||
RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference) :
|
||||
RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference, int64_t durabilityLagTargetVersions) :
|
||||
tpsLimit(std::numeric_limits<double>::infinity()),
|
||||
tpsLimitMetric(StringRef("Ratekeeper.TPSLimit" + context)),
|
||||
reasonMetric(StringRef("Ratekeeper.Reason" + context)),
|
||||
|
@ -140,6 +141,7 @@ struct RatekeeperLimits {
|
|||
logTargetBytes(logTargetBytes),
|
||||
logSpringBytes(logSpringBytes),
|
||||
maxVersionDifference(maxVersionDifference),
|
||||
durabilityLagTargetVersions(durabilityLagTargetVersions),
|
||||
context(context)
|
||||
{}
|
||||
};
|
||||
|
@ -177,8 +179,8 @@ struct RatekeeperData {
|
|||
RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
|
||||
actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")),
|
||||
lastWarning(0),
|
||||
normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE),
|
||||
batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH),
|
||||
normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS),
|
||||
batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH),
|
||||
durabilityLagLimit(std::numeric_limits<double>::infinity()), lastDurabilityLag(0)
|
||||
{}
|
||||
};
|
||||
|
@ -350,7 +352,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
|
|||
// SOMEDAY: Remove the max( 1.0, ... ) since the below calculations _should_ be able to recover back up from this value
|
||||
actualTps = std::max( std::max( 1.0, actualTps ), self->smoothTotalDurableBytes.smoothRate() / CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT );
|
||||
|
||||
if(self->actualTpsHistory.size() > 600) {
|
||||
if(self->actualTpsHistory.size() > SERVER_KNOBS->MAX_TPS_HISTORY_SAMPLES) {
|
||||
self->actualTpsHistory.pop_front();
|
||||
}
|
||||
self->actualTpsHistory.push_back(actualTps);
|
||||
|
@ -495,16 +497,16 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) {
|
|||
}
|
||||
|
||||
limitingStorageDurabilityLagStorageServer = -1*ss->first;
|
||||
if(limitingStorageDurabilityLagStorageServer > 200e6) {
|
||||
if(limitingStorageDurabilityLagStorageServer > limits->durabilityLagTargetVersions && self->actualTpsHistory.size() > SERVER_KNOBS->NEEDED_TPS_HISTORY_SAMPLES) {
|
||||
if(self->durabilityLagLimit == std::numeric_limits<double>::infinity()) {
|
||||
double maxTps = 0;
|
||||
for(int i = 0; i < self->actualTpsHistory.size(); i++) {
|
||||
maxTps = std::max(maxTps, self->actualTpsHistory[i]);
|
||||
}
|
||||
self->durabilityLagLimit = 1.02*maxTps;
|
||||
self->durabilityLagLimit = SERVER_KNOBS->INITIAL_DURABILITY_LAG_MULTIPLIER*maxTps;
|
||||
}
|
||||
if( limitingStorageDurabilityLagStorageServer > self->lastDurabilityLag ) {
|
||||
self->durabilityLagLimit = 0.9999*self->durabilityLagLimit;
|
||||
self->durabilityLagLimit = SERVER_KNOBS->DURABILITY_LAG_REDUCTION_RATE*self->durabilityLagLimit;
|
||||
}
|
||||
if(self->durabilityLagLimit < limits->tpsLimit) {
|
||||
limits->tpsLimit = self->durabilityLagLimit;
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
testTitle=UnitTests
|
||||
testName=UnitTests
|
||||
startDelay=0
|
||||
useDB=false
|
||||
maxTestCases=0
|
||||
testsMatching=!/redwood/correctness/btree
|
Loading…
Reference in New Issue