From 1a18c859c768de90e05673e49a2987e50981da18 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 11 Jul 2019 18:34:19 -0700 Subject: [PATCH] knobified the durability lag rate controls --- .../source/mr-status-json-schemas.rst.inc | 6 ++++-- documentation/sphinx/source/mr-status.rst | 1 + fdbclient/Schemas.cpp | 6 ++++-- fdbserver/Knobs.cpp | 7 +++++++ fdbserver/Knobs.h | 7 +++++++ fdbserver/Ratekeeper.actor.cpp | 18 ++++++++++-------- tests/fast/RedwoodCorrectnessBTree.txt | 6 ------ 7 files changed, 33 insertions(+), 18 deletions(-) delete mode 100644 tests/fast/RedwoodCorrectnessBTree.txt diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index bc6aac6411..f4fc5ded96 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -230,7 +230,8 @@ "storage_server_min_free_space", "storage_server_min_free_space_ratio", "log_server_min_free_space", - "log_server_min_free_space_ratio" + "log_server_min_free_space_ratio", + "storage_server_durability_lag" ] }, "description":"The database is not being saturated by the workload." @@ -249,7 +250,8 @@ "storage_server_min_free_space", "storage_server_min_free_space_ratio", "log_server_min_free_space", - "log_server_min_free_space_ratio" + "log_server_min_free_space_ratio", + "storage_server_durability_lag" ] }, "description":"The database is not being saturated by the workload." diff --git a/documentation/sphinx/source/mr-status.rst b/documentation/sphinx/source/mr-status.rst index dfc063e911..9e11906e71 100644 --- a/documentation/sphinx/source/mr-status.rst +++ b/documentation/sphinx/source/mr-status.rst @@ -128,4 +128,5 @@ min_free_space Running out of space (approaching 100MB limi min_free_space_ratio Running out of space (approaching 5% limit). log_server_min_free_space Log server running out of space (approaching 100MB limit). 
log_server_min_free_space_ratio Log server running out of space (approaching 5% limit). +storage_server_durability_lag Storage server durable version falling behind. =================================== ==================================================== diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 658bf86527..8080eb2d2c 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -250,7 +250,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "storage_server_min_free_space", "storage_server_min_free_space_ratio", "log_server_min_free_space", - "log_server_min_free_space_ratio" + "log_server_min_free_space_ratio", + "storage_server_durability_lag" ] }, "description":"The database is not being saturated by the workload." @@ -269,7 +270,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "storage_server_min_free_space", "storage_server_min_free_space_ratio", "log_server_min_free_space", - "log_server_min_free_space_ratio" + "log_server_min_free_space_ratio", + "storage_server_durability_lag" ] }, "description":"The database is not being saturated by the workload." diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 7a570281ca..ee3fb18823 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -396,6 +396,13 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_TL_SS_VERSION_DIFFERENCE_BATCH, 1e99 ); init( MAX_MACHINES_FALLING_BEHIND, 1 ); + init( MAX_TPS_HISTORY_SAMPLES, 600 ); + init( NEEDED_TPS_HISTORY_SAMPLES, 200 ); + init( TARGET_DURABILITY_LAG_VERSIONS, 200e6 ); + init( TARGET_DURABILITY_LAG_VERSIONS_BATCH, 100e6 ); + init( INITIAL_DURABILITY_LAG_MULTIPLIER, 1.02 ); + init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 ); + //Storage Metrics init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 ); init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz! 
diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 37c3916ae5..065c5ab4e5 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -332,6 +332,13 @@ public: double MAX_TL_SS_VERSION_DIFFERENCE_BATCH; int MAX_MACHINES_FALLING_BEHIND; + int MAX_TPS_HISTORY_SAMPLES; + int NEEDED_TPS_HISTORY_SAMPLES; + int64_t TARGET_DURABILITY_LAG_VERSIONS; + int64_t TARGET_DURABILITY_LAG_VERSIONS_BATCH; + double INITIAL_DURABILITY_LAG_MULTIPLIER; + double DURABILITY_LAG_REDUCTION_RATE; + //Storage Metrics double STORAGE_METRICS_AVERAGE_INTERVAL; double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 0f665aff7f..8ea2463651 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -75,7 +75,7 @@ const char* limitReasonDesc[] = { "Storage server running out of space (approaching 5% limit).", "Log server running out of space (approaching 100MB limit).", "Log server running out of space (approaching 5% limit).", - "Storage server is overwhelmed by read workload" + "Storage server durable version falling behind." 
}; static_assert(sizeof(limitReasonDesc) / sizeof(limitReasonDesc[0]) == limitReason_t_end, "limitReasonDesc table size"); @@ -128,10 +128,11 @@ struct RatekeeperLimits { int64_t logTargetBytes; int64_t logSpringBytes; double maxVersionDifference; + int64_t durabilityLagTargetVersions; std::string context; - RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference) : + RatekeeperLimits(std::string context, int64_t storageTargetBytes, int64_t storageSpringBytes, int64_t logTargetBytes, int64_t logSpringBytes, double maxVersionDifference, int64_t durabilityLagTargetVersions) : tpsLimit(std::numeric_limits<double>::infinity()), tpsLimitMetric(StringRef("Ratekeeper.TPSLimit" + context)), reasonMetric(StringRef("Ratekeeper.Reason" + context)), @@ -140,6 +141,7 @@ struct RatekeeperLimits { logTargetBytes(logTargetBytes), logSpringBytes(logSpringBytes), maxVersionDifference(maxVersionDifference), + durabilityLagTargetVersions(durabilityLagTargetVersions), context(context) {} }; @@ -177,8 +179,8 @@ struct RatekeeperData { RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")), lastWarning(0), - normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE), - batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH), + normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, 
SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS), + batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH), durabilityLagLimit(std::numeric_limits<double>::infinity()), lastDurabilityLag(0) {} }; @@ -350,7 +352,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { // SOMEDAY: Remove the max( 1.0, ... ) since the below calculations _should_ be able to recover back up from this value actualTps = std::max( std::max( 1.0, actualTps ), self->smoothTotalDurableBytes.smoothRate() / CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT ); - if(self->actualTpsHistory.size() > 600) { + if(self->actualTpsHistory.size() > SERVER_KNOBS->MAX_TPS_HISTORY_SAMPLES) { self->actualTpsHistory.pop_front(); } self->actualTpsHistory.push_back(actualTps); @@ -495,16 +497,16 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { } limitingStorageDurabilityLagStorageServer = -1*ss->first; - if(limitingStorageDurabilityLagStorageServer > 200e6) { + if(limitingStorageDurabilityLagStorageServer > limits->durabilityLagTargetVersions && self->actualTpsHistory.size() > SERVER_KNOBS->NEEDED_TPS_HISTORY_SAMPLES) { if(self->durabilityLagLimit == std::numeric_limits<double>::infinity()) { double maxTps = 0; for(int i = 0; i < self->actualTpsHistory.size(); i++) { maxTps = std::max(maxTps, self->actualTpsHistory[i]); } - self->durabilityLagLimit = 1.02*maxTps; + self->durabilityLagLimit = SERVER_KNOBS->INITIAL_DURABILITY_LAG_MULTIPLIER*maxTps; } if( limitingStorageDurabilityLagStorageServer > self->lastDurabilityLag ) { - self->durabilityLagLimit = 0.9999*self->durabilityLagLimit; + 
self->durabilityLagLimit = SERVER_KNOBS->DURABILITY_LAG_REDUCTION_RATE*self->durabilityLagLimit; } if(self->durabilityLagLimit < limits->tpsLimit) { limits->tpsLimit = self->durabilityLagLimit; diff --git a/tests/fast/RedwoodCorrectnessBTree.txt b/tests/fast/RedwoodCorrectnessBTree.txt deleted file mode 100644 index 3bde204032..0000000000 --- a/tests/fast/RedwoodCorrectnessBTree.txt +++ /dev/null @@ -1,6 +0,0 @@ -testTitle=UnitTests -testName=UnitTests -startDelay=0 -useDB=false -maxTestCases=0 -testsMatching=!/redwood/correctness/btree