From 406bcebdc49a468ef735976a48e0be76a9ffa4ce Mon Sep 17 00:00:00 2001 From: Balachandar Namasivayam Date: Wed, 17 Jul 2019 14:47:08 -0700 Subject: [PATCH] Ratekeeper to throttle tpsLimit to 1 if it is not able to fetch storage server list for some configurable amount of time. --- fdbcli/fdbcli.actor.cpp | 3 +++ fdbclient/Schemas.cpp | 6 ++++-- fdbserver/Knobs.cpp | 3 ++- fdbserver/Knobs.h | 4 +++- fdbserver/Ratekeeper.actor.cpp | 24 +++++++++++++++++++----- 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 9bbd0977a4..5855f61060 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -626,6 +626,9 @@ std::string getDateInfoString(StatusObjectReader statusObj, std::string key) { } std::string getProcessAddressByServerID(StatusObjectReader processesMap, std::string serverID) { + if(serverID == "") + return "unknown"; + for (auto proc : processesMap.obj()){ try { StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 93441be518..35b2e5b72b 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -252,7 +252,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "storage_server_min_free_space_ratio", "log_server_min_free_space", "log_server_min_free_space_ratio", - "storage_server_durability_lag" + "storage_server_durability_lag", + "storage_server_list_fetch_failed" ] }, "description":"The database is not being saturated by the workload." @@ -272,7 +273,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "storage_server_min_free_space_ratio", "log_server_min_free_space", "log_server_min_free_space_ratio", - "storage_server_durability_lag" + "storage_server_durability_lag", + "storage_server_list_fetch_failed" ] }, "description":"The database is not being saturated by the workload." diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8b1e4e47e9..e2563ef833 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -404,7 +404,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( INITIAL_DURABILITY_LAG_MULTIPLIER, 1.02 ); init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 ); init( DURABILITY_LAG_INCREASE_RATE, 1.001 ); - + init( STORAGE_SERVER_LIST_FETCH_TIMEOUT, 20.0 ); + //Storage Metrics init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 ); init( STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, 1000.0 / STORAGE_METRICS_AVERAGE_INTERVAL ); // milliHz! diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 2810360341..ac8d7b0265 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -340,7 +340,9 @@ public: double INITIAL_DURABILITY_LAG_MULTIPLIER; double DURABILITY_LAG_REDUCTION_RATE; double DURABILITY_LAG_INCREASE_RATE; - + + double STORAGE_SERVER_LIST_FETCH_TIMEOUT; + //Storage Metrics double STORAGE_METRICS_AVERAGE_INTERVAL; double STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 22752d7c08..45c496ad6e 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -42,6 +42,7 @@ enum limitReason_t { log_server_min_free_space, log_server_min_free_space_ratio, storage_server_durability_lag, + storage_server_list_fetch_failed, limitReason_t_end }; @@ -58,7 +59,8 @@ const char* limitReasonName[] = { "storage_server_min_free_space_ratio", "log_server_min_free_space", "log_server_min_free_space_ratio", - "storage_server_durability_lag" + "storage_server_durability_lag", + "storage_server_list_fetch_failed" }; static_assert(sizeof(limitReasonName) / sizeof(limitReasonName[0]) == limitReason_t_end, "limitReasonDesc table size"); @@ -75,7 +77,8 @@ const char* limitReasonDesc[] = { "Storage server running out of space (approaching 5% limit).", "Log server running out of space (approaching 100MB limit).", "Log server running out of space (approaching 5% limit).", - "Storage server durable version falling behind." + "Storage server durable version falling behind.", + "Unable to fetch storage server list." }; static_assert(sizeof(limitReasonDesc) / sizeof(limitReasonDesc[0]) == limitReason_t_end, "limitReasonDesc table size"); @@ -173,6 +176,7 @@ struct RatekeeperData { Int64MetricHandle actualTpsMetric; double lastWarning; + double lastSSListFetchedTimestamp; RatekeeperLimits normalLimits; RatekeeperLimits batchLimits; @@ -181,7 +185,7 @@ struct RatekeeperData { RatekeeperData() : smoothReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothBatchReleasedTransactions(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT), actualTpsMetric(LiteralStringRef("Ratekeeper.ActualTPS")), - lastWarning(0), + lastWarning(0), lastSSListFetchedTimestamp(now()), normalLimits("", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER, SERVER_KNOBS->TARGET_BYTES_PER_TLOG, SERVER_KNOBS->SPRING_BYTES_TLOG, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS), batchLimits("Batch", SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER_BATCH, SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER_BATCH, SERVER_KNOBS->TARGET_BYTES_PER_TLOG_BATCH, SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH) {} @@ -307,6 +311,7 @@ ACTOR Future trackEachStorageServer( } ACTOR Future monitorServerListChange( + RatekeeperData* self, Reference> dbInfo, PromiseStream< std::pair> > serverChanges) { state Database db = openDBOnServer(dbInfo, TaskPriority::Ratekeeper, true, true); @@ -315,7 +320,9 @@ ACTOR Future monitorServerListChange( loop { try { + tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE ); vector> results = wait(getServerListAndProcessClasses(&tr)); + self->lastSSListFetchedTimestamp = now(); std::map newServers; for (int i = 0; i < results.size(); i++) { @@ -646,6 +653,13 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { if (s.value.valid) totalDiskUsageBytes += s.value.lastReply.storageBytes.used; + if (now() - self->lastSSListFetchedTimestamp > SERVER_KNOBS->STORAGE_SERVER_LIST_FETCH_TIMEOUT) { + limits->tpsLimit = 1; + limitReason = limitReason_t::storage_server_list_fetch_failed; + reasonID = UID(); + TraceEvent(SevWarnAlways, "RkSSListFetchTimeout").suppressFor(1.0); + } + limits->tpsLimitMetric = std::min(limits->tpsLimit, 1e6); limits->reasonMetric = limitReason; @@ -654,7 +668,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { TraceEvent(name.c_str()) .detail("TPSLimit", limits->tpsLimit) .detail("Reason", limitReason) - .detail("ReasonServerID", reasonID) + .detail("ReasonServerID", reasonID==UID() ? std::string() : Traceable::toString(reasonID)) .detail("ReleasedTPS", self->smoothReleasedTransactions.smoothRate()) .detail("ReleasedBatchTPS", self->smoothBatchReleasedTransactions.smoothRate()) .detail("TPSBasis", actualTps) @@ -715,7 +729,7 @@ ACTOR Future ratekeeper(RatekeeperInterface rkInterf, Reference> > serverChanges; - self.addActor.send( monitorServerListChange(dbInfo, serverChanges) ); + self.addActor.send( monitorServerListChange(&self, dbInfo, serverChanges) ); self.addActor.send( trackEachStorageServer(&self, serverChanges.getFuture()) ); TraceEvent("RkTLogQueueSizeParameters").detail("Target", SERVER_KNOBS->TARGET_BYTES_PER_TLOG).detail("Spring", SERVER_KNOBS->SPRING_BYTES_TLOG)