From 56ae46f89e86fa720998cf5dabb980126628e761 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 4 Mar 2019 14:16:39 -0800 Subject: [PATCH] Client lazily fetches health metrics from proxies --- fdbclient/DatabaseContext.h | 4 +- fdbclient/Knobs.cpp | 4 +- fdbclient/Knobs.h | 4 +- fdbclient/NativeAPI.actor.cpp | 69 ++++++++++++------------ fdbclient/NativeAPI.actor.h | 3 +- fdbclient/vexillographer/fdb.options | 3 -- fdbserver/workloads/Throttling.actor.cpp | 9 ++-- 7 files changed, 48 insertions(+), 48 deletions(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index fda49310b1..151c45e8ff 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -65,6 +65,7 @@ public: Reference getMasterProxies(); Future> getMasterProxiesFuture(); Future onMasterProxiesChanged(); + Future getHealthMetrics(bool detailed); // Update the watch counter for the database void addWatch(); @@ -161,7 +162,8 @@ public: int apiVersion; HealthMetrics healthMetrics; - Future updateHealthMetrics; + double healthMetricsLastUpdated; + double detailedHealthMetricsLastUpdated; }; #endif diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index e73cfc7058..ed70994153 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -70,8 +70,8 @@ ClientKnobs::ClientKnobs(bool randomize) { init( STORAGE_METRICS_SHARD_LIMIT, 100 ); if( randomize && BUGGIFY ) STORAGE_METRICS_SHARD_LIMIT = 3; init( STORAGE_METRICS_UNFAIR_SPLIT_LIMIT, 2.0/3.0 ); init( STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, 15.0 ); - init( UPDATE_HEALTH_METRICS_INTERVAL, 0.5 ); - init( UPDATE_DETAILED_HEALTH_METRICS_INTERVAL, 5.0 ); + init( AGGREGATE_HEALTH_METRICS_MAX_STALENESS, 0.5 ); + init( DETAILED_HEALTH_METRICS_MAX_STALENESS, 5.0 ); //KeyRangeMap init( KRM_GET_RANGE_LIMIT, 1e5 ); if( randomize && BUGGIFY ) KRM_GET_RANGE_LIMIT = 10; diff --git a/fdbclient/Knobs.h b/fdbclient/Knobs.h index 29c8778a78..29136be794 100644 --- a/fdbclient/Knobs.h +++ b/fdbclient/Knobs.h @@ -69,8 +69,8 @@ public: int STORAGE_METRICS_SHARD_LIMIT; double STORAGE_METRICS_UNFAIR_SPLIT_LIMIT; double STORAGE_METRICS_TOO_MANY_SHARDS_DELAY; - double UPDATE_HEALTH_METRICS_INTERVAL; - double UPDATE_DETAILED_HEALTH_METRICS_INTERVAL; + double AGGREGATE_HEALTH_METRICS_MAX_STALENESS; + double DETAILED_HEALTH_METRICS_MAX_STALENESS; //KeyRangeMap int KRM_GET_RANGE_LIMIT; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 61e44640c5..80b1ed6bca 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -466,32 +466,47 @@ ACTOR static Future monitorMasterProxiesChange(Reference updateHealthMetricsActor(DatabaseContext *cx) { - state bool sendDetailedHealthMetrics = networkOptions.sendDetailedHealthMetrics; - state double lastDetailed = 0; +ACTOR static Future getHealthMetricsActor(DatabaseContext *cx, bool detailed) { + if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) { + if (detailed) { + TraceEvent("SENDING_CACHED_DETAILED_METRICS"); + return cx->healthMetrics; + } + else { + HealthMetrics result; + result.update(cx->healthMetrics, false, false); + return result; + } + } + state bool sendDetailedRequest = detailed && now() - cx->detailedHealthMetricsLastUpdated > + CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS; loop { - wait( delay(CLIENT_KNOBS->UPDATE_HEALTH_METRICS_INTERVAL) ); - state bool sendDetailed = networkOptions.sendDetailedHealthMetrics && now() - lastDetailed > CLIENT_KNOBS->UPDATE_DETAILED_HEALTH_METRICS_INTERVAL; - loop { - choose { - when(wait(cx->onMasterProxiesChanged())) {} - when(state GetHealthMetricsReply rep = - wait(cx->getMasterProxies().isValid() && cx->getMasterProxies()->size() ? - loadBalance(cx->getMasterProxies(), - &MasterProxyInterface::getHealthMetrics, - GetHealthMetricsRequest(sendDetailed)) : - Never())) { - cx->healthMetrics.update(rep.healthMetrics, sendDetailed, true); - break; + choose { + when(wait(cx->onMasterProxiesChanged())) {} + when(GetHealthMetricsReply rep = + wait(loadBalance(cx->getMasterProxies(), &MasterProxyInterface::getHealthMetrics, + GetHealthMetricsRequest(sendDetailedRequest)))) { + cx->healthMetrics.update(rep.healthMetrics, detailed, true); + if (detailed) { + cx->healthMetricsLastUpdated = now(); + cx->detailedHealthMetricsLastUpdated = now(); + return cx->healthMetrics; + } + else { + cx->healthMetricsLastUpdated = now(); + HealthMetrics result; + result.update(cx->healthMetrics, false, false); + return result; } } } - if(sendDetailed) { - lastDetailed = now(); - } } } +Future DatabaseContext::getHealthMetrics(bool detailed = false) { + return getHealthMetricsActor(this, detailed); +} + DatabaseContext::DatabaseContext( Reference cluster, Reference> clientInfo, Future clientInfoMonitor, Standalone dbId, int taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) @@ -500,7 +515,8 @@ DatabaseContext::DatabaseContext( transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), transactionsMaybeCommitted(0), transactionsResourceConstrained(0), outstandingWatches(0), - latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000) + latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), + healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0) { maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; @@ -514,8 +530,6 @@ DatabaseContext::DatabaseContext( monitorMasterProxiesInfoChange = monitorMasterProxiesChange(clientInfo, &masterProxiesChangeTrigger); clientStatusUpdater.actor = clientStatusUpdateActor(this); - - updateHealthMetrics = updateHealthMetricsActor(this); } DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000) {} @@ -938,17 +952,6 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional valu validateOptionValue(value, false); networkOptions.slowTaskProfilingEnabled = true; break; - case FDBNetworkOptions::SEND_DETAILED_HEALTH_METRICS: - validateOptionValue(value, true); - int sendDetailedHealthMetrics; - try { - sendDetailedHealthMetrics = std::stoi(value.get().toString()); - } catch (...) { - TraceEvent(SevWarnAlways, "InvalidDetailedMetricsOptionValue").detail("Value", value.get().toString()); - throw invalid_option_value(); - } - networkOptions.sendDetailedHealthMetrics = (sendDetailedHealthMetrics > 0); - break; default: break; } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 7ccaf703c1..f5ba76e385 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -56,13 +56,12 @@ struct NetworkOptions { Optional logClientInfo; Standalone> supportedVersions; bool slowTaskProfilingEnabled; - bool sendDetailedHealthMetrics; // The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h. NetworkOptions() : localAddress(""), clusterFile(""), traceDirectory(Optional()), traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"), - traceFormat("xml"), slowTaskProfilingEnabled(false), sendDetailedHealthMetrics(false) {} + traceFormat("xml"), slowTaskProfilingEnabled(false) {} }; class Database { diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 0c829131cf..bf5045229a 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -107,9 +107,6 @@ description is not currently required but encouraged. description="Disables logging of client statistics, such as sampled transaction activity." />