Add metrics to understand tail commit latency (#9435)

* Add server-side latency metrics for Resolver requests.

* Add separate resolver latency metrics for queue wait and compute time.

* Add histogram for queue depth observed on resolver (during metrics interval).

* Fix tlog latency measurement to use timer() instead of now().
This commit is contained in:
Dan Adkins 2023-02-24 14:13:12 -05:00 committed by GitHub
parent 03fbc59bb1
commit e3a61b9b22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 58 additions and 6 deletions

View File

@ -18,12 +18,14 @@
* limitations under the License.
*/
#include <algorithm>
#include <memory>
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbrpc/Stats.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/ConflictSet.h"
#include "fdbserver/IKeyValueStore.h"
@ -39,6 +41,7 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
#include "flow/Error.h"
#include "flow/Histogram.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -168,6 +171,19 @@ struct Resolver : ReferenceCounted<Resolver> {
Counter splitRequests;
int numLogs;
// End-to-end server latency of resolver requests.
Reference<Histogram> resolverLatencyDist;
// Queue wait times, per request.
Reference<Histogram> queueWaitLatencyDist;
// Actual work, per req request.
Reference<Histogram> computeTimeDist;
// Distribution of waiters in queue.
// 0 or 1 will be most common, but higher values are interesting.
Reference<Histogram> queueDepthDist;
Future<Void> logger;
EncryptionAtRestMode encryptMode;
@ -184,7 +200,12 @@ struct Resolver : ReferenceCounted<Resolver> {
resolvedStateTransactions("ResolvedStateTransactions", cc),
resolvedStateMutations("ResolvedStateMutations", cc), resolvedStateBytes("ResolvedStateBytes", cc),
resolveBatchOut("ResolveBatchOut", cc), metricsRequests("MetricsRequests", cc),
splitRequests("SplitRequests", cc) {
splitRequests("SplitRequests", cc),
resolverLatencyDist(Histogram::getHistogram("Resolver"_sr, "Latency"_sr, Histogram::Unit::milliseconds)),
queueWaitLatencyDist(Histogram::getHistogram("Resolver"_sr, "QueueWait"_sr, Histogram::Unit::milliseconds)),
computeTimeDist(Histogram::getHistogram("Resolver"_sr, "ComputeTime"_sr, Histogram::Unit::milliseconds)),
// Distribution of queue depths, with knowledge that Histogram has 32 buckets, and each bucket will have size 1.
queueDepthDist(Histogram::getHistogram("Resolver"_sr, "QueueDepth"_sr, Histogram::Unit::countLinear, 0, 31)) {
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); });
specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); });
@ -251,8 +272,17 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
self->neededVersion.set(std::max(self->neededVersion.get(), req.prevVersion));
}
// Update queue depth metric before waiting. Check if we're going to be one of the waiters or not.
int waiters = self->version.numWaiting();
if (self->version.get() < req.prevVersion) {
waiters++;
}
self->queueDepthDist->sampleRecordCounter(waiters);
choose {
when(wait(self->version.whenAtLeast(req.prevVersion))) {
// Update queue depth metric after waiting.
self->queueDepthDist->sampleRecordCounter(self->version.numWaiting());
break;
}
when(wait(self->checkNeededVersion.onTrigger())) {}
@ -264,8 +294,16 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
g_network->setCurrentTask(TaskPriority::DefaultEndpoint);
}
// Time until now has been spent waiting in the queue to do actual work.
double queueWaitEndTime = g_network->timer();
self->queueWaitLatencyDist->sampleSeconds(queueWaitEndTime - req.requestTime());
if (self->version.get() ==
req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!)
// This is the beginning of the compute phase of the
// resolver. There's no wait before it's done.
const double beginComputeTime = g_network->timer();
++self->resolveBatchStart;
self->resolvedTransactions += req.transactions.size();
self->resolvedBytes += req.transactions.expectedSize();
@ -463,6 +501,10 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
self->checkNeededVersion.trigger();
}
// Measure the time spent doing actual work in the resolver.
const double endComputeTime = g_network->timer();
self->computeTimeDist->sampleSeconds(endComputeTime - beginComputeTime);
if (req.debugID.present())
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After");
} else {
@ -487,6 +529,11 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
req.reply.send(Never());
}
// Measure server-side RPC latency from the time a request was
// received to time the response was sent.
const double endTime = g_network->timer();
self->resolverLatencyDist->sampleSeconds(endTime - req.requestTime());
++self->resolveBatchOut;
return Void();

View File

@ -358,6 +358,7 @@ struct TLogData : NonCopyable {
Reference<AsyncVar<bool>> degraded;
std::vector<TagsAndMessage> tempTagMessages;
// Distribution of end-to-end server latency of tlog commit requests.
Reference<Histogram> commitLatencyDist;
TLogData(UID dbgid,
@ -2329,8 +2330,6 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
return Void();
}
state double beforeCommitT = now();
// Not a duplicate (check relies on critical section between here self->version.set() below!)
state bool isNotDuplicate = (logData->version.get() == req.prevVersion);
if (isNotDuplicate) {
@ -2378,8 +2377,12 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
return Void();
}
// Measure server-side RPC latency from the time a request was
// received to time the response was sent.
const double endTime = g_network->timer();
if (isNotDuplicate) {
self->commitLatencyDist->sampleSeconds(now() - beforeCommitT);
self->commitLatencyDist->sampleSeconds(endTime - req.requestTime());
}
if (req.debugID.present())

View File

@ -29,6 +29,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/TimedRequest.h"
struct ResolverInterface {
constexpr static FileIdentifier file_identifier = 1755944;
@ -121,7 +122,7 @@ struct ResolveTransactionBatchReply {
}
};
struct ResolveTransactionBatchRequest {
struct ResolveTransactionBatchRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 16462858;
Arena arena;

View File

@ -26,6 +26,7 @@
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/MutationList.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbrpc/TimedRequest.h"
#include <iterator>
struct TLogInterface {
@ -294,7 +295,7 @@ struct TLogCommitReply {
}
};
struct TLogCommitRequest {
struct TLogCommitRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 4022206;
SpanContext spanContext;
Arena arena;