Add metrics to understand tail commit latency (#9435)
* Add server-side latency metrics for Resolver requests. * Add separate resolver latency metrics for queue wait and compute time. * Add histogram for queue depth observed on resolver (during metrics interval). * Fix tlog latency measurement to use timer() instead of now().
This commit is contained in:
parent
03fbc59bb1
commit
e3a61b9b22
|
@ -18,12 +18,14 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbclient/Notified.h"
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbrpc/Stats.h"
|
||||
#include "fdbserver/ApplyMetadataMutation.h"
|
||||
#include "fdbserver/ConflictSet.h"
|
||||
#include "fdbserver/IKeyValueStore.h"
|
||||
|
@ -39,6 +41,7 @@
|
|||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/Histogram.h"
|
||||
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
|
@ -168,6 +171,19 @@ struct Resolver : ReferenceCounted<Resolver> {
|
|||
Counter splitRequests;
|
||||
int numLogs;
|
||||
|
||||
// End-to-end server latency of resolver requests.
|
||||
Reference<Histogram> resolverLatencyDist;
|
||||
|
||||
// Queue wait times, per request.
|
||||
Reference<Histogram> queueWaitLatencyDist;
|
||||
|
||||
// Actual work, per req request.
|
||||
Reference<Histogram> computeTimeDist;
|
||||
|
||||
// Distribution of waiters in queue.
|
||||
// 0 or 1 will be most common, but higher values are interesting.
|
||||
Reference<Histogram> queueDepthDist;
|
||||
|
||||
Future<Void> logger;
|
||||
|
||||
EncryptionAtRestMode encryptMode;
|
||||
|
@ -184,7 +200,12 @@ struct Resolver : ReferenceCounted<Resolver> {
|
|||
resolvedStateTransactions("ResolvedStateTransactions", cc),
|
||||
resolvedStateMutations("ResolvedStateMutations", cc), resolvedStateBytes("ResolvedStateBytes", cc),
|
||||
resolveBatchOut("ResolveBatchOut", cc), metricsRequests("MetricsRequests", cc),
|
||||
splitRequests("SplitRequests", cc) {
|
||||
splitRequests("SplitRequests", cc),
|
||||
resolverLatencyDist(Histogram::getHistogram("Resolver"_sr, "Latency"_sr, Histogram::Unit::milliseconds)),
|
||||
queueWaitLatencyDist(Histogram::getHistogram("Resolver"_sr, "QueueWait"_sr, Histogram::Unit::milliseconds)),
|
||||
computeTimeDist(Histogram::getHistogram("Resolver"_sr, "ComputeTime"_sr, Histogram::Unit::milliseconds)),
|
||||
// Distribution of queue depths, with knowledge that Histogram has 32 buckets, and each bucket will have size 1.
|
||||
queueDepthDist(Histogram::getHistogram("Resolver"_sr, "QueueDepth"_sr, Histogram::Unit::countLinear, 0, 31)) {
|
||||
specialCounter(cc, "Version", [this]() { return this->version.get(); });
|
||||
specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); });
|
||||
specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); });
|
||||
|
@ -251,8 +272,17 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
|
|||
self->neededVersion.set(std::max(self->neededVersion.get(), req.prevVersion));
|
||||
}
|
||||
|
||||
// Update queue depth metric before waiting. Check if we're going to be one of the waiters or not.
|
||||
int waiters = self->version.numWaiting();
|
||||
if (self->version.get() < req.prevVersion) {
|
||||
waiters++;
|
||||
}
|
||||
self->queueDepthDist->sampleRecordCounter(waiters);
|
||||
|
||||
choose {
|
||||
when(wait(self->version.whenAtLeast(req.prevVersion))) {
|
||||
// Update queue depth metric after waiting.
|
||||
self->queueDepthDist->sampleRecordCounter(self->version.numWaiting());
|
||||
break;
|
||||
}
|
||||
when(wait(self->checkNeededVersion.onTrigger())) {}
|
||||
|
@ -264,8 +294,16 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
|
|||
g_network->setCurrentTask(TaskPriority::DefaultEndpoint);
|
||||
}
|
||||
|
||||
// Time until now has been spent waiting in the queue to do actual work.
|
||||
double queueWaitEndTime = g_network->timer();
|
||||
self->queueWaitLatencyDist->sampleSeconds(queueWaitEndTime - req.requestTime());
|
||||
|
||||
if (self->version.get() ==
|
||||
req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!)
|
||||
// This is the beginning of the compute phase of the
|
||||
// resolver. There's no wait before it's done.
|
||||
const double beginComputeTime = g_network->timer();
|
||||
|
||||
++self->resolveBatchStart;
|
||||
self->resolvedTransactions += req.transactions.size();
|
||||
self->resolvedBytes += req.transactions.expectedSize();
|
||||
|
@ -463,6 +501,10 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
|
|||
self->checkNeededVersion.trigger();
|
||||
}
|
||||
|
||||
// Measure the time spent doing actual work in the resolver.
|
||||
const double endComputeTime = g_network->timer();
|
||||
self->computeTimeDist->sampleSeconds(endComputeTime - beginComputeTime);
|
||||
|
||||
if (req.debugID.present())
|
||||
g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After");
|
||||
} else {
|
||||
|
@ -487,6 +529,11 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
|
|||
req.reply.send(Never());
|
||||
}
|
||||
|
||||
// Measure server-side RPC latency from the time a request was
|
||||
// received to time the response was sent.
|
||||
const double endTime = g_network->timer();
|
||||
self->resolverLatencyDist->sampleSeconds(endTime - req.requestTime());
|
||||
|
||||
++self->resolveBatchOut;
|
||||
|
||||
return Void();
|
||||
|
|
|
@ -358,6 +358,7 @@ struct TLogData : NonCopyable {
|
|||
Reference<AsyncVar<bool>> degraded;
|
||||
std::vector<TagsAndMessage> tempTagMessages;
|
||||
|
||||
// Distribution of end-to-end server latency of tlog commit requests.
|
||||
Reference<Histogram> commitLatencyDist;
|
||||
|
||||
TLogData(UID dbgid,
|
||||
|
@ -2329,8 +2330,6 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
|
|||
return Void();
|
||||
}
|
||||
|
||||
state double beforeCommitT = now();
|
||||
|
||||
// Not a duplicate (check relies on critical section between here self->version.set() below!)
|
||||
state bool isNotDuplicate = (logData->version.get() == req.prevVersion);
|
||||
if (isNotDuplicate) {
|
||||
|
@ -2378,8 +2377,12 @@ ACTOR Future<Void> tLogCommit(TLogData* self,
|
|||
return Void();
|
||||
}
|
||||
|
||||
// Measure server-side RPC latency from the time a request was
|
||||
// received to time the response was sent.
|
||||
const double endTime = g_network->timer();
|
||||
|
||||
if (isNotDuplicate) {
|
||||
self->commitLatencyDist->sampleSeconds(now() - beforeCommitT);
|
||||
self->commitLatencyDist->sampleSeconds(endTime - req.requestTime());
|
||||
}
|
||||
|
||||
if (req.debugID.present())
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbrpc/Locality.h"
|
||||
#include "fdbrpc/fdbrpc.h"
|
||||
#include "fdbrpc/TimedRequest.h"
|
||||
|
||||
struct ResolverInterface {
|
||||
constexpr static FileIdentifier file_identifier = 1755944;
|
||||
|
@ -121,7 +122,7 @@ struct ResolveTransactionBatchReply {
|
|||
}
|
||||
};
|
||||
|
||||
struct ResolveTransactionBatchRequest {
|
||||
struct ResolveTransactionBatchRequest : TimedRequest {
|
||||
constexpr static FileIdentifier file_identifier = 16462858;
|
||||
Arena arena;
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/MutationList.h"
|
||||
#include "fdbclient/StorageServerInterface.h"
|
||||
#include "fdbrpc/TimedRequest.h"
|
||||
#include <iterator>
|
||||
|
||||
struct TLogInterface {
|
||||
|
@ -294,7 +295,7 @@ struct TLogCommitReply {
|
|||
}
|
||||
};
|
||||
|
||||
struct TLogCommitRequest {
|
||||
struct TLogCommitRequest : TimedRequest {
|
||||
constexpr static FileIdentifier file_identifier = 4022206;
|
||||
SpanContext spanContext;
|
||||
Arena arena;
|
||||
|
|
Loading…
Reference in New Issue