diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index 0ff41b12d9..8c43f39024 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -18,12 +18,14 @@ * limitations under the License. */ +#include #include #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/Notified.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/SystemData.h" +#include "fdbrpc/Stats.h" #include "fdbserver/ApplyMetadataMutation.h" #include "fdbserver/ConflictSet.h" #include "fdbserver/IKeyValueStore.h" @@ -39,6 +41,7 @@ #include "fdbserver/WorkerInterface.actor.h" #include "flow/ActorCollection.h" #include "flow/Error.h" +#include "flow/Histogram.h" #include "flow/actorcompiler.h" // This must be the last #include. @@ -168,6 +171,19 @@ struct Resolver : ReferenceCounted { Counter splitRequests; int numLogs; + // End-to-end server latency of resolver requests. + Reference resolverLatencyDist; + + // Queue wait times, per request. + Reference queueWaitLatencyDist; + + // Actual work, per req request. + Reference computeTimeDist; + + // Distribution of waiters in queue. + // 0 or 1 will be most common, but higher values are interesting. + Reference queueDepthDist; + Future logger; EncryptionAtRestMode encryptMode; @@ -184,7 +200,12 @@ struct Resolver : ReferenceCounted { resolvedStateTransactions("ResolvedStateTransactions", cc), resolvedStateMutations("ResolvedStateMutations", cc), resolvedStateBytes("ResolvedStateBytes", cc), resolveBatchOut("ResolveBatchOut", cc), metricsRequests("MetricsRequests", cc), - splitRequests("SplitRequests", cc) { + splitRequests("SplitRequests", cc), + resolverLatencyDist(Histogram::getHistogram("Resolver"_sr, "Latency"_sr, Histogram::Unit::milliseconds)), + queueWaitLatencyDist(Histogram::getHistogram("Resolver"_sr, "QueueWait"_sr, Histogram::Unit::milliseconds)), + computeTimeDist(Histogram::getHistogram("Resolver"_sr, "ComputeTime"_sr, Histogram::Unit::milliseconds)), + // Distribution of queue depths, with knowledge that Histogram has 32 buckets, and each bucket will have size 1. + queueDepthDist(Histogram::getHistogram("Resolver"_sr, "QueueDepth"_sr, Histogram::Unit::countLinear, 0, 31)) { specialCounter(cc, "Version", [this]() { return this->version.get(); }); specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); }); specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); }); @@ -251,8 +272,17 @@ ACTOR Future resolveBatch(Reference self, self->neededVersion.set(std::max(self->neededVersion.get(), req.prevVersion)); } + // Update queue depth metric before waiting. Check if we're going to be one of the waiters or not. + int waiters = self->version.numWaiting(); + if (self->version.get() < req.prevVersion) { + waiters++; + } + self->queueDepthDist->sampleRecordCounter(waiters); + choose { when(wait(self->version.whenAtLeast(req.prevVersion))) { + // Update queue depth metric after waiting. + self->queueDepthDist->sampleRecordCounter(self->version.numWaiting()); break; } when(wait(self->checkNeededVersion.onTrigger())) {} @@ -264,8 +294,16 @@ ACTOR Future resolveBatch(Reference self, g_network->setCurrentTask(TaskPriority::DefaultEndpoint); } + // Time until now has been spent waiting in the queue to do actual work. + double queueWaitEndTime = g_network->timer(); + self->queueWaitLatencyDist->sampleSeconds(queueWaitEndTime - req.requestTime()); + if (self->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!) + // This is the beginning of the compute phase of the + // resolver. There's no wait before it's done. + const double beginComputeTime = g_network->timer(); + ++self->resolveBatchStart; self->resolvedTransactions += req.transactions.size(); self->resolvedBytes += req.transactions.expectedSize(); @@ -463,6 +501,10 @@ ACTOR Future resolveBatch(Reference self, self->checkNeededVersion.trigger(); } + // Measure the time spent doing actual work in the resolver. + const double endComputeTime = g_network->timer(); + self->computeTimeDist->sampleSeconds(endComputeTime - beginComputeTime); + if (req.debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After"); } else { @@ -487,6 +529,11 @@ ACTOR Future resolveBatch(Reference self, req.reply.send(Never()); } + // Measure server-side RPC latency from the time a request was + // received to time the response was sent. + const double endTime = g_network->timer(); + self->resolverLatencyDist->sampleSeconds(endTime - req.requestTime()); + ++self->resolveBatchOut; return Void(); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 2fa585d0e2..d04d1b8f2a 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -358,6 +358,7 @@ struct TLogData : NonCopyable { Reference> degraded; std::vector tempTagMessages; + // Distribution of end-to-end server latency of tlog commit requests. Reference commitLatencyDist; TLogData(UID dbgid, @@ -2329,8 +2330,6 @@ ACTOR Future tLogCommit(TLogData* self, return Void(); } - state double beforeCommitT = now(); - // Not a duplicate (check relies on critical section between here self->version.set() below!) state bool isNotDuplicate = (logData->version.get() == req.prevVersion); if (isNotDuplicate) { @@ -2378,8 +2377,12 @@ ACTOR Future tLogCommit(TLogData* self, return Void(); } + // Measure server-side RPC latency from the time a request was + // received to time the response was sent. + const double endTime = g_network->timer(); + if (isNotDuplicate) { - self->commitLatencyDist->sampleSeconds(now() - beforeCommitT); + self->commitLatencyDist->sampleSeconds(endTime - req.requestTime()); } if (req.debugID.present()) diff --git a/fdbserver/include/fdbserver/ResolverInterface.h b/fdbserver/include/fdbserver/ResolverInterface.h index 04410a0ef3..2865b9e341 100644 --- a/fdbserver/include/fdbserver/ResolverInterface.h +++ b/fdbserver/include/fdbserver/ResolverInterface.h @@ -29,6 +29,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbrpc/Locality.h" #include "fdbrpc/fdbrpc.h" +#include "fdbrpc/TimedRequest.h" struct ResolverInterface { constexpr static FileIdentifier file_identifier = 1755944; @@ -121,7 +122,7 @@ struct ResolveTransactionBatchReply { } }; -struct ResolveTransactionBatchRequest { +struct ResolveTransactionBatchRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 16462858; Arena arena; diff --git a/fdbserver/include/fdbserver/TLogInterface.h b/fdbserver/include/fdbserver/TLogInterface.h index cc376fc595..c73dca0bc5 100644 --- a/fdbserver/include/fdbserver/TLogInterface.h +++ b/fdbserver/include/fdbserver/TLogInterface.h @@ -26,6 +26,7 @@ #include "fdbclient/CommitTransaction.h" #include "fdbclient/MutationList.h" #include "fdbclient/StorageServerInterface.h" +#include "fdbrpc/TimedRequest.h" #include struct TLogInterface { @@ -294,7 +295,7 @@ struct TLogCommitReply { } }; -struct TLogCommitRequest { +struct TLogCommitRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 4022206; SpanContext spanContext; Arena arena;