From 6afc36cc3300f4af13a11d23463be071a5891ba3 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Fri, 19 Aug 2022 20:05:37 -0400 Subject: [PATCH] add splunk dashboards (#7942) --- .../details.xml | 431 ++++++++ .../performance_overview.xml | 323 ++++++ .../ratekeeper.xml | 928 ++++++++++++++++++ .../recovery.xml | 873 ++++++++++++++++ .../transaction_latency.xml | 247 +++++ 5 files changed, 2802 insertions(+) create mode 100644 contrib/observability_splunk_dashboard/details.xml create mode 100644 contrib/observability_splunk_dashboard/performance_overview.xml create mode 100644 contrib/observability_splunk_dashboard/ratekeeper.xml create mode 100644 contrib/observability_splunk_dashboard/recovery.xml create mode 100644 contrib/observability_splunk_dashboard/transaction_latency.xml diff --git a/contrib/observability_splunk_dashboard/details.xml b/contrib/observability_splunk_dashboard/details.xml new file mode 100644 index 0000000000..70ff15883b --- /dev/null +++ b/contrib/observability_splunk_dashboard/details.xml @@ -0,0 +1,431 @@ +
+ + Details for FoundationDB Cluster +
+ + + * + + + + * + + + + + -60m@m + now + + + + + Default + 5 seconds + 1 minute + 10 minutes + 1 hour + 1 day + bins=100 + bins=100 + + + + All + Storage Server + Transaction Log + Proxy + Resolver + Master + Cluster Controller + Log Router + Data Distributor + Ratekeeper + Tester + + + + + * + + + + * + +
+ + + + Storage Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | rex field=BytesDurable "(?<DurableRate>.*) (?<DurableRoughness>.*) (?<DurableCounter>.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Input Rate + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | timechart $Span$ avg(InputRate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Bytes Queried + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?<Rate>.*) (?<Roughness>.*) (?<Counter>.*)" | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Average Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Max Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Disk Busyness + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine + 
$TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Max Run Loop Busyness by Role (for <=6.1, S2Pri1) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Max Run Loop Busyness by Priority (6.2+ only) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness<<MATCHSTR>>=PriorityBusy<<MATCHSTR>>/Elapsed] | timechart $Span$ max(Busyness*) + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + TLog Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Connection Timeouts (counted on both sides of connection) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?<OtherAddr>[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart sum(Count) by Machine useother=f + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Pairwise Connection Timeouts Between Datacenters + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled +| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) +| rex field=host "(?<Datacenter>..).*" +| eval Datacenter=if(isnotnull(pie_work_unit), 
pie_work_unit, Datacenter) +| rex field=WithAddr "(?<OtherIP>[^:]*):.*" +| join OtherIP + [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled + | rex field=Machine "(?<OtherIP>[^:]*):.*" + | rex field=host "(?<OtherDatacenter>..).*" + | eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)] +| eval DC1=if(Datacenter>OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter>OtherDatacenter, OtherDatacenter, Datacenter) +| eval Connection=DC1+" <-> " + DC2 +| eval Count=1+SuppressedEventCount +| timechart sum(Count) by Connection + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, descending) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?<IP>[^:]*):.*" | rex field=host "(?<Datacenter>..).*" | rex field=WithAddr "(?<OtherIP>[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?<OtherIP>[^:]*):.*" | rex field=host "(?<OtherDatacenter>..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP>OtherIP, IP, OtherIP), Addr2=if(IP>OtherIP, OtherIP, IP) | eval Connection=Addr1+" <-> " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + 
+
+
+ + + + Lazy Deletion Rate (making space available for reuse) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Vacuuming Rate (shrinking file) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Roles + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + + Slow Tasks (Sorted by Duration, Descending) + + index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + 
+
+ + + Event Counts (Sorted by Severity and Count, Descending) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Errors + + index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+
+ + + + Recoveries (Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan<RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan < RecoveryResetInterval)" | where NOT RecoverySpan < RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Process (Re)starts + + index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Failure Detection (Machine Filter Only) + + index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + +
+ + + + Storage Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + TLog Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + Data Movement by Type (Log Scale, Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as * + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Storage Server Max Bytes Stored by Host + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100 + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Master Failed Clients + + index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient +| stats count by FailedEndpoint + $TimeRange.earliest$ + $TimeRange.latest$ + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/performance_overview.xml b/contrib/observability_splunk_dashboard/performance_overview.xml new file mode 100644 index 0000000000..0719e2bbab --- /dev/null +++ b/contrib/observability_splunk_dashboard/performance_overview.xml @@ -0,0 +1,323 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 60s + +
+ + + Transaction Rate measured on Proxies + + Sum in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0) +| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Read Rate measured on Storage Servers + + Average in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" +| rex field=RowsQueried "(?<KRate>.*) (?<KRoughness>.*) (?<KCounter>.*)" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Proxies + + 1min Average + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " MutationBytes +| makemv delim=" " Mutations +| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0) +| bucket span=5s _time +| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as 
Mutations by _time +|eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000 +| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Storage Servers + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + GRV Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Commit Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Read Latency measured on all Storage Servers + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + RateKeeper: ReleasedTPS vs LimitTPS + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| 
table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + RateKeeper: Throttling Reason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + RateKeeper: Throttling Server + + Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records) + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original" +| streamstats count as numOfEvents +| where numOfEvents <= 10 +| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S") +| table DateTime, ReasonServerID + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + 
+
+
+ + + Disk Overhead = Disk Usage / Logical KV Size + + Y-axis is capped at 10 + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time +| eval overhead=StorageDiskUsedBytes/LogicalKVBytes +| timechart avg(overhead) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + KV Data Size + + + index=$Index$ LogGroup=$LogGroup$ +Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original +| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024 +|timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Disk Usage + + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time +|eval StorageDiskTotalMB = StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024 +| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Cluster Roles + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original" +| rex field=host "(?<HostDC>..).*-..(?<HostConfig>..).*" +| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC) +| makemv delim="," Roles +| stats dc(Machine) as MachineCount by Roles, HostDC +| stats list(HostDC), list(MachineCount) by Roles +| sort Roles + $TimeSpan.earliest$ + $TimeSpan.latest$ + + +
+
+
+ + + Storage Engine + + + index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime |head 2 + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+ + Cluster Generations + + Indicate FDB recoveries + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics |timechart max(Generation) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + +
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/ratekeeper.xml b/contrib/observability_splunk_dashboard/ratekeeper.xml new file mode 100644 index 0000000000..c4a31a8fbc --- /dev/null +++ b/contrib/observability_splunk_dashboard/ratekeeper.xml @@ -0,0 +1,928 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 30s + + + + Yes + No + + + + + MasterServer + MasterProxyServer + StorageServer + TLog + Resolver + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + +
+ + + Aggregated Storage Server Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" + | rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" + | rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" + | rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" + | bin span=5s _time + | stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time + | eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024 + |timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Aggregated Proxy Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0) +| bin span=60s _time +| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time +| eval WriteThroughputKB=sum(MutationBytesRatePerHost)/1000 +| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND 
TrackLatestType="Original" +| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0) +| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 3: RKOverview - RKLimitReason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + + Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkSSListFetchTimeout" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkTlogMinFreeSpaceZero" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original" +| timechart span=1s count by Type + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart 
span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval NonDurableVersions=Version-DurableVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -> lower LocalRate) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(LocalRate) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -> higher probability of rejecting read) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval 
SSFallBehindVersions=VersionLag +| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable +| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ 
+ $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput +| makemv delim=" " BytesDurable +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 19: Runtime Monitoring - Proxy Throughput + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 20: Runtime Monitoring - Proxy Queue Length + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 21: Runtime Monitoring - TLog UnpoppedVersion + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall) + + + 
index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics" +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer" + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 23: Runtime Monitoring - StorageServer Query Queue Length + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 24: Transaction Trace Stats - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan +| chart limit=0 count by 
TimeSpan $GRVByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 25: Transaction Trace Stats - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time ID TimeSpan Machine Location Time +| bin bins=20 span=$StatsReadSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 26: Transaction Trace Stats - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Machine + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR 
*ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count>=2 +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan Machine +| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 27: Transaction Tracing - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location like "%ProxyServer.%ProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Order Type ID Location Machine Roles 
+| sort 0 Order Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(mvcount(Time)==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(mvcount(Time)==4, T4-T3, T3-T2), TimeSpan=if(mvcount(Time)==4,T4-T1,T3-T1), _time=T1 +| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 28: Transaction Tracing - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan +| timechart span=30s limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$ + 
$TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 29: Transaction Tracing - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count=7 +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, 
TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1 +| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Step + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 
16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime +| search Count=4 +| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei +| table _time Step Duration Machinei Location Machine MachineStep +| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization=CPUSeconds/Elapsed +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ 
LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = ResidentMemory/Memory +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 35: Machine Performance - Disk (Reads Count and Writes Count) + + + index=$Index$ LogGroup=$LogGroup$ 
Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 36: Network Performance - Timeout + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 37: Network Performance - PingLatency + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=PingLatency) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + +
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/recovery.xml b/contrib/observability_splunk_dashboard/recovery.xml new file mode 100644 index 0000000000..6ba6b9a63b --- /dev/null +++ b/contrib/observability_splunk_dashboard/recovery.xml @@ -0,0 +1,873 @@ +
+ +
+ + + Table 1: Find long recovery (Input Index and LogGroup and Select a time span). + + + * + + + + + + + + + -0s + now + + + + + index=$IndexForOverview$ LogGroup=$LogGroupForOverview$ + ((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup +| search NOT ErrorDescription="Success" +| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits") +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| fillnull value="-" +| sort -Time +| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0) +| stats list(*) by ID Machine ifMasterTerminatedEvent +| rename list(*) as * +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| sort -Time +| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime +| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup + $time_token_for_recoveryhistorytable.earliest$ + $time_token_for_recoveryhistorytable.latest$ + + + + +
+
+
+ + + Table 2: Select timespan containing the long recovery and see all recovery attempts in the time span (The input Index and LogGroup and Timespan are for all following tables and charts) + + + * + + + + + + + + -0s@s + now + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled") +| rename ID as MasterID +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode +| fillnull value="-" ErrorDescription Reason ErrorCode +| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode) +| fillnull value="-" StatusCode +| sort 0 -Time -StatusCode +| stats list(*) by MasterID Machine +| rename list(*) as * +| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime +| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time +| sort -MyRecoveryCount +| fillnull value="-" MyRecoveryCount + 
$ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 3: Why recovery is triggered? Using WaitFailureClient event. Machine A detects Machine B's failure. First column is the time when WaitFailureClient happens. Columns of 2,3,4,5 are for A. Columns of 6,7 are for B. + + + index=$Index$ LogGroup=$LogGroup$ + Type="WaitFailureClient" +| table Type Time Machine FailedEndpoint +| replace *:tls with * in FailedEndpoint +| join Machine type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End" + | eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | rename As as Role + | table ID EndTime Machine Role] +| join FailedEndpoint type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" + | stats latest(*) by ID | rename latest(*) as * + | rename Machine as FailedEndpoint + | eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(*) by FailedEndpoint + | rename list(*) as * + | table FailedEndpoint FailedEndpointLatestRoleEventInfo] +| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| makemv delim=" " FailedEndpointLatestRoleEventInfo +| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 4: New Recruitment Configuration (using MasterRecoveredConfig event) + + + index=$Index$ LogGroup=$LogGroup$ + Type="MasterRecoveredConfig" AND TrackLatestType="Original" +| eval Configuration=replace(Conf, "&quot;", "\"") +| rename Configuration as _raw + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + Table 5: Data Centers (using ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics +| dedup DCID +| rename DCID as DataCenterID +| table DataCenterID pie_work_unit +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 6: New Role (using Role event joined by ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search count=1 AND Transition="Begin" +| table ID Role Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter +| fillnull value="null" DataCenter +| stats count by Role DataCenter + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 7: Role Details + + + MasterServer + TLog + Resolver + MasterProxyServer (for <7.0) + LogRouter + CommitProxyServer (for 7.0+) + GrvProxyServer (for 7.0+) + As=" + " + OR + + + + Begin + End + Begin->End + count=1 AND Transition="Begin" + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search $RoleDetailTableWhichRoleToken$ +| table ID Role Machine Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter Time +| fillnull value="null" DataCenter +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Role Machine DataCenter DateTime +| sort 0 -DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 8: CC Recruitment SevWarn OR SevError (use events in clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration) + + + index=$Index$ LogGroup=$LogGroup$ + Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError" + | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-") + | table Type GoodRecruitmentTimeReady Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + +
+
+
+ + + Table 9: RecoveryCount of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore") +| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID RecoveryCount Type DateTime | fillnull value="Not found. The fdb version is somewhat old." + -7d@h + now + + + +
+
+ + Table 10: Which roles the selected TLog (in Table 11) talks to + + + index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| table TLogID TLogEvents +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2) +| fields - temp - TLogEvents +| sort 0 -Time +| search NOT MasterID="NULL" +| dedup MasterID +| rename MasterID as ID +| join type=left ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role") + | sort 0 -Time + | dedup ID + | table ID Machine As] +| table ID Machine As | fillnull value="null" Machine As + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$ +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." 
"."null") +| stats list(TLogEvents) by TLogID +| rename list(TLogEvents) As TLogEvents +| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0) +| table TLogID TLogEvents EarliestEvent LatestEvent +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND As="TLog") + | sort 0 -Time + | dedup ID + | rename ID as TLogID + | table TLogID host LogGroup Machine] +| table TLogID Machine LogGroup host EarliestEvent LatestEvent +| fillnull value="null" Machine host LogGroup +| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime +| table TLogID Machine EarliestTime Duration LogGroup host +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="null" DataCenter +| table TLogID Machine DataCenter EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled") + | sort -Time + | eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." 
".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") + | stats list(*) by TLogID + | rename list(*) As * + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | sort TLogID + | table TLogID TLogEvents + | mvexpand TLogEvents + | eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2) + | fields - temp - TLogEvents + | sort 0 -Time + | search NOT RoleID="NULL" + | table TLogID RoleID MasterMachine + | stats list(*) by TLogID + | rename list(*) as * + | streamstats count + | mvexpand RoleID + | dedup count RoleID + | fields - count + | stats count by TLogID + | rename count as Roles + | table TLogID Roles] +| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled")) + | sort -Time + | eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type) + | sort 0 TLogEvents + | stats list(TLogEvents) by TLogID + | rename list(TLogEvents) As TLogEvents + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND 
like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | mvcombine delim=" " TLogEvents + | table TLogID TLogEvents] +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup +| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogStart") OR (Type="TLogPersistentStateRestore") + | eval TLogID=if(Type="TLogStart", ID, LogId) + | table TLogID RecoveryCount] +| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| fillnull value="TLog too old, click and see details" RecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + $click.value$ + +
+
+ + Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." 
"."-") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$) + | dedup ID + | rename ID as TLogID + | table TLogID Machine] +| table TLogID Machine TLogEvents +| fillnull value="-" Machine +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3) +| fields - temp TLogEvents +| join type=left ToID + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role") + | dedup ID + | rename ID as ToID + | rename As as ToRole + | rename Machine as ToMachine + | table ToID ToRole ToMachine] +| sort 0 -Time +| fillnull value="-" ToRole ToMachine +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine Event DateTime ToID ToRole ToMachine Time + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + 
+
+
+ + + Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| rename Tags as UnpoppedRecoveredTagCount +| rename Tag as TagPopped +| rename DurableKCVer as DurableKnownCommittedVersion +| search TagPopped!="-1:2" +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt +| sort 0 -UnpoppedRecoveredTagCount +| join TagPopped type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | rename Tag as TagPopped + | table TagPopped ID Machine] +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| rename ID as SSID +| rename Machine as SSMachine +| rename DataCenter as SSDataCenter +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+ + Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| table TLogID Type AllTags Locality +| makemv delim="," AllTags +| mvexpand AllTags +| rename AllTags as Tag | sort 0 Tag +| join Tag type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | table Tag ID Machine] +| table TLogID Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="-" +| table TLogID Tag ID Machine DataCenter +| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter +| search Tag!="-1:2" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 15: The Tags of the selected TLog (in Table 11) that are not popped by SSes (using set diff tags in Table 13 and Table 14) (if result contains "...", the result of Table 15 is wrong) + + + | set diff + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") + | table AllTags + | makemv delim="," AllTags + | mvexpand AllTags + | rename AllTags as Tag + | table Tag] + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") + | table Tag] + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 16: All Current Storage Servers (assume each machine has at most one SS) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") AND $TriggerSSTableToken$ +| stats latest(*) by Machine +| rename latest(*) as * +| table Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Machine DataCenter Tag +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled")) + | stats latest(*) by Machine + | rename latest(*) as * + | rename As as Role + | table ID Role Machine + | join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] + | table ID Role Machine DataCenter + | fillnull value="null" DataCenter] +| sort 0 DataCenter +| table Tag ID Machine DataCenter | sort 0 Tag + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 1: Timeout/TimedOut event distribution grouped by source (Machine) + + + 5s + + + + TLog + MasterServer + MasterProxyServer (for version < 7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for ver 7+) + As=" + " + OR + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including the role that refresh/begin/end in 
the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| stats count as TotalTimeouts by Machine PeerAddr +| table Machine PeerAddr TotalTimeouts +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS MachineRoleLatestEvent BY Machine + ] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS PeerRoleLatestEvent BY Machine + | rename Machine AS PeerAddr + ] +| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 18: Proxy 0 + + + index=$Index$ LogGroup=$LogGroup$ + (Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True" +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table WorkerID LogGroup FirstProxy Time DateTime +| sort 0 -Time +| join type=left WorkerID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND As="Worker" AND Transition="Refresh" + | dedup ID + | rename ID as WorkerID + | stats list(*) by WorkerID + | rename list(*) as * + | table WorkerID Machine Roles] +| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh" + | dedup ID + | rename ID as ProxyID + | table Machine ProxyID] +| table ProxyID Machine LogGroup FirstProxy + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 19: Latest Role Events on the input Machine (Input Machine, like 172.27.113.121:4500) + + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND Machine=$SearchMachineToken$ +| stats latest(*) by ID Transition +| rename latest(*) as * +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason +| sort 0 -DateTime +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 3: severity>=20 event distribution (including roles that refresh/begin/end in the timespan) + + + * + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + EventType + Machine + Severity + Type + + + + 5s + + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 AND $BadEvents$ +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID | table Machine] +| table Machine Type Severity _time +| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$ + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 20: Check severity>=20 events of roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 +| stats count by Machine Type +| rename count as Count +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID + | eval Role=As."-".ID + | stats list(Role) by Machine + | rename list(Role) as Roles + | table Machine Roles] +| table Type Count Roles Machine +| sort -Count + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + 
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/transaction_latency.xml b/contrib/observability_splunk_dashboard/transaction_latency.xml new file mode 100644 index 0000000000..99b551f2c9 --- /dev/null +++ b/contrib/observability_splunk_dashboard/transaction_latency.xml @@ -0,0 +1,247 @@ +
+ + Design for ClusterController issued transactions. +
+ + + + + + + * + + + + * + + + + + @d + now + + +
+ + + All Transactions (Currently, this table does not cover getrange operations or operations that do not commit). + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$ + (Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID") +| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type=="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To) +| stats list(To) by ID +| rename list(To) as ToList +| table ID ToList +| eval Count = mvcount(ToList) +| search Count=3 +| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1)) +| table ID GrvID ReadID CommitID +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before") + | rename ID as GrvID + | rename Time as BeginTime + | table GrvID BeginTime + ] +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After") + | rename ID as GrvID + | rename Time as GRVDoneTime + | table GrvID GRVDoneTime + ] +| join ReadID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="GetValueDebug" AND Location="NativeAPI.getValue.After") + | rename ID as ReadID + | rename Time as ReadDoneTime + | table ReadID ReadDoneTime + ] +| join CommitID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="CommitDebug" AND Location="NativeAPI.commit.After") + | rename ID as CommitID + | rename Time as CommitDoneTime + | table CommitID CommitDoneTime + ] +| 
rename ID as TransactionID +| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration + $time_token.earliest$ + $time_token.latest$ + + + + $row.BeginTimeScope$ + $row.EndTimeScope$ + $row.ReadID$ + $row.GrvID$ + $row.CommitID$ + + 
+
+
+ + + Step1: GRV + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (NOT MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion) +AND (ID=$GrvID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To]) +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Delta Order Type ID Location Machine Roles +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+ + Step1: (Only for FDB v6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion Events) + + only for FDB 6.3 + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion" + AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To] +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime +| sort 0 -Time +| table Machine Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step2: GetValue + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$ +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (ID=$CommitID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To]) + +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step3: Commit --- Resolver + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="Resolver*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8) +| sort 0 Time Order +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table Machine DataCenter Roles Duration Location Delta Time + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="TLog*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| sort 0 Time +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time + $BeginTime$ + $EndTime$ + + + +
+
+
+
\ No newline at end of file