add splunk dashboards (#7942)

This commit is contained in:
Zhe Wang 2022-08-19 20:05:37 -04:00 committed by GitHub
parent f0bade0ba4
commit 6afc36cc33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 2802 additions and 0 deletions

View File

@ -0,0 +1,431 @@
<form theme="light">
<label>FoundationDB - Details</label>
<description>Details for FoundationDB Cluster</description>
<!-- Shared filter inputs for the Details dashboard. Each input sets a token that the panel queries below substitute by name; searchWhenChanged reruns the searches as soon as a value changes, so no submit button is shown. -->
<fieldset submitButton="false">
<!-- Index token: substituted into the index= term of the panel searches; defaults to the wildcard. -->
<input type="text" token="Index" searchWhenChanged="true">
<label>Index</label>
<default>*</default>
</input>
<!-- LogGroup token: substituted into the LogGroup= term of the panel searches; defaults to the wildcard. -->
<input type="text" token="LogGroup" searchWhenChanged="true">
<label>LogGroup</label>
<default>*</default>
</input>
<!-- TimeRange token: its earliest and latest parts bound every panel search; defaults to the last 60 minutes. -->
<input type="time" token="TimeRange" searchWhenChanged="true">
<label>Time Range</label>
<default>
<earliest>-60m@m</earliest>
<latest>now</latest>
</default>
</input>
<!-- Span token: each choice value is a complete timechart argument (bins=... or span=...) that is pasted directly after the timechart command. -->
<input type="dropdown" token="Span" searchWhenChanged="true">
<label>Timechart Resolution</label>
<choice value="bins=100">Default</choice>
<choice value="span=5s">5 seconds</choice>
<choice value="span=1m">1 minute</choice>
<choice value="span=10m">10 minutes</choice>
<choice value="span=1h">1 hour</choice>
<choice value="span=1d">1 day</choice>
<default>bins=100</default>
<initialValue>bins=100</initialValue>
</input>
<!-- Roles token: each choice value is a complete search term such as a wildcard match on the Roles field; the All choice inserts an empty string, i.e. no role restriction. -->
<input type="dropdown" token="Roles" searchWhenChanged="true">
<label>Roles</label>
<choice value="">All</choice>
<choice value="Roles=*SS*">Storage Server</choice>
<choice value="Roles=*TL*">Transaction Log</choice>
<choice value="Roles=*MP*">Proxy</choice>
<choice value="Roles=*RV*">Resolver</choice>
<choice value="Roles=*MS*">Master</choice>
<choice value="Roles=*CC*">Cluster Controller</choice>
<choice value="Roles=*LR*">Log Router</choice>
<choice value="Roles=*DD*">Data Distributor</choice>
<choice value="Roles=*RK*">Ratekeeper</choice>
<choice value="Roles=*TS*">Tester</choice>
<default></default>
</input>
<!-- Host token: substituted into the host= term of the filtered panels; defaults to the wildcard. -->
<input type="text" token="Host" searchWhenChanged="true">
<label>Host</label>
<default>*</default>
</input>
<!-- Machine token: substituted into the Machine= term of the filtered panels (and System= in the failure detection panel); defaults to the wildcard. -->
<input type="text" token="Machine" searchWhenChanged="true">
<label>Machine</label>
<default>*</default>
</input>
</fieldset>
<!-- Row: storage server throughput panels, all built from StorageMetrics events with TrackLatestType=Original and all honouring the Roles, Host, Machine and Span filters. -->
<row>
<!-- Storage Queue Size: rex splits BytesInput and BytesDurable into three space-separated captures each (named Rate, Roughness and Counter by the patterns), then charts the average of InputCounter minus DurableCounter per Machine. -->
<panel>
<chart>
<title>Storage Queue Size</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?&lt;InputRate&gt;.*) (?&lt;InputRoughness&gt;.*) (?&lt;InputCounter&gt;.*)" | rex field=BytesDurable "(?&lt;DurableRate&gt;.*) (?&lt;DurableRoughness&gt;.*) (?&lt;DurableCounter&gt;.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Storage Input Rate: charts the average of the first capture of BytesInput (InputRate) per Machine. -->
<panel>
<chart>
<title>Storage Input Rate</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?&lt;InputRate&gt;.*) (?&lt;InputRoughness&gt;.*) (?&lt;InputCounter&gt;.*)" | timechart $Span$ avg(InputRate) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Storage Bytes Queried: charts the average of the first capture of BytesQueried (Rate) per Machine. -->
<panel>
<chart>
<title>Storage Bytes Queried</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?&lt;Rate&gt;.*) (?&lt;Roughness&gt;.*) (?&lt;Counter&gt;.*)" | timechart $Span$ avg(Rate) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Row: per-process CPU and disk panels, all built from ProcessMetrics events with TrackLatestType=Original. -->
<row>
<!-- Average Process CPU by Role: Cpu is CPUSeconds divided by Elapsed; the chart plots the average per Roles value and fixes the Y axis maximum at 2. -->
<panel>
<chart>
<title>Average Process CPU by Role (capped at 2; beware kernel bug)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.axisY.maximumNumber">2</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Max Process CPU by Role: same Cpu ratio as the average panel, aggregated with max instead of avg. -->
<panel>
<chart>
<title>Max Process CPU by Role (capped at 2; beware kernel bug)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.axisY.maximumNumber">2</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Disk Busyness: (Elapsed minus DiskIdleSeconds) divided by Elapsed, averaged per Machine. Despite the field name DiskBusyPercentage the value is a ratio; it is not multiplied by 100. -->
<panel>
<chart>
<title>Disk Busyness</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Row: run loop busyness (from NetworkMetrics events) and transaction log queue size (from TLogMetrics events). -->
<row>
<!-- Max Run Loop Busyness by Role: Busyness uses PriorityStarvedBelow1/Elapsed when that field exists, otherwise PriorityBusy1/Elapsed when that exists, otherwise the raw S2Pri1 field. Events with TrackLatestType=Rolled are excluded. -->
<panel>
<chart>
<title>Max Run Loop Busyness by Role (for &lt;=6.1, S2Pri1)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Max Run Loop Busyness by Priority: foreach visits every field whose name starts with PriorityBusy and creates a matching Busyness field holding that value divided by Elapsed; the chart plots the max of each Busyness field as its own series. -->
<panel>
<chart>
<title>Max Run Loop Busyness by Priority (6.2+ only)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness&lt;&lt;MATCHSTR&gt;&gt;=PriorityBusy&lt;&lt;MATCHSTR&gt;&gt;/Elapsed] | timechart $Span$ max(Busyness*)</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- TLog Queue Size: SharedBytesInput minus SharedBytesDurable, averaged per Machine. -->
<panel>
<chart>
<title>TLog Queue Size</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<!-- Connection Timeouts: for ConnectionTimedOut events WithAddr is replaced by PeerAddr; rex keeps the part of WithAddr up to the second colon as OtherAddr. Machine is then turned into a multivalue field holding both the reporting Machine and OtherAddr, so the Machine filter (applied afterwards with the search command) matches either end and each event is charted under both. Count is 1 plus SuppressedEventCount. -->
<!-- NOTE(review): this timechart does not take the Span token, unlike most other charts on this dashboard. Confirm whether that is intentional. -->
<panel>
<chart>
<title>Connection Timeouts (counted on both sides of connection)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?&lt;OtherAddr&gt;[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart sum(Count) by Machine useother=f</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.chart.nullValueMode">zero</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Pairwise Connection Timeouts Between Datacenters: ignores the Roles, Host and Machine filters (host and Machine are wildcards). -->
<!-- The local datacenter is the first two characters of host, overridden by pie_work_unit when that field is present. The remote datacenter is found by joining the peer IP (the part of WithAddr before the first colon) against ProcessMetrics events, whose Machine field supplies the IP and whose host supplies the datacenter in the same way. -->
<!-- DC1 and DC2 are ordered by comparison so that both directions of a pair collapse into one Connection label. -->
<!-- Count is 1 plus the number of suppressed duplicates; coalesce keeps events without SuppressedEventCount counted once. The chart sums Count so suppressed events are included, matching the pairwise table panel on this dashboard. -->
<panel>
<chart>
<title>Pairwise Connection Timeouts Between Datacenters</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled
| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr)
| rex field=host "(?&lt;Datacenter&gt;..).*"
| eval Datacenter=if(isnotnull(pie_work_unit), pie_work_unit, Datacenter)
| rex field=WithAddr "(?&lt;OtherIP&gt;[^:]*):.*"
| join OtherIP
[search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled
| rex field=Machine "(?&lt;OtherIP&gt;[^:]*):.*"
| rex field=host "(?&lt;OtherDatacenter&gt;..).*"
| eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)]
| eval DC1=if(Datacenter&gt;OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter&gt;OtherDatacenter, OtherDatacenter, Datacenter)
| eval Connection=DC1+" &lt;-&gt; " + DC2
| eval Count=1+coalesce(SuppressedEventCount, 0)
| timechart sum(Count) by Connection</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Pairwise Connection Timeouts Between Known Server Processes: the base search pulls timeout events and ProcessMetrics events together so that eventstats can attach every Roles value seen for the local IP. The join on OtherIP attaches the roles and datacenter of the peer from a ProcessMetrics subsearch. -->
<!-- Both endpoints are rendered as "datacenter: ip (roles)" and ordered by comparison so that either direction of a pair yields the same Connection string. The table sums Count (1 plus SuppressedEventCount) and lists the distinct Reason strings per Connection, largest Count first. -->
<panel>
<table>
<title>Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, descending)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?&lt;IP&gt;[^:]*):.*" | rex field=host "(?&lt;Datacenter&gt;..).*" | rex field=WithAddr "(?&lt;OtherIP&gt;[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?&lt;OtherIP&gt;[^:]*):.*" | rex field=host "(?&lt;OtherDatacenter&gt;..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP&gt;OtherIP, IP, OtherIP), Addr2=if(IP&gt;OtherIP, OtherIP, IP) | eval Connection=Addr1+" &lt;-&gt; " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<!-- Lazy Deletion Rate: derives a rate from the LazyDeletePages field of SpringCleaningMetrics events. -->
<!-- streamstats with current=f and window=1 copies Metric and Time from the event immediately preceding this one in the result stream (per ID) into NextMetric and NextTime; the names suggest the stream is newest-first so that event is the chronologically next sample. Rate is the page delta times 4096 divided by the Time delta (4096 is presumably the page size in bytes; confirm). -->
<!-- The streamstats clause is written without a comma between first(Time) and its "as" rename, which is the documented aggregation syntax. -->
<panel>
<chart>
<title>Lazy Deletion Rate (making space available for reuse)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Vacuuming Rate: derives a rate from the VacuumedPages field of SpringCleaningMetrics events. -->
<!-- streamstats with current=f and window=1 copies Metric and Time from the event immediately preceding this one in the result stream (per ID) into NextMetric and NextTime; the names suggest the stream is newest-first so that event is the chronologically next sample. Rate is the page delta times 4096 divided by the Time delta (4096 is presumably the page size in bytes; confirm). -->
<!-- The streamstats clause is written without a comma between first(Time) and its "as" rename, which is the documented aggregation syntax. -->
<panel>
<chart>
<title>Vacuuming Rate (shrinking file)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Roles: splits the comma-separated Roles field into a multivalue field, expands it to one event per role, and charts the distinct count of Machine values per role on a logarithmic Y axis. Events with TrackLatestType=Rolled are excluded; no Type restriction is applied. -->
<panel>
<chart>
<title>Roles</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Row: event listing tables; all three honour the Roles, Host and Machine filters. -->
<row>
<!-- Slow Tasks: SlowTask events sorted by Duration, largest first, showing time, Duration, Machine, TaskID and Roles. -->
<panel>
<table>
<title>Slow Tasks (Sorted by Duration, Descending)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<!-- Event Counts: number of events per Type and Severity (excluding TrackLatestType=Rolled), ordered by Severity then Count, both descending; 10 rows per page. -->
<panel>
<table>
<title>Event Counts (Sorted by Severity and Count, Descending)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<!-- Errors: lists events whose Severity equals 40 (excluding TrackLatestType=Rolled) with time, Type, Machine and Roles. -->
<panel>
<table>
<title>Errors</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<!-- Row: recovery history, process starts and failure detection. -->
<row>
<!-- Recoveries: works on MasterRecoveryState events with StatusCode 0 or 11 and does not apply the Roles, Host or Machine filters. Steps, in query order: -->
<!-- 1. In ascending time order, streamstats tracks the earliest time and event count of the current group and resets after each StatusCode=11 event; only the StatusCode=11 rows are kept. A group holding just that one event gets EventCount 2 and a start derived from RecoveryDuration. -->
<!-- 2. In descending time order, each row picks up the RecoveryStart of the row before it in the stream (the chronologically next one) to compute RecoverySpan; FailedRecoveries is EventCount minus 2. -->
<!-- 3. In ascending order again, consecutive rows whose RecoverySpan is below RecoveryResetInterval (10) are accumulated into the first later row that is not below it, summing failed and successful counts and AvailableSeconds. -->
<!-- 4. Duration is end time minus accumulated start; ShortLivedRecoveryCount is the accumulated successful count minus 1. Output is sorted by StartTime descending. -->
<panel>
<table>
<title>Recoveries (Ignores Filters)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan&lt;RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan &lt; RecoveryResetInterval)" | where NOT RecoverySpan &lt; RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<!-- Process (Re)starts: ProgramStart events for the filtered processes, showing time and Machine, newest first. -->
<panel>
<table>
<title>Process (Re)starts</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<!-- Failure Detection: FailureDetectionStatus events whose System field matches the Machine token (the only filter applied). Failed is 1 when Status is "Failed", else 0. In ascending time order a 2-event window per System supplies PrevFailed; rows where neither value is 1 are dropped. The two values are then joined, split and expanded so both sides of a transition are plotted, and the chart shows the max per System with the Y axis capped at 1. -->
<panel>
<chart>
<title>Failure Detection (Machine Filter Only)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.axisY.maximumNumber">1</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Storage Server Space Usage: from StorageMetrics events, computes per event the available and free ratios against KvstoreBytesTotal, used and stored sizes divided by 1e9, and Overhead as KvstoreBytesUsed over BytesStored; keeps the latest value of each per Machine and sorts by AvailableSpacePercent ascending. The two Percent fields are ratios; they are not multiplied by 100. -->
<row>
<panel>
<table>
<title>Storage Server Space Usage (Sorted by Available Space Percentage, Ascending)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<!-- TLog Server Space Usage: from TLogMetrics events, computes available and free ratios against KvstoreBytesTotal plus used and total sizes divided by 1e9, keeps the latest value of each per Machine and sorts by AvailableSpacePercent ascending. host and Machine are wildcards here, so the Host, Machine and Roles filters are not applied. -->
<!-- NOTE(review): Roles=TL is an exact match, whereas the Roles dropdown uses a wildcard match around TL. If Roles can hold several comma-separated values, processes with additional roles would be omitted here. Confirm intent. -->
<row>
<panel>
<table>
<title>TLog Server Space Usage (Sorted by Available Space Percentage, Ascending)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<!-- Row: data movement and stored bytes over time. Neither timechart takes the Span token. -->
<row>
<!-- Data Movement by Type: averages every field of MovingData events whose name starts with Priority, naming each series after the remainder of the field name; logarithmic Y axis; no Roles, Host or Machine filter. -->
<panel>
<chart>
<title>Data Movement by Type (Log Scale, Ignores Filters)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as *</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Storage Server Max Bytes Stored by Host: BytesStored divided by 1e9, charted as the max per host with up to 100 series. -->
<panel>
<chart>
<title>Storage Server Max Bytes Stored by Host</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Master Failed Clients: counts WaitFailureClient events per FailedEndpoint value. Only the Index, LogGroup and time range inputs apply. -->
<row>
<panel>
<table>
<title>Master Failed Clients</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient
| stats count by FailedEndpoint</query>
<earliest>$TimeRange.earliest$</earliest>
<latest>$TimeRange.latest$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
</form>

View File

@ -0,0 +1,323 @@
<form theme="dark">
<label>FoundationDB - Performance Overview (Dev WiP)</label>
<!-- Shared inputs for the Performance Overview dashboard. Each input sets a token that the panel queries below substitute by name. -->
<fieldset submitButton="false" autoRun="true">
<!-- Index token: substituted into the index= term of every panel search; defaults to the wildcard. -->
<input type="text" token="Index" searchWhenChanged="true">
<label>Index</label>
<default>*</default>
</input>
<!-- LogGroup token: substituted into the LogGroup= term of every panel search. The default is empty, so a value has to be typed in. -->
<input type="text" token="LogGroup" searchWhenChanged="true">
<label>LogGroup</label>
<default></default>
</input>
<!-- TimeSpan token: its earliest and latest parts bound every panel search; defaults to the last 60 minutes. -->
<input type="time" token="TimeSpan" searchWhenChanged="true">
<label>TimeSpan</label>
<default>
<earliest>-60m@m</earliest>
<latest>now</latest>
</default>
</input>
<!-- UpdateRateTypeToken: appended directly to the literal RkUpdate in the Type= term of the two RateKeeper charts, so Normal selects Type=RkUpdate and Batch selects Type=RkUpdateBatch. -->
<input type="dropdown" token="UpdateRateTypeToken" searchWhenChanged="true">
<label>RK: Normal or Batch Txn</label>
<choice value="">Normal</choice>
<choice value="Batch">Batch</choice>
<default></default>
</input>
<!-- ChartBinSizeToken: used as the timechart span value (and shown in some chart titles); the value carries its own unit suffix, 60s by default. -->
<input type="text" token="ChartBinSizeToken" searchWhenChanged="true">
<label>Chart Bin Size</label>
<default>60s</default>
</input>
</fieldset>
<!-- Row: transaction rate seen by proxies and read rate seen by storage servers. -->
<row>
<!-- Transaction Rate: on ProxyMetrics and GrvProxyMetrics events, each Txn field is split on spaces and its first element is taken as the rate; the chart sums those rates across events per time bin. -->
<!-- NOTE(review): the chart title appends the word "seconds" after the bin size token, whose default value already ends in "s". -->
<panel>
<title>Transaction Rate measured on Proxies</title>
<chart>
<title>Sum in $ChartBinSizeToken$ seconds</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled
| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0)
| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Read Rate: on StorageMetrics events, rex takes the first space-separated capture of BytesQueried, RowsQueried and BytesFetched as rates and charts their averages per bin. The BytesInput captures (WRate and friends) are extracted but not charted in this panel. -->
<panel>
<title>Read Rate measured on Storage Servers</title>
<chart>
<title>Average in $ChartBinSizeToken$ seconds</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original"
| rex field=BytesQueried "(?&lt;RRate&gt;.*) (?&lt;RRoughness&gt;.*) (?&lt;RCounter&gt;.*)"
| rex field=RowsQueried "(?&lt;KRate&gt;.*) (?&lt;KRoughness&gt;.*) (?&lt;KCounter&gt;.*)"
| rex field=BytesInput "(?&lt;WRate&gt;.*) (?&lt;WRoughness&gt;.*) (?&lt;WCounter&gt;.*)"
| rex field=BytesFetched "(?&lt;FRate&gt;.*) (?&lt;FRoughness&gt;.*) (?&lt;FCounter&gt;.*)"
| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Row: write rate seen by proxies and by storage servers. -->
<row>
<!-- Write Rate on Proxies: the first space-separated element of MutationBytes and Mutations is taken as the rate; rates are summed across events in 5 second buckets, converted to MB (divide by 1024 twice) and thousands (divide by 1000), then averaged per chart bin. -->
<!-- NOTE(review): the chart title says "1min Average" but the averaging window follows the bin size token, which is 60s only by default. -->
<panel>
<title>Write Rate measured on Proxies</title>
<chart>
<title>1min Average</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
| makemv delim=" " MutationBytes
| makemv delim=" " Mutations
| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0)
| bucket span=5s _time
| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as Mutations by _time
|eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000
| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.abbreviation">none</option>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.layout.splitSeries">0</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Write Rate on Storage Servers: averages the first capture of BytesInput (WRate) and of BytesFetched (FRate) from StorageMetrics events per chart bin. -->
<panel>
<title>Write Rate measured on Storage Servers</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original"
| rex field=BytesInput "(?&lt;WRate&gt;.*) (?&lt;WRoughness&gt;.*) (?&lt;WCounter&gt;.*)"
| rex field=BytesFetched "(?&lt;FRate&gt;.*) (?&lt;FRoughness&gt;.*) (?&lt;FCounter&gt;.*)"
| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Row: latency panels. All three run the same timechart over a different event Type, averaging the Max, Mean, P95, P99 and P99.9 fields of the events in each bin. Because the values are averaged across events, the maxLatency series is the mean of the per-event Max values, not the largest one. -->
<row>
<!-- GRV latency, from GRVLatencyMetrics events. -->
<panel>
<title>GRV Latency measured on all Proxies</title>
<chart>
<title>Seconds</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original"
| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.placement">bottom</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Commit latency, from CommitLatencyMetrics events. -->
<panel>
<title>Commit Latency measured on all Proxies</title>
<chart>
<title>Seconds</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original"
| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.placement">bottom</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Read latency, from ReadLatencyMetrics events. -->
<panel>
<title>Read Latency measured on all Storage Servers</title>
<chart>
<title>Seconds</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND TrackLatestType="Original"
| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.placement">bottom</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<!-- RateKeeper ReleasedTPS vs LimitTPS: reads RkUpdate or RkUpdateBatch events depending on the dropdown, replaces any field value that is exactly "inf" with 100000000000 so it can be averaged, uses the event Time field as the chart time, and plots the average ReleasedTPS and TPSLimit per bin on a logarithmic Y axis. -->
<panel>
<title>RateKeeper: ReleasedTPS vs LimitTPS</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| eval _time=Time
| table _time ReleasedTPS TPSLimit
| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">251</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- RateKeeper Throttling Reason: plots the raw Reason field of RkUpdate or RkUpdateBatch events against the event Time field as an area chart, with a Y axis major unit of 1. No aggregation is applied; every event contributes one point. -->
<panel>
<title>RateKeeper: Throttling Reason</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| eval _time=Time
| table _time Reason</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisLabelsY.majorUnit">1</option>
<option name="charting.axisY.abbreviation">none</option>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">area</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.mode">standard</option>
<option name="height">249</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- RateKeeper Throttling Server: lists the formatted Time and ReasonServerID of the first 10 RkUpdate events returned by the search, which are the most recent ones given the default newest-first result order. head 10 keeps exactly ten rows, matching the table title. -->
<!-- NOTE(review): this panel always reads Type=RkUpdate and does not follow the Normal or Batch dropdown used by the two RateKeeper charts. Confirm intent. -->
<panel>
<title>RateKeeper: Throttling Server</title>
<table>
<title>Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records)</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original"
| head 10
| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S")
| table DateTime, ReasonServerID</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<!-- Row: disk overhead, logical data size and disk usage. None of these timecharts takes the bin size token. -->
<row>
<!-- Disk Overhead: StorageMetrics and DDTrackerStats events are bucketed into 5 second slots; per slot KvstoreBytesUsed is summed and TotalSizeBytes is averaged, and their ratio is charted. StorageDiskTotalBytes is computed but not used by this panel. Y axis is capped at 10. -->
<panel>
<title>Disk Overhead = Disk Usage / Logical KV Size</title>
<chart>
<title>Y-axis is capped at 10</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original
| bucket _time span=5s
| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time
| eval overhead=StorageDiskUsedBytes/LogicalKVBytes
| timechart avg(overhead)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.maximumNumber">10</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.placement">bottom</option>
</chart>
</panel>
<!-- KV Data Size: from DDTrackerStats events on processes whose Roles contain DD, converts TotalSizeBytes and SystemSizeBytes to GB (divide by 1024 three times) and charts their averages together with the average of Shards. -->
<panel>
<title>KV Data Size</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original
| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024
|timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.placement">bottom</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<!-- Disk Usage: sums KvstoreBytesUsed and KvstoreBytesTotal over StorageMetrics events per 5 second slot, converts both to MB (divide by 1024 twice) and charts their averages. -->
<panel>
<title>Disk Usage</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original
| bucket _time span=5s
| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time
|eval StorageDiskTotalMB = StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024
| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="charting.legend.placement">bottom</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<!-- Cluster Roles: from ProcessMetrics events, HostDC is the first two characters of host (overridden by pie_work_unit when present); Roles is split on commas. The first stats counts distinct Machine values per role and HostDC, the second lists the HostDC values and counts side by side for each role. HostConfig is captured by the rex but not used afterwards. -->
<row>
<panel>
<title>Cluster Roles</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original"
| rex field=host "(?&lt;HostDC&gt;..).*-..(?&lt;HostConfig&gt;..).*"
| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC)
| makemv delim="," Roles
| stats dc(Machine) as MachineCount by Roles, HostDC
| stats list(HostDC), list(MachineCount) by Roles
| sort Roles</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
<!-- Row: storage engine in use and cluster generation over time. -->
<row>
<!-- Storage Engine: shows StorageEngine, OriginalDateTime and DateTime for the first two Role events with Origination=Recruited and As=StorageServer returned by the search. -->
<panel>
<title>Storage Engine</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime |head 2</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<!-- Cluster Generations: charts the maximum Generation field of TLogMetrics events over time; per the chart title, changes in this value indicate recoveries. -->
<panel>
<title>Cluster Generations</title>
<chart>
<title>Indicate FDB recoveries</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics |timechart max(Generation)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
</form>

View File

@ -0,0 +1,928 @@
<form theme="dark">
<label>FoundationDB - RateKeeper (Dev)</label>
<fieldset submitButton="false">
<!-- Dashboard-wide inputs. There is no submit button; each input sets searchWhenChanged. -->
<!-- Index: substituted into the index= term of the panel searches; defaults to * (all indexes). -->
<input type="text" token="Index" searchWhenChanged="true">
<label>Index</label>
<default>*</default>
</input>
<!-- LogGroup: substituted into the LogGroup= term of the panel searches. -->
<!-- NOTE(review): the default is empty, so the searches presumably wait until a value is typed; confirm this is intended. -->
<input type="text" token="LogGroup" searchWhenChanged="true">
<label>LogGroup</label>
<default></default>
</input>
<!-- TimeSpan: time picker whose earliest/latest bound every panel search; defaults to the last 60 minutes. -->
<input type="time" token="TimeSpan" searchWhenChanged="true">
<label>TimeSpan</label>
<default>
<earliest>-60m@m</earliest>
<latest>now</latest>
</default>
</input>
<!-- UpdateRateTypeToken: suffix appended to Type=RkUpdate in the RK panels, giving RkUpdate or RkUpdateBatch. -->
<input type="dropdown" token="UpdateRateTypeToken" searchWhenChanged="true">
<label>RKChart: Normal or Batch</label>
<choice value="">Normal</choice>
<choice value="Batch">Batch</choice>
<default></default>
</input>
<!-- ChartBinSizeToken: used as the span= argument of the timechart commands; defaults to 30s. -->
<input type="text" token="ChartBinSizeToken" searchWhenChanged="true">
<label>Chart Bin Size</label>
<default>30s</default>
</input>
<!-- ChartByMachineToken: appended verbatim to timechart commands; either a "by Machine" clause or nothing. -->
<input type="dropdown" token="ChartByMachineToken" searchWhenChanged="true">
<label>ClusterStateMetric byMachine</label>
<choice value="by Machine">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<!-- RolePerformanceChartToken: role selector labelled for the process performance charts; no default is declared. -->
<input type="dropdown" token="RolePerformanceChartToken" searchWhenChanged="true">
<label>Role for Proc Perf Charts</label>
<choice value="MasterServer">MasterServer</choice>
<choice value="MasterProxyServer">MasterProxyServer</choice>
<choice value="StorageServer">StorageServer</choice>
<choice value="TLog">TLog</choice>
<choice value="Resolver">Resolver</choice>
<choice value="GrvProxyServer">GrvProxyServer</choice>
<choice value="CommitProxyServer">CommitProxyServer</choice>
</input>
<!-- SourcePerfConnectionToken: source role selector labelled for the perf connection charts; no default is declared. -->
<input type="dropdown" token="SourcePerfConnectionToken" searchWhenChanged="true">
<label>Source for Perf Connection</label>
<choice value="MasterServer">MasterServer</choice>
<choice value="MasterProxyServer">MasterProxyServer</choice>
<choice value="Resolver">Resolver</choice>
<choice value="TLog">TLog</choice>
<choice value="StorageServer">StorageServer</choice>
<choice value="GrvProxyServer">GrvProxyServer</choice>
<choice value="CommitProxyServer">CommitProxyServer</choice>
</input>
<!-- DestinationPerfConnectionToken: destination role selector labelled for the perf connection charts; no default is declared. -->
<input type="dropdown" token="DestinationPerfConnectionToken" searchWhenChanged="true">
<label>Dest for Perf Connection</label>
<choice value="MasterServer">MasterServer</choice>
<choice value="MasterProxyServer">MasterProxyServer</choice>
<choice value="Resolver">Resolver</choice>
<choice value="TLog">TLog</choice>
<choice value="StorageServer">StorageServer</choice>
<choice value="GrvProxyServer">GrvProxyServer</choice>
<choice value="CommitProxyServer">CommitProxyServer</choice>
</input>
</fieldset>
<row>
<panel>
<!-- Splits BytesQueried, BytesInput and BytesFetched into three space-separated parts each, sums the first part -->
<!-- across StorageMetrics events per 5s bucket, divides by 1024*1024 and charts the averages over time. -->
<title>Aggregated Storage Server Bandwidth</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original"
| rex field=BytesQueried "(?&lt;RRate&gt;.*) (?&lt;RRoughness&gt;.*) (?&lt;RCounter&gt;.*)"
| rex field=BytesInput "(?&lt;WRate&gt;.*) (?&lt;WRoughness&gt;.*) (?&lt;WCounter&gt;.*)"
| rex field=BytesFetched "(?&lt;FRate&gt;.*) (?&lt;FRoughness&gt;.*) (?&lt;FCounter&gt;.*)"
| bin span=5s _time
| stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time
| eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024
|timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Takes the first space-separated element of each proxy counter, averages it per Machine per minute, -->
<!-- then sums the per-machine averages across machines for each minute. -->
<title>Aggregated Proxy Bandwidth</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes
| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0)
| bin span=60s _time
| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time
| eval WriteThroughputKB=MutationBytesRatePerHost/1000
| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB)</query>
<!-- The eval above is a plain per-row conversion; the cross-machine summing is done by timechart. -->
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Per-machine 30s averages of the first element of TxnRequestIn/Out and TxnStartIn/Out on a log Y axis. -->
<!-- The counters are split on spaces first (as the Aggregated Proxy Bandwidth panel does) so that -->
<!-- mvindex picks a single number instead of returning the whole unsplit string. -->
<title>Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut
| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0)
| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">249</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Averages ReleasedTPS and TPSLimit from RkUpdate (or RkUpdateBatch) events per chart bin, log Y axis. -->
<!-- The literal value inf is replaced with 100000000000 and _time is taken from the event's Time field. -->
<title>Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| eval _time=Time
| table _time ReleasedTPS TPSLimit
| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">251</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Area chart of the raw Reason value of each RkUpdate (or RkUpdateBatch) event against the event's Time field. -->
<!-- No aggregation is applied; every event contributes one point. -->
<title>Chart 3: RKOverview - RKLimitReason</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| eval _time=Time
| table _time Reason</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisLabelsY.majorUnit">1</option>
<option name="charting.axisY.abbreviation">none</option>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">area</option>
<option name="charting.drilldown">none</option>
<option name="height">249</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Counts RkSSListFetchTimeout events per second. -->
<title>Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="RkSSListFetchTimeout"
| timechart span=1s count</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Counts RkTlogMinFreeSpaceZero events per second. -->
<title>Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="RkTlogMinFreeSpaceZero"
| timechart span=1s count</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Counts events whose Type starts with ProxyGRVThresholdExceeded per second, one series per Type. -->
<title>Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original"
| timechart span=1s count by Type</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Average LimitingStorageServerDurabilityLag from RkUpdate (or RkUpdateBatch) events per chart bin; inf is replaced with 100000000000. -->
<title>Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average LimitingStorageServerVersionLag from RkUpdate (or RkUpdateBatch) events per chart bin; inf is replaced with 100000000000. -->
<title>Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average LimitingStorageServerQueue from RkUpdate (or RkUpdateBatch) events per chart bin; inf is replaced with 100000000000. -->
<title>Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original"
| replace inf with 100000000000
| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue)</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Average of Version minus DurableVersion from StorageMetrics events per chart bin, optionally split by Machine. -->
<title>Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
| eval NonDurableVersions=Version-DurableVersion
| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">251</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average LocalRate from StorageMetrics events, optionally split by Machine. -->
<!-- NOTE(review): unlike the neighbouring StorageMetrics charts this query sets no span and no TrackLatestType filter; confirm that is deliberate. -->
<title>Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -&gt; lower LocalRate)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics"
| timechart limit=0 avg(LocalRate) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average ReadsRejected from StorageMetrics events, optionally split by Machine. -->
<!-- NOTE(review): unlike the neighbouring StorageMetrics charts this query sets no span and no TrackLatestType filter; confirm that is deliberate. -->
<title>Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -&gt; higher probability of rejecting read)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics"
| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Average VersionLag (aliased as SSFallBehindVersions) from StorageMetrics events per chart bin, optionally split by Machine. -->
<title>Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
| eval SSFallBehindVersions=VersionLag
| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Splits the storage byte counters on spaces, takes the third element of each, and charts the average of -->
<!-- BytesInput minus BytesDurable per chart bin. BytesFetched and MutationBytes are extracted but not charted. -->
<title>Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes
| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable
| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Average of KvstoreBytesFree divided by KvstoreBytesTotal from StorageMetrics events per chart bin, optionally split by Machine. -->
<title>Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal
| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average of QueueDiskBytesFree divided by QueueDiskBytesTotal from TLogMetrics events per chart bin, optionally split by Machine. -->
<title>Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal
| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average of KvstoreBytesFree divided by KvstoreBytesTotal from TLogMetrics events per chart bin, optionally split by Machine. -->
<title>Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal
| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Splits BytesInput and BytesDurable of TLogMetrics events on spaces, takes the third element of each, -->
<!-- and charts the average difference per chart bin on a log Y axis, optionally split by Machine. -->
<title>Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
| makemv delim=" " BytesInput
| makemv delim=" " BytesDurable
| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Averages a fixed list of proxy transaction and commit fields per chart bin on a log Y axis, optionally split by Machine. -->
<!-- NOTE(review): the fields are averaged as-is, whereas Chart 1 and the Aggregated Proxy Bandwidth panel take one -->
<!-- element of TxnRequestIn and friends via mvindex. If these fields hold space-separated triples, avg presumably -->
<!-- yields nothing here; verify against real events. -->
<title>Chart 19: Runtime Monitoring - Proxy Throughput</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original"
| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Averages every field whose name contains QueueSize from proxy metrics events per chart bin, optionally split by Machine. -->
<title>Chart 20: Runtime Monitoring - Proxy Queue Length</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Average of PersistentDataDurableVersion minus QueuePoppedVersion from TLogMetrics events per chart bin, log Y axis, optionally split by Machine. -->
<title>Chart 21: Runtime Monitoring - TLog UnpoppedVersion</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original"
| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion
| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Average AIODiskStall from ProcessMetrics events, restricted by a join on Machine to machines that -->
<!-- appear in Role events with As=StorageServer. The subsearch reduces those events to one row per Machine. -->
<title>Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics"
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer"
| stats first(Machine) by Machine
| rename first(Machine) as Machine
| table Machine]
| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Splits QueryQueue into a multivalue field (makemv without an explicit delim), takes its second element, -->
<!-- and charts the average per chart bin on a log Y axis, optionally split by Machine. -->
<!-- NOTE(review): other panels read index 0 or 2 of such fields; confirm index 1 is the intended element here. -->
<title>Chart 23: Runtime Monitoring - StorageServer Query Queue Length</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original"
| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine
| timechart span=$ChartBinSizeToken$ limit=0 avg(QueryQueue) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Histogram of GRV latency. TransactionDebug events from the proxy stages are combined with -->
<!-- queueTransactionStartRequests.Before events whose ID is remapped through TransactionAttachID (ID to To). -->
<!-- Events are sorted by Time and grouped per ID; the latency is the last Time minus the first Time, -->
<!-- which is then binned and counted, optionally split by Machine. -->
<!-- NOTE(review): the first search term matches only masterProxyServerCore.Broadcast, while Chart 27 uses a -->
<!-- wildcard for the ProxyServerCore part; confirm whether this panel should match the same set. -->
<title>Chart 24: Transaction Trace Stats - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<input type="dropdown" token="GRVByMachineStatsToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="Machine">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<input type="text" token="StatsGRVSpanToken" searchWhenChanged="true">
<label>Span</label>
<default>500ms</default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After)
| table Time Type ID Location Machine Roles
| append
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before)
| rename ID as ParentID
| table Time Type ParentID Location Machine Roles
| join ParentID
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID"
| rename ID as ParentID
| rename To as ID
| table ParentID ID]
| table Time Type ID Location Machine Roles]
| table Time Type ID Location Machine Roles
| sort 0 Time
| table Machine Location Time Roles Type ID
| stats list(*) by ID
| rename list(*) as *
| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin
| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan
| chart limit=0 count by TimeSpan $GRVByMachineStatsToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">column</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Histogram of GetValue latency. Events matching the listed trace locations are given an Order rank, -->
<!-- sorted by Time then Order, and grouped per ID. IDs with more than two events are kept; the latency is -->
<!-- the last Time minus the first Time, which is then binned and counted, optionally split by Machine. -->
<title>Chart 25: Transaction Trace Stats - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<input type="dropdown" token="GetValueByMachineStatsToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="Machine">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<input type="text" token="StatsReadSpanToken" searchWhenChanged="true">
<label>Span</label>
<default>500ms</default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After)
| table Machine Location Time Roles ID Type
| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10)
| sort 0 Time Order
| stats list(*) by ID
| rename list(*) as *
| table Machine Location Time Roles ID Type
| eval count = mvcount(Location)
| search count&gt;2
| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin
| table _time ID TimeSpan Machine Location Time
| bin bins=20 span=$StatsReadSpanToken$ TimeSpan
| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">column</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Histogram of commit latency. CommitDebug events from the proxy commitBatch stages are sorted by Time and -->
<!-- grouped per ID. IDs with at least two events are kept; the latency is the last Time minus the first Time, -->
<!-- which is then binned and counted, split by Machine unless the dropdown is set to No. -->
<title>Chart 26: Transaction Trace Stats - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<!-- searchWhenChanged is set explicitly so this dropdown behaves like every other input on the dashboard. -->
<input type="dropdown" token="CommitByMachineStatsToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="Machine">Yes</choice>
<choice value="">No</choice>
<default>Machine</default>
</input>
<input type="text" token="StatsCommitSpanToken" searchWhenChanged="true">
<label>Span</label>
<default>500ms</default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution)
| table Time Type ID Location Machine Roles
| sort 0 Time
| table Machine Location Time Roles Type ID
| stats list(*) by ID
| rename list(*) as *
| eval Count=mvcount(Location)
| search Count&gt;=2
| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin
| table _time TimeSpan Machine
| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan
| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$</query>
<!-- _time is taken from TBegin, the first Time of the group; this query defines no T1 field. -->
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">column</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Time series of GRV latency broken into stages. TransactionDebug events from the proxy stages are combined -->
<!-- with queueTransactionStartRequests.Before events whose ID is remapped through TransactionAttachID, ranked -->
<!-- with Order, sorted, and grouped per ID. Stage durations are differences between consecutive Time values. -->
<!-- Count holds the number of events in the group: with four events all three stages are reported, otherwise -->
<!-- TimeGetVersionFromProxies is set to the -0.0000001 placeholder and the remaining stages use three events. -->
<!-- The Broadcast ordering case uses the same wildcard shape as the search term so every matched Broadcast -->
<!-- event receives an Order value. -->
<title>Chart 27: Transaction Tracing - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<input type="dropdown" token="GRVLatencyByMachineToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="by Machine">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After)
| table Time Type ID Location Machine Roles
| append
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before)
| rename ID as ParentID
| table Time Type ParentID Location Machine Roles
| join ParentID
[ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID"
| rename ID as ParentID
| rename To as ID
| table ParentID ID]
| table Time Type ID Location Machine Roles]
| table Time Type ID Location Machine Roles
| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location like "%ProxyServer.%ProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6)
| table Time Order Type ID Location Machine Roles
| sort 0 Order Time
| table Machine Location Time Roles Type ID
| stats list(*) by ID
| rename list(*) as *
| eval Count=mvcount(Time), T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(Count==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(Count==4, T4-T3, T3-T2), TimeSpan=if(Count==4,T4-T1,T3-T1), _time=T1
| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine
| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Time series of GetValue latency. Events matching the listed trace locations are ranked with Order, sorted by -->
<!-- Time then Order, and grouped per ID. IDs with more than two events are kept; the latency is the last Time -->
<!-- minus the first Time, averaged per chart bin. -->
<!-- Machine is carried through the final table so that the By Machine option has a field to split on, and the -->
<!-- bin width follows the dashboard's Chart Bin Size input (default 30s) like the neighbouring tracing charts. -->
<title>Chart 28: Transaction Tracing - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<input type="dropdown" token="GetValueLatencyByMachineToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="by Machine">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After)
| table Machine Location Time Roles ID Type
| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10)
| sort 0 Time Order
| stats list(*) by ID
| rename list(*) as *
| table Machine Location Time Roles ID Type
| eval count = mvcount(Location)
| search count&gt;2
| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin
| table _time TimeSpan Machine
| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Time series of commit latency broken into stages. CommitDebug events from the proxy commitBatch stages are -->
<!-- ranked with Order, sorted by Time then Order, and grouped per ID. Only IDs with exactly seven events are kept; -->
<!-- T1 to T7 are the seven Time values and the stage durations are differences between them, averaged per chart bin. -->
<title>Chart 29: Transaction Tracing - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<input type="dropdown" token="CommitByMachineToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="By Machine">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution)
| table Time Type ID Location Machine Roles
| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16)
| table Time Order Type ID Location Machine Roles
| sort 0 Time Order
| table Machine Location Time Roles Type ID
| stats list(*) by ID
| rename list(*) as *
| eval Count=mvcount(Location)
| search Count=7
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1
| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine
| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Chart 30: average duration of the Resolver and TLog legs of the commit path, per CommitDebug ID.
     Pipeline: select CommitDebug events mentioning one of the four Resolver.resolveBatch.* or four
     TLog.tLogCommit.* locations, sort by Time (Order breaks ties), gather each ID's events with stats list,
     keep only IDs with exactly four events, and chart Duration = last Time minus first Time.
     Step is "TimeTLogCommit" or "TimeResolver" depending on whether the first Location starts with TLog or Resolver.
     NOTE(review): the MasterProxyServer / NativeAPI branches of the Order case() and the Count=10 branch of Step
     look unreachable here (the base search names only Resolver and TLog locations, and "search Count=4" follows);
     confirm before removing.
     The dropdown picks the split-by field: Step, or MachineStep (Step plus the first Machine of the group).
     ChartBinSizeToken and TimeSpan are defined outside this panel. -->
<title>Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace)</title>
<input type="dropdown" token="TLogResolverByMachineToken" searchWhenChanged="true">
<label>By Machine</label>
<choice value="MachineStep">Yes</choice>
<choice value="Step">No</choice>
<default>Step</default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After)
| table Time Type ID Location Machine Roles
| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16)
| table Time Order Type ID Location Machine Roles
| sort 0 Time Order
| table Machine Location Time Roles Type ID
| stats list(*) by ID
| rename list(*) as *
| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime
| search Count=4
| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei
| table _time Step Duration Machinei Location Machine MachineStep
| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Chart 31: average CPU utilization, computed per ProcessMetrics event as CPUSeconds / Elapsed.
     Only events with TrackLatestType="Original" are used. The join (inner by default) keeps only
     machines that also have a Role event whose As field matches RolePerformanceChartToken.
     RolePerformanceChartToken, ChartByMachineToken, ChartBinSizeToken and TimeSpan are defined outside this panel. -->
<title>Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
| stats first(Machine) by Machine
| rename first(Machine) as Machine
| table Machine]
| eval Utilization=CPUSeconds/Elapsed
| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Chart 32: average memory utilization, computed per ProcessMetrics event as ResidentMemory / Memory.
     Only events with TrackLatestType="Original" are used. The join (inner by default) keeps only
     machines that also have a Role event whose As field matches RolePerformanceChartToken.
     RolePerformanceChartToken, ChartByMachineToken, ChartBinSizeToken and TimeSpan are defined outside this panel. -->
<title>Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
| stats first(Machine) by Machine
| rename first(Machine) as Machine
| table Machine]
| eval Utilization = ResidentMemory/Memory
| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">linear</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Chart 33: average disk utilization, computed per ProcessMetrics event as
     (DiskTotalBytes - DiskFreeBytes) / DiskTotalBytes.
     Only events with TrackLatestType="Original" are used. The join (inner by default) keeps only
     machines that also have a Role event whose As field matches RolePerformanceChartToken.
     RolePerformanceChartToken, ChartByMachineToken, ChartBinSizeToken and TimeSpan are defined outside this panel. -->
<title>Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
| stats first(Machine) by Machine
| rename first(Machine) as Machine
| table Machine]
| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes
| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Chart 34: average MbpsReceived and MbpsSent from ProcessMetrics events, on a log-scale Y axis.
     Only events with TrackLatestType="Original" are used. The join (inner by default) keeps only
     machines that also have a Role event whose As field matches RolePerformanceChartToken.
     RolePerformanceChartToken, ChartByMachineToken, ChartBinSizeToken and TimeSpan are defined outside this panel. -->
<title>Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
| stats first(Machine) by Machine
| rename first(Machine) as Machine
| table Machine]
| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.axisY.scale">log</option>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Chart 35: average DiskReadsCount and DiskWritesCount from ProcessMetrics events.
     Only events with TrackLatestType="Original" are used. The join (inner by default) keeps only
     machines that also have a Role event whose As field matches RolePerformanceChartToken.
     RolePerformanceChartToken, ChartByMachineToken, ChartBinSizeToken and TimeSpan are defined outside this panel. -->
<title>Chart 35: Machine Performance - Disk (Reads Count and Writes Count)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original"
| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$
| stats first(Machine) by Machine
| rename first(Machine) as Machine
| table Machine]
| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!-- Chart 36: count of ConnectionTimedOut / ConnectionTimeout events over time.
     A trailing ":tls" is stripped from PeerAddr so it can be matched against Machine values.
     First join: keep events whose Machine has a Role event matching SourcePerfConnectionToken.
     Second join: keep events whose PeerAddr equals the Machine of a Role event matching
     DestinationPerfConnectionToken (Machine is renamed to PeerAddr inside the subsearch).
     Connection is "Machine-PeerAddr"; the dropdown appends "By Connection" to split the chart by it.
     SourcePerfConnectionToken, DestinationPerfConnectionToken, ChartBinSizeToken and TimeSpan are
     defined outside this panel. -->
<title>Chart 36: Network Performance - Timeout</title>
<input type="dropdown" token="TimeoutByConnectionToken" searchWhenChanged="true">
<label>By Connection</label>
<choice value="By Connection">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
| replace *:tls with * in PeerAddr
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($SourcePerfConnectionToken$))
| dedup ID]
| join PeerAddr
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($DestinationPerfConnectionToken$))
| dedup ID
| rename Machine as PeerAddr]
| eval Connection=Machine."-".PeerAddr
| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
<panel>
<!-- Chart 37: average MeanLatency and MaxLatency of PingLatency events over time.
     A trailing ":tls" is stripped from PeerAddr so it can be matched against Machine values.
     First join: keep events whose Machine has a Role event matching SourcePerfConnectionToken.
     Second join: keep events whose PeerAddr equals the Machine of a Role event matching
     DestinationPerfConnectionToken (Machine is renamed to PeerAddr inside the subsearch).
     Connection is "Machine-PeerAddr"; the dropdown appends "By Connection" to split the chart by it.
     SourcePerfConnectionToken, DestinationPerfConnectionToken, ChartBinSizeToken and TimeSpan are
     defined outside this panel. -->
<title>Chart 37: Network Performance - PingLatency</title>
<input type="dropdown" token="PingLatencyByConnectionToken" searchWhenChanged="true">
<label>By Connection</label>
<choice value="By Connection">Yes</choice>
<choice value="">No</choice>
<default></default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type=PingLatency)
| replace *:tls with * in PeerAddr
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($SourcePerfConnectionToken$))
| dedup ID]
| join PeerAddr
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($DestinationPerfConnectionToken$))
| dedup ID
| rename Machine as PeerAddr]
| eval Connection=Machine."-".PeerAddr
| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$</query>
<earliest>$TimeSpan.earliest$</earliest>
<latest>$TimeSpan.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
</form>

View File

@ -0,0 +1,873 @@
<form theme="dark">
<label>FoundationDB - Long Recovery (Dev)</label>
<fieldset submitButton="false" autoRun="false"></fieldset>
<row>
<panel>
<!-- Table 1: overview of master recoveries, one row per (master ID, Machine, terminated-or-not) group.
     Inputs: this panel has its own Index / LogGroup / time tokens, separate from the tokens used by the
     tables below it. The time picker defaults to "-0s .. now", i.e. an empty window.
     NOTE(review): presumably so nothing expensive runs until a span is chosen; confirm.
     Pipeline: pick MasterRecoveryState (three statuses), MasterServer Role Begin/End and MasterTerminated
     events, drop Rolled duplicates and rows whose ErrorDescription is "Success", map each event to an
     EventType, then list the events per group newest-first. Duration is newest Time minus oldest Time.
     "sort 0" is used so the sort is not silently capped at the command's default of 10000 results. -->
<title>Table 1: Find long recovery (Input Index and LogGroup and Select a time span).</title>
<input type="text" token="IndexForOverview" searchWhenChanged="true">
<label>Index</label>
<default>*</default>
</input>
<input type="text" token="LogGroupForOverview" searchWhenChanged="true">
<label>LogGroup</label>
<default></default>
</input>
<input type="time" token="time_token_for_recoveryhistorytable" searchWhenChanged="true">
<label>Select a time span</label>
<default>
<earliest>-0s</earliest>
<latest>now</latest>
</default>
</input>
<table>
<search>
<query>index=$IndexForOverview$ LogGroup=$LogGroupForOverview$
((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup
| search NOT ErrorDescription="Success"
| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits")
| table ID Machine EventType DateTime Time ErrorDescription LogGroup
| fillnull value="-"
| sort 0 -Time
| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0)
| stats list(*) by ID Machine ifMasterTerminatedEvent
| rename list(*) as *
| table ID Machine EventType DateTime Time ErrorDescription LogGroup
| sort 0 -Time
| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime
| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup</query>
<earliest>$time_token_for_recoveryhistorytable.earliest$</earliest>
<latest>$time_token_for_recoveryhistorytable.latest$</latest>
</search>
<option name="count">15</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 2: every recovery attempt in the selected window, one row per (MasterID, Machine).
     The Index, LogGroup and time inputs declared here drive all following tables and charts.
     The time token keeps its historical spelling "ReoveryTime" because other panels reference it by
     that name; only the user-visible label is spelled correctly.
     Pipeline: collect recovery-related events (excluding Rolled duplicates), normalise Status per event
     Type (RecoveryInternal becomes Status/Step; reply and recruit events become
     initializing_transaction_servers/...), force StatusCode to "8" for those reply and recruit events,
     then list the events per master newest-first. Duration is newest Time minus oldest Time.
     "sort 0" is used so sorting is not silently capped at the command's default of 10000 results. -->
<title>Table 2: Select timespan containing the long recovery and see all recovery attempts in the time span (The input Index and LogGroup and Timespan are for all following tables and charts)</title>
<input type="text" token="Index" searchWhenChanged="true">
<label>Index</label>
<default>*</default>
</input>
<input type="text" searchWhenChanged="true" token="LogGroup">
<label>LogGroup</label>
</input>
<input type="time" token="ReoveryTime" searchWhenChanged="true">
<label>RecoveryTimeSpan</label>
<default>
<earliest>-0s@s</earliest>
<latest>now</latest>
</default>
</input>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled")
| rename ID as MasterID
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode
| fillnull value="-" ErrorDescription Reason ErrorCode
| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode)
| fillnull value="-" StatusCode
| sort 0 -Time -StatusCode
| stats list(*) by MasterID Machine
| rename list(*) as *
| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime
| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time
| sort 0 -MyRecoveryCount
| fillnull value="-" MyRecoveryCount</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">3</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
<option name="wrap">false</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 3: WaitFailureClient events, i.e. machine A (Machine) reporting that an endpoint on machine B
     (FailedEndpoint) failed. A trailing ":tls" is stripped from FailedEndpoint before joining.
     First left join (on Machine): Role End events on the detecting machine, giving ID, Role and EndTime.
     Second left join (on FailedEndpoint): for the failed machine, the latest Role event per role ID,
     summarised as "As/ID/TypeTransition/time" and listed per endpoint.
     Both joins are type=left, so WaitFailureClient rows without a match are kept. -->
<title>Table 3: Why recovery is triggered? Using WaitFailureClient event. Machine A detects Machine B's failure. First column is the time when WaitFailureClient happens. Columns of 2,3,4,5 are for A. Columns of 6,7 are for B.</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="WaitFailureClient"
| table Type Time Machine FailedEndpoint
| replace *:tls with * in FailedEndpoint
| join Machine type=left
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End"
| eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| rename As as Role
| table ID EndTime Machine Role]
| join FailedEndpoint type=left
[ search index=$Index$ LogGroup=$LogGroup$ Type="Role"
| stats latest(*) by ID | rename latest(*) as *
| rename Machine as FailedEndpoint
| eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| stats list(*) by FailedEndpoint
| rename list(*) as *
| table FailedEndpoint FailedEndpointLatestRoleEventInfo]
| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| makemv delim=" " FailedEndpointLatestRoleEventInfo
| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="wrap">false</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 4: MasterRecoveredConfig events (TrackLatestType="Original" only), shown as an event list.
     The Conf field has each literal quote entity (ampersand + "quot;") replaced by a double-quote
     character, and the result is shown as the event's raw text. -->
<title>Table 4: New Recruitment Configuration (using MasterRecoveredConfig event)</title>
<event>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="MasterRecoveredConfig" AND TrackLatestType="Original"
| eval Configuration=replace(Conf, "&amp;quot;", "\"")
| rename Configuration as _raw</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="list.drilldown">none</option>
<option name="refresh.display">progressbar</option>
</event>
</panel>
</row>
<row>
<panel>
<!-- Table 5: one row per distinct DCID seen in ProcessMetrics events, shown as DataCenterID together
     with the pie_work_unit field of the retained event; missing values are rendered as "-". -->
<title>Table 5: Data Centers (using ProcessMetrics event)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type=ProcessMetrics
| dedup DCID
| rename DCID as DataCenterID
| table DataCenterID pie_work_unit
| fillnull value="-"</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<panel>
<!-- Table 6: number of newly started roles per (Role, DataCenter).
     Role events for the listed role kinds are counted per ID, ignoring Rolled and Refresh events;
     an ID is kept only if it has exactly one such event in the window and that event is a Begin.
     DataCenter comes from a left join on Machine against ProcessMetrics (DCID); unmatched machines
     get the literal string "null". -->
<title>Table 6: New Role (using Role event joined by ProcessMetrics event)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh"))
| eventstats count by ID
| rename As as Role
| search count=1 AND Transition="Begin"
| table ID Role Machine
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| table ID Role Machine DataCenter
| fillnull value="null" DataCenter
| stats count by Role DataCenter</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 7: per-role detail rows (ID, Role, Machine, DataCenter, DateTime), newest first.
     The multiselect builds an OR-ed list of As="..." terms from the chosen roles.
     The Begin/End dropdown supplies the filter applied after counting non-Rolled, non-Refresh Role
     events per ID: exactly one event that is a Begin, exactly one event that is an End, or exactly
     two events (the Begin then End choice).
     DataCenter comes from a left join on Machine against ProcessMetrics (DCID); unmatched machines
     get the literal string "null". -->
<title>Table 7: Role Details</title>
<input type="multiselect" token="RolesToken" searchWhenChanged="true">
<label>Roles</label>
<choice value="MasterServer">MasterServer</choice>
<choice value="TLog">TLog</choice>
<choice value="Resolver">Resolver</choice>
<choice value="MasterProxyServer">MasterProxyServer (for &lt;7.0)</choice>
<choice value="LogRouter">LogRouter</choice>
<choice value="CommitProxyServer">CommitProxyServer (for 7.0+)</choice>
<choice value="GrvProxyServer">GrvProxyServer (for 7.0+)</choice>
<valuePrefix>As="</valuePrefix>
<valueSuffix>"</valueSuffix>
<delimiter> OR </delimiter>
</input>
<input type="dropdown" token="RoleDetailTableWhichRoleToken" searchWhenChanged="true">
<label>Begin/End</label>
<choice value="count=1 AND Transition=&quot;Begin&quot;">Begin</choice>
<choice value="count=1 AND Transition=&quot;End&quot;">End</choice>
<choice value="count=2">Begin-&gt;End</choice>
<default>count=1 AND Transition="Begin"</default>
</input>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh"))
| eventstats count by ID
| rename As as Role
| search $RoleDetailTableWhichRoleToken$
| table ID Role Machine Time
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| table ID Role Machine DataCenter Time
| fillnull value="null" DataCenter
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table ID Role Machine DataCenter DateTime
| sort 0 -DateTime</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 8: the six RecruitFromConfiguration* / RecruitRemoteFromConfiguration* warning and error
     event types. GoodRecruitmentTimeReady is set to "True" for the NotAvailable events, kept as
     logged for the Retry events, and shown as "-" for the Error events. -->
<title>Table 8: CC Recruitment SevWarn OR SevError (use events in clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError"
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-")
| table Type GoodRecruitmentTimeReady Time DateTime</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 9: RecoveryCount of the TLog identified by the row.TLogID token (set by the Table 11 drilldown).
     Reads TLogStart events (matched on ID) and TLogPersistentStateRestore events (matched on LogId) and
     unifies the two identifiers into ID.
     Unlike the neighbouring tables, this search uses a fixed window of the last 7 days rather than
     the recovery time picker. Missing values are replaced by an explanatory message. -->
<title>Table 9: RecoveryCount of the selected TLog (in Table 11)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore")
| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table ID RecoveryCount Type DateTime | fillnull value="Not found. The fdb version is somewhat old."</query>
<earliest>-7d@h</earliest>
<latest>now</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<panel>
<!-- Table 10: the distinct roles that the TLog selected in Table 11 (row.TLogID token) interacted with.
     Each matching event is encoded as a space-separated string "Time Type PeerID" in TLogEvents, where
     PeerID is the Master field for TLogRejoining, the ID field for the JoinedMe / RejoinSlow / Lock
     events, and the placeholder "Null" for Stop and Role events. The strings are listed per TLogID,
     expanded again, split on spaces, and the third token becomes MasterID.
     "search NOT MasterID="NULL"" removes the placeholder rows; it matches "Null" because the search
     command compares field values case-insensitively.
     The ignore flag drops a TLog whose only event is a Role End. The pattern is "% RoleEnd %" because
     like() must match the whole string and every entry carries a trailing PeerID token.
     "sort 0" is used so the sort is not silently capped at the command's default of 10000 results.
     The remaining peer IDs are left-joined to their latest Role event to show Machine and As. -->
<title>Table 10: Which roles the selected TLog (in Table 11) talks to</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled")
| sort 0 -Time
| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null")
| stats list(*) by TLogID
| rename list(*) As *
| table TLogID TLogEvents
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd %"), 1, 0)
| search ignore=0
| sort TLogID
| table TLogID TLogEvents
| mvexpand TLogEvents
| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2)
| fields - temp - TLogEvents
| sort 0 -Time
| search NOT MasterID="NULL"
| dedup MasterID
| rename MasterID as ID
| join type=left ID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role")
| sort 0 -Time
| dedup ID
| table ID Machine As]
| table ID Machine As | fillnull value="null" Machine As</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!-- Table 11: one row per TLog that produced a lifecycle event in the selected window.
     The text input gates the search: the query does not run until a value (normally *) is entered,
     because the token is part of the base search.
     Clicking a cell stores the clicked row's first column (TLogID) in the row.TLogID token, which the
     per-TLog detail tables read.
     Main search: encode each event as "Time Type ... null", list per TLogID newest-first, and derive
     EarliestTime and Duration from the oldest and newest entries.
     Left joins, in order:
       1. latest Role event of the TLog, for Machine / host / LogGroup;
       2. ProcessMetrics, for DataCenter (DCID);
       3. rejoin / lock / stop events, to count the distinct peer role IDs per TLog (Roles);
       4. the sorted list of event type names seen for the TLog (TLogEvents);
       5. TLogStart / TLogPersistentStateRestore, for RecoveryCount.
     The ignore flag drops a TLog whose only event is a Role End. like() must match the whole string,
     so the encoded forms use "% RoleEnd %" (entries carry a trailing token) and subsearch 4, whose
     entries are bare type names, compares against "RoleEnd" directly.
     "sort 0" is used so sorts are not silently capped at the command's default of 10000 results. -->
<title>Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span)</title>
<input type="text" token="SeeLogEventDetailTableToken" searchWhenChanged="true">
<label>Input * to do search</label>
</input>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR
((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$
| sort 0 -Time
| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."null")
| stats list(TLogEvents) by TLogID
| rename list(TLogEvents) As TLogEvents
| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0)
| table TLogID TLogEvents EarliestEvent LatestEvent
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd %"), 1, 0)
| search ignore=0
| sort TLogID
| join type=left TLogID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND As="TLog")
| sort 0 -Time
| dedup ID
| rename ID as TLogID
| table TLogID host LogGroup Machine]
| table TLogID Machine LogGroup host EarliestEvent LatestEvent
| fillnull value="null" Machine host LogGroup
| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime
| table TLogID Machine EarliestTime Duration LogGroup host
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| fillnull value="null" DataCenter
| table TLogID Machine DataCenter EarliestTime Duration host LogGroup
| join type=left TLogID
[ search index=$Index$ LogGroup=$LogGroup$
((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled")
| sort 0 -Time
| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null")
| stats list(*) by TLogID
| rename list(*) As *
| table TLogID TLogEvents
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd %"), 1, 0)
| search ignore=0
| sort TLogID
| table TLogID TLogEvents
| mvexpand TLogEvents
| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2)
| fields - temp - TLogEvents
| sort 0 -Time
| search NOT RoleID="NULL"
| table TLogID RoleID MasterMachine
| stats list(*) by TLogID
| rename list(*) as *
| streamstats count
| mvexpand RoleID
| dedup count RoleID
| fields - count
| stats count by TLogID
| rename count as Roles
| table TLogID Roles]
| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup
| join type=left TLogID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR
((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled"))
| sort 0 -Time
| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type)
| sort 0 TLogEvents
| stats list(TLogEvents) by TLogID
| rename list(TLogEvents) As TLogEvents
| table TLogID TLogEvents
| eval ignore = if(mvcount(TLogEvents)==1 AND mvindex(TLogEvents, 0)=="RoleEnd", 1, 0)
| search ignore=0
| mvcombine delim=" " TLogEvents
| table TLogID TLogEvents]
| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup
| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup
| join type=left TLogID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="TLogStart") OR (Type="TLogPersistentStateRestore")
| eval TLogID=if(Type="TLogStart", ID, LogId)
| table TLogID RecoveryCount]
| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup
| fillnull value="TLog too old, click and see details" RecoveryCount</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">cell</option>
<option name="wrap">false</option>
<drilldown>
<set token="row.TLogID">$click.value$</set>
</drilldown>
</table>
</panel>
<panel>
<!-- Table 12: chronological event detail (newest first) for the TLog selected in Table 11
     (row.TLogID token).
     Each matching event is encoded as four space-separated tokens "Time Type ToID Origination":
     ToID is the Master field for TLogRejoining, the ID field for JoinedMe / RejoinSlow / Lock events,
     and "-" otherwise; Origination is filled only for Role Begin events.
     The strings are listed per TLogID, expanded, split on spaces, and the peer ToID is left-joined
     to its Role event to show ToRole and ToMachine.
     The ignore flag drops a TLog whose only event is a Role End. The pattern is "% RoleEnd %" because
     like() must match the whole string and every entry carries trailing tokens.
     The second join names ToID explicitly instead of relying on implicit common-field matching.
     "sort 0" is used so the sort is not silently capped at the command's default of 10000 results. -->
<title>Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR
((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled")
| sort 0 -Time
| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." "."-")
| stats list(*) by TLogID
| rename list(*) As *
| table TLogID TLogEvents
| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd %"), 1, 0)
| search ignore=0
| sort TLogID
| join type=left TLogID
[ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$)
| dedup ID
| rename ID as TLogID
| table TLogID Machine]
| table TLogID Machine TLogEvents
| fillnull value="-" Machine
| mvexpand TLogEvents
| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3)
| fields - temp - TLogEvents
| join type=left ToID
[ search index=$Index$ LogGroup=$LogGroup$ (Type="Role")
| dedup ID
| rename ID as ToID
| rename As as ToRole
| rename Machine as ToMachine
| table ToID ToRole ToMachine]
| sort 0 -Time
| fillnull value="-" ToRole ToMachine
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table TLogID Machine Event DateTime ToID ToRole ToMachine Time DateTime</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">14</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
<option name="wrap">false</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Table 13: TLogPoppedTag events of the TLog whose ID is held in the row.TLogID token.
Tags is shown as UnpoppedRecoveredTagCount, Tag as TagPopped and DurableKCVer as DurableKnownCommittedVersion;
rows with TagPopped "-1:2" are excluded.
Each popped tag is left joined to the latest StorageMetrics event per Machine that carries the same Tag (giving the
storage server ID and Machine), and that Machine is then left joined to ProcessMetrics to obtain its DCID as
SSDataCenter. Unmatched values are filled with "-".
Join options are written before the join field, as in the other joins of this dashboard.
-->
<title>Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(ID=$row.TLogID$ AND Type="TLogPoppedTag")
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| rename ID as TLogID
| rename Tags as UnpoppedRecoveredTagCount
| rename Tag as TagPopped
| rename DurableKCVer as DurableKnownCommittedVersion
| search TagPopped!="-1:2"
| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt
| sort 0 -UnpoppedRecoveredTagCount
| join type=left TagPopped
[ search index=$Index$ LogGroup=$LogGroup$
(Type="StorageMetrics")
| stats latest(*) by Machine
| rename latest(*) as *
| rename Tag as TagPopped
| table TagPopped ID Machine]
| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| rename ID as SSID
| rename Machine as SSMachine
| rename DataCenter as SSDataCenter
| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt
| fillnull value="-"</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
<option name="wrap">false</option>
</table>
</panel>
<panel>
<!--
Table 14: every tag listed in the AllTags field (comma separated) of the TLogReady event of the TLog whose ID is
held in the row.TLogID token, one row per tag.
Each tag is left joined to the latest StorageMetrics event per Machine carrying the same Tag (storage server ID and
Machine), and that Machine is left joined to ProcessMetrics to obtain its DCID as SSDataCenter.
Unmatched values are filled with "-" and the tag "-1:2" is excluded from the result.
No DateTime is computed here because the first table command keeps only TLogID, Type, AllTags and Locality.
-->
<title>Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(ID=$row.TLogID$ AND Type="TLogReady")
| rename ID as TLogID
| table TLogID Type AllTags Locality
| makemv delim="," AllTags
| mvexpand AllTags
| rename AllTags as Tag | sort 0 Tag
| join type=left Tag
[ search index=$Index$ LogGroup=$LogGroup$
(Type="StorageMetrics")
| stats latest(*) by Machine
| rename latest(*) as *
| table Tag ID Machine]
| table TLogID Tag ID Machine
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| fillnull value="-"
| table TLogID Tag ID Machine DataCenter
| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter
| search Tag!="-1:2"</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Table 15: compares two tag sets for the TLog whose ID is held in the row.TLogID token, using "set diff":
  first subsearch:  the AllTags field of its TLogReady event, split on "," into one Tag per row;
  second subsearch: the Tag field of its TLogPoppedTag events.
Both subsearches end with "table Tag" so that rows are compared on that single column.
NOTE(review): the title warns that a result containing "..." is wrong; presumably AllTags can be truncated in the
trace event. Confirm against the trace output.
-->
<title>Table 15: The Tags of the selected TLog (in Table 11) that are not popped by SSes (using set diff tags in Table 13 and Table 14) (if result contains "...", the result of Table 15 is wrong)</title>
<table>
<search>
<query>| set diff
[ search index=$Index$ LogGroup=$LogGroup$
(ID=$row.TLogID$ AND Type="TLogReady")
| table AllTags
| makemv delim="," AllTags
| mvexpand AllTags
| rename AllTags as Tag
| table Tag]
[ search index=$Index$ LogGroup=$LogGroup$
(ID=$row.TLogID$ AND Type="TLogPoppedTag")
| table Tag]</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
<panel>
<!--
Table 16: Tag, ID, Machine and DataCenter of storage servers.
The TriggerSSTableToken text input has no default and its value is ANDed into the search, so the panel needs a
value to be entered (the label suggests "*").
Query outline:
1. Latest StorageMetrics event per Machine gives Tag, ID and Machine (hence the "one SS per machine" assumption in the title).
2. Left join on Machine to ProcessMetrics to read DCID as DataCenter.
3. Inner join on ID to Role events with As="StorageServer" and NOT TrackLatestType="Rolled"; that subsearch
   resolves DataCenter the same way and fills a missing DataCenter with the string "null".
4. Final output is sorted by Tag.
-->
<title>Table 16: All Current Storage Servers (assume each machine has at most one SS)</title>
<input type="text" token="TriggerSSTableToken" searchWhenChanged="true">
<label>Input * to search</label>
</input>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="StorageMetrics") AND $TriggerSSTableToken$
| stats latest(*) by Machine
| rename latest(*) as *
| table Tag ID Machine
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| table ID Machine DataCenter Tag
| join ID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled"))
| stats latest(*) by Machine
| rename latest(*) as *
| rename As as Role
| table ID Role Machine
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| table ID Role Machine DataCenter
| fillnull value="null" DataCenter]
| sort 0 DataCenter
| table Tag ID Machine DataCenter | sort 0 Tag</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Chart 1: line chart of ConnectionTimedOut / ConnectionTimeout event counts per time bucket, split by the
reporting Machine.
Inputs defined in this panel:
  TimeoutEventByMachineTableTimeSpanToken       timechart span, default 5s
  TimeoutbyMachineTableSourceRoleToken          multiselect, expands to As="role" clauses joined with OR
  TimeoutbyMachineTableDestinationRoleToken     multiselect, expands to As="role" clauses joined with OR
Neither multiselect declares a default.
Query outline:
1. Strip a trailing ":tls" from PeerAddr.
2. Inner join on Machine to Role events of the selected source roles.
3. Inner join on PeerAddr to Role events of the selected destination roles, whose Machine is renamed to PeerAddr.
4. timechart count by Machine with useother=0.
-->
<title>Chart 1: Timeout/TimedOut event distribution grouped by source (Machine)</title>
<input type="text" token="TimeoutEventByMachineTableTimeSpanToken" searchWhenChanged="true">
<label>TimeSpan</label>
<default>5s</default>
</input>
<input type="multiselect" token="TimeoutbyMachineTableSourceRoleToken" searchWhenChanged="true">
<label>Select Source Roles</label>
<choice value="TLog">TLog</choice>
<choice value="MasterServer">MasterServer</choice>
<choice value="MasterProxyServer">MasterProxyServer (for version &lt; 7)</choice>
<choice value="Resolver">Resolver</choice>
<choice value="ClusterController">ClusterController</choice>
<choice value="SharedTLog">SharedTLog</choice>
<choice value="LogRouter">LogRouter</choice>
<choice value="Coordinator">Coordinator</choice>
<choice value="StorageServer">StorageServer</choice>
<choice value="CommitProxyServer">CommitProxyServer (for version 7+)</choice>
<choice value="GrvProxyServer">GrvProxyServer (for ver 7+)</choice>
<valuePrefix>As="</valuePrefix>
<valueSuffix>"</valueSuffix>
<delimiter> OR </delimiter>
</input>
<input type="multiselect" token="TimeoutbyMachineTableDestinationRoleToken" searchWhenChanged="true">
<label>Select Destination Roles</label>
<choice value="TLog">TLog</choice>
<choice value="MasterServer">MasterServer</choice>
<choice value="MasterProxyServer">MasterProxyServer (for version &lt;7)</choice>
<choice value="Resolver">Resolver</choice>
<choice value="ClusterController">ClusterController</choice>
<choice value="SharedTLog">SharedTLog</choice>
<choice value="LogRouter">LogRouter</choice>
<choice value="Coordinator">Coordinator</choice>
<choice value="StorageServer">StorageServer</choice>
<choice value="CommitProxyServer">CommitProxyServer (for version 7+)</choice>
<choice value="GrvProxyServer">GrvProxyServer (for version 7+)</choice>
<valuePrefix>As="</valuePrefix>
<valueSuffix>"</valueSuffix>
<delimiter> OR </delimiter>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
| replace *:tls with * in PeerAddr
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$))
| dedup ID]
| join PeerAddr
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$))
| dedup ID
| rename Machine as PeerAddr]
| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">233</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!--
Chart 2: line chart of ConnectionTimedOut / ConnectionTimeout event counts per time bucket, split by PeerAddr
(with a trailing ":tls" stripped).
This panel declares no inputs of its own; the query references the TimeoutbyMachineTableSourceRoleToken,
TimeoutbyMachineTableDestinationRoleToken and TimeoutEventByMachineTableTimeSpanToken tokens.
Events are kept only when Machine matches a Role event of a selected source role and PeerAddr matches the Machine
of a Role event of a selected destination role (both inner joins).
-->
<title>Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr)</title>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
| replace *:tls with * in PeerAddr
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$))
| dedup ID]
| join PeerAddr
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$))
| dedup ID
| rename Machine as PeerAddr]
| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">219</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!--
Table 17: total ConnectionTimedOut / ConnectionTimeout events per (Machine, PeerAddr) pair, annotated with the
roles found on each side.
Query outline:
1. Strip a trailing ":tls" from PeerAddr and count events by Machine and PeerAddr as TotalTimeouts.
2. Inner join on Machine to the source side: for Role events of the roles selected in
   TimeoutbyMachineTableSourceRoleToken, take the latest event per ID and list, per Machine, strings of the form
   As/ID/TypeTransition/formatted time as MachineRoleLatestEvent.
3. Inner join on PeerAddr to the destination side, built the same way from
   TimeoutbyMachineTableDestinationRoleToken with Machine renamed to PeerAddr, as PeerRoleLatestEvent.
Because both joins are inner joins, pairs without a matching role on either side are dropped.
-->
<title>Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including the role that refresh/begin/end in the timespan)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type=ConnectionTimedOut OR Type=ConnectionTimeout)
| replace *:tls with * in PeerAddr
| stats count as TotalTimeouts by Machine PeerAddr
| table Machine PeerAddr TotalTimeouts
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$))
| stats latest(*) by ID
| rename latest(*) as *
| eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| stats list(Role) AS MachineRoleLatestEvent BY Machine
]
| join PeerAddr
[ search index=$Index$ LogGroup=$LogGroup$
(Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$))
| stats latest(*) by ID
| rename latest(*) as *
| eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| stats list(Role) AS PeerRoleLatestEvent BY Machine
| rename Machine AS PeerAddr
]
| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Table 18: identifies the proxy that reports FirstProxy="True".
Query outline:
1. Take ProxyReplies / CommitProxyReplies events with FirstProxy="True", newest first.
2. Left join on WorkerID to Role events with As="Worker" and Transition="Refresh" (ID renamed to WorkerID) to
   obtain the worker's Machine and Roles.
3. Left join on Machine to Role events with As="MasterProxyServer" or As="CommitProxyServer" and
   Transition="Refresh" to obtain ProxyID.
Only ProxyID, Machine, LogGroup and FirstProxy are kept in the final table; WorkerID, Roles, Time and DateTime are
intermediate columns.
-->
<title>Table 18: Proxy 0</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True"
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table WorkerID LogGroup FirstProxy Time DateTime
| sort 0 -Time
| join type=left WorkerID
[ search index=$Index$ LogGroup=$LogGroup$
Type="Role" AND As="Worker" AND Transition="Refresh"
| dedup ID
| rename ID as WorkerID
| stats list(*) by WorkerID
| rename list(*) as *
| table WorkerID Machine Roles]
| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh"
| dedup ID
| rename ID as ProxyID
| table Machine ProxyID]
| table ProxyID Machine LogGroup FirstProxy</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Table 19: latest Role event per (ID, Transition) on one machine.
The SearchMachineToken text input has no default; its value is matched against the Machine field
(the title gives 172.27.113.121:4500 as an example of the expected IP:PORT form).
Rows are ordered by the formatted DateTime string, newest first (the format starts with year-month-day so the
string order follows time order), and empty cells are filled with "-".
-->
<title>Table 19: Latest Role Events on the input Machine (Input Machine, like 172.27.113.121:4500)</title>
<input type="text" token="SearchMachineToken" searchWhenChanged="true">
<label>Machine (IP:PORT)</label>
</input>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="Role" AND Machine=$SearchMachineToken$
| stats latest(*) by ID Transition
| rename latest(*) as *
| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason
| sort 0 -DateTime
| fillnull value="-"</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Chart 3: line chart of events with Severity greater than 10, counted per time bucket.
Inputs defined in this panel:
  BadEvents                   free text ANDed into the search, default "*"
  BadEventRoleToken           multiselect, expands to As="role" clauses joined with OR; no default declared
  BadEventChartBy             field used to split the chart: Type (default), Machine or Severity
  BadEventChartTimeSpanToken  timechart span, default 5s
Events are inner joined on Machine to machines that have a Role event of a selected role.
NOTE(review): the title says "severity>=20" while the filter is Severity greater than 10; the two agree only if no
severity value lies strictly between 10 and 20. Presumably FDB severities are multiples of ten; confirm.
-->
<title>Chart 3: severity&gt;=20 event distribution (including roles that refresh/begin/end in the timespan)</title>
<input type="text" token="BadEvents" searchWhenChanged="true">
<label>Events</label>
<default>*</default>
</input>
<input type="multiselect" token="BadEventRoleToken" searchWhenChanged="true">
<label>Roles</label>
<choice value="TLog">TLog</choice>
<choice value="MasterServer">MasterServer</choice>
<choice value="MasterProxyServer">MasterProxyServer (for version &lt;7)</choice>
<choice value="Resolver">Resolver</choice>
<choice value="ClusterController">ClusterController</choice>
<choice value="SharedTLog">SharedTLog</choice>
<choice value="LogRouter">LogRouter</choice>
<choice value="Coordinator">Coordinator</choice>
<choice value="StorageServer">StorageServer</choice>
<choice value="CommitProxyServer">CommitProxyServer (for version 7+)</choice>
<choice value="GrvProxyServer">GrvProxyServer (for version 7+)</choice>
<valuePrefix>As="</valuePrefix>
<valueSuffix>"</valueSuffix>
<delimiter> OR </delimiter>
</input>
<input type="dropdown" token="BadEventChartBy" searchWhenChanged="true">
<label>By</label>
<choice value="Type">EventType</choice>
<choice value="Machine">Machine</choice>
<choice value="Severity">Severity</choice>
<default>Type</default>
</input>
<input type="text" token="BadEventChartTimeSpanToken" searchWhenChanged="true">
<label>TimeSpan</label>
<default>5s</default>
</input>
<chart>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Severity&gt;10 AND $BadEvents$
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type="Role" AND ($BadEventRoleToken$)
| dedup ID | table Machine]
| table Machine Type Severity _time
| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="charting.chart">line</option>
<option name="charting.drilldown">none</option>
<option name="height">305</option>
<option name="refresh.display">progressbar</option>
</chart>
</panel>
</row>
<row>
<panel>
<!--
Table 20: number of events with Severity greater than 10, per Machine and event Type, for machines that host a
selected role.
The role filter comes from the BadEventRoleToken token. Matching Role events are reduced to one per ID and listed
per Machine as "As-ID" strings in the Roles column; the inner join on Machine drops machines without such a role.
The title states the threshold as "severity>=20", matching the Severity greater than 10 filter in the query.
NOTE(review): that equivalence holds only if no severity value lies strictly between 10 and 20; confirm.
"sort 0" is used so the ordering is applied to every row rather than the default limited number of results.
-->
<title>Table 20: Check severity&gt;=20 events of roles in the recovery (including the role that refresh/begin/end in the timespan)</title>
<table>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Severity&gt;10
| stats count by Machine Type
| rename count as Count
| join Machine
[ search index=$Index$ LogGroup=$LogGroup$
Type="Role" AND ($BadEventRoleToken$)
| dedup ID
| eval Role=As."-".ID
| stats list(Role) by Machine
| rename list(Role) as Roles
| table Machine Roles]
| table Type Count Roles Machine
| sort 0 -Count</query>
<earliest>$ReoveryTime.earliest$</earliest>
<latest>$ReoveryTime.latest$</latest>
</search>
<option name="drilldown">none</option>
<option name="refresh.display">progressbar</option>
<option name="wrap">false</option>
</table>
</panel>
</row>
</form>

View File

@ -0,0 +1,247 @@
<form theme="dark">
<label>FoundationDB - Tracing GRV and Commit Long Latency of CC Transactions (6.3 and 7.0+) (DEV)</label>
<description>Design for ClusterController issued transactions.</description>
<fieldset submitButton="false" autoRun="true">
<!--
Form-level inputs for this dashboard:
  Index          index to search; the default is empty
  LogGroup       LogGroup filter, default "*"
  transactionID  optional hex transaction ID matched against the ID field, default "*"
  time_token     time range, default from the start of the current day (@d) to now
NOTE(review): with an empty Index default the searches presumably stay idle until an index is typed in; confirm
that this is intended. transactionID is the only input without searchWhenChanged="true".
-->
<input type="text" token="Index" searchWhenChanged="true">
<label>Index</label>
<default></default>
</input>
<input type="text" token="LogGroup" searchWhenChanged="true">
<label>LogGroup</label>
<default>*</default>
</input>
<input type="text" token="transactionID">
<label>Hex Transaction ID (optional)</label>
<default>*</default>
</input>
<input type="time" token="time_token" searchWhenChanged="true">
<label>Time span</label>
<default>
<earliest>@d</earliest>
<latest>now</latest>
</default>
</input>
</fieldset>
<row>
<panel>
<!--
All Transactions: one row per transaction that has all three attach events, with per-phase latencies.
Query outline:
1. Take TransactionAttachID, GetValueAttachID and CommitAttachID events matching the transactionID token and
   prefix each To value with "0-", "1-" or "2-" according to the event type.
2. List the prefixed values per ID and keep only IDs with exactly three of them.
3. Split the prefixes off again to recover GrvID (prefix 0), ReadID (prefix 1) and CommitID (prefix 2),
   whatever order the three values were listed in.
4. Inner join to TransactionDebug events at NativeAPI.getConsistentReadVersion.Before / .After,
   GetValueDebug events at NativeAPI.getValue.After and CommitDebug events at NativeAPI.commit.After
   to obtain BeginTime, GRVDoneTime, ReadDoneTime and CommitDoneTime.
5. Derive the phase durations and the total Duration, plus BeginTimeScope (BeginTime minus 1) and
   EndTimeScope (CommitDoneTime plus 1).
Clicking a cell stores BeginTimeScope, EndTimeScope, ReadID, GrvID and CommitID of that row in the BeginTime,
EndTime, ReadID, GrvID and CommitID tokens.
Duration appears once in the output, directly after TransactionID.
-->
<title>All Transactions (Currently, this table also does not cover getrange operation and the operation which not do commit).</title>
<table>
<title>for FDB 6.3 and 7.0+</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$
(Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID")
| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type=="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To)
| stats list(To) by ID
| rename list(To) as ToList
| table ID ToList
| eval Count = mvcount(ToList)
| search Count=3
| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1))
| table ID GrvID ReadID CommitID
| join GrvID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before")
| rename ID as GrvID
| rename Time as BeginTime
| table GrvID BeginTime
]
| join GrvID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After")
| rename ID as GrvID
| rename Time as GRVDoneTime
| table GrvID GRVDoneTime
]
| join ReadID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="GetValueDebug" AND Location="NativeAPI.getValue.After")
| rename ID as ReadID
| rename Time as ReadDoneTime
| table ReadID ReadDoneTime
]
| join CommitID
[ search index=$Index$ LogGroup=$LogGroup$
(Type="CommitDebug" AND Location="NativeAPI.commit.After")
| rename ID as CommitID
| rename Time as CommitDoneTime
| table CommitID CommitDoneTime
]
| rename ID as TransactionID
| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)")
| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration</query>
<earliest>$time_token.earliest$</earliest>
<latest>$time_token.latest$</latest>
</search>
<option name="drilldown">cell</option>
<drilldown>
<set token="BeginTime">$row.BeginTimeScope$</set>
<set token="EndTime">$row.EndTimeScope$</set>
<set token="ReadID">$row.ReadID$</set>
<set token="GrvID">$row.GrvID$</set>
<set token="CommitID">$row.CommitID$</set>
</drilldown>
</table>
</panel>
</row>
<row>
<panel>
<!--
Step1 GRV: TransactionDebug events of the read version request held in the GrvID token, in pipeline order.
Events are selected when their ID equals GrvID or equals the To value returned by the TransactionAttachID
event whose ID is GrvID. Events at Location MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion are
excluded with a Location field filter (not a raw text term), so only that field decides the exclusion.
Delta is the offset of each event from the earliest Time in the result. Order is a rank assigned per Location by
the case() expression and is used only for sorting; Locations not listed there get no Order value.
The search window is bounded by the BeginTime and EndTime tokens.
-->
<title>Step1: GRV</title>
<table>
<title>for FDB 6.3 and 7.0+</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="TransactionDebug" AND (NOT Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion")
AND (ID=$GrvID$ OR ID=
[ search index=$Index$ LogGroup=$LogGroup$
Type="TransactionAttachID" AND ID=$GrvID$
| return $To])
| table Time Type ID Location Machine Roles
| eventstats min(Time) as MinTime
| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6)
| table Time Delta Order Type ID Location Machine Roles
| sort 0 Order
| table Machine Location Delta Time Roles ID Type</query>
<earliest>$BeginTime$</earliest>
<latest>$EndTime$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
<panel>
<!--
Step1 (FDB 6.3 only): TransactionDebug events at Location
MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion whose ID equals the To value returned by the
TransactionAttachID event for the GrvID token.
Delta is the offset of each event from the earliest Time in the result; rows are sorted newest first.
The search window is bounded by the BeginTime and EndTime tokens.
-->
<title>Step1: (Only for FDB v6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion Events)</title>
<table>
<title>only for FDB 6.3</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion"
AND ID=
[ search index=$Index$ LogGroup=$LogGroup$
Type="TransactionAttachID" AND ID=$GrvID$
| return $To]
| table Time Type ID Location Machine Roles
| eventstats min(Time) as MinTime
| eval Delta = Time - MinTime
| sort 0 -Time
| table Machine Delta Time Roles ID Type</query>
<earliest>$BeginTime$</earliest>
<latest>$EndTime$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Step2 GetValue: GetValueDebug events whose ID equals the ReadID token, in pipeline order.
Delta is the offset of each event from the earliest Time in the result. Order is a rank assigned per Location by
the case() expression (from NativeAPI.getKeyLocation.Before at 0 to NativeAPI.getValue.Error at 10) and is used
only for sorting.
Unlike the GRV and Commit panels, this search runs over the dashboard time range (time_token), not over the
BeginTime and EndTime tokens.
-->
<title>Step2: GetValue</title>
<table>
<title>for FDB 6.3 and 7.0+</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$
| eventstats min(Time) as MinTime
| eval Delta = Time-MinTime
| table Machine Location Delta Time Roles ID Type
| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10)
| sort 0 Order
| table Machine Location Delta Time Roles ID Type</query>
<earliest>$time_token.earliest$</earliest>
<latest>$time_token.latest$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Step3 Commit: CommitDebug events of the commit held in the CommitID token, in pipeline order.
Events are selected when their ID equals CommitID or equals the To value returned by the CommitAttachID event
whose ID is CommitID.
Delta is the offset of each event from the earliest Time in the result. Order is a rank assigned per Location by
the case() expression (NativeAPI.commit.Before at 0 through NativeAPI.commit.After at 16, with
commitBatch.AfterResolution at 8.5) and is used only for sorting.
The search window is bounded by the BeginTime and EndTime tokens.
-->
<title>Step3: Commit</title>
<table>
<title>for FDB 6.3 and 7.0+</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
Type="CommitDebug" AND (ID=$CommitID$ OR ID=
[ search index=$Index$ LogGroup=$LogGroup$
Type="CommitAttachID" AND ID=$CommitID$
| return $To])
| table Time Type ID Location Machine Roles
| eventstats min(Time) as MinTime
| eval Delta = Time-MinTime
| table Machine Location Delta Time Roles ID Type
| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16)
| sort 0 Order
| table Machine Location Delta Time Roles ID Type</query>
<earliest>$BeginTime$</earliest>
<latest>$EndTime$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Step3 Commit, resolver view: events whose Location starts with "Resolver", grouped per resolver machine.
The ID set comes from a two-level attach lookup: the inner subsearch returns the To value of the CommitAttachID
event for the CommitID token; the outer subsearch takes CommitAttachID events with that ID and exposes their To
values as ID, which the main search is inner joined on.
Events are sorted by Time then Order and collected with stats list(*) by Type, ID, Machine and Roles, so Location,
Delta and Time become multivalue columns. Duration is the fourth listed Time minus the first.
NOTE(review): Duration relies on mvindex(Time, 3), so a group with fewer than four events has no Duration;
presumably every resolver logs exactly the four resolveBatch locations. Confirm.
DataCenter is the DCID of the ProcessMetrics event for the same Machine (left join).
This search runs over the dashboard time range (time_token), not over the BeginTime and EndTime tokens.
-->
<title>Step3: Commit --- Resolver</title>
<table>
<title>for FDB 6.3 and 7.0+</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Location="Resolver*")
| join ID
[ search index=$Index$ LogGroup=$LogGroup$
Type="CommitAttachID" AND ID=
[ search index=$Index$ LogGroup=$LogGroup$
Type="CommitAttachID" AND ID=$CommitID$
| return $To]
| rename To as ID
| table ID]
| eventstats min(Time) as MinTime
| eval Delta = Time-MinTime
| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8)
| sort 0 Time Order
| stats list(*) by Type ID Machine Roles
| rename list(*) as *
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration
| table Machine Roles Duration Location Delta Time
| join type=left Machine
[ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics
| dedup Machine, DCID
| rename DCID as DataCenter
| table Machine DataCenter]
| table Machine DataCenter Roles Duration Location Delta Time</query>
<earliest>$time_token.earliest$</earliest>
<latest>$time_token.latest$</latest>
</search>
<option name="drilldown">none</option>
</table>
</panel>
</row>
<row>
<panel>
<!--
Step3 Commit, TLog view: events whose Location starts with "TLog", grouped per TLog machine and sorted by Duration.
The ID set comes from a two-level attach lookup: the inner subsearch returns the To value of the CommitAttachID
event for the CommitID token; the outer subsearch takes CommitAttachID events with that ID and exposes their To
values as ID, which the main search is inner joined on.
Events are sorted by Time and collected with stats list(*) by Type, ID, Machine and Roles, so Location, Delta and
Time become multivalue columns. Duration is the fourth listed Time minus the first.
NOTE(review): Duration relies on mvindex(Time, 3), so a group with fewer than four events has no Duration;
presumably every TLog logs exactly four tLogCommit locations per commit. Confirm.
The search window is bounded by the BeginTime and EndTime tokens.
-->
<title>Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration</title>
<table>
<title>for FDB 6.3 and 7.0+</title>
<search>
<query>index=$Index$ LogGroup=$LogGroup$
(Location="TLog*")
| join ID
[ search index=$Index$ LogGroup=$LogGroup$
Type="CommitAttachID" AND ID=
[ search index=$Index$ LogGroup=$LogGroup$
Type="CommitAttachID" AND ID=$CommitID$
| return $To]
| rename To as ID
| table ID]
| eventstats min(Time) as MinTime
| eval Delta = Time-MinTime
| sort 0 Time
| stats list(*) by Type ID Machine Roles
| rename list(*) as *
| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration
| table Machine Roles Duration Location Delta Time</query>
<earliest>$BeginTime$</earliest>
<latest>$EndTime$</latest>
</search>
<option name="count">10</option>
<option name="drilldown">none</option>
</table>
</panel>
</row>
</form>