perf vendor events intel: Refresh alderlake metrics
Update the alderlake metrics using the new tooling from: https://github.com/intel/perfmon The metrics are unchanged but the formulas differ due to parentheses, use of exponents and removal of redundant operations like "* 1". Signed-off-by: Ian Rogers <irogers@google.com> Acked-by: Kan Liang <kan.liang@linux.intel.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Caleb Biggers <caleb.biggers@intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: John Garry <john.g.garry@oracle.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Perry Taylor <perry.taylor@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com> Link: https://lore.kernel.org/r/20221215064755.1620246-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
ed4c1778cc
commit
266b2ca727
|
@ -10,7 +10,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
|
||||
"MetricExpr": "(topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS)",
|
||||
"MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS",
|
||||
"MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
|
||||
"MetricName": "tma_fetch_latency",
|
||||
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
|
||||
|
@ -46,7 +46,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
|
||||
"MetricExpr": "(tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
|
||||
"MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
|
||||
"MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
|
||||
"MetricName": "tma_mispredicts_resteers",
|
||||
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
|
||||
|
@ -55,7 +55,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
|
||||
"MetricExpr": "(1 - (tma_branch_mispredicts / tma_bad_speculation)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
|
||||
"MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
|
||||
"MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
|
||||
"MetricName": "tma_clears_resteers",
|
||||
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
|
||||
|
@ -153,7 +153,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
|
||||
"MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
|
||||
"MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * SLOTS",
|
||||
"MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
|
||||
"MetricName": "tma_branch_mispredicts",
|
||||
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS",
|
||||
|
@ -171,7 +171,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
|
||||
"MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
|
||||
"MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * SLOTS",
|
||||
"MetricGroup": "TopdownL1;tma_L1_group",
|
||||
"MetricName": "tma_backend_bound",
|
||||
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
|
||||
|
@ -180,7 +180,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
|
||||
"MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
|
||||
"MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * SLOTS",
|
||||
"MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
|
||||
"MetricName": "tma_memory_bound",
|
||||
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
|
||||
|
@ -232,7 +232,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
|
||||
"MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS",
|
||||
"MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS",
|
||||
"MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
|
||||
"MetricName": "tma_lock_latency",
|
||||
"PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS",
|
||||
|
@ -277,7 +277,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
|
||||
"MetricExpr": "((25 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (24 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
|
||||
"MetricExpr": "(25 * Average_Frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 24 * Average_Frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / CLKS",
|
||||
"MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
|
||||
"MetricName": "tma_contested_accesses",
|
||||
"PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
|
||||
|
@ -286,7 +286,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
|
||||
"MetricExpr": "(24 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
|
||||
"MetricExpr": "24 * Average_Frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / CLKS",
|
||||
"MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
|
||||
"MetricName": "tma_data_sharing",
|
||||
"PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD",
|
||||
|
@ -295,7 +295,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
|
||||
"MetricExpr": "(9 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
|
||||
"MetricExpr": "9 * Average_Frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / CLKS",
|
||||
"MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
|
||||
"MetricName": "tma_l3_hit_latency",
|
||||
"PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
|
||||
|
@ -313,7 +313,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
|
||||
"MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / CLKS)",
|
||||
"MetricExpr": "MEMORY_ACTIVITY.STALLS_L3_MISS / CLKS",
|
||||
"MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
|
||||
"MetricName": "tma_dram_bound",
|
||||
"PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
|
||||
|
@ -340,7 +340,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
|
||||
"MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS",
|
||||
"MetricExpr": "tma_st_buffer",
|
||||
"MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
|
||||
"MetricName": "tma_store_bound",
|
||||
"PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
|
||||
|
@ -349,7 +349,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
|
||||
"MetricExpr": "((MEM_STORE_RETIRED.L2_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
|
||||
"MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
|
||||
"MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
|
||||
"MetricName": "tma_store_latency",
|
||||
"PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
|
||||
|
@ -358,7 +358,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
|
||||
"MetricExpr": "(28 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS",
|
||||
"MetricExpr": "28 * Average_Frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS",
|
||||
"MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
|
||||
"MetricName": "tma_false_sharing",
|
||||
"PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
|
||||
|
@ -428,7 +428,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
|
||||
"MetricExpr": "(cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CLKS",
|
||||
"MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CLKS if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CLKS)",
|
||||
"MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
|
||||
"MetricName": "tma_ports_utilization",
|
||||
"PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
|
||||
|
@ -556,7 +556,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
|
||||
"MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
|
||||
"MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * SLOTS",
|
||||
"MetricGroup": "TopdownL1;tma_L1_group",
|
||||
"MetricName": "tma_retiring",
|
||||
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
|
||||
|
@ -704,7 +704,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences",
|
||||
"MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
|
||||
"MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * SLOTS",
|
||||
"MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
|
||||
"MetricName": "tma_heavy_operations",
|
||||
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
|
||||
|
@ -722,7 +722,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
|
||||
"MetricExpr": "UOPS_RETIRED.MS / SLOTS",
|
||||
"MetricExpr": "tma_ms_uops",
|
||||
"MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group",
|
||||
"MetricName": "tma_microcode_sequencer",
|
||||
"PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS",
|
||||
|
@ -782,21 +782,21 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
|
||||
"MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ",
|
||||
"MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
|
||||
"MetricGroup": "Mem;MemoryBW;Offcore",
|
||||
"MetricName": "Memory_Bandwidth",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
|
||||
"MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))",
|
||||
"MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
|
||||
"MetricGroup": "Mem;MemoryLat;Offcore",
|
||||
"MetricName": "Memory_Latency",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
|
||||
"MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ",
|
||||
"MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
|
||||
"MetricGroup": "Mem;MemoryTLB;Offcore",
|
||||
"MetricName": "Memory_Data_TLBs",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -831,14 +831,14 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Uops Per Instruction",
|
||||
"MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY",
|
||||
"MetricExpr": "tma_retiring * SLOTS / INST_RETIRED.ANY",
|
||||
"MetricGroup": "Pipeline;Ret;Retire",
|
||||
"MetricName": "UPI",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Instruction per taken branch",
|
||||
"MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN",
|
||||
"MetricExpr": "tma_retiring * SLOTS / BR_INST_RETIRED.NEAR_TAKEN",
|
||||
"MetricGroup": "Branches;Fed;FetchBW",
|
||||
"MetricName": "UpTB",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -866,7 +866,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor",
|
||||
"MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1",
|
||||
"MetricExpr": "(SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)",
|
||||
"MetricGroup": "SMT;tma_L1_group",
|
||||
"MetricName": "Slots_Utilization",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -888,7 +888,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Floating Point Operations Per Cycle",
|
||||
"MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS",
|
||||
"MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS",
|
||||
"MetricGroup": "Flops;Ret",
|
||||
"MetricName": "FLOPc",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -903,14 +903,14 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
|
||||
"MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
|
||||
"MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
|
||||
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
|
||||
"MetricName": "ILP",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
|
||||
"MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0",
|
||||
"MetricExpr": "((1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0)",
|
||||
"MetricGroup": "Cor;SMT",
|
||||
"MetricName": "Core_Bound_Likely",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -966,14 +966,14 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
|
||||
"MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
|
||||
"MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
|
||||
"MetricGroup": "Flops;InsType",
|
||||
"MetricName": "IpFLOP",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
|
||||
"MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))",
|
||||
"MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))",
|
||||
"MetricGroup": "Flops;InsType",
|
||||
"MetricName": "IpArith",
|
||||
"PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW.",
|
||||
|
@ -1027,7 +1027,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
|
||||
"MetricExpr": "(tma_retiring * SLOTS) / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
|
||||
"MetricExpr": "tma_retiring * SLOTS / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
|
||||
"MetricGroup": "Pipeline;Ret",
|
||||
"MetricName": "Retire",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -1104,7 +1104,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
|
||||
"MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
|
||||
"MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
|
||||
"MetricGroup": "Bad;BrMispredicts",
|
||||
"MetricName": "Branch_Misprediction_Cost",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -1160,63 +1160,63 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
|
||||
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "L1MPKI",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
|
||||
"MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "L1MPKI_Load",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
|
||||
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
|
||||
"MetricGroup": "Backend;CacheMisses;Mem",
|
||||
"MetricName": "L2MPKI",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
|
||||
"MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem;Offcore",
|
||||
"MetricName": "L2MPKI_All",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
|
||||
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "L2MPKI_Load",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
|
||||
"MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "L2HPKI_All",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
|
||||
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "L2HPKI_Load",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
|
||||
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "L3MPKI",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
|
||||
"MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
|
||||
"MetricGroup": "CacheMisses;Mem",
|
||||
"MetricName": "FB_HPKI",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -1231,28 +1231,28 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
|
||||
"MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
|
||||
"MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
|
||||
"MetricGroup": "Mem;MemoryBW",
|
||||
"MetricName": "L1D_Cache_Fill_BW",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
|
||||
"MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
|
||||
"MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
|
||||
"MetricGroup": "Mem;MemoryBW",
|
||||
"MetricName": "L2_Cache_Fill_BW",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
|
||||
"MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
|
||||
"MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
|
||||
"MetricGroup": "Mem;MemoryBW",
|
||||
"MetricName": "L3_Cache_Fill_BW",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
|
||||
"MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time",
|
||||
"MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
|
||||
"MetricGroup": "Mem;MemoryBW;Offcore",
|
||||
"MetricName": "L3_Cache_Access_BW",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -1294,14 +1294,14 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
|
||||
"MetricExpr": "Turbo_Utilization * TSC / 1000000000 / duration_time",
|
||||
"MetricExpr": "Turbo_Utilization * TSC / 1e9 / duration_time",
|
||||
"MetricGroup": "Power;Summary",
|
||||
"MetricName": "Average_Frequency",
|
||||
"Unit": "cpu_core"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Giga Floating Point Operations Per Second",
|
||||
"MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time",
|
||||
"MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
|
||||
"MetricGroup": "Cor;Flops;HPC",
|
||||
"MetricName": "GFLOPs",
|
||||
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.",
|
||||
|
@ -1316,7 +1316,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
|
||||
"MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0",
|
||||
"MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)",
|
||||
"MetricGroup": "SMT",
|
||||
"MetricName": "SMT_2T_Utilization",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -1337,7 +1337,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
|
||||
"MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1000000 / duration_time / 1000",
|
||||
"MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
|
||||
"MetricGroup": "HPC;Mem;MemoryBW;SoC",
|
||||
"MetricName": "DRAM_BW_Use",
|
||||
"Unit": "cpu_core"
|
||||
|
@ -1365,7 +1365,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Uncore frequency per die [GHZ]",
|
||||
"MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000",
|
||||
"MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1e9",
|
||||
"MetricGroup": "SoC",
|
||||
"MetricName": "UNCORE_FREQ"
|
||||
},
|
||||
|
@ -1558,7 +1558,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads. ",
|
||||
"MetricExpr": "min((TOPDOWN_BE_BOUND.ALL / SLOTS), (LD_HEAD.ANY_AT_RET / CLKS) + tma_store_bound)",
|
||||
"MetricExpr": "min(tma_backend_bound, LD_HEAD.ANY_AT_RET / CLKS + tma_store_bound)",
|
||||
"MetricGroup": "TopdownL2;tma_backend_bound_group",
|
||||
"MetricName": "tma_load_store_bound",
|
||||
"ScaleUnit": "100%",
|
||||
|
@ -1566,7 +1566,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full.",
|
||||
"MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)",
|
||||
"MetricExpr": "tma_st_buffer",
|
||||
"MetricGroup": "TopdownL3;tma_load_store_bound_group",
|
||||
"MetricName": "tma_store_bound",
|
||||
"ScaleUnit": "100%",
|
||||
|
@ -1614,7 +1614,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.",
|
||||
"MetricExpr": "(MEM_BOUND_STALLS.LOAD_L2_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD)",
|
||||
"MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / CLKS - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD",
|
||||
"MetricGroup": "TopdownL3;tma_load_store_bound_group",
|
||||
"MetricName": "tma_l2_bound",
|
||||
"ScaleUnit": "100%",
|
||||
|
@ -1622,7 +1622,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
|
||||
"MetricExpr": "(MEM_BOUND_STALLS.LOAD_LLC_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD)",
|
||||
"MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / CLKS - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD",
|
||||
"MetricGroup": "TopdownL3;tma_load_store_bound_group",
|
||||
"MetricName": "tma_l3_bound",
|
||||
"ScaleUnit": "100%",
|
||||
|
@ -1630,7 +1630,7 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
|
||||
"MetricExpr": "(MEM_BOUND_STALLS.LOAD_DRAM_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD)",
|
||||
"MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / CLKS - MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD",
|
||||
"MetricGroup": "TopdownL3;tma_load_store_bound_group",
|
||||
"MetricName": "tma_dram_bound",
|
||||
"ScaleUnit": "100%",
|
||||
|
@ -1939,25 +1939,25 @@
|
|||
},
|
||||
{
|
||||
"BriefDescription": "Percent of instruction miss cost that hit in the L2",
|
||||
"MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / (MEM_BOUND_STALLS.IFETCH)",
|
||||
"MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / MEM_BOUND_STALLS.IFETCH",
|
||||
"MetricName": "Inst_Miss_Cost_L2Hit_Percent",
|
||||
"Unit": "cpu_atom"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Percent of instruction miss cost that hit in the L3",
|
||||
"MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / (MEM_BOUND_STALLS.IFETCH)",
|
||||
"MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / MEM_BOUND_STALLS.IFETCH",
|
||||
"MetricName": "Inst_Miss_Cost_L3Hit_Percent",
|
||||
"Unit": "cpu_atom"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "Percent of instruction miss cost that hit in DRAM",
|
||||
"MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / (MEM_BOUND_STALLS.IFETCH)",
|
||||
"MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / MEM_BOUND_STALLS.IFETCH",
|
||||
"MetricName": "Inst_Miss_Cost_DRAMHit_Percent",
|
||||
"Unit": "cpu_atom"
|
||||
},
|
||||
{
|
||||
"BriefDescription": "load ops retired per 1000 instruction",
|
||||
"MetricExpr": "1000 * MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
|
||||
"MetricExpr": "1e3 * MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
|
||||
"MetricName": "MemLoadPKI",
|
||||
"Unit": "cpu_atom"
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue