From 9a5511eadea316f184940f92dfc5da1eecb0bfd9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 May 2023 10:37:59 -0700 Subject: [PATCH] perf vendor events intel: Update sapphirerapids events/metrics Update sapphirerapids events to v1.13 improving event descriptions. Metrics are updated to make TMA info metric names synchronized. Events and metrics were generated by: https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230517173805.602113-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../arch/x86/sapphirerapids/memory.json | 6 +- .../arch/x86/sapphirerapids/spr-metrics.json | 1749 ++++++++++------- .../sapphirerapids/uncore-interconnect.json | 2 +- .../x86/sapphirerapids/uncore-memory.json | 8 +- 5 files changed, 1019 insertions(+), 748 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 1d2e63575da7..59afd27feb1d 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -23,7 +23,7 @@ GenuineIntel-6-A[AC],v1.01,meteorlake,core GenuineIntel-6-1[AEF],v3,nehalemep,core GenuineIntel-6-2E,v3,nehalemex,core GenuineIntel-6-2A,v19,sandybridge,core -GenuineIntel-6-(8F|CF),v1.12,sapphirerapids,core +GenuineIntel-6-(8F|CF),v1.13,sapphirerapids,core GenuineIntel-6-AF,v1.00,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v55,skylake,core diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json index b72a36999930..e8bf7c9c44e1 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json @@ -32,18 +32,20 @@ "UMask": "0x3" }, { - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L2_MISS", + "BriefDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding.", "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5" }, { - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L3_MISS", + "BriefDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding.", "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 4308e2483112..4f3dd85540b6 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -29,10 +29,261 @@ }, { "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", "MetricGroup": "SoC", "MetricName": "UNCORE_FREQ" }, + { + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", + "MetricName": "cpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "CPU operating frequency (in GHz)", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", + "MetricName": "cpu_operating_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Percentage of time spent in the active CPU power state C0", + "MetricExpr": "tma_info_system_cpu_utilization", + "MetricName": "cpu_utilization", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_2mb_large_page_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_load_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "dtlb_2nd_level_store_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", + "MetricName": "itlb_2nd_level_large_page_mpi", + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricName": "itlb_2nd_level_mpi", + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", + "MetricName": "l1d_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", + "MetricName": "l1d_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_code_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_hits_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricName": "l2_demand_data_read_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", + "MetricName": "l2_mpi", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD / INST_RETIRED.ANY", + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA + UNC_CHA_TOR_INSERTS.IA_MISS_DRD + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF) / INST_RETIRED.ANY", + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency_for_local_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_latency_for_remote_requests", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to DRAM in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_to_dram_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to Intel(R) Optane(TM) Persistent Memory(PMEM) in nano seconds", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages)) * duration_time", + "MetricName": "llc_demand_data_read_miss_to_pmem_latency", + "ScaleUnit": "1ns" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_local_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.", + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time", + "MetricName": "llc_miss_remote_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", + "MetricName": "loads_per_instr", + "ScaleUnit": "1per_instr" + }, + { + "BriefDescription": "DDR memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "DDR memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", + "MetricName": "memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory write bandwidth (MB/sec) caused by directory updates; includes DDR and Intel(R) Optane(TM) Persistent Memory(PMEM).", + "MetricExpr": "(UNC_CHA_DIR_UPDATE.HA + UNC_CHA_DIR_UPDATE.TOR + UNC_M2M_DIRECTORY_UPDATE.ANY) * 64 / 1e6 / duration_time", + "MetricName": "memory_extra_write_bw_due_to_directory_updates", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", + "MetricName": "numa_reads_addressed_to_local_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", + "MetricName": "numa_reads_addressed_to_remote_dram", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_decoded_icache", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MITE_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", + "MetricExpr": "IDQ.MS_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", + "MetricName": "percent_uops_delivered_from_microcode_sequencer", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory read bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_RPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_read", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory bandwidth (MB/sec)", + "MetricExpr": "(UNC_M_PMM_RPQ_INSERTS + UNC_M_PMM_WPQ_INSERTS) * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_total", + "ScaleUnit": "1MB/s" + }, + { + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory write bandwidth (MB/sec)", + "MetricExpr": "UNC_M_PMM_WPQ_INSERTS * 64 / 1e6 / duration_time", + "MetricName": "pmem_memory_bandwidth_write", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -48,9 +299,15 @@ "MetricName": "smi_num", "ScaleUnit": "1SMI#" }, + { + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", + "MetricName": "stores_per_instr", + "ScaleUnit": "1per_instr" + }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", "MetricThreshold": "tma_alu_op_utilization > 0.6", @@ -58,7 +315,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix Extensions (AMX) execution engine was busy with tile (arithmetic) operations", - "MetricExpr": "EXE.AMX_BUSY / tma_info_core_clks", + "MetricExpr": "EXE.AMX_BUSY / tma_info_core_core_clks", "MetricGroup": "Compute;HPC;Server;TopdownL5;tma_L5_group;tma_ports_utilized_0_group", "MetricName": "tma_amx_busy", "MetricThreshold": "tma_amx_busy > 0.5 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -66,7 +323,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", - "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_slots", + "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_assists", "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", @@ -75,7 +332,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handing SSE to AVX* or AVX* to SSE transition Assists.", - "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_slots", + "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_thread_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_avx_assists", "MetricThreshold": "tma_avx_assists > 0.1", @@ -83,7 +340,7 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -103,17 +360,17 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_branch_resteers", "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -131,7 +388,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", - "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", "MetricName": "tma_clears_resteers", "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -141,7 +398,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(76 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(76 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -161,7 +418,7 @@ { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "75.5 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "75.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_data_sharing", "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -170,16 +427,16 @@ }, { "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", "MetricName": "tma_decoder0_alone", - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35))", + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))", "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", - "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_clks", + "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_thread_clks", "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_divider", "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -189,7 +446,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks - tma_pmm_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks)", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks - tma_pmm_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -198,43 +455,43 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_dsb_switches", "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", "MetricName": "tma_dtlb_load", "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", "MetricName": "tma_dtlb_store", "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", "ScaleUnit": "100%" }, { "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", - "MetricExpr": "80 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", + "MetricExpr": "80 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", "MetricName": "tma_false_sharing", "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -243,11 +500,11 @@ }, { "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -255,14 +512,14 @@ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricName": "tma_fetch_bandwidth", - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35", + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", "MetricgroupNoGroup": "TopdownL2", - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -281,7 +538,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine)", - "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;HPC;Pipeline;Server;TopdownL4;tma_L4_group;tma_fp_arith_group", "MetricName": "tma_fp_amx", "MetricThreshold": "tma_fp_amx > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -300,7 +557,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", - "MetricExpr": "30 * ASSISTS.FP / tma_info_slots", + "MetricExpr": "30 * ASSISTS.FP / tma_info_thread_slots", "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_fp_assists", "MetricThreshold": "tma_fp_assists > 0.1", @@ -309,7 +566,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -318,7 +575,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -327,7 +584,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -336,7 +593,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -345,7 +602,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_512b", "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -354,7 +611,7 @@ }, { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -364,7 +621,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_fused_instructions", "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", @@ -373,7 +630,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -383,631 +640,121 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", - "MetricExpr": "ICACHE_DATA.STALLS / tma_info_clks", + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_icache_misses", "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS", "ScaleUnit": "100%" }, - { - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", - "MetricGroup": "Power;Summary", - "MetricName": "tma_info_average_frequency" - }, - { - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", - "MetricName": "tma_info_big_code", - "MetricThreshold": "tma_info_big_code > 20", - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" - }, - { - "BriefDescription": "Branch instructions per taken branch.", - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_bptkbranch" - }, { "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_branch_misprediction_cost", - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" }, { - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", - "MetricGroup": "Ret;tma_issueBC", - "MetricName": "tma_info_branching_overhead", - "MetricThreshold": "tma_info_branching_overhead > 10", - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" }, { - "BriefDescription": "Fraction of branches that are CALL or RET", - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_callret" + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" }, { - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "Pipeline", - "MetricName": "tma_info_clks" + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_indirect", + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" }, { - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Fed;MemoryTLB", - "MetricName": "tma_info_code_stlb_mpki" + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", + "MetricGroup": "Bad;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmisp_ret", + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" }, { - "BriefDescription": "Fraction of branches that are non-taken conditionals", - "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_nt" - }, - { - "BriefDescription": "Fraction of branches that are taken conditionals", - "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches;CodeGen;PGO", - "MetricName": "tma_info_cond_tk" + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;BadSpec;BrMispredicts", + "MetricName": "tma_info_bad_spec_ipmispredict", + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" }, { "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", "MetricGroup": "Cor;SMT", - "MetricName": "tma_info_core_bound_likely", - "MetricThreshold": "tma_info_core_bound_likely > 0.5" - }, - { - "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", - "MetricGroup": "SMT", - "MetricName": "tma_info_core_clks" - }, - { - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", - "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_coreipc" - }, - { - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", - "MetricExpr": "1 / tma_info_ipc", - "MetricGroup": "Mem;Pipeline", - "MetricName": "tma_info_cpi" - }, - { - "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", - "MetricGroup": "HPC;Summary", - "MetricName": "tma_info_cpu_utilization" - }, - { - "BriefDescription": "Average Parallel L2 cache miss data reads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_data_l2_mlp" - }, - { - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", - "MetricName": "tma_info_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" - }, - { - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", - "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", - "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", - "MetricName": "tma_info_dsb_coverage", - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 6 > 0.35", - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" + "MetricName": "tma_info_botlnk_l0_core_bound_likely", + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", "MetricGroup": "DSBmiss;Fed;tma_issueFB", - "MetricName": "tma_info_dsb_misses", - "MetricThreshold": "tma_info_dsb_misses > 10", - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" - }, - { - "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", - "MetricGroup": "DSBmiss", - "MetricName": "tma_info_dsb_switch_cost" - }, - { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", - "MetricName": "tma_info_execute" - }, - { - "BriefDescription": "The ratio of Executed- by Issued-Uops", - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", - "MetricGroup": "Cor;Pipeline", - "MetricName": "tma_info_execute_per_issue", - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." - }, - { - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_fb_hpki" - }, - { - "BriefDescription": "Average number of Uops issued by front-end when it issued something", - "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", - "MetricGroup": "Fed;FetchBW", - "MetricName": "tma_info_fetch_upc" - }, - { - "BriefDescription": "Floating Point Operations Per Cycle", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", - "MetricGroup": "Flops;Ret", - "MetricName": "tma_info_flopc" - }, - { - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_clks)", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_fp_arith_utilization", - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." - }, - { - "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "tma_info_flopc / duration_time", - "MetricGroup": "Cor;Flops;HPC", - "MetricName": "tma_info_gflops", - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + "MetricName": "tma_info_botlnk_l2_dsb_misses", + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", - "MetricName": "tma_info_ic_misses", - "MetricThreshold": "tma_info_ic_misses > 5", + "MetricName": "tma_info_botlnk_l2_ic_misses", + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: " }, { - "BriefDescription": "Average Latency for L1 instruction cache misses", - "MetricExpr": "ICACHE_DATA.STALLS / cpu@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", - "MetricGroup": "Fed;FetchLat;IcMiss", - "MetricName": "tma_info_icache_miss_latency" + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", + "MetricName": "tma_info_bottleneck_big_code", + "MetricThreshold": "tma_info_bottleneck_big_code > 20", + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead" }, { - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", - "MetricName": "tma_info_ilp" + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", + "MetricGroup": "Ret;tma_issueBC", + "MetricName": "tma_info_bottleneck_branching_overhead", + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code" }, { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", "MetricGroup": "Fed;FetchBW;Frontend", - "MetricName": "tma_info_instruction_fetch_bw", - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" - }, - { - "BriefDescription": "Total number of retired Instructions", - "MetricExpr": "INST_RETIRED.ANY", - "MetricGroup": "Summary;TmaL1;tma_L1_group", - "MetricName": "tma_info_instructions", - "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" - }, - { - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", - "MetricGroup": "IoBW;Mem;Server;SoC", - "MetricName": "tma_info_io_write_bw" - }, - { - "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))", - "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_iparith", - "MetricThreshold": "tma_info_iparith < 10", - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." - }, - { - "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16", - "MetricGroup": "Flops;FpVector;InsType;Server", - "MetricName": "tma_info_iparith_amx_f16", - "MetricThreshold": "tma_info_iparith_amx_f16 < 10", - "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." - }, - { - "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", - "MetricGroup": "InsType;IntVector;Server", - "MetricName": "tma_info_iparith_amx_int8", - "MetricThreshold": "tma_info_iparith_amx_int8 < 10", - "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." - }, - { - "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF)", - "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx128", - "MetricThreshold": "tma_info_iparith_avx128 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." - }, - { - "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF)", - "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx256", - "MetricThreshold": "tma_info_iparith_avx256 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." - }, - { - "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF)", - "MetricGroup": "Flops;FpVector;InsType", - "MetricName": "tma_info_iparith_avx512", - "MetricThreshold": "tma_info_iparith_avx512 < 10", - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." - }, - { - "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", - "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_dp", - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." - }, - { - "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", - "MetricGroup": "Flops;FpScalar;InsType", - "MetricName": "tma_info_iparith_scalar_sp", - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." - }, - { - "BriefDescription": "Instructions per a microcode Assist invocation", - "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", - "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_ipassist", - "MetricThreshold": "tma_info_ipassist < 100e3", - "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" - }, - { - "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Branches;Fed;InsType", - "MetricName": "tma_info_ipbranch", - "MetricThreshold": "tma_info_ipbranch < 8" - }, - { - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", - "MetricGroup": "Ret;Summary", - "MetricName": "tma_info_ipc" - }, - { - "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", - "MetricGroup": "Branches;Fed;PGO", - "MetricName": "tma_info_ipcall", - "MetricThreshold": "tma_info_ipcall < 200" - }, - { - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", - "MetricGroup": "DSBmiss;Fed", - "MetricName": "tma_info_ipdsb_miss_ret", - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" - }, - { - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", - "MetricGroup": "Branches;OS", - "MetricName": "tma_info_ipfarbranch", - "MetricThreshold": "tma_info_ipfarbranch < 1e6" - }, - { - "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / tma_info_flopc", - "MetricGroup": "Flops;InsType", - "MetricName": "tma_info_ipflop", - "MetricThreshold": "tma_info_ipflop < 10" - }, - { - "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", - "MetricGroup": "InsType", - "MetricName": "tma_info_ipload", - "MetricThreshold": "tma_info_ipload < 3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_ntaken", - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_cond_taken", - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" - }, - { - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_indirect", - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" - }, - { - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", - "MetricGroup": "Bad;BrMispredicts", - "MetricName": "tma_info_ipmisp_ret", - "MetricThreshold": "tma_info_ipmisp_ret < 500" - }, - { - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;BadSpec;BrMispredicts", - "MetricName": "tma_info_ipmispredict", - "MetricThreshold": "tma_info_ipmispredict < 200" - }, - { - "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", - "MetricGroup": "InsType", - "MetricName": "tma_info_ipstore", - "MetricThreshold": "tma_info_ipstore < 8" - }, - { - "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", - "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", - "MetricGroup": "Prefetches", - "MetricName": "tma_info_ipswpf", - "MetricThreshold": "tma_info_ipswpf < 100" - }, - { - "BriefDescription": "Instruction per taken branch", - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", - "MetricName": "tma_info_iptb", - "MetricThreshold": "tma_info_iptb < 13", - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" - }, - { - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", - "MetricGroup": "Fed", - "MetricName": "tma_info_ipunknown_branch" - }, - { - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_jump" - }, - { - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_cpi" - }, - { - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", - "MetricGroup": "OS", - "MetricName": "tma_info_kernel_utilization", - "MetricThreshold": "tma_info_kernel_utilization > 0.05" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "tma_info_l1d_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l1d_cache_fill_bw_1t" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki" - }, - { - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l1mpki_load" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", - "MetricExpr": "tma_info_l2_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l2_cache_fill_bw_1t" - }, - { - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_nonsilent_pki" - }, - { - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", - "MetricGroup": "L2Evicts;Mem;Server", - "MetricName": "tma_info_l2_evictions_silent_pki" - }, - { - "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", - "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_all" - }, - { - "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2hpki_load" - }, - { - "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "Backend;CacheMisses;Mem", - "MetricName": "tma_info_l2mpki" - }, - { - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem;Offcore", - "MetricName": "tma_info_l2mpki_all" - }, - { - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code" - }, - { - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "IcMiss", - "MetricName": "tma_info_l2mpki_code_all" - }, - { - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l2mpki_load" - }, - { - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw" - }, - { - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_access_bw", - "MetricGroup": "Mem;MemoryBW;Offcore", - "MetricName": "tma_info_l3_cache_access_bw_1t" - }, - { - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw" - }, - { - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", - "MetricExpr": "tma_info_l3_cache_fill_bw", - "MetricGroup": "Mem;MemoryBW", - "MetricName": "tma_info_l3_cache_fill_bw_1t" - }, - { - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", - "MetricGroup": "CacheMisses;Mem", - "MetricName": "tma_info_l3mpki" - }, - { - "BriefDescription": "Average Latency for L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l2_miss_latency" - }, - { - "BriefDescription": "Average Parallel L2 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", - "MetricGroup": "Memory_BW;Offcore", - "MetricName": "tma_info_load_l2_mlp" - }, - { - "BriefDescription": "Average Latency for L3 cache miss demand Loads", - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "MetricGroup": "Memory_Lat;Offcore", - "MetricName": "tma_info_load_l3_miss_latency" - }, - { - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", - "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", - "MetricGroup": "Mem;MemoryBound;MemoryLat", - "MetricName": "tma_info_load_miss_real_latency" - }, - { - "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", - "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", - "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_load_stlb_mpki" - }, - { - "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", - "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_dram_read_latency", - "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" - }, - { - "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", - "MetricGroup": "Mem;MemoryBW;SoC", - "MetricName": "tma_info_mem_parallel_reads", - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" - }, - { - "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", - "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryLat;Server;SoC", - "MetricName": "tma_info_mem_pmm_read_latency", - "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" - }, - { - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", - "MetricGroup": "Mem;MemoryLat;SoC", - "MetricName": "tma_info_mem_read_latency", - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_info_memory_bandwidth", - "MetricThreshold": "tma_info_memory_bandwidth > 20", - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" + "MetricName": "tma_info_bottleneck_memory_bandwidth", + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", - "MetricName": "tma_info_memory_data_tlbs", - "MetricThreshold": "tma_info_memory_data_tlbs > 20", + "MetricName": "tma_info_bottleneck_memory_data_tlbs", + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" }, { @@ -1015,8 +762,8 @@ "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_info_memory_latency", - "MetricThreshold": "tma_info_memory_latency > 20", + "MetricName": "tma_info_bottleneck_memory_latency", + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { @@ -1024,121 +771,631 @@ "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", - "MetricName": "tma_info_mispredictions", - "MetricThreshold": "tma_info_mispredictions > 20", - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" + "MetricName": "tma_info_bottleneck_mispredictions", + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" + }, + { + "BriefDescription": "Fraction of branches that are CALL or RET", + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_callret" + }, + { + "BriefDescription": "Fraction of branches that are non-taken conditionals", + "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_nt" + }, + { + "BriefDescription": "Fraction of branches that are taken conditionals", + "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches;CodeGen;PGO", + "MetricName": "tma_info_branches_cond_tk" + }, + { + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_jump" + }, + { + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", + "MetricGroup": "Bad;Branches", + "MetricName": "tma_info_branches_other_branches" + }, + { + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", + "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", + "MetricGroup": "SMT", + "MetricName": "tma_info_core_core_clks" + }, + { + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_core_coreipc" + }, + { + "BriefDescription": "Floating Point Operations Per Cycle", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", + "MetricGroup": "Flops;Ret", + "MetricName": "tma_info_core_flopc" + }, + { + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_core_clks)", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_core_fp_arith_utilization", + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", + "MetricName": "tma_info_core_ilp" + }, + { + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", + "MetricName": "tma_info_frontend_dsb_coverage", + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35", + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" + }, + { + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", + "MetricGroup": "DSBmiss", + "MetricName": "tma_info_frontend_dsb_switch_cost" + }, + { + "BriefDescription": "Average number of Uops issued by front-end when it issued something", + "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", + "MetricGroup": "Fed;FetchBW", + "MetricName": "tma_info_frontend_fetch_upc" + }, + { + "BriefDescription": "Average Latency for L1 instruction cache misses", + "MetricExpr": "ICACHE_DATA.STALLS / cpu@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", + "MetricGroup": "Fed;FetchLat;IcMiss", + "MetricName": "tma_info_frontend_icache_miss_latency" + }, + { + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", + "MetricGroup": "DSBmiss;Fed", + "MetricName": "tma_info_frontend_ipdsb_miss_ret", + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" + }, + { + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", + "MetricGroup": "Fed", + "MetricName": "tma_info_frontend_ipunknown_branch" + }, + { + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code" + }, + { + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "IcMiss", + "MetricName": "tma_info_frontend_l2mpki_code_all" + }, + { + "BriefDescription": "Branch instructions per taken branch.", + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_bptkbranch" + }, + { + "BriefDescription": "Total number of retired Instructions", + "MetricExpr": "INST_RETIRED.ANY", + "MetricGroup": "Summary;TmaL1;tma_L1_group", + "MetricName": "tma_info_inst_mix_instructions", + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" + }, + { + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))", + "MetricGroup": "Flops;InsType", + "MetricName": "tma_info_inst_mix_iparith", + "MetricThreshold": "tma_info_inst_mix_iparith < 10", + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." + }, + { + "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16", + "MetricGroup": "Flops;FpVector;InsType;Server", + "MetricName": "tma_info_inst_mix_iparith_amx_f16", + "MetricThreshold": "tma_info_inst_mix_iparith_amx_f16 < 10", + "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." + }, + { + "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", + "MetricGroup": "InsType;IntVector;Server", + "MetricName": "tma_info_inst_mix_iparith_amx_int8", + "MetricThreshold": "tma_info_inst_mix_iparith_amx_int8 < 10", + "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx128", + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx256", + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + }, + { + "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF)", + "MetricGroup": "Flops;FpVector;InsType", + "MetricName": "tma_info_inst_mix_iparith_avx512", + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", + "MetricGroup": "Flops;FpScalar;InsType", + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + }, + { + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", + "MetricGroup": "Flops;FpScalar;InsType", + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." + }, + { + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", + "MetricGroup": "Branches;Fed;InsType", + "MetricName": "tma_info_inst_mix_ipbranch", + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" + }, + { + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", + "MetricGroup": "Branches;Fed;PGO", + "MetricName": "tma_info_inst_mix_ipcall", + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" + }, + { + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_flopc", + "MetricGroup": "Flops;InsType", + "MetricName": "tma_info_inst_mix_ipflop", + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" + }, + { + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipload", + "MetricThreshold": "tma_info_inst_mix_ipload < 3" + }, + { + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", + "MetricGroup": "InsType", + "MetricName": "tma_info_inst_mix_ipstore", + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" + }, + { + "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", + "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", + "MetricGroup": "Prefetches", + "MetricName": "tma_info_inst_mix_ipswpf", + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" + }, + { + "BriefDescription": "Instruction per taken branch", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", + "MetricName": "tma_info_inst_mix_iptb", + "MetricThreshold": "tma_info_inst_mix_iptb < 13", + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" + }, + { + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" + }, + { + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", + "MetricGroup": "L2Evicts;Mem;Server", + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" + }, + { + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_core_l3_cache_access_bw" + }, + { + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" + }, + { + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_fb_hpki" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki" + }, + { + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l1mpki_load" + }, + { + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", + "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l2hpki_all" + }, + { + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l2hpki_load" + }, + { + "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", + "MetricGroup": "Backend;CacheMisses;Mem", + "MetricName": "tma_info_memory_l2mpki" + }, + { + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", + "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem;Offcore", + "MetricName": "tma_info_memory_l2mpki_all" + }, + { + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l2mpki_load" + }, + { + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", + "MetricGroup": "CacheMisses;Mem", + "MetricName": "tma_info_memory_l3mpki" + }, + { + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", + "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", + "MetricGroup": "Mem;MemoryBound;MemoryLat", + "MetricName": "tma_info_memory_load_miss_real_latency" }, { "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", "MetricGroup": "Mem;MemoryBW;MemoryBound", - "MetricName": "tma_info_mlp", + "MetricName": "tma_info_memory_mlp", "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" }, { - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", - "MetricGroup": "Bad;Branches", - "MetricName": "tma_info_other_branches" + "BriefDescription": "Average Parallel L2 cache miss data reads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_data_l2_mlp" + }, + { + "BriefDescription": "Average Latency for L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" + }, + { + "BriefDescription": "Average Parallel L2 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", + "MetricGroup": "Memory_BW;Offcore", + "MetricName": "tma_info_memory_oro_load_l2_mlp" + }, + { + "BriefDescription": "Average Latency for L3 cache miss demand Loads", + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "MetricGroup": "Memory_Lat;Offcore", + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" + }, + { + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", + "MetricGroup": "Mem;MemoryBW;Offcore", + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" + }, + { + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", + "MetricGroup": "Mem;MemoryBW", + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" + }, + { + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Fed;MemoryTLB", + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" + }, + { + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", + "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", + "MetricGroup": "Mem;MemoryTLB", + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_clks)", + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_core_clks)", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_page_walks_utilization", - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" - }, - { - "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", - "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_read_bw" - }, - { - "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", - "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", - "MetricGroup": "Mem;MemoryBW;Server;SoC", - "MetricName": "tma_info_pmm_write_bw" - }, - { - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", - "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", - "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_retire" - }, - { - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", - "MetricExpr": "TOPDOWN.SLOTS", - "MetricGroup": "TmaL1;tma_L1_group", - "MetricName": "tma_info_slots" - }, - { - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", - "MetricGroup": "SMT;TmaL1;tma_L1_group", - "MetricName": "tma_info_slots_utilization" - }, - { - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", - "MetricGroup": "SMT", - "MetricName": "tma_info_smt_2t_utilization" - }, - { - "BriefDescription": "Socket actual clocks when any core is active on that socket", - "MetricExpr": "uncore_cha_0@event\\=0x1@", - "MetricGroup": "SoC", - "MetricName": "tma_info_socket_clks" + "MetricName": "tma_info_memory_tlb_page_walks_utilization", + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", "MetricGroup": "Mem;MemoryTLB", - "MetricName": "tma_info_store_stlb_mpki" + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" + }, + { + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", + "MetricName": "tma_info_pipeline_execute" + }, + { + "BriefDescription": "Instructions per a microcode Assist invocation", + "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_pipeline_ipassist", + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", + "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" + }, + { + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", + "MetricGroup": "Pipeline;Ret", + "MetricName": "tma_info_pipeline_retire" }, { "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", "MetricGroup": "Pipeline;Ret", - "MetricName": "tma_info_strings_cycles", - "MetricThreshold": "tma_info_strings_cycles > 0.1" + "MetricName": "tma_info_pipeline_strings_cycles", + "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1" + }, + { + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", + "MetricGroup": "Power;Summary", + "MetricName": "tma_info_system_average_frequency" + }, + { + "BriefDescription": "Average CPU Utilization", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricGroup": "HPC;Summary", + "MetricName": "tma_info_system_cpu_utilization" + }, + { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" + }, + { + "BriefDescription": "Giga Floating Point Operations Per Second", + "MetricExpr": "tma_info_core_flopc / duration_time", + "MetricGroup": "Cor;Flops;HPC", + "MetricName": "tma_info_system_gflops", + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." + }, + { + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", + "MetricGroup": "IoBW;Mem;Server;SoC", + "MetricName": "tma_info_system_io_write_bw" + }, + { + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", + "MetricGroup": "Branches;OS", + "MetricName": "tma_info_system_ipfarbranch", + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" + }, + { + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_cpi" + }, + { + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "OS", + "MetricName": "tma_info_system_kernel_utilization", + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" + }, + { + "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", + "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricName": "tma_info_system_mem_dram_read_latency", + "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + }, + { + "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", + "MetricGroup": "Mem;MemoryBW;SoC", + "MetricName": "tma_info_system_mem_parallel_reads", + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", + "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", + "MetricGroup": "Mem;MemoryLat;Server;SoC", + "MetricName": "tma_info_system_mem_pmm_read_latency", + "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + }, + { + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", + "MetricGroup": "Mem;MemoryLat;SoC", + "MetricName": "tma_info_system_mem_read_latency", + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_pmm_read_bw" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", + "MetricGroup": "Mem;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_pmm_write_bw" + }, + { + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", + "MetricGroup": "SMT", + "MetricName": "tma_info_system_smt_2t_utilization" + }, + { + "BriefDescription": "Socket actual clocks when any core is active on that socket", + "MetricExpr": "uncore_cha_0@event\\=0x1@", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_socket_clks" }, { "BriefDescription": "Tera Integer (matrix) Operations Per Second", "MetricExpr": "8 * AMX_OPS_RETIRED.INT8 / 1e12 / duration_time", "MetricGroup": "Cor;HPC;IntVector;Server", - "MetricName": "tma_info_tiops" + "MetricName": "tma_info_system_tiops" }, { "BriefDescription": "Average Frequency Utilization relative nominal frequency", - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", "MetricGroup": "Power", - "MetricName": "tma_info_turbo_utilization" - }, - { - "BriefDescription": "Uops Per Instruction", - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", - "MetricGroup": "Pipeline;Ret;Retire", - "MetricName": "tma_info_uoppi", - "MetricThreshold": "tma_info_uoppi > 1.05" + "MetricName": "tma_info_system_turbo_utilization" }, { "BriefDescription": "Cross-socket Ultra Path Interconnect (UPI) data transmit bandwidth for data only [MB / sec]", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 64 / 9 / 1e6", "MetricGroup": "Server;SoC", - "MetricName": "tma_info_upi_data_transmit_bw" + "MetricName": "tma_info_system_upi_data_transmit_bw" + }, + { + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", + "MetricGroup": "Pipeline", + "MetricName": "tma_info_thread_clks" + }, + { + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", + "MetricExpr": "1 / tma_info_thread_ipc", + "MetricGroup": "Mem;Pipeline", + "MetricName": "tma_info_thread_cpi" + }, + { + "BriefDescription": "The ratio of Executed- by Issued-Uops", + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", + "MetricGroup": "Cor;Pipeline", + "MetricName": "tma_info_thread_execute_per_issue", + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." + }, + { + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", + "MetricGroup": "Ret;Summary", + "MetricName": "tma_info_thread_ipc" + }, + { + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", + "MetricExpr": "TOPDOWN.SLOTS", + "MetricGroup": "TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots" + }, + { + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", + "MetricGroup": "SMT;TmaL1;tma_L1_group", + "MetricName": "tma_info_thread_slots_utilization" + }, + { + "BriefDescription": "Uops Per Instruction", + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", + "MetricGroup": "Pipeline;Ret;Retire", + "MetricName": "tma_info_thread_uoppi", + "MetricThreshold": "tma_info_thread_uoppi > 1.05" }, { "BriefDescription": "Instruction per taken branch", - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", "MetricGroup": "Branches;Fed;FetchBW", - "MetricName": "tma_info_uptb", - "MetricThreshold": "tma_info_uptb < 9" + "MetricName": "tma_info_thread_uptb", + "MetricThreshold": "tma_info_thread_uptb < 9" }, { "BriefDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine)", - "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_slots)", + "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;HPC;IntVector;Pipeline;Server;TopdownL4;tma_L4_group;tma_int_operations_group", "MetricName": "tma_int_amx", "MetricThreshold": "tma_int_amx > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1156,7 +1413,7 @@ }, { "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_128b", "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1165,7 +1422,7 @@ }, { "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", - "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_slots)", + "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", "MetricName": "tma_int_vector_256b", "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1174,7 +1431,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", - "MetricExpr": "ICACHE_TAG.STALLS / tma_info_clks", + "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", "MetricName": "tma_itlb_misses", "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1183,7 +1440,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", - "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", + "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", "MetricName": "tma_l1_bound", "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1193,7 +1450,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l2_bound", "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1202,7 +1459,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1211,20 +1468,20 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricExpr": "33 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "33 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", - "MetricExpr": "DECODE.LCP / tma_info_clks", + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", "MetricName": "tma_lcp", "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", "ScaleUnit": "100%" }, { @@ -1239,7 +1496,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_clks)", + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", "MetricThreshold": "tma_load_op_utilization > 0.6", @@ -1256,7 +1513,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", "MetricName": "tma_load_stlb_miss", "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1264,7 +1521,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", - "MetricExpr": "71 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "71 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_dram", "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1274,7 +1531,7 @@ { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1293,7 +1550,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to memory bandwidth Allocation feature (RDT's memory bandwidth throttling).", - "MetricExpr": "INT_MISC.MBA_STALLS / tma_info_clks", + "MetricExpr": "INT_MISC.MBA_STALLS / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;Server;TopdownL5;tma_L5_group;tma_mem_bandwidth_group", "MetricName": "tma_mba_stalls", "MetricThreshold": "tma_mba_stalls > 0.1 & (tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1301,25 +1558,25 @@ }, { "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -1329,7 +1586,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_clks", + "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1338,7 +1595,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_memory_operations", "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", @@ -1346,7 +1603,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", + "MetricExpr": "UOPS_RETIRED.MS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", @@ -1355,25 +1612,25 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", - "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", "ScaleUnit": "100%" }, { "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", - "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_clks", + "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_thread_clks", "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", "MetricName": "tma_mixing_vectors", "MetricThreshold": "tma_mixing_vectors > 0.05", @@ -1382,7 +1639,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", - "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_slots / UOPS_ISSUED.ANY) / tma_info_clks", + "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY) / tma_info_thread_clks", "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", "MetricName": "tma_ms_switches", "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", @@ -1391,7 +1648,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", - "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_non_fused_branches", "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6", @@ -1400,7 +1657,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_nop_instructions", "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", @@ -1419,7 +1676,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults", - "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_slots", + "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", "MetricName": "tma_page_faults", "MetricThreshold": "tma_page_faults > 0.05", @@ -1428,7 +1685,7 @@ }, { "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_pmm_bound", "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1437,7 +1694,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_0", "MetricThreshold": "tma_port_0 > 0.6", @@ -1446,7 +1703,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_1", "MetricThreshold": "tma_port_1 > 0.6", @@ -1455,7 +1712,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", "MetricName": "tma_port_6", "MetricThreshold": "tma_port_6 > 0.6", @@ -1464,7 +1721,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_clks)", + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1473,7 +1730,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_clks", + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1482,7 +1739,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1491,7 +1748,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1500,7 +1757,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1509,7 +1766,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", - "MetricExpr": "(135.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "(135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1518,7 +1775,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "149 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", + "MetricExpr": "149 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_dram", "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1527,7 +1784,7 @@ }, { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1537,7 +1794,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", @@ -1546,7 +1803,7 @@ }, { "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", - "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_slots)", + "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group", "MetricName": "tma_shuffles", "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", @@ -1554,7 +1811,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_clks", + "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", @@ -1563,7 +1820,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_split_loads", "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1572,7 +1829,7 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1581,16 +1838,16 @@ }, { "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", - "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_clks", + "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_store_bound", "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -1599,7 +1856,7 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1608,7 +1865,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", - "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", + "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", "MetricName": "tma_store_latency", "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1617,7 +1874,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_store_op_utilization", "MetricThreshold": "tma_store_op_utilization > 0.6", @@ -1634,7 +1891,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", "MetricName": "tma_store_stlb_miss", "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -1642,7 +1899,7 @@ }, { "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", "MetricName": "tma_streaming_stores", "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", @@ -1651,7 +1908,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", - "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_clks", + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_thread_clks", "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", "MetricName": "tma_unknown_branches", "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", @@ -1694,5 +1951,17 @@ "MetricGroup": "transaction", "MetricName": "tsx_transactional_cycles", "ScaleUnit": "100%" + }, + { + "BriefDescription": "Uncore operating frequency in GHz", + "MetricExpr": "UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages) / 1e9 / duration_time", + "MetricName": "uncore_frequency", + "ScaleUnit": "1GHz" + }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_transmit_bw", + "ScaleUnit": "1MB/s" } ] diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json index 08faf38115d9..6800de05c836 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json @@ -464,7 +464,7 @@ "Unit": "M2M" }, { - "BriefDescription": "Counts the time when FM didn? do d2c for fill reads (cross tile case)", + "BriefDescription": "Counts the time when FM didn't do d2c for fill reads (cross tile case)", "EventCode": "0x4a", "EventName": "UNC_M2M_DIRECT2CORE_NOT_TAKEN_NOTFORKED", "PerPkg": "1", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json index 225333561295..3ff9e9b722c8 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json @@ -2480,11 +2480,11 @@ "Unit": "iMC" }, { - "BriefDescription": "DRAM Precharge commands. : Precharge due to (?)", + "BriefDescription": "DRAM Precharge commands", "EventCode": "0x03", "EventName": "UNC_M_PRE_COUNT.PGT", "PerPkg": "1", - "PublicDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.", + "PublicDescription": "DRAM Precharge commands. Counts the number of DRAM Precharge commands sent on this channel.", "UMask": "0x88", "Unit": "iMC" }, @@ -3236,7 +3236,7 @@ "Unit": "iMC" }, { - "BriefDescription": "2LM Tag check hit due to memory read (bug?)", + "BriefDescription": "2LM Tag check hit due to memory read", "EventCode": "0xd3", "EventName": "UNC_M_TAGCHK.NM_RD_HIT", "PerPkg": "1", @@ -3244,7 +3244,7 @@ "Unit": "iMC" }, { - "BriefDescription": "2LM Tag check hit due to memory write (bug?)", + "BriefDescription": "2LM Tag check hit due to memory write", "EventCode": "0xd3", "EventName": "UNC_M_TAGCHK.NM_WR_HIT", "PerPkg": "1",