forked from OSchip/llvm-project
[MCA] Add tests for IPC on Cortex-A55
The tests compare IPC statistics that MCA provides with IPC values measured on Cortex-A55 hardware. For hardware tests, each snippet is run in a loop unrolled by 1000, and IPC is measured by linux-perf. Several tests do not match the hardware: the skewed ALU is not supported, and LDR seems to be missing a forwarding path. Differential Revision: https://reviews.llvm.org/D98174
This commit is contained in:
parent
3f6753efe1
commit
f08a2fc09e
|
@ -0,0 +1,14 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# A single ADD that both reads and writes w8.
add w8, w8, #1
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 1000
|
||||
# CHECK-NEXT: Total Cycles: 1003
|
||||
# CHECK-NEXT: Total uOps: 1000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.00
|
||||
# CHECK-NEXT: IPC: 1.00
|
||||
# CHECK-NEXT: Block RThroughput: 0.5
|
|
@ -0,0 +1,15 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# Two ADDs on unrelated registers (w8 and w9): neither reads the other's result.
add w8, w8, #1
|
||||
add w9, w9, #1
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 2000
|
||||
# CHECK-NEXT: Total Cycles: 1003
|
||||
# CHECK-NEXT: Total uOps: 2000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.99
|
||||
# CHECK-NEXT: IPC: 1.99
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
|
@ -0,0 +1,15 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# Two FMADDs on disjoint register sets (s3/s5-s7 and s8-s11): neither reads
# the other's result.
fmadd s3, s5, s6, s7
|
||||
fmadd s8, s9, s10, s11
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 2000
|
||||
# CHECK-NEXT: Total Cycles: 1004
|
||||
# CHECK-NEXT: Total uOps: 2000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.99
|
||||
# CHECK-NEXT: IPC: 1.99
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
|
@ -0,0 +1,19 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# FMADD writes and retires out-of-order
|
||||
fmadd s3, s5, s6, s7
|
||||
# ADD instructions are issued and retire in-order
|
||||
# The three ADDs below use distinct registers (w8, w9, w10), so none of them
# reads another's result.
add w8, w8, #1
|
||||
add w9, w9, #1
|
||||
add w10, w10, #1
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 4000
|
||||
# CHECK-NEXT: Total Cycles: 2003
|
||||
# CHECK-NEXT: Total uOps: 4000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 2.00
|
||||
# CHECK-NEXT: IPC: 2.00
|
||||
# CHECK-NEXT: Block RThroughput: 2.0
|
|
@ -0,0 +1,18 @@
|
|||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
# CHECK: IPC:
|
||||
# CHECK-SAME: 2.00
|
||||
#
|
||||
# XFAIL: *
|
||||
#
|
||||
# Cortex-A55 has a secondary skewed ALU in the Ex1 stage for simple
|
||||
# ALU instructions that do not require shifting or saturation
|
||||
# resources. Results from the skewed ALU are available 1 cycle earlier.
|
||||
#
|
||||
# This feature allows the first and the second instruction to be
|
||||
# dual-issued despite a register dependency (w8).
|
||||
#
|
||||
# MCA and the LLVM scheduling model do not support this yet.
|
||||
|
||||
# The first ADD writes w8; the second and third ADDs both read w8.
add w8, w8, #1
|
||||
add w10, w8, #1
|
||||
add w12, w8, #1
|
|
@ -0,0 +1,16 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# The second ADD reads w8 written by the first ADD. MUL reads and writes only
# w10, so it does not depend on either ADD.
add w8, w8, #1
|
||||
add w12, w8, #1
|
||||
mul w10, w10, w10
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 3000
|
||||
# CHECK-NEXT: Total Cycles: 3003
|
||||
# CHECK-NEXT: Total uOps: 3000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.00
|
||||
# CHECK-NEXT: IPC: 1.00
|
||||
# CHECK-NEXT: Block RThroughput: 1.5
|
|
@ -0,0 +1,21 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# DIV is not modeled precisely: on hardware it takes a variable
|
||||
# number of cycles depending on its operands, but LLVM scheduling
|
||||
# model only provides an average latency.
|
||||
|
||||
# The two MOVZ materialize the SDIV operands: w10 = 1 << 16 (divisor) and
# w12 = 32768 << 16 (dividend). The ADD on w8 is unrelated to them.
add w8, w8, #1
|
||||
movz w10, #1, lsl #16
|
||||
movz w12, #32768, lsl #16
|
||||
sdiv w10, w12, w10
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 4000
|
||||
# CHECK-NEXT: Total Cycles: 8004
|
||||
# CHECK-NEXT: Total uOps: 4000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.50
|
||||
# CHECK-NEXT: IPC: 0.50
|
||||
# CHECK-NEXT: Block RThroughput: 8.0
|
|
@ -0,0 +1,22 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# DIV is not modeled precisely: on hardware it takes a variable
|
||||
# number of cycles depending on its operands. LLVM scheduling model
|
||||
# only provides an average latency.
|
||||
|
||||
# MUL reads w8 written by the ADD. The two MOVZ materialize the SDIV operands:
# w10 = 1 << 16 (divisor) and w12 = 32768 << 16 (dividend).
add w8, w8, #1
|
||||
movz w10, #1, lsl #16
|
||||
movz w12, #32768, lsl #16
|
||||
mul w11, w8, w8
|
||||
sdiv w10, w12, w10
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 5000
|
||||
# CHECK-NEXT: Total Cycles: 8004
|
||||
# CHECK-NEXT: Total uOps: 5000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.62
|
||||
# CHECK-NEXT: IPC: 0.62
|
||||
# CHECK-NEXT: Block RThroughput: 8.0
|
|
@ -0,0 +1,25 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# It appears that ADD and MUL fuse together, if both can be issued in
|
||||
# one cycle:
|
||||
#
|
||||
# add w12, w8, #1
|
||||
# mul w10, w12, w10
|
||||
#
|
||||
# FIXME: MCA (and the LLVM scheduling model) do not support this. The test
|
||||
# case uses different registers to break the pattern.
|
||||
|
||||
# The second ADD writes w13 while MUL reads w12 and w10, so MUL does not
# consume the result of either ADD.
add w8, w8, #1
|
||||
add w13, w8, #1
|
||||
mul w10, w12, w10
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 3000
|
||||
# CHECK-NEXT: Total Cycles: 3003
|
||||
# CHECK-NEXT: Total uOps: 3000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.00
|
||||
# CHECK-NEXT: IPC: 1.00
|
||||
# CHECK-NEXT: Block RThroughput: 1.5
|
|
@ -0,0 +1,17 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# MUL reads w12 written by the second ADD; the CMP placed between them only
# reads w9.
add w8, w8, #1
|
||||
add w12, w9, #1
|
||||
cmp w9, #42
|
||||
mul w10, w12, w10
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 4000
|
||||
# CHECK-NEXT: Total Cycles: 3004
|
||||
# CHECK-NEXT: Total uOps: 4000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.33
|
||||
# CHECK-NEXT: IPC: 1.33
|
||||
# CHECK-NEXT: Block RThroughput: 2.0
|
|
@ -0,0 +1,19 @@
|
|||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
# CHECK: IPC:
|
||||
# CHECK-SAME: 1.50
|
||||
#
|
||||
# XFAIL: *
|
||||
#
|
||||
# MCA reports IPC = 0.60, while hardware shows IPC = 1.50.
|
||||
#
|
||||
# 1) The skewed ALU on Cortex-A55 is not modeled: ADD and AND
|
||||
# instructions should be issued in the same cycle.
|
||||
# See A55-2.s test for more details.
|
||||
#
|
||||
# 2) Cortex-A55 manual mentions that there is a forwarding path from
|
||||
# the ALU pipeline to the LD/ST pipeline. This is not implemented in
|
||||
# the LLVM scheduling model.
|
||||
|
||||
# Dependency chain: ADD writes w8 -> AND reads w8 and writes w12 -> LDR uses
# w12 as its index register.
add w8, w8, #1
|
||||
and w12, w8, #0x3f
|
||||
ldr w14, [x10, w12, uxtw #2]
|
|
@ -0,0 +1,15 @@
|
|||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
|
||||
|
||||
# Two FABS on disjoint registers (s0/s1 and s2/s3): neither reads the other's
# result.
fabs s0, s1
|
||||
fabs s2, s3
|
||||
|
||||
# CHECK: Iterations: 1000
|
||||
# CHECK-NEXT: Instructions: 2000
|
||||
# CHECK-NEXT: Total Cycles: 1004
|
||||
# CHECK-NEXT: Total uOps: 2000
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.99
|
||||
# CHECK-NEXT: IPC: 1.99
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
Loading…
Reference in New Issue