[X86] AMD Zen 3: MULX w/ mem operand has the same throughput as with reg op

llvm-exegesis is faulty: when measuring inverse throughput (throughput^-1),
it sometimes produces snippets that have loop-carried dependencies,
which must be what caused me to measure it incorrectly originally.

After looking much more carefully, the inverse throughput should match
that of the MULX w/ reg op.

As per llvm-exegesis measurements.
This commit is contained in:
Roman Lebedev 2021-08-27 13:23:27 +03:00
parent 0f04936a2d
commit d4d459e747
No known key found for this signature in database
GPG Key ID: 083C3EBB4A1689E0
4 changed files with 51 additions and 71 deletions

View File

@ -618,30 +618,10 @@ defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Intege
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
let Latency = !add(Znver3Model.LoadLatency, 3);
let ResourceCycles = [1, 1, 2];
let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX32rm, WriteIMulHLd,
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadAfterLd], (instrs MULX32rm)>;
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
let Latency = !add(Znver3Model.LoadLatency, 3);
let ResourceCycles = [1, 1, 2];
let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX64rm, WriteIMulHLd,
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadAfterLd], (instrs MULX64rm)>;
defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
defm : Zn3WriteResInt<WriteIMulHLd, [], !add(4, Znver3Model.LoadLatency), [], 0>; // Integer multiplication, high part.

View File

@ -17,13 +17,13 @@ add %rax, %rax
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 12
# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Total uOps: 3
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.25
# CHECK-NEXT: IPC: 0.17
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.18
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@ -34,7 +34,7 @@ add %rax, %rax
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 8 2.00 * mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: 2 8 1.00 * mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: 1 1 0.25 addl %eax, %eax
# CHECK: Resources:
@ -64,19 +64,19 @@ add %rax, %rax
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
# CHECK-NEXT: - - 1.00 - 2.00 - 1.00 - - - - - - - - - - 1.00 - - 1.00 - -
# CHECK-NEXT: - - 1.00 - 1.00 - 1.00 - - - - - - - - - - 1.00 - - 1.00 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
# CHECK-NEXT: - - 1.00 - 2.00 - - - - - - - - - - - - 1.00 - - 1.00 - - mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: - - 1.00 - 1.00 - - - - - - - - - - - - 1.00 - - 1.00 - - mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: - - - - - - 1.00 - - - - - - - - - - - - - - - - addl %eax, %eax
# CHECK: Timeline view:
# CHECK-NEXT: 01
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER. mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: [0,1] D========eER addl %eax, %eax
# CHECK: [0,0] DeeeeeeeeER mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: [0,1] D=======eER addl %eax, %eax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -86,20 +86,20 @@ add %rax, %rax
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxl (%rdi), %eax, %ecx
# CHECK-NEXT: 1. 1 9.0 0.0 0.0 addl %eax, %eax
# CHECK-NEXT: 1 5.0 0.5 0.0 <total>
# CHECK-NEXT: 1. 1 8.0 0.0 0.0 addl %eax, %eax
# CHECK-NEXT: 1 4.5 0.5 0.0 <total>
# CHECK: [1] Code Region
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 12
# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Total uOps: 3
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.25
# CHECK-NEXT: IPC: 0.17
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.18
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@ -110,7 +110,7 @@ add %rax, %rax
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 8 2.00 * mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: 2 8 1.00 * mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: 1 1 0.25 addq %rax, %rax
# CHECK: Resources:
@ -140,19 +140,19 @@ add %rax, %rax
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
# CHECK-NEXT: - - 1.00 - 2.00 - 1.00 - - - - - - - - - - 1.00 - - 1.00 - -
# CHECK-NEXT: - - 1.00 - 1.00 - 1.00 - - - - - - - - - - 1.00 - - 1.00 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
# CHECK-NEXT: - - 1.00 - 2.00 - - - - - - - - - - - - 1.00 - - 1.00 - - mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: - - 1.00 - 1.00 - - - - - - - - - - - - 1.00 - - 1.00 - - mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: - - - - - - 1.00 - - - - - - - - - - - - - - - - addq %rax, %rax
# CHECK: Timeline view:
# CHECK-NEXT: 01
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER. mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: [0,1] D========eER addq %rax, %rax
# CHECK: [0,0] DeeeeeeeeER mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: [0,1] D=======eER addq %rax, %rax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -162,5 +162,5 @@ add %rax, %rax
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxq (%rdi), %rax, %rcx
# CHECK-NEXT: 1. 1 9.0 0.0 0.0 addq %rax, %rax
# CHECK-NEXT: 1 5.0 0.5 0.0 <total>
# CHECK-NEXT: 1. 1 8.0 0.0 0.0 addq %rax, %rax
# CHECK-NEXT: 1 4.5 0.5 0.0 <total>

View File

@ -15,13 +15,13 @@ mulxq (%rdi), %rax, %rdx
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 14
# CHECK-NEXT: Total Cycles: 15
# CHECK-NEXT: Total uOps: 4
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.29
# CHECK-NEXT: IPC: 0.14
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.13
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@ -32,7 +32,7 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 8 2.00 * mulxl (%rdi), %eax, %edx
# CHECK-NEXT: 2 8 1.00 * mulxl (%rdi), %eax, %edx
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn3AGU0
@ -61,18 +61,18 @@ mulxq (%rdi), %rax, %rdx
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - -
# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - - mulxl (%rdi), %eax, %edx
# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - - mulxl (%rdi), %eax, %edx
# CHECK: Timeline view:
# CHECK-NEXT: 0123
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . mulxl (%rdi), %eax, %edx
# CHECK-NEXT: [1,0] D===eeeeeeeeER mulxl (%rdi), %eax, %edx
# CHECK-NEXT: [1,0] D====eeeeeeeeER mulxl (%rdi), %eax, %edx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -81,19 +81,19 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl (%rdi), %eax, %edx
# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxl (%rdi), %eax, %edx
# CHECK: [1] Code Region
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 14
# CHECK-NEXT: Total Cycles: 15
# CHECK-NEXT: Total uOps: 4
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.29
# CHECK-NEXT: IPC: 0.14
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.13
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@ -104,7 +104,7 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 8 2.00 * mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: 2 8 1.00 * mulxq (%rdi), %rax, %rdx
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn3AGU0
@ -133,18 +133,18 @@ mulxq (%rdi), %rax, %rdx
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - -
# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - - mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - - mulxq (%rdi), %rax, %rdx
# CHECK: Timeline view:
# CHECK-NEXT: 0123
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [1,0] D===eeeeeeeeER mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [1,0] D====eeeeeeeeER mulxq (%rdi), %rax, %rdx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -153,4 +153,4 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxq (%rdi), %rax, %rdx

View File

@ -63,9 +63,9 @@ shrx %rax, (%rbx), %rcx
# CHECK-NEXT: 1 1 0.50 bzhiq %rax, %rbx, %rcx
# CHECK-NEXT: 2 5 0.50 * bzhiq %rax, (%rbx), %rcx
# CHECK-NEXT: 2 4 1.00 mulxl %eax, %ebx, %ecx
# CHECK-NEXT: 2 8 2.00 * mulxl (%rax), %ebx, %ecx
# CHECK-NEXT: 2 8 1.00 * mulxl (%rax), %ebx, %ecx
# CHECK-NEXT: 2 4 1.00 mulxq %rax, %rbx, %rcx
# CHECK-NEXT: 2 8 2.00 * mulxq (%rax), %rbx, %rcx
# CHECK-NEXT: 2 8 1.00 * mulxq (%rax), %rbx, %rcx
# CHECK-NEXT: 1 3 1.00 pdepl %eax, %ebx, %ecx
# CHECK-NEXT: 1 5 0.33 * pdepl (%rax), %ebx, %ecx
# CHECK-NEXT: 1 3 1.00 pdepq %rax, %rbx, %rcx
@ -118,7 +118,7 @@ shrx %rax, (%rbx), %rcx
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
# CHECK-NEXT: 5.33 5.33 5.33 1.00 21.00 11.00 1.00 - - - - - - - - 5.33 5.33 5.33 5.33 5.33 5.33 - -
# CHECK-NEXT: 5.33 5.33 5.33 1.00 19.00 11.00 1.00 - - - - - - - - 5.33 5.33 5.33 5.33 5.33 5.33 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@ -127,9 +127,9 @@ shrx %rax, (%rbx), %rcx
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - - - - - - - - - - - bzhiq %rax, %rbx, %rcx
# CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - bzhiq %rax, (%rbx), %rcx
# CHECK-NEXT: - - - - 1.00 - - - - - - - - - - - - - - - - - - mulxl %eax, %ebx, %ecx
# CHECK-NEXT: 0.33 0.33 0.33 - 2.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - mulxl (%rax), %ebx, %ecx
# CHECK-NEXT: 0.33 0.33 0.33 - 1.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - mulxl (%rax), %ebx, %ecx
# CHECK-NEXT: - - - - 1.00 - - - - - - - - - - - - - - - - - - mulxq %rax, %rbx, %rcx
# CHECK-NEXT: 0.33 0.33 0.33 - 2.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - mulxq (%rax), %rbx, %rcx
# CHECK-NEXT: 0.33 0.33 0.33 - 1.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - mulxq (%rax), %rbx, %rcx
# CHECK-NEXT: - - - - 1.00 - - - - - - - - - - - - - - - - - - pdepl %eax, %ebx, %ecx
# CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - pdepl (%rax), %ebx, %ecx
# CHECK-NEXT: - - - - 1.00 - - - - - - - - - - - - - - - - - - pdepq %rax, %rbx, %rcx