[X86][SLM] Fix PBLENDVB uops and throughput

SLM PBLENDVB is just as bad as BLENDVPD/PS, so model it as such, and fix the rr vs rm uops difference as well (the rm form takes one more uop than the rr form). The Intel AoM appears to have a copy+paste typo with PBLENDW; it doesn't match Agner or InstLatX64.

Noticed while investigating some of the weird discrepancies reported by the D103695 helper script (SLM had much better vector shift throughputs than it should have).
This commit is contained in:
Simon Pilgrim 2021-09-02 18:07:40 +01:00
parent e28cd75a50
commit 6ba0b9f68a
2 changed files with 11 additions and 11 deletions
llvm
lib/Target/X86
test/tools/llvm-mca/X86/SLM

View File

@ -62,7 +62,7 @@ def : ReadAdvance<ReadInt2Fpu, 0>;
multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW, multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
list<ProcResourceKind> ExePorts, list<ProcResourceKind> ExePorts,
int Lat, list<int> Res = [1], int UOps = 1, int Lat, list<int> Res = [1], int UOps = 1,
int LoadLat = 3> { int LoadLat = 3, int LoadUOps = 0> {
// Register variant is using a single cycle on ExePort. // Register variant is using a single cycle on ExePort.
def : WriteRes<SchedRW, ExePorts> { def : WriteRes<SchedRW, ExePorts> {
let Latency = Lat; let Latency = Lat;
@ -75,7 +75,7 @@ multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW,
def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> { def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> {
let Latency = !add(Lat, LoadLat); let Latency = !add(Lat, LoadLat);
let ResourceCycles = !listconcat([1], Res); let ResourceCycles = !listconcat([1], Res);
let NumMicroOps = UOps; let NumMicroOps = !add(UOps, LoadUOps);
} }
} }
@ -280,7 +280,7 @@ defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendY>; defm : X86WriteResPairUnsupported<WriteFBlendY>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>; defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>; defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 2, 3, 1>;
defm : X86WriteResPairUnsupported<WriteFVarBlendY>; defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : X86WriteResPairUnsupported<WriteFShuffle256>; defm : X86WriteResPairUnsupported<WriteFShuffle256>;
@ -391,7 +391,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteBlendZ>; defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 4, [4], 2, 3, 1>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>; defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>; defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>; defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;

View File

@ -159,9 +159,9 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 * blendpd $11, (%rax), %xmm2 # CHECK-NEXT: 1 4 1.00 * blendpd $11, (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 blendps $11, %xmm0, %xmm2 # CHECK-NEXT: 1 1 1.00 blendps $11, %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * blendps $11, (%rax), %xmm2 # CHECK-NEXT: 1 4 1.00 * blendps $11, (%rax), %xmm2
# CHECK-NEXT: 3 4 4.00 blendvpd %xmm0, %xmm0, %xmm2 # CHECK-NEXT: 2 4 4.00 blendvpd %xmm0, %xmm0, %xmm2
# CHECK-NEXT: 3 7 4.00 * blendvpd %xmm0, (%rax), %xmm2 # CHECK-NEXT: 3 7 4.00 * blendvpd %xmm0, (%rax), %xmm2
# CHECK-NEXT: 3 4 4.00 blendvps %xmm0, %xmm0, %xmm2 # CHECK-NEXT: 2 4 4.00 blendvps %xmm0, %xmm0, %xmm2
# CHECK-NEXT: 3 7 4.00 * blendvps %xmm0, (%rax), %xmm2 # CHECK-NEXT: 3 7 4.00 * blendvps %xmm0, (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 dppd $22, %xmm0, %xmm2 # CHECK-NEXT: 1 3 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 1 6 1.00 * dppd $22, (%rax), %xmm2 # CHECK-NEXT: 1 6 1.00 * dppd $22, (%rax), %xmm2
@ -176,8 +176,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 1 10 1.00 * mpsadbw $1, (%rax), %xmm2 # CHECK-NEXT: 1 10 1.00 * mpsadbw $1, (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 packusdw %xmm0, %xmm2 # CHECK-NEXT: 1 1 1.00 packusdw %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * packusdw (%rax), %xmm2 # CHECK-NEXT: 1 4 1.00 * packusdw (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pblendvb %xmm0, %xmm0, %xmm2 # CHECK-NEXT: 2 4 4.00 pblendvb %xmm0, %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pblendvb %xmm0, (%rax), %xmm2 # CHECK-NEXT: 3 7 4.00 * pblendvb %xmm0, (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pblendw $11, %xmm0, %xmm2 # CHECK-NEXT: 1 1 1.00 pblendw $11, %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pblendw $11, (%rax), %xmm2 # CHECK-NEXT: 1 4 1.00 * pblendw $11, (%rax), %xmm2
# CHECK-NEXT: 2 4 2.00 pcmpeqq %xmm0, %xmm2 # CHECK-NEXT: 2 4 2.00 pcmpeqq %xmm0, %xmm2
@ -264,7 +264,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration: # CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7]
# CHECK-NEXT: - - - 90.00 25.00 - - 54.00 # CHECK-NEXT: - - - 96.00 25.00 - - 54.00
# CHECK: Resource pressure by instruction: # CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions:
@ -289,8 +289,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 mpsadbw $1, (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 mpsadbw $1, (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - packusdw %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - - - - packusdw %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 packusdw (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 packusdw (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - pblendvb %xmm0, %xmm0, %xmm2 # CHECK-NEXT: - - - 4.00 - - - - pblendvb %xmm0, %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 pblendvb %xmm0, (%rax), %xmm2 # CHECK-NEXT: - - - 4.00 - - - 1.00 pblendvb %xmm0, (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - - - - pblendw $11, %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - - - - pblendw $11, %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 - - - 1.00 pblendw $11, (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 pblendw $11, (%rax), %xmm2
# CHECK-NEXT: - - - 2.00 2.00 - - - pcmpeqq %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 2.00 - - - pcmpeqq %xmm0, %xmm2