forked from OSchip/llvm-project
[X86][XOP] Tidy up VPHADD/VPHSUB unary horizontal ops default schedule class
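Based on Agner Fog's instruction tables and the AMD Software Optimization Guide (SoG) tables, the XOP VPHADD/VPHSUB unary horizontal ops are as fast as basic arithmetic ops, not the slower SSSE3 binary horizontal add/sub ops, so their default schedule class is changed from SchedWritePHAdd to SchedWriteVecALU. This also matches what the bdver2 model already lists. Noticed while investigating reduction add optimizations.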
This commit is contained in:
parent
a8b4f5bbab
commit
0c9c92ffc0
|
@ -13,11 +13,11 @@
|
|||
multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
|
||||
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
|
||||
[(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWriteVecALU.XMM]>;
|
||||
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
|
||||
Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
|
||||
Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
|
||||
}
|
||||
|
||||
let ExeDomain = SSEPackedInt in {
|
||||
|
|
|
@ -267,36 +267,36 @@ vpshlw %xmm0, (%rax), %xmm3
|
|||
# CHECK-NEXT: 1 1 1.00 vpermil2ps $0, %ymm0, %ymm1, %ymm2, %ymm3
|
||||
# CHECK-NEXT: 2 8 1.00 * vpermil2ps $0, (%rax), %ymm0, %ymm1, %ymm3
|
||||
# CHECK-NEXT: 2 8 1.00 * vpermil2ps $0, %ymm0, (%rax), %ymm1, %ymm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddbd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddbd (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddbq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddbq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddbw (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphadddq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphadddq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddubd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddubd (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddubq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddubq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddubw %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddubw (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddudq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddudq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphadduwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphadduwd (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphadduwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphadduwq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddwd (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphaddwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphaddwq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphsubbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphsubbw (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphsubdq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphsubdq (%rax), %xmm3
|
||||
# CHECK-NEXT: 3 3 1.50 vphsubwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 4 9 1.50 * vphsubwd (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddbd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddbd (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddbq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddbq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddbw (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphadddq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphadddq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddubd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddubd (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddubq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddubq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddubw %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddubw (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddudq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddudq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphadduwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphadduwd (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphadduwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphadduwq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddwd (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphaddwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphaddwq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphsubbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphsubbw (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphsubdq %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphsubdq (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 1 0.50 vphsubwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: 2 7 0.50 * vphsubwd (%rax), %xmm3
|
||||
# CHECK-NEXT: 1 5 1.00 vpmacsdd %xmm0, %xmm1, %xmm2, %xmm3
|
||||
# CHECK-NEXT: 2 11 1.00 * vpmacsdd %xmm0, (%rax), %xmm1, %xmm3
|
||||
# CHECK-NEXT: 1 5 1.00 vpmacsdqh %xmm0, %xmm1, %xmm2, %xmm3
|
||||
|
@ -381,7 +381,7 @@ vpshlw %xmm0, (%rax), %xmm3
|
|||
|
||||
# CHECK: Resource pressure per iteration:
|
||||
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
|
||||
# CHECK-NEXT: - - 68.00 68.00 - 71.00 41.50 41.50
|
||||
# CHECK-NEXT: - - 68.00 38.00 - 41.00 41.50 41.50
|
||||
|
||||
# CHECK: Resource pressure by instruction:
|
||||
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
|
||||
|
@ -431,36 +431,36 @@ vpshlw %xmm0, (%rax), %xmm3
|
|||
# CHECK-NEXT: - - - - - 1.00 - - vpermil2ps $0, %ymm0, %ymm1, %ymm2, %ymm3
|
||||
# CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpermil2ps $0, (%rax), %ymm0, %ymm1, %ymm3
|
||||
# CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpermil2ps $0, %ymm0, (%rax), %ymm1, %ymm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddbd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddbd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddbq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddbq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddbw (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphadddq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphadddq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddubd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddubd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddubq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddubq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddubw %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddubw (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddudq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddudq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphadduwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphadduwd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphadduwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphadduwq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddwd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddwq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphsubbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphsubbw (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphsubdq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphsubdq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 - - vphsubwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphsubwd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddbd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddbd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddbq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddbq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddbw (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphadddq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphadddq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddubd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddubd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddubq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddubq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddubw %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddubw (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddudq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddudq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphadduwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphadduwd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphadduwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphadduwq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddwd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddwq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddwq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphsubbw %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphsubbw (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphsubdq %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphsubdq (%rax), %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 - - vphsubwd %xmm0, %xmm3
|
||||
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphsubwd (%rax), %xmm3
|
||||
# CHECK-NEXT: - - 1.00 - - - - - vpmacsdd %xmm0, %xmm1, %xmm2, %xmm3
|
||||
# CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vpmacsdd %xmm0, (%rax), %xmm1, %xmm3
|
||||
# CHECK-NEXT: - - 1.00 - - - - - vpmacsdqh %xmm0, %xmm1, %xmm2, %xmm3
|
||||
|
|
Loading…
Reference in New Issue