forked from OSchip/llvm-project
Add scheduler classes to integer/float horizontal operations.
This patch will close PR32801. Differential Revision: https://reviews.llvm.org/D33203 llvm-svn: 304986
This commit is contained in:
parent
c41b67cc52
commit
8cb1d0931f
|
@ -5183,14 +5183,14 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
|
|||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
|
||||
Sched<[WriteFAdd]>;
|
||||
Sched<[WriteFHAdd]>;
|
||||
|
||||
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
|
||||
!if(Is2Addr,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
|
||||
IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
|
||||
IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
|
||||
}
|
||||
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
|
||||
X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
|
||||
|
@ -5200,14 +5200,14 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
|
|||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
|
||||
Sched<[WriteFAdd]>;
|
||||
Sched<[WriteFHAdd]>;
|
||||
|
||||
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
|
||||
!if(Is2Addr,
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
|
||||
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
|
||||
IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
|
||||
IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX] in {
|
||||
|
@ -5310,7 +5310,7 @@ defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>;
|
|||
// SSSE3 - Packed Binary Operator Instructions
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
let Sched = WriteVecALU in {
|
||||
let Sched = WritePHAdd in {
|
||||
def SSE_PHADDSUBD : OpndItins<
|
||||
IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
|
||||
>;
|
||||
|
|
|
@ -1488,6 +1488,39 @@ def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
|
|||
|
||||
//-- Arithmetic instructions --//
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal add/sub instructions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// HADD, HSUB PS/PD
|
||||
// x,x / v,v,v.
|
||||
def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
|
||||
let Latency = 5;
|
||||
let NumMicroOps = 3;
|
||||
let ResourceCycles = [1, 2];
|
||||
}
|
||||
|
||||
// x,m / v,v,m.
|
||||
def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
|
||||
let Latency = 9;
|
||||
let NumMicroOps = 4;
|
||||
let ResourceCycles = [1, 2, 1];
|
||||
}
|
||||
|
||||
// PHADD|PHSUB (S) W/D.
|
||||
// v <- v,v.
|
||||
def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
|
||||
let Latency = 3;
|
||||
let NumMicroOps = 3;
|
||||
let ResourceCycles = [1, 2];
|
||||
}
|
||||
// v <- v,m.
|
||||
def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
|
||||
let Latency = 6;
|
||||
let NumMicroOps = 3;
|
||||
let ResourceCycles = [1, 2, 1];
|
||||
}
|
||||
|
||||
// PHADD|PHSUB (S) W/D.
|
||||
// v <- v,v.
|
||||
def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
|
||||
|
|
|
@ -157,6 +157,31 @@ def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
|
|||
let ResourceCycles = [1, 1, 1, 1];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal add/sub instructions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// HADD, HSUB PS/PD
|
||||
// x,x / v,v,v.
|
||||
def : WriteRes<WriteFHAdd, [SBPort1]> {
|
||||
let Latency = 3;
|
||||
}
|
||||
|
||||
// x,m / v,v,m.
|
||||
def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
|
||||
let Latency = 7;
|
||||
let ResourceCycles = [1, 1];
|
||||
}
|
||||
|
||||
// PHADD|PHSUB (S) W/D.
|
||||
// v <- v,v.
|
||||
def : WriteRes<WritePHAdd, [SBPort15]>;
|
||||
|
||||
// v <- v,m.
|
||||
def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
|
||||
let Latency = 5;
|
||||
let ResourceCycles = [1, 1];
|
||||
}
|
||||
|
||||
// String instructions.
|
||||
// Packed Compare Implicit Length Strings, Return Mask
|
||||
def : WriteRes<WritePCmpIStrM, [SBPort015]> {
|
||||
|
|
|
@ -77,6 +77,10 @@ defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
|
|||
// FMA Scheduling helper class.
|
||||
class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
|
||||
|
||||
// Horizontal Add/Sub (float and integer)
|
||||
defm WriteFHAdd : X86SchedWritePair;
|
||||
defm WritePHAdd : X86SchedWritePair;
|
||||
|
||||
// Vector integer operations.
|
||||
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
|
||||
defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
|
||||
|
|
|
@ -319,6 +319,38 @@ def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
|
|||
let ResourceCycles = [1, 1];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal add/sub instructions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
def : WriteRes<WriteFHAdd, [JFPU0]> {
|
||||
let Latency = 3;
|
||||
}
|
||||
|
||||
def : WriteRes<WriteFHAddLd, [JLAGU, JFPU0]> {
|
||||
let Latency = 8;
|
||||
}
|
||||
|
||||
def : WriteRes<WritePHAdd, [JFPU01]> {
|
||||
let ResourceCycles = [1];
|
||||
}
|
||||
def : WriteRes<WritePHAddLd, [JLAGU, JFPU01 ]> {
|
||||
let Latency = 6;
|
||||
let ResourceCycles = [1, 1];
|
||||
}
|
||||
|
||||
def WriteFHAddY: SchedWriteRes<[JFPU0]> {
|
||||
let Latency = 3;
|
||||
let ResourceCycles = [2];
|
||||
}
|
||||
def : InstRW<[WriteFHAddY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>;
|
||||
|
||||
def WriteFHAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
|
||||
let Latency = 8;
|
||||
let ResourceCycles = [1, 2];
|
||||
}
|
||||
def : InstRW<[WriteFHAddYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Carry-less multiplication instructions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -137,6 +137,33 @@ defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
|
|||
defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
|
||||
defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal add/sub instructions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// HADD, HSUB PS/PD
|
||||
|
||||
def : WriteRes<WriteFHAdd, [FPC_RSV01]> {
|
||||
let Latency = 3;
|
||||
let ResourceCycles = [2];
|
||||
}
|
||||
|
||||
def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> {
|
||||
let Latency = 6;
|
||||
let ResourceCycles = [2, 1];
|
||||
}
|
||||
|
||||
// PHADD|PHSUB (S) W/D.
|
||||
def : WriteRes<WritePHAdd, [FPC_RSV01]> {
|
||||
let Latency = 1;
|
||||
let ResourceCycles = [1];
|
||||
}
|
||||
|
||||
def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> {
|
||||
let Latency = 4;
|
||||
let ResourceCycles = [1, 1];
|
||||
}
|
||||
|
||||
// String instructions.
|
||||
// Packed Compare Implicit Length Strings, Return Mask
|
||||
def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
|
||||
|
|
|
@ -910,14 +910,14 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
|
|||
;
|
||||
; BTVER2-LABEL: test_haddpd:
|
||||
; BTVER2: # BB#0:
|
||||
; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; BTVER2-NEXT: retq # sched: [4:1.00]
|
||||
;
|
||||
; ZNVER1-LABEL: test_haddpd:
|
||||
; ZNVER1: # BB#0:
|
||||
; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; ZNVER1-NEXT: retq # sched: [4:1.00]
|
||||
%1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
|
||||
%2 = load <4 x double>, <4 x double> *%a2, align 32
|
||||
|
@ -941,14 +941,14 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
|
|||
;
|
||||
; BTVER2-LABEL: test_haddps:
|
||||
; BTVER2: # BB#0:
|
||||
; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; BTVER2-NEXT: retq # sched: [4:1.00]
|
||||
;
|
||||
; ZNVER1-LABEL: test_haddps:
|
||||
; ZNVER1: # BB#0:
|
||||
; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; ZNVER1-NEXT: retq # sched: [4:1.00]
|
||||
%1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
|
||||
%2 = load <8 x float>, <8 x float> *%a2, align 32
|
||||
|
@ -972,14 +972,14 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
|
|||
;
|
||||
; BTVER2-LABEL: test_hsubpd:
|
||||
; BTVER2: # BB#0:
|
||||
; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; BTVER2-NEXT: retq # sched: [4:1.00]
|
||||
;
|
||||
; ZNVER1-LABEL: test_hsubpd:
|
||||
; ZNVER1: # BB#0:
|
||||
; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; ZNVER1-NEXT: retq # sched: [4:1.00]
|
||||
%1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
|
||||
%2 = load <4 x double>, <4 x double> *%a2, align 32
|
||||
|
@ -1003,14 +1003,14 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
|
|||
;
|
||||
; BTVER2-LABEL: test_hsubps:
|
||||
; BTVER2: # BB#0:
|
||||
; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; BTVER2-NEXT: retq # sched: [4:1.00]
|
||||
;
|
||||
; ZNVER1-LABEL: test_hsubps:
|
||||
; ZNVER1: # BB#0:
|
||||
; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
|
||||
; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
|
||||
; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
|
||||
; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
|
||||
; ZNVER1-NEXT: retq # sched: [4:1.00]
|
||||
%1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
|
||||
%2 = load <8 x float>, <8 x float> *%a2, align 32
|
||||
|
|
Loading…
Reference in New Issue