commit 627d8168e7 (parent 2efd2957ef)

[X86] Add load folding isel patterns to scalar_math_patterns and AVX512_scalar_math_fp_patterns.

Also add a FIXME for the peephole pass not being able to handle this.

llvm-svn: 363032
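
As a quick illustration (not part of the commit), the code shape these new rm_Int/Zrm_Int patterns target is a scalar load feeding a scalar FP op whose result is re-inserted into element 0 of a vector. A minimal sketch in LLVM IR follows; the function and value names are hypothetical, only the overall shape is taken from the patterns below:

; Hypothetical example: the scalar operand comes from memory, so instead of
; "movss (%rdi), %xmm1" followed by "addss %xmm1, %xmm0", instruction selection
; can now emit "addss (%rdi), %xmm0" directly, without relying on the peephole pass.
define <4 x float> @fold_add_ss(<4 x float> %a, float* %p) {
  %ld  = load float, float* %p                            ; scalar load (the ld_frag/ScalarLdFrag operand)
  %ext = extractelement <4 x float> %a, i32 0             ; element 0 of the vector
  %op  = fadd float %ext, %ld                             ; the scalar math op (Op)
  %ins = insertelement <4 x float> %a, float %op, i32 0   ; movss-style re-insert (MoveNode)
  ret <4 x float> %ins
}
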
@@ -11871,6 +11871,12 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
                         _.FRC:$src)))),
             (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
              (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+  def : Pat<(MoveNode
+             (_.VT VR128X:$dst),
+             (_.VT (scalar_to_vector
+                    (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+                        (_.ScalarLdFrag addr:$src))))),
+            (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
 
   // extracted masked scalar math op with insert via movss
   def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11884,6 +11890,16 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
         (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
         VK1WM:$mask, _.VT:$src1,
         (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+  def : Pat<(MoveNode (_.VT VR128X:$src1),
+             (scalar_to_vector
+              (X86selects VK1WM:$mask,
+                          (Op (_.EltVT
+                               (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+                              (_.ScalarLdFrag addr:$src2)),
+                          _.FRC:$src0))),
+    (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+        (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+        VK1WM:$mask, _.VT:$src1, addr:$src2)>;
 
   // extracted masked scalar math op with insert via movss
   def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11895,6 +11911,13 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
     (!cast<I>("V"#OpcPrefix#Zrr_Intkz)
         VK1WM:$mask, _.VT:$src1,
         (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+  def : Pat<(MoveNode (_.VT VR128X:$src1),
+             (scalar_to_vector
+              (X86selects VK1WM:$mask,
+                          (Op (_.EltVT
+                               (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+                              (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+    (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
   }
 }
 

@@ -4685,6 +4685,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
                                   &RI, MF);
     unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
     if (Size < RCSize) {
+      // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
       // Check if it's safe to fold the load. If the size of the object is
       // narrower than the load width, then it's not.
      if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)

@@ -2692,7 +2692,8 @@ let isCodeGenOnly = 1 in {
 // patterns we have to try to match.
 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                 ValueType VT, ValueType EltTy,
-                                RegisterClass RC, Predicate BasePredicate> {
+                                RegisterClass RC, PatFrag ld_frag,
+                                Predicate BasePredicate> {
   let Predicates = [BasePredicate] in {
     // extracted scalar math op with insert via movss/movsd
     def : Pat<(VT (Move (VT VR128:$dst),
@@ -2701,6 +2702,11 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                  RC:$src))))),
               (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 (ld_frag addr:$src)))))),
+              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 
   // Repeat for AVX versions of the instructions.
@@ -2712,18 +2718,23 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                  RC:$src))))),
               (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 (ld_frag addr:$src)))))),
+              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 }
 
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
 
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
 
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to

@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
 
 ; Verify that we're folding the load into the math instruction.
 ; This pattern is generated out of the simplest intrinsics usage:

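For reference, the blend_* checks updated below come from test functions that take the scalar operand as a plain float/double argument; on 32-bit x86 that argument is passed on the stack, which is why the expected code now folds it straight from (%esp). A rough, hedged sketch of such a function (the exact IR body is not shown in this diff; only the signature appears in the hunk context):

; Illustrative only: %b lives on the stack under the 32-bit calling convention,
; so the add can now be selected as "addss {{[0-9]+}}(%esp), %xmm0" instead of
; a separate movss load followed by a register-register addss.
define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
  %ext = extractelement <4 x float> %a, i32 0             ; element 0 of %a
  %op  = fadd float %ext, %b                              ; scalar add
  %ins = insertelement <4 x float> %a, float %op, i32 0   ; blend the result back in
  ret <4 x float> %ins
}
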
@@ -414,14 +412,12 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_add_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    addss %xmm1, %xmm0
+; X86-SSE-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_add_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_add_ss:
@@ -444,14 +442,12 @@ define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_sub_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    subss %xmm1, %xmm0
+; X86-SSE-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_sub_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_sub_ss:
@@ -474,14 +470,12 @@ define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_mul_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    mulss %xmm1, %xmm0
+; X86-SSE-NEXT:    mulss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_mul_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_mul_ss:
@@ -504,14 +498,12 @@ define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_div_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    divss %xmm1, %xmm0
+; X86-SSE-NEXT:    divss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_div_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_div_ss:
@@ -534,14 +526,12 @@ define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_add_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    addsd %xmm1, %xmm0
+; X86-SSE-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_add_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_add_sd:
@@ -564,14 +554,12 @@ define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_sub_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    subsd %xmm1, %xmm0
+; X86-SSE-NEXT:    subsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_sub_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_sub_sd:
@@ -594,14 +582,12 @@ define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_mul_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    mulsd %xmm1, %xmm0
+; X86-SSE-NEXT:    mulsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_mul_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_mul_sd:
@@ -624,14 +610,12 @@ define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_div_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    divsd %xmm1, %xmm0
+; X86-SSE-NEXT:    divsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_div_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_div_sd: