forked from OSchip/llvm-project
[X86] Add patterns for folding full vector load into MOVHPS and MOVLPS with SSE1 only.
llvm-svn: 337320
This commit is contained in:
parent
c0f2e306f2
commit
9ef92865ec
|
@ -6452,7 +6452,9 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr,
|
||||||
Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>, EVEX_4V;
|
Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>, EVEX_4V;
|
||||||
}
|
}
|
||||||
|
|
||||||
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
|
// No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in
|
||||||
|
// SSE1. And the MOVLPS pattern is even more complex.
|
||||||
|
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag,
|
||||||
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
|
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
|
||||||
defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
|
defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
|
||||||
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
|
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
|
||||||
|
|
|
@ -661,19 +661,16 @@ let Predicates = [UseSSE1] in {
|
||||||
// SSE 1 & 2 - Move Low packed FP Instructions
|
// SSE 1 & 2 - Move Low packed FP Instructions
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
|
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
|
||||||
string base_opc, string asm_opr> {
|
string base_opc, string asm_opr> {
|
||||||
|
// No pattern as they need to be special-cased between high and low.
|
||||||
let hasSideEffects = 0, mayLoad = 1 in
|
let hasSideEffects = 0, mayLoad = 1 in
|
||||||
def PSrm : PI<opc, MRMSrcMem,
|
def PSrm : PI<opc, MRMSrcMem,
|
||||||
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
|
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
|
||||||
!strconcat(base_opc, "s", asm_opr),
|
!strconcat(base_opc, "s", asm_opr),
|
||||||
[(set VR128:$dst,
|
[], SSEPackedSingle>, PS,
|
||||||
(psnode VR128:$src1,
|
Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
|
||||||
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
|
|
||||||
SSEPackedSingle>, PS,
|
|
||||||
Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
|
|
||||||
|
|
||||||
let hasSideEffects = 0, mayLoad = 1 in
|
|
||||||
def PDrm : PI<opc, MRMSrcMem,
|
def PDrm : PI<opc, MRMSrcMem,
|
||||||
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
|
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
|
||||||
!strconcat(base_opc, "d", asm_opr),
|
!strconcat(base_opc, "d", asm_opr),
|
||||||
|
@ -683,19 +680,19 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
|
||||||
Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
|
Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator psnode,
|
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
|
||||||
SDPatternOperator pdnode, string base_opc> {
|
string base_opc> {
|
||||||
let Predicates = [UseAVX] in
|
let Predicates = [UseAVX] in
|
||||||
defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
|
defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
|
||||||
"\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
|
"\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
|
||||||
VEX_4V, VEX_WIG;
|
VEX_4V, VEX_WIG;
|
||||||
|
|
||||||
let Constraints = "$src1 = $dst" in
|
let Constraints = "$src1 = $dst" in
|
||||||
defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
|
defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
|
||||||
"\t{$src2, $dst|$dst, $src2}">;
|
"\t{$src2, $dst|$dst, $src2}">;
|
||||||
}
|
}
|
||||||
|
|
||||||
defm MOVL : sse12_mov_hilo_packed<0x12, null_frag, X86Movsd, "movlp">;
|
defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
|
||||||
|
|
||||||
let SchedRW = [WriteFStore] in {
|
let SchedRW = [WriteFStore] in {
|
||||||
let Predicates = [UseAVX] in {
|
let Predicates = [UseAVX] in {
|
||||||
|
@ -725,13 +722,18 @@ let Predicates = [UseSSE1] in {
|
||||||
def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
|
def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
|
||||||
(iPTR 0))), addr:$src1),
|
(iPTR 0))), addr:$src1),
|
||||||
(MOVLPSmr addr:$src1, VR128:$src2)>;
|
(MOVLPSmr addr:$src1, VR128:$src2)>;
|
||||||
|
|
||||||
|
// This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
|
||||||
|
// end up with a movsd or blend instead of shufp.
|
||||||
|
def : Pat<(X86Shufp (memopv4f32 addr:$src2), VR128:$src1, (i8 -28)),
|
||||||
|
(MOVLPSrm VR128:$src1, addr:$src2)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// SSE 1 & 2 - Move Hi packed FP Instructions
|
// SSE 1 & 2 - Move Hi packed FP Instructions
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Unpckl, "movhp">;
|
defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
|
||||||
|
|
||||||
let SchedRW = [WriteFStore] in {
|
let SchedRW = [WriteFStore] in {
|
||||||
// v2f64 extract element 1 is always custom lowered to unpack high to low
|
// v2f64 extract element 1 is always custom lowered to unpack high to low
|
||||||
|
@ -796,6 +798,11 @@ let Predicates = [UseSSE1] in {
|
||||||
def : Pat<(X86Movlhps VR128:$src1,
|
def : Pat<(X86Movlhps VR128:$src1,
|
||||||
(bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
|
(bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
|
||||||
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
||||||
|
|
||||||
|
// This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
|
||||||
|
// end up with a movsd or blend instead of shufp.
|
||||||
|
def : Pat<(X86Movlhps VR128:$src1, (memopv4f32 addr:$src2)),
|
||||||
|
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
let Predicates = [UseSSE2] in {
|
let Predicates = [UseSSE2] in {
|
||||||
|
|
|
@ -298,9 +298,7 @@ define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
|
||||||
define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
|
define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
|
||||||
; SSE1-LABEL: shuffle_mem_v4f32_4523:
|
; SSE1-LABEL: shuffle_mem_v4f32_4523:
|
||||||
; SSE1: # %bb.0:
|
; SSE1: # %bb.0:
|
||||||
; SSE1-NEXT: movaps (%rdi), %xmm1
|
; SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
|
||||||
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
|
|
||||||
; SSE1-NEXT: movaps %xmm1, %xmm0
|
|
||||||
; SSE1-NEXT: retq
|
; SSE1-NEXT: retq
|
||||||
%b = load <4 x float>, <4 x float>* %pb, align 16
|
%b = load <4 x float>, <4 x float>* %pb, align 16
|
||||||
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
|
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
|
||||||
|
|
Loading…
Reference in New Issue