[X86] Add a DAG combine to turn vzmovl+load into vzload if the load isn't volatile. Remove isel patterns for vzmovl+load
We currently have isel patterns that treat vzmovl+load the same as vzload, but that shrinks the load, which we shouldn't do if the load is volatile. Rather than adding isel checks for volatile, this patch removes those patterns and teaches DAG combine to merge vzmovl+load into a vzload when it is legal to do so.

Differential Revision: https://reviews.llvm.org/D63665

llvm-svn: 364333
commit 14ea14ae85
parent 37340e3cd6
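To illustrate the intent of the change (this example is not part of the commit; the function name, IR, and expected assembly below are assumptions written in the style of the existing non-volatile tests): a vzmovl-style shuffle of a non-volatile full-vector load should still compile to a single zero-extending scalar load, now via the new DAG combine rather than via the removed isel patterns, while the volatile variants in the test diff below must keep the full-width load.

; Hypothetical non-volatile counterpart of the volatile tests further down.
; The shuffle keeps element 0 of %X and zeroes the rest, which lowers to
; X86ISD::VZEXT_MOVL of a load; the new combine folds the pair to a vzload.
define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32>* %ptr) {
entry:
  %X = load <4 x i32>, <4 x i32>* %ptr
  %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
  ret <4 x i32> %Y
}
; Expected codegen (assumption), SSE: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; Expected codegen (assumption), AVX: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero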
@@ -33675,6 +33675,26 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                       Movl, N->getOperand(0).getOperand(2));
  }

  // If this a vzmovl of a full vector load, replace it with a vzload, unless
  // the load is volatile.
  if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
      ISD::isNormalLoad(N->getOperand(0).getNode())) {
    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
    if (!LN->isVolatile()) {
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
      SDValue VZLoad =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                  VT.getVectorElementType(),
                                  LN->getPointerInfo(),
                                  LN->getAlignment(),
                                  MachineMemOperand::MOLoad);
      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 0), VZLoad.getValue(1));
      return VZLoad;
    }
  }

  // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
  // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
  // FIXME: This can probably go away once we default to widening legalization.

@@ -4317,15 +4317,11 @@ let Predicates = [HasAVX512] in {
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (VMOVSSZrm addr:$src)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (VMOVSSZrm addr:$src)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (VMOVSDZrm addr:$src)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (VMOVSDZrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types

@@ -4363,14 +4359,10 @@ let Predicates = [HasAVX512] in {
            (VMOVDI2PDIZrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VMOVDI2PDIZrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
            (VMOVDI2PDIZrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (VMOVDI2PDIZrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVQI2PQIZrm addr:$src)>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
            (VMOVZPQILo2PQIZrr VR128X:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),

@@ -267,8 +267,6 @@ let Predicates = [UseAVX] in {
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v4f32 (X86vzload addr:$src)),
            (VMOVSSrm addr:$src)>;

@@ -276,8 +274,6 @@ let Predicates = [UseAVX] in {
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (VMOVSDrm addr:$src)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (VMOVSDrm addr:$src)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (VMOVSDrm addr:$src)>;

@@ -321,16 +317,12 @@ let Predicates = [UseSSE1] in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (MOVSSrm addr:$src)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (MOVSSrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (MOVSDrm addr:$src)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (MOVSDrm addr:$src)>;
}

//===----------------------------------------------------------------------===//

@@ -4145,8 +4137,6 @@ let Predicates = [UseAVX] in {
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload addr:$src)),

@@ -4163,8 +4153,6 @@ let Predicates = [UseSSE2] in {
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

@@ -4233,8 +4221,6 @@ def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload addr:$src)),

@@ -4245,8 +4231,6 @@ let Predicates = [UseAVX] in {
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),

@@ -37,16 +37,33 @@ entry:
  ret <2 x i64>%Y
}

; FIXME: We shouldn't shrink the load to movss here since it is volatile.
define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(<4 x i32> *%ptr) {
; SSE-LABEL: load_zmov_4i32_to_0zzz_volatile:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
; SSE2-LABEL: load_zmov_4i32_to_0zzz_volatile:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movaps (%rdi), %xmm1
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_zmov_4i32_to_0zzz_volatile:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movaps (%rdi), %xmm1
; SSSE3-NEXT:    xorps %xmm0, %xmm0
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_zmov_4i32_to_0zzz_volatile:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movaps (%rdi), %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
entry:
  %X = load volatile <4 x i32>, <4 x i32>* %ptr

@@ -54,16 +71,17 @@ entry:
  ret <4 x i32>%Y
}

; FIXME: We shouldn't shrink the load to movsd here since it is volatile.
define <2 x i64> @load_zmov_2i64_to_0z_volatile(<2 x i64> *%ptr) {
; SSE-LABEL: load_zmov_2i64_to_0z_volatile:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load_zmov_2i64_to_0z_volatile:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
entry:
  %X = load volatile <2 x i64>, <2 x i64>* %ptr