forked from OSchip/llvm-project
Canonicalize X86ISD::MOVDDUP nodes to v2f64 to make sure all cases match. Also eliminate unneeded isel patterns. rdar://8520311
llvm-svn: 115977
This commit is contained in:
parent
a6769bb993
commit
5c31bf0619
|
@ -5083,6 +5083,7 @@ static bool MayFoldVectorLoad(SDValue V) {
|
||||||
// uses while it only has one, use this version, and let isel match
|
// uses while it only has one, use this version, and let isel match
|
||||||
// another instruction if the load really happens to have more than
|
// another instruction if the load really happens to have more than
|
||||||
// one use. Remove this version after this bug gets fixed.
|
// one use. Remove this version after this bug gets fixed.
|
||||||
|
// rdar://8434668, PR8156
|
||||||
static bool RelaxedMayFoldVectorLoad(SDValue V) {
|
static bool RelaxedMayFoldVectorLoad(SDValue V) {
|
||||||
if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT)
|
if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT)
|
||||||
V = V.getOperand(0);
|
V = V.getOperand(0);
|
||||||
|
@ -5169,6 +5170,17 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
|
||||||
|
EVT VT = Op.getValueType();
|
||||||
|
|
||||||
|
// Canonicalize to v2f64.
|
||||||
|
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, V1);
|
||||||
|
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
|
||||||
|
getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
|
||||||
|
V1, DAG));
|
||||||
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
|
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
|
||||||
bool HasSSE2) {
|
bool HasSSE2) {
|
||||||
|
@ -5309,7 +5321,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
|
||||||
if (VT.getVectorNumElements() <= 4)
|
if (VT.getVectorNumElements() <= 4)
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
// Canonize all of the remaining to v4f32.
|
// Canonicalize all of the remaining to v4f32.
|
||||||
return PromoteSplat(SVOp, DAG);
|
return PromoteSplat(SVOp, DAG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5394,7 +5406,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
|
||||||
|
|
||||||
if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
|
if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
|
||||||
RelaxedMayFoldVectorLoad(V1))
|
RelaxedMayFoldVectorLoad(V1))
|
||||||
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
|
return getMOVDDup(Op, dl, V1, DAG);
|
||||||
|
|
||||||
if (X86::isMOVHLPS_v_undef_Mask(SVOp))
|
if (X86::isMOVHLPS_v_undef_Mask(SVOp))
|
||||||
return getMOVHighToLow(Op, dl, DAG);
|
return getMOVHighToLow(Op, dl, DAG);
|
||||||
|
|
|
@ -5537,19 +5537,14 @@ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
|
||||||
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
|
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
|
||||||
(MOVDDUPrm addr:$src)>;
|
(MOVDDUPrm addr:$src)>;
|
||||||
|
|
||||||
def : Pat<(X86Movddup (bc_v4f32 (memopv2f64 addr:$src))),
|
def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
|
||||||
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
|
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
|
||||||
def : Pat<(X86Movddup (bc_v4f32 (memopv2f64 addr:$src))),
|
def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
|
||||||
(MOVDDUPrm addr:$src)>;
|
(MOVDDUPrm addr:$src)>;
|
||||||
|
|
||||||
def : Pat<(X86Movddup (memopv2i64 addr:$src)),
|
def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
|
||||||
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
|
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
|
||||||
def : Pat<(X86Movddup (memopv2i64 addr:$src)),
|
def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
|
||||||
(MOVDDUPrm addr:$src)>;
|
|
||||||
|
|
||||||
def : Pat<(X86Movddup (bc_v4i32 (memopv2i64 addr:$src))),
|
|
||||||
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
|
|
||||||
def : Pat<(X86Movddup (bc_v4i32 (memopv2i64 addr:$src))),
|
|
||||||
(MOVDDUPrm addr:$src)>;
|
(MOVDDUPrm addr:$src)>;
|
||||||
|
|
||||||
def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
|
def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
|
||||||
|
@ -5564,6 +5559,7 @@ def : Pat<(X86Movddup (bc_v2f64
|
||||||
(v2i64 (scalar_to_vector (loadi64 addr:$src))))),
|
(v2i64 (scalar_to_vector (loadi64 addr:$src))))),
|
||||||
(MOVDDUPrm addr:$src)>;
|
(MOVDDUPrm addr:$src)>;
|
||||||
|
|
||||||
|
|
||||||
// Shuffle with UNPCKLPS
|
// Shuffle with UNPCKLPS
|
||||||
def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
|
def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
|
||||||
(VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
|
(VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
|
||||||
|
@ -5675,14 +5671,11 @@ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
|
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
|
(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
|
||||||
// FIXME: Instead of X86Movddup, there should be a X86Movlhps here, the problem
|
|
||||||
|
// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
|
||||||
// is during lowering, where it's not possible to recognize the load fold because
|
// is during lowering, where it's not possible to recognize the load fold because
|
||||||
// it has two uses through a bitcast. One use disappears at isel time and the
|
// it has two uses through a bitcast. One use disappears at isel time and the
|
||||||
// fold opportunity reappears.
|
// fold opportunity reappears.
|
||||||
def : Pat<(v2i64 (X86Movddup VR128:$src)),
|
|
||||||
(MOVLHPSrr VR128:$src, VR128:$src)>;
|
|
||||||
def : Pat<(v4f32 (X86Movddup VR128:$src)),
|
|
||||||
(MOVLHPSrr VR128:$src, VR128:$src)>;
|
|
||||||
def : Pat<(v2f64 (X86Movddup VR128:$src)),
|
def : Pat<(v2f64 (X86Movddup VR128:$src)),
|
||||||
(UNPCKLPDrr VR128:$src, VR128:$src)>;
|
(UNPCKLPDrr VR128:$src, VR128:$src)>;
|
||||||
|
|
||||||
|
@ -5690,6 +5683,7 @@ def : Pat<(v2f64 (X86Movddup VR128:$src)),
|
||||||
def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
|
def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
|
||||||
(scalar_to_vector (loadf64 addr:$src2)))),
|
(scalar_to_vector (loadf64 addr:$src2)))),
|
||||||
(MOVHPDrm VR128:$src1, addr:$src2)>;
|
(MOVHPDrm VR128:$src1, addr:$src2)>;
|
||||||
|
|
||||||
// FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem
|
// FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem
|
||||||
// is during lowering, where it's not possible to recognize the load fold because
|
// is during lowering, where it's not possible to recognize the load fold because
|
||||||
// it has two uses through a bitcast. One use disappears at isel time and the
|
// it has two uses through a bitcast. One use disappears at isel time and the
|
||||||
|
|
|
@ -169,7 +169,7 @@ define internal void @t10() nounwind {
|
||||||
ret void
|
ret void
|
||||||
; X64: t10:
|
; X64: t10:
|
||||||
; X64: pextrw $4, %xmm0, %eax
|
; X64: pextrw $4, %xmm0, %eax
|
||||||
; X64: movlhps %xmm1, %xmm1
|
; X64: unpcklpd %xmm1, %xmm1
|
||||||
; X64: pshuflw $8, %xmm1, %xmm1
|
; X64: pshuflw $8, %xmm1, %xmm1
|
||||||
; X64: pinsrw $2, %eax, %xmm1
|
; X64: pinsrw $2, %eax, %xmm1
|
||||||
; X64: pextrw $6, %xmm0, %eax
|
; X64: pextrw $6, %xmm0, %eax
|
||||||
|
@ -260,3 +260,18 @@ entry:
|
||||||
; X64: pinsrw $1, %eax, %xmm0
|
; X64: pinsrw $1, %eax, %xmm0
|
||||||
; X64: ret
|
; X64: ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; rdar://8520311
|
||||||
|
define <4 x i32> @t17() nounwind {
|
||||||
|
entry:
|
||||||
|
; X64: t17:
|
||||||
|
; X64: movddup (%rax), %xmm0
|
||||||
|
%tmp1 = load <4 x float>* undef, align 16
|
||||||
|
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
|
||||||
|
%tmp3 = load <4 x float>* undef, align 16
|
||||||
|
%tmp4 = shufflevector <4 x float> %tmp2, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
|
||||||
|
%tmp5 = bitcast <4 x float> %tmp3 to <4 x i32>
|
||||||
|
%tmp6 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
|
||||||
|
%tmp7 = and <4 x i32> %tmp6, <i32 undef, i32 undef, i32 -1, i32 0>
|
||||||
|
ret <4 x i32> %tmp7
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue