diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90ef6df0d012..66f9612e0275 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5083,6 +5083,7 @@ static bool MayFoldVectorLoad(SDValue V) { // uses while it only has one, use this version, and let isel match // another instruction if the load really happens to have more than // one use. Remove this version after this bug get fixed. +// rdar://8434668, PR8156 static bool RelaxedMayFoldVectorLoad(SDValue V) { if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) V = V.getOperand(0); @@ -5169,6 +5170,17 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, return true; } +static +SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Canonicalize to v2f64; the MOVDDUP result is bitcast back to VT below. + V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, V1); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, + V1, DAG)); +} + static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { @@ -5309,7 +5321,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (VT.getVectorNumElements() <= 4) return SDValue(); - // Canonize all of the remaining to v4f32. + // Canonicalize all of the remaining to v4f32. 
return PromoteSplat(SVOp, DAG); } @@ -5394,7 +5406,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) - return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); + return getMOVDDup(Op, dl, V1, DAG); if (X86::isMOVHLPS_v_undef_Mask(SVOp)) return getMOVHighToLow(Op, dl, DAG); diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index ced3bb94d123..c37def16bf16 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5537,19 +5537,14 @@ def : Pat<(X86Movddup (memopv2f64 addr:$src)), def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; -def : Pat<(X86Movddup (bc_v4f32 (memopv2f64 addr:$src))), +def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; -def : Pat<(X86Movddup (bc_v4f32 (memopv2f64 addr:$src))), +def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), (MOVDDUPrm addr:$src)>; -def : Pat<(X86Movddup (memopv2i64 addr:$src)), +def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; -def : Pat<(X86Movddup (memopv2i64 addr:$src)), - (MOVDDUPrm addr:$src)>; - -def : Pat<(X86Movddup (bc_v4i32 (memopv2i64 addr:$src))), - (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; -def : Pat<(X86Movddup (bc_v4i32 (memopv2i64 addr:$src))), +def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), (MOVDDUPrm addr:$src)>; def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))), @@ -5564,6 +5559,7 @@ def : Pat<(X86Movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src))))), (MOVDDUPrm addr:$src)>; + // Shuffle with UNPCKLPS def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; @@ -5675,14 +5671,11 @@ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), (MOVLHPSrr VR128:$src1, 
VR128:$src2)>; def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; -// FIXME: Instead of X86Movddup, there should be a X86Movlhps here, the problem + +// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem // is during lowering, where it's not possible to recognize the load fold cause // it has two uses through a bitcast. One use disappears at isel time and the // fold opportunity reappears. -def : Pat<(v2i64 (X86Movddup VR128:$src)), - (MOVLHPSrr VR128:$src, VR128:$src)>; -def : Pat<(v4f32 (X86Movddup VR128:$src)), - (MOVLHPSrr VR128:$src, VR128:$src)>; def : Pat<(v2f64 (X86Movddup VR128:$src)), (UNPCKLPDrr VR128:$src, VR128:$src)>; @@ -5690,6 +5683,7 @@ def : Pat<(v2f64 (X86Movddup VR128:$src)), def : Pat<(v2f64 (X86Movlhpd VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold cause // it has two uses through a bitcast. 
One use disappears at isel time and the diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll index 206cdff1ba7d..9a60091a0cf0 100644 --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -169,7 +169,7 @@ define internal void @t10() nounwind { ret void ; X64: t10: ; X64: pextrw $4, %xmm0, %eax -; X64: movlhps %xmm1, %xmm1 +; X64: unpcklpd %xmm1, %xmm1 ; X64: pshuflw $8, %xmm1, %xmm1 ; X64: pinsrw $2, %eax, %xmm1 ; X64: pextrw $6, %xmm0, %eax @@ -260,3 +260,18 @@ entry: ; X64: pinsrw $1, %eax, %xmm0 ; X64: ret } + +; rdar://8520311 +define <4 x i32> @t17() nounwind { +entry: +; X64: t17: +; X64: movddup (%rax), %xmm0 + %tmp1 = load <4 x float>* undef, align 16 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> + %tmp3 = load <4 x float>* undef, align 16 + %tmp4 = shufflevector <4 x float> %tmp2, <4 x float> undef, <4 x i32> + %tmp5 = bitcast <4 x float> %tmp3 to <4 x i32> + %tmp6 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> + %tmp7 = and <4 x i32> %tmp6, + ret <4 x i32> %tmp7 +}