diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 44c72b6bec4c..24954d75d2e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1853,10 +1853,17 @@ SDOperand SelectionDAG::getShuffleScalarElt(const SDNode *N, unsigned Idx) {
   unsigned NumElems = PermMask.getNumOperands();
   SDOperand V = (Idx < NumElems) ? N->getOperand(0) : N->getOperand(1);
   Idx %= NumElems;
-  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    return (Idx == 0)
-     ? V.getOperand(0) : getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+
+  if (V.getOpcode() == ISD::BIT_CONVERT) {
+    V = V.getOperand(0);
+    if (MVT::getVectorNumElements(V.getValueType()) != NumElems)
+      return SDOperand();
   }
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Idx == 0) ? V.getOperand(0)
+                      : getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Idx);
   if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
     SDOperand Elt = PermMask.getOperand(Idx);
     if (Elt.getOpcode() == ISD::UNDEF)
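Note on the SelectionDAG change: getShuffleScalarElt now looks through a BIT_CONVERT, but only when the bitcasted operand has the same element count, since an index computed against the shuffle's own type does not correspond to any lane of a differently-split vector; it also resolves BUILD_VECTOR operands directly. A minimal standalone C++ model of that resolution walk (Node, Kind, and resolveElt are illustrative names, not the SelectionDAG API):

    // Toy model of the look-through logic in getShuffleScalarElt.
    #include <vector>

    struct Node {
      enum Kind { Scalar, ScalarToVector, BuildVector, BitConvert, Other } kind;
      std::vector<Node *> ops; // operands
      unsigned numElems;       // element count of this node's vector type
    };

    // Return the scalar feeding element Idx of V, or nullptr when it cannot
    // be determined (the real code returns an empty SDOperand instead).
    Node *resolveElt(Node *V, unsigned Idx, unsigned NumElems) {
      if (V->kind == Node::BitConvert) {
        V = V->ops[0];
        if (V->numElems != NumElems)
          return nullptr; // lanes were re-split; Idx no longer lines up
      }
      if (V->kind == Node::ScalarToVector)
        return Idx == 0 ? V->ops[0] : nullptr; // upper lanes are undef
      if (V->kind == Node::BuildVector)
        return V->ops[Idx]; // operands are the per-lane scalars
      return nullptr;
    }

This is what lets getNumOfConsecutiveZeros (below) see zero elements even when they are hidden behind a bitcast of a build_vector.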
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5343971d34b3..d194d38e1ce6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2923,6 +2923,70 @@ static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, unsigned Idx,
   return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
 }
 
+/// getNumOfConsecutiveZeros - Return the number of consecutive zero (or
+/// undef) elements at the low (or high) end of a shuffle result.
+static
+unsigned getNumOfConsecutiveZeros(SDOperand Op, SDOperand Mask,
+                                  unsigned NumElems, bool Low,
+                                  SelectionDAG &DAG) {
+  unsigned NumZeros = 0;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Idx = Mask.getOperand(Low ? i : NumElems-i-1);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      ++NumZeros;
+      continue;
+    }
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    SDOperand Elt = DAG.getShuffleScalarElt(Op.Val, Index);
+    if (Elt.Val && isZeroNode(Elt))
+      ++NumZeros;
+    else
+      break;
+  }
+  return NumZeros;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+static bool isVectorShift(SDOperand Op, SDOperand Mask, SelectionDAG &DAG,
+                          bool &isLeft, SDOperand &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = Mask.getNumOperands();
+
+  isLeft = true;
+  unsigned NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
+  if (!NumZeros) {
+    isLeft = false;
+    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
+    if (!NumZeros)
+      return false;
+  }
+
+  bool SeenV1 = false;
+  bool SeenV2 = false;
+  for (unsigned i = NumZeros; i < NumElems; ++i) {
+    unsigned Val = isLeft ? (i - NumZeros) : i;
+    SDOperand Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
+    if (Idx.getOpcode() == ISD::UNDEF)
+      continue;
+    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    if (Index < NumElems)
+      SeenV1 = true;
+    else {
+      Index -= NumElems;
+      SeenV2 = true;
+    }
+    if (Index != Val)
+      return false;
+  }
+  if (SeenV1 && SeenV2)
+    return false;
+
+  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
+  ShAmt = NumZeros;
+  return true;
+}
+
+
 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
 ///
 static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
@@ -2995,6 +3059,20 @@ static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
   return V;
 }
 
+/// getVShift - Return a vector logical shift node.
+///
+static SDOperand getVShift(bool isLeft, MVT::ValueType VT, SDOperand SrcOp,
+                           unsigned NumBits, SelectionDAG &DAG,
+                           const TargetLowering &TLI) {
+  bool isMMX = MVT::getSizeInBits(VT) == 64;
+  MVT::ValueType ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+  SrcOp = DAG.getNode(ISD::BIT_CONVERT, ShVT, SrcOp);
+  return DAG.getNode(ISD::BIT_CONVERT, VT,
+                     DAG.getNode(Opc, ShVT, SrcOp,
+                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
+}
+
 SDOperand
 X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   // All zero's are handled with pxor, all one's are handled with pcmpeqd.
@@ -3091,6 +3169,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
       return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                          Subtarget->hasSSE2(), DAG);
     }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = MVT::getSizeInBits(VT);
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this);
+    }
 
     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
       return SDOperand();
@@ -3615,6 +3702,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
     }
   }
 
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDOperand ShVal;
+  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use
+    // v_set0 + movlhps or movhlps, etc.
+    MVT::ValueType EVT = MVT::getVectorElementType(VT);
+    ShAmt *= MVT::getSizeInBits(EVT);
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
+  }
+
   if (X86::isMOVLMask(PermMask.Val)) {
     if (V1IsUndef)
       return V2;
@@ -3634,6 +3734,13 @@
       ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
     return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
 
+  if (isShift) {
+    // No better options. Use a vshl / vsrl.
+    MVT::ValueType EVT = MVT::getVectorElementType(VT);
+    ShAmt *= MVT::getSizeInBits(EVT);
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this);
+  }
+
   bool Commuted = false;
   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
   // 1,1,1,1 -> v8i16 though.
@@ -5729,6 +5836,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VSHL:               return "X86ISD::VSHL";
+  case X86ISD::VSRL:               return "X86ISD::VSRL";
   }
 }
 
@@ -6296,8 +6405,10 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
                                            const X86Subtarget *Subtarget,
                                            const TargetLowering &TLI) {
+  unsigned NumOps = N->getNumOperands();
+
   // Ignore single operand BUILD_VECTOR.
-  if (N->getNumOperands() == 1)
+  if (NumOps == 1)
     return SDOperand();
 
   MVT::ValueType VT = N->getValueType(0);
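The two helpers above do all the pattern matching: getNumOfConsecutiveZeros counts zero (or undef) lanes at one end of the shuffle result, and isVectorShift then requires the surviving lanes to be consecutive elements of a single source vector. A self-contained sketch of the same scan over a plain 4-lane mask; the encoding (indices >= 4 mean a lane of an all-zero second source, -1 means undef) is an assumption made for illustration, since the real code resolves zero lanes through getShuffleScalarElt:

    #include <cstdio>

    constexpr unsigned N = 4; // lanes per vector in this toy

    // Mirrors isVectorShift: true if Mask is "shift the data vector by
    // ShAmt lanes and fill the vacated lanes with zeros".
    bool isVectorShiftMask(const int Mask[N], bool &isLeft, unsigned &ShAmt) {
      auto isZeroLane = [](int m) { return m < 0 || m >= int(N); };

      // Count zero/undef lanes at the low end first (left shift), then at
      // the high end (right shift), like getNumOfConsecutiveZeros.
      unsigned NumZeros = 0;
      isLeft = true;
      while (NumZeros < N && isZeroLane(Mask[NumZeros])) ++NumZeros;
      if (NumZeros == 0) {
        isLeft = false;
        while (NumZeros < N && isZeroLane(Mask[N - 1 - NumZeros])) ++NumZeros;
        if (NumZeros == 0)
          return false;
      }

      // The remaining lanes must hold consecutive elements of the source.
      for (unsigned i = NumZeros; i < N; ++i) {
        int Expected = int(isLeft ? i - NumZeros : i);
        int Got = Mask[isLeft ? i : i - NumZeros];
        if (Got >= 0 && Got != Expected) // undef lanes match anything
          return false;
      }
      ShAmt = NumZeros;
      return true;
    }

    int main() {
      int Left[N]  = {4, 4, 0, 1}; // zero, zero, elt0, elt1
      int Right[N] = {2, 3, 4, 4}; // elt2, elt3, zero, zero
      bool L; unsigned A;
      if (isVectorShiftMask(Left, L, A))
        printf("%s shift by %u lanes\n", L ? "left" : "right", A);
      if (isVectorShiftMask(Right, L, A))
        printf("%s shift by %u lanes\n", L ? "left" : "right", A);
      return 0;
    }

LowerVECTOR_SHUFFLE then converts the lane count into a bit count (ShAmt *= element width) before building the VSHL/VSRL node, and takes the shift early only when the shifted value has a single use.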
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b99a09be7ced..0c67794c9329 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -205,7 +205,10 @@
       VZEXT_MOVL,
 
       // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
-      VZEXT_LOAD
+      VZEXT_LOAD,
+
+      // VSHL, VSRL - Vector logical left / right shift.
+      VSHL, VSRL
     };
   }
 
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 42f19af1f8b6..b167a7ac88d8 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -294,6 +294,12 @@ defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
                                     int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
 defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                                     int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
 
+// Shift up / down and insert zeros.
+def : Pat<(v1i64 (X86vshl VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
+def : Pat<(v1i64 (X86vshr VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
+
 // Comparison Instructions
 defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
 defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1ea4bfd35e03..3d5959aa2f68 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -51,6 +51,8 @@
 def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                         SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl    : SDNode<"X86ISD::VSHL",      SDTIntShiftOp>;
+def X86vshr    : SDNode<"X86ISD::VSRL",      SDTIntShiftOp>;
 
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
@@ -1957,6 +1959,12 @@ let Predicates = [HasSSE2] in {
             (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
   def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
             (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+
+  // Shift up / down and insert zeros.
+  def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSLLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+  def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSRLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
 }
 
 // Logical
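A detail worth noting in these patterns: the VSHL/VSRL shift amount is a bit count (LowerVECTOR_SHUFFLE multiplies the lane count by the element width), and the MMX PSLLQ/PSRLQ immediates are also bit counts, so the v1i64 patterns pass imm:$amt straight through. SSE2 PSLLDQ/PSRLDQ, however, shift the whole 128-bit register by a byte count, which is presumably what the PSxLDQ_imm transform (defined elsewhere in X86InstrSSE.td and reused here from the X86fshl/X86fsrl patterns) accounts for. The arithmetic, as a sketch with a hypothetical function name:

    // Hypothetical model of the bit-to-byte conversion that PSxLDQ_imm
    // must perform: PSLLDQ/PSRLDQ take bytes, VSHL/VSRL amounts are bits.
    unsigned pslldqImmFromBits(unsigned ShAmtBits) {
      return ShAmtBits / 8; // e.g. a 64-bit logical right shift -> psrldq $8
    }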
diff --git a/llvm/test/CodeGen/X86/mmx-insert-element.ll b/llvm/test/CodeGen/X86/mmx-insert-element.ll
index dc488363e7f5..0aa476dba80e 100644
--- a/llvm/test/CodeGen/X86/mmx-insert-element.ll
+++ b/llvm/test/CodeGen/X86/mmx-insert-element.ll
@@ -1,23 +1,7 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep movq | count 3
-
-; FIXME: This code outputs:
-;
-;  subl $28, %esp
-;  movl 32(%esp), %eax
-;  movd %eax, %mm0
-;  movq %mm0, (%esp)
-;  movl (%esp), %eax
-;  movl %eax, 20(%esp)
-;  movq %mm0, 8(%esp)
-;  movl 12(%esp), %eax
-;  movl %eax, 16(%esp)
-;  movq 16(%esp), %mm0
-;  addl $28, %esp
-;
-; Which is ugly. We need to fix this.
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | not grep movq
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+mmx | grep psllq
 
 define <2 x i32> @qux(i32 %A) nounwind {
-entry:
   %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1          ; <<2 x i32>> [#uses=1]
   ret <2 x i32> %tmp3
 }
diff --git a/llvm/test/CodeGen/X86/vec_clear.ll b/llvm/test/CodeGen/X86/vec_clear.ll
index d4641294b456..c119a94f74f6 100644
--- a/llvm/test/CodeGen/X86/vec_clear.ll
+++ b/llvm/test/CodeGen/X86/vec_clear.ll
@@ -1,6 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep and
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | grep psrldq
 
-define <4 x float> @test(<4 x float>* %v1) {
+define <4 x float> @test(<4 x float>* %v1) nounwind {
   %tmp = load <4 x float>* %v1            ; <<4 x float>> [#uses=1]
   %tmp15 = bitcast <4 x float> %tmp to <2 x i64>          ; <<2 x i64>> [#uses=1]
   %tmp24 = and <2 x i64> %tmp15, bitcast (<4 x i32> < i32 0, i32 0, i32 -1, i32 -1 > to <2 x i64>)          ; <<2 x i64>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/vec_insert-3.ll b/llvm/test/CodeGen/X86/vec_insert-3.ll
index 1d374b4b9c62..e42a3684899a 100644
--- a/llvm/test/CodeGen/X86/vec_insert-3.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-3.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep punpcklqdq | count 1
 
-define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) {
+define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
   %tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
   ret <2 x i64> %tmp1
 }
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
new file mode 100644
index 000000000000..eaa523e6e573
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psllq | grep 32
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pslldq | grep 12
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 8
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep psrldq | grep 12
+
+define void @t1(i32 %a, <1 x i64>* %P) nounwind {
+  %tmp12 = shl i32 %a, 12
+  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+  %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+  %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
+  store <1 x i64> %tmp23, <1 x i64>* %P
+  ret void
+}
+
+define <4 x float> @t2(<4 x float>* %P) nounwind {
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+  ret <4 x float> %tmp2
+}
+
+define <4 x float> @t3(<4 x float>* %P) nounwind {
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
+  ret <4 x float> %tmp2
+}
+
+define <4 x float> @t4(<4 x float>* %P) nounwind {
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
+  ret <4 x float> %tmp2
+}
diff --git a/llvm/test/CodeGen/X86/vec_insert-6.ll b/llvm/test/CodeGen/X86/vec_insert-6.ll
new file mode 100644
index 000000000000..405152e2dc8f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_insert-6.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep pslldq
+
+define <4 x float> @t3(<4 x float>* %P) nounwind {
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+  ret <4 x float> %tmp2
+}
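The immediates the new tests grep for fall out of that arithmetic. t1 in vec_insert-5.ll is the BUILD_VECTOR special case (two i32 lanes, low lane zero), which shifts the 64-bit MMX register left by NumBits/2 = 32 bits, hence psllq with 32. The shuffle cases can be checked with a small calculation, assuming the masks resolve as described above:

    #include <cstdio>

    int main() {
      // t2: mask <4,4,4,0> = three zero lanes, then element 0 of the data:
      // left shift by 3 lanes of 32 bits -> pslldq $12.
      printf("t2: pslldq $%u\n", 3u * 32u / 8u);
      // t3: mask <2,3,4,4> = elements 2,3, then two zero lanes:
      // right shift by 2 lanes -> psrldq $8.
      printf("t3: psrldq $%u\n", 2u * 32u / 8u);
      // t4: mask <7,0,0,0> keeps only element 3 of the data vector:
      // right shift by 3 lanes -> psrldq $12.
      printf("t4: psrldq $%u\n", 3u * 32u / 8u);
      return 0;
    }

vec_insert-6.ll is the negative case: its mask <4,4,4,0> broadcasts element 0 of the data vector into lanes 0-2 rather than shifting consecutive elements, so no pslldq may be emitted for it.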