diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 035248c08fa2..eedb1ff71418 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3178,6 +3178,152 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
   return true;
 }
 
+/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPSY.
+static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                           const X86Subtarget *Subtarget) {
+  int NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  if (NumElems != 8)
+    return false;
+
+  // VSHUFPSY divides the resulting vector into 4 chunks.
+  // The sources are also split into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>   X7 X6 X5 X4 X3 X2 X1 X0
+  //  SRC2 =>   Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+  //
+  //  DST  =>  Y7..Y4,  Y7..Y4,  X7..X4,  X7..X4,
+  //           Y3..Y0,  Y3..Y0,  X3..X0,  X3..X0
+  //
+  int QuarterSize = NumElems/4;
+  int HalfSize = QuarterSize*2;
+  for (int i = 0; i < QuarterSize; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+      return false;
+  for (int i = QuarterSize; i < QuarterSize*2; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+      return false;
+
+  // The mask of the second half must be the same as the first but with
+  // the appropriate offsets. This works in the same way as VPERMILPS
+  // works with masks.
+  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
+    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+      return false;
+    int FstHalfIdx = i-HalfSize;
+    if (Mask[FstHalfIdx] < 0)
+      continue;
+    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
+      return false;
+  }
+  for (int i = QuarterSize*3; i < NumElems; ++i) {
+    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+      return false;
+    int FstHalfIdx = i-HalfSize;
+    if (Mask[FstHalfIdx] < 0)
+      continue;
+    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
+      return false;
+
+  }
+
+  return true;
+}
+
+/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the VSHUFPSY instruction.
+static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
+         "Only supports v8i32 and v8f32 types");
+
+  int HalfSize = NumElems/2;
+  unsigned Mask = 0;
+  for (int i = 0; i != NumElems; ++i) {
+    if (SVOp->getMaskElt(i) < 0)
+      continue;
+    // The mask of the first half must be equal to the second one.
+    unsigned Shamt = (i%HalfSize)*2;
+    unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
+    Mask |= Elt << Shamt;
+  }
+
+  return Mask;
+}
+
+/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
+/// version and the mask of the second half isn't bound to the first
+/// one.
+static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                           const X86Subtarget *Subtarget) {
+  int NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  if (NumElems != 4)
+    return false;
+
+  // VSHUFPDY divides the resulting vector into 4 chunks.
+  // The sources are also split into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>      X3     X2     X1     X0
+  //  SRC2 =>      Y3     Y2     Y1     Y0
+  //
+  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
+  //
+  int QuarterSize = NumElems/4;
+  int HalfSize = QuarterSize*2;
+  for (int i = 0; i < QuarterSize; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+      return false;
+  for (int i = QuarterSize; i < QuarterSize*2; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+      return false;
+  for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
+    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+      return false;
+  for (int i = QuarterSize*3; i < NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+      return false;
+
+  return true;
+}
+
+/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the VSHUFPDY instruction.
+static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
+         "Only supports v4i64 and v4f64 types");
+
+  int HalfSize = NumElems/2;
+  unsigned Mask = 0;
+  for (int i = 0; i != NumElems; ++i) {
+    if (SVOp->getMaskElt(i) < 0)
+      continue;
+    int Elt = SVOp->getMaskElt(i) % HalfSize;
+    Mask |= Elt << i;
+  }
+
+  return Mask;
+}
+
 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 128-bit
 /// SHUFPS and SHUFPD.
@@ -6068,6 +6214,22 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
 }
 
+static inline unsigned getSHUFPOpcode(EVT VT) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v8i32: // Use fp unit for int shuffle.
+  case MVT::v8f32:
+  case MVT::v4i32: // Use fp unit for int shuffle.
+  case MVT::v4f32: return X86ISD::SHUFPS;
+  case MVT::v4i64: // Use fp unit for int shuffle.
+  case MVT::v4f64:
+  case MVT::v2i64: // Use fp unit for int shuffle.
+  case MVT::v2f64: return X86ISD::SHUFPD;
+  default:
+    llvm_unreachable("Unknown type for shufp*");
+  }
+  return 0;
+}
+
 static
 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
@@ -6121,7 +6283,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   assert(VT != MVT::v4i32 && "unsupported shuffle type");
 
   // Invert the operand order and use SHUFPS to match it.
-  return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
+  return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
                               X86::getShuffleSHUFImmediate(SVOp), DAG);
 }
 
@@ -6357,13 +6519,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
 
-    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
-      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
-                                  TargetMask, DAG);
-
-    if (VT == MVT::v4f32)
-      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
-                                  TargetMask, DAG);
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1,
+                                TargetMask, DAG);
   }
 
   // Check if this can be converted into a logical shift.
@@ -6515,15 +6672,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 X86::getShufflePSHUFLWImmediate(SVOp),
                                 DAG);
 
-  if (isSHUFPMask(M, VT)) {
-    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
-    if (VT == MVT::v4f32 || VT == MVT::v4i32)
-      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
-                                  TargetMask, DAG);
-    if (VT == MVT::v2f64 || VT == MVT::v2i64)
-      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
-                                  TargetMask, DAG);
-  }
+  if (isSHUFPMask(M, VT))
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+                                X86::getShuffleSHUFImmediate(SVOp), DAG);
 
   if (X86::isUNPCKL_v_undef_Mask(SVOp))
     return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
@@ -6550,6 +6701,16 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
                                 getShuffleVPERM2F128Immediate(SVOp), DAG);
 
+  // Handle VSHUFPSY permutations
+  if (isVSHUFPSYMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+                                getShuffleVSHUFPSYImmediate(SVOp), DAG);
+
+  // Handle VSHUFPDY permutations
+  if (isVSHUFPDYMask(M, VT, Subtarget))
+    return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+                                getShuffleVSHUFPDYImmediate(SVOp), DAG);
+
   //===--------------------------------------------------------------------===//
   // Since no target specific shuffle was selected for this generic one,
   // lower it into other known shuffles. FIXME: this isn't true yet, but
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 8ce9dc3dc8fe..7a6a6250e7f9 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1709,7 +1709,7 @@ let Predicates = [HasAVX] in {
   def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
             (VSHUFPDrri VR128:$src1, VR128:$src2,
                         (SHUFFLE_get_shuf_imm VR128:$src3))>;
-  // Generic VSHUFPD patterns
+
   def : Pat<(v2f64 (X86Shufps VR128:$src1,
                        (memopv2f64 addr:$src2), (i8 imm:$imm))),
             (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
@@ -1717,6 +1717,31 @@ let Predicates = [HasAVX] in {
             (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
   def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
             (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  // 256-bit patterns
+  def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8i32 (X86Shufps VR256:$src1,
+                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8f32 (X86Shufps VR256:$src1,
+                      (memopv8f32 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4i64 (X86Shufpd VR256:$src1,
+                      (memopv4i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4f64 (X86Shufpd VR256:$src1,
+                      (memopv4f64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll
index 1c10814002af..d085f5d6cdb5 100644
--- a/llvm/test/CodeGen/X86/avx-basic.ll
+++ b/llvm/test/CodeGen/X86/avx-basic.ll
@@ -64,10 +64,8 @@ entry:
   ret <4 x i64> %shuffle
 }
 
-; CHECK: vpunpckhqdq
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: movlhps
-; CHECK-NEXT: vinsertf128 $1
+; CHECK: _B
+; CHECK: vshufpd $1, %ymm
 define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/avx-vshufp.ll b/llvm/test/CodeGen/X86/avx-vshufp.ll
new file mode 100644
index 000000000000..f06548dc3d6d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx-vshufp.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; CHECK: vshufps $-53, %ymm
+define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
+  ret <8 x float> %shuffle
+}
+
+; CHECK: vshufpd $10, %ymm
+define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %shuffle
+}
+
+; CHECK: vshufps $-53, %ymm
+define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 undef, i32 undef, i32 11, i32 undef, i32 6, i32 12, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+; CHECK: vshufpd $2, %ymm
+define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
+  ret <4 x double> %shuffle
+}
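
As a sanity check on the immediate encodings introduced by this patch: the standalone C++ sketch below (not part of the patch; vshufpsyImm and vshufpdyImm are made-up names, not LLVM APIs) mirrors the arithmetic of getShuffleVSHUFPSYImmediate and getShuffleVSHUFPDYImmediate and replays it against the masks and CHECK'd immediates from avx-vshufp.ll above.

// Sketch of the VSHUFPSY/VSHUFPDY immediate encodings; compile and run with
// any C++ compiler. Function names are hypothetical, for illustration only.
#include <cassert>
#include <cstdio>

// VSHUFPSY: two 128-bit lanes of 4 floats. The immediate holds four 2-bit
// fields; field (i % 4) selects the source element (mod 4) for both lanes,
// which is why isVSHUFPSYMask requires the two halves of the mask to agree.
static unsigned vshufpsyImm(const int Mask[8]) {
  unsigned Imm = 0;
  for (int i = 0; i != 8; ++i) {
    if (Mask[i] < 0)                       // undef leaves its field as 0
      continue;
    Imm |= (Mask[i] % 4) << ((i % 4) * 2);
  }
  return Imm;
}

// VSHUFPDY: one immediate bit per element; bit i picks the low or high
// double within element i's lane, so the two lanes are fully independent
// (hence the looser isVSHUFPDYMask check).
static unsigned vshufpdyImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] < 0)
      continue;
    Imm |= (Mask[i] % 2) << i;
  }
  return Imm;
}

int main() {
  // Mask of test @A in avx-vshufp.ll: expect 0xCB, printed by llc as $-53.
  const int A[8] = {3, 2, 8, 11, 7, 6, 12, 15};
  assert(vshufpsyImm(A) == 0xCB);
  // Mask of test @B: expect 0b1010 = $10.
  const int B[4] = {0, 5, 2, 7};
  assert(vshufpdyImm(B) == 10);
  printf("vshufps imm = %u, vshufpd imm = %u\n", vshufpsyImm(A), vshufpdyImm(B));
  return 0;
}

Running it prints "vshufps imm = 203, vshufpd imm = 10", i.e. the $-53 (0xCB as a signed byte) and $10 immediates that the @A and @B tests check for; the undef masks of @C and @D leave their fields zero and reproduce $-53 and $2 the same way.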