diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 83ee12b450ca..863da74efbc5 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -208,7 +208,6 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
   }
 }
 
-/// \brief Decode PSHUFB masks stored in an LLVM Constant.
 void DecodePSHUFBMask(const ConstantDataSequential *C,
                       SmallVectorImpl &ShuffleMask) {
   Type *MaskTy = C->getType();
@@ -240,6 +239,29 @@ void DecodePSHUFBMask(const ConstantDataSequential *C,
   }
 }
 
+void DecodePSHUFBMask(ArrayRef RawMask,
+                      SmallVectorImpl &ShuffleMask) {
+  // Appends one entry to ShuffleMask per raw control byte: SM_SentinelZero
+  // when bit 7 is set, otherwise the selected byte index within the same half.
+  for (int i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    // For AVX vectors with 32 bytes the base of the shuffle is the half of
+    // the vector we're inside.
+    int Base = i < 16 ? 0 : 16;
+    // If the high bit (7) of the byte is set, the element is zeroed.
+    if (M & (1 << 7))
+      ShuffleMask.push_back(SM_SentinelZero);
+    else {
+      // Only the least significant 4 bits of the control byte form the index;
+      // bits 4-6 must be masked off or they would yield an out-of-range index.
+      int Index = Base + (M & 0xf);
+      assert((Index >= 0 && (unsigned)Index < RawMask.size()) &&
+             "Out of bounds shuffle index for pshufb instruction!");
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl &ShuffleMask) {
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index 3ac0afe1e349..dd93c096b348 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -16,6 +16,7 @@
 #define X86_SHUFFLE_DECODE_H
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
 
 //===----------------------------------------------------------------------===//
 // Vector Mask Decoding
@@ -60,9 +61,15 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl &ShuffleMask);
 /// different datatypes and vector widths.
void DecodeUNPCKLMask(MVT VT, SmallVectorImpl &ShuffleMask); +/// \brief Decode a PSHUFB mask from an IR-level vector constant. void DecodePSHUFBMask(const ConstantDataSequential *C, SmallVectorImpl &ShuffleMask); +/// \brief Decode a PSHUFB mask from a raw array of constants such as from +/// BUILD_VECTOR. +void DecodePSHUFBMask(ArrayRef RawMask, + SmallVectorImpl &ShuffleMask); + void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl &ShuffleMask); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1b65e741533d..eb722daef7fe 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3408,6 +3408,7 @@ static bool MayFoldIntoStore(SDValue Op) { static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: @@ -5186,6 +5187,67 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodePSHUFLWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; + case X86ISD::PSHUFB: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + EVT VT = MaskNode.getValueType(); + assert(VT.isVector() && + "Can't produce a non-vector with a build_vector!"); + if (!VT.isInteger()) + return false; + + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; + + SmallVector RawMask; + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { + auto *CN = dyn_cast(MaskNode->getOperand(i)); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + + // We now have to decode the element which could be any integer size and + // extract each byte of it. 
+ for (int j = 0; j < NumBytesPerElement; ++j) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. + RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); + MaskElement = MaskElement.lshr(8); + } + } + DecodePSHUFBMask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast(MaskCP->getConstVal())) { + // FIXME: Support AVX-512 here. + if (!C->getType()->isVectorTy() || + (C->getNumElements() != 16 && C->getNumElements() != 32)) + return false; + + assert(C->getType()->isVectorTy() && "Expected a vector constant."); + DecodePSHUFBMask(C, Mask); + break; + } + + return false; + } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast(ImmN)->getZExtValue(), Mask); @@ -7843,6 +7905,37 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); } + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly + // with PSHUFB. It is important to do this before we attempt to generate any + // blends but after all of the single-input lowerings. If the single input + // lowerings can find an instruction sequence that is faster than a PSHUFB, we + // want to preserve that and we can DAG combine any longer sequences into + // a PSHUFB in the end. But once we start blending from multiple inputs, + // the complexity of DAG combining bad patterns back into PSHUFB is too high, + // and there are *very* few patterns that would actually be faster than the + // PSHUFB approach because of its ability to zero lanes. 
+ // + // FIXME: The only exceptions to the above are blends which are exact + // interleavings with direct instructions supporting them. We currently don't + // handle those well here. + if (Subtarget->hasSSSE3()) { + SDValue V1Mask[16]; + SDValue V2Mask[16]; + for (int i = 0; i < 16; ++i) + if (Mask[i] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8); + } else { + V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8); + V2Mask[i] = + DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8); + } + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + } + int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; @@ -18712,7 +18805,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, - int Depth, SelectionDAG &DAG, + int Depth, bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); @@ -18794,15 +18887,16 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, } } - // Bail if we have fewer than 3 shuffle instructions in the chain. - if (Depth < 3) + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. 
+ if (Depth < 2) return false; - // If we have 3 or more shuffle instructions, we can replace them with - // a single PSHUFB instruction profitably. Intel's manuals suggest only using - // PSHUFB if doing so replacing 5 instructions, but in practice PSHUFB tends - // to be *very* fast so we're more aggressive. - if (Subtarget->hasSSSE3()) { + // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we + // can replace them with a single PSHUFB instruction profitably. Intel's + // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but + // in practice PSHUFB tends to be *very* fast so we're more aggressive. + if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { SmallVector PSHUFBMask; assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); int Ratio = 16 / Mask.size(); @@ -18861,7 +18955,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, /// combining in this recursive walk. static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, ArrayRef IncomingMask, int Depth, - SelectionDAG &DAG, + bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // Bound the depth of our recursive combine because this is ultimately @@ -18923,12 +19017,14 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, // See if we can recurse into the operand to combine more things. switch (Op.getOpcode()) { + case X86ISD::PSHUFB: + HasPSHUFB = true; case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: if (Op.getOperand(0).hasOneUse() && combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - DAG, DCI, Subtarget)) + HasPSHUFB, DAG, DCI, Subtarget)) return true; break; @@ -18938,7 +19034,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, // We can't check for single use, we have to check that this shuffle is the only user. 
if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - DAG, DCI, Subtarget)) + HasPSHUFB, DAG, DCI, Subtarget)) return true; break; } @@ -18962,7 +19058,8 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, Mask.swap(NewMask); } - return combineX86ShuffleChain(Op, Root, Mask, Depth, DAG, DCI, Subtarget); + return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + Subtarget); } /// \brief Get the PSHUF-style mask from PSHUF node. @@ -19391,7 +19488,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, SmallVector NonceMask; // Just a placeholder. NonceMask.push_back(0); if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, - /*Depth*/ 1, DAG, DCI, Subtarget)) + /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } @@ -22431,6 +22529,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 4da7e42caabf..49620e93d6fe 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1,196 +1,284 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSSE3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target 
triple = "x86_64-unknown-unknown" define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 +; FIXME: # BB#0: +; FIXME-NEXT: punpcklbw %xmm0, %xmm0 +; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] +; FIXME-NEXT: retq +; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 +; +; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: 
@shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01 +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7] +; SSE2-NEXT: punpcklbw %xmm0, %xmm0 +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03 -; 
CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; ALL-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03 +; ALL: # BB#0: +; ALL-NEXT: punpcklbw %xmm0, %xmm0 +; ALL-NEXT: punpcklwd %xmm0, %xmm0 +; ALL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; ALL-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07 +; ALL: # BB#0: +; ALL-NEXT: punpcklbw %xmm0, %xmm0 +; ALL-NEXT: punpckhwd %xmm0, %xmm0 +; ALL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklbw %xmm0, %xmm0 +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw 
{{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; ALL-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07 +; ALL: # BB#0: +; ALL-NEXT: punpcklbw %xmm0, %xmm0 +; ALL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; FIXME-LABEL: @shuffle_v16i8_0101010101010101 +; FIXME: # BB#0: +; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] +; FIXME-NEXT: retq +; +; SSE2-LABEL: @shuffle_v16i8_0101010101010101 +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_0101010101010101 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> 
@shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23 -; CHECK-SSE2: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; ALL-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23 +; ALL: punpcklbw %xmm1, %xmm0 +; ALL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07 +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw %xmm1, %xmm1 +; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: punpcklbw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07 +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklbw %xmm1, %xmm1 +; SSSE3-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSSE3-NEXT: punpcklbw %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12 -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4] -; 
CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12 +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw %xmm1, %xmm2 +; SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20 -; CHECK-SSE2: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20 +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: 
punpcklbw %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20 -; CHECK-SSE2: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1] -; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20 +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklbw %xmm2, %xmm3 +; SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE2-NEXT: 
movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhbw %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7] +; SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1] +; SSE2-NEXT: punpckhbw %xmm2, %xmm1 +; SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20 +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 +; ALL-LABEL: @zext_to_v8i16_shuffle +; ALL: pxor %xmm1, %xmm1 +; ALL-NEXT: punpcklbw %xmm1, %xmm0 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 +; ALL-LABEL: @zext_to_v4i32_shuffle +; ALL: pxor %xmm1, %xmm1 +; ALL-NEXT: punpcklbw %xmm1, %xmm0 +; ALL-NEXT: punpcklbw %xmm1, %xmm0 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pand -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: 
pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; FIXME-LABEL: @trunc_v4i32_shuffle +; FIXME: # BB#0: +; FIXME-NEXT: pand +; FIXME-NEXT: packuswb %xmm0, %xmm0 +; FIXME-NEXT: packuswb %xmm0, %xmm0 +; FIXME-NEXT: retq +; +; SSE2-LABEL: @trunc_v4i32_shuffle +; SSE2: # BB#0: +; SSE2-NEXT: pand +; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: @trunc_v4i32_shuffle +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*}} # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> ret <16 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index 575dc25f2ec6..9a719c8c7ee8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -214,8 +214,7 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) { ; ; SSSE3-LABEL: @shuffle_v8i16_26401375 ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,14,15,10,11] -; SSSE3-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] +; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] ; SSSE3-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle