[x86] Fully generalize the zext lowering in the new vector shuffle

lowering to support both anyext and zext and to custom lower for many
different microarchitectures.

Using this allows us to get *exactly* the right code for zext and anyext
shuffles in all the vector sizes. For v16i8, the improvement is *huge*.
The new SSE2 test case added here is one I refused to add before this
because it took sooooo many instructions.

llvm-svn: 218143
Chandler Carruth 2014-09-19 20:00:32 +00:00
parent 3f9b021c00
commit 0fc0c22fa9
5 changed files with 116 additions and 67 deletions
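
For readers new to this lowering, the mask shape being matched is worth spelling out: a zext-style shuffle selects consecutive low elements of the input at a fixed stride and fills every other lane with zero (or leaves it undef, the anyext flavor). The C++ sketch below is an illustration, not code from this patch; the helper name isZeroExtendMask and the explicit Zeroable vector are hypothetical simplifications (single input, no offset handling) of the real LowerWithStride and computeZeroableShuffleElements machinery seen in the hunks that follow.

#include <cstdio>
#include <vector>

// Lane i must be input element i/Stride when i % Stride == 0; every other
// lane must be undef (-1 in LLVM shuffle masks) or known to be zero.
static bool isZeroExtendMask(const std::vector<int> &Mask,
                             const std::vector<bool> &Zeroable, int Stride) {
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    if (Mask[i] == -1)
      continue; // Undef lanes are valid anywhere.
    if (i % Stride != 0) {
      if (!Zeroable[i])
        return false; // Extended lanes must be zero (zext) or undef (anyext).
      continue;
    }
    if (Mask[i] != i / Stride)
      return false; // Base lanes must be consecutive input elements.
  }
  return true;
}

int main() {
  // A v8i16 shuffle <0,8,8,8,1,8,8,8> blending with an all-zero second input
  // is a zero extension of the two low i16 elements to i64, i.e. Stride == 4.
  std::vector<int> Mask = {0, 8, 8, 8, 1, 8, 8, 8};
  std::vector<bool> Zeroable = {false, true, true, true,
                                false, true, true, true};
  std::printf("stride 4 matches: %s\n",
              isZeroExtendMask(Mask, Zeroable, 4) ? "yes" : "no");
}

The same mask fails at stride 2 (lane 2 would have to be input element 1), which is why the real code probes each power-of-two stride in turn.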


@@ -7393,16 +7393,21 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   return Zeroable;
 }
 
-/// \brief Try to lower a vector shuffle as a zero extension.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
 ///
-/// This tries to use the SSE4.1 PMOVZX instruction family to lower a vector
-/// shuffle through a zero extension. It doesn't check for the availability or
-/// profitability of this lowering though, it tries to aggressively match this
-/// pattern. It handles both blends with all-zero inputs to explicitly
-/// zero-extend and undef-lanes (sometimes undef due to masking out later).
-static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
-                                              SDValue V2, ArrayRef<int> Mask,
-                                              SelectionDAG &DAG) {
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
   int Bits = VT.getSizeInBits();
@@ -7413,6 +7418,7 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
   // if valid.
   auto LowerWithStride = [&](int Stride) -> SDValue {
     SDValue InputV;
+    bool AnyExt = true;
     for (int i = 0; i < NumElements; ++i) {
       if (Mask[i] == -1)
         continue; // Valid anywhere but doesn't tell us anything.
@@ -7420,8 +7426,10 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
         // Each of the extend elements needs to be zeroable.
         if (!Zeroable[i])
           return SDValue();
-        else
-          continue;
+
+        // We no longer are in the anyext case.
+        AnyExt = false;
+        continue;
       }
 
       // Each of the base elements needs to be consecutive indices into the
@@ -7442,15 +7450,68 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
     if (!InputV)
       return SDValue();
 
-    // Found a valid lowering! Compute all the types and the operation. We force
-    // everything to integer types here as that's the only way zext makes sense.
-    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
-                                 NumElements / Stride);
+    // Found a valid zext mask! Try various lowering strategies based on the
+    // input type and available ISA extensions.
+    if (Subtarget->hasSSE41()) {
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
+                                   NumElements / Stride);
+      InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    }
 
-    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    // For any extends we can cheat for larger element sizes and use shuffle
+    // instructions that can fold with a load and/or copy.
+    if (AnyExt && EltBits == 32) {
+      int PSHUFDMask[4] = {0, -1, 1, -1};
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+    }
+    if (AnyExt && EltBits == 16 && Stride > 2) {
+      int PSHUFDMask[4] = {0, -1, 0, -1};
+      InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                           DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                           getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+      int PSHUFHWMask[4] = {1, -1, -1, -1};
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+                      getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+    }
 
+    // If this would require more than 2 unpack instructions to expand, use
+    // pshufb when available. We can only use more than 2 unpack instructions
+    // when zero extending i8 elements which also makes it easier to use pshufb.
+    if (Stride > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+      assert(NumElements == 16 && "Unexpected byte vector width!");
+      SDValue PSHUFBMask[16];
+      for (int i = 0; i < 16; ++i)
+        PSHUFBMask[i] =
+            DAG.getConstant((i % Stride == 0) ? i / Stride : 0x80, MVT::i8);
+      InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                     DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                 MVT::v16i8, PSHUFBMask)));
+    }
+
+    // Otherwise emit a sequence of unpacks.
+    do {
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+                           : getZeroVector(InputVT, Subtarget, DAG, DL);
+      InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+      InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+      Stride /= 2;
+      EltBits *= 2;
+      NumElements /= 2;
+    } while (Stride > 1);
+    return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
   };
 
   // The widest stride possible for zero extending is to a 64-bit integer.
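
As a quick cross-check of the unpack fallback above (a trace of my own, not part of the patch): for the fully general v16i8 zero extension, Stride starts at 8, so the do/while emits exactly three interleaves with the zero (or undef) vector, doubling the element width each time. In x86 terms that is the pxor plus punpcklbw/punpcklwd/punpckldq sequence the updated SSE2 tests below now expect.

#include <cstdio>

int main() {
  // Mirror the unpack loop's induction variables for an i8 -> i64 zext.
  int Stride = 8, EltBits = 8, NumElements = 16;
  do {
    const char *Op = EltBits == 8    ? "punpcklbw"
                     : EltBits == 16 ? "punpcklwd"
                                     : "punpckldq";
    std::printf("%s on v%di%d\n", Op, NumElements, EltBits);
    Stride /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Stride > 1);
  // Prints punpcklbw, punpcklwd, punpckldq: three unpacks (plus one pxor
  // to materialize the zero vector) replace the old shuffle chains.
}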
@@ -7469,7 +7530,7 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
       return V;
   }
 
-  // No viable zext lowering found.
+  // No viable ext lowering found.
   return SDValue();
 }
@@ -7843,10 +7904,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt =
-            lowerVectorShuffleAsZeroExtend(DL, MVT::v4i32, V1, V2, Mask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -8519,10 +8579,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v8i16, V1, V2,
-                                                      OrigMask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
 
   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   auto isV2 = [](int M) { return M >= 8; };
@@ -8688,10 +8747,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Rotate;
 
   // Try to use a zext lowering.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v16i8, V1, V2,
-                                                      OrigMask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
 
   int MaskStorage[16] = {
       OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
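
One more illustration, again mine rather than patch code: in a pshufb control vector, any byte with the high bit set (0x80 here) writes zero into that lane, so the (i % Stride == 0) ? i / Stride : 0x80 mask built in the lambda above produces exactly the xmm0[0],zero,...,xmm0[1],zero,... pattern the updated SSSE3 tests below check for.

#include <cstdio>

int main() {
  // Compute the PSHUFB control bytes for a Stride == 8 (i8 -> i64) zext.
  const int Stride = 8;
  for (int i = 0; i < 16; ++i)
    std::printf("%s0x%02x", i ? " " : "",
                (i % Stride == 0) ? i / Stride : 0x80);
  std::printf("\n");
  // Output: 0x00 followed by seven 0x80 bytes, then 0x01 and seven more
  // 0x80 bytes: byte 0 goes to lane 0, byte 1 to lane 8, zeros elsewhere.
}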


@@ -515,13 +515,13 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
 ; SSE2-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0,0,1,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},1,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
@@ -533,15 +533,17 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
 }
 
 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
+; SSE2-NEXT:    retq
+;
 ; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT:    pshufb {{.*}} # [[X2]] = zero,[[X2]][2,4,6],zero,[[X2]][10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm0, %[[X2]]
-; SSSE3-NEXT:    punpcklbw {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3],[[X2]][4],[[X1]][4],[[X2]][5],[[X1]][5],[[X2]][6],[[X1]][6],[[X2]][7],[[X1]][7]
-; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
@@ -576,16 +578,16 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(
 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
 ; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
 ; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz


@@ -778,23 +778,20 @@ define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
 ; SSE2-LABEL: @shuffle_v4i32_0z1z
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: @shuffle_v4i32_0z1z
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE3-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v4i32_0z1z
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v4i32_0z1z


@@ -1007,23 +1007,15 @@ define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
 ; SSE2-LABEL: @shuffle_v8i16_0zzz1zzz
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSE2-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = [[X2]][0,3,2,1]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_0zzz1zzz
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSSE3-NEXT:    pshufb {{.*}} # [[X2]] = [[X2]][2,3,8,9,6,7,0,1,8,9,6,7,4,5,6,7]
-; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3]
-; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSSE3-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v8i16_0zzz1zzz


@@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
 ; CHECK:         movd (%{{.*}}), %[[X:xmm[0-9]+]]
 ; CHECK-NEXT:    pxor %[[Z:xmm[0-9]+]], %[[Z]]
 ; CHECK-NEXT:    punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT:    punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT:    punpcklwd %[[Z]], %[[X]]
 ; CHECK-NEXT:    ret
   %val = load <4 x i8>* %ptr