[X86] Improve AVX1 shuffle lowering for v8f32 shuffles where the low half comes from V1 and the high half comes from V2 and the halves do the same operation

To lower this we now create a new V1 containing the low half of both sources and a new V2 containing the upper half of both sources. Then we create a repeated lane shuffle of those new sources to produce the final result.

This fixes PR35833.

Differential Revision: https://reviews.llvm.org/D41794

llvm-svn: 339818
Craig Topper 2018-08-15 21:21:52 +00:00
parent e15e32f800
commit 08e082619a
9 changed files with 460 additions and 534 deletions
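
To illustrate the decomposition, here is a small standalone C++ sketch written for this page (hypothetical code, not part of the patch): it splits an example v8f32 mask into the lane-crossing half-selection step and the repeated in-lane step described above. The mask, variable names and printing are invented, and it only covers the single-source-per-lane case from PR35833; the committed code also handles lanes that need two sources.

// Element indices 0-7 refer to V1, 8-15 to V2, -1 is undef.
#include <array>
#include <cstdio>

int main() {
  constexpr int LaneSize = 4, NumLanes = 2, Size = 8;
  // Low half does <1,0,3,2> on V1; the high half does the same on V2.
  const std::array<int, Size> Mask = {1, 0, 3, 2, 9, 8, 11, 10};

  std::array<int, LaneSize> Repeat;   // shared in-lane pattern
  Repeat.fill(-1);
  std::array<int, NumLanes> LaneSrc;  // which 128-bit half feeds each result lane
  LaneSrc.fill(-1);

  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int i = 0; i != LaneSize; ++i) {
      int M = Mask[Lane * LaneSize + i];
      if (M < 0)
        continue;
      int Src = M / LaneSize;          // 0,1 = halves of V1; 2,3 = halves of V2
      if (LaneSrc[Lane] >= 0 && LaneSrc[Lane] != Src)
        return 1;                      // lane needs two sources; not handled here
      LaneSrc[Lane] = Src;
      int R = M % LaneSize;            // position inside a 128-bit lane
      if (Repeat[i] >= 0 && Repeat[i] != R)
        return 1;                      // in-lane patterns don't repeat; give up
      Repeat[i] = R;
    }

  // Step 1: lane-crossing mask that builds the new source from whole 128-bit
  // halves of V1/V2 (indices into the 16-wide concatenation V1:V2).
  printf("lane-select mask:");
  for (int Lane = 0; Lane != NumLanes; ++Lane)
    for (int i = 0; i != LaneSize; ++i)
      printf(" %d", LaneSrc[Lane] * LaneSize + i);

  // Step 2: repeated, non-lane-crossing mask applied to that new source.
  printf("\nrepeated mask:   ");
  for (int i = 0; i != Size; ++i)
    printf(" %d", (i / LaneSize) * LaneSize + Repeat[i % LaneSize]);
  printf("\n");
  return 0;
}

For this input the sketch prints the lane-select mask 0 1 2 3 8 9 10 11 and the repeated mask 1 0 3 2 5 4 7 6. On AVX1 the first step corresponds to a single vperm2f128/vinsertf128 that gathers the two 128-bit halves, and the second to an in-lane shuffle such as vpermilps or vshufps, instead of extracting, shuffling and reinserting each half separately.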


@ -13334,79 +13334,162 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lanes. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
/// FIXME: This should be generalized to 512-bit shuffles.
static SDValue lowerVectorShuffleByMerging128BitLanes(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
int NumLanes = Size / LaneSize;
assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
// check whether the in-128-bit lane shuffles share a repeating pattern.
SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
SmallVector<int, 16> RepeatMask(LaneSize, -1);
int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } };
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Srcs[2] = { -1, -1 };
SmallVector<int, 16> InLaneMask(LaneSize, -1);
for (int i = 0; i != LaneSize; ++i) {
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
// Determine which of the 4 possible input lanes (2 from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out
// sources we can't do anything.
int LaneSrc = M / LaneSize;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
Src = 1;
else
return SDValue();
Srcs[Src] = LaneSrc;
InLaneMask[i] = (M % LaneSize) + Src * Size;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
if (Srcs[1] < 0)
continue;
int j = i / LaneSize;
LaneSrcs[Lane][0] = Srcs[0];
LaneSrcs[Lane][1] = Srcs[1];
if (Lanes[j] < 0) {
// First entry we've seen for this lane.
Lanes[j] = Mask[i] / LaneSize;
} else if (Lanes[j] != Mask[i] / LaneSize) {
// This doesn't match the lane selected previously!
return SDValue();
auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
assert(M1.size() == M2.size() && "Unexpected mask size");
for (int i = 0, e = M1.size(); i != e; ++i)
if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
return false;
return true;
};
auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
for (int i = 0, e = MergedMask.size(); i != e; ++i) {
int M = Mask[i];
if (M < 0)
continue;
assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
"Unexpected mask element");
MergedMask[i] = M;
}
};
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Check that within each lane we have a consistent shuffle mask.
int k = i % LaneSize;
if (InLaneMask[k] < 0) {
InLaneMask[k] = Mask[i] % LaneSize;
} else if (InLaneMask[k] != Mask[i] % LaneSize) {
// This doesn't fit a repeating in-lane mask.
return SDValue();
// Didn't find a match. Swap the operands and try again.
std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
ShuffleVectorSDNode::commuteMask(InLaneMask);
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Couldn't find a match with the operands in either order.
return SDValue();
}
// First shuffle the lanes into place.
MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
VT.getSizeInBits() / 64);
SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
for (int i = 0; i < NumLanes; ++i)
if (Lanes[i] >= 0) {
LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
// Now handle any lanes with only one source.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
// If this lane has already been processed, skip it.
if (LaneSrcs[Lane][0] >= 0)
continue;
for (int i = 0; i != LaneSize; ++i) {
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet we can define it ourself.
if (RepeatMask[i] < 0)
RepeatMask[i] = M % LaneSize;
if (RepeatMask[i] < Size) {
if (RepeatMask[i] != M % LaneSize)
return SDValue();
LaneSrcs[Lane][0] = M / LaneSize;
} else {
if (RepeatMask[i] != ((M % LaneSize) + Size))
return SDValue();
LaneSrcs[Lane][1] = M / LaneSize;
}
}
V1 = DAG.getBitcast(LaneVT, V1);
V2 = DAG.getBitcast(LaneVT, V2);
SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
return SDValue();
}
// Cast it back to the type we actually want.
LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
SmallVector<int, 16> NewMask(Size, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
for (int i = 0; i != LaneSize; ++i) {
int M = -1;
if (Src >= 0)
M = Src * LaneSize + i;
NewMask[Lane * LaneSize + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Now do a simple shuffle that isn't lane crossing.
SmallVector<int, 8> NewMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
"Must not introduce lane crosses at this point!");
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
for (int i = 0; i != LaneSize; ++i) {
int M = -1;
if (Src >= 0)
M = Src * LaneSize + i;
NewMask[Lane * LaneSize + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
for (int i = 0; i != Size; ++i) {
NewMask[i] = RepeatMask[i % LaneSize];
if (NewMask[i] < 0)
continue;
NewMask[i] += (i / LaneSize) * LaneSize;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.


@ -201,10 +201,13 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0
; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
; AVX256VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX256VLBW-NEXT: movl $-537190396, %eax # imm = 0xDFFB2004
; AVX256VLBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX256VLBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,2,1,5,5,6,5]
; AVX256VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX256VLBW-NEXT: movl $537141252, %eax # imm = 0x20042004
; AVX256VLBW-NEXT: kmovd %eax, %k1
; AVX256VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX256VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
; AVX256VLBW-NEXT: vpmovb2m %ymm0, %k0
; AVX256VLBW-NEXT: vpmovm2b %k0, %ymm0
; AVX256VLBW-NEXT: retq


@ -224,21 +224,21 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
@ -2131,11 +2131,9 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1
;
; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
@ -2159,9 +2157,8 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0
;
; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: retq
;
@ -2189,9 +2186,8 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2
; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
@ -2218,21 +2214,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
@ -2259,21 +2253,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpsllq $48, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,7,8,9,10,11,12,12,12,15]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,14,15,16,17,16,17,16,17,16,17,24,25,24,25,24,25,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
@ -2298,9 +2290,8 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1
;
; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: retq
;
@ -2327,8 +2318,8 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1
; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
@ -2354,18 +2345,16 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27]
; AVX2-FAST-NEXT: retq
;
@ -2391,11 +2380,10 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0
;
; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,24,25,24,25,24,25,24,25,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15,24,25,24,25,24,25,24,25,16,17,16,17,16,17,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
@ -2419,9 +2407,8 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1
;
; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: retq
;
@ -2448,18 +2435,16 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27]
; AVX2-FAST-NEXT: retq
;
@ -2483,22 +2468,12 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX512VL: # %bb.0:
@ -2522,9 +2497,8 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0
;
; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1,30,31,26,27,28,29,24,25,22,23,18,19,20,21,16,17]
; AVX2-NEXT: retq
;
@ -2550,11 +2524,10 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0
;
; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1,18,19,16,17,26,27,24,25,26,27,24,25,18,19,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3,18,19,16,17,26,27,24,25,26,27,24,25,18,19,18,19]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
@ -2579,11 +2552,10 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0
;
; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1,26,27,24,25,18,19,16,17,26,27,24,25,18,19,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3,26,27,24,25,18,19,16,17,26,27,24,25,18,19,18,19]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
@ -2608,11 +2580,10 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1
;
; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9,26,27,24,25,18,19,16,17,18,19,16,17,26,27,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3,26,27,24,25,18,19,16,17,18,19,16,17,26,27,18,19]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
@ -2637,11 +2608,10 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0
;
; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1,16,17,24,25,24,25,16,17,16,17,24,25,24,25,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3,16,17,24,25,24,25,16,17,16,17,24,25,24,25,18,19]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
@ -2666,11 +2636,10 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1
;
; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9,24,25,16,17,16,17,24,25,24,25,16,17,16,17,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3,24,25,16,17,16,17,24,25,24,25,16,17,16,17,18,19]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
@ -2695,9 +2664,8 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1
;
; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7,20,21,28,29,24,25,16,17,26,27,18,19,30,31,22,23]
; AVX2-NEXT: retq
;
@ -2723,9 +2691,8 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1
;
; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7,20,21,16,17,28,29,24,25,26,27,18,19,30,31,22,23]
; AVX2-NEXT: retq
;
@ -2751,9 +2718,8 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1
;
; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11,20,21,28,29,24,25,16,17,18,19,22,23,30,31,26,27]
; AVX2-NEXT: retq
;
@ -2777,19 +2743,12 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-FAST-NEXT: retq
; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX512VL: # %bb.0:
@ -2813,11 +2772,10 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1
;
; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,24,25,24,25,24,25,24,25,24,25,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15,16,17,16,17,24,25,24,25,24,25,24,25,24,25,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
@ -2842,11 +2800,10 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1
;
; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9,24,25,24,25,16,17,16,17,24,25,24,25,24,25,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15,24,25,24,25,16,17,16,17,24,25,24,25,24,25,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
@ -2871,11 +2828,10 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1
;
; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15,16,17,24,25,24,25,16,17,24,25,24,25,24,25,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
@ -2900,11 +2856,10 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0
;
; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1,16,17,24,25,24,25,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15,16,17,24,25,24,25,16,17,16,17,16,17,16,17,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
@ -2930,19 +2885,17 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
@ -2967,11 +2920,10 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1
;
; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9,16,17,18,19,24,25,24,25,24,25,24,25,24,25,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,255,255,255,255,0,0,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15,16,17,18,19,24,25,24,25,24,25,24,25,24,25,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
@ -2996,11 +2948,10 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1
;
; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9,24,25,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,255,255,255,255,255,255,255,255,0,0,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255>
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15,24,25,24,25,24,25,16,17,24,25,24,25,24,25,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
@ -3025,11 +2976,10 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1
;
; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,255,255,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255>
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15,16,17,24,25,24,25,16,17,24,25,24,25,24,25,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
@ -3071,8 +3021,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7,28,29,22,23,20,21,22,23,24,25,26,27,28,29,22,23]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
@ -3132,9 +3082,8 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7,16,17,18,19,20,21,30,31,24,25,26,27,28,29,22,23]
; AVX2-NEXT: retq
;
@ -3159,10 +3108,9 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3,24,25,26,27,28,29,22,23,16,17,18,19,20,21,18,19]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
@ -3185,19 +3133,12 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-FAST-NEXT: retq
; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX512VL: # %bb.0:
@ -3222,9 +3163,8 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: retq
;
@ -3253,10 +3193,9 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
@ -3284,8 +3223,8 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
@ -3314,9 +3253,8 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
; AVX2-NEXT: retq
;
@ -3348,10 +3286,10 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
@ -3359,9 +3297,9 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FAST-NEXT: retq
@ -3392,9 +3330,8 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
@ -3403,9 +3340,8 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
@ -3437,9 +3373,8 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
;
; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@ -3447,9 +3382,8 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
;
; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
@ -3478,9 +3412,8 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1
;
; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-NEXT: retq
;
@ -3510,8 +3443,8 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT: retq
;
@ -3543,20 +3476,18 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
@ -3681,7 +3612,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
@ -3739,9 +3670,8 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1
; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23]
; AVX2-NEXT: retq
;
@ -3797,8 +3727,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
;
; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
@ -3806,9 +3736,9 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
;
; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,4,4,5,6,4]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-FAST-NEXT: retq
;
@ -3867,9 +3797,8 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;
@ -3912,9 +3841,8 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1
;
; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;
@ -3977,9 +3905,8 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
@ -4023,9 +3950,8 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1
;
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
@ -4088,9 +4014,8 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
@ -4135,9 +4060,8 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;


@ -309,23 +309,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512VLBW-NEXT: movl $32767, %eax # imm = 0x7FFF
; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16]
; AVX512VLBW-NEXT: movl $-2147450880, %eax # imm = 0x80008000
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@ -2659,18 +2660,13 @@ define <32 x i8> @shuffle_v32i8_31_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_
;
; AVX2-LABEL: shuffle_v32i8_31_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_31_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: movl $-2147483648, %eax # imm = 0x80000000
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX512VLBW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512VLBW-NEXT: retq
;


@ -311,9 +311,9 @@ define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0462:
; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
ret <4 x double> %shuffle
@ -1318,12 +1318,8 @@ define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1z3z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z3z:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1z3z:
@ -1567,15 +1563,14 @@ define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) {
define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1234:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1234:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1234:
@ -1682,17 +1677,11 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: add_v4f64_0246_1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm4[0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_0246_1357:
@ -1731,17 +1720,11 @@ entry:
define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: add_v4f64_4602_5713:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm1[0],xmm4[0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_4602_5713:
@ -1780,21 +1763,15 @@ entry:
define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: add_v4i64_0246_1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm4[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4i64_0246_1357:
@ -1833,21 +1810,15 @@ entry:
define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: add_v4i64_4602_5713:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4i64_4602_5713:


@ -74,10 +74,10 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00040000:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00040000:
@ -659,11 +659,11 @@ define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_c348cda0:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX1-NEXT: retq
;
@ -885,12 +885,9 @@ define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
; AVX1-LABEL: PR21138:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR21138:
@ -1146,10 +1143,10 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00040000:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00040000:
@ -2443,17 +2440,15 @@ define <8 x i32> @insert_dup_mem_v8i32(i32* %ptr) {
define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_12345678:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_12345678:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3],ymm0[20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_12345678:
@ -2489,17 +2484,11 @@ define <8 x i32> @shuffle_v8i32_12345670(<8 x i32> %a) {
define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: add_v8f32_02468ACE_13579BDF:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
@ -2538,17 +2527,11 @@ entry:
define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: add_v8f32_8ACE0246_9BDF1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357:
@ -2587,21 +2570,15 @@ entry:
define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: add_v8i32_02468ACE_13579BDF:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8i32_02468ACE_13579BDF:
@ -2640,21 +2617,15 @@ entry:
define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357:


@ -434,41 +434,32 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_
define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm1
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
@ -485,15 +476,13 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
@ -501,31 +490,27 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq


@ -82,33 +82,33 @@ define <4 x double> @expand2(<2 x double> %a) {
; SKX64-LABEL: expand2:
; SKX64: # %bb.0:
; SKX64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX64-NEXT: movb $9, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; SKX64-NEXT: vmovaps %xmm0, %xmm0
; SKX64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand2:
; KNL64: # %bb.0:
; KNL64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL64-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; KNL64-NEXT: vmovaps %xmm0, %xmm0
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand2:
; SKX32: # %bb.0:
; SKX32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX32-NEXT: movb $9, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; SKX32-NEXT: vmovaps %xmm0, %xmm0
; SKX32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand2:
; KNL32: # %bb.0:
; KNL32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL32-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; KNL32-NEXT: vmovaps %xmm0, %xmm0
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; KNL32-NEXT: retl
%res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
ret <4 x double> %res
@ -162,9 +162,9 @@ define <4 x i64> @expand4(<2 x i64> %a ) {
; KNL64-LABEL: expand4:
; KNL64: # %bb.0:
; KNL64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL64-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; KNL64-NEXT: vmovaps %xmm0, %xmm0
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand4:
@ -178,9 +178,9 @@ define <4 x i64> @expand4(<2 x i64> %a ) {
; KNL32-LABEL: expand4:
; KNL32: # %bb.0:
; KNL32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL32-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; KNL32-NEXT: vmovaps %xmm0, %xmm0
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; KNL32-NEXT: retl
%res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
ret <4 x i64> %res


@ -2773,24 +2773,17 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
;
; AVX1-LABEL: PR22412:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: PR22412:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR22412:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
; AVX2-LABEL: PR22412:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX2-NEXT: retq
entry:
%s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>