forked from OSchip/llvm-project
[x86] Dramatically improve v8i16 shuffle lowering by not using its
terribly complex partial blend logic. This code path was one of the more complex and bug-prone when it first went in, and it hasn't fared much better since. Ultimately, with the simpler basis for unpack lowering and support for bit-math blending, this is completely obsolete. In the worst case without this we generate different but equivalent instructions. However, in many cases we generate much better code. This is especially true when blends or pshufb are available. This does expose one (minor) weakness of the unpack lowering that I'll try to address. In case you were wondering, this is actually a big part of what I've been trying to pull off in the recent string of commits. llvm-svn: 229853
This commit is contained in:
parent
5bb231c279
commit
352eba1c29
|
@ -9521,122 +9521,6 @@ static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
|
|||
return InterleavedCrosses < SplitCrosses;
|
||||
}
|
||||
|
||||
/// \brief Blend two v8i16 vectors using a naive unpack strategy.
|
||||
///
|
||||
/// This strategy only works when the inputs from each vector fit into a single
|
||||
/// half of that vector, and generally there are not so many inputs as to leave
|
||||
/// the in-place shuffles required highly constrained (and thus expensive). It
|
||||
/// shifts all the inputs into a single side of both input vectors and then
|
||||
/// uses an unpack to interleave these inputs in a single vector. At that
|
||||
/// point, we will fall back on the generic single input shuffle lowering.
///
/// \p Mask is rewritten in place: first to track where each referenced input
/// element lands after the per-input moves, and finally into a single-input
/// mask over the unpack result. Each of \p V1 and \p V2 must supply between
/// one and three mask elements, and at most four combined (asserted below).
/// \p Subtarget is not referenced anywhere in this body.
|
||||
static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
|
||||
SDValue V2,
|
||||
MutableArrayRef<int> Mask,
|
||||
const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
|
||||
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
|
||||
// Bucket every output position by which half of which input its mask entry
// reads from. Negative (undef) mask entries fall into no bucket. Note that the
// buckets hold output positions (i), not the mask values themselves.
SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
|
||||
for (int i = 0; i < 8; ++i)
|
||||
if (Mask[i] >= 0 && Mask[i] < 4)
|
||||
LoV1Inputs.push_back(i);
|
||||
else if (Mask[i] >= 4 && Mask[i] < 8)
|
||||
HiV1Inputs.push_back(i);
|
||||
else if (Mask[i] >= 8 && Mask[i] < 12)
|
||||
LoV2Inputs.push_back(i);
|
||||
else if (Mask[i] >= 12)
|
||||
HiV2Inputs.push_back(i);
|
||||
|
||||
int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
|
||||
int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
|
||||
// The counts are only consumed by the asserts below; the void casts keep them
// "used" in builds where assert expands to nothing.
(void)NumV1Inputs;
|
||||
(void)NumV2Inputs;
|
||||
assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
|
||||
assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
|
||||
assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
|
||||
|
||||
// Merge through whichever half is already read by at least as many mask
// elements; a tie selects the low half.
bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
|
||||
HiV1Inputs.size() + HiV2Inputs.size();
|
||||
|
||||
// Gathers every element of V that the mask reads into one half of V (low when
// MoveToLo, high otherwise). LoInputs/HiInputs are the output positions that
// read from V's low/high half; MaskOffset is 0 for V1 and 8 for V2 and converts
// between blend-mask values and indices local to V. Returns V untouched when
// nothing sits in the wrong half; otherwise returns a single-input shuffle of
// V and updates the captured Mask so it points at the relocated elements.
auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
|
||||
ArrayRef<int> HiInputs, bool MoveToLo,
|
||||
int MaskOffset) {
|
||||
// "Good" inputs already read from the target half; "bad" ones must be moved.
ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
|
||||
ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
|
||||
if (BadInputs.empty())
|
||||
return V;
|
||||
|
||||
int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
|
||||
// Index of the first word of the target half within the 8-element vector.
int MoveOffset = MoveToLo ? 0 : 4;
|
||||
|
||||
if (GoodInputs.empty()) {
|
||||
// Nothing lives in the target half yet, so each bad input keeps its position
// within the half (its index modulo 4) and simply switches halves.
for (int BadInput : BadInputs) {
|
||||
MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
|
||||
Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
|
||||
}
|
||||
} else {
|
||||
if (GoodInputs.size() == 2) {
|
||||
// If the low inputs are spread across two dwords, pack them into
|
||||
// a single dword.
|
||||
MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
|
||||
MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
|
||||
Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
|
||||
Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
|
||||
} else {
|
||||
// Otherwise pin the good inputs.
|
||||
for (int GoodInput : GoodInputs)
|
||||
MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
|
||||
}
|
||||
|
||||
if (BadInputs.size() == 2) {
|
||||
// If we have two bad inputs then there may be either one or two good
|
||||
// inputs fixed in place. Find a fixed input, and then find the *other*
|
||||
// two adjacent indices by using modular arithmetic.
|
||||
int GoodMaskIdx =
|
||||
std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
|
||||
[](int M) { return M >= 0; }) -
|
||||
std::begin(MoveMask);
|
||||
// Relative to the half: "& ~1" rounds the occupied slot down to the start of
// its word pair (0 or 2), and "+ 2 ... % 4" flips to the start of the other
// pair (2 or 0).
int MoveMaskIdx =
|
||||
((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
|
||||
assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
|
||||
assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
|
||||
MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
|
||||
MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
|
||||
Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
|
||||
Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
|
||||
} else {
|
||||
assert(BadInputs.size() == 1 && "All sizes handled");
|
||||
// A single bad input takes the first still-unused (-1) slot found at or after
// MoveOffset.
int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
|
||||
std::end(MoveMask), -1) -
|
||||
std::begin(MoveMask);
|
||||
MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
|
||||
Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
|
||||
}
|
||||
}
|
||||
|
||||
return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
|
||||
MoveMask);
|
||||
};
|
||||
V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
|
||||
/*MaskOffset*/ 0);
|
||||
V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
|
||||
/*MaskOffset*/ 8);
|
||||
|
||||
// FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
|
||||
// cross-half traffic in the final shuffle.
|
||||
|
||||
// Munge the mask to be a single-input mask after the unpack merges the
|
||||
// results.
|
||||
// The unpack alternates V1 and V2 words, so word k of the merged half of V1
// ends up at 2 * k and the matching V2 word (mask value >= 8, hence M / 8 == 1)
// at 2 * k + 1.
for (int &M : Mask)
|
||||
if (M != -1)
|
||||
M = 2 * (M % 4) + (M / 8);
|
||||
|
||||
return DAG.getVectorShuffle(
|
||||
MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
|
||||
DL, MVT::v8i16, V1, V2),
|
||||
DAG.getUNDEF(MVT::v8i16), Mask);
|
||||
}
|
||||
|
||||
/// \brief Helper to form a PSHUFB-based shuffle+blend.
|
||||
static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
|
||||
SDValue V2, ArrayRef<int> Mask,
|
||||
|
@ -9772,9 +9656,6 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
|||
lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
|
||||
return BitBlend;
|
||||
|
||||
if (NumV1Inputs + NumV2Inputs <= 4)
|
||||
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
|
||||
|
||||
// Check whether an interleaving lowering is likely to be more efficient.
|
||||
// This isn't perfect but it is a strong heuristic that tends to work well on
|
||||
// the kinds of shuffles that show up in practice.
|
||||
|
|
|
@ -192,8 +192,8 @@ define void @t10() nounwind {
|
|||
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
|
||||
; X64-LABEL: t11:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: psrld $16, %xmm0
|
||||
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
|
||||
|
@ -204,9 +204,14 @@ entry:
|
|||
define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
|
||||
; X64-LABEL: t12:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
||||
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
|
||||
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
|
||||
; X64-NEXT: pand %xmm1, %xmm0
|
||||
; X64-NEXT: pandn %xmm2, %xmm1
|
||||
; X64-NEXT: por %xmm0, %xmm1
|
||||
; X64-NEXT: movdqa %xmm1, %xmm0
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
|
||||
|
@ -217,9 +222,13 @@ entry:
|
|||
define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
|
||||
; X64-LABEL: t13:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,65535,65535]
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
|
||||
; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
|
||||
; X64-NEXT: pand %xmm0, %xmm1
|
||||
; X64-NEXT: pandn %xmm2, %xmm0
|
||||
; X64-NEXT: por %xmm1, %xmm0
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
|
||||
|
@ -229,8 +238,9 @@ entry:
|
|||
define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
|
||||
; X64-LABEL: t14:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; X64-NEXT: psrlq $16, %xmm0
|
||||
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
|
||||
; X64-NEXT: movdqa %xmm1, %xmm0
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
|
||||
|
@ -242,11 +252,8 @@ define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
|
|||
; X64-LABEL: t15:
|
||||
; X64: ## BB#0: ## %entry
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
|
||||
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
|
||||
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
|
||||
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
|
||||
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
|
||||
|
|
|
@ -1100,34 +1100,33 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
|
|||
; SSE2: # BB#0: # %entry
|
||||
; SSE2-NEXT: pxor %xmm2, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
|
||||
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
|
||||
; SSE2-NEXT: pand %xmm5, %xmm2
|
||||
; SSE2-NEXT: pandn %xmm4, %xmm5
|
||||
; SSE2-NEXT: por %xmm2, %xmm5
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,1,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,1,4,5,6,7]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
|
||||
; SSE2-NEXT: packuswb %xmm5, %xmm3
|
||||
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[1,2,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,1]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
|
||||
; SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm2
|
||||
; SSE2-NEXT: por %xmm3, %xmm2
|
||||
; SSE2-NEXT: packuswb %xmm2, %xmm4
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
|
||||
; SSE2-NEXT: pand %xmm0, %xmm3
|
||||
; SSE2-NEXT: pand %xmm0, %xmm4
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
|
||||
; SSE2-NEXT: pandn %xmm1, %xmm0
|
||||
; SSE2-NEXT: por %xmm3, %xmm0
|
||||
; SSE2-NEXT: por %xmm4, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
|
||||
|
|
|
@ -1060,36 +1060,33 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_443aXXXX:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pand %xmm2, %xmm0
|
||||
; SSE2-NEXT: pandn %xmm1, %xmm2
|
||||
; SSE2-NEXT: por %xmm0, %xmm2
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
||||
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[4,5,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7],zero,zero,xmm0[u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_443aXXXX:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
||||
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
|
||||
; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
|
||||
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v8i16_443aXXXX:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
||||
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
|
||||
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
|
||||
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
|
||||
; AVX-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x i16> %shuffle
|
||||
|
@ -1098,34 +1095,37 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_032dXXXX:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_032dXXXX:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v8i16_032dXXXX:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
|
||||
; AVX-NEXT: retq
|
||||
; AVX1-LABEL: shuffle_v8i16_032dXXXX:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i16_032dXXXX:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x i16> %shuffle
|
||||
}
|
||||
|
@ -1146,33 +1146,30 @@ define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_012dXXXX:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pand %xmm2, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
|
||||
; SSE2-NEXT: pandn %xmm1, %xmm2
|
||||
; SSE2-NEXT: por %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_012dXXXX:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v8i16_012dXXXX:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
|
||||
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
|
||||
; AVX-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x i16> %shuffle
|
||||
|
@ -1181,41 +1178,37 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
|
||||
; SSE2-NEXT: pand %xmm2, %xmm1
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm2
|
||||
; SSE2-NEXT: por %xmm1, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
|
||||
; SSSE3-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm0[6,7]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13],zero,zero
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
|
||||
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
|
||||
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
|
||||
ret <8 x i16> %shuffle
|
||||
|
@ -1224,42 +1217,32 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE2-NEXT: pand %xmm2, %xmm1
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm2
|
||||
; SSE2-NEXT: por %xmm1, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
|
||||
; SSSE3-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[6,7,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
|
||||
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
|
||||
; AVX2-NEXT: retq
|
||||
; AVX-LABEL: shuffle_v8i16_cde3XXXX:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
|
||||
; AVX-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
ret <8 x i16> %shuffle
|
||||
}
|
||||
|
@ -1338,36 +1321,46 @@ define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_XXX1X579:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,0]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
|
||||
; SSE2-NEXT: pand %xmm1, %xmm0
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm1
|
||||
; SSE2-NEXT: por %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,xmm1[u,u],zero,zero,zero,zero,xmm1[2,3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,u,u,10,11,14,15],zero,zero
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_XXX1X579:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
|
||||
; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
|
||||
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
|
||||
; SSE41-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v8i16_XXX1X579:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
|
||||
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
|
||||
; AVX-NEXT: retq
|
||||
; AVX1-LABEL: shuffle_v8i16_XXX1X579:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
|
||||
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
|
||||
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i16_XXX1X579:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
|
||||
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
|
||||
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
|
||||
ret <8 x i16> %shuffle
|
||||
}
|
||||
|
@ -1375,42 +1368,40 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
|
|||
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
|
||||
; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
|
||||
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
|
||||
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
|
||||
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
||||
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
|
||||
; SSSE3-NEXT: por %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
|
||||
; SSE41: # BB#0:
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
||||
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
|
||||
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
|
||||
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v8i16_XX4X8acX:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
||||
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
||||
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
|
||||
; AVX-NEXT: retq
|
||||
; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i16_XX4X8acX:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
|
||||
; AVX2-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
|
||||
ret <8 x i16> %shuffle
|
||||
}
|
||||
|
|
|
@ -1435,9 +1435,9 @@ define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_z
|
|||
; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
|
||||
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,14,15,0,1,2,3,12,13,14,15]
|
||||
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
|
|
Loading…
Reference in New Issue