forked from OSchip/llvm-project
[X86] Try to zero elts when lowering 256-bit shuffle with PSHUFB.
Otherwise we fall back to a blend of PSHUFBs later on. Differential Revision: http://reviews.llvm.org/D19661 llvm-svn: 271113
This commit is contained in:
parent
2d39bb3c6a
commit
a3dc1ba142
|
@ -7170,6 +7170,59 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
|
||||||
return Zeroable;
|
return Zeroable;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Mutate a shuffle mask, replacing zeroable elements with SM_SentinelZero.
|
||||||
|
static void computeZeroableShuffleMask(MutableArrayRef<int> Mask,
|
||||||
|
SDValue V1, SDValue V2) {
|
||||||
|
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
|
||||||
|
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
|
||||||
|
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
|
||||||
|
Mask[i] = SM_SentinelZero;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Try to lower a shuffle with a single PSHUFB of V1.
|
||||||
|
/// This is only possible if V2 is unused (at all, or only for zero elements).
|
||||||
|
static SDValue lowerVectorShuffleWithPSHUFB(SDLoc DL, MVT VT,
|
||||||
|
ArrayRef<int> Mask, SDValue V1,
|
||||||
|
SDValue V2,
|
||||||
|
const X86Subtarget &Subtarget,
|
||||||
|
SelectionDAG &DAG) {
|
||||||
|
const int NumBytes = VT.is128BitVector() ? 16 : 32;
|
||||||
|
const int NumEltBytes = VT.getScalarSizeInBits() / 8;
|
||||||
|
|
||||||
|
assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
|
||||||
|
(Subtarget.hasAVX2() && VT.is256BitVector()));
|
||||||
|
|
||||||
|
SmallVector<int, 32> ZeroableMask(Mask.begin(), Mask.end());
|
||||||
|
computeZeroableShuffleMask(ZeroableMask, V1, V2);
|
||||||
|
|
||||||
|
if (!isSingleInputShuffleMask(ZeroableMask) ||
|
||||||
|
is128BitLaneCrossingShuffleMask(VT, Mask))
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
SmallVector<SDValue, 32> PSHUFBMask(NumBytes);
|
||||||
|
// Sign bit set in i8 mask means zero element.
|
||||||
|
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
|
||||||
|
|
||||||
|
for (int i = 0; i < NumBytes; ++i) {
|
||||||
|
int M = ZeroableMask[i / NumEltBytes];
|
||||||
|
if (M == SM_SentinelUndef) {
|
||||||
|
PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
|
||||||
|
} else if (M == SM_SentinelZero) {
|
||||||
|
PSHUFBMask[i] = ZeroMask;
|
||||||
|
} else {
|
||||||
|
M = M * NumEltBytes + (i % NumEltBytes);
|
||||||
|
M = i < 16 ? M : M - 16;
|
||||||
|
PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
|
||||||
|
return DAG.getBitcast(
|
||||||
|
VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
|
||||||
|
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
|
||||||
|
}
|
||||||
|
|
||||||
// X86 has dedicated unpack instructions that can handle specific blend
|
// X86 has dedicated unpack instructions that can handle specific blend
|
||||||
// operations: UNPCKH and UNPCKL.
|
// operations: UNPCKH and UNPCKL.
|
||||||
static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
|
static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
|
||||||
|
@ -11389,26 +11442,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||||
return lowerV8I16GeneralSingleInputVectorShuffle(
|
return lowerV8I16GeneralSingleInputVectorShuffle(
|
||||||
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
|
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
|
||||||
}
|
}
|
||||||
|
|
||||||
SDValue PSHUFBMask[32];
|
|
||||||
for (int i = 0; i < 16; ++i) {
|
|
||||||
if (Mask[i] == -1) {
|
|
||||||
PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int M = i < 8 ? Mask[i] : Mask[i] - 8;
|
|
||||||
assert(M >= 0 && M < 8 && "Invalid single-input mask!");
|
|
||||||
PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
|
|
||||||
PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
|
|
||||||
}
|
|
||||||
return DAG.getBitcast(
|
|
||||||
MVT::v16i16,
|
|
||||||
DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
|
|
||||||
DAG.getBitcast(MVT::v32i8, V1),
|
|
||||||
DAG.getBuildVector(MVT::v32i8, DL, PSHUFBMask)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
|
||||||
|
V2, Subtarget, DAG))
|
||||||
|
return PSHUFB;
|
||||||
|
|
||||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||||
// shuffle.
|
// shuffle.
|
||||||
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
|
||||||
|
@ -11471,24 +11510,16 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||||
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
|
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
|
||||||
return V;
|
return V;
|
||||||
|
|
||||||
if (isSingleInputShuffleMask(Mask)) {
|
// There are no generalized cross-lane shuffle operations available on i8
|
||||||
// There are no generalized cross-lane shuffle operations available on i8
|
// element types.
|
||||||
// element types.
|
if (isSingleInputShuffleMask(Mask) &&
|
||||||
if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
|
is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
|
||||||
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
|
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
|
||||||
Mask, DAG);
|
DAG);
|
||||||
|
|
||||||
SDValue PSHUFBMask[32];
|
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
|
||||||
for (int i = 0; i < 32; ++i)
|
V2, Subtarget, DAG))
|
||||||
PSHUFBMask[i] =
|
return PSHUFB;
|
||||||
Mask[i] < 0
|
|
||||||
? DAG.getUNDEF(MVT::i8)
|
|
||||||
: DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
|
|
||||||
MVT::i8);
|
|
||||||
|
|
||||||
return DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, V1,
|
|
||||||
DAG.getBuildVector(MVT::v32i8, DL, PSHUFBMask));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||||
// shuffle.
|
// shuffle.
|
||||||
|
|
|
@ -2378,6 +2378,24 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
|
||||||
ret <16 x i16> %shuffle
|
ret <16 x i16> %shuffle
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
|
||||||
|
; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
|
||||||
|
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||||
|
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
|
||||||
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||||
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
|
||||||
|
; AVX2-NEXT: retq
|
||||||
|
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||||
|
ret <16 x i16> %shuffle
|
||||||
|
}
|
||||||
|
|
||||||
define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
|
define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
|
||||||
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
|
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
|
||||||
; AVX1: # BB#0:
|
; AVX1: # BB#0:
|
||||||
|
|
|
@ -953,6 +953,22 @@ define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_
|
||||||
ret <32 x i8> %shuffle
|
ret <32 x i8> %shuffle
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
|
||||||
|
; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
|
||||||
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||||
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
|
||||||
|
; AVX2-NEXT: retq
|
||||||
|
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
|
||||||
|
ret <32 x i8> %shuffle
|
||||||
|
}
|
||||||
|
|
||||||
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
|
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
|
||||||
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
|
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
|
||||||
; AVX1: # BB#0:
|
; AVX1: # BB#0:
|
||||||
|
|
Loading…
Reference in New Issue