[X86][AVX2] Add support for combining target shuffles to VPMOVZX

Initial 256-bit vector support; 512-bit support requires extra checks for AVX512BW (PMOVZXBW) and will be handled in a future patch.

llvm-svn: 294896
Simon Pilgrim 2017-02-12 14:31:23 +00:00
parent 022c6e4f33
commit 4cd841757a
2 changed files with 13 additions and 10 deletions
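For readers unfamiliar with the shuffle-combine matcher, the sketch below is a self-contained, simplified illustration (not LLVM code) of the mask pattern this patch now also recognizes for 256-bit vectors: element i*Scale must be the identity index i (or undef) and the Scale-1 lanes after it must be zero (or undef). The sentinel values and the helper name are stand-ins for LLVM's SM_SentinelUndef/SM_SentinelZero and isUndefOrEqual/isUndefOrZeroInRange.

// Standalone sketch of the "zero-extend shuffle mask" test that
// matchUnaryVectorShuffle performs before emitting X86ISD::VZEXT.
#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // lane value does not matter (stand-in for SM_SentinelUndef)
constexpr int kZero  = -2; // lane is known to be zero  (stand-in for SM_SentinelZero)

// Returns true if Mask describes "keep every Scale-th element, zero the rest",
// i.e. a zero-extension of the low NumDstElts source elements.
static bool matchesZeroExtend(const std::vector<int> &Mask, unsigned Scale) {
  unsigned NumDstElts = Mask.size() / Scale;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    int M = Mask[i * Scale];
    if (M != kUndef && M != (int)i)
      return false; // low lane must be the identity element i
    for (unsigned j = 1; j != Scale; ++j) {
      int Z = Mask[i * Scale + j];
      if (Z != kUndef && Z != kZero)
        return false; // remaining lanes must be zero (or undef)
    }
  }
  return true;
}

int main() {
  // 16-element (v16i16-style) mask for the test case below: words 0..3
  // zero-extended to i64, i.e. the pattern that now becomes VPMOVZXWQ ymm, xmm.
  std::vector<int> Mask = {0, kZero, kZero, kZero, 1, kZero, kZero, kZero,
                           2, kZero, kZero, kZero, 3, kZero, kZero, kZero};
  for (unsigned Scale = 2; Scale <= 8; Scale *= 2)
    std::printf("Scale %u: %s\n", Scale,
                matchesZeroExtend(Mask, Scale) ? "match" : "no match");
  return 0;
}

Running this prints a match only at Scale 4, which corresponds to a word-to-quadword zero extension.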


@@ -26264,7 +26264,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                    bool FloatDomain,
+                                    bool FloatDomain, SDValue &V1, SDLoc &DL,
+                                    SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
   unsigned NumMaskElts = Mask.size();
@@ -26280,8 +26281,9 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // Match against a VZEXT instruction.
-  // TODO: Add 256/512-bit vector support.
-  if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+                       (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
     unsigned MaxScale = 64 / MaskEltSize;
     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
       bool Match = true;
@@ -26291,7 +26293,10 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
         Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
       }
       if (Match) {
-        SrcVT = MaskVT;
+        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+        SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
+        if (SrcVT != MaskVT)
+          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
         Shuffle = X86ISD::VZEXT;
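As a concrete trace of the new SrcSize/SrcVT logic (worked by hand for the combine_pshufb_as_zext test below, assuming the combined mask reaches the matcher as 16-bit elements): MaskVT is v16i16 and the mask matches at Scale = 4, so NumDstElts = 16 / 4 = 4 and MaskEltSize = 16. SrcSize = max(128, 4 * 16) = 128, giving SrcVT = v8i16; since that differs from MaskVT, V1 is shrunk to its low 128-bit subvector via extractSubVector, which is why V1, DL and DAG are now threaded into the matcher. DstVT becomes v4i64, and the resulting X86ISD::VZEXT lowers to the single vpmovzxwq seen in the updated test checks.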
@@ -26908,8 +26913,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
-  if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
-                              ShuffleSrcVT, ShuffleVT)) {
+  if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, DL, DAG,
+                              Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))


@@ -480,14 +480,12 @@ define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
 define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_zext:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,1]
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[20,21],zero,zero,zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_zext:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,1]
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[20,21],zero,zero,zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)