[X86][SSE] Resolve target shuffle inputs to sentinels to permit more combines

combineX86ShufflesRecursively only supports unary shuffles, so it was missing the opportunity to combine binary shuffles that have a zero or undef second input.

This patch resolves the target shuffle inputs, converting the shuffle mask elements to SM_SentinelUndef/SM_SentinelZero where possible. It then inspects the updated mask to check whether we have created a faux unary shuffle.

Additionally, we now attempt to recursively call combineX86ShufflesRecursively for all input operands (we used to recurse only for unary integer shuffles and unary unpacks); it safely returns early if the operand is not a target shuffle.
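To make the idea concrete, here is a minimal standalone sketch (not the LLVM implementation; resolveToUnary, kUndef and kZero are illustrative stand-ins for the real helpers and the SM_SentinelUndef/SM_SentinelZero values): a binary shuffle whose second input is known to be all zeros has those mask elements rewritten to zero sentinels, after which the mask references only the first input and can be treated as a unary shuffle.

  // Standalone illustration only, not the LLVM code itself.
  #include <cstdio>
  #include <vector>

  // Illustrative stand-ins for SM_SentinelUndef / SM_SentinelZero.
  constexpr int kUndef = -1;
  constexpr int kZero = -2;

  // Rewrite mask elements that index into an all-zero second operand as
  // zero sentinels, and report whether the mask is now effectively unary.
  static bool resolveToUnary(std::vector<int> &Mask, int NumElts,
                             bool Op1IsZeroVector) {
    bool UsesOp1 = false;
    for (int &M : Mask) {
      if (M >= NumElts) {
        if (Op1IsZeroVector)
          M = kZero;      // element comes from the known-zero input
        else
          UsesOp1 = true; // still a genuine two-input shuffle
      }
    }
    return !UsesOp1;      // true -> only Op0 (or sentinels) remain
  }

  int main() {
    // An unpack-low style mask (8-element view for brevity) interleaving
    // Op0 with a zero register: indices 0..7 are Op0, 8..15 are the zeros.
    std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
    bool Unary = resolveToUnary(Mask, /*NumElts=*/8, /*Op1IsZeroVector=*/true);
    std::printf("unary: %d, mask:", Unary);
    for (int M : Mask)
      std::printf(" %d", M);
    std::printf("\n"); // unary: 1, mask: 0 -2 1 -2 2 -2 3 -2
    return 0;
  }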

Differential Revision: http://reviews.llvm.org/D16683

llvm-svn: 260063
Simon Pilgrim 2016-02-07 22:51:06 +00:00
parent bd173badb4
commit f116e4acc7
3 changed files with 113 additions and 48 deletions


@@ -5094,7 +5094,9 @@ static bool setTargetShuffleZeroElements(SDValue N,
if (M < 0)
continue;
// Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
M %= Size;
// We are referencing an UNDEF input.
if (V.isUndef()) {
@@ -5102,12 +5104,77 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
}
// TODO - handle the Size != (int)V.getNumOperands() cases in future.
if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
if (!X86::isZeroNode(V.getOperand(M % Size)))
// If the BUILD_VECTOR has fewer elements then the (larger) source
// element must be UNDEF/ZERO.
// TODO: Is it worth testing the individual bits of a constant?
if ((Size % V.getNumOperands()) == 0) {
unsigned Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef())
Mask[i] = SM_SentinelUndef;
else if (X86::isZeroNode(Op))
Mask[i] = SM_SentinelZero;
continue;
Mask[i] = SM_SentinelZero;
}
// If the BUILD_VECTOR has more elements then all the (smaller) source
// elements must be all UNDEF or all ZERO.
if ((V.getNumOperands() % Size) == 0) {
unsigned Scale = V->getNumOperands() / Size;
bool AllUndef = true;
bool AllZero = true;
for (unsigned j = 0; j != Scale; ++j) {
SDValue Op = V.getOperand((M * Scale) + j);
AllUndef &= Op.isUndef();
AllZero &= X86::isZeroNode(Op);
}
if (AllUndef)
Mask[i] = SM_SentinelUndef;
else if (AllZero)
Mask[i] = SM_SentinelZero;
continue;
}
}
return true;
}
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// Op0/Op1 inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0,
SDValue &Op1,
SmallVectorImpl<int> &Mask) {
if (!setTargetShuffleZeroElements(Op, Mask))
return false;
int NumElts = Mask.size();
bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
return 0 <= Idx && Idx < NumElts;
});
bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
[NumElts](int Idx) { return NumElts <= Idx; });
Op0 = Op0InUse ? Op.getOperand(0) : SDValue();
Op1 = Op1InUse ? Op.getOperand(1) : SDValue();
IsUnary = !(Op0InUse && Op1InUse);
if (!IsUnary)
return true;
// We're only using Op1 - commute the mask and inputs.
if (!Op0InUse && Op1InUse) {
for (int &M : Mask)
if (NumElts <= M)
M -= NumElts;
Op0 = Op1;
Op1 = SDValue();
}
return true;
@@ -23278,7 +23345,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combinine below. When we have found some
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
@@ -23439,13 +23506,19 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
int NumBytes = VT.getSizeInBits() / 8;
int Ratio = NumBytes / Mask.size();
for (int i = 0; i < NumBytes; ++i) {
if (Mask[i / Ratio] == SM_SentinelUndef) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
int M = Mask[i / Ratio] != SM_SentinelZero
? Ratio * Mask[i / Ratio] + i % Ratio
: 255;
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
// Check that we are not crossing lanes.
if ((M / 16) != (i / 16))
return false;
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
@@ -23518,13 +23591,15 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
if (!isTargetShuffle(Op.getOpcode()))
return false;
SmallVector<int, 16> OpMask;
// Extract target shuffle mask and resolve sentinels and inputs.
bool IsUnary;
bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
// We only can combine unary shuffles which we can decode the mask for.
if (!HaveMask || !IsUnary)
SDValue Input0, Input1;
SmallVector<int, 16> OpMask;
if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask))
return false;
// At the moment we can only combine target shuffle unary cases.
if (!IsUnary)
return false;
assert(VT.getVectorNumElements() == OpMask.size() &&
@@ -23570,32 +23645,25 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
RootMaskedIdx % OpRatio);
}
// See if we can recurse into the operand to combine more things.
switch (Op.getOpcode()) {
case X86ISD::PSHUFB:
HasPSHUFB = true;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
if (Op.getOperand(0).hasOneUse() &&
combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
HasPSHUFB, DAG, DCI, Subtarget))
return true;
break;
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
assert(Op.getOperand(0) == Op.getOperand(1) &&
"We only combine unary shuffles!");
// We can't check for single use, we have to check that this shuffle is the
// only user.
if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
HasPSHUFB, DAG, DCI, Subtarget))
return true;
break;
// Handle the all undef case early.
// TODO - should we handle zero/undef case as well? Widening the mask
// will lose information on undef elements possibly reducing future
// combine possibilities.
if (std::all_of(Mask.begin(), Mask.end(),
[](int Idx) { return Idx == SM_SentinelUndef; })) {
DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
return true;
}
HasPSHUFB |= (Op.getOpcode() == X86ISD::PSHUFB);
// See if we can recurse into Input0 (if it's a target shuffle).
if (Input0 && Op->isOnlyUserOf(Input0.getNode()) &&
combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, HasPSHUFB,
DAG, DCI, Subtarget))
return true;
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop


@@ -143,14 +143,12 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: psrad $24, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
; SSSE3-NEXT: psrad $24, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i32:


@@ -1205,9 +1205,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]