forked from OSchip/llvm-project
[X86][SSE] Resolve target shuffle inputs to sentinels to permit more combines
combineX86ShufflesRecursively only supported unary shuffles, and so was missing the opportunity to combine binary shuffles with a zero / undef second input. This patch resolves target shuffle inputs, converting the shuffle mask elements to SM_SentinelUndef/SM_SentinelZero where possible. It then resolves the updated mask to check if we have created a faux unary shuffle. Additionally, we now attempt to recursively call combineX86ShufflesRecursively for all input operands (we used to just recurse for unary integer shuffles and unary unpacks) - it safely returns early if it's not a target shuffle. Differential Revision: http://reviews.llvm.org/D16683 llvm-svn: 260063
This commit is contained in:
parent
bd173badb4
commit
f116e4acc7
|
@ -5094,7 +5094,9 @@ static bool setTargetShuffleZeroElements(SDValue N,
|
|||
if (M < 0)
|
||||
continue;
|
||||
|
||||
// Determine shuffle input and normalize the mask.
|
||||
SDValue V = M < Size ? V1 : V2;
|
||||
M %= Size;
|
||||
|
||||
// We are referencing an UNDEF input.
|
||||
if (V.isUndef()) {
|
||||
|
@ -5102,12 +5104,77 @@ static bool setTargetShuffleZeroElements(SDValue N,
|
|||
continue;
|
||||
}
|
||||
|
||||
// TODO - handle the Size != (int)V.getNumOperands() cases in future.
|
||||
if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
|
||||
// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
|
||||
if (V.getOpcode() != ISD::BUILD_VECTOR)
|
||||
continue;
|
||||
if (!X86::isZeroNode(V.getOperand(M % Size)))
|
||||
|
||||
// If the BUILD_VECTOR has fewer elements then the (larger) source
|
||||
// element must be UNDEF/ZERO.
|
||||
// TODO: Is it worth testing the individual bits of a constant?
|
||||
if ((Size % V.getNumOperands()) == 0) {
|
||||
unsigned Scale = Size / V->getNumOperands();
|
||||
SDValue Op = V.getOperand(M / Scale);
|
||||
if (Op.isUndef())
|
||||
Mask[i] = SM_SentinelUndef;
|
||||
else if (X86::isZeroNode(Op))
|
||||
Mask[i] = SM_SentinelZero;
|
||||
continue;
|
||||
Mask[i] = SM_SentinelZero;
|
||||
}
|
||||
|
||||
// If the BUILD_VECTOR has more elements then all the (smaller) source
|
||||
// elements must be all UNDEF or all ZERO.
|
||||
if ((V.getNumOperands() % Size) == 0) {
|
||||
unsigned Scale = V->getNumOperands() / Size;
|
||||
bool AllUndef = true;
|
||||
bool AllZero = true;
|
||||
for (unsigned j = 0; j != Scale; ++j) {
|
||||
SDValue Op = V.getOperand((M * Scale) + j);
|
||||
AllUndef &= Op.isUndef();
|
||||
AllZero &= X86::isZeroNode(Op);
|
||||
}
|
||||
if (AllUndef)
|
||||
Mask[i] = SM_SentinelUndef;
|
||||
else if (AllZero)
|
||||
Mask[i] = SM_SentinelZero;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
|
||||
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
|
||||
/// remaining input indices in case we now have a unary shuffle and adjust the
|
||||
/// Op0/Op1 inputs accordingly.
|
||||
/// Returns true if the target shuffle mask was decoded.
|
||||
static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0,
|
||||
SDValue &Op1,
|
||||
SmallVectorImpl<int> &Mask) {
|
||||
if (!setTargetShuffleZeroElements(Op, Mask))
|
||||
return false;
|
||||
|
||||
int NumElts = Mask.size();
|
||||
bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
|
||||
return 0 <= Idx && Idx < NumElts;
|
||||
});
|
||||
bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
|
||||
[NumElts](int Idx) { return NumElts <= Idx; });
|
||||
|
||||
Op0 = Op0InUse ? Op.getOperand(0) : SDValue();
|
||||
Op1 = Op1InUse ? Op.getOperand(1) : SDValue();
|
||||
IsUnary = !(Op0InUse && Op1InUse);
|
||||
|
||||
if (!IsUnary)
|
||||
return true;
|
||||
|
||||
// We're only using Op1 - commute the mask and inputs.
|
||||
if (!Op0InUse && Op1InUse) {
|
||||
for (int &M : Mask)
|
||||
if (NumElts <= M)
|
||||
M -= NumElts;
|
||||
Op0 = Op1;
|
||||
Op1 = SDValue();
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -23278,7 +23345,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
|
|||
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
|
||||
/// possible.
|
||||
///
|
||||
/// This is the leaf of the recursive combinine below. When we have found some
|
||||
/// This is the leaf of the recursive combine below. When we have found some
|
||||
/// chain of single-use x86 shuffle instructions and accumulated the combined
|
||||
/// shuffle mask represented by them, this will try to pattern match that mask
|
||||
/// into either a single instruction if there is a special purpose instruction
|
||||
|
@ -23439,13 +23506,19 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
|
|||
int NumBytes = VT.getSizeInBits() / 8;
|
||||
int Ratio = NumBytes / Mask.size();
|
||||
for (int i = 0; i < NumBytes; ++i) {
|
||||
if (Mask[i / Ratio] == SM_SentinelUndef) {
|
||||
int M = Mask[i / Ratio];
|
||||
if (M == SM_SentinelUndef) {
|
||||
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
|
||||
continue;
|
||||
}
|
||||
int M = Mask[i / Ratio] != SM_SentinelZero
|
||||
? Ratio * Mask[i / Ratio] + i % Ratio
|
||||
: 255;
|
||||
if (M == SM_SentinelZero) {
|
||||
PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
|
||||
continue;
|
||||
}
|
||||
M = Ratio * M + i % Ratio;
|
||||
// Check that we are not crossing lanes.
|
||||
if ((M / 16) != (i / 16))
|
||||
return false;
|
||||
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
|
||||
}
|
||||
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
|
||||
|
@ -23518,13 +23591,15 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
|
|||
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
|
||||
"Can only combine shuffles of the same vector register size.");
|
||||
|
||||
if (!isTargetShuffle(Op.getOpcode()))
|
||||
return false;
|
||||
SmallVector<int, 16> OpMask;
|
||||
// Extract target shuffle mask and resolve sentinels and inputs.
|
||||
bool IsUnary;
|
||||
bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
|
||||
// We only can combine unary shuffles which we can decode the mask for.
|
||||
if (!HaveMask || !IsUnary)
|
||||
SDValue Input0, Input1;
|
||||
SmallVector<int, 16> OpMask;
|
||||
if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask))
|
||||
return false;
|
||||
|
||||
// At the moment we can only combine target shuffle unary cases.
|
||||
if (!IsUnary)
|
||||
return false;
|
||||
|
||||
assert(VT.getVectorNumElements() == OpMask.size() &&
|
||||
|
@ -23570,32 +23645,25 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
|
|||
RootMaskedIdx % OpRatio);
|
||||
}
|
||||
|
||||
// See if we can recurse into the operand to combine more things.
|
||||
switch (Op.getOpcode()) {
|
||||
case X86ISD::PSHUFB:
|
||||
HasPSHUFB = true;
|
||||
case X86ISD::PSHUFD:
|
||||
case X86ISD::PSHUFHW:
|
||||
case X86ISD::PSHUFLW:
|
||||
if (Op.getOperand(0).hasOneUse() &&
|
||||
combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
|
||||
HasPSHUFB, DAG, DCI, Subtarget))
|
||||
return true;
|
||||
break;
|
||||
|
||||
case X86ISD::UNPCKL:
|
||||
case X86ISD::UNPCKH:
|
||||
assert(Op.getOperand(0) == Op.getOperand(1) &&
|
||||
"We only combine unary shuffles!");
|
||||
// We can't check for single use, we have to check that this shuffle is the
|
||||
// only user.
|
||||
if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
|
||||
combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
|
||||
HasPSHUFB, DAG, DCI, Subtarget))
|
||||
return true;
|
||||
break;
|
||||
// Handle the all undef case early.
|
||||
// TODO - should we handle zero/undef case as well? Widening the mask
|
||||
// will lose information on undef elements possibly reducing future
|
||||
// combine possibilities.
|
||||
if (std::all_of(Mask.begin(), Mask.end(),
|
||||
[](int Idx) { return Idx == SM_SentinelUndef; })) {
|
||||
DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
|
||||
return true;
|
||||
}
|
||||
|
||||
HasPSHUFB |= (Op.getOpcode() == X86ISD::PSHUFB);
|
||||
|
||||
// See if we can recurse into Input0 (if it's a target shuffle).
|
||||
if (Input0 && Op->isOnlyUserOf(Input0.getNode()) &&
|
||||
combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, HasPSHUFB,
|
||||
DAG, DCI, Subtarget))
|
||||
return true;
|
||||
|
||||
|
||||
// Minor canonicalization of the accumulated shuffle mask to make it easier
|
||||
// to match below. All this does is detect masks with sequential pairs of
|
||||
// elements, and shrink them to the half-width mask. It does this in a loop
|
||||
|
|
|
@ -143,14 +143,12 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
|
|||
;
|
||||
; SSSE3-LABEL: sext_16i8_to_8i32:
|
||||
; SSSE3: # BB#0: # %entry
|
||||
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
||||
; SSSE3-NEXT: psrad $24, %xmm2
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
|
||||
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; SSSE3-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
|
||||
; SSSE3-NEXT: psrad $24, %xmm0
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
|
||||
; SSSE3-NEXT: psrad $24, %xmm1
|
||||
; SSSE3-NEXT: movdqa %xmm2, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: sext_16i8_to_8i32:
|
||||
|
|
|
@ -1205,9 +1205,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
|
|||
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
|
||||
; SSSE3: # BB#0: # %entry
|
||||
; SSSE3-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
|
||||
; SSSE3-NEXT: pxor %xmm2, %xmm2
|
||||
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
|
||||
; SSSE3-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
|
||||
|
|
Loading…
Reference in New Issue