[X86][SSE] Remove XFormVExtractWithShuffleIntoLoad to prevent legalization infinite loops (PR43971)
As detailed in PR43971/D70267, XFormVExtractWithShuffleIntoLoad can send us into an infinite loop of extract(targetshuffle(vecload)) -> extract(shuffle(vecload)) -> extract(vecload) -> extract(targetshuffle(vecload)): there are too many legalization checks at each stage for us to guarantee that the extract(shuffle(vecload)) -> scalarload fold will actually happen. For now this causes a number of minor regressions, since we no longer fold extract(shuffle(vecload)) -> scalarload before legal ops; these can be addressed in future patches by extending X86ISelLowering's combineExtractWithShuffle.
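For reference, the crux of the PR43971 reproducer (added as a test below) is an extractelement of an upper element of a vector load, which the removed combine tried to turn back into a scalar load only after legalization. A minimal IR sketch of the pattern, with names chosen purely for illustration:

    define float @extract_elt6(<8 x float>* %p) {
      ; load the whole vector, then pull out element 6
      %v = load <8 x float>, <8 x float>* %p, align 32
      %e = extractelement <8 x float> %v, i32 6
      ret float %e
    }

Ideally this lowers to a single scalar load of element 6; with the combine removed we currently emit a vector load plus shuffle in a few such cases, as the updated test checks show.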
parent 96d814a5fe
commit bbf4af3109
@@ -35126,123 +35126,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
      Op, DemandedBits, DemandedElts, DAG, Depth);
}

/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue
XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();
  unsigned NumOriginalElts = OriginalVT.getVectorNumElements();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector())
    return SDValue();

  unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
  if ((NumOriginalElts % NumCurrentElts) != 0)
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  unsigned Scale = NumOriginalElts / NumCurrentElts;
  if (Scale > 1) {
    SmallVector<int, 16> ScaledMask;
    scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
    ShuffleMask = std::move(ScaledMask);
  }
  assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");

  // Select the input vector, guarding against out of range extract vector.
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
  // won't handle it.
  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
    return SDValue();

  assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
         "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job
  SDLoc dl(N);

  // Create shuffle node taking into account the case that its a unary shuffle
  SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
                                 : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
  Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
                                 DAG.getBitcast(OriginalVT, ShuffleOps[0]),
                                 Shuffle, ShuffleMask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}

// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -36496,14 +36379,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
    }

    // TODO - Remove this once we can handle the implicit zero-extension of
    // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
    // combineHorizontalPredicateResult and combineBasicSADPattern.
    // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
    // combineBasicSADPattern.
    return SDValue();
  }

  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
    return NewOp;

  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -5,15 +5,21 @@
define float @test(<4 x float>* %A) nounwind {
; X86-LABEL: test:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: flds 12(%eax)
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps (%eax), %xmm0
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X86-NEXT: xorps %xmm1, %xmm1
; X86-NEXT: movaps %xmm1, (%eax)
; X86-NEXT: movss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
; X64-LABEL: test:
; X64: # %bb.0: # %entry
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: movaps %xmm1, (%rdi)
; X64-NEXT: retq
@@ -9,13 +9,20 @@ define i32 @t(<2 x i64>* %val) nounwind {
; X32-SSE2-LABEL: t:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl 8(%eax), %eax
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
; X32-SSE2-NEXT: movd %xmm0, %eax
; X32-SSE2-NEXT: retl
;
; X64-LABEL: t:
; X64: # %bb.0:
; X64-NEXT: movl 8(%rdi), %eax
; X64-NEXT: retq
; X64-SSSE3-LABEL: t:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
; X64-SSSE3-NEXT: movd %xmm0, %eax
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movl 8(%rdi), %eax
; X64-AVX-NEXT: retq
  %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
  %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
  %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
@@ -127,7 +134,8 @@ define float @t6(<8 x float> *%a0) {
; X32-SSE2-NEXT: pushl %eax
; X32-SSE2-NEXT: .cfi_def_cfa_offset 8
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: movaps (%eax), %xmm0
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -142,7 +150,7 @@ define float @t6(<8 x float> *%a0) {
;
; X64-SSSE3-LABEL: t6:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -165,3 +173,50 @@ define float @t6(<8 x float> *%a0) {
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

define void @PR43971(<8 x float> *%a0, float *%a1) {
; X32-SSE2-LABEL: PR43971:
; X32-SSE2: # %bb.0: # %entry
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movaps 16(%ecx), %xmm0
; X32-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpltss %xmm0, %xmm1
; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
; X32-SSE2-NEXT: movss %xmm1, (%eax)
; X32-SSE2-NEXT: retl
;
; X64-SSSE3-LABEL: PR43971:
; X64-SSSE3: # %bb.0: # %entry
; X64-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: xorps %xmm1, %xmm1
; X64-SSSE3-NEXT: cmpltss %xmm0, %xmm1
; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm1, %xmm2
; X64-SSSE3-NEXT: andnps %xmm0, %xmm1
; X64-SSSE3-NEXT: orps %xmm2, %xmm1
; X64-SSSE3-NEXT: movss %xmm1, (%rsi)
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: PR43971:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: vmovss %xmm0, (%rsi)
; X64-AVX-NEXT: retq
entry:
  %0 = load <8 x float>, <8 x float>* %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 6
  %cmp = fcmp ogt float %vecext, 0.000000e+00
  %1 = load float, float* %a1, align 4
  %cond = select i1 %cmp, float %1, float %vecext
  store float %cond, float* %a1, align 4
  ret void
}
@@ -269,12 +269,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_5123:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: extract_lane_insertps_5123:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX-NEXT: retq
  %a1 = load <4 x float>, <4 x float> *%p1
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64)
@@ -285,12 +285,13 @@ define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_6123:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: extract_lane_insertps_6123:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; AVX-NEXT: retq
  %a1 = load <4 x float>, <4 x float> *%p1
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
@@ -57,13 +57,15 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind {
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movaps (%ecx), %xmm0
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: movss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test3:
; X64: # %bb.0: # %entry
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movaps (%rsi), %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: movss %xmm0, (%rdi)
; X64-NEXT: retq
entry: