[DAGCombine] Skip PostInc combine with later users

When deciding whether to generate a post-inc load/store, look at the
other memory nodes that use the same base address and, if any come after
the current node, then don't do the combine.
The change only seems to affect the Arm backend, which surprised me,
but it appears to fix a lot of our issues around MVE masked loads/stores
having to store a temporary address after an early post-increment on a
shared base address.

Differential Revision: https://reviews.llvm.org/D75847
This commit is contained in:
Sam Parker 2020-03-23 08:39:53 +00:00
parent 8e45eaf1da
commit 62fdb1f534
5 changed files with 48 additions and 43 deletions

View File

@ -14248,10 +14248,25 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
return false;
SmallPtrSet<const SDNode *, 32> Visited;
for (SDNode *Use : BasePtr.getNode()->uses()) {
if (Use == Ptr.getNode())
continue;
// No if there's a later user which could perform the index instead.
if (isa<MemSDNode>(Use)) {
bool IsLoad = true;
bool IsMasked = false;
SDValue OtherPtr;
if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
IsMasked, OtherPtr, TLI)) {
SmallVector<const SDNode *, 2> Worklist;
Worklist.push_back(Use);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
return false;
}
}
// If all the uses are load / store addresses, then don't do the
// transformation.
if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {

View File

@ -404,8 +404,8 @@ entry:
; CHECK-NEXT: sub sp, #508
; CHECK-NEXT: sub sp, #8
; Argument addresses computed relative to BP
; CHECK: adds r0, r6, #7
; CHECK-NEXT: adds r0, #13
; CHECK: adds r4, r6, #7
; CHECK-NEXT: adds r4, #13
; CHECK: adds r1, r6, #7
; CHECK-NEXT: adds r1, #9
; CHECK: adds r5, r6, #7

View File

@ -400,18 +400,16 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB4_1: @ %bb9
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vldrwt.u32 q1, [r0]
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %bb27
; CHECK-NEXT: pop {r7, pc}
@ -464,13 +462,12 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
; CHECK-NEXT: bic r12, r12, #3
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_1: @ %bb12
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r12], #16
; CHECK-NEXT: vldrwt.u32 q0, [r0]
; CHECK-NEXT: vpttt.i32 ne, q0, zr
; CHECK-NEXT: vcmpt.s32 le, q0, r2
; CHECK-NEXT: vctpt.32 r3
@ -478,8 +475,7 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
; CHECK-NEXT: le lr, .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %bb32
; CHECK-NEXT: pop {r7, pc}

View File

@ -8,14 +8,13 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK: .LBB0_1: @ %vector.body
; CHECK: vldrb.s16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r3], #16
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.s16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vadd.i16 q0, q1, q0
; CHECK-NEXT: vstrh.16 q0, [r0]
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@ -63,14 +62,13 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK: .LBB1_1: @ %vector.body
; CHECK: vldrb.u16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r3], #16
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vadd.i16 q0, q1, q0
; CHECK-NEXT: vstrh.16 q0, [r0]
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@ -118,14 +116,13 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK: .LBB2_1: @ %vector.body
; CHECK: vldrh.s32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.s32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vadd.i32 q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@ -173,14 +170,13 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK: .LBB3_1: @ %vector.body
; CHECK: vldrh.u32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vadd.i32 q0, q1, q0
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}

View File

@ -133,26 +133,24 @@ define void @fma_tailpred(float* noalias nocapture readonly %A, float* noalias n
; CHECK-NEXT: bic r12, r12, #3
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: subs r3, #1
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: vdup.32 q1, r3
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vdup.32 q2, r12
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vdup.32 q2, r3
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vorr q2, q2, q0
; CHECK-NEXT: vpttt.u32 cs, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
; CHECK-NEXT: vldrwt.u32 q4, [r3], #16
; CHECK-NEXT: vldrwt.u32 q4, [r2]
; CHECK-NEXT: vfma.f32 q4, q3, q2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q4, [r2]
; CHECK-NEXT: mov r2, r3
; CHECK-NEXT: vstrwt.32 q4, [r2], #16
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9}