From 28166816b05aebb3154e5f8a28b3ef447cce8471 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Tue, 26 Nov 2019 10:25:04 +0000 Subject: [PATCH] [ARM][ReachingDefs] Remove dead code in loloops. Add some more helper functions to ReachingDefs to query the uses of a given MachineInstr and also to query whether two MachineInstrs use the same def of a register. For Arm, while tail-predicating, these helpers are used in the low-overhead loops to remove the dead code that calculates the number of loop iterations. Differential Revision: https://reviews.llvm.org/D70240 --- .../llvm/CodeGen/ReachingDefAnalysis.h | 9 + llvm/lib/CodeGen/ReachingDefAnalysis.cpp | 43 +++- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 189 +++++++++++++----- .../Thumb2/LowOverheadLoops/fast-fp-loops.ll | 6 +- .../LowOverheadLoops/mve-tail-data-types.ll | 78 ++------ .../LowOverheadLoops/vector-arith-codegen.ll | 49 +---- .../CodeGen/Thumb2/LowOverheadLoops/wlstp.mir | 21 +- 7 files changed, 214 insertions(+), 181 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index dda82b7717e7..ac001e326c57 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -114,6 +114,15 @@ public: /// reaching def instuction of PhysReg that reaches MI. int getClearance(MachineInstr *MI, MCPhysReg PhysReg); + /// Provides the uses, in the same block as MI, of register that MI defines. + /// This does not consider live-outs. + void getReachingLocalUses(MachineInstr *MI, int PhysReg, + SmallVectorImpl &Uses); + + /// Provide the number of uses, in the same block as MI, of the register that + /// MI defines. + unsigned getNumUses(MachineInstr *MI, int PhysReg); + private: /// Set up LiveRegs by merging predecessor live-out values. 
void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 55d9cb65999c..ad7f910be4c5 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -194,9 +194,19 @@ MachineInstr* ReachingDefAnalysis::getReachingMIDef(MachineInstr *MI, int PhysRe return getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg)); } +bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, + int PhysReg) { + MachineBasicBlock *ParentA = A->getParent(); + MachineBasicBlock *ParentB = B->getParent(); + if (ParentA != ParentB) + return false; + + return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg); +} + MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB, int InstId) { - assert(MBB->getNumber() < MBBReachingDefs.size() && + assert(static_cast(MBB->getNumber()) < MBBReachingDefs.size() && "Unexpected basic block number."); assert(InstId < static_cast(MBB->size()) && "Unexpected instruction id."); @@ -216,14 +226,31 @@ int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) { return InstIds[MI] - getReachingDef(MI, PhysReg); } -bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, - int PhysReg) { - MachineBasicBlock *ParentA = A->getParent(); - MachineBasicBlock *ParentB = B->getParent(); - if (ParentA != ParentB) - return false; +void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg, + SmallVectorImpl &Uses) { + MachineBasicBlock *MBB = Def->getParent(); + MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); + while (++MI != MBB->end()) { + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg) + continue; - return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg); + // If/when we find a new reaching def, we know that there's no more uses + // of 
'Def'. + if (getReachingMIDef(&*MI, PhysReg) != Def) + return; + + Uses.push_back(&*MI); + if (MO.isKill()) + return; + } + } +} + +unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) { + SmallVector Uses; + getReachingLocalUses(Def, PhysReg, Uses); + return Uses.size(); } bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) { diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 7487a43b7aa3..756d0fdb5570 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -110,12 +110,41 @@ namespace { // Check the branch targets are within range and we satisfy our // restrictions. - void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA); + void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI); bool FoundAllComponents() const { return Start && Dec && End; } + // Return the loop iteration count, or the number of elements if we're tail + // predicating. + MachineOperand &getCount() { + return IsTailPredicationLegal() ? + VCTP->getOperand(1) : Start->getOperand(0); + } + + unsigned getStartOpcode() const { + bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; + if (!IsTailPredicationLegal()) + return IsDo ? ARM::t2DLS : ARM::t2WLS; + + switch (VCTP->getOpcode()) { + default: + llvm_unreachable("unhandled vctp opcode"); + break; + case ARM::MVE_VCTP8: + return IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; + case ARM::MVE_VCTP16: + return IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; + case ARM::MVE_VCTP32: + return IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; + case ARM::MVE_VCTP64: + return IsDo ? 
ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; + } + return 0; + } + void dump() const { if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; @@ -130,6 +159,7 @@ namespace { class ARMLowOverheadLoops : public MachineFunctionPass { MachineFunction *MF = nullptr; + MachineLoopInfo *MLI = nullptr; ReachingDefAnalysis *RDA = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; @@ -236,7 +266,8 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { } void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, - ReachingDefAnalysis *RDA) { + ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI) { if (Revert) return; @@ -273,14 +304,70 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, if (!InsertPt) { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); Revert = true; + return; } else LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - LLVM_DEBUG(if (IsTailPredicationLegal()) { - dbgs() << "ARM Loops: Will use tail predication to convert:\n"; + // For tail predication, we need to provide the number of elements, instead + // of the iteration count, to the loop start instruction. The number of + // elements is provided to the vctp instruction, so we need to check that + // we can use this register at InsertPt. + if (!IsTailPredicationLegal()) + return; + + Register NumElements = VCTP->getOperand(1).getReg(); + + // If the register is defined within the loop, then we can't perform TP. + // TODO: Check whether this is just a mov of a register that would be + // available. + if (RDA->getReachingDef(VCTP, NumElements) >= 0) { + CannotTailPredicate = true; + return; + } + + // We can't perform TP if the register does not hold the same value at + // InsertPt as the liveout value. 
+ MachineBasicBlock *InsertBB = InsertPt->getParent(); + if (!RDA->hasSameReachingDef(InsertPt, &InsertBB->back(), + NumElements)) { + CannotTailPredicate = true; + return; + } + + // Especially in the case of while loops, InsertBB may not be the + // preheader, so we need to check that the register isn't redefined + // before entering the loop. + auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB, + Register NumElements) { + // NumElements is redefined in this block. + if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0) + return true; + + // Don't continue searching up through multiple predecessors. + if (MBB->pred_size() > 1) + return true; + + return false; + }; + + // First, find the block that looks like the preheader. + MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true); + if (!MBB) { + CannotTailPredicate = true; + return; + } + + // Then search backwards for a def, until we get to InsertBB. + while (MBB != InsertBB) { + CannotTailPredicate = CannotProvideElements(MBB, NumElements); + if (CannotTailPredicate) + return; + MBB = *MBB->pred_begin(); + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication to convert:\n"; for (auto *MI : VPTUsers) - dbgs() << " - " << *MI; - }); + dbgs() << " - " << *MI;); } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { @@ -291,7 +378,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { MF = &mf; LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); - auto &MLI = getAnalysis(); + MLI = &getAnalysis(); RDA = &getAnalysis(); MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); MRI = &MF->getRegInfo(); @@ -301,7 +388,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { BBUtils->adjustBBOffsetsAfter(&MF->front()); bool Changed = false; - for (auto ML : MLI) { + for (auto ML : *MLI) { if (!ML->getParentLoop()) Changed |= ProcessLoop(ML); } @@ -317,7 +404,14 @@ bool 
ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { for (auto I = ML->begin(), E = ML->end(); I != E; ++I) Changed |= ProcessLoop(*I); - LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); + LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n"; + if (auto *Preheader = ML->getLoopPreheader()) + dbgs() << " - " << Preheader->getName() << "\n"; + else if (auto *Preheader = MLI->findLoopPreheader(ML)) + dbgs() << " - " << Preheader->getName() << "\n"; + for (auto *MBB : ML->getBlocks()) + dbgs() << " - " << MBB->getName() << "\n"; + ); // Search the given block for a loop start instruction. If one isn't found, // and there's only one predecessor block, search that one too. @@ -333,28 +427,15 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { }; LowOverheadLoop LoLoop(ML); - // Search the preheader for the start intrinsic, or look through the - // predecessors of the header to find exactly one set.iterations intrinsic. + // Search the preheader for the start intrinsic. // FIXME: I don't see why we shouldn't be supporting multiple predecessors // with potentially multiple set.loop.iterations, so we need to enable this. if (auto *Preheader = ML->getLoopPreheader()) LoLoop.Start = SearchForStart(Preheader); - else { - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" - << " - Performing manual predecessor search.\n"); - MachineBasicBlock *Pred = nullptr; - for (auto *MBB : ML->getHeader()->predecessors()) { - if (!ML->contains(MBB)) { - if (Pred) { - LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); - LoLoop.Start = nullptr; - break; - } - Pred = MBB; - LoLoop.Start = SearchForStart(MBB); - } - } - } + else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) + LoLoop.Start = SearchForStart(Preheader); + else + return false; // Find the low-overhead loop components and decide whether or not to fall // back to a normal loop. 
Also look for a vctp instructions and decide @@ -412,7 +493,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { if (!LoLoop.FoundAllComponents()) return false; - LoLoop.CheckLegality(BBUtils.get(), RDA); + LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); Expand(LoLoop); return true; } @@ -504,35 +585,45 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { MachineInstr *Start = LoLoop.Start; MachineBasicBlock *MBB = InsertPt->getParent(); bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; - unsigned Opc = 0; - - if (!LoLoop.IsTailPredicationLegal()) - Opc = IsDo ? ARM::t2DLS : ARM::t2WLS; - else { - switch (LoLoop.VCTP->getOpcode()) { - case ARM::MVE_VCTP8: - Opc = IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; - break; - case ARM::MVE_VCTP16: - Opc = IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; - break; - case ARM::MVE_VCTP32: - Opc = IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; - break; - case ARM::MVE_VCTP64: - Opc = IsDo ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; - break; - } - } + unsigned Opc = LoLoop.getStartOpcode(); + MachineOperand &Count = LoLoop.getCount(); MachineInstrBuilder MIB = BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); - MIB.add(Start->getOperand(0)); + MIB.add(Count); if (!IsDo) MIB.add(Start->getOperand(1)); + // When using tail-predication, try to delete the dead code that was used to + // calculate the number of loop iterations. 
+ if (LoLoop.IsTailPredicationLegal()) { + SmallVector Killed; + SmallVector Dead; + if (auto *Def = RDA->getReachingMIDef(Start, + Start->getOperand(0).getReg())) { + Killed.push_back(Def); + + while (!Killed.empty()) { + MachineInstr *Def = Killed.back(); + Killed.pop_back(); + Dead.push_back(Def); + for (auto &MO : Def->operands()) { + if (!MO.isReg() || !MO.isKill()) + continue; + + MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg()); + if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1) + Killed.push_back(Kill); + } + } + for (auto *MI : Dead) + MI->eraseFromParent(); + } + } + + // If we're inserting at a mov lr, then remove it as it's redundant. if (InsertPt != Start) InsertPt->eraseFromParent(); Start->eraseFromParent(); diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index 02d05ef9c0f6..f285b445cf3c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -36,11 +36,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_4: @ %vector.ph -; CHECK-NEXT: adds r6, r3, #3 -; CHECK-NEXT: bic r6, r6, #3 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: add.w lr, r12, r6, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 38e688bbf628..21be95e1fcc8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -16,17 +16,19 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re ; CHECK-NEXT: movs 
r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r12, r2 ; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vldrb.u32 q2, [r2] +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q2, [r2] ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: sub.w r2, r12, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -82,13 +84,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.s32 q2, [r1] @@ -160,17 +157,19 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r12, r2 ; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vldrb.u32 q2, [r2] +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q2, [r2] ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: sub.w r2, r12, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel 
q0, q0, q1 @@ -226,13 +225,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q2, [r1] @@ -297,13 +291,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r1] @@ -392,13 +381,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB5_9 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: bic r7, r7, #3 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r5, r0, r4 @@ -607,12 +591,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w lr, r12, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic lr, lr, #3 -; CHECK-NEXT: sub.w lr, lr, #4 -; CHECK-NEXT: add.w lr, 
r4, lr, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.s32 q0, [r0] @@ -703,13 +682,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB7_9 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: bic r7, r7, #3 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r5, r0, r4 @@ -918,12 +892,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w lr, r12, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic lr, lr, #3 -; CHECK-NEXT: sub.w lr, lr, #4 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q0, [r0] @@ -1016,11 +985,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: add.w r4, r12, #3 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: subs r4, #4 -; CHECK-NEXT: add.w lr, lr, r4, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0] @@ -1217,13 +1182,8 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w r12, r3, #7 -; 
CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #7 -; CHECK-NEXT: sub.w r12, r12, #8 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.16 lr, lr +; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r4, r1, r12 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index 02bf12ce6200..fdf04db82207 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -9,13 +9,8 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmov q1, q0 @@ -82,13 +77,8 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r1, r2, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 @@ -148,13 +138,8 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r1, r2, #3 -; CHECK-NEXT: movs r3, #1 -; 
CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 @@ -213,12 +198,7 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1] @@ -272,12 +252,7 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1] @@ -331,13 +306,8 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w r12, r3, #15 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #15 -; CHECK-NEXT: sub.w r12, r12, #16 -; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.8 lr, lr +; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r4, r1, r12 @@ -396,12 +366,7 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* 
noalias nocapt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #7 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #7 -; CHECK-NEXT: sub.w r12, r12, #8 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 -; CHECK-NEXT: dlstp.16 lr, lr +; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 69f23f605013..99f6e39d3712 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -195,12 +195,7 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 - ; CHECK: renamable $r12 = t2ADDri renamable $r3, 15, 14, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_8 renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_8 renamable $r3, %bb.1 ; CHECK: tB %bb.3, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -323,12 +318,7 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r12 = t2ADDri renamable $r3, 7, 14, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 7, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 8, 14, $noreg, $noreg - ; CHECK: renamable 
$lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 27, 14, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_16 renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_16 renamable $r3, %bb.1 ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) @@ -437,13 +427,8 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_32 renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1 ; CHECK: tB %bb.4, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000)