From a6fd919cb3f5e72fb07b961a567c658192782e83 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Tue, 25 Jun 2019 10:45:51 +0000 Subject: [PATCH] [ARM] DLS/LE low-overhead loop code generation Introduce three pseudo instructions to be used during DAG ISel to represent v8.1-m low-overhead loops. One maps to set_loop_iterations while loop_decrement_reg is lowered to two, so that we can separate the decrement and branching operations. The pseudo instructions are expanded pre-emission, where we can still decide whether we actually want to generate a low-overhead loop, in a new pass: ARMLowOverheadLoops. The pass currently bails, reverting to an sub, icmp and br, in the cases where a call or stack spill/restore happens between the decrement and branching instructions, or if the loop is too large. Differential Revision: https://reviews.llvm.org/D63476 llvm-svn: 364288 --- llvm/lib/Target/ARM/ARM.h | 3 +- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 30 ++ llvm/lib/Target/ARM/ARMInstrThumb2.td | 16 + llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 295 ++++++++++++++++++ llvm/lib/Target/ARM/ARMTargetMachine.cpp | 5 + llvm/lib/Target/ARM/CMakeLists.txt | 1 + llvm/test/CodeGen/ARM/O3-pipeline.ll | 7 + .../Transforms/HardwareLoops/ARM/calls.ll | 22 +- .../Transforms/HardwareLoops/ARM/cond-mov.mir | 115 +++++++ .../Transforms/HardwareLoops/ARM/massive.mir | 145 +++++++++ .../HardwareLoops/ARM/multiblock-massive.mir | 160 ++++++++++ .../HardwareLoops/ARM/revert-after-call.mir | 141 +++++++++ .../HardwareLoops/ARM/revert-after-spill.mir | 139 +++++++++ .../Transforms/HardwareLoops/ARM/simple-do.ll | 37 +++ .../HardwareLoops/ARM/size-limit.mir | 155 +++++++++ .../Transforms/HardwareLoops/ARM/structure.ll | 177 +++++++++++ .../Transforms/HardwareLoops/ARM/switch.mir | 198 ++++++++++++ 17 files changed, 1644 insertions(+), 2 deletions(-) create mode 100644 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp create mode 100644 llvm/test/Transforms/HardwareLoops/ARM/cond-mov.mir create mode 100644 
llvm/test/Transforms/HardwareLoops/ARM/massive.mir create mode 100644 llvm/test/Transforms/HardwareLoops/ARM/multiblock-massive.mir create mode 100644 llvm/test/Transforms/HardwareLoops/ARM/revert-after-call.mir create mode 100644 llvm/test/Transforms/HardwareLoops/ARM/revert-after-spill.mir create mode 100644 llvm/test/Transforms/HardwareLoops/ARM/size-limit.mir create mode 100644 llvm/test/Transforms/HardwareLoops/ARM/switch.mir diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h index f8dd2984f057..bf8ed6562fe7 100644 --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -35,7 +35,7 @@ class MachineInstr; class MCInst; class PassRegistry; - +FunctionPass *createARMLowOverheadLoopsPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -66,6 +66,7 @@ void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); +void initializeARMLowOverheadLoopsPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 492c83c2bf7a..c74459a15425 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2986,6 +2986,36 @@ void ARMDAGToDAGISel::Select(SDNode *N) { unsigned CC = (unsigned) cast(N2)->getZExtValue(); if (InFlag.getOpcode() == ARMISD::CMPZ) { + if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { + SDValue Int = InFlag.getOperand(0); + uint64_t ID = cast(Int->getOperand(1))->getZExtValue(); + + // Handle low-overhead loops. 
+ if (ID == Intrinsic::loop_decrement_reg) { + SDValue Elements = Int.getOperand(2); + SDValue Size = CurDAG->getTargetConstant( + cast(Int.getOperand(3))->getZExtValue(), dl, + MVT::i32); + + SDValue Args[] = { Elements, Size, Int.getOperand(0) }; + SDNode *LoopDec = + CurDAG->getMachineNode(ARM::t2LoopDec, dl, + CurDAG->getVTList(MVT::i32, MVT::Other), + Args); + ReplaceUses(Int.getNode(), LoopDec); + + SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain }; + SDNode *LoopEnd = + CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs); + + ReplaceUses(N, LoopEnd); + CurDAG->RemoveDeadNode(N); + CurDAG->RemoveDeadNode(InFlag.getNode()); + CurDAG->RemoveDeadNode(Int.getNode()); + return; + } + } + bool SwitchEQNEToPLMI; SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI); InFlag = N->getOperand(4); diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 8ca710cb4688..b236a79ec62e 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5135,6 +5135,7 @@ class t2LOL let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; } +let isNotDuplicable = 1 in { def t2WLS : t2LOL<(outs GPRlr:$LR), (ins rGPR:$Rn, wlslabel_u11:$label), "wls", "$LR, $Rn, $label"> { @@ -5178,6 +5179,21 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> { let Inst{10-1} = label{10-1}; } +def t2DoLoopStart : + t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br, + [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>; + +def t2LoopDec : + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), + 4, IIC_Br, []>, Sched<[WriteBr]>; + +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in +def t2LoopEnd : + t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), + 8, IIC_Br, []>, Sched<[WriteBr]>; + +} // end isNotDuplicable + class CS opcode, list pattern=[]> : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZR:$Rm, pred_noal:$fcond), AddrModeNone, NoItinerary, iname, "$Rd, $Rn, 
$Rm, $fcond", "", pattern> { diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp new file mode 100644 index 000000000000..b7f3e5bd3502 --- /dev/null +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -0,0 +1,295 @@ +//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// Finalize v8.1-m low-overhead loops by converting the associated pseudo +/// instructions into machine operations. +/// The expectation is that the loop contains three pseudo instructions: +/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop +/// form should be in the preheader, whereas the while form should be in the +/// preheader's only predecessor. TODO: Could DoLoopStart get moved into the +/// pre-preheader? +/// - t2LoopDec - placed within the loop body. +/// - t2LoopEnd - the loop latch terminator. 
+/// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" +#include "ARMBasicBlockInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-low-overhead-loops" +#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" + +namespace { + + class ARMLowOverheadLoops : public MachineFunctionPass { + const ARMBaseInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + std::unique_ptr BBUtils = nullptr; + + public: + static char ID; + + ARMLowOverheadLoops() : MachineFunctionPass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool ProcessLoop(MachineLoop *ML); + + void Expand(MachineLoop *ML, MachineInstr *Start, + MachineInstr *Dec, MachineInstr *End, bool Revert); + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + StringRef getPassName() const override { + return ARM_LOW_OVERHEAD_LOOPS_NAME; + } + }; +} + +char ARMLowOverheadLoops::ID = 0; + +INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, + false, false) + +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { + //if (!static_cast(MF.getSubtarget()).hasLOB()) + //return false; + + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); + + auto &MLI = getAnalysis(); + MRI = &MF.getRegInfo(); + TII = static_cast( + MF.getSubtarget().getInstrInfo()); + BBUtils = std::unique_ptr(new ARMBasicBlockUtils(MF)); + BBUtils->computeAllBlockSizes(); + + bool Changed = false; 
+  for (auto ML : MLI) {
+    if (!ML->getParentLoop()) // Nested loops are reached via ProcessLoop.
+      Changed |= ProcessLoop(ML);
+  }
+  return Changed;
+}
+
+bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
+  // Returns true if this loop, or any loop nested in it, was modified.
+  bool Changed = false;
+
+  // Process inner loops first.
+  for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
+    Changed |= ProcessLoop(*I);
+
+  LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
+
+  auto IsLoopStart = [](MachineInstr &MI) {
+    return MI.getOpcode() == ARM::t2DoLoopStart;
+  };
+
+  auto SearchForStart =
+    [&IsLoopStart](MachineBasicBlock *MBB) -> MachineInstr* {
+    for (auto &MI : *MBB) {
+      if (IsLoopStart(MI))
+        return &MI;
+    }
+    return nullptr;
+  };
+
+  MachineInstr *Start = nullptr;
+  MachineInstr *Dec = nullptr;
+  MachineInstr *End = nullptr;
+  bool Revert = false;
+
+  if (auto *Preheader = ML->getLoopPreheader())
+    Start = SearchForStart(Preheader);
+
+  // Find the low-overhead loop components and decide whether or not to fall
+  // back to a normal loop.
+  for (auto *MBB : reverse(ML->getBlocks())) {
+    for (auto &MI : *MBB) {
+      if (MI.getOpcode() == ARM::t2LoopDec)
+        Dec = &MI;
+      else if (MI.getOpcode() == ARM::t2LoopEnd)
+        End = &MI;
+
+      if (!Dec) // Only instructions seen after LoopDec can force a revert.
+        continue;
+
+      // TODO: Though the call will require LE to execute again, does this
+      // mean we should revert? Always executing LE hopefully should be faster
+      // than performing a sub,cmp,br or even subs,br.
+      if (MI.getDesc().isCall())
+        Revert = true;
+
+      // If we find that we load/store LR between LoopDec and LoopEnd, expect
+      // that the decremented value has been spilled to the stack. Because
+      // this value isn't actually going to be produced until the latch, by LE,
+      // we would need to generate a real sub. The value is also likely to be
+      // reloaded for use of LoopEnd - in which case we'd need to perform
+      // an add because it gets negated again by LE! The other option is to
+      // then generate the other form of LE which doesn't perform the sub.
+      if (MI.mayLoad() || MI.mayStore()) // Accumulate: never clear Revert.
+        Revert |=
+          MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
+    }
+
+    if (Dec && End && Revert)
+      break;
+  }
+
+  if (Start || Dec || End) {
+    if (!Start || !Dec || !End)
+      report_fatal_error("Failed to find all loop components");
+  } else {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
+    return Changed;
+  }
+
+  if (!End->getOperand(1).isMBB() ||
+      End->getOperand(1).getMBB() != ML->getHeader())
+    report_fatal_error("Expected LoopEnd to target Loop Header");
+
+  // The LE instruction has 12 bits for the label offset.
+  if (!BBUtils->isBBInRange(End, ML->getHeader(), 4096)) {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Too large for a low-overhead loop!\n");
+    Revert = true;
+  }
+
+  LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
+                    << " - Found Loop Dec: " << *Dec
+                    << " - Found Loop End: " << *End);
+
+  Expand(ML, Start, Dec, End, Revert);
+  return true;
+}
+
+void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
+                                 MachineInstr *Dec, MachineInstr *End,
+                                 bool Revert) {
+
+  auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
+    // The trip count should already be held in LR since the instructions
+    // within the loop can only read and write to LR. So, there should be a
+    // mov to setup the count. WLS/DLS perform this move, so find the original
+    // and delete it - inserting WLS/DLS in its place.
+    MachineBasicBlock *MBB = Start->getParent();
+    MachineInstr *InsertPt = Start;
+    for (auto &I : MRI->def_instructions(ARM::LR)) {
+      if (I.getParent() != MBB)
+        continue;
+
+      // Always execute.
+      if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
+        continue;
+
+      // Only handle move reg, if the trip count it will need moving into a reg
+      // before the setup instruction anyway.
+ if (!I.getDesc().isMoveReg() || + !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) + continue; + InsertPt = &I; + break; + } + + MachineInstrBuilder MIB = + BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(ARM::t2DLS)); + if (InsertPt != Start) + InsertPt->eraseFromParent(); + + MIB.addDef(ARM::LR); + MIB.add(Start->getOperand(0)); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted DLS: " << *MIB); + Start->eraseFromParent(); + }; + + // Combine the LoopDec and LoopEnd instructions into LE(TP). + auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, + MachineInstr *End) { + MachineBasicBlock *MBB = End->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), + TII->get(ARM::t2LEUpdate)); + MIB.addDef(ARM::LR); + MIB.add(End->getOperand(0)); + MIB.add(End->getOperand(1)); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); + + // If there is a branch after loop end, which branches to the fallthrough + // block, remove the branch. + MachineBasicBlock *Latch = End->getParent(); + MachineInstr *Terminator = &Latch->instr_back(); + if (End != Terminator) { + MachineBasicBlock *Exit = ML->getExitBlock(); + if (Latch->isLayoutSuccessor(Exit)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop exit branch: " + << *Terminator); + Terminator->eraseFromParent(); + } + } + End->eraseFromParent(); + Dec->eraseFromParent(); + }; + + // Generate a subs, or sub and cmp, and a branch instead of an LE. + // TODO: Check flags so that we can possibly generate a subs. 
+ auto ExpandBranch = [this](MachineInstr *Dec, MachineInstr *End) { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub, cmp, br.\n"); + // Create sub + MachineBasicBlock *MBB = Dec->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, Dec, Dec->getDebugLoc(), + TII->get(ARM::t2SUBri)); + MIB.addDef(ARM::LR); + MIB.add(Dec->getOperand(1)); + MIB.add(Dec->getOperand(2)); + MIB.addImm(ARMCC::AL); + MIB.addReg(0); + MIB.addReg(0); + + // Create cmp + MBB = End->getParent(); + MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(ARM::t2CMPri)); + MIB.addReg(ARM::LR); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + + // Create bne + MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(ARM::t2Bcc)); + MIB.add(End->getOperand(1)); // branch target + MIB.addImm(ARMCC::NE); // condition code + End->eraseFromParent(); + Dec->eraseFromParent(); + }; + + if (Revert) { + Start->eraseFromParent(); + ExpandBranch(Dec, End); + } else { + ExpandLoopStart(ML, Start); + ExpandLoopEnd(ML, Dec, End); + } +} + +FunctionPass *llvm::createARMLowOverheadLoopsPass() { + return new ARMLowOverheadLoops(); +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index bd8c5d0b1b66..7f0aae1739b3 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -96,6 +96,7 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMExpandPseudoPass(Registry); initializeThumb2SizeReducePass(Registry); initializeMVEVPTBlockPass(Registry); + initializeARMLowOverheadLoopsPass(Registry); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -446,6 +447,9 @@ bool ARMPassConfig::addPreISel() { MergeExternalByDefault)); } + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createHardwareLoopsPass()); + return false; } @@ -526,4 +530,5 @@ void ARMPassConfig::addPreEmitPass() { addPass(createARMOptimizeBarriersPass()); addPass(createARMConstantIslandPass()); + addPass(createARMLowOverheadLoopsPass()); } diff --git 
a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt index d12728b8b120..586b6ea45358 100644 --- a/llvm/lib/Target/ARM/CMakeLists.txt +++ b/llvm/lib/Target/ARM/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(ARMCodeGen ARMLegalizerInfo.cpp ARMParallelDSP.cpp ARMLoadStoreOptimizer.cpp + ARMLowOverheadLoops.cpp ARMMCInstLower.cpp ARMMachineFunctionInfo.cpp ARMMacroFusion.cpp diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index e3eab216bd5a..f81f8ce51d18 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -49,6 +49,10 @@ ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation ; CHECK-NEXT: Merge internal globals +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Hardware Loop Insertion ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier @@ -138,6 +142,9 @@ ; CHECK-NEXT: Unpack machine instruction bundles ; CHECK-NEXT: optimise barriers pass ; CHECK-NEXT: ARM constant island placement and branch shortening pass +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: ARM Low Overhead Loops pass ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis diff --git a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll index f2fb8f4eda76..0e1d859d88dc 100644 --- a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll @@ -3,7 +3,7 @@ ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi 
-mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP - +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+lob,+mve.fp -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC ; CHECK-LABEL: skip_call ; CHECK-NOT: call void @llvm.set.loop.iterations @@ -41,6 +41,15 @@ while.end: ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %loop, label %exit +; CHECK-LLC-LABEL: test_target_specific: +; CHECK-LLC: mov.w lr, #50 +; CHECK-LLC: dls lr, lr +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]: +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b . +; CHECK-LLC: @ %exit + define i32 @test_target_specific(i32* %a, i32* %b) { entry: br label %loop @@ -86,6 +95,17 @@ exit: ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100) ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100) + +; CHECK-LLC-LABEL: test_fabs: +; CHECK-LLC: mov.w lr, #100 +; CHECK-LLC: dls lr, lr +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]: +; CHECK-LLC-NOT: bl +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b . 
+; CHECK-LLC: @ %exit + define float @test_fabs(float* %a) { entry: br label %loop diff --git a/llvm/test/Transforms/HardwareLoops/ARM/cond-mov.mir b/llvm/test/Transforms/HardwareLoops/ARM/cond-mov.mir new file mode 100644 index 000000000000..e77c19f938c7 --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/cond-mov.mir @@ -0,0 +1,115 @@ +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +# CHECK: $lr = tMOVr $r0, 13, $noreg +# CHECK: $lr = t2DLS killed $r0 +# CHECK: $lr = t2LEUpdate renamable $lr, %bb.1 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main" + + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { + entry: + %scevgep = getelementptr i32, i32* %q, i32 -1 + %scevgep3 = getelementptr i32, i32* %p, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %while.body + + while.body: + %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] + %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] + %0 = phi i32 [ %n, %entry ], [ %2, %while.body ] + %scevgep2 = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep6 = getelementptr i32, i32* %lsr.iv4, i32 1 + %1 = load i32, i32* %scevgep2, align 4 + store i32 %1, i32* %scevgep6, align 4 + %scevgep1 = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1 + %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %3 = icmp ne i32 %2, 0 + br i1 %3, label %while.body, label %while.end + + while.end: + ret i32 0 + } + + declare void @llvm.set.loop.iterations.i32(i32) #0 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 + declare void @llvm.stackprotector(i8*, i8**) #1 + + attributes #0 = { noduplicate nounwind } + attributes #1 = { nounwind } + +... 
+--- +name: do_copy +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $lr = tMOVr $r0, 13, $noreg + t2DoLoopStart killed $r0 + renamable $r0 = t2SUBri killed renamable $r1, 4, 14, $noreg, $noreg + renamable $r1 = t2SUBri killed renamable $r2, 4, 14, $noreg, $noreg + + bb.1.while.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1 + + renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep2) + 
early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep6) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1 + t2B %bb.2, 14, $noreg + + bb.2.while.end: + $r0 = t2MOVi 0, 14, $noreg, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... diff --git a/llvm/test/Transforms/HardwareLoops/ARM/massive.mir b/llvm/test/Transforms/HardwareLoops/ARM/massive.mir new file mode 100644 index 000000000000..aa9cbd33ea3a --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/massive.mir @@ -0,0 +1,145 @@ +# RUN: llc -mtriple=armv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +# CHECK: for.body: +# CHECK-NOT: t2DLS +# CHECK-NOT: t2LEUpdate + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-unknown" + + ; Function Attrs: norecurse nounwind + define dso_local arm_aapcscc void @massive(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) local_unnamed_addr { + entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %scevgep = getelementptr i32, i32* %a, i32 -1 + %scevgep4 = getelementptr i32, i32* %c, i32 -1 + %scevgep8 = getelementptr i32, i32* %b, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %N) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body, %for.body.preheader + %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] + %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] + %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] + %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ] + %size = call i32 @llvm.arm.space(i32 4096, i32 undef) + %scevgep11 = 
getelementptr i32, i32* %lsr.iv9, i32 1 + %1 = load i32, i32* %scevgep11, align 4, !tbaa !3 + %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1 + %2 = load i32, i32* %scevgep7, align 4, !tbaa !3 + %mul = mul nsw i32 %2, %1 + %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1 + store i32 %mul, i32* %scevgep3, align 4, !tbaa !3 + %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1 + %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1 + %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1 + %3 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %for.body, label %for.cond.cleanup + } + + declare i32 @llvm.arm.space(i32, i32) #1 + declare void @llvm.set.loop.iterations.i32(i32) #2 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #2 + + attributes #1 = { nounwind } + attributes #2 = { noduplicate nounwind } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 9.0.0 (http://llvm.org/git/clang.git a9c7c0fc5d468f3d18a5c6beb697ab0d5be2ff4c) (http://llvm.org/git/llvm.git f34bff0c141a04a5182d57e2cfb1e4bc582c81b0)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} + +... 
+--- +name: massive +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: false +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + + frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + tCMPi8 $r3, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 8, implicit-def $itstate + tPOP_RET 0, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + renamable 
$r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg + $lr = tMOVr $r3, 14, $noreg + t2DoLoopStart killed $r3 + + bb.1.for.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + dead renamable $r3 = SPACE 4096, undef renamable $r0 + renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep11, !tbaa !3) + renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14, $noreg :: (load 4 from %ir.scevgep7, !tbaa !3) + renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14, $noreg + early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep3, !tbaa !3) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1 + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... diff --git a/llvm/test/Transforms/HardwareLoops/ARM/multiblock-massive.mir b/llvm/test/Transforms/HardwareLoops/ARM/multiblock-massive.mir new file mode 100644 index 000000000000..f40321b43b50 --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/multiblock-massive.mir @@ -0,0 +1,160 @@ +# RUN: llc -mtriple=armv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +# CHECK: for.body: +# CHECK-NOT: t2DLS +# CHECK-NOT: t2LEUpdate + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-unknown" + + define dso_local arm_aapcscc void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) local_unnamed_addr { + entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader + + for.body.preheader: ; preds = %entry + br label %for.body + + for.cond.cleanup: ; preds = %for.end, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.end + %lsr.iv4 = phi i32* [ %b, %for.body.preheader ], [ %scevgep5, %for.end ] + 
%lsr.iv2 = phi i32* [ %c, %for.body.preheader ], [ %scevgep3, %for.end ] + %lsr.iv1 = phi i32* [ %a, %for.body.preheader ], [ %scevgep, %for.end ] + %lsr.iv = phi i32 [ %N, %for.body.preheader ], [ %lsr.iv.next, %for.end ] + %size = call i32 @llvm.arm.space(i32 3072, i32 undef) + %0 = load i32, i32* %lsr.iv4, align 4, !tbaa !3 + %1 = load i32, i32* %lsr.iv2, align 4, !tbaa !3 + %mul = mul nsw i32 %1, %0 + store i32 %mul, i32* %lsr.iv1, align 4, !tbaa !3 + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %middle.block, label %for.end + + middle.block: ; preds = %for.body + %div = udiv i32 %1, %0 + store i32 %div, i32* %lsr.iv1, align 4, !tbaa !3 + %size.1 = call i32 @llvm.arm.space(i32 1024, i32 undef) + br label %for.end + + for.end: ; preds = %middle.block, %for.body + %lsr.iv.next = add i32 %lsr.iv, -1 + %scevgep = getelementptr i32, i32* %lsr.iv1, i32 1 + %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 1 + %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1 + %exitcond = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + } + + declare i32 @llvm.arm.space(i32, i32) #1 + attributes #1 = { nounwind } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 9.0.0 (http://llvm.org/git/clang.git a9c7c0fc5d468f3d18a5c6beb697ab0d5be2ff4c) (http://llvm.org/git/llvm.git f34bff0c141a04a5182d57e2cfb1e4bc582c81b0)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} + +... 
+--- +name: size_limit +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: false +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: -8 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x30000000), %bb.3(0x50000000) + + frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup 
CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r6, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa $r7, 8 + tCBNZ $r3, %bb.3 + + bb.1.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc + + bb.2.for.end: + successors: %bb.1(0x04000000), %bb.3(0x7c000000) + + renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg + renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 4, 14, $noreg + renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg + renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 1, 14, $noreg + tBcc %bb.1, 0, killed $cpsr + + bb.3.for.body: + successors: %bb.4(0x50000000), %bb.2(0x30000000) + + dead renamable $r12 = SPACE 3072, undef renamable $r0 + renamable $r12 = t2LDRi12 renamable $r1, 0, 14, $noreg :: (load 4 from %ir.lsr.iv4, !tbaa !3) + renamable $lr = t2LDRi12 renamable $r2, 0, 14, $noreg :: (load 4 from %ir.lsr.iv2, !tbaa !3) + t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr + renamable $r4 = nsw t2MUL renamable $lr, renamable $r12, 14, $noreg + tSTRi killed renamable $r4, renamable $r0, 0, 14, $noreg :: (store 4 into %ir.lsr.iv1, !tbaa !3) + t2Bcc %bb.2, 0, killed $cpsr + + bb.4.middle.block: + successors: %bb.2(0x80000000) + + renamable $r4 = t2UDIV killed renamable $lr, killed renamable $r12, 14, $noreg + tSTRi killed renamable $r4, renamable $r0, 0, 14, $noreg :: (store 4 into %ir.lsr.iv1, !tbaa !3) + dead renamable $r4 = SPACE 1024, undef renamable $r0 + t2B %bb.2, 14, $noreg + +... 
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/revert-after-call.mir b/llvm/test/Transforms/HardwareLoops/ARM/revert-after-call.mir new file mode 100644 index 000000000000..f334a2d3c5b1 --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/revert-after-call.mir @@ -0,0 +1,141 @@ +# RUN: llc -mtriple=thumbv8.1m.main %s -o - | FileCheck %s + +# CHECK: .LBB0_2: +# CHECK: sub.w lr, lr, #1 +# CHECK: mov [[TMP:r[0-9]+]], lr +# CHECK: bl bar +# CHECK: mov lr, [[TMP]] +# CHECK: cmp.w lr, #0 +# CHECK: bne{{.*}} .LBB0_2 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-none-eabi" + + define i32 @skip_call(i32 %n) #0 { + entry: + %cmp6 = icmp eq i32 %n, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + + while.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %while.body + + while.body: ; preds = %while.body, %while.body.preheader + %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ] + %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ] + %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() + %add = add nsw i32 %call, %res.07 + %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %2 = icmp ne i32 %1, 0 + br i1 %2, label %while.body, label %while.end + + while.end: ; preds = %while.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.body ] + ret i32 %res.0.lcssa + } + + declare i32 @bar(...) local_unnamed_addr #0 + declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + declare void @llvm.stackprotector(i8*, i8**) #2 + + attributes #0 = { "target-features"="+mve.fp" } + attributes #1 = { noduplicate nounwind } + attributes #2 = { nounwind } + +... 
+--- +name: skip_call +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x30000000), %bb.3(0x50000000) + liveins: $r0, $r4, $r5, $r7, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup 
CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r5, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + t2CMPri $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 0, killed $cpsr + + bb.3.while.body.preheader: + successors: %bb.4(0x80000000) + liveins: $r0 + + $lr = tMOVr $r0, 14, $noreg + renamable $r4 = t2MOVi 0, 14, $noreg, $noreg + t2DoLoopStart killed $r0 + + bb.4.while.body: + successors: %bb.4(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r4 + + renamable $lr = t2LoopDec killed renamable $lr, 1 + $r5 = tMOVr killed $lr, 14, $noreg + tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0 + $lr = tMOVr killed $r5, 14, $noreg + renamable $r4 = nsw t2ADDrr killed renamable $r0, killed renamable $r4, 14, $noreg, $noreg + t2LoopEnd renamable $lr, %bb.4 + t2B %bb.2, 14, $noreg + + bb.2.while.end: + liveins: $r4 + + $r0 = tMOVr killed $r4, 14, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + + bb.1: + renamable $r4 = t2MOVi 0, 14, $noreg, $noreg + $r0 = tMOVr killed $r4, 14, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + +... 
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/revert-after-spill.mir b/llvm/test/Transforms/HardwareLoops/ARM/revert-after-spill.mir new file mode 100644 index 000000000000..63310f2b4c56 --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/revert-after-spill.mir @@ -0,0 +1,139 @@ +# RUN: llc -mtriple=thumbv8.1m.main %s -o - | FileCheck %s + +# CHECK: .LBB0_2: +# CHECK: sub.w lr, lr, #1 +# CHECK: str.w lr, [sp, #12] +# CHECK: ldr.w lr, [sp, #12] +# CHECK: cmp.w lr, #0 +# CHECK: bne{{.*}} .LBB0_2 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-none-eabi" + + define i32 @skip_spill(i32 %n) #0 { + entry: + %cmp6 = icmp eq i32 %n, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + + while.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %while.body + + while.body: ; preds = %while.body, %while.body.preheader + %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ] + %0 = phi i32 [ %n, %while.body.preheader ], [ %1, %while.body ] + %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() + %add = add nsw i32 %call, %res.07 + %1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %2 = icmp ne i32 %1, 0 + br i1 %2, label %while.body, label %while.end + + while.end: ; preds = %while.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.body ] + ret i32 %res.0.lcssa + } + + declare i32 @bar(...) local_unnamed_addr #0 + declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + declare void @llvm.stackprotector(i8*, i8**) #2 + + attributes #0 = { "target-features"="+mve.fp" } + attributes #1 = { noduplicate nounwind } + attributes #2 = { nounwind } + +... 
+--- +name: skip_spill +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x30000000), %bb.3(0x50000000) + liveins: $r0, $r4, $r5, $r7, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup 
CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r5, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + t2CMPri $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 0, killed $cpsr + + bb.3.while.body.preheader: + successors: %bb.4(0x80000000) + liveins: $r0 + + $lr = tMOVr $r0, 14, $noreg + renamable $r4 = t2MOVi 0, 14, $noreg, $noreg + t2DoLoopStart killed $r0 + + bb.4.while.body: + successors: %bb.4(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r4 + + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2STRi12 $lr, %stack.0, 0, 14, $noreg :: (store 4) + $lr = t2LDRi12 %stack.0, 0, 14, $noreg :: (load 4) + renamable $r4 = nsw t2ADDrr renamable $lr, killed renamable $r4, 14, $noreg, $noreg + t2LoopEnd renamable $lr, %bb.4 + t2B %bb.2, 14, $noreg + + bb.2.while.end: + liveins: $r4 + + $r0 = tMOVr killed $r4, 14, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + + bb.1: + renamable $r4 = t2MOVi 0, 14, $noreg, $noreg + $r0 = tMOVr killed $r4, 14, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $r5, def $r7, def $pc, implicit killed $r0 + +... 
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll index 88d650b24540..ca18d8921979 100644 --- a/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/simple-do.ll @@ -1,6 +1,7 @@ ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC ; DISABLED-NOT: llvm.set.loop.iterations ; DISABLED-NOT: llvm.loop.decrement @@ -15,6 +16,15 @@ ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end + +; CHECK-LLC-LABEL:do_copy: +; CHECK-LLC-NOT: mov lr, r0 +; CHECK-LLC: dls lr, r0 +; CHECK-LLC-NOT: mov lr, r0 +; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]: +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9._]+]] +; CHECK-LLC: @ %while.end define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { entry: br label %while.body @@ -45,6 +55,14 @@ while.end: ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit +; CHECK-LLC-LABEL:do_inc1: +; CHECK-LLC: dls lr, +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]: +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9_]+]] +; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9_]+]]: + define i32 @do_inc1(i32 %n) { entry: %cmp7 = icmp eq i32 %n, 0 @@ -84,6 +102,16 @@ while.end: ; CHECK: [[LOOP_DEC]] = 
call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit + +; CHECK-LLC: do_inc2: +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: dls lr, +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9._]+]]: +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9._]+]] +; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9_]+]]: + define i32 @do_inc2(i32 %n) { entry: %cmp7 = icmp sgt i32 %n, 0 @@ -127,6 +155,15 @@ while.end: ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit + +; CHECK-LLC: do_dec2 +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: dls lr, +; CHECK-LLC-NOT: mov lr, +; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]: +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b . +; CHECK-LLC: @ %while.end define i32 @do_dec2(i32 %n) { entry: %cmp6 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/HardwareLoops/ARM/size-limit.mir b/llvm/test/Transforms/HardwareLoops/ARM/size-limit.mir new file mode 100644 index 000000000000..1739bda2403e --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/size-limit.mir @@ -0,0 +1,155 @@ +# RUN: llc -mtriple=armv8.1m.main -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +# CHECK: entry: +# CHECK: $lr = t2DLS +# CHECK: for.body: +# CHECK: $lr = t2LEUpdate renamable $lr + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-unknown" + + ; Function Attrs: norecurse nounwind + define dso_local arm_aapcscc void @size_limit(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader + + for.body.preheader: ; preds = %entry 
+ %scevgep = getelementptr i32, i32* %a, i32 -1 + %scevgep4 = getelementptr i32, i32* %c, i32 -1 + %scevgep8 = getelementptr i32, i32* %b, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %N) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body, %for.body.preheader + %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ] + %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] + %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] + %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.body ] + %size = call i32 @llvm.arm.space(i32 4072, i32 undef) + %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1 + %1 = load i32, i32* %scevgep11, align 4, !tbaa !3 + %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1 + %2 = load i32, i32* %scevgep7, align 4, !tbaa !3 + %mul = mul nsw i32 %2, %1 + %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1 + store i32 %mul, i32* %scevgep3, align 4, !tbaa !3 + %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1 + %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1 + %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1 + %3 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %for.body, label %for.cond.cleanup + } + + ; Function Attrs: nounwind + declare i32 @llvm.arm.space(i32, i32) #1 + + ; Function Attrs: noduplicate nounwind + declare void @llvm.set.loop.iterations.i32(i32) #2 + + ; Function Attrs: noduplicate nounwind + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #2 + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #1 + + attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } + attributes #1 = { nounwind } + attributes #2 = { noduplicate nounwind } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 9.0.0 (http://llvm.org/git/clang.git a9c7c0fc5d468f3d18a5c6beb697ab0d5be2ff4c) (http://llvm.org/git/llvm.git f34bff0c141a04a5182d57e2cfb1e4bc582c81b0)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} + +... 
+--- +name: size_limit +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: false +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + + frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + tCMPi8 $r3, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 8, implicit-def $itstate + tPOP_RET 0, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg + renamable 
$r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14, $noreg + $lr = tMOVr $r3, 14, $noreg + t2DoLoopStart killed $r3 + + bb.1.for.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + dead renamable $r3 = SPACE 4072, undef renamable $r0 + renamable $r12, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep11, !tbaa !3) + renamable $r3, renamable $r2 = t2LDR_PRE killed renamable $r2, 4, 14, $noreg :: (load 4 from %ir.scevgep7, !tbaa !3) + renamable $r3 = nsw t2MUL killed renamable $r3, killed renamable $r12, 14, $noreg + early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep3, !tbaa !3) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1 + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... diff --git a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll index fa3dbc0d929c..03c9e4071cf8 100644 --- a/llvm/test/Transforms/HardwareLoops/ARM/structure.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/structure.ll @@ -1,4 +1,6 @@ ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC +; RUN: opt -mtriple=thumbv8.1m.main -loop-unroll -unroll-remainder=false -S < %s | llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false | FileCheck %s --check-prefix=CHECK-UNROLL ; CHECK-LABEL: early_exit ; CHECK-NOT: llvm.set.loop.iterations @@ -43,6 +45,16 @@ do.end: ; CHECK-NOT: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1) ; CHECK-NOT: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7 + +; CHECK-LLC: nested: +; CHECK-LLC-NOT: mov lr, r1 +; CHECK-LLC: dls lr, r1 +; CHECK-LLC-NOT: mov lr, r1 +; CHECK-LLC: 
[[LOOP_HEADER:\.LBB[0-9._]+]]: +; CHECK-LLC: le lr, [[LOOP_HEADER]] +; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9._]+]] +; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9._]+]]: + define void @nested(i32* nocapture %A, i32 %N) { entry: %cmp20 = icmp eq i32 %N, 0 @@ -210,6 +222,171 @@ exit: ret void } +; CHECK-LABEL: search +; CHECK: for.body.preheader: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: for.inc: +; CHECK: [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32 +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %for.body, label %for.cond.cleanup +define i32 @search(i8* nocapture readonly %c, i32 %N) { +entry: + %cmp11 = icmp eq i32 %N, 0 + br i1 %cmp11, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + %found.0.lcssa = phi i32 [ 0, %entry ], [ %found.1, %for.inc ] + %spaces.0.lcssa = phi i32 [ 0, %entry ], [ %spaces.1, %for.inc ] + %sub = sub nsw i32 %found.0.lcssa, %spaces.0.lcssa + ret i32 %sub + +for.body: + %i.014 = phi i32 [ %inc3, %for.inc ], [ 0, %entry ] + %spaces.013 = phi i32 [ %spaces.1, %for.inc ], [ 0, %entry ] + %found.012 = phi i32 [ %found.1, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %c, i32 %i.014 + %0 = load i8, i8* %arrayidx, align 1 + switch i8 %0, label %for.inc [ + i8 108, label %sw.bb + i8 111, label %sw.bb + i8 112, label %sw.bb + i8 32, label %sw.bb1 + ] + +sw.bb: ; preds = %for.body, %for.body, %for.body + %inc = add nsw i32 %found.012, 1 + br label %for.inc + +sw.bb1: ; preds = %for.body + %inc2 = add nsw i32 %spaces.013, 1 + br label %for.inc + +for.inc: ; preds = %sw.bb, %sw.bb1, %for.body + %found.1 = phi i32 [ %found.012, %for.body ], [ %found.012, %sw.bb1 ], [ %inc, %sw.bb ] + %spaces.1 = phi i32 [ %spaces.013, %for.body ], [ %inc2, %sw.bb1 ], [ %spaces.013, %sw.bb ] + %inc3 = add nuw i32 %i.014, 1 + %exitcond = icmp eq i32 %inc3, %N + br i1 %exitcond, label %for.cond.cleanup, label 
%for.body +} + +; CHECK-LABEL: unroll_inc_int +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32( + +; TODO: We should be able to support the unrolled loop body. +; CHECK-UNROLL-LABEL: unroll_inc_int: +; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader +; CHECK-UNROLL-NOT: dls +; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body +; CHECK-UNROLL-NOT: le lr, [[LOOP]] +; CHECK-UNROLL: bne [[LOOP]] +; CHECK-UNROLL: %for.body.epil.preheader +; CHECK-UNROLL: dls +; CHECK-UNROLL: %for.body.epil +; CHECK-UNROLL: le + +define void @unroll_inc_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { +entry: + %cmp8 = icmp sgt i32 %N, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09 + store i32 %mul, i32* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: unroll_inc_unsigned +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32( + +; CHECK-LLC-LABEL: unroll_inc_unsigned: +; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]] +; CHECK-LLC: le lr + +; TODO: We should be able to support the unrolled loop body. 
+; CHECK-UNROLL-LABEL: unroll_inc_unsigned: +; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader +; CHECK-UNROLL-NOT: dls +; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body +; CHECK-UNROLL-NOT: le lr, [[LOOP]] +; CHECK-UNROLL: bne [[LOOP]] +; CHECK-UNROLL: %for.body.epil.preheader +; CHECK-UNROLL: dls +; CHECK-UNROLL: %for.body.epil +; CHECK-UNROLL: le +define void @unroll_inc_unsigned(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09 + store i32 %mul, i32* %arrayidx2, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: unroll_dec_int +; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32( + +; TODO: An unnecessary register is being held to hold COUNT, lr should just +; be used instead. 
+; CHECK-LLC-LABEL: unroll_dec_int: +; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]] +; CHECK-LLC: subs [[COUNT]], #1 +; CHECK-LLC: le lr + +; CHECK-UNROLL-LABEL: unroll_dec_int +; CHECK-UNROLL: dls lr +; CHECK-UNROLL: le lr +; CHECK-UNROLL: dls lr +; CHECK-UNROLL: le lr +define void @unroll_dec_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { +entry: + %cmp8 = icmp sgt i32 %N, 0 + br i1 %cmp8, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %i.09 = phi i32 [ %dec, %for.body ], [ %N, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09 + %1 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %1, %0 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09 + store i32 %mul, i32* %arrayidx2, align 4 + %dec = add nsw i32 %i.09, -1 + %cmp = icmp sgt i32 %dec, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} declare void @llvm.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 diff --git a/llvm/test/Transforms/HardwareLoops/ARM/switch.mir b/llvm/test/Transforms/HardwareLoops/ARM/switch.mir new file mode 100644 index 000000000000..052c79c0f487 --- /dev/null +++ b/llvm/test/Transforms/HardwareLoops/ARM/switch.mir @@ -0,0 +1,198 @@ +# RUN: llc -mtriple=thumbv8.1m.main %s -run-pass=arm-low-overhead-loops -o - | FileCheck %s +# CHECK: bb.1.for.body.preheader: +# CHECK: $lr = t2DLS +# CHECK-NOT: t2LoopDec +# CHECK: bb.6.for.inc: +# CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-unknown" + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcscc i32 @search(i8* nocapture readonly %c, i32 %N) local_unnamed_addr #0 { + entry: + %cmp11 = icmp eq i32 %N, 0 + br i1 %cmp11, label %for.cond.cleanup, 
label %for.body.preheader + + for.body.preheader: + call void @llvm.set.loop.iterations.i32(i32 %N) + br label %for.body + + for.cond.cleanup: + %found.0.lcssa = phi i32 [ 0, %entry ], [ %found.1, %for.inc ] + %spaces.0.lcssa = phi i32 [ 0, %entry ], [ %spaces.1, %for.inc ] + %sub = sub nsw i32 %found.0.lcssa, %spaces.0.lcssa + ret i32 %sub + + for.body: + %lsr.iv1 = phi i8* [ %c, %for.body.preheader ], [ %scevgep, %for.inc ] + %spaces.013 = phi i32 [ %spaces.1, %for.inc ], [ 0, %for.body.preheader ] + %found.012 = phi i32 [ %found.1, %for.inc ], [ 0, %for.body.preheader ] + %0 = phi i32 [ %N, %for.body.preheader ], [ %3, %for.inc ] + %1 = load i8, i8* %lsr.iv1, align 1 + %2 = zext i8 %1 to i32 + switch i32 %2, label %for.inc [ + i32 108, label %sw.bb + i32 111, label %sw.bb + i32 112, label %sw.bb + i32 32, label %sw.bb1 + ] + + sw.bb: + %inc = add nsw i32 %found.012, 1 + br label %for.inc + + sw.bb1: + %inc2 = add nsw i32 %spaces.013, 1 + br label %for.inc + + for.inc: + %found.1 = phi i32 [ %found.012, %for.body ], [ %found.012, %sw.bb1 ], [ %inc, %sw.bb ] + %spaces.1 = phi i32 [ %spaces.013, %for.body ], [ %inc2, %sw.bb1 ], [ %spaces.013, %sw.bb ] + %scevgep = getelementptr i8, i8* %lsr.iv1, i32 1 + %3 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %for.body, label %for.cond.cleanup + } + + declare void @llvm.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + declare void @llvm.stackprotector(i8*, i8**) #2 + + attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" 
"target-features"="+armv8.1-m.main,+hwdiv,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } + attributes #1 = { noduplicate nounwind } + attributes #2 = { nounwind } + +... +--- +name: search +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: -8 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: 
'$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x30000000), %bb.3(0x50000000) + liveins: $r0, $r1, $r4, $r6, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r4, killed $r6, $r7, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + frame-setup CFI_INSTRUCTION offset $r6, -12 + frame-setup CFI_INSTRUCTION offset $r4, -16 + $r7 = frame-setup t2ADDri $sp, 8, 14, $noreg, $noreg + frame-setup CFI_INSTRUCTION def_cfa $r7, 8 + t2CMPri $r1, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 0, killed $cpsr + + bb.3.for.body.preheader: + successors: %bb.4(0x80000000) + liveins: $r0, $r1 + + $lr = tMOVr $r1, 14, $noreg + t2DoLoopStart killed $r1 + renamable $r1 = t2MOVi 0, 14, $noreg, $noreg + renamable $r12 = t2MOVi 1, 14, $noreg, $noreg + renamable $r2 = t2MOVi 0, 14, $noreg, $noreg + + bb.4.for.body: + successors: %bb.5(0x26666665), %bb.6(0x5999999b) + liveins: $lr, $r0, $r1, $r2, $r12 + + renamable $r3 = t2LDRBi12 renamable $r0, 0, 14, $noreg :: (load 1 from %ir.lsr.iv1) + renamable $r4 = t2SUBri renamable $r3, 108, 14, $noreg, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2CMPri renamable $r4, 4, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.5, 8, killed $cpsr + + bb.6.for.body: + successors: %bb.7(0x6db6db6e), %bb.5(0x12492492) + liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r12 + + renamable $r4 = t2LSLrr renamable $r12, killed renamable $r4, 14, $noreg, $noreg + t2TSTri killed renamable $r4, 25, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.5, 0, killed $cpsr + + bb.7.sw.bb: + successors: %bb.8(0x80000000) + liveins: $lr, $r0, $r1, $r2, $r12 + + renamable $r2 = nsw t2ADDri killed renamable $r2, 1, 14, $noreg, $noreg + t2B %bb.8, 14, $noreg + + bb.5.for.body: + successors: %bb.8(0x80000000) + liveins: $lr, 
$r0, $r1, $r2, $r3, $r12 + + t2CMPri killed renamable $r3, 32, 14, $noreg, implicit-def $cpsr + BUNDLE implicit-def dead $itstate, implicit-def $r1, implicit killed $r1, implicit killed $cpsr { + t2IT 0, 8, implicit-def $itstate + renamable $r1 = nsw t2ADDri killed renamable $r1, 1, 0, killed $cpsr, $noreg, implicit $r1, implicit internal killed $itstate + } + + bb.8.for.inc: + successors: %bb.4(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r12 + + renamable $r0 = t2ADDri killed renamable $r0, 1, 14, $noreg, $noreg + t2LoopEnd renamable $lr, %bb.4 + t2B %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + liveins: $r1, $r2 + + renamable $r0 = nsw t2SUBrr killed renamable $r2, killed renamable $r1, 14, $noreg, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $r6, def $r7, def $pc, implicit killed $r0 + + bb.1: + renamable $r2 = t2MOVi 0, 14, $noreg, $noreg + renamable $r1 = t2MOVi 0, 14, $noreg, $noreg + renamable $r0 = nsw t2SUBrr killed renamable $r2, killed renamable $r1, 14, $noreg, $noreg + $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $r6, def $r7, def $pc, implicit killed $r0 + +...