[ARM] Updates to arm-block-placement pass

The patch makes two updates to the arm-block-placement pass:
- Handles arbitrarily nested loops
- Extends the search (for t2WhileLoopStartLR) to the predecessor of the
  preheader.

Differential Revision: https://reviews.llvm.org/D99649
This commit is contained in:
Malhar Jajoo 2021-04-12 14:46:23 +01:00 committed by David Green
parent 489cdedd11
commit 58f3201a20
4 changed files with 392 additions and 166 deletions

View File

@ -38,6 +38,8 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After);
bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other);
bool fixBackwardsWLS(MachineLoop *ML);
bool processPostOrderLoops(MachineLoop *ML);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@ -57,9 +59,135 @@ char ARMBlockPlacement::ID = 0;
INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false,
false)
/// Scans the terminators of \p MBB and returns the first one whose opcode is
/// ARM::t2WhileLoopStartLR, or nullptr when the block has none.
static MachineInstr *findWLSInBlock(MachineBasicBlock *MBB) {
  MachineInstr *Found = nullptr;
  for (MachineInstr &Term : MBB->terminators()) {
    if (Term.getOpcode() != ARM::t2WhileLoopStartLR)
      continue;
    // First match wins; later terminators are not inspected.
    Found = &Term;
    break;
  }
  return Found;
}
/// Locates the t2WhileLoopStartLR associated with \p ML. The loop predecessor
/// block is searched first; if it holds no WLS and has exactly one
/// predecessor, that single predecessor is searched as well.
/// Returns the WLS instruction, or nullptr if the loop has no predecessor
/// block or no WLS was found.
static MachineInstr *findWLS(MachineLoop *ML) {
  MachineBasicBlock *LoopPred = ML->getLoopPredecessor();
  if (!LoopPred)
    return nullptr;

  if (MachineInstr *WLS = findWLSInBlock(LoopPred))
    return WLS;

  // Fall back to the block feeding the loop predecessor, but only when that
  // block is unambiguous (exactly one predecessor).
  const bool HasSinglePred = LoopPred->pred_size() == 1;
  return HasSinglePred ? findWLSInBlock(*LoopPred->pred_begin()) : nullptr;
}
/// Checks if the loop has a backwards branching WLS and, if possible, fixes
/// it. This requires checking the preheader (or its predecessor) for a WLS
/// and whether the WLS target (LoopExit) is placed before the block holding
/// the WLS.
/// If moving the target block wouldn't produce another backwards WLS or a new
/// forwards LE branch, the target block is moved to directly after the block
/// holding the WLS (the preheader or its predecessor).
/// Returns true if a block was moved, false otherwise.
bool ARMBlockPlacement::fixBackwardsWLS(MachineLoop *ML) {
  MachineInstr *WlsInstr = findWLS(ML);
  if (!WlsInstr)
    return false;

  // "Predecessor" is whichever block actually contains the WLS.
  MachineBasicBlock *Predecessor = WlsInstr->getParent();
  MachineBasicBlock *LoopExit = WlsInstr->getOperand(2).getMBB();

  // We don't want to move the function's entry block.
  if (!LoopExit->getPrevNode())
    return false;
  // The WLS already branches forwards; nothing to fix.
  if (blockIsBefore(Predecessor, LoopExit))
    return false;

  LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
                    << Predecessor->getFullName() << " to "
                    << LoopExit->getFullName() << "\n");

  // Make sure that moving the target block doesn't cause any of its WLSs
  // that were previously not backwards to become backwards
  if (MachineInstr *WlsInLoopExit = findWLSInBlock(LoopExit)) {
    // An example loop structure where the LoopExit can't be moved, since
    // bb1's WLS will become backwards once it's moved after bb3
    // bb1:           - LoopExit
    //      WLS bb2
    // bb2:           - LoopExit2
    //      ...
    // bb3:           - Predecessor
    //      WLS bb1
    // bb4:           - Header
    MachineBasicBlock *LoopExit2 = WlsInLoopExit->getOperand(2).getMBB();
    // If the WLS from LoopExit to LoopExit2 is already backwards then
    // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is
    // after the Predecessor then moving will keep it as a forward branch, so
    // it can be moved. If LoopExit2 is between the Predecessor and LoopExit
    // then moving LoopExit will make it a backwards branch, so it can't be
    // moved since we'd fix one and introduce one backwards branch.
    // TODO: Analyse the blocks to make a decision if it would be worth
    // moving LoopExit even if LoopExit2 is between the Predecessor and
    // LoopExit.
    if (!blockIsBefore(LoopExit2, LoopExit) &&
        (LoopExit2 == Predecessor || blockIsBefore(LoopExit2, Predecessor))) {
      LLVM_DEBUG(dbgs() << DEBUG_PREFIX
                        << "Can't move the target block as it would "
                           "introduce a new backwards WLS branch\n");
      return false;
    }
  }

  // Make sure no LEs become forwards.
  // An example loop structure where the LoopExit can't be moved, since
  // bb2's LE will become forwards once bb1 is moved after bb3.
  // bb1:           - LoopExit
  // bb2:
  //      LE  bb1   - Terminator
  // bb3:           - Predecessor
  //      WLS bb1
  // bb4:           - Header
  for (auto It = LoopExit->getIterator(); It != Predecessor->getIterator();
       ++It) {
    MachineBasicBlock *MBB = &*It;
    for (auto &Terminator : MBB->terminators()) {
      if (Terminator.getOpcode() != ARM::t2LoopEnd &&
          Terminator.getOpcode() != ARM::t2LoopEndDec)
        continue;
      // NOTE(review): operand 2 is read as the branch target for both
      // opcodes -- confirm this index is correct for t2LoopEnd.
      MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
      // The LE will become forwards branching if it branches to LoopExit
      // which isn't allowed by the architecture, so we should avoid
      // introducing these.
      // TODO: Analyse the blocks to make a decision if it would be worth
      // moving LoopExit even if we'd introduce a forwards LE
      if (LETarget == LoopExit) {
        LLVM_DEBUG(dbgs() << DEBUG_PREFIX
                          << "Can't move the target block as it would "
                             "introduce a new forwards LE branch\n");
        // One blocking LE is enough: stop scanning the remaining blocks
        // instead of only leaving the inner terminator loop.
        return false;
      }
    }
  }

  moveBasicBlock(LoopExit, Predecessor);
  return true;
}
/// Recursively visits the sub-loops of \p ML before \p ML itself, applying
/// fixBackwardsWLS to each loop (so inner loops have their WLS block / loop
/// exit ordering updated first).
/// Returns true if any loop in the nest reported a change.
bool ARMBlockPlacement::processPostOrderLoops(MachineLoop *ML) {
  bool InnerChanged = false;
  for (MachineLoop *SubLoop : *ML) {
    if (processPostOrderLoops(SubLoop))
      InnerChanged = true;
  }
  // Evaluated unconditionally: the outer loop must be processed even when an
  // inner loop already reported a change.
  const bool OuterChanged = fixBackwardsWLS(ML);
  return InnerChanged || OuterChanged;
}
bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
  // Bail out when skipFunction() reports that this function must not be
  // transformed.
  if (skipFunction(MF.getFunction()))
    return false;
  // Nothing to do unless the subtarget reports LOB support (ST.hasLOB()).
  const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(MF.getSubtarget());
  if (!ST.hasLOB())
    return false;
@ -72,109 +200,9 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BBUtils->adjustBBOffsetsAfter(&MF.front());
bool Changed = false;
// Find loops with a backwards branching WLS.
// This requires looping over the loops in the function, checking each
// preheader for a WLS and if its target is before the preheader. If moving
// the target block wouldn't produce another backwards WLS or a new forwards
// LE branch then move the target block after the preheader.
for (auto *ML : *MLI) {
MachineBasicBlock *Preheader = ML->getLoopPredecessor();
if (!Preheader)
continue;
for (auto &Terminator : Preheader->terminators()) {
if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR)
continue;
MachineBasicBlock *LoopExit = Terminator.getOperand(2).getMBB();
// We don't want to move the function's entry block.
if (!LoopExit->getPrevNode())
continue;
if (blockIsBefore(Preheader, LoopExit))
continue;
LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from "
<< Preheader->getFullName() << " to "
<< LoopExit->getFullName() << "\n");
// Make sure that moving the target block doesn't cause any of its WLSs
// that were previously not backwards to become backwards
bool CanMove = true;
for (auto &LoopExitTerminator : LoopExit->terminators()) {
if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStartLR)
continue;
// An example loop structure where the LoopExit can't be moved, since
// bb1's WLS will become backwards once it's moved after bb3 bb1: -
// LoopExit
// WLS bb2 - LoopExit2
// bb2:
// ...
// bb3: - Preheader
// WLS bb1
// bb4: - Header
MachineBasicBlock *LoopExit2 =
LoopExitTerminator.getOperand(2).getMBB();
// If the WLS from LoopExit to LoopExit2 is already backwards then
// moving LoopExit won't affect it, so it can be moved. If LoopExit2 is
// after the Preheader then moving will keep it as a forward branch, so
// it can be moved. If LoopExit2 is between the Preheader and LoopExit
// then moving LoopExit will make it a backwards branch, so it can't be
// moved since we'd fix one and introduce one backwards branch.
// TODO: Analyse the blocks to make a decision if it would be worth
// moving LoopExit even if LoopExit2 is between the Preheader and
// LoopExit.
if (!blockIsBefore(LoopExit2, LoopExit) &&
(LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) {
LLVM_DEBUG(dbgs() << DEBUG_PREFIX
<< "Can't move the target block as it would "
"introduce a new backwards WLS branch\n");
CanMove = false;
break;
}
}
if (CanMove) {
// Make sure no LEs become forwards.
// An example loop structure where the LoopExit can't be moved, since
// bb2's LE will become forwards once bb1 is moved after bb3.
// bb1: - LoopExit
// bb2:
// LE bb1 - Terminator
// bb3: - Preheader
// WLS bb1
// bb4: - Header
for (auto It = LoopExit->getIterator(); It != Preheader->getIterator();
It++) {
MachineBasicBlock *MBB = &*It;
for (auto &Terminator : MBB->terminators()) {
if (Terminator.getOpcode() != ARM::t2LoopEnd &&
Terminator.getOpcode() != ARM::t2LoopEndDec)
continue;
MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
// The LE will become forwards branching if it branches to LoopExit
// which isn't allowed by the architecture, so we should avoid
// introducing these.
// TODO: Analyse the blocks to make a decision if it would be worth
// moving LoopExit even if we'd introduce a forwards LE
if (LETarget == LoopExit) {
LLVM_DEBUG(dbgs() << DEBUG_PREFIX
<< "Can't move the target block as it would "
"introduce a new forwards LE branch\n");
CanMove = false;
break;
}
}
}
if (!CanMove)
break;
}
if (CanMove) {
moveBasicBlock(LoopExit, Preheader);
Changed = true;
break;
}
}
}
// Find loops with a backwards branching WLS and fix if possible.
for (auto *ML : *MLI)
Changed |= processPostOrderLoops(ML);
return Changed;
}
@ -184,6 +212,8 @@ bool ARMBlockPlacement::blockIsBefore(MachineBasicBlock *BB,
return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB);
}
/// Moves a given MBB to be positioned after another MBB while maintaining
/// existing control flow
void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
MachineBasicBlock *After) {
LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after "
@ -195,6 +225,9 @@ void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
BB->moveAfter(After);
// Since only the blocks are to be moved around (but the control flow must
// not change), if there were any fall-throughs (to/from adjacent blocks),
// replace with unconditional branch to the fall through block.
auto FixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) {
LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from "
<< From->getName() << " to " << To->getName() << "\n");

View File

@ -1,16 +1,19 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -run-pass=arm-block-placement %s -o - | FileCheck %s
--- |
; Checks that loopExitBlock gets moved (in forward direction) if there is a backwards WLS to it.
define void @backwards_branch(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
entry:
unreachable
}
; Checks that loopExitBlock does not get reordered (since it is entry block) even if there is a backwards WLS to it.
define void @backwards_branch_entry_block(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
entry:
unreachable
}
; Checks that loopExitBlock (containing a backwards WLS) is moved (in forward direction) if there is a backwards WLS to it.
define void @backwards_branch_target_already_backwards(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
entry:
unreachable
@ -21,16 +24,25 @@
unreachable
}
; Checks that loopExitBlock (to which a backwards LE exists) is not moved if moving it would cause the LE to become forwards branching.
define void @backwards_branch_forwards_le(i32 %N, i32 %M, i32* nocapture %a, i32* nocapture %b, i32* nocapture %c) local_unnamed_addr #0 {
entry:
unreachable
}
; Checks that a MachineFunction is unaffected if it doesn't contain any WLS (pseudo) instruction.
define void @no_preheader(i32 %N, i32 %M, i32* nocapture %a, i32* nocapture %b, i32* nocapture %c) local_unnamed_addr #0 {
entry:
unreachable
}
; Within a nested loop, checks that loopExit gets moved (in forward direction) if there exists a backwards WLS to it.
; Both the WLS and loopExit are at depth=3.
define void @nested_loops(i32 %n, i32 %m, i32 %l, i8* noalias %X, i8* noalias %Y) local_unnamed_addr #0 {
entry:
unreachable
}
declare dso_local i32 @g(...) local_unnamed_addr #1
declare dso_local i32 @h(...) local_unnamed_addr #1
@ -441,3 +453,188 @@ body: |
bb.5:
frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc
...
---
name: nested_loops
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$r0' }
- { reg: '$r1' }
- { reg: '$r2' }
- { reg: '$r3' }
frameInfo:
stackSize: 32
maxAlignment: 4
maxCallFrameSize: 0
fixedStack:
- { id: 0, size: 4, alignment: 8, isImmutable: true }
stack:
- { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '$lr',
callee-saved-restored: false }
- { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '$r10' }
- { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, callee-saved-register: '$r9' }
- { id: 3, type: spill-slot, offset: -16, size: 4, alignment: 4, callee-saved-register: '$r8' }
- { id: 4, type: spill-slot, offset: -20, size: 4, alignment: 4, callee-saved-register: '$r7' }
- { id: 5, type: spill-slot, offset: -24, size: 4, alignment: 4, callee-saved-register: '$r6' }
- { id: 6, type: spill-slot, offset: -28, size: 4, alignment: 4, callee-saved-register: '$r5' }
- { id: 7, type: spill-slot, offset: -32, size: 4, alignment: 4, callee-saved-register: '$r4' }
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: nested_loops
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $lr
; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $lr
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 32
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r10, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r9, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -16
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -20
; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -24
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -28
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -32
; CHECK: tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2IT 11, 8, implicit-def $itstate
; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 11 /* CC::lt */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc, implicit killed $itstate
; CHECK: bb.1:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: renamable $r12 = t2LDRi12 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: $r9 = tMOVr killed $r2, 14 /* CC::al */, $noreg
; CHECK: renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg
; CHECK: bb.2:
; CHECK: successors: %bb.9(0x04000000), %bb.3(0x7c000000)
; CHECK: liveins: $r0, $r1, $r3, $r8, $r9, $r12
; CHECK: renamable $r8 = nuw nsw t2ADDri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg
; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg
; CHECK: tCMPhir renamable $r8, renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr
; CHECK: bb.3:
; CHECK: successors: %bb.4(0x50000000), %bb.2(0x30000000)
; CHECK: liveins: $r0, $r1, $r3, $r8, $r9, $r12
; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, killed $cpsr
; CHECK: bb.4:
; CHECK: successors: %bb.6(0x80000000)
; CHECK: liveins: $r0, $r1, $r3, $r8, $r9, $r12
; CHECK: renamable $r4, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
; CHECK: $r10 = tMOVr $r12, 14 /* CC::al */, $noreg
; CHECK: $r2 = tMOVr $r3, 14 /* CC::al */, $noreg
; CHECK: t2B %bb.6, 14 /* CC::al */, $noreg
; CHECK: bb.6:
; CHECK: successors: %bb.7(0x50000000), %bb.5(0x30000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
; CHECK: renamable $lr = t2WhileLoopStartLR killed renamable $r9, %bb.5, implicit-def dead $cpsr
; CHECK: tB %bb.7, 14 /* CC::al */, $noreg
; CHECK: bb.5:
; CHECK: successors: %bb.2(0x04000000), %bb.6(0x7c000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
; CHECK: renamable $r4, dead $cpsr = nuw nsw tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg
; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 1, 14 /* CC::al */, $noreg
; CHECK: tCMPr renamable $r4, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: renamable $r10 = t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2Bcc %bb.2, 0 /* CC::eq */, killed $cpsr
; CHECK: tB %bb.6, 14 /* CC::al */, $noreg
; CHECK: bb.7:
; CHECK: successors: %bb.8(0x80000000)
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
; CHECK: $r5 = tMOVr $r10, 14 /* CC::al */, $noreg
; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
; CHECK: t2B %bb.8, 14 /* CC::al */, $noreg
; CHECK: bb.8:
; CHECK: successors: %bb.8(0x7c000000), %bb.5(0x04000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10, $r12
; CHECK: tSTRi killed $r0, $r1, 0, 14 /* CC::al */, $noreg
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.8, implicit-def dead $cpsr
; CHECK: t2B %bb.5, 14 /* CC::al */, $noreg
; CHECK: bb.9:
; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc
bb.0:
successors: %bb.1
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $lr
$sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $lr
frame-setup CFI_INSTRUCTION def_cfa_offset 32
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r10, -8
frame-setup CFI_INSTRUCTION offset $r9, -12
frame-setup CFI_INSTRUCTION offset $r8, -16
frame-setup CFI_INSTRUCTION offset $r7, -20
frame-setup CFI_INSTRUCTION offset $r6, -24
frame-setup CFI_INSTRUCTION offset $r5, -28
frame-setup CFI_INSTRUCTION offset $r4, -32
tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2IT 11, 8, implicit-def $itstate
$sp = frame-destroy t2LDMIA_RET $sp, 11 /* CC::lt */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc, implicit killed $itstate
bb.1:
liveins: $r0, $r1, $r2, $r3
renamable $r12 = t2LDRi12 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
$r9 = tMOVr killed $r2, 14 /* CC::al */, $noreg
renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
t2B %bb.2, 14 /* CC::al */, $noreg
bb.8:
successors: %bb.9(0x04000000), %bb.2(0x7c000000)
liveins: $r0, $r1, $r3, $r8, $r9, $r12
renamable $r8 = nuw nsw t2ADDri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg
renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg
tCMPhir renamable $r8, renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
renamable $r12 = t2ADDri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg
t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr
bb.2:
successors: %bb.3(0x50000000), %bb.8(0x30000000)
liveins: $r0, $r1, $r3, $r8, $r9, $r12
tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2Bcc %bb.8, 11 /* CC::lt */, killed $cpsr
bb.3:
liveins: $r0, $r1, $r3, $r8, $r9, $r12
renamable $r4, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg
$r10 = tMOVr $r12, 14 /* CC::al */, $noreg
$r2 = tMOVr $r3, 14 /* CC::al */, $noreg
t2B %bb.4, 14 /* CC::al */, $noreg
bb.7:
successors: %bb.8(0x04000000), %bb.4(0x7c000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
renamable $r4, dead $cpsr = nuw nsw tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg
renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 1, 14 /* CC::al */, $noreg
tCMPr renamable $r4, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr
renamable $r10 = t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
t2Bcc %bb.8, 0 /* CC::eq */, killed $cpsr
bb.4:
successors: %bb.5(0x50000000), %bb.7(0x30000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
renamable $lr = t2WhileLoopStartLR killed renamable $r9, %bb.7, implicit-def dead $cpsr
bb.5:
liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12
$r5 = tMOVr $r10, 14 /* CC::al */, $noreg
$r6 = tMOVr $r2, 14 /* CC::al */, $noreg
t2B %bb.6, 14 /* CC::al */, $noreg
bb.6:
successors: %bb.6(0x7c000000), %bb.7(0x04000000)
liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10, $r12
tSTRi killed $r0, $r1, 0, 14 /* CC::al */, $noreg
renamable $lr = t2LoopEndDec killed renamable $lr, %bb.6, implicit-def dead $cpsr
t2B %bb.7, 14 /* CC::al */, $noreg
bb.9:
$sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc
...

View File

@ -1077,18 +1077,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_4: @ %while.body
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_3: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
; CHECK-NEXT: @ Child Loop BB16_5 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r3, #14]
@ -1125,14 +1117,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_7
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: blo .LBB16_6
; CHECK-NEXT: @ %bb.4: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_6: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: .LBB16_5: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r0, [r6], #16
; CHECK-NEXT: vldrw.u32 q1, [r5]
@ -1163,33 +1155,39 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_6
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: le lr, .LBB16_5
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_6: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: .LBB16_7: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: subs.w lr, r0, #0
; CHECK-NEXT: beq.w .LBB16_3
; CHECK-NEXT: wls lr, r0, .LBB16_8
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_8: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r4, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB16_10
; CHECK-NEXT: b .LBB16_11
; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: le lr, .LBB16_10
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r5, r5, r0, lsl #1
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}

View File

@ -1071,18 +1071,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: add.w r0, r4, r0, lsl #2
; CHECK-NEXT: add.w r4, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: .LBB16_4: @ %while.body
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_3: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
; CHECK-NEXT: @ Child Loop BB16_5 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
@ -1109,14 +1101,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: blo .LBB16_7
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: blo .LBB16_6
; CHECK-NEXT: @ %bb.4: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_6: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: .LBB16_5: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
; CHECK-NEXT: vldrw.u32 q1, [r4], #32
@ -1137,34 +1129,40 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: vfma.f32 q0, q2, r11
; CHECK-NEXT: vfma.f32 q0, q3, r9
; CHECK-NEXT: vfma.f32 q0, q1, r1
; CHECK-NEXT: le lr, .LBB16_6
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: le lr, .LBB16_5
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_6: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: .LBB16_7: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
; CHECK-NEXT: subs.w lr, r0, #0
; CHECK-NEXT: beq .LBB16_3
; CHECK-NEXT: wls lr, r0, .LBB16_8
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_8: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: add.w r0, r4, r0, lsl #2
; CHECK-NEXT: add.w r4, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: mov r3, r4
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr r0, [r7], #4
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: bne .LBB16_10
; CHECK-NEXT: b .LBB16_11
; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: le lr, .LBB16_10
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r4, r4, r0, lsl #2
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}