forked from OSchip/llvm-project
[ARM][ReachingDefs] Remove dead code in loloops.
Add some more helper functions to ReachingDefs to query the uses of a given MachineInstr and also to query whether two MachineInstrs use the same def of a register. For Arm, while tail-predicating, these helpers are used in the low-overhead loops to remove the dead code that calculates the number of loop iterations. Differential Revision: https://reviews.llvm.org/D70240
This commit is contained in:
parent
3ec193fb52
commit
28166816b0
|
@ -114,6 +114,15 @@ public:
|
|||
/// reaching def instruction of PhysReg that reaches MI.
|
||||
int getClearance(MachineInstr *MI, MCPhysReg PhysReg);
|
||||
|
||||
/// Provides the uses, in the same block as MI, of the register that MI defines.
|
||||
/// This does not consider live-outs.
|
||||
void getReachingLocalUses(MachineInstr *MI, int PhysReg,
|
||||
SmallVectorImpl<MachineInstr*> &Uses);
|
||||
|
||||
/// Provide the number of uses, in the same block as MI, of the register that
|
||||
/// MI defines.
|
||||
unsigned getNumUses(MachineInstr *MI, int PhysReg);
|
||||
|
||||
private:
|
||||
/// Set up LiveRegs by merging predecessor live-out values.
|
||||
void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB);
|
||||
|
|
|
@ -194,9 +194,19 @@ MachineInstr* ReachingDefAnalysis::getReachingMIDef(MachineInstr *MI, int PhysRe
|
|||
return getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg));
|
||||
}
|
||||
|
||||
bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
|
||||
int PhysReg) {
|
||||
MachineBasicBlock *ParentA = A->getParent();
|
||||
MachineBasicBlock *ParentB = B->getParent();
|
||||
if (ParentA != ParentB)
|
||||
return false;
|
||||
|
||||
return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg);
|
||||
}
|
||||
|
||||
MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
|
||||
int InstId) {
|
||||
assert(MBB->getNumber() < MBBReachingDefs.size() &&
|
||||
assert(static_cast<size_t>(MBB->getNumber()) < MBBReachingDefs.size() &&
|
||||
"Unexpected basic block number.");
|
||||
assert(InstId < static_cast<int>(MBB->size()) &&
|
||||
"Unexpected instruction id.");
|
||||
|
@ -216,14 +226,31 @@ int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) {
|
|||
return InstIds[MI] - getReachingDef(MI, PhysReg);
|
||||
}
|
||||
|
||||
bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
|
||||
int PhysReg) {
|
||||
MachineBasicBlock *ParentA = A->getParent();
|
||||
MachineBasicBlock *ParentB = B->getParent();
|
||||
if (ParentA != ParentB)
|
||||
return false;
|
||||
void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg,
|
||||
SmallVectorImpl<MachineInstr*> &Uses) {
|
||||
MachineBasicBlock *MBB = Def->getParent();
|
||||
MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
|
||||
while (++MI != MBB->end()) {
|
||||
for (auto &MO : MI->operands()) {
|
||||
if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg)
|
||||
continue;
|
||||
|
||||
return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg);
|
||||
// If/when we find a new reaching def, we know that there are no more uses
|
||||
// of 'Def'.
|
||||
if (getReachingMIDef(&*MI, PhysReg) != Def)
|
||||
return;
|
||||
|
||||
Uses.push_back(&*MI);
|
||||
if (MO.isKill())
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) {
|
||||
SmallVector<MachineInstr*, 4> Uses;
|
||||
getReachingLocalUses(Def, PhysReg, Uses);
|
||||
return Uses.size();
|
||||
}
|
||||
|
||||
bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) {
|
||||
|
|
|
@ -110,12 +110,41 @@ namespace {
|
|||
|
||||
// Check the branch targets are within range and we satisfy our
|
||||
// restrictions.
|
||||
void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA);
|
||||
void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
|
||||
MachineLoopInfo *MLI);
|
||||
|
||||
bool FoundAllComponents() const {
|
||||
return Start && Dec && End;
|
||||
}
|
||||
|
||||
// Return the loop iteration count, or the number of elements if we're tail
|
||||
// predicating.
|
||||
MachineOperand &getCount() {
|
||||
return IsTailPredicationLegal() ?
|
||||
VCTP->getOperand(1) : Start->getOperand(0);
|
||||
}
|
||||
|
||||
unsigned getStartOpcode() const {
|
||||
bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
|
||||
if (!IsTailPredicationLegal())
|
||||
return IsDo ? ARM::t2DLS : ARM::t2WLS;
|
||||
|
||||
switch (VCTP->getOpcode()) {
|
||||
default:
|
||||
llvm_unreachable("unhandled vctp opcode");
|
||||
break;
|
||||
case ARM::MVE_VCTP8:
|
||||
return IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8;
|
||||
case ARM::MVE_VCTP16:
|
||||
return IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16;
|
||||
case ARM::MVE_VCTP32:
|
||||
return IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32;
|
||||
case ARM::MVE_VCTP64:
|
||||
return IsDo ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void dump() const {
|
||||
if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
|
||||
if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
|
||||
|
@ -130,6 +159,7 @@ namespace {
|
|||
|
||||
class ARMLowOverheadLoops : public MachineFunctionPass {
|
||||
MachineFunction *MF = nullptr;
|
||||
MachineLoopInfo *MLI = nullptr;
|
||||
ReachingDefAnalysis *RDA = nullptr;
|
||||
const ARMBaseInstrInfo *TII = nullptr;
|
||||
MachineRegisterInfo *MRI = nullptr;
|
||||
|
@ -236,7 +266,8 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
|
|||
}
|
||||
|
||||
void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
|
||||
ReachingDefAnalysis *RDA) {
|
||||
ReachingDefAnalysis *RDA,
|
||||
MachineLoopInfo *MLI) {
|
||||
if (Revert)
|
||||
return;
|
||||
|
||||
|
@ -273,14 +304,70 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
|
|||
if (!InsertPt) {
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
|
||||
Revert = true;
|
||||
return;
|
||||
} else
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
|
||||
|
||||
LLVM_DEBUG(if (IsTailPredicationLegal()) {
|
||||
dbgs() << "ARM Loops: Will use tail predication to convert:\n";
|
||||
// For tail predication, we need to provide the number of elements, instead
|
||||
// of the iteration count, to the loop start instruction. The number of
|
||||
// elements is provided to the vctp instruction, so we need to check that
|
||||
// we can use this register at InsertPt.
|
||||
if (!IsTailPredicationLegal())
|
||||
return;
|
||||
|
||||
Register NumElements = VCTP->getOperand(1).getReg();
|
||||
|
||||
// If the register is defined within the loop, then we can't perform TP.
|
||||
// TODO: Check whether this is just a mov of a register that would be
|
||||
// available.
|
||||
if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
|
||||
CannotTailPredicate = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// We can't perform TP if the register does not hold the same value at
|
||||
// InsertPt as the liveout value.
|
||||
MachineBasicBlock *InsertBB = InsertPt->getParent();
|
||||
if (!RDA->hasSameReachingDef(InsertPt, &InsertBB->back(),
|
||||
NumElements)) {
|
||||
CannotTailPredicate = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Especially in the case of while loops, InsertBB may not be the
|
||||
// preheader, so we need to check that the register isn't redefined
|
||||
// before entering the loop.
|
||||
auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
|
||||
Register NumElements) {
|
||||
// NumElements is redefined in this block.
|
||||
if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
|
||||
return true;
|
||||
|
||||
// Don't continue searching up through multiple predecessors.
|
||||
if (MBB->pred_size() > 1)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
// First, find the block that looks like the preheader.
|
||||
MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
|
||||
if (!MBB) {
|
||||
CannotTailPredicate = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Then search backwards for a def, until we get to InsertBB.
|
||||
while (MBB != InsertBB) {
|
||||
CannotTailPredicate = CannotProvideElements(MBB, NumElements);
|
||||
if (CannotTailPredicate)
|
||||
return;
|
||||
MBB = *MBB->pred_begin();
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication to convert:\n";
|
||||
for (auto *MI : VPTUsers)
|
||||
dbgs() << " - " << *MI;
|
||||
});
|
||||
dbgs() << " - " << *MI;);
|
||||
}
|
||||
|
||||
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
|
||||
|
@ -291,7 +378,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
|
|||
MF = &mf;
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
|
||||
|
||||
auto &MLI = getAnalysis<MachineLoopInfo>();
|
||||
MLI = &getAnalysis<MachineLoopInfo>();
|
||||
RDA = &getAnalysis<ReachingDefAnalysis>();
|
||||
MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
|
||||
MRI = &MF->getRegInfo();
|
||||
|
@ -301,7 +388,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
|
|||
BBUtils->adjustBBOffsetsAfter(&MF->front());
|
||||
|
||||
bool Changed = false;
|
||||
for (auto ML : MLI) {
|
||||
for (auto ML : *MLI) {
|
||||
if (!ML->getParentLoop())
|
||||
Changed |= ProcessLoop(ML);
|
||||
}
|
||||
|
@ -317,7 +404,14 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
|
|||
for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
|
||||
Changed |= ProcessLoop(*I);
|
||||
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
|
||||
if (auto *Preheader = ML->getLoopPreheader())
|
||||
dbgs() << " - " << Preheader->getName() << "\n";
|
||||
else if (auto *Preheader = MLI->findLoopPreheader(ML))
|
||||
dbgs() << " - " << Preheader->getName() << "\n";
|
||||
for (auto *MBB : ML->getBlocks())
|
||||
dbgs() << " - " << MBB->getName() << "\n";
|
||||
);
|
||||
|
||||
// Search the given block for a loop start instruction. If one isn't found,
|
||||
// and there's only one predecessor block, search that one too.
|
||||
|
@ -333,28 +427,15 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
|
|||
};
|
||||
|
||||
LowOverheadLoop LoLoop(ML);
|
||||
// Search the preheader for the start intrinsic, or look through the
|
||||
// predecessors of the header to find exactly one set.iterations intrinsic.
|
||||
// Search the preheader for the start intrinsic.
|
||||
// FIXME: I don't see why we shouldn't be supporting multiple predecessors
|
||||
// with potentially multiple set.loop.iterations, so we need to enable this.
|
||||
if (auto *Preheader = ML->getLoopPreheader())
|
||||
LoLoop.Start = SearchForStart(Preheader);
|
||||
else {
|
||||
LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
|
||||
<< " - Performing manual predecessor search.\n");
|
||||
MachineBasicBlock *Pred = nullptr;
|
||||
for (auto *MBB : ML->getHeader()->predecessors()) {
|
||||
if (!ML->contains(MBB)) {
|
||||
if (Pred) {
|
||||
LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
|
||||
LoLoop.Start = nullptr;
|
||||
break;
|
||||
}
|
||||
Pred = MBB;
|
||||
LoLoop.Start = SearchForStart(MBB);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
|
||||
LoLoop.Start = SearchForStart(Preheader);
|
||||
else
|
||||
return false;
|
||||
|
||||
// Find the low-overhead loop components and decide whether or not to fall
|
||||
// back to a normal loop. Also look for vctp instructions and decide
|
||||
|
@ -412,7 +493,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
|
|||
if (!LoLoop.FoundAllComponents())
|
||||
return false;
|
||||
|
||||
LoLoop.CheckLegality(BBUtils.get(), RDA);
|
||||
LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
|
||||
Expand(LoLoop);
|
||||
return true;
|
||||
}
|
||||
|
@ -504,35 +585,45 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
|
|||
MachineInstr *Start = LoLoop.Start;
|
||||
MachineBasicBlock *MBB = InsertPt->getParent();
|
||||
bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
|
||||
unsigned Opc = 0;
|
||||
|
||||
if (!LoLoop.IsTailPredicationLegal())
|
||||
Opc = IsDo ? ARM::t2DLS : ARM::t2WLS;
|
||||
else {
|
||||
switch (LoLoop.VCTP->getOpcode()) {
|
||||
case ARM::MVE_VCTP8:
|
||||
Opc = IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8;
|
||||
break;
|
||||
case ARM::MVE_VCTP16:
|
||||
Opc = IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16;
|
||||
break;
|
||||
case ARM::MVE_VCTP32:
|
||||
Opc = IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32;
|
||||
break;
|
||||
case ARM::MVE_VCTP64:
|
||||
Opc = IsDo ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64;
|
||||
break;
|
||||
}
|
||||
}
|
||||
unsigned Opc = LoLoop.getStartOpcode();
|
||||
MachineOperand &Count = LoLoop.getCount();
|
||||
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
|
||||
|
||||
MIB.addDef(ARM::LR);
|
||||
MIB.add(Start->getOperand(0));
|
||||
MIB.add(Count);
|
||||
if (!IsDo)
|
||||
MIB.add(Start->getOperand(1));
|
||||
|
||||
// When using tail-predication, try to delete the dead code that was used to
|
||||
// calculate the number of loop iterations.
|
||||
if (LoLoop.IsTailPredicationLegal()) {
|
||||
SmallVector<MachineInstr*, 4> Killed;
|
||||
SmallVector<MachineInstr*, 4> Dead;
|
||||
if (auto *Def = RDA->getReachingMIDef(Start,
|
||||
Start->getOperand(0).getReg())) {
|
||||
Killed.push_back(Def);
|
||||
|
||||
while (!Killed.empty()) {
|
||||
MachineInstr *Def = Killed.back();
|
||||
Killed.pop_back();
|
||||
Dead.push_back(Def);
|
||||
for (auto &MO : Def->operands()) {
|
||||
if (!MO.isReg() || !MO.isKill())
|
||||
continue;
|
||||
|
||||
MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
|
||||
if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
|
||||
Killed.push_back(Kill);
|
||||
}
|
||||
}
|
||||
for (auto *MI : Dead)
|
||||
MI->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
// If we're inserting at a mov lr, then remove it as it's redundant.
|
||||
if (InsertPt != Start)
|
||||
InsertPt->eraseFromParent();
|
||||
Start->eraseFromParent();
|
||||
|
|
|
@ -36,11 +36,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
|
|||
; CHECK-NEXT: mov.w r12, #0
|
||||
; CHECK-NEXT: b .LBB0_8
|
||||
; CHECK-NEXT: .LBB0_4: @ %vector.ph
|
||||
; CHECK-NEXT: adds r6, r3, #3
|
||||
; CHECK-NEXT: bic r6, r6, #3
|
||||
; CHECK-NEXT: subs r6, #4
|
||||
; CHECK-NEXT: add.w lr, r12, r6, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r3
|
||||
; CHECK-NEXT: .LBB0_5: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
|
|
|
@ -16,17 +16,19 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
|
|||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: movs r3, #0
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB0_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: mov r12, r2
|
||||
; CHECK-NEXT: adds r2, r1, r3
|
||||
; CHECK-NEXT: vldrb.u32 q2, [r2]
|
||||
; CHECK-NEXT: vctp.32 r12
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vldrbt.u32 q2, [r2]
|
||||
; CHECK-NEXT: adds r3, #4
|
||||
; CHECK-NEXT: sub.w r2, r12, #4
|
||||
; CHECK-NEXT: vmov q1, q0
|
||||
; CHECK-NEXT: vmla.u32 q0, q2, r0
|
||||
; CHECK-NEXT: letp lr, .LBB0_1
|
||||
; CHECK-NEXT: le lr, .LBB0_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %middle.block
|
||||
; CHECK-NEXT: vctp.32 r12
|
||||
; CHECK-NEXT: vpsel q0, q0, q1
|
||||
|
@ -82,13 +84,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
|
|||
; CHECK-NEXT: moveq r0, #0
|
||||
; CHECK-NEXT: bxeq lr
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adds r3, r2, #3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: bic r3, r3, #3
|
||||
; CHECK-NEXT: sub.w r12, r3, #4
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.s32 q2, [r1]
|
||||
|
@ -160,17 +157,19 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
|
|||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: movs r3, #0
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB2_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: mov r12, r2
|
||||
; CHECK-NEXT: adds r2, r1, r3
|
||||
; CHECK-NEXT: vldrb.u32 q2, [r2]
|
||||
; CHECK-NEXT: vctp.32 r12
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vldrbt.u32 q2, [r2]
|
||||
; CHECK-NEXT: adds r3, #4
|
||||
; CHECK-NEXT: sub.w r2, r12, #4
|
||||
; CHECK-NEXT: vmov q1, q0
|
||||
; CHECK-NEXT: vmla.u32 q0, q2, r0
|
||||
; CHECK-NEXT: letp lr, .LBB2_1
|
||||
; CHECK-NEXT: le lr, .LBB2_1
|
||||
; CHECK-NEXT: @ %bb.2: @ %middle.block
|
||||
; CHECK-NEXT: vctp.32 r12
|
||||
; CHECK-NEXT: vpsel q0, q0, q1
|
||||
|
@ -226,13 +225,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
|
|||
; CHECK-NEXT: moveq r0, #0
|
||||
; CHECK-NEXT: bxeq lr
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adds r3, r2, #3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: bic r3, r3, #3
|
||||
; CHECK-NEXT: sub.w r12, r3, #4
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB3_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u32 q2, [r1]
|
||||
|
@ -297,13 +291,8 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
|
|||
; CHECK-NEXT: moveq r0, #0
|
||||
; CHECK-NEXT: bxeq lr
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adds r3, r2, #3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: bic r3, r3, #3
|
||||
; CHECK-NEXT: sub.w r12, r3, #4
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB4_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r1]
|
||||
|
@ -392,13 +381,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
|
|||
; CHECK-NEXT: movs r7, #0
|
||||
; CHECK-NEXT: b .LBB5_9
|
||||
; CHECK-NEXT: .LBB5_4: @ %vector.ph
|
||||
; CHECK-NEXT: add.w r7, r12, #3
|
||||
; CHECK-NEXT: movs r6, #1
|
||||
; CHECK-NEXT: bic r7, r7, #3
|
||||
; CHECK-NEXT: movs r4, #0
|
||||
; CHECK-NEXT: subs r7, #4
|
||||
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r12
|
||||
; CHECK-NEXT: .LBB5_5: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: adds r5, r0, r4
|
||||
|
@ -607,12 +591,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
|
|||
; CHECK-NEXT: cmp.w r12, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r4, pc}
|
||||
; CHECK-NEXT: add.w lr, r12, #3
|
||||
; CHECK-NEXT: movs r4, #1
|
||||
; CHECK-NEXT: bic lr, lr, #3
|
||||
; CHECK-NEXT: sub.w lr, lr, #4
|
||||
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r12
|
||||
; CHECK-NEXT: .LBB6_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r0]
|
||||
|
@ -703,13 +682,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
|
|||
; CHECK-NEXT: movs r7, #0
|
||||
; CHECK-NEXT: b .LBB7_9
|
||||
; CHECK-NEXT: .LBB7_4: @ %vector.ph
|
||||
; CHECK-NEXT: add.w r7, r12, #3
|
||||
; CHECK-NEXT: movs r6, #1
|
||||
; CHECK-NEXT: bic r7, r7, #3
|
||||
; CHECK-NEXT: movs r4, #0
|
||||
; CHECK-NEXT: subs r7, #4
|
||||
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r12
|
||||
; CHECK-NEXT: .LBB7_5: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: adds r5, r0, r4
|
||||
|
@ -918,12 +892,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
|
|||
; CHECK-NEXT: cmp.w r12, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r4, pc}
|
||||
; CHECK-NEXT: add.w lr, r12, #3
|
||||
; CHECK-NEXT: movs r4, #1
|
||||
; CHECK-NEXT: bic lr, lr, #3
|
||||
; CHECK-NEXT: sub.w lr, lr, #4
|
||||
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r12
|
||||
; CHECK-NEXT: .LBB8_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r0]
|
||||
|
@ -1016,11 +985,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
|
|||
; CHECK-NEXT: mov.w r12, #0
|
||||
; CHECK-NEXT: b .LBB9_8
|
||||
; CHECK-NEXT: .LBB9_4: @ %vector.ph
|
||||
; CHECK-NEXT: add.w r4, r12, #3
|
||||
; CHECK-NEXT: bic r4, r4, #3
|
||||
; CHECK-NEXT: subs r4, #4
|
||||
; CHECK-NEXT: add.w lr, lr, r4, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r12
|
||||
; CHECK-NEXT: .LBB9_5: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
||||
|
@ -1217,13 +1182,8 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
|
|||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r4, pc}
|
||||
; CHECK-NEXT: add.w r12, r3, #7
|
||||
; CHECK-NEXT: mov.w lr, #1
|
||||
; CHECK-NEXT: bic r12, r12, #7
|
||||
; CHECK-NEXT: sub.w r12, r12, #8
|
||||
; CHECK-NEXT: add.w lr, lr, r12, lsr #3
|
||||
; CHECK-NEXT: mov.w r12, #0
|
||||
; CHECK-NEXT: dlstp.16 lr, lr
|
||||
; CHECK-NEXT: dlstp.16 lr, r3
|
||||
; CHECK-NEXT: .LBB10_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: add.w r4, r1, r12
|
||||
|
|
|
@ -9,13 +9,8 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
|
|||
; CHECK-NEXT: moveq r0, #0
|
||||
; CHECK-NEXT: bxeq lr
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adds r3, r2, #3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: bic r3, r3, #3
|
||||
; CHECK-NEXT: sub.w r12, r3, #4
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB0_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vmov q1, q0
|
||||
|
@ -82,13 +77,8 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
|
|||
; CHECK-NEXT: moveq r0, #0
|
||||
; CHECK-NEXT: bxeq lr
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adds r1, r2, #3
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: bic r1, r1, #3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: subs r1, #4
|
||||
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: mov r1, r2
|
||||
|
@ -148,13 +138,8 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
|
|||
; CHECK-NEXT: moveq r0, #0
|
||||
; CHECK-NEXT: bxeq lr
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: adds r1, r2, #3
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: bic r1, r1, #3
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x0
|
||||
; CHECK-NEXT: subs r1, #4
|
||||
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r2
|
||||
; CHECK-NEXT: .LBB2_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: mov r1, r2
|
||||
|
@ -213,12 +198,7 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
|
|||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r7, pc}
|
||||
; CHECK-NEXT: add.w r12, r3, #3
|
||||
; CHECK-NEXT: mov.w lr, #1
|
||||
; CHECK-NEXT: bic r12, r12, #3
|
||||
; CHECK-NEXT: sub.w r12, r12, #4
|
||||
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r3
|
||||
; CHECK-NEXT: .LBB3_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
|
@ -272,12 +252,7 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
|
|||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r7, pc}
|
||||
; CHECK-NEXT: add.w r12, r3, #3
|
||||
; CHECK-NEXT: mov.w lr, #1
|
||||
; CHECK-NEXT: bic r12, r12, #3
|
||||
; CHECK-NEXT: sub.w r12, r12, #4
|
||||
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
|
||||
; CHECK-NEXT: dlstp.32 lr, lr
|
||||
; CHECK-NEXT: dlstp.32 lr, r3
|
||||
; CHECK-NEXT: .LBB4_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
|
@ -331,13 +306,8 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur
|
|||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r4, pc}
|
||||
; CHECK-NEXT: add.w r12, r3, #15
|
||||
; CHECK-NEXT: mov.w lr, #1
|
||||
; CHECK-NEXT: bic r12, r12, #15
|
||||
; CHECK-NEXT: sub.w r12, r12, #16
|
||||
; CHECK-NEXT: add.w lr, lr, r12, lsr #4
|
||||
; CHECK-NEXT: mov.w r12, #0
|
||||
; CHECK-NEXT: dlstp.8 lr, lr
|
||||
; CHECK-NEXT: dlstp.8 lr, r3
|
||||
; CHECK-NEXT: .LBB5_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: add.w r4, r1, r12
|
||||
|
@ -396,12 +366,7 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
|
|||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r7, pc}
|
||||
; CHECK-NEXT: add.w r12, r3, #7
|
||||
; CHECK-NEXT: mov.w lr, #1
|
||||
; CHECK-NEXT: bic r12, r12, #7
|
||||
; CHECK-NEXT: sub.w r12, r12, #8
|
||||
; CHECK-NEXT: add.w lr, lr, r12, lsr #3
|
||||
; CHECK-NEXT: dlstp.16 lr, lr
|
||||
; CHECK-NEXT: dlstp.16 lr, r3
|
||||
; CHECK-NEXT: .LBB6_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r1]
|
||||
|
|
|
@ -195,12 +195,7 @@ body: |
|
|||
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8
|
||||
; CHECK: renamable $r12 = t2ADDri renamable $r3, 15, 14, $noreg, $noreg
|
||||
; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg
|
||||
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg
|
||||
; CHECK: $lr = MVE_WLSTP_8 renamable $lr, %bb.1
|
||||
; CHECK: $lr = MVE_WLSTP_8 renamable $r3, %bb.1
|
||||
; CHECK: tB %bb.3, 14, $noreg
|
||||
; CHECK: bb.1.vector.ph:
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
|
@ -323,12 +318,7 @@ body: |
|
|||
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
|
||||
; CHECK: renamable $r12 = t2ADDri renamable $r3, 7, 14, $noreg, $noreg
|
||||
; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r12 = t2BICri killed renamable $r12, 7, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 8, 14, $noreg, $noreg
|
||||
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 27, 14, $noreg, $noreg
|
||||
; CHECK: $lr = MVE_WLSTP_16 renamable $lr, %bb.1
|
||||
; CHECK: $lr = MVE_WLSTP_16 renamable $r3, %bb.1
|
||||
; CHECK: tB %bb.2, 14, $noreg
|
||||
; CHECK: bb.1.vector.body:
|
||||
; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
|
||||
|
@ -437,13 +427,8 @@ body: |
|
|||
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
|
||||
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
|
||||
; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg
|
||||
; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
|
||||
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
|
||||
; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
|
||||
; CHECK: $lr = MVE_WLSTP_32 renamable $lr, %bb.1
|
||||
; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1
|
||||
; CHECK: tB %bb.4, 14, $noreg
|
||||
; CHECK: bb.1.vector.ph:
|
||||
; CHECK: successors: %bb.2(0x80000000)
|
||||
|
|
Loading…
Reference in New Issue