[AArch64] Homogeneous Prolog and Epilog Size Optimization

Prologs and epilogs handle callee-save registers and tend to be irregular with
different immediate offsets that are not often handled by the MachineOutliner.
Commit D18619/a5335647d5e8 (combining stack operations) stretched irregularity
further.

This patch tries to emit homogeneous stores and loads with the same offset for
prologs and epilogs respectively. We have observed that this canonicalizes
(homogenizes) prologs and epilogs significantly and results in a greatly
increased chance of outlining, resulting in a code size reduction.

Despite the above results, there are still size wins to be had that the
MachineOutliner does not provide due to the special handling of X30/LR. To
handle the LR case, this patch custom-outlines prologs and epilogs in place. It
does this by doing the following:

  * Injects HOM_Prolog and HOM_Epilog pseudo instructions during a Prolog and
    Epilog Injection Pass.
  * Lowers and optimizes said pseudos in an AArch64LowerHomogeneousPrologEpilog Pass.
  * Outlined helpers are created on demand. Identical helpers are merged by the linker.
  * An opt-in flag is introduced to enable this feature. Another threshold flag
    is also introduced to control the aggressiveness of outlining to suit the application's needs.

This reduced code size by an average of 4% on LLVM-TestSuite/CTMark targeting arm64/-Oz.

Differential Revision: https://reviews.llvm.org/D76570
This commit is contained in:
Kyungwoo Lee 2021-02-01 22:32:32 -05:00 committed by Puyan Lotfi
parent 327196d688
commit 0426be3df6
11 changed files with 1001 additions and 9 deletions

View File

@ -42,6 +42,7 @@ FunctionPass *createAArch64SLSHardeningPass();
FunctionPass *createAArch64IndirectThunks();
FunctionPass *createAArch64SpeculationHardeningPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
ModulePass *createAArch64LowerHomogeneousPrologEpilogPass();
FunctionPass *createAArch64SIMDInstrOptPass();
ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
@ -79,6 +80,7 @@ void initializeAArch64ExpandPseudoPass(PassRegistry&);
void initializeAArch64SLSHardeningPass(PassRegistry&);
void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);

View File

@ -179,6 +179,11 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
cl::desc("sort stack allocations"),
cl::init(true), cl::Hidden);
cl::opt<bool> EnableHomogeneousPrologEpilog(
"homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
cl::desc("Emit homogeneous prologue and epilogue for the size "
"optimization (default = off)"));
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// Returns the argument pop size.
@ -213,6 +218,47 @@ static uint64_t getArgumentPopSize(MachineFunction &MF,
return ArgumentPopSize;
}
static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);
/// Tells whether the homogeneous (size-optimized) prolog/epilog form may be
/// used for \p MF. Nothing is emitted here; callers use the answer to decide
/// whether to build the HOM_Prolog/HOM_Epilog pseudo instructions.
/// When \p Exit is given, the query is about the epilog in that block and
/// additionally rejects blocks that must pop arguments on return.
bool AArch64FrameLowering::homogeneousPrologEpilog(
    MachineFunction &MF, MachineBasicBlock *Exit) const {
  // The feature is opt-in and restricted to minsize functions.
  if (!(MF.getFunction().hasMinSize() && EnableHomogeneousPrologEpilog))
    return false;

  // Reversed CSR restore order and the red zone are not handled.
  if (ReverseCSRRestoreSeq || EnableRedZone)
    return false;

  // TODO: Windows CFI is not supported yet.
  // TODO: SVE is not supported yet.
  if (needsWinCFI(MF) || getSVEStackSize(MF))
    return false;

  // Bail on stack adjustment needed on return for simplicity.
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (FrameInfo.hasVarSizedObjects() || TRI->needsStackRealignment(MF))
    return false;

  // Epilog-only condition: the exit block must not pop any argument bytes.
  const bool PopsArguments = Exit && getArgumentPopSize(MF, *Exit);
  return !PopsArguments;
}
/// Callee-saved registers must be saved in pairs either when a compact unwind
/// frame is produced or when the homogeneous prolog/epilog form is in use.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
  if (produceCompactUnwindFrame(MF))
    return true;
  return homogeneousPrologEpilog(MF);
}
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exception here are vector stores/loads which cannot encode any
@ -605,6 +651,8 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (homogeneousPrologEpilog(MF))
return false;
if (AFI->getLocalStackSize() == 0)
return false;
@ -1148,12 +1196,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// All of the remaining stack allocations are for locals.
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
bool HomPrologEpilog = homogeneousPrologEpilog(MF);
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(-NumBytes), TII,
MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (HomPrologEpilog) {
// Stack has been already adjusted.
NumBytes -= PrologueSaveSize;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
@ -1181,6 +1233,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
if (HomPrologEpilog) {
auto Prolog = MBBI;
--Prolog;
assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
} else {
// Issue sub fp, sp, FPOffset or
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
@ -1189,6 +1247,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
StackOffset::getFixed(FPOffset), TII,
MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
}
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
uint64_t NumWords = NumBytes >> 4;
@ -1615,6 +1674,25 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
// Homogeneous epilog: the callee-saved registers are restored (and their
// stack area released) by the HOM_Epilog pseudo, so the only thing left to
// do here is to release the local stack area in front of it.
if (homogeneousPrologEpilog(MF, &MBB)) {
  assert(!NeedsWinCFI);
  // Insert before the first terminator, or before HOM_Epilog when it
  // immediately precedes that terminator.
  auto LastPopI = MBB.getFirstTerminator();
  if (LastPopI != MBB.begin()) {
    auto HomogeneousEpilog = std::prev(LastPopI);
    if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
      LastPopI = HomogeneousEpilog;
  }

  // Adjust local stack. The offset is positive: the epilog gives the local
  // area back by moving SP up, mirroring the negative offset the prolog uses
  // to allocate it.
  emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                  StackOffset::getFixed(AFI->getLocalStackSize()), TII,
                  MachineInstr::FrameDestroy, false, NeedsWinCFI);

  // SP has been already adjusted while restoring callee save regs.
  // We've bailed-out the case with adjusting SP for arguments.
  assert(AfterCSRPopSize == 0);
  return;
}
bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
@ -2333,6 +2411,22 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MBB.addLiveIn(AArch64::X18);
}
if (homogeneousPrologEpilog(MF)) {
auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
.setMIFlag(MachineInstr::FrameSetup);
for (auto &RPI : RegPairs) {
MIB.addReg(RPI.Reg1);
MIB.addReg(RPI.Reg2);
// Update register live in.
if (!MRI.isReserved(RPI.Reg1))
MBB.addLiveIn(RPI.Reg1);
if (!MRI.isReserved(RPI.Reg2))
MBB.addLiveIn(RPI.Reg2);
}
return true;
}
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
RegPairInfo RPI = *RPII;
@ -2528,6 +2622,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
for (const RegPairInfo &RPI : reverse(RegPairs))
if (!RPI.isScalable())
EmitMI(RPI);
} else if (homogeneousPrologEpilog(MF, &MBB)) {
auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
.setMIFlag(MachineInstr::FrameDestroy);
for (auto &RPI : RegPairs) {
MIB.addReg(RPI.Reg1, RegState::Define);
MIB.addReg(RPI.Reg2, RegState::Define);
}
return true;
} else
for (const RegPairInfo &RPI : RegPairs)
if (!RPI.isScalable())
@ -2597,7 +2699,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// MachO's compact unwind format relies on all registers being stored in
// pairs.
// FIXME: the usual format is actually better if unwinding isn't needed.
if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
!SavedRegs.test(PairedReg)) {
SavedRegs.set(PairedReg);
if (AArch64::GPR64RegClass.contains(PairedReg) &&
@ -2676,7 +2778,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// MachO's compact unwind format relies on all registers being stored in
// pairs, so if we need to spill one extra for BigStack, then we need to
// store the pair.
if (produceCompactUnwindFrame(MF))
if (producePairRegisters(MF))
SavedRegs.set(UnspilledCSGPRPaired);
ExtraCSSpill = UnspilledCSGPR;
}

View File

@ -124,6 +124,16 @@ public:
SmallVectorImpl<int> &ObjectsToAllocate) const override;
private:
/// Returns true if a homogeneous prolog or epilog code can be emitted
/// for the size optimization. If so, HOM_Prolog/HOM_Epilog pseudo
/// instructions are emitted in place. When Exit block is given, this check is
/// for epilog.
bool homogeneousPrologEpilog(MachineFunction &MF,
MachineBasicBlock *Exit = nullptr) const;
/// Returns true if CSRs should be paired.
bool producePairRegisters(MachineFunction &MF) const;
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
uint64_t StackBumpBytes) const;

View File

@ -3896,6 +3896,14 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
Sched<[]>;
}
// Pseudo instructions for homogeneous prolog/epilog. Both take a variable
// operand list and are replaced by the AArch64 homogeneous prolog/epilog
// lowering pass.
let isPseudo = 1 in {
// Save CSRs in order. Operands are the callee-saved registers, pair by pair,
// optionally followed by an immediate frame-pointer offset ({FPOffset}).
def HOM_Prolog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>;
// Restore CSRs in order. Operands are the callee-saved registers, pair by pair.
def HOM_Epilog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>;
}
//===----------------------------------------------------------------------===//
// Floating point immediate move.
//===----------------------------------------------------------------------===//

View File

@ -0,0 +1,613 @@
//===- AArch64LowerHomogeneousPrologEpilog.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that lowers homogeneous prolog/epilog instructions.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64InstPrinter.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>
using namespace llvm;
#define AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME \
"AArch64 homogeneous prolog/epilog lowering pass"
// Lower bound on the outlined-instruction estimate computed by
// shouldUseFrameHelper(); when the estimate is below this value no frame
// helper is used and the loads/stores are emitted in place.
cl::opt<int> FrameHelperSizeThreshold(
"frame-helper-size-threshold", cl::init(2), cl::Hidden,
cl::desc("The minimum number of instructions that are outlined in a frame "
"helper (default = 2)"));
namespace {
/// Module-level driver that replaces HOM_Prolog/HOM_Epilog pseudo
/// instructions with either a call to a frame helper or an in-place
/// sequence of paired stores/loads.
class AArch64LowerHomogeneousPE {
public:
  /// Instruction info of the function currently being processed. It is
  /// assigned by runOnMachineFunction before any lowering takes place and is
  /// null until then.
  const AArch64InstrInfo *TII = nullptr;

  AArch64LowerHomogeneousPE(Module *M, MachineModuleInfo *MMI)
      : M(M), MMI(MMI) {}

  /// Process every function of the module that has a MachineFunction.
  /// @return True when any function was changed.
  bool run();
  /// Process every basic block of \p Fn.
  /// @return True when the function was changed.
  bool runOnMachineFunction(MachineFunction &Fn);

private:
  Module *M;
  MachineModuleInfo *MMI;

  bool runOnMBB(MachineBasicBlock &MBB);
  bool runOnMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
               MachineBasicBlock::iterator &NextMBBI);

  /// Lower a HOM_Prolog pseudo instruction into a helper call
  /// or a sequence of homogeneous stores.
  /// When an fp setup follows, it can be optimized.
  bool lowerProlog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   MachineBasicBlock::iterator &NextMBBI);
  /// Lower a HOM_Epilog pseudo instruction into a helper call
  /// or a sequence of homogeneous loads.
  /// When a return follows, it can be optimized.
  bool lowerEpilog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   MachineBasicBlock::iterator &NextMBBI);
};
/// Legacy pass-manager wrapper: a ModulePass whose runOnModule hands the
/// module and its MachineModuleInfo to AArch64LowerHomogeneousPE.
class AArch64LowerHomogeneousPrologEpilog : public ModulePass {
public:
// Pass identification; its address is what identifies the pass.
static char ID;
// Registers the pass with the global registry on construction.
AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) {
initializeAArch64LowerHomogeneousPrologEpilogPass(
*PassRegistry::getPassRegistry());
}
// MachineModuleInfo is required to reach the per-function machine code.
// NOTE(review): setPreservesAll() is declared although the lowering rewrites
// machine instructions and may add helper functions to the module -- confirm
// this is intended.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addPreserved<MachineModuleInfoWrapperPass>();
AU.setPreservesAll();
ModulePass::getAnalysisUsage(AU);
}
bool runOnModule(Module &M) override;
// Human-readable pass name, shared with the INITIALIZE_PASS registration.
StringRef getPassName() const override {
return AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME;
}
};
} // end anonymous namespace
// Storage for the pass ID, followed by the registration of the pass under the
// command-line name "aarch64-lower-homogeneous-prolog-epilog".
char AArch64LowerHomogeneousPrologEpilog::ID = 0;
INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilog,
"aarch64-lower-homogeneous-prolog-epilog",
AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME, false, false)
/// Pass entry point: unless the module is to be skipped, run the lowering
/// over it using the MachineModuleInfo provided by the pass manager.
bool AArch64LowerHomogeneousPrologEpilog::runOnModule(Module &M) {
  if (skipModule(M))
    return false;

  auto &MMIWrapper = getAnalysis<MachineModuleInfoWrapperPass>();
  AArch64LowerHomogeneousPE Lowering(&M, &MMIWrapper.getMMI());
  return Lowering.run();
}
bool AArch64LowerHomogeneousPE::run() {
bool Changed = false;
for (auto &F : *M) {
if (F.empty())
continue;
MachineFunction *MF = MMI->getMachineFunction(F);
if (!MF)
continue;
Changed |= runOnMachineFunction(*MF);
}
return Changed;
}
// Kinds of outlined frame helpers:
//   Prolog      - stores the callee-saved registers.
//   PrologFrame - like Prolog, and additionally sets up the frame pointer.
//   Epilog      - reloads the callee-saved registers and returns to its caller.
//   EpilogTail  - reloads the callee-saved registers and performs the
//                 function's own return (reached through a tail call).
enum FrameHelperType { Prolog, PrologFrame, Epilog, EpilogTail };
/// Build the symbol name of a frame helper: a prefix that depends on the
/// helper type (including the FP offset for PrologFrame), followed by the
/// names of all handled registers in order.
/// For example, a prolog helper that saves x19 and x20 is named
/// OUTLINED_FUNCTION_PROLOG_x19x20.
static std::string getFrameHelperName(SmallVectorImpl<unsigned> &Regs,
                                      FrameHelperType Type, unsigned FpOffset) {
  std::ostringstream Name;

  // Type-specific prefix.
  if (Type == FrameHelperType::Prolog)
    Name << "OUTLINED_FUNCTION_PROLOG_";
  else if (Type == FrameHelperType::PrologFrame)
    Name << "OUTLINED_FUNCTION_PROLOG_FRAME" << FpOffset << "_";
  else if (Type == FrameHelperType::Epilog)
    Name << "OUTLINED_FUNCTION_EPILOG_";
  else if (Type == FrameHelperType::EpilogTail)
    Name << "OUTLINED_FUNCTION_EPILOG_TAIL_";

  // Register list suffix, concatenated without separators.
  for (unsigned Reg : Regs)
    Name << AArch64InstPrinter::getRegisterName(Reg);

  return Name.str();
}
/// Create the IR Function and the MachineFunction for a frame helper called
/// \p Name, which must not exist in the module yet.
/// The returned MachineFunction contains a single, still empty
/// MachineBasicBlock for the caller to fill in.
static MachineFunction &createFrameHelperMachineFunction(Module *M,
                                                         MachineModuleInfo *MMI,
                                                         StringRef Name) {
  LLVMContext &Ctx = M->getContext();
  Function *F = M->getFunction(Name);
  assert(F == nullptr && "Function has been created before");

  // The helper has the IR signature `void()`.
  FunctionType *HelperTy = FunctionType::get(Type::getVoidTy(Ctx), false);
  F = Function::Create(HelperTy, Function::ExternalLinkage, Name, M);
  assert(F && "Function was null!");

  // Use ODR linkage to avoid duplication.
  F->setLinkage(GlobalValue::LinkOnceODRLinkage);
  F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

  // Set no-opt/minsize, so we don't insert padding between outlined
  // functions.
  F->addFnAttr(Attribute::OptimizeNone);
  F->addFnAttr(Attribute::NoInline);
  F->addFnAttr(Attribute::MinSize);
  F->addFnAttr(Attribute::Naked);

  MachineFunction &HelperMF = MMI->getOrCreateMachineFunction(*F);

  // Remove unnecessary register liveness and set NoVRegs.
  auto &Props = HelperMF.getProperties();
  Props.reset(MachineFunctionProperties::Property::TracksLiveness);
  Props.reset(MachineFunctionProperties::Property::IsSSA);
  Props.set(MachineFunctionProperties::Property::NoVRegs);
  HelperMF.getRegInfo().freezeReservedRegs(HelperMF);

  // Give the IR function a body: one "entry" block that just returns.
  BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> Builder(EntryBB);
  Builder.CreateRetVoid();

  // Put one empty machine basic block at the front of the machine function.
  MachineBasicBlock *HelperMBB = HelperMF.CreateMachineBasicBlock();
  HelperMF.insert(HelperMF.begin(), HelperMBB);

  return HelperMF;
}
/// Insert one frame-setup store-pair of \p Reg2 and \p Reg1 (in that operand
/// order) relative to SP before \p Pos.
/// Both registers must be of the same kind; FPR64 registers select the D
/// forms of the instruction, anything else the X forms. With \p IsPreDec the
/// pre-indexed form is used, which also writes SP.
static void emitStore(MachineFunction &MF, MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Pos,
                      const TargetInstrInfo &TII, unsigned Reg1, unsigned Reg2,
                      int Offset, bool IsPreDec) {
  bool IsFloat = AArch64::FPR64RegClass.contains(Reg1);
  assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2)));

  const unsigned PreIndexedOpc = IsFloat ? AArch64::STPDpre : AArch64::STPXpre;
  const unsigned OffsetOpc = IsFloat ? AArch64::STPDi : AArch64::STPXi;
  const unsigned Opc = IsPreDec ? PreIndexedOpc : OffsetOpc;

  MachineInstrBuilder Store = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc));
  // The pre-indexed form has SP write-back as its first (defined) operand.
  if (IsPreDec)
    Store.addDef(AArch64::SP);
  Store.addReg(Reg2);
  Store.addReg(Reg1);
  Store.addReg(AArch64::SP);
  Store.addImm(Offset);
  Store.setMIFlag(MachineInstr::FrameSetup);
}
/// Emit a load-pair instruction for frame-destroy.
/// Inserts one load-pair of \p Reg2 and \p Reg1 (in that operand order)
/// relative to SP before \p Pos. Both registers must be of the same kind;
/// FPR64 registers select the D forms of the instruction, anything else the
/// X forms. With \p IsPostDec the post-indexed form is used, which also
/// writes SP.
static void emitLoad(MachineFunction &MF, MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator Pos,
                     const TargetInstrInfo &TII, unsigned Reg1, unsigned Reg2,
                     int Offset, bool IsPostDec) {
  bool IsFloat = AArch64::FPR64RegClass.contains(Reg1);
  assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2)));
  unsigned Opc;
  if (IsPostDec)
    Opc = IsFloat ? AArch64::LDPDpost : AArch64::LDPXpost;
  else
    Opc = IsFloat ? AArch64::LDPDi : AArch64::LDPXi;

  MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc));
  // The post-indexed form has SP write-back as its first (defined) operand.
  if (IsPostDec)
    MIB.addDef(AArch64::SP);
  // The loaded registers are written by the instruction, so they must be
  // added as definitions rather than as uses.
  MIB.addReg(Reg2, RegState::Define)
      .addReg(Reg1, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(Offset)
      .setMIFlag(MachineInstr::FrameDestroy);
}
/// Return the frame helper function for the given Regs and frame type,
/// creating it (and its machine code) the first time it is requested.
/// The helper is looked up by the name from getFrameHelperName, so the same
/// register list, type and FP offset always map to the same function.
/// 1) _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22:
/// stp x22, x21, [sp, #-32]! ; x29/x30 has been stored at the caller
/// stp x20, x19, [sp, #16]
/// ret
///
/// 2) _OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22:
/// stp x22, x21, [sp, #-32]! ; x29/x30 has been stored at the caller
/// stp x20, x19, [sp, #16]
/// add fp, sp, #32
/// ret
///
/// 3) _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22:
/// mov x16, x30
/// ldp x29, x30, [sp, #32]
/// ldp x20, x19, [sp, #16]
/// ldp x22, x21, [sp], #48
/// ret x16
///
/// 4) _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22:
/// ldp x29, x30, [sp, #32]
/// ldp x20, x19, [sp, #16]
/// ldp x22, x21, [sp], #48
/// ret
///
/// The immediates passed to emitStore/emitLoad below are counted in register
/// slots; in the examples above a slot corresponds to 8 bytes (e.g. an
/// immediate of 2 is printed as #16).
/// @param M module
/// @param MMI machine module info
/// @param Regs callee save regs that the helper will handle; at least one pair
/// @param Type frame helper type
/// @param FpOffset frame-pointer offset; used in the helper name and, for
///        PrologFrame, as the immediate of the emitted `add fp, sp, #imm`
/// @return a helper function
static Function *getOrCreateFrameHelper(Module *M, MachineModuleInfo *MMI,
SmallVectorImpl<unsigned> &Regs,
FrameHelperType Type,
unsigned FpOffset = 0) {
assert(Regs.size() >= 2);
// Reuse the helper if one with this name has already been created.
auto Name = getFrameHelperName(Regs, Type, FpOffset);
auto *F = M->getFunction(Name);
if (F)
return F;
// Otherwise create an empty helper and fill its single block below.
auto &MF = createFrameHelperMachineFunction(M, MMI, Name);
MachineBasicBlock &MBB = *MF.begin();
const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetInstrInfo &TII = *STI.getInstrInfo();
int Size = (int)Regs.size();
switch (Type) {
case FrameHelperType::Prolog:
case FrameHelperType::PrologFrame: {
// Compute the remaining SP adjust beyond FP/LR.
auto LRIdx = std::distance(
Regs.begin(), std::find(Regs.begin(), Regs.end(), AArch64::LR));
// If the register stored to the lowest address is not LR, we must subtract
// more from SP here.
if (LRIdx != Size - 2) {
assert(Regs[Size - 2] != AArch64::LR);
// Pre-decrementing store of the last register pair.
emitStore(MF, MBB, MBB.end(), TII, Regs[Size - 2], Regs[Size - 1],
LRIdx - Size + 2, true);
}
// Store CSRs in the reverse order.
// I walks the odd indices downwards, so (I - 1, I) is always a full pair
// and I - 1 never becomes negative.
for (int I = Size - 3; I >= 0; I -= 2) {
// FP/LR has been stored at call-site.
if (Regs[I - 1] == AArch64::LR)
continue;
emitStore(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I], Size - I - 1,
false);
}
// PrologFrame helpers additionally set up the frame pointer from SP.
if (Type == FrameHelperType::PrologFrame)
BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ADDXri))
.addDef(AArch64::FP)
.addUse(AArch64::SP)
.addImm(FpOffset)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
// Return to the caller through LR.
BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
.addReg(AArch64::LR);
break;
}
case FrameHelperType::Epilog:
case FrameHelperType::EpilogTail:
if (Type == FrameHelperType::Epilog)
// Stash LR to X16
// The loads below overwrite LR, and the helper still has to return to
// its own caller, so the return address is kept in X16.
BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs))
.addDef(AArch64::X16)
.addReg(AArch64::XZR)
.addUse(AArch64::LR)
.addImm(0);
// Reload every pair except the last one with plain SP-relative loads.
for (int I = 0; I < Size - 2; I += 2)
emitLoad(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1], Size - I - 2,
false);
// Restore the last CSR with post-increment of SP.
emitLoad(MF, MBB, MBB.end(), TII, Regs[Size - 2], Regs[Size - 1], Size,
true);
// Epilog returns through the stashed X16; EpilogTail returns through the
// LR value that has just been reloaded.
BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
.addReg(Type == FrameHelperType::Epilog ? AArch64::X16 : AArch64::LR);
break;
}
return M->getFunction(Name);
}
/// Decide whether expanding a HOM_Prolog/HOM_Epilog pseudo through a frame
/// helper of the given type is both possible and worthwhile.
/// The number of instructions a helper would absorb is estimated and compared
/// against FrameHelperSizeThreshold.
/// @param MBB machine basic block
/// @param NextMBBI next instruction following HOM_Prolog/HOM_Epilog
/// @param Regs callee save registers that are saved or restored.
/// @param Type frame helper type
/// @return True if a use of helper is qualified.
static bool shouldUseFrameHelper(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator &NextMBBI,
                                 SmallVectorImpl<unsigned> &Regs,
                                 FrameHelperType Type) {
  const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
  auto RegCount = Regs.size();
  assert(RegCount > 0 && (RegCount % 2 == 0));

  // Do not use a helper call when not saving LR.
  const bool SavesLR =
      std::find(Regs.begin(), Regs.end(), AArch64::LR) != Regs.end();
  if (!SavesLR)
    return false;

  // # of instructions that will be outlined: one per register pair, adjusted
  // per helper type below.
  int OutlinedInsts = RegCount / 2;

  if (Type == FrameHelperType::Prolog) {
    // Prolog helper cannot save FP/LR.
    --OutlinedInsts;
  } else if (Type == FrameHelperType::Epilog) {
    // Bail-out if X16 is live across the epilog helper because it is used in
    // the helper to handle X30.
    for (auto It = NextMBBI, End = MBB.end(); It != End; ++It)
      if (It->readsRegister(AArch64::W16, TRI))
        return false;
    // Epilog may not be in the last block. Check the liveness in successors.
    for (const MachineBasicBlock *Succ : MBB.successors())
      if (Succ->isLiveIn(AArch64::W16) || Succ->isLiveIn(AArch64::X16))
        return false;
    // No change in the count for the regular epilog case.
  } else if (Type == FrameHelperType::EpilogTail) {
    // EpilogTail helper includes the caller's return, so a return must
    // directly follow the pseudo.
    if (NextMBBI == MBB.end())
      return false;
    if (NextMBBI->getOpcode() != AArch64::RET_ReallyLR)
      return false;
    ++OutlinedInsts;
  }
  // PrologFrame: effectively no change in the count since the FP adjustment
  // is included.

  return OutlinedInsts >= FrameHelperSizeThreshold;
}
/// Lower a HOM_Epilog pseudo instruction into a helper call while
/// creating the helper on demand. Or emit a sequence of loads in place when not
/// using a helper call.
///
/// 1. With a helper including ret
/// HOM_Epilog x30, x29, x19, x20, x21, x22 ; MBBI
/// ret ; NextMBBI
/// =>
/// b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22
/// ... ; NextMBBI
///
/// 2. With a helper
/// HOM_Epilog x30, x29, x19, x20, x21, x22
/// =>
/// bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22
///
/// 3. Without a helper
/// HOM_Epilog x30, x29, x19, x20, x21, x22
/// =>
/// ldp x29, x30, [sp, #32]
/// ldp x20, x19, [sp, #16]
/// ldp x22, x21, [sp], #48
///
/// @param MBB block containing the pseudo
/// @param MBBI the HOM_Epilog instruction
/// @param NextMBBI instruction following the pseudo; advanced past the return
///        when the return is folded into the tail call (case 1)
/// @return False when the pseudo has no register operands (nothing is
///         changed), true otherwise.
bool AArch64LowerHomogeneousPE::lowerEpilog(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
auto &MF = *MBB.getParent();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
// Collect the register operands of the pseudo, in operand order.
SmallVector<unsigned, 8> Regs;
for (auto &MO : MI.operands())
if (MO.isReg())
Regs.push_back(MO.getReg());
int Size = (int)Regs.size();
if (Size == 0)
return false;
// Registers are in pair.
assert(Size % 2 == 0);
assert(MI.getOpcode() == AArch64::HOM_Epilog);
// Remember the instruction after the pseudo; in case 1 it is the return
// that gets folded into the tail call.
auto Return = NextMBBI;
if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::EpilogTail)) {
// When MBB ends with a return, emit a tail-call to the epilog helper
auto *EpilogTailHelper =
getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::EpilogTail);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::TCRETURNdi))
.addGlobalAddress(EpilogTailHelper)
.addImm(0)
.setMIFlag(MachineInstr::FrameDestroy)
.copyImplicitOps(MI)
.copyImplicitOps(*Return);
// The return is replaced by the tail call: step the caller's iterator past
// it before unlinking it.
NextMBBI = std::next(Return);
Return->removeFromParent();
} else if (shouldUseFrameHelper(MBB, NextMBBI, Regs,
FrameHelperType::Epilog)) {
// The default epilog helper case.
auto *EpilogHelper =
getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Epilog);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
.addGlobalAddress(EpilogHelper)
.setMIFlag(MachineInstr::FrameDestroy)
.copyImplicitOps(MI);
} else {
// Fall back to no-helper.
// Every pair except the last one is reloaded with a plain SP-relative load.
for (int I = 0; I < Size - 2; I += 2)
emitLoad(MF, MBB, MBBI, *TII, Regs[I], Regs[I + 1], Size - I - 2, false);
// Restore the last CSR with post-increment of SP.
emitLoad(MF, MBB, MBBI, *TII, Regs[Size - 2], Regs[Size - 1], Size, true);
}
// The pseudo itself is no longer needed.
MBBI->removeFromParent();
return true;
}
/// Lower a HOM_Prolog pseudo instruction into a helper call while
/// creating the helper on demand. Or emit a sequence of stores in place when
/// not using a helper call.
///
/// 1. With a helper including frame-setup
/// HOM_Prolog x30, x29, x19, x20, x21, x22, 32
/// =>
/// stp x29, x30, [sp, #-16]!
/// bl _OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22
///
/// 2. With a helper
/// HOM_Prolog x30, x29, x19, x20, x21, x22
/// =>
/// stp x29, x30, [sp, #-16]!
/// bl _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22
///
/// 3. Without a helper
/// HOM_Prolog x30, x29, x19, x20, x21, x22
/// =>
/// stp x22, x21, [sp, #-48]!
/// stp x20, x19, [sp, #16]
/// stp x29, x30, [sp, #32]
///
/// @param MBB block containing the pseudo
/// @param MBBI the HOM_Prolog instruction
/// @param NextMBBI instruction following the pseudo
/// @return False when the pseudo has no register operands (nothing is
///         changed), true otherwise.
bool AArch64LowerHomogeneousPE::lowerProlog(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
auto &MF = *MBB.getParent();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
SmallVector<unsigned, 8> Regs;
// Position of LR within Regs; it determines how far SP is pre-decremented
// when FP/LR are stored at the call site.
int LRIdx = 0;
// Set only when the pseudo carries an immediate operand (the FP offset).
Optional<int> FpOffset;
for (auto &MO : MI.operands()) {
if (MO.isReg()) {
if (MO.getReg() == AArch64::LR)
LRIdx = Regs.size();
Regs.push_back(MO.getReg());
} else if (MO.isImm()) {
FpOffset = MO.getImm();
}
}
int Size = (int)Regs.size();
if (Size == 0)
return false;
// Allow compact unwind case only for now.
assert(Size % 2 == 0);
assert(MI.getOpcode() == AArch64::HOM_Prolog);
if (FpOffset &&
shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::PrologFrame)) {
// FP/LR is stored at the top of stack before the prolog helper call.
emitStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP, -LRIdx - 2, true);
auto *PrologFrameHelper = getOrCreateFrameHelper(
M, MMI, Regs, FrameHelperType::PrologFrame, *FpOffset);
// The call also defines FP (the helper sets it up) and reads SP.
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
.addGlobalAddress(PrologFrameHelper)
.setMIFlag(MachineInstr::FrameSetup)
.copyImplicitOps(MI)
.addReg(AArch64::FP, RegState::Implicit | RegState::Define)
.addReg(AArch64::SP, RegState::Implicit);
} else if (!FpOffset && shouldUseFrameHelper(MBB, NextMBBI, Regs,
FrameHelperType::Prolog)) {
// FP/LR is stored at the top of stack before the prolog helper call.
emitStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP, -LRIdx - 2, true);
auto *PrologHelper =
getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Prolog);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
.addGlobalAddress(PrologHelper)
.setMIFlag(MachineInstr::FrameSetup)
.copyImplicitOps(MI);
} else {
// Fall back to no-helper.
// The last pair is stored first with a pre-decrement covering all slots.
emitStore(MF, MBB, MBBI, *TII, Regs[Size - 2], Regs[Size - 1], -Size, true);
// Remaining pairs in reverse order; I walks the odd indices downwards, so
// I - 1 never becomes negative.
for (int I = Size - 3; I >= 0; I -= 2)
emitStore(MF, MBB, MBBI, *TII, Regs[I - 1], Regs[I], Size - I - 1, false);
// Set up the frame pointer in place when an FP offset was requested.
if (FpOffset) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri))
.addDef(AArch64::FP)
.addUse(AArch64::SP)
.addImm(*FpOffset)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
}
}
// The pseudo itself is no longer needed.
MBBI->removeFromParent();
return true;
}
/// Dispatch a single machine instruction: HOM_Prolog and HOM_Epilog are
/// handed to their lowering routines, everything else is left alone.
/// @param MBB machine basic block
/// @param MBBI current instruction iterator
/// @param NextMBBI next instruction iterator which can be updated
/// @return True when IR is changed.
bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        MachineBasicBlock::iterator &NextMBBI) {
  switch (MBBI->getOpcode()) {
  case AArch64::HOM_Prolog:
    return lowerProlog(MBB, MBBI, NextMBBI);
  case AArch64::HOM_Epilog:
    return lowerEpilog(MBB, MBBI, NextMBBI);
  default:
    // Not a homogeneous prolog/epilog pseudo.
    return false;
  }
}
/// Walk all instructions of \p MBB and lower the pseudos found.
/// The successor is computed before an instruction is processed because the
/// lowering removes the current instruction and may move the successor.
/// @return True when the block was changed.
bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (MachineBasicBlock::iterator Cur = MBB.begin(), End = MBB.end();
       Cur != End;) {
    MachineBasicBlock::iterator Next = std::next(Cur);
    Changed |= runOnMI(MBB, Cur, Next);
    Cur = Next;
  }
  return Changed;
}
/// Record the instruction info of \p MF in TII, then process each of its
/// basic blocks.
/// @return True when the function was changed.
bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
  const TargetInstrInfo *GenericTII = MF.getSubtarget().getInstrInfo();
  TII = static_cast<const AArch64InstrInfo *>(GenericTII);

  bool Changed = false;
  for (MachineBasicBlock &Block : MF)
    Changed |= runOnMBB(Block);
  return Changed;
}
/// Factory for the homogeneous prolog/epilog lowering pass. Returns a newly
/// allocated pass object.
ModulePass *llvm::createAArch64LowerHomogeneousPrologEpilogPass() {
return new AArch64LowerHomogeneousPrologEpilog();
}

View File

@ -161,6 +161,8 @@ static cl::opt<bool>
cl::desc("Enable the AAcrh64 branch target pass"),
cl::init(true));
extern cl::opt<bool> EnableHomogeneousPrologEpilog;
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@ -197,6 +199,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64SLSHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
initializeAArch64StackTaggingPreRAPass(*PR);
initializeAArch64LowerHomogeneousPrologEpilogPass(*PR);
}
//===----------------------------------------------------------------------===//
@ -634,6 +637,9 @@ void AArch64PassConfig::addPostRegAlloc() {
}
void AArch64PassConfig::addPreSched2() {
// Lower homogeneous frame instructions
if (EnableHomogeneousPrologEpilog)
addPass(createAArch64LowerHomogeneousPrologEpilogPass());
// Expand some pseudo instructions to allow proper scheduling.
addPass(createAArch64ExpandPseudoPass());
// Use load/store pair instructions when possible.

View File

@ -59,6 +59,7 @@ add_llvm_target(AArch64CodeGen
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
AArch64LoadStoreOptimizer.cpp
AArch64LowerHomogeneousPrologEpilog.cpp
AArch64MachineFunctionInfo.cpp
AArch64MacroFusion.cpp
AArch64MCInstLower.cpp

View File

@ -0,0 +1,40 @@
# RUN: llc -mtriple=arm64-applie-ios7.0 -start-before=aarch64-lower-homogeneous-prolog-epilog -homogeneous-prolog-epilog %s -o - | FileCheck %s
#
# This test ensure no outlined epilog is formed when X16 is live across the helper.
--- |
@FuncPtr = local_unnamed_addr global i32 (i32)* null, align 8
define i32 @_Z3fooi(i32) minsize "frame-pointer"="all" {
ret i32 0
}
declare i32 @_Z3gooii(i32, i32)
...
---
name: _Z3fooi
tracksRegLiveness: true
body: |
bb.0:
liveins: $w0, $lr, $x19, $x20
successors: %bb.1
frame-setup HOM_Prolog $lr, $fp, $x19, $x20, 16
frame-setup CFI_INSTRUCTION def_cfa $w29, 16
frame-setup CFI_INSTRUCTION offset $w30, -8
frame-setup CFI_INSTRUCTION offset $w29, -16
frame-setup CFI_INSTRUCTION offset $w19, -24
frame-setup CFI_INSTRUCTION offset $w20, -32
$w19 = nsw ADDWri $w0, 1, 0
$w1 = ORRWrr $wzr, $w0
$w0 = ORRWrr $wzr, $w19
BL @_Z3gooii, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def $w0
$x8 = ADRP target-flags(aarch64-page) @FuncPtr
$x16 = LDRXui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @FuncPtr
$w0 = nsw ADDWrr renamable $w0, killed renamable $w19
$lr, $fp, $x19, $x20 = frame-destroy HOM_Epilog
B %bb.1
bb.1:
liveins: $w0, $x16
TCRETURNri killed renamable $x16, 0, csr_aarch64_aapcs, implicit $sp, implicit $w0
# CHECK: _OUTLINED_FUNCTION_PROLOG_FRAME16_x30x29x19x20:
# CHECK-NOT: _OUTLINED_FUNCTION_EPILOG_x30x29x19x20:

View File

@ -0,0 +1,85 @@
; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog | FileCheck %s
; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-LINUX
; CHECK-LABEL: __Z3foofffi:
; CHECK: stp x29, x30, [sp, #-16]!
; CHECK-NEXT: bl _OUTLINED_FUNCTION_PROLOG_FRAME48_x30x29x19x20d8d9d10d11
; CHECK: bl __Z3goof
; CHECK: bl __Z3goof
; CHECK: b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11
; CHECK-LINUX-LABEL: _Z3foofffi:
; CHECK-LINUX: stp x29, x30, [sp, #-32]!
; CHECK-LINUX-NEXT: bl OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x30x29d8d9d10d11
; CHECK-LINUX: bl _Z3goof
; CHECK-LINUX: bl _Z3goof
; CHECK-LINUX: b OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11
; Test input: a minsize function with a non-leaf frame pointer that makes two
; calls to @_Z3goof and combines their results with floating-point arithmetic.
; Per the FileCheck directives above, both the Darwin and the Linux run expect
; the prolog to call an OUTLINED_FUNCTION_PROLOG_FRAME helper right after an
; inline "stp x29, x30", and the epilog to branch to an
; OUTLINED_FUNCTION_EPILOG_TAIL helper.
define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) ssp minsize "frame-pointer"="non-leaf" {
entry:
%inc = fadd float %b, 1.000000e+00
%add = fadd float %inc, %x
%add1 = fadd float %add, %y
%conv = sitofp i32 %z to float
%sub = fsub float %add1, %conv
%dec = add nsw i32 %z, -1
%call = tail call float @_Z3goof(float %inc) #2
%call2 = tail call float @_Z3goof(float %sub) #2
%add3 = fadd float %call, %call2
%mul = fmul float %inc, %add3
%add4 = fadd float %sub, %mul
%conv5 = sitofp i32 %dec to float
%sub6 = fsub float %add4, %conv5
ret float %sub6
}
; CHECK-LABEL: __Z3zoov:
; CHECK: stp x29, x30, [sp, #-16]!
; CHECK: bl __Z3hoo
; CHECK: b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29
; Test input: a minsize function that calls @_Z3hoov and returns the result
; plus one. The Darwin directives above expect an inline "stp x29, x30" prolog
; and an epilog that branches to the OUTLINED_FUNCTION_EPILOG_TAIL_x30x29
; helper.
define i32 @_Z3zoov() nounwind ssp minsize {
%1 = tail call i32 @_Z3hoov() #2
%2 = add nsw i32 %1, 1
ret i32 %2
}
declare float @_Z3goof(float) nounwind ssp minsize
declare i32 @_Z3hoov() nounwind ssp optsize
; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_FRAME48_x30x29x19x20d8d9d10d11:
; CHECK: stp d11, d10, [sp, #-48]!
; CHECK-NEXT: stp d9, d8, [sp, #16]
; CHECK-NEXT: stp x20, x19, [sp, #32]
; CHECK-NEXT: add x29, sp, #48
; CHECK-NEXT: ret
; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11:
; CHECK: ldp x29, x30, [sp, #48]
; CHECK-NEXT: ldp x20, x19, [sp, #32]
; CHECK-NEXT: ldp d9, d8, [sp, #16]
; CHECK-NEXT: ldp d11, d10, [sp], #64
; CHECK-NEXT: ret
; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29:
; CHECK: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x30x29d8d9d10d11:
; CHECK-LINUX: stp d11, d10, [sp, #-32]!
; CHECK-LINUX-NEXT: stp d9, d8, [sp, #16]
; CHECK-LINUX-NEXT: stp x20, x19, [sp, #48]
; CHECK-LINUX-NEXT: add x29, sp, #32
; CHECK-LINUX-NEXT: ret
; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11:
; CHECK-LINUX: ldp x20, x19, [sp, #48]
; CHECK-LINUX-NEXT: ldp x29, x30, [sp, #32]
; CHECK-LINUX-NEXT: ldp d9, d8, [sp, #16]
; CHECK-LINUX-NEXT: ldp d11, d10, [sp], #64
; CHECK-LINUX-NEXT: ret
; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x30x29:
; CHECK-LINUX: ldp x29, x30, [sp], #16
; CHECK-LINUX-NEXT: ret

View File

@ -0,0 +1,70 @@
; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s
; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s --check-prefixes=CHECK-LINUX
; CHECK-LABEL: __Z3foofffi:
; CHECK: stp d11, d10, [sp, #-64]!
; CHECK-NEXT: stp d9, d8, [sp, #16]
; CHECK-NEXT: stp x20, x19, [sp, #32]
; CHECK-NEXT: stp x29, x30, [sp, #48]
; CHECK-NEXT: add x29, sp, #48
; CHECK: bl __Z3goof
; CHECK: bl __Z3goof
; CHECK: ldp x29, x30, [sp, #48]
; CHECK: ldp x20, x19, [sp, #32]
; CHECK: ldp d9, d8, [sp, #16]
; CHECK: ldp d11, d10, [sp], #64
; CHECK: ret
; CHECK-LINUX-LABEL: _Z3foofffi:
; CHECK-LINUX: stp d11, d10, [sp, #-64]!
; CHECK-LINUX-NEXT: stp d9, d8, [sp, #16]
; CHECK-LINUX-NEXT: stp x29, x30, [sp, #32]
; CHECK-LINUX-NEXT: stp x20, x19, [sp, #48]
; CHECK-LINUX-NEXT: add x29, sp, #32
; CHECK-LINUX: bl _Z3goof
; CHECK-LINUX: bl _Z3goof
; CHECK-LINUX: ldp x20, x19, [sp, #48]
; CHECK-LINUX: ldp x29, x30, [sp, #32]
; CHECK-LINUX: ldp d9, d8, [sp, #16]
; CHECK-LINUX: ldp d11, d10, [sp], #64
; CHECK-LINUX: ret
; Test input: a minsize, uwtable function with a non-leaf frame pointer that
; makes two calls to @_Z3goof. This file is run with
; -frame-helper-size-threshold=6, and the FileCheck directives above expect
; the callee-save stores and loads to appear inline in the function as stp/ldp
; sequences, ending in a plain ret.
define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) uwtable ssp minsize "frame-pointer"="non-leaf" {
entry:
%inc = fadd float %b, 1.000000e+00
%add = fadd float %inc, %x
%add1 = fadd float %add, %y
%conv = sitofp i32 %z to float
%sub = fsub float %add1, %conv
%dec = add nsw i32 %z, -1
%call = tail call float @_Z3goof(float %inc) #2
%call2 = tail call float @_Z3goof(float %sub) #2
%add3 = fadd float %call, %call2
%mul = fmul float %inc, %add3
%add4 = fadd float %sub, %mul
%conv5 = sitofp i32 %dec to float
%sub6 = fsub float %add4, %conv5
ret float %sub6
}
; CHECK-LABEL: __Z3zoov:
; CHECK: stp x29, x30, [sp, #-16]!
; CHECK: bl __Z3hoo
; CHECK: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
; CHECK-LINUX-LABEL: _Z3zoov:
; CHECK-LINUX: stp x29, x30, [sp, #-16]!
; CHECK-LINUX: bl _Z3hoo
; CHECK-LINUX: ldp x29, x30, [sp], #16
; CHECK-LINUX-NEXT: ret
; Test input: a minsize function that calls @_Z3hoov and returns the result
; plus one. The directives above expect, on both targets, an inline
; "stp x29, x30" prolog and an inline "ldp x29, x30" epilog followed by ret.
define i32 @_Z3zoov() nounwind ssp minsize {
%1 = tail call i32 @_Z3hoov() #2
%2 = add nsw i32 %1, 1
ret i32 %2
}
declare float @_Z3goof(float) nounwind ssp minsize
declare i32 @_Z3hoov() nounwind ssp minsize

View File

@ -0,0 +1,55 @@
; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog | FileCheck %s
; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-LINUX
; CHECK-LABEL: __Z3hooii:
; CHECK: stp x29, x30, [sp, #-16]!
; CHECK-NEXT: bl _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22
; CHECK: bl __Z3gooi
; CHECK: bl __Z3gooi
; CHECK: bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22
; CHECK-NEXT: b __Z3gooi
; CHECK-LINUX-LABEL: _Z3hooii:
; CHECK-LINUX: stp x29, x30, [sp, #-48]!
; CHECK-LINUX-NEXT: bl OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29
; CHECK-LINUX: bl _Z3gooi
; CHECK-LINUX: bl _Z3gooi
; CHECK-LINUX: bl OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29
; CHECK-LINUX-NEXT: b _Z3gooi
; Test input: a minsize function that calls @_Z3gooi on each argument, sums
; the arguments and both results, and returns the result of a final call to
; @_Z3gooi on that sum. The directives above expect the first two calls as
; "bl", then a "bl" to an OUTLINED_FUNCTION_EPILOG helper immediately followed
; by a "b" to _Z3gooi for the final call. The epilog helper checked below
; copies x30 to x16 before reloading the saved registers and returns via x16.
define i32 @_Z3hooii(i32 %b, i32 %a) nounwind ssp minsize {
%1 = tail call i32 @_Z3gooi(i32 %b)
%2 = tail call i32 @_Z3gooi(i32 %a)
%3 = add i32 %a, %b
%4 = add i32 %3, %1
%5 = add i32 %4, %2
%6 = tail call i32 @_Z3gooi(i32 %5)
ret i32 %6
}
declare i32 @_Z3gooi(i32);
; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22:
; CHECK: stp x22, x21, [sp, #-32]!
; CHECK-NEXT: stp x20, x19, [sp, #16]
; CHECK-NEXT: ret
; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22:
; CHECK: mov x16, x30
; CHECK-NEXT: ldp x29, x30, [sp, #32]
; CHECK-NEXT: ldp x20, x19, [sp, #16]
; CHECK-NEXT: ldp x22, x21, [sp], #48
; CHECK-NEXT: ret x16
; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29:
; CHECK-LINUX: stp x22, x21, [sp, #16]
; CHECK-LINUX-NEXT: stp x20, x19, [sp, #32]
; CHECK-LINUX-NEXT: ret
; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29:
; CHECK-LINUX: mov x16, x30
; CHECK-LINUX-NEXT: ldp x20, x19, [sp, #32]
; CHECK-LINUX-NEXT: ldp x22, x21, [sp, #16]
; CHECK-LINUX-NEXT: ldp x29, x30, [sp], #48
; CHECK-LINUX-NEXT: ret x16