forked from OSchip/llvm-project
CXX_FAST_TLS calling convention: performance improvement for AArch64.
The access function has a short entry and a short exit; the initialization block is only run the first time. To improve the performance, we want to have a short frame at the entry and exit. We explicitly handle most of the CSRs via copies. Only the CSRs that are not handled via copies will be in CSR_SaveList. Frame lowering and prologue/epilogue insertion will generate a short frame in the entry and exit according to CSR_SaveList. The majority of the CSRs will be handled by the register allocator. The register allocator will try to spill and reload them in the initialization block. We add CSRsViaCopy, which will be explicitly handled during lowering. 1> we first set FunctionLoweringInfo->SplitCSR if conditions are met (the target supports it for the given machine function and the function has only return exits). We also call TLI->initializeSplitCSR to perform initialization. 2> we call TLI->insertCopiesSplitCSR to insert copies from CSRsViaCopy to virtual registers at the beginning of the entry block and copies from virtual registers to CSRsViaCopy at the beginning of the exit blocks. 3> we also need to make sure the explicit copies will not be eliminated. The target independent portion was committed as r255353. rdar://problem/23557469 Differential Revision: http://reviews.llvm.org/D15341 llvm-svn: 255821
This commit is contained in:
parent
9a5b052f7c
commit
cbe4f9417d
|
@ -288,6 +288,14 @@ def CSR_AArch64_CXX_TLS_Darwin
|
|||
(sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
|
||||
(sequence "D%u", 0, 31))>;
|
||||
|
||||
// CSRs that are handled by prologue, epilogue.
|
||||
def CSR_AArch64_CXX_TLS_Darwin_PE
|
||||
: CalleeSavedRegs<(add LR, FP)>;
|
||||
|
||||
// CSRs that are handled explicitly via copies.
|
||||
def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
|
||||
: CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
|
||||
|
||||
// The ELF stub used for TLS-descriptor access saves every feasible
|
||||
// register. Only X0 and LR are clobbered.
|
||||
def CSR_AArch64_TLS_ELF
|
||||
|
|
|
@ -3646,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
|
|||
if (F.isVarArg())
|
||||
return false;
|
||||
|
||||
if (TLI.supportSplitCSR(FuncInfo.MF))
|
||||
return false;
|
||||
|
||||
// Build a list of return value registers.
|
||||
SmallVector<unsigned, 4> RetRegs;
|
||||
|
||||
|
|
|
@ -3271,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
|||
Flag = Chain.getValue(1);
|
||||
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
|
||||
}
|
||||
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
|
||||
const MCPhysReg *I =
|
||||
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
|
||||
if (I) {
|
||||
for (; *I; ++I) {
|
||||
if (AArch64::GPR64RegClass.contains(*I))
|
||||
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
|
||||
else if (AArch64::FPR64RegClass.contains(*I))
|
||||
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
|
||||
else
|
||||
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
|
||||
}
|
||||
}
|
||||
|
||||
RetOps[0] = Chain; // Update chain.
|
||||
|
||||
|
@ -10003,3 +10016,49 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons
|
|||
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
|
||||
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
|
||||
}
|
||||
|
||||
/// Marks the machine function that contains \p Entry as using split CSR
/// handling by setting the IsSplitCSR flag on its AArch64FunctionInfo.
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
|
||||
// Update IsSplitCSR in AArch64FunctionInfo.
|
||||
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
|
||||
AFI->setIsSplitCSR(true);
|
||||
}
|
||||
|
||||
/// For every register returned by getCalleeSavedRegsViaCopy, inserts a COPY
/// from the physical register into a fresh virtual register at the start of
/// \p Entry, and a COPY from that virtual register back into the physical
/// register at the start of each block in \p Exits. Does nothing when the
/// register info reports no via-copy list (nullptr) for this function.
void AArch64TargetLowering::insertCopiesSplitCSR(
|
||||
MachineBasicBlock *Entry,
|
||||
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
|
||||
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
|
||||
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
|
||||
// A null list means there are no CSRs to handle via copies.
if (!IStart)
|
||||
return;
|
||||
|
||||
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
|
||||
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
|
||||
// The list is terminated by a zero entry.
for (const MCPhysReg *I = IStart; *I; ++I) {
|
||||
// Only 64-bit GPRs and 64-bit FPRs are expected in the via-copy list; any
// other register class hits the llvm_unreachable below.
const TargetRegisterClass *RC = nullptr;
|
||||
if (AArch64::GPR64RegClass.contains(*I))
|
||||
RC = &AArch64::GPR64RegClass;
|
||||
else if (AArch64::FPR64RegClass.contains(*I))
|
||||
RC = &AArch64::FPR64RegClass;
|
||||
else
|
||||
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
|
||||
|
||||
unsigned NewVR = MRI->createVirtualRegister(RC);
|
||||
// Create copy from CSR to a virtual register.
|
||||
// FIXME: this currently does not emit CFI pseudo-instructions, it works
|
||||
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
|
||||
// nounwind. If we want to generalize this later, we may need to emit
|
||||
// CFI pseudo-instructions.
|
||||
assert(Entry->getParent()->getFunction()->hasFnAttribute(
|
||||
Attribute::NoUnwind) &&
|
||||
"Function should be nounwind in insertCopiesSplitCSR!");
|
||||
// The physical register is read by the COPY below, so record it as a
// live-in of the entry block.
Entry->addLiveIn(*I);
|
||||
BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
|
||||
NewVR)
|
||||
.addReg(*I);
|
||||
|
||||
// Copy the saved value back into the physical register in every exit block.
// NOTE(review): the restoring COPY is placed at Exit->begin(), i.e. ahead of
// everything else already in the exit block — confirm this placement is
// intended rather than immediately before the block's terminator.
for (auto *Exit : Exits)
|
||||
BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
|
||||
*I)
|
||||
.addReg(NewVR);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -385,6 +385,14 @@ public:
|
|||
bool isCheapToSpeculateCtlz() const override {
|
||||
return true;
|
||||
}
|
||||
/// Returns true only when the function behind \p MF both uses the
/// CXX_FAST_TLS calling convention and carries the nounwind attribute.
bool supportSplitCSR(MachineFunction *MF) const override {
|
||||
return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
|
||||
MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
|
||||
}
|
||||
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
|
||||
void insertCopiesSplitCSR(
|
||||
MachineBasicBlock *Entry,
|
||||
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
|
||||
|
||||
private:
|
||||
bool isExtFreeImpl(const Instruction *Ext) const override;
|
||||
|
|
|
@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
|
|||
/// registers.
|
||||
unsigned VarArgsFPRSize;
|
||||
|
||||
/// True if this function has a subset of CSRs that is handled explicitly via
|
||||
/// copies.
|
||||
bool IsSplitCSR;
|
||||
|
||||
public:
|
||||
AArch64FunctionInfo()
|
||||
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
|
||||
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
|
||||
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
|
||||
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
|
||||
IsSplitCSR(false) {}
|
||||
|
||||
explicit AArch64FunctionInfo(MachineFunction &MF)
|
||||
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
|
||||
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
|
||||
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
|
||||
VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
|
||||
IsSplitCSR(false) {
|
||||
(void)MF;
|
||||
}
|
||||
|
||||
|
@ -96,6 +102,9 @@ public:
|
|||
bool hasStackFrame() const { return HasStackFrame; }
|
||||
void setHasStackFrame(bool s) { HasStackFrame = s; }
|
||||
|
||||
bool isSplitCSR() const { return IsSplitCSR; }
|
||||
void setIsSplitCSR(bool s) { IsSplitCSR = s; }
|
||||
|
||||
void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
|
||||
unsigned getLocalStackSize() const { return LocalStackSize; }
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "AArch64RegisterInfo.h"
|
||||
#include "AArch64FrameLowering.h"
|
||||
#include "AArch64InstrInfo.h"
|
||||
#include "AArch64MachineFunctionInfo.h"
|
||||
#include "AArch64Subtarget.h"
|
||||
#include "MCTargetDesc/AArch64AddressingModes.h"
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
|
@ -47,11 +48,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
|
|||
if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
|
||||
return CSR_AArch64_AllRegs_SaveList;
|
||||
if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS)
|
||||
return CSR_AArch64_CXX_TLS_Darwin_SaveList;
|
||||
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
|
||||
CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
|
||||
CSR_AArch64_CXX_TLS_Darwin_SaveList;
|
||||
else
|
||||
return CSR_AArch64_AAPCS_SaveList;
|
||||
}
|
||||
|
||||
/// Returns the list of callee-saved registers that are handled via explicit
/// copies for \p MF, or nullptr when there is none. The list is returned only
/// when the function uses the CXX_FAST_TLS calling convention and its
/// AArch64FunctionInfo reports isSplitCSR(). \p MF must not be null.
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
|
||||
const MachineFunction *MF) const {
|
||||
assert(MF && "Invalid MachineFunction pointer.");
|
||||
if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
|
||||
MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
|
||||
return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
|
||||
// No registers are handled via copies for this function.
return nullptr;
|
||||
}
|
||||
|
||||
const uint32_t *
|
||||
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
|
||||
CallingConv::ID CC) const {
|
||||
|
|
|
@ -35,6 +35,8 @@ public:
|
|||
|
||||
/// Code Generation virtual methods...
|
||||
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
|
||||
const MCPhysReg *
|
||||
getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
|
||||
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
|
||||
CallingConv::ID) const override;
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)
|
|||
declare %struct.S* @_ZN1SD1Ev(%struct.S* returned)
|
||||
declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
|
||||
|
||||
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() {
|
||||
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
|
||||
%.b.i = load i1, i1* @__tls_guard, align 1
|
||||
br i1 %.b.i, label %__tls_init.exit, label %init.i
|
||||
|
||||
|
@ -28,50 +28,49 @@ __tls_init.exit:
|
|||
}
|
||||
|
||||
; CHECK-LABEL: _ZTW2sg
|
||||
; CHECK-DAG: stp d31, d30
|
||||
; CHECK-DAG: stp d29, d28
|
||||
; CHECK-DAG: stp d27, d26
|
||||
; CHECK-DAG: stp d25, d24
|
||||
; CHECK-DAG: stp d23, d22
|
||||
; CHECK-DAG: stp d21, d20
|
||||
; CHECK-DAG: stp d19, d18
|
||||
; CHECK-DAG: stp d17, d16
|
||||
; CHECK-DAG: stp d7, d6
|
||||
; CHECK-DAG: stp d5, d4
|
||||
; CHECK-DAG: stp d3, d2
|
||||
; CHECK-DAG: stp d1, d0
|
||||
; CHECK-DAG: stp x20, x19
|
||||
; CHECK-DAG: stp x14, x13
|
||||
; CHECK-DAG: stp x12, x11
|
||||
; CHECK-DAG: stp x10, x9
|
||||
; CHECK-DAG: stp x8, x7
|
||||
; CHECK-DAG: stp x6, x5
|
||||
; CHECK-DAG: stp x4, x3
|
||||
; CHECK-DAG: stp x2, x1
|
||||
; CHECK-DAG: stp x29, x30
|
||||
; CHECK-NOT: stp d31, d30
|
||||
; CHECK-NOT: stp d29, d28
|
||||
; CHECK-NOT: stp d27, d26
|
||||
; CHECK-NOT: stp d25, d24
|
||||
; CHECK-NOT: stp d23, d22
|
||||
; CHECK-NOT: stp d21, d20
|
||||
; CHECK-NOT: stp d19, d18
|
||||
; CHECK-NOT: stp d17, d16
|
||||
; CHECK-NOT: stp d7, d6
|
||||
; CHECK-NOT: stp d5, d4
|
||||
; CHECK-NOT: stp d3, d2
|
||||
; CHECK-NOT: stp d1, d0
|
||||
; CHECK-NOT: stp x20, x19
|
||||
; CHECK-NOT: stp x14, x13
|
||||
; CHECK-NOT: stp x12, x11
|
||||
; CHECK-NOT: stp x10, x9
|
||||
; CHECK-NOT: stp x8, x7
|
||||
; CHECK-NOT: stp x6, x5
|
||||
; CHECK-NOT: stp x4, x3
|
||||
; CHECK-NOT: stp x2, x1
|
||||
; CHECK: blr
|
||||
; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]]
|
||||
; CHECK: blr
|
||||
; CHECK: tlv_atexit
|
||||
; CHECK: [[BB_end]]:
|
||||
; CHECK: blr
|
||||
; CHECK-DAG: ldp x2, x1
|
||||
; CHECK-DAG: ldp x4, x3
|
||||
; CHECK-DAG: ldp x6, x5
|
||||
; CHECK-DAG: ldp x8, x7
|
||||
; CHECK-DAG: ldp x10, x9
|
||||
; CHECK-DAG: ldp x12, x11
|
||||
; CHECK-DAG: ldp x14, x13
|
||||
; CHECK-DAG: ldp x20, x19
|
||||
; CHECK-DAG: ldp d1, d0
|
||||
; CHECK-DAG: ldp d3, d2
|
||||
; CHECK-DAG: ldp d5, d4
|
||||
; CHECK-DAG: ldp d7, d6
|
||||
; CHECK-DAG: ldp d17, d16
|
||||
; CHECK-DAG: ldp d19, d18
|
||||
; CHECK-DAG: ldp d21, d20
|
||||
; CHECK-DAG: ldp d23, d22
|
||||
; CHECK-DAG: ldp d25, d24
|
||||
; CHECK-DAG: ldp d27, d26
|
||||
; CHECK-DAG: ldp d29, d28
|
||||
; CHECK-DAG: ldp d31, d30
|
||||
; CHECK-NOT: ldp x2, x1
|
||||
; CHECK-NOT: ldp x4, x3
|
||||
; CHECK-NOT: ldp x6, x5
|
||||
; CHECK-NOT: ldp x8, x7
|
||||
; CHECK-NOT: ldp x10, x9
|
||||
; CHECK-NOT: ldp x12, x11
|
||||
; CHECK-NOT: ldp x14, x13
|
||||
; CHECK-NOT: ldp x20, x19
|
||||
; CHECK-NOT: ldp d1, d0
|
||||
; CHECK-NOT: ldp d3, d2
|
||||
; CHECK-NOT: ldp d5, d4
|
||||
; CHECK-NOT: ldp d7, d6
|
||||
; CHECK-NOT: ldp d17, d16
|
||||
; CHECK-NOT: ldp d19, d18
|
||||
; CHECK-NOT: ldp d21, d20
|
||||
; CHECK-NOT: ldp d23, d22
|
||||
; CHECK-NOT: ldp d25, d24
|
||||
; CHECK-NOT: ldp d27, d26
|
||||
; CHECK-NOT: ldp d29, d28
|
||||
; CHECK-NOT: ldp d31, d30
|
||||
|
|
Loading…
Reference in New Issue