[ppc64] Enable sibling call optimization on ppc64 ELFv1/ELFv2 abi

This patch enable sibling call optimization on ppc64 ELFv1/ELFv2 abi, and
add a couple of test cases. This patch also passed llvm/clang bootstrap
test, and spec2006 build/run/result validation.

Original issue: https://llvm.org/bugs/show_bug.cgi?id=25617

Great thanks to Tom's (tjablin) help, he contributed a lot to this patch.
Thanks Hal and Kit's invaluable opinions!

Reviewers: hfinkel kbarton

http://reviews.llvm.org/D16315

llvm-svn: 265506
This commit is contained in:
Chuang-Yu Cheng 2016-04-06 02:04:38 +00:00
parent 024a623c55
commit 2e5973ef74
5 changed files with 467 additions and 8 deletions

View File

@ -19,6 +19,7 @@
#include "PPCTargetMachine.h"
#include "PPCTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/CallingConvLower.h"
@ -36,12 +37,15 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
@ -51,6 +55,12 @@ cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hi
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@ -3842,6 +3852,176 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
return SPDiff;
}
static bool isFunctionGlobalAddress(SDValue Callee);
static bool
resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
// If !G, Callee can be an external symbol.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G) return false;
const GlobalValue *GV = G->getGlobal();
if (GV->isDeclaration()) return false;
switch(GV->getLinkage()) {
default: llvm_unreachable("unknow linkage type");
case GlobalValue::AvailableExternallyLinkage:
case GlobalValue::ExternalWeakLinkage:
return false;
// Callee with weak linkage is allowed if it has hidden or protected
// visibility
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage: // e.g. c++ template instantiation
if (GV->hasDefaultVisibility())
return false;
case GlobalValue::ExternalLinkage:
case GlobalValue::InternalLinkage:
case GlobalValue::PrivateLinkage:
break;
}
// With '-fPIC', calling default visiblity function need insert 'nop' after
// function call, no matter that function resides in same module or not, so
// we treat it as in different module.
if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
return false;
return true;
}
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
const SmallVectorImpl<ISD::OutputArg> &Outs) {
assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
const unsigned PtrByteSize = 8;
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
};
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
};
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
for (const ISD::OutputArg& Param : Outs) {
if (Param.Flags.isNest()) continue;
if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
Subtarget.hasQPX()))
return true;
}
return false;
}
static bool
hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
if (CS->arg_size() != CallerFn->getArgumentList().size())
return false;
ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
const Value* CalleeArg = *CalleeArgIter;
const Value* CallerArg = &(*CallerArgIter);
if (CalleeArg == CallerArg)
continue;
// e.g. @caller([4 x i64] %a, [4 x i64] %b) {
// tail call @callee([4 x i64] undef, [4 x i64] %b)
// }
// 1st argument of callee is undef and has the same type as caller.
if (CalleeArg->getType() == CallerArg->getType() &&
isa<UndefValue>(CalleeArg))
continue;
return false;
}
return true;
}
bool
PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
ImmutableCallSite *CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
if (DisableSCO && !TailCallOpt) return false;
// Variadic argument functions are not supported.
if (isVarArg) return false;
MachineFunction &MF = DAG.getMachineFunction();
CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
// Tail or Sibling call optimization (TCO/SCO) needs callee and caller has
// the same calling convention
if (CallerCC != CalleeCC) return false;
// SCO support C calling convention
if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
return false;
// Functions containing by val parameters are not supported.
if (std::any_of(Ins.begin(), Ins.end(),
[](const ISD::InputArg& IA) { return IA.Flags.isByVal(); }))
return false;
// No TCO/SCO on indirect call because Caller have to restore its TOC
if (!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee))
return false;
// Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
// (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
// module.
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
if (CalleeCC == CallingConv::Fast && TailCallOpt)
return true;
if (DisableSCO) return false;
// If callee use the same argument list that caller is using, then we can
// apply SCO on this case. If it is not, then we need to check if callee needs
// stack for passing arguments.
if (!hasSameArgumentList(MF.getFunction(), CS) &&
needStackSlotPassParameters(Subtarget, Outs)) {
return false;
}
return true;
}
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
@ -4479,9 +4659,32 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsPatchPoint = CLI.IsPatchPoint;
ImmutableCallSite *CS = CLI.CS;
if (isTailCall)
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
if (isTailCall) {
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
isTailCall =
IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
isVarArg, Outs, Ins, DAG);
else
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
if (isTailCall) {
++NumTailCalls;
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
++NumSiblingCalls;
assert(isa<GlobalAddressSDNode>(Callee) &&
"Callee should be an llvm::Function object.");
DEBUG(
const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
const unsigned Width = 80 - strlen("TCO caller: ")
- strlen(", callee linkage: 0, 0");
dbgs() << "TCO caller: "
<< left_justify(DAG.getMachineFunction().getName(), Width)
<< ", callee linkage: "
<< GV->getVisibility() << ", " << GV->getLinkage() << "\n"
);
}
}
if (!isTailCall && CS && CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
@ -4760,12 +4963,16 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
bool hasNest = false;
bool IsSibCall = false;
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
IsSibCall = true;
// Mark this function as potentially containing a function that contains a
// tail call. As a consequence the frame pointer will be used for dynamicalloc
// and restoring the callers stack pointer in this functions epilog. This is
@ -4885,9 +5092,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
int SPDiff = 0;
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
if (!IsSibCall)
SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
@ -4896,8 +5106,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
dl);
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain,
DAG.getIntPtrConstant(NumBytes, dl, true), dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@ -5366,7 +5577,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
InFlag = Chain.getValue(1);
}
if (isTailCall)
if (isTailCall && !IsSibCall)
PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
FPOp, true, TailCallArguments);

View File

@ -713,6 +713,16 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
bool
IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
ImmutableCallSite *CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
int SPDiff,
SDValue Chain,

View File

@ -14,7 +14,8 @@ define weak void @foo_weak() nounwind {
define void @test_direct() nounwind readnone {
; CHECK-LABEL: test_direct:
tail call void @foo() nounwind
; CHECK: bl foo
; Because of tail call optimization, it can be 'b' instruction.
; CHECK: [[BR:b[l]?]] foo
; CHECK-NOT: nop
ret void
}

View File

@ -0,0 +1,46 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK
%"class.clang::NamedDecl" = type { i32 }
declare void @__assert_fail();
define i8 @_ZNK5clang9NamedDecl23getLinkageAndVisibilityEv(
%"class.clang::NamedDecl"* %this) {
entry:
%tobool = icmp eq %"class.clang::NamedDecl"* %this, null
br i1 %tobool, label %cond.false, label %exit
cond.false:
tail call void @__assert_fail()
unreachable
exit:
%DeclKind = getelementptr inbounds
%"class.clang::NamedDecl",
%"class.clang::NamedDecl"* %this, i64 0, i32 0
%bf.load = load i32, i32* %DeclKind, align 4
%call.i = tail call i8 @LVComputationKind(
%"class.clang::NamedDecl"* %this,
i32 %bf.load)
ret i8 %call.i
; CHECK-SCO-SHRK-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv:
; CHECK-SCO-SHRK: b LVComputationKind
; CHECK-SCO-SHRK: #TC_RETURNd8
; CHECK-SCO-SHRK: stdu 1, -{{[0-9]+}}(1)
; CHECK-SCO-SHRK: bl __assert_fail
;
; CHECK-SCO-ONLY-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv:
; CHECK-SCO-ONLY: stdu 1, -{{[0-9]+}}(1)
; CHECK-SCO-ONLY: b LVComputationKind
; CHECK-SCO-ONLY: #TC_RETURNd8
; CHECK-SCO-ONLY: bl __assert_fail
}
define fastcc i8 @LVComputationKind(
%"class.clang::NamedDecl"* %D,
i32 %computation) {
ret i8 0
}

View File

@ -0,0 +1,191 @@
; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO
; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX
; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX
; No combination of "powerpc64le-unknown-linux-gnu" + "CHECK-SCO", because
; only Power8 (and later) fully support LE.
%S_56 = type { [13 x i32], i32 }
%S_64 = type { [15 x i32], i32 }
%S_32 = type { [7 x i32], i32 }
; Function Attrs: noinline nounwind
define void @callee_56_copy([7 x i64] %a, %S_56* %b) #0 { ret void }
define void @callee_64_copy([8 x i64] %a, %S_64* %b) #0 { ret void }
; Function Attrs: nounwind
define void @caller_56_reorder_copy(%S_56* %b, [7 x i64] %a) #1 {
tail call void @callee_56_copy([7 x i64] %a, %S_56* %b)
ret void
; CHECK-SCO-LABEL: caller_56_reorder_copy:
; CHECK-SCO-NOT: stdu 1
; CHECK-SCO: TC_RETURNd8 callee_56_copy
}
define void @caller_64_reorder_copy(%S_64* %b, [8 x i64] %a) #1 {
tail call void @callee_64_copy([8 x i64] %a, %S_64* %b)
ret void
; CHECK-SCO-LABEL: caller_64_reorder_copy:
; CHECK-SCO: bl callee_64_copy
}
define void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b) #0 { ret void }
define void @caller_64_64_copy([8 x i64] %a, [8 x i64] %b) #1 {
tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b)
ret void
; CHECK-SCO-LABEL: caller_64_64_copy:
; CHECK-SCO: b callee_64_64_copy
}
define void @caller_64_64_reorder_copy([8 x i64] %a, [8 x i64] %b) #1 {
tail call void @callee_64_64_copy([8 x i64] %b, [8 x i64] %a)
ret void
; CHECK-SCO-LABEL: caller_64_64_reorder_copy:
; CHECK-SCO: bl callee_64_64_copy
}
define void @caller_64_64_undef_copy([8 x i64] %a, [8 x i64] %b) #1 {
tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] undef)
ret void
; CHECK-SCO-LABEL: caller_64_64_undef_copy:
; CHECK-SCO: b callee_64_64_copy
}
define void @arg8_callee(
float %a, i32 signext %b, float %c, i32* %d,
i8 zeroext %e, float %f, i32* %g, i32 signext %h)
{
ret void
}
define void @arg8_caller(float %a, i32 signext %b, i8 zeroext %c, i32* %d) {
entry:
tail call void @arg8_callee(float undef, i32 signext undef, float undef,
i32* %d, i8 zeroext undef, float undef,
i32* undef, i32 signext undef)
ret void
; CHECK-SCO-LABEL: arg8_caller:
; CHECK-SCO: b arg8_callee
}
; Struct return test
; Function Attrs: noinline nounwind
define void @callee_sret_56(%S_56* noalias sret %agg.result) #0 { ret void }
define void @callee_sret_32(%S_32* noalias sret %agg.result) #0 { ret void }
; Function Attrs: nounwind
define void @caller_do_something_sret_32(%S_32* noalias sret %agg.result) #1 {
%1 = alloca %S_56, align 4
%2 = bitcast %S_56* %1 to i8*
call void @callee_sret_56(%S_56* nonnull sret %1)
tail call void @callee_sret_32(%S_32* sret %agg.result)
ret void
; CHECK-SCO-LABEL: caller_do_something_sret_32:
; CHECK-SCO: stdu 1
; CHECK-SCO: bl callee_sret_56
; CHECK-SCO: addi 1
; CHECK-SCO: TC_RETURNd8 callee_sret_32
}
define void @caller_local_sret_32(%S_32* %a) #1 {
%tmp = alloca %S_32, align 4
tail call void @callee_sret_32(%S_32* nonnull sret %tmp)
ret void
; CHECK-SCO-LABEL: caller_local_sret_32:
; CHECK-SCO: bl callee_sret_32
}
attributes #0 = { noinline nounwind }
attributes #1 = { nounwind }
; vector <4 x i1> test
define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void }
define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) {
tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b)
ret void
; <4 x i1> is 32 bytes aligned, if subtarget doesn't support qpx, then we can't
; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder
; CHECK-SCO-LABEL: caller_v4i1_reorder:
; CHECK-SCO: bl callee_v4i1
; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder:
; CHECK-SCO-HASQPX: b callee_v4i1
}
define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void }
define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) {
tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b)
ret void
; CHECK-SCO-LABEL: f128_caller:
; CHECK-SCO: b f128_callee
}
; weak linkage test
%class.T = type { [2 x i8] }
define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void }
define void @wo_hcaller(%class.T* %this, i8* %c) {
tail call void @wo_hcallee(%class.T* %this, i8* %c)
ret void
; CHECK-SCO-LABEL: wo_hcaller:
; CHECK-SCO: b wo_hcallee
}
define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
define void @wo_pcaller(%class.T* %this, i8* %c) {
tail call void @wo_pcallee(%class.T* %this, i8* %c)
ret void
; CHECK-SCO-LABEL: wo_pcaller:
; CHECK-SCO: b wo_pcallee
}
define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
define void @wo_caller(%class.T* %this, i8* %c) {
tail call void @wo_callee(%class.T* %this, i8* %c)
ret void
; CHECK-SCO-LABEL: wo_caller:
; CHECK-SCO: bl wo_callee
}
define weak protected void @w_pcallee(i8* %ptr) { ret void }
define void @w_pcaller(i8* %ptr) {
tail call void @w_pcallee(i8* %ptr)
ret void
; CHECK-SCO-LABEL: w_pcaller:
; CHECK-SCO: b w_pcallee
}
define weak hidden void @w_hcallee(i8* %ptr) { ret void }
define void @w_hcaller(i8* %ptr) {
tail call void @w_hcallee(i8* %ptr)
ret void
; CHECK-SCO-LABEL: w_hcaller:
; CHECK-SCO: b w_hcallee
}
define weak void @w_callee(i8* %ptr) { ret void }
define void @w_caller(i8* %ptr) {
tail call void @w_callee(i8* %ptr)
ret void
; CHECK-SCO-LABEL: w_caller:
; CHECK-SCO: bl w_callee
}