remove inalloca parameters in globalopt and simplify argpromotion

Summary:
Inalloca parameters require special handling in some optimizations.
This change causes globalopt to strip the inalloca attribute from
function parameters when it is safe to do so, removes the special
handling for inallocas from argpromotion, and replaces it with a
simple check that causes argpromotion to skip functions that receive
inallocas (for when the pass is invoked on code that didn't run
through globalopt first). This also avoids a case where argpromotion
would incorrectly try to pass an inalloca in a register.

Fixes PR41658.
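
For illustration, a minimal before/after sketch of the globalopt part of this
change, adapted from the GlobalOpt fastcc test updated below (not a complete
description of the pass):

  ; Before: an internal function whose address is not taken receives its
  ; argument through inalloca.
  define internal i32 @inalloca(i32* inalloca %p) {
    %rv = load i32, i32* %p
    ret i32 %rv
  }

  ; After globalopt: the inalloca attribute is stripped (and the calling
  ; convention can then be switched to fastcc), so later passes such as
  ; argpromotion no longer need special inalloca handling.
  define internal fastcc i32 @inalloca(i32* %p) {
    %rv = load i32, i32* %p
    ret i32 %rv
  }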

Reviewers: rnk, efriedma

Reviewed By: rnk

Subscribers: llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61286

llvm-svn: 359743
Author: Bob Haarman
Date: 2019-05-02 00:37:36 +00:00
Commit: a78ab77b6b
Parent: 1feaee52ff
5 changed files with 76 additions and 34 deletions


@@ -566,8 +566,8 @@ static void markIndicesSafe(const IndicesVector &ToMark,
 /// This method limits promotion of aggregates to only promote up to three
 /// elements of the aggregate in order to avoid exploding the number of
 /// arguments passed in.
-static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
-                                    AAResults &AAR, unsigned MaxElements) {
+static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR,
+                                    unsigned MaxElements) {
   using GEPIndicesSet = std::set<IndicesVector>;
 
   // Quick exit for unused arguments
@@ -589,9 +589,6 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
   //
   // This set will contain all sets of indices that are loaded in the entry
   // block, and thus are safe to unconditionally load in the caller.
-  //
-  // This optimization is also safe for InAlloca parameters, because it verifies
-  // that the address isn't captured.
   GEPIndicesSet SafeToUnconditionallyLoad;
 
   // This set contains all the sets of indices that we are planning to promote.
@@ -599,7 +596,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
   GEPIndicesSet ToPromote;
 
   // If the pointer is always valid, any load with first index 0 is valid.
-  if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg))
+  if (isByVal || allCallersPassInValidPointerForArgument(Arg))
     SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
 
   // First, iterate the entry block and mark loads of (geps of) arguments as
@@ -656,8 +653,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
         // TODO: This runs the above loop over and over again for dead GEPs
         // Couldn't we just do increment the UI iterator earlier and erase the
         // use?
-        return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR,
-                                       MaxElements);
+        return isSafeToPromoteArgument(Arg, isByVal, AAR, MaxElements);
       }
 
       // Ensure that all of the indices are constants.
@@ -856,6 +852,11 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
   if (F->isVarArg())
     return nullptr;
 
+  // Don't transform functions that receive inallocas, as the transformation may
+  // not be safe depending on calling convention.
+  if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
+    return nullptr;
+
   // First check: see if there are any pointer arguments! If not, quick exit.
   SmallVector<Argument *, 16> PointerArgs;
   for (Argument &I : F->args())
@@ -914,8 +915,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
     // If this is a byval argument, and if the aggregate type is small, just
     // pass the elements, which is always safe, if the passed value is densely
-    // packed or if we can prove the padding bytes are never accessed. This does
-    // not apply to inalloca.
+    // packed or if we can prove the padding bytes are never accessed.
     bool isSafeToPromote =
         PtrArg->hasByValAttr() &&
         (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
@@ -966,7 +966,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
     }
 
     // Otherwise, see if we can promote the pointer to its value.
-    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
+    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValAttr(), AAR,
                                 MaxElements))
      ArgsToPromote.insert(PtrArg);
  }
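
A sketch of the new argpromotion bail-out (a standalone example in the spirit
of the new test added below; the function names @takes_inalloca and @caller
are illustrative, not taken from the patch): when argpromotion runs without
globalopt having stripped the attribute first, a function that receives an
inalloca argument is now skipped entirely instead of having that argument
promoted into a register.

  %struct.a = type { i8 }

  ; With only -argpromotion, @takes_inalloca is left unchanged because its
  ; attribute list contains inalloca.
  define internal x86_thiscallcc void @takes_inalloca(%struct.a* %this, <{ %struct.a }>* inalloca) {
    ret void
  }

  define void @caller(%struct.a* %a) {
    %argmem = alloca inalloca <{ %struct.a }>, align 4
    call x86_thiscallcc void @takes_inalloca(%struct.a* %a, <{ %struct.a }>* inalloca %argmem)
    ret void
  }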


@@ -2090,21 +2090,21 @@ static void ChangeCalleesToFastCall(Function *F) {
   }
 }
 
-static AttributeList StripNest(LLVMContext &C, AttributeList Attrs) {
-  // There can be at most one attribute set with a nest attribute.
-  unsigned NestIndex;
-  if (Attrs.hasAttrSomewhere(Attribute::Nest, &NestIndex))
-    return Attrs.removeAttribute(C, NestIndex, Attribute::Nest);
+static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
+                               Attribute::AttrKind A) {
+  unsigned AttrIndex;
+  if (Attrs.hasAttrSomewhere(A, &AttrIndex))
+    return Attrs.removeAttribute(C, AttrIndex, A);
   return Attrs;
 }
 
-static void RemoveNestAttribute(Function *F) {
-  F->setAttributes(StripNest(F->getContext(), F->getAttributes()));
+static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
+  F->setAttributes(StripAttr(F->getContext(), F->getAttributes(), A));
   for (User *U : F->users()) {
     if (isa<BlockAddress>(U))
       continue;
     CallSite CS(cast<Instruction>(U));
-    CS.setAttributes(StripNest(F->getContext(), CS.getAttributes()));
+    CS.setAttributes(StripAttr(F->getContext(), CS.getAttributes(), A));
   }
 }
@@ -2119,13 +2119,6 @@ static bool hasChangeableCC(Function *F) {
   if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall)
     return false;
 
-  // Don't break the invariant that the inalloca parameter is the only parameter
-  // passed in memory.
-  // FIXME: GlobalOpt should remove inalloca when possible and hoist the dynamic
-  // alloca it uses to the entry block if possible.
-  if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
-    return false;
-
   // FIXME: Change CC for the whole chain of musttail calls when possible.
   //
   // Can't change CC of the function that either has musttail calls, or is a
@@ -2287,6 +2280,17 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
     if (!F->hasLocalLinkage())
       continue;
 
+    // If we have an inalloca parameter that we can safely remove the
+    // inalloca attribute from, do so. This unlocks optimizations that
+    // wouldn't be safe in the presence of inalloca.
+    // FIXME: We should also hoist alloca affected by this to the entry
+    // block if possible.
+    if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
+        !F->hasAddressTaken()) {
+      RemoveAttribute(F, Attribute::InAlloca);
+      Changed = true;
+    }
+
     if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
       NumInternalFunc++;
       TargetTransformInfo &TTI = GetTTI(*F);
@@ -2319,7 +2323,7 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
         !F->hasAddressTaken()) {
      // The function is not used by a trampoline intrinsic, so it is safe
      // to remove the 'nest' attribute.
-      RemoveNestAttribute(F);
+      RemoveAttribute(F, Attribute::Nest);
      ++NumNestRemoved;
      Changed = true;
    }
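
Conversely, a sketch of the safety condition in the new OptimizeFunctions
code above (an assumed example, not taken from the tests; @fp and @f are
illustrative names): if the function's address is taken, globalopt leaves the
inalloca attribute alone, because indirect callers still rely on the inalloca
calling convention.

  ; @f's address escapes through @fp, so F->hasAddressTaken() is true and
  ; the inalloca attribute is kept.
  @fp = global void (<{ i32 }>*)* @f

  define internal void @f(<{ i32 }>* inalloca %args) {
    ret void
  }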


@@ -0,0 +1,38 @@
+; In PR41658, argpromotion put an inalloca in a position that per the
+; calling convention is passed in a register. This test verifies that
+; we don't do that anymore. It also verifies that the combination of
+; globalopt and argpromotion is able to optimize the call safely.
+;
+; RUN: opt -S -argpromotion %s | FileCheck --check-prefix=THIS %s
+; RUN: opt -S -globalopt -argpromotion %s | FileCheck --check-prefix=OPT %s
+; THIS: define internal x86_thiscallcc void @internalfun(%struct.a* %this, <{ %struct.a
+; OPT: define internal fastcc void @internalfun(<{ %struct.a }>*)
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc19.11.0"
+
+%struct.a = type { i8 }
+
+define internal x86_thiscallcc void @internalfun(%struct.a* %this, <{ %struct.a }>* inalloca) {
+entry:
+  %a = getelementptr inbounds <{ %struct.a }>, <{ %struct.a }>* %0, i32 0, i32 0
+  %argmem = alloca inalloca <{ %struct.a }>, align 4
+  %1 = getelementptr inbounds <{ %struct.a }>, <{ %struct.a }>* %argmem, i32 0, i32 0
+  %call = call x86_thiscallcc %struct.a* @copy_ctor(%struct.a* %1, %struct.a* dereferenceable(1) %a)
+  call void @ext(<{ %struct.a }>* inalloca %argmem)
+  ret void
+}
+
+; This is here to ensure @internalfun is live.
+define void @exportedfun(%struct.a* %a) {
+  %inalloca.save = tail call i8* @llvm.stacksave()
+  %argmem = alloca inalloca <{ %struct.a }>, align 4
+  call x86_thiscallcc void @internalfun(%struct.a* %a, <{ %struct.a }>* inalloca %argmem)
+  call void @llvm.stackrestore(i8* %inalloca.save)
+  ret void
+}
+
+declare x86_thiscallcc %struct.a* @copy_ctor(%struct.a* returned, %struct.a* dereferenceable(1))
+declare void @ext(<{ %struct.a }>* inalloca)
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)


@@ -1,5 +1,5 @@
-; RUN: opt %s -argpromotion -sroa -S | FileCheck %s
-; RUN: opt %s -passes='argpromotion,function(sroa)' -S | FileCheck %s
+; RUN: opt %s -globalopt -argpromotion -sroa -S | FileCheck %s
+; RUN: opt %s -passes='module(globalopt),cgscc(argpromotion),function(sroa)' -S | FileCheck %s
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
@@ -15,7 +15,7 @@ entry:
   %r = add i32 %a, %b
   ret i32 %r
 }
 
-; CHECK-LABEL: define internal i32 @f
+; CHECK-LABEL: define internal fastcc i32 @f
 ; CHECK-NOT: load
 ; CHECK: ret
@@ -35,7 +35,7 @@ entry:
 ; Argpromote can't promote %a because of the icmp use.
 define internal i1 @g(%struct.ss* %a, %struct.ss* inalloca %b) nounwind {
-; CHECK: define internal i1 @g(%struct.ss* %a, %struct.ss* inalloca %b)
+; CHECK: define internal fastcc i1 @g(%struct.ss* %a, %struct.ss* %b)
 entry:
   %c = icmp eq %struct.ss* %a, %b
   ret i1 %c
@@ -45,6 +45,6 @@ define i32 @test() {
 entry:
   %S = alloca inalloca %struct.ss
   %c = call i1 @g(%struct.ss* %S, %struct.ss* inalloca %S)
-; CHECK: call i1 @g(%struct.ss* %S, %struct.ss* inalloca %S)
+; CHECK: call fastcc i1 @g(%struct.ss* %S, %struct.ss* %S)
   ret i32 0
 }


@@ -27,7 +27,7 @@ define internal i32 @j(i32* %m) {
 }
 
 define internal i32 @inalloca(i32* inalloca %p) {
-; CHECK-LABEL: define internal i32 @inalloca(i32* inalloca %p)
+; CHECK-LABEL: define internal fastcc i32 @inalloca(i32* %p)
   %rv = load i32, i32* %p
   ret i32 %rv
 }
@@ -52,4 +52,4 @@ define void @call_things() {
 ; CHECK: call fastcc i32 @g
 ; CHECK: call coldcc i32 @h
 ; CHECK: call i32 @j
-; CHECK: call i32 @inalloca(i32* inalloca %args)
+; CHECK: call fastcc i32 @inalloca(i32* %args)