Don't inline dynamic allocas that simplify to huge static allocas.

Some sequences of optimizations can generate call sites that may never be executed at runtime, and constant propagation through such a site can turn a callee's dynamic alloca into a static alloca with a very large allocation amount. The inliner tries to move static allocas to the caller's entry block, so the stack limits can be reached or bypassed. Avoid inlining functions when this would be the result.

The 64k threshold currently doesn't get triggered on the test suite with an -Os LTO build on arm64; care should be taken in changing it in the future to avoid needlessly pessimising inlining behaviour.

Differential Revision: https://reviews.llvm.org/D81765
commit 090c108d04 (parent 7d1452d837)
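For illustration, a minimal C sketch of the problematic pattern (hypothetical code, mirroring the shape of the new regression test below; `callee`, `caller`, and `extern_call` are illustrative names, not from the commit):

  void extern_call(float *);

  /* Hypothetical callee: the VLA is a dynamic alloca in IR. */
  void callee(char *p, int n) {
    float vla[n];          /* dynamic alloca while n is unknown */
    extern_call(vla);
  }

  /* Hypothetical caller: the call site is never executed at runtime, but
     constant propagation of n == -1 (zero-extended to i64 in the IR) would
     turn the callee's alloca into a ~16 GiB static alloca if inlined. */
  void caller(char *p, int never_taken) {
    if (never_taken)
      callee(p, -1);
  }
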
@@ -49,6 +49,9 @@ const int ColdccPenalty = 2000;
 /// Do not inline functions which allocate this many bytes on the stack
 /// when the caller is recursive.
 const unsigned TotalAllocaSizeRecursiveCaller = 1024;
+/// Do not inline dynamic allocas that have been constant propagated to be
+/// static allocas above this amount in bytes.
+const uint64_t MaxSimplifiedDynamicAllocaToInline = 65536;
 } // namespace InlineConstants
 
 /// Represents the cost of inlining a function.
@@ -853,10 +853,22 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
   if (I.isArrayAllocation()) {
     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
+      // Sometimes a dynamic alloca could be converted into a static alloca
+      // after this constant prop, and become a huge static alloca on an
+      // unconditional CFG path. Avoid inlining if this is going to happen above
+      // a threshold.
+      // FIXME: If the threshold is removed or lowered too much, we could end up
+      // being too pessimistic and prevent inlining non-problematic code. This
+      // could result in unintended perf regressions. A better overall strategy
+      // is needed to track stack usage during inlining.
       Type *Ty = I.getAllocatedType();
       AllocatedSize = SaturatingMultiplyAdd(
           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
           AllocatedSize);
+      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) {
+        HasDynamicAlloca = true;
+        return false;
+      }
       return Base::visitAlloca(I);
     }
   }
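For context, SaturatingMultiplyAdd (a helper from llvm/Support/MathExtras.h) computes Count * ElemSize + Accum but clamps at the maximum value on overflow instead of wrapping, so a huge simplified alloca cannot slip under the threshold via wraparound. A minimal standalone C sketch of that behaviour (saturating_mul_add is a hypothetical stand-in, using GCC/Clang overflow builtins):

  #include <stdint.h>

  /* Clamp to UINT64_MAX instead of wrapping, mirroring the running
     AllocatedSize accounting in visitAlloca above. */
  static uint64_t saturating_mul_add(uint64_t count, uint64_t elem_size,
                                     uint64_t accum) {
    uint64_t mul, sum;
    if (__builtin_mul_overflow(count, elem_size, &mul))
      return UINT64_MAX;
    if (__builtin_add_overflow(mul, accum, &sum))
      return UINT64_MAX;
    return sum;
  }

The commit also adds the following regression test as a new file.
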
@@ -0,0 +1,175 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -inline < %s -S -o - | FileCheck %s

target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.15.0"

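; The callee's VLA size comes from i32 -1: zero-extended to i64 this is
; 4294967295 floats (~16 GiB) once constant propagated, far above the 64k
; threshold, so @callee must not be inlined into @caller1.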
define void @caller1(i8 *%p1, i1 %b) {
; CHECK-LABEL: @caller1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
; CHECK:       split:
; CHECK-NEXT:    call void @callee(i8* [[P1:%.*]], i32 0, i32 -1)
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  %cond = icmp eq i1 %b, true
  br i1 %cond, label %exit, label %split

split:
; This path may be generated from CS splitting and never taken at runtime.
  call void @callee(i8* %p1, i32 0, i32 -1)
  br label %exit

exit:
  ret void
}

define void @callee(i8* %p1, i32 %l1, i32 %l2) {
entry:
  %ext = zext i32 %l2 to i64
  %vla = alloca float, i64 %ext, align 16
  call void @extern_call(float* nonnull %vla) #3
  ret void
}

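; 15000 floats x 4 bytes = 60000 bytes, below the 65536-byte threshold, so
; @callee is inlined here: the simplified alloca moves to the caller's entry
; block, and the stacksave/stackrestore pair for the dynamic alloca is kept.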
define void @caller2_below_threshold(i8 *%p1, i1 %b) {
; CHECK-LABEL: @caller2_below_threshold(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[VLA_I:%.*]] = alloca float, i64 15000, align 16
; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
; CHECK:       split:
; CHECK-NEXT:    [[SAVEDSTACK:%.*]] = call i8* @llvm.stacksave()
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[VLA_I]] to i8*
; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 60000, i8* [[TMP0]])
; CHECK-NEXT:    call void @extern_call(float* nonnull [[VLA_I]]) #2
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[VLA_I]] to i8*
; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 60000, i8* [[TMP1]])
; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[SAVEDSTACK]])
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  %cond = icmp eq i1 %b, true
  br i1 %cond, label %exit, label %split

split:
  call void @callee(i8* %p1, i32 0, i32 15000)
  br label %exit

exit:
  ret void
}

define void @callee2_not_in_entry(i8* %p1, i32 %l1, i32 %l2) {
entry:
  %ext = zext i32 %l2 to i64
  %c = icmp eq i32 %l1, 42
  br i1 %c, label %bb2, label %bb3

bb2:
  %vla = alloca float, i64 %ext, align 16
  call void @extern_call(float* nonnull %vla) #3
  ret void

bb3:
  ret void
}

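; The huge alloca in @callee2_not_in_entry is not on an unconditional path
; from the entry block: with %l1 == 0 the branch folds during call-site
; analysis, the block holding the alloca is dead, the threshold check never
; fires, and the call is inlined away entirely.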
define void @caller3_alloca_not_in_entry(i8 *%p1, i1 %b) {
; CHECK-LABEL: @caller3_alloca_not_in_entry(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
; CHECK:       split:
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  %cond = icmp eq i1 %b, true
  br i1 %cond, label %exit, label %split

split:
  call void @callee2_not_in_entry(i8* %p1, i32 0, i32 -1)
  br label %exit

exit:
  ret void
}

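; 16500 floats x 4 bytes = 66000 bytes, just over the 65536-byte threshold,
; so @callee is not inlined here.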
define void @caller4_over_threshold(i8 *%p1, i1 %b, i32 %len) {
; CHECK-LABEL: @caller4_over_threshold(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B:%.*]], true
; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
; CHECK:       split:
; CHECK-NEXT:    call void @callee(i8* [[P1:%.*]], i32 0, i32 16500)
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  %cond = icmp eq i1 %b, true
  br i1 %cond, label %exit, label %split

split:
  call void @callee(i8* %p1, i32 0, i32 16500)
  br label %exit

exit:
  ret void
}

declare noalias i8* @malloc(i64)

define i8* @stack_allocate(i32 %size) #2 {
entry:
  %cmp = icmp ult i32 %size, 100
  %conv = zext i32 %size to i64
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %0 = alloca i8, i64 %conv, align 8
  br label %return

if.end:                                           ; preds = %entry
  %call = tail call i8* @malloc(i64 %conv) #3
  br label %return

return:                                           ; preds = %if.end, %if.then
  %retval.0 = phi i8* [ %0, %if.then ], [ %call, %if.end ]
  ret i8* %retval.0
}

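; %size is not constant at this call site, so the alloca stays genuinely
; dynamic and the new threshold does not apply; @stack_allocate is marked
; alwaysinline, so it is inlined with stacksave/stackrestore inserted.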
define i8* @test_stack_allocate_always(i32 %size) {
; CHECK-LABEL: @test_stack_allocate_always(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SAVEDSTACK:%.*]] = call i8* @llvm.stacksave()
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i32 [[SIZE:%.*]], 100
; CHECK-NEXT:    [[CONV_I:%.*]] = zext i32 [[SIZE]] to i64
; CHECK-NEXT:    br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]]
; CHECK:       if.then.i:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca i8, i64 [[CONV_I]], align 8
; CHECK-NEXT:    br label [[STACK_ALLOCATE_EXIT:%.*]]
; CHECK:       if.end.i:
; CHECK-NEXT:    [[CALL_I:%.*]] = tail call i8* @malloc(i64 [[CONV_I]]) #2
; CHECK-NEXT:    br label [[STACK_ALLOCATE_EXIT]]
; CHECK:       stack_allocate.exit:
; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = phi i8* [ [[TMP0]], [[IF_THEN_I]] ], [ [[CALL_I]], [[IF_END_I]] ]
; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[SAVEDSTACK]])
; CHECK-NEXT:    ret i8* [[RETVAL_0_I]]
;
entry:
  %call = tail call i8* @stack_allocate(i32 %size)
  ret i8* %call
}

declare void @extern_call(float*)

attributes #1 = { argmemonly nounwind willreturn writeonly }
attributes #2 = { alwaysinline }
attributes #3 = { nounwind }