[DSE] Re-enable calloc transformation with extra care (PR25892)
The transformation from malloc+memset to calloc is always correct, and in many situations it brings significant observable benefits in terms of execution speed and memory consumption [1][2]. Unfortunately, there are cases where emitting calloc causes performance drops [3]. As discussed in https://reviews.llvm.org/D103009, it is possible to differentiate between these two scenarios: if the optimizer can prove that, after the malloc call, the memset branch is very likely to be reached, then emitting calloc should not introduce any performance hit. Therefore, finding a "null pointer check" pattern before the memset basic block is a good justification for performing the transformation; the same method was already suggested by GCC folks [4]. The main reason for this change is that the current code, to stay safe, requires a post-dominance relation between the memset and the malloc, which is far too conservative and leaves the transformation "almost" disabled in practice. This patch re-enables the transformation, but with extra care.

[1] https://stackoverflow.com/questions/2688466/why-mallocmemset-is-slower-than-calloc
[2] https://vorpus.org/blog/why-does-calloc-exist/
[3] http://smalldatum.blogspot.com/2017/11/a-new-optimization-in-gcc-5x-and-mysql.html
[4] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83022

Differential Revision: https://reviews.llvm.org/D110021
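To make the two scenarios above concrete, here is a minimal caller-side sketch (illustrative only; the function names and the zero_it flag are invented for this example and do not come from the patch or its tests). In the first function the memset sits directly behind the malloc null check, so it runs whenever the allocation succeeds and the calloc rewrite is safe; in the second it is guarded by an unrelated condition, which is exactly the case where eager zeroing can hurt [3].

// Minimal C++ sketch of the caller-side patterns (names are illustrative).
#include <cstdlib>
#include <cstring>

// Accepted shape: the only branch between malloc and memset is the null
// check, so the memset runs whenever malloc succeeds; DSE may rewrite the
// pair to calloc(1, n).
void *zeroed_buffer(std::size_t n) {
  void *p = std::malloc(n);
  if (p == nullptr)
    return nullptr;
  std::memset(p, 0, n);
  return p;
}

// Rejected shape: the memset is guarded by an unrelated condition, so
// zeroing eagerly via calloc could cause the regressions described in [3];
// the heuristic leaves malloc + conditional memset alone.
void *maybe_zeroed_buffer(std::size_t n, bool zero_it) {
  void *p = std::malloc(n);
  if (p != nullptr && zero_it)
    std::memset(p, 0, n);
  return p;
}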
This commit is contained in:
parent 9c8f950a04
commit 9e65929a8e
@@ -1844,8 +1844,29 @@ struct DSEState {
     if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
         Func != LibFunc_malloc)
       return false;
+
+    auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
+      // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end
+      // of malloc block
+      auto *MallocBB = Malloc->getParent(),
+           *MemsetBB = Memset->getParent();
+      if (MallocBB == MemsetBB)
+        return true;
+      auto *Ptr = Memset->getArgOperand(0);
+      auto *TI = MallocBB->getTerminator();
+      ICmpInst::Predicate Pred;
+      BasicBlock *TrueBB, *FalseBB;
+      if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
+                          FalseBB)))
+        return false;
+      if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+        return false;
+      return true;
+    };
+
     if (Malloc->getOperand(0) == MemSet->getLength()) {
-      if (DT.dominates(Malloc, MemSet) && PDT.dominates(MemSet, Malloc) &&
+      if (shouldCreateCalloc(Malloc, MemSet) &&
+          DT.dominates(Malloc, MemSet) &&
           memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) {
         IRBuilder<> IRB(Malloc);
         const auto &DL = Malloc->getModule()->getDataLayout();
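For readers outside the DSE pass, the new check can also be read as a standalone helper. The following is a self-contained restatement of the lambda above; the helper name is made up here, and the PatternMatch calls mirror the diff, assuming an LLVM development setup.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// True when the memset block is reached exactly when malloc succeeded:
// either both calls share a block, or the malloc block ends in
// `br (icmp eq Ptr, null), <null-handling BB>, <memset BB>`.
static bool mallocNullCheckedBeforeMemset(CallInst *Malloc, CallInst *Memset) {
  BasicBlock *MallocBB = Malloc->getParent();
  BasicBlock *MemsetBB = Memset->getParent();
  if (MallocBB == MemsetBB)
    return true;
  Value *Ptr = Memset->getArgOperand(0);
  Instruction *TI = MallocBB->getTerminator();
  ICmpInst::Predicate Pred;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
                      FalseBB)))
    return false;
  return Pred == ICmpInst::ICMP_EQ && MemsetBB == FalseBB;
}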
@@ -405,17 +405,14 @@ if.end: ; preds = %if.then, %entry
   ret i8* %call
 }
 
-; FIXME: malloc+memset are not currently transformed into calloc unless the
-; memset post-dominates the malloc.
 define float* @pr25892(i64 %size) {
 ; CHECK-LABEL: @pr25892(
 ; CHECK: entry:
-; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 [[SIZE:%.*]])
+; CHECK-NEXT: [[CALL:%.*]] = call i8* @calloc(i64 1, i64 [[SIZE:%.*]])
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[CALL]], null
 ; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
 ; CHECK: if.end:
 ; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[CALL]] to float*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %size, i1 false)
 ; CHECK-NEXT: br label [[CLEANUP]]
 ; CHECK: cleanup:
 ; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ]
@@ -437,13 +434,11 @@ cleanup:
 define float* @pr25892_with_extra_store(i64 %size) {
 ; CHECK-LABEL: @pr25892_with_extra_store(
 ; CHECK: entry:
-; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 [[SIZE:%.*]])
+; CHECK-NEXT: [[CALL:%.*]] = call i8* @calloc(i64 1, i64 [[SIZE:%.*]])
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[CALL]], null
 ; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
 ; CHECK: if.end:
 ; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[CALL]] to float*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %size, i1 false)
-; CHECK-NEXT: store i8 0, i8* %call, align 1
 ; CHECK-NEXT: br label [[CLEANUP]]
 ; CHECK: cleanup:
 ; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ]
@@ -463,6 +458,32 @@ cleanup:
   ret float* %retval.0
 }
 
+; This should not create a calloc
+define i8* @malloc_with_no_nointer_null_check(i64 %0, i32 %1) {
+; CHECK-LABEL: @malloc_with_no_nointer_null_check
+; CHECK: entry:
+; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 [[TMP0:%.*]])
+; CHECK-NEXT: [[A:%.*]] = and i32 [[TMP1:%.*]], 32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALL]], i8 0, i64 [[TMP0]], i1 false)
+; CHECK-NEXT: br label [[CLEANUP]]
+; CHECK: cleanup:
+; CHECK-NEXT: ret i8* [[CALL]]
+;
+entry:
+  %call = call i8* @malloc(i64 %0) inaccessiblememonly
+  %a = and i32 %1, 32
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %cleanup, label %if.end
+if.end:
+  call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %0, i1 false)
+  br label %cleanup
+cleanup:
+  ret i8* %call
+}
+
 ; PR50143
 define i8* @store_zero_after_calloc_inaccessiblememonly() {
 ; CHECK-LABEL: @store_zero_after_calloc_inaccessiblememonly(
@@ -605,3 +626,52 @@ define i8* @memset_pattern16_after_calloc(i8* %pat) {
   call void @memset_pattern16(i8* %call, i8* %pat, i64 40000) #1
   ret i8* %call
 }
+
+@n = global i32 0, align 4
+@a = external global i32, align 4
+@b = external global i32*, align 8
+
+; GCC calloc-1.c test case should create calloc
+define i8* @test_malloc_memset_to_calloc(i64* %0) {
+; CHECK-LABEL: @test_malloc_memset_to_calloc(
+; CHECK: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @n, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[CALLOC:%.*]] = call i8* @calloc(i64 1, i64 [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP0:%.*]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], 1
+; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8* [[CALLOC]], null
+; CHECK-NEXT: br i1 [[TMP5]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then: ; preds = %entry
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP3]], 2
+; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP0]], align 8
+; CHECK-NEXT: store i32 2, i32* @a, align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** @b, align 8
+; CHECK-NEXT: store i32 3, i32* [[TMP7]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end: ; preds = %if.then, %entry
+; CHECK-NEXT: ret i8* [[CALLOC]]
+;
+entry:
+  %1 = load i32, i32* @n, align 4
+  %2 = sext i32 %1 to i64
+  %3 = tail call i8* @malloc(i64 %2) inaccessiblememonly
+  %4 = load i64, i64* %0, align 8
+  %5 = add nsw i64 %4, 1
+  store i64 %5, i64* %0, align 8
+  %6 = icmp eq i8* %3, null
+  br i1 %6, label %if.end, label %if.then
+
+if.then:
+  %7 = add nsw i64 %4, 2
+  store i64 %7, i64* %0, align 8
+  store i32 2, i32* @a, align 4
+  tail call void @llvm.memset.p0i8.i64(i8* align 4 %3, i8 0, i64 %2, i1 false)
+  %8 = load i32*, i32** @b, align 8
+  store i32 3, i32* %8, align 4
+  br label %if.end
+
+if.end:
+  ret i8* %3
+}