[DSE] Re-enable calloc transformation with extra care (PR25892)

Transformation from malloc+memset to calloc is always correct, and in many situations
it brings significant observable benefits in terms of execution speed and memory consumption [1][2].
Unfortunately, there are cases where producing calloc causes performance drops [3].
As discussed in https://reviews.llvm.org/D103009, it is possible to differentiate between these two scenarios.
If the optimizer is able to prove that after the malloc call it is _very_ likely to reach the memset branch, then after
calloc emission we shouldn't observe any performance hit. Therefore, finding the "null pointer check" pattern
before the memset basic block sounds like a good justification for performing the transformation.
This method was also already suggested by the GCC folks [4]. The main reason for the change is that, for now,
to be safe we check for a post-dominance relation, which is far too conservative an approach and leaves the transformation
"almost" disabled in practice. This patch intends to enable the transformation again, but with extra care.

[1] https://stackoverflow.com/questions/2688466/why-mallocmemset-is-slower-than-calloc
[2] https://vorpus.org/blog/why-does-calloc-exist/
[3] http://smalldatum.blogspot.com/2017/11/a-new-optimization-in-gcc-5x-and-mysql.html
[4] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83022

Differential Revision: https://reviews.llvm.org/D110021
This commit is contained in:
Dawid Jurczak 2021-10-10 19:52:33 +02:00
parent 9c8f950a04
commit 9e65929a8e
2 changed files with 99 additions and 8 deletions

View File

@ -1844,8 +1844,29 @@ struct DSEState {
if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
Func != LibFunc_malloc)
return false;
auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
// Check for br(icmp ptr, null), truebb, falsebb) pattern at the end
// of malloc block
auto *MallocBB = Malloc->getParent(),
*MemsetBB = Memset->getParent();
if (MallocBB == MemsetBB)
return true;
auto *Ptr = Memset->getArgOperand(0);
auto *TI = MallocBB->getTerminator();
ICmpInst::Predicate Pred;
BasicBlock *TrueBB, *FalseBB;
if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
FalseBB)))
return false;
if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
return false;
return true;
};
if (Malloc->getOperand(0) == MemSet->getLength()) {
if (DT.dominates(Malloc, MemSet) && PDT.dominates(MemSet, Malloc) &&
if (shouldCreateCalloc(Malloc, MemSet) &&
DT.dominates(Malloc, MemSet) &&
memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) {
IRBuilder<> IRB(Malloc);
const auto &DL = Malloc->getModule()->getDataLayout();

View File

@ -405,17 +405,14 @@ if.end: ; preds = %if.then, %entry
ret i8* %call
}
; FIXME: malloc+memset are not currently transformed into calloc unless the
; memset post-dominates the malloc.
define float* @pr25892(i64 %size) {
; CHECK-LABEL: @pr25892(
; CHECK: entry:
; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 [[SIZE:%.*]])
; CHECK-NEXT: [[CALL:%.*]] = call i8* @calloc(i64 1, i64 [[SIZE:%.*]])
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[CALL]], null
; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
; CHECK: if.end:
; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[CALL]] to float*
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %size, i1 false)
; CHECK-NEXT: br label [[CLEANUP]]
; CHECK: cleanup:
; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ]
@ -437,13 +434,11 @@ cleanup:
define float* @pr25892_with_extra_store(i64 %size) {
; CHECK-LABEL: @pr25892_with_extra_store(
; CHECK: entry:
; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 [[SIZE:%.*]])
; CHECK-NEXT: [[CALL:%.*]] = call i8* @calloc(i64 1, i64 [[SIZE:%.*]])
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[CALL]], null
; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
; CHECK: if.end:
; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[CALL]] to float*
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %size, i1 false)
; CHECK-NEXT: store i8 0, i8* %call, align 1
; CHECK-NEXT: br label [[CLEANUP]]
; CHECK: cleanup:
; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float* [ [[BC]], [[IF_END]] ], [ null, [[ENTRY:%.*]] ]
@ -463,6 +458,32 @@ cleanup:
ret float* %retval.0
}
; Negative test: this should not create a calloc.
; The branch that ends the malloc block tests an unrelated integer (%1 & 32)
; instead of comparing the malloc result against null, and the memset sits in
; a different block (if.end), so both the malloc and the memset must be kept.
define i8* @malloc_with_no_nointer_null_check(i64 %0, i32 %1) {
; CHECK-LABEL: @malloc_with_no_nointer_null_check
; CHECK: entry:
; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 [[TMP0:%.*]])
; CHECK-NEXT: [[A:%.*]] = and i32 [[TMP1:%.*]], 32
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
; CHECK: if.end:
; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALL]], i8 0, i64 [[TMP0]], i1 false)
; CHECK-NEXT: br label [[CLEANUP]]
; CHECK: cleanup:
; CHECK-NEXT: ret i8* [[CALL]]
;
entry:
%call = call i8* @malloc(i64 %0) inaccessiblememonly
; The branch condition below depends only on %1, never on %call.
%a = and i32 %1, 32
%cmp = icmp eq i32 %a, 0
br i1 %cmp, label %cleanup, label %if.end
if.end:
; Zeroes the full allocation (length %0 matches the malloc size).
call void @llvm.memset.p0i8.i64(i8* %call, i8 0, i64 %0, i1 false)
br label %cleanup
cleanup:
ret i8* %call
}
; PR50143
define i8* @store_zero_after_calloc_inaccessiblememonly() {
; CHECK-LABEL: @store_zero_after_calloc_inaccessiblememonly(
@ -605,3 +626,52 @@ define i8* @memset_pattern16_after_calloc(i8* %pat) {
call void @memset_pattern16(i8* %call, i8* %pat, i64 40000) #1
ret i8* %call
}
; Globals used by @test_malloc_memset_to_calloc: @n supplies the allocation
; size, @a is stored to and @b is loaded from inside if.then.
@n = global i32 0, align 4
@a = external global i32, align 4
@b = external global i32*, align 8
; GCC calloc-1.c test case should create calloc.
; The malloc block ends with "br (icmp eq %3, null), if.end, if.then" and the
; memset of the same size sits in if.then (the non-null successor), so the
; malloc is expected to become calloc(1, size) and the memset to disappear.
; The surrounding loads and stores are expected to remain.
define i8* @test_malloc_memset_to_calloc(i64* %0) {
; CHECK-LABEL: @test_malloc_memset_to_calloc(
; CHECK: entry:
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT: [[CALLOC:%.*]] = call i8* @calloc(i64 1, i64 [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP0:%.*]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], 1
; CHECK-NEXT: store i64 [[TMP4]], i64* [[TMP0]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8* [[CALLOC]], null
; CHECK-NEXT: br i1 [[TMP5]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
; CHECK: if.then: ; preds = %entry
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP3]], 2
; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP0]], align 8
; CHECK-NEXT: store i32 2, i32* @a, align 4
; CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** @b, align 8
; CHECK-NEXT: store i32 3, i32* [[TMP7]], align 4
; CHECK-NEXT: br label [[IF_END]]
; CHECK: if.end: ; preds = %if.then, %entry
; CHECK-NEXT: ret i8* [[CALLOC]]
;
entry:
%1 = load i32, i32* @n, align 4
%2 = sext i32 %1 to i64
%3 = tail call i8* @malloc(i64 %2) inaccessiblememonly
%4 = load i64, i64* %0, align 8
%5 = add nsw i64 %4, 1
store i64 %5, i64* %0, align 8
; Null check on the malloc result; the non-null path goes to if.then.
%6 = icmp eq i8* %3, null
br i1 %6, label %if.end, label %if.then
if.then:
%7 = add nsw i64 %4, 2
store i64 %7, i64* %0, align 8
store i32 2, i32* @a, align 4
; Zeroes the whole allocation (length %2 is the malloc size).
tail call void @llvm.memset.p0i8.i64(i8* align 4 %3, i8 0, i64 %2, i1 false)
%8 = load i32*, i32** @b, align 8
store i32 3, i32* %8, align 4
br label %if.end
if.end:
ret i8* %3
}