forked from OSchip/llvm-project
[OldPM] Pass manager: run SROA after (simple) loop unrolling
I stumbled into this by accident while rewriting
some spaghetti-like code into something more structured,
which involved using some `std::array<>`s. To my surprise,
the `alloca`s remained, causing about a `+160%` perf regression.
https://llvm-compile-time-tracker.com/compare.php?from=bb6f4d32aac3eecb51909f4facc625219307ee68&to=d563e66f40f9d4d145cb2050e41cb961e2b37785&stat=instructions
suggests that this has a geomean compile-time cost of `+0.08%`.
Note that D68593 / cecc0d27ad
already did this change for NewPM, but left OldPM in a pessimized state.
This fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40011 | PR40011 ]], [[ https://bugs.llvm.org/show_bug.cgi?id=42794 | PR42794 ]] and probably some other reports.
Reviewed By: nikic, xbolva00
Differential Revision: https://reviews.llvm.org/D87972
This commit is contained in:
parent
51beb0c80d
commit
03bd5198b6
|
@ -1,4 +1,4 @@
|
|||
// RUN: %clang_cc1 %s -O2 -fno-experimental-new-pass-manager -std=c++11 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse4.2 -target-feature +avx -emit-llvm -o - | FileCheck %s
|
||||
// RUN: %clang_cc1 %s -O1 -std=c++11 -triple x86_64-unknown-linux-gnu -target-cpu x86-64 -target-feature +sse4.2 -target-feature +avx -emit-llvm -o - | FileCheck %s
|
||||
|
||||
// Testcase from llvm.org/PR32056
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// RUN: %clang -O1 -fexperimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-NEWPM
|
||||
// RUN: %clang -O1 -fno-experimental-new-pass-manager -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s -check-prefix=CHECK-OLDPM
|
||||
// RUN: %clang -O1 -fno-unroll-loops -S -o - %s -emit-llvm | FileCheck %s
|
||||
|
||||
extern int a[16];
|
||||
int b = 0;
|
||||
int foo(void) {
|
||||
|
@ -9,10 +9,8 @@ int foo(void) {
|
|||
return b;
|
||||
}
|
||||
// Check br i1 to make sure that the loop is fully unrolled
|
||||
// CHECK-LABEL-NEWPM: foo
|
||||
// CHECK-NOT-NEWPM: br i1
|
||||
// CHECK-LABEL-OLDPM: foo
|
||||
// CHECK-NOT-OLDPM: br i1
|
||||
// CHECK-LABEL: foo
|
||||
// CHECK-NOT: br i1
|
||||
|
||||
void Helper() {
|
||||
const int *nodes[5];
|
||||
|
@ -26,17 +24,7 @@ void Helper() {
|
|||
}
|
||||
|
||||
// Check br i1 to make sure the loop is gone, there will still be a label branch for the infinite loop.
|
||||
// CHECK-LABEL-NEWPM: Helper
|
||||
// CHECK-NEWPM: br label
|
||||
// CHECK-NEWPM-NOT: br i1
|
||||
// CHECK-NEWPM: br label
|
||||
|
||||
// The old pass manager doesn't remove the while loop so check for 5 load i32*.
|
||||
// CHECK-LABEL-OLDPM: Helper
|
||||
// CHECK-OLDPM: br label
|
||||
// CHECK-OLDPM: load i32*
|
||||
// CHECK-OLDPM: load i32*
|
||||
// CHECK-OLDPM: load i32*
|
||||
// CHECK-OLDPM: load i32*
|
||||
// CHECK-OLDPM: load i32*
|
||||
// CHECK-OLDPM: ret
|
||||
// CHECK-LABEL: Helper
|
||||
// CHECK: br label
|
||||
// CHECK-NOT: br i1
|
||||
// CHECK: br label
|
||||
|
|
|
@ -479,14 +479,6 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
|||
if (EnableOpt)
|
||||
PM.add(createAMDGPUPromoteAllocaToVector());
|
||||
});
|
||||
|
||||
Builder.addExtension(
|
||||
PassManagerBuilder::EP_LoopOptimizerEnd,
|
||||
[](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
|
||||
// Add SROA after loop unrolling as more promotable patterns are
|
||||
// exposed after small loops are fully unrolled.
|
||||
PM.add(createSROAPass());
|
||||
});
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -459,6 +459,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
|
|||
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
|
||||
// This ends the loop pass pipelines.
|
||||
|
||||
// Break up allocas that may now be splittable after loop unrolling.
|
||||
MPM.add(createSROAPass());
|
||||
|
||||
if (OptLevel > 1) {
|
||||
MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
|
||||
MPM.add(NewGVN ? createNewGVNPass()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s
|
||||
; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s
|
||||
|
||||
; REQUIRES: asserts
|
||||
|
||||
|
@ -22,7 +22,7 @@
|
|||
; CHECK-NEXT: Target Library Information
|
||||
; CHECK-NEXT: Target Transform Information
|
||||
; Target Pass Configuration
|
||||
; CHECK: Type-Based Alias Analysis
|
||||
; CHECK: Type-Based Alias Analysis
|
||||
; CHECK-NEXT: Scoped NoAlias Alias Analysis
|
||||
; CHECK-NEXT: Assumption Cache Tracker
|
||||
; CHECK-NEXT: Profile summary info
|
||||
|
@ -134,6 +134,8 @@
|
|||
; CHECK-NEXT: Recognize loop idioms
|
||||
; CHECK-NEXT: Delete dead loops
|
||||
; CHECK-NEXT: Unroll loops
|
||||
; CHECK-NEXT: SROA
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: MergedLoadStoreMotion
|
||||
; CHECK-NEXT: Phi Values Analysis
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
|
|
|
@ -139,6 +139,8 @@
|
|||
; CHECK-NEXT: Recognize loop idioms
|
||||
; CHECK-NEXT: Delete dead loops
|
||||
; CHECK-NEXT: Unroll loops
|
||||
; CHECK-NEXT: SROA
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: MergedLoadStoreMotion
|
||||
; CHECK-NEXT: Phi Values Analysis
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
|
|
|
@ -139,6 +139,8 @@
|
|||
; CHECK-NEXT: Recognize loop idioms
|
||||
; CHECK-NEXT: Delete dead loops
|
||||
; CHECK-NEXT: Unroll loops
|
||||
; CHECK-NEXT: SROA
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: MergedLoadStoreMotion
|
||||
; CHECK-NEXT: Phi Values Analysis
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
|
|
|
@ -120,6 +120,8 @@
|
|||
; CHECK-NEXT: Recognize loop idioms
|
||||
; CHECK-NEXT: Delete dead loops
|
||||
; CHECK-NEXT: Unroll loops
|
||||
; CHECK-NEXT: SROA
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
; CHECK-NEXT: MergedLoadStoreMotion
|
||||
; CHECK-NEXT: Phi Values Analysis
|
||||
; CHECK-NEXT: Function Alias Analysis Results
|
||||
|
|
|
@ -22,55 +22,21 @@ target triple = "x86_64-unknown-linux-gnu"
|
|||
%"struct.std::array" = type { [6 x i32] }
|
||||
|
||||
define dso_local void @_Z3fooi(i32 %cnt) {
|
||||
; OLDPM-LABEL: @_Z3fooi(
|
||||
; OLDPM-NEXT: entry:
|
||||
; OLDPM-NEXT: [[ARR:%.*]] = alloca %"struct.std::array", align 16
|
||||
; OLDPM-NEXT: [[TMP0:%.*]] = bitcast %"struct.std::array"* [[ARR]] to i8*
|
||||
; OLDPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull [[TMP0]])
|
||||
; OLDPM-NEXT: [[ARRAYDECAY_I_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 0
|
||||
; OLDPM-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 1
|
||||
; OLDPM-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 2
|
||||
; OLDPM-NEXT: [[INCDEC_PTR_2:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 3
|
||||
; OLDPM-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[CNT:%.*]], i32 0
|
||||
; OLDPM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
; OLDPM-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 2, i32 3, i32 4>
|
||||
; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %"struct.std::array"* [[ARR]] to <4 x i32>*
|
||||
; OLDPM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
|
||||
; OLDPM-NEXT: [[INCDEC_PTR_3:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 4
|
||||
; OLDPM-NEXT: [[INC_4:%.*]] = add nsw i32 [[CNT]], 5
|
||||
; OLDPM-NEXT: store i32 [[INC_4]], i32* [[INCDEC_PTR_3]], align 16
|
||||
; OLDPM-NEXT: [[INCDEC_PTR_4:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* [[ARR]], i64 0, i32 0, i64 5
|
||||
; OLDPM-NEXT: [[INC_5:%.*]] = add nsw i32 [[CNT]], 6
|
||||
; OLDPM-NEXT: store i32 [[INC_5]], i32* [[INCDEC_PTR_4]], align 4
|
||||
; OLDPM-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYDECAY_I_I_I]], align 16
|
||||
; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP5]])
|
||||
; OLDPM-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
|
||||
; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP6]])
|
||||
; OLDPM-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR_1]], align 8
|
||||
; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP7]])
|
||||
; OLDPM-NEXT: [[TMP8:%.*]] = load i32, i32* [[INCDEC_PTR_2]], align 4
|
||||
; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP8]])
|
||||
; OLDPM-NEXT: [[TMP9:%.*]] = load i32, i32* [[INCDEC_PTR_3]], align 16
|
||||
; OLDPM-NEXT: call void @_Z3usei(i32 [[TMP9]])
|
||||
; OLDPM-NEXT: call void @_Z3usei(i32 [[INC_5]])
|
||||
; OLDPM-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]])
|
||||
; OLDPM-NEXT: ret void
|
||||
;
|
||||
; NEWPM-LABEL: @_Z3fooi(
|
||||
; NEWPM-NEXT: entry:
|
||||
; NEWPM-NEXT: [[INC:%.*]] = add nsw i32 [[CNT:%.*]], 1
|
||||
; NEWPM-NEXT: [[INC_1:%.*]] = add nsw i32 [[CNT]], 2
|
||||
; NEWPM-NEXT: [[INC_2:%.*]] = add nsw i32 [[CNT]], 3
|
||||
; NEWPM-NEXT: [[INC_3:%.*]] = add nsw i32 [[CNT]], 4
|
||||
; NEWPM-NEXT: [[INC_4:%.*]] = add nsw i32 [[CNT]], 5
|
||||
; NEWPM-NEXT: [[INC_5:%.*]] = add nsw i32 [[CNT]], 6
|
||||
; NEWPM-NEXT: call void @_Z3usei(i32 [[INC]])
|
||||
; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_1]])
|
||||
; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_2]])
|
||||
; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_3]])
|
||||
; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_4]])
|
||||
; NEWPM-NEXT: call void @_Z3usei(i32 [[INC_5]])
|
||||
; NEWPM-NEXT: ret void
|
||||
; CHECK-LABEL: @_Z3fooi(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[CNT:%.*]], 1
|
||||
; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 [[CNT]], 2
|
||||
; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[CNT]], 3
|
||||
; CHECK-NEXT: [[INC_3:%.*]] = add nsw i32 [[CNT]], 4
|
||||
; CHECK-NEXT: [[INC_4:%.*]] = add nsw i32 [[CNT]], 5
|
||||
; CHECK-NEXT: [[INC_5:%.*]] = add nsw i32 [[CNT]], 6
|
||||
; CHECK-NEXT: call void @_Z3usei(i32 [[INC]])
|
||||
; CHECK-NEXT: call void @_Z3usei(i32 [[INC_1]])
|
||||
; CHECK-NEXT: call void @_Z3usei(i32 [[INC_2]])
|
||||
; CHECK-NEXT: call void @_Z3usei(i32 [[INC_3]])
|
||||
; CHECK-NEXT: call void @_Z3usei(i32 [[INC_4]])
|
||||
; CHECK-NEXT: call void @_Z3usei(i32 [[INC_5]])
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%cnt.addr = alloca i32
|
||||
|
|
Loading…
Reference in New Issue