forked from OSchip/llvm-project
[AArch64] Move SeparateConstOffsetFromGEPPass before LSR and enable EnableGEPOpt by default.
GEPs across basic blocks were not being split because EnableGEPOpt was turned off by default. Hence, EarlyCSE missed the opportunity to eliminate the common part of the GEPs. This can be achieved by simply turning the GEP pass on. - This patch moves SeparateConstOffsetFromGEPPass() to just before LSR. - It enables EnableGEPOpt by default. Resolves: https://github.com/llvm/llvm-project/issues/50528 Added a unit test. Differential Revision: https://reviews.llvm.org/D128582
This commit is contained in:
parent
1b7feac2a6
commit
f55dbfbd9d
|
@ -128,7 +128,7 @@ static cl::opt<bool>
|
||||||
static cl::opt<bool>
|
static cl::opt<bool>
|
||||||
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
|
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
|
||||||
cl::desc("Enable optimizations on complex GEPs"),
|
cl::desc("Enable optimizations on complex GEPs"),
|
||||||
cl::init(false));
|
cl::init(true));
|
||||||
|
|
||||||
static cl::opt<bool>
|
static cl::opt<bool>
|
||||||
BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
|
BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
|
||||||
|
@ -563,17 +563,6 @@ void AArch64PassConfig::addIRPasses() {
|
||||||
addPass(createFalkorMarkStridedAccessesPass());
|
addPass(createFalkorMarkStridedAccessesPass());
|
||||||
}
|
}
|
||||||
|
|
||||||
TargetPassConfig::addIRPasses();
|
|
||||||
|
|
||||||
addPass(createAArch64StackTaggingPass(
|
|
||||||
/*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
|
|
||||||
|
|
||||||
// Match interleaved memory accesses to ldN/stN intrinsics.
|
|
||||||
if (TM->getOptLevel() != CodeGenOpt::None) {
|
|
||||||
addPass(createInterleavedLoadCombinePass());
|
|
||||||
addPass(createInterleavedAccessPass());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
|
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
|
||||||
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
|
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
|
||||||
// and lower a GEP with multiple indices to either arithmetic operations or
|
// and lower a GEP with multiple indices to either arithmetic operations or
|
||||||
|
@ -587,6 +576,17 @@ void AArch64PassConfig::addIRPasses() {
|
||||||
addPass(createLICMPass());
|
addPass(createLICMPass());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TargetPassConfig::addIRPasses();
|
||||||
|
|
||||||
|
addPass(createAArch64StackTaggingPass(
|
||||||
|
/*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
|
||||||
|
|
||||||
|
// Match interleaved memory accesses to ldN/stN intrinsics.
|
||||||
|
if (TM->getOptLevel() != CodeGenOpt::None) {
|
||||||
|
addPass(createInterleavedLoadCombinePass());
|
||||||
|
addPass(createInterleavedAccessPass());
|
||||||
|
}
|
||||||
|
|
||||||
// Add Control Flow Guard checks.
|
// Add Control Flow Guard checks.
|
||||||
if (TM->getTargetTriple().isOSWindows())
|
if (TM->getTargetTriple().isOSWindows())
|
||||||
addPass(createCFGuardCheckPass());
|
addPass(createCFGuardCheckPass());
|
||||||
|
|
|
@ -7,45 +7,51 @@ target triple = "aarch64--"
|
||||||
define i32 @cse_gep([4 x i32]* %ptr, i32 %idx) {
|
define i32 @cse_gep([4 x i32]* %ptr, i32 %idx) {
|
||||||
; O0-LABEL: name: cse_gep
|
; O0-LABEL: name: cse_gep
|
||||||
; O0: bb.1 (%ir-block.0):
|
; O0: bb.1 (%ir-block.0):
|
||||||
; O0: liveins: $w1, $x0
|
; O0-NEXT: liveins: $w1, $x0
|
||||||
; O0: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
|
; O0-NEXT: {{ $}}
|
||||||
; O0: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
|
; O0-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
|
||||||
; O0: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
|
; O0-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
|
||||||
; O0: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
|
; O0-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
|
||||||
; O0: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
|
; O0-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
|
||||||
; O0: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
|
; O0-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
|
||||||
; O0: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
|
; O0-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
|
||||||
; O0: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
|
; O0-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
|
||||||
; O0: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
|
; O0-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
|
||||||
; O0: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL1]](s64)
|
; O0-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
|
||||||
; O0: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
; O0-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL1]](s64)
|
||||||
; O0: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
|
; O0-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
||||||
; O0: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.gep2)
|
; O0-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64)
|
||||||
; O0: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[LOAD1]]
|
; O0-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.gep2)
|
||||||
; O0: $w0 = COPY [[ADD]](s32)
|
; O0-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD1]]
|
||||||
; O0: RET_ReallyLR implicit $w0
|
; O0-NEXT: $w0 = COPY [[ADD]](s32)
|
||||||
|
; O0-NEXT: RET_ReallyLR implicit $w0
|
||||||
; O3-LABEL: name: cse_gep
|
; O3-LABEL: name: cse_gep
|
||||||
; O3: bb.1 (%ir-block.0):
|
; O3: bb.1 (%ir-block.0):
|
||||||
; O3: liveins: $w1, $x0
|
; O3-NEXT: liveins: $w1, $x0
|
||||||
; O3: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
|
; O3-NEXT: {{ $}}
|
||||||
; O3: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
|
; O3-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
|
||||||
; O3: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
|
; O3-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
|
||||||
; O3: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
|
; O3-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
||||||
; O3: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]
|
; O3-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
|
||||||
; O3: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
|
; O3-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
|
||||||
; O3: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
|
; O3-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C1]]
|
||||||
; O3: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
|
; O3-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[MUL]](s64)
|
||||||
; O3: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
; O3-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
|
||||||
; O3: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C1]](s64)
|
; O3-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY2]](p0) :: (load (s32) from %ir.gep1)
|
||||||
; O3: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s32) from %ir.gep2)
|
; O3-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C]](s64)
|
||||||
; O3: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[LOAD1]]
|
; O3-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SHL]](s64)
|
||||||
; O3: $w0 = COPY [[ADD]](s32)
|
; O3-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY [[PTR_ADD1]](p0)
|
||||||
; O3: RET_ReallyLR implicit $w0
|
; O3-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
|
||||||
|
; O3-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY3]], [[C2]](s64)
|
||||||
|
; O3-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %ir.3)
|
||||||
|
; O3-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD1]]
|
||||||
|
; O3-NEXT: $w0 = COPY [[ADD]](s32)
|
||||||
|
; O3-NEXT: RET_ReallyLR implicit $w0
|
||||||
%sidx = sext i32 %idx to i64
|
%sidx = sext i32 %idx to i64
|
||||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i64 %sidx, i64 0
|
%gep1 = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i64 %sidx, i64 0
|
||||||
%v1 = load i32, i32* %gep1
|
%v1 = load i32, i32* %gep1
|
||||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i64 %sidx, i64 1
|
%gep2 = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i64 %sidx, i64 1
|
||||||
%v2 = load i32, i32* %gep2
|
%v2 = load i32, i32* %gep2
|
||||||
%res = add i32 %v2, %v2
|
%res = add i32 %v1, %v2
|
||||||
ret i32 %res
|
ret i32 %res
|
||||||
}
|
}
|
||||||
|
|
|
@ -1458,10 +1458,12 @@ define void @test_lifetime_intrin() {
|
||||||
; O3-LABEL: name: test_lifetime_intrin
|
; O3-LABEL: name: test_lifetime_intrin
|
||||||
; O3: {{%[0-9]+}}:_(p0) = G_FRAME_INDEX %stack.0.slot
|
; O3: {{%[0-9]+}}:_(p0) = G_FRAME_INDEX %stack.0.slot
|
||||||
; O3-NEXT: LIFETIME_START %stack.0.slot
|
; O3-NEXT: LIFETIME_START %stack.0.slot
|
||||||
|
; O3-NEXT: G_STORE
|
||||||
; O3-NEXT: LIFETIME_END %stack.0.slot
|
; O3-NEXT: LIFETIME_END %stack.0.slot
|
||||||
; O3-NEXT: RET_ReallyLR
|
; O3-NEXT: RET_ReallyLR
|
||||||
%slot = alloca i8, i32 4
|
%slot = alloca i8, i32 4
|
||||||
call void @llvm.lifetime.start.p0i8(i64 0, i8* %slot)
|
call void @llvm.lifetime.start.p0i8(i64 0, i8* %slot)
|
||||||
|
store volatile i8 10, i8* %slot
|
||||||
call void @llvm.lifetime.end.p0i8(i64 0, i8* %slot)
|
call void @llvm.lifetime.end.p0i8(i64 0, i8* %slot)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,9 +33,20 @@
|
||||||
; CHECK-NEXT: Scalar Evolution Analysis
|
; CHECK-NEXT: Scalar Evolution Analysis
|
||||||
; CHECK-NEXT: Loop Data Prefetch
|
; CHECK-NEXT: Loop Data Prefetch
|
||||||
; CHECK-NEXT: Falkor HW Prefetch Fix
|
; CHECK-NEXT: Falkor HW Prefetch Fix
|
||||||
; CHECK-NEXT: Module Verifier
|
; CHECK-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
|
||||||
|
; CHECK-NEXT: Early CSE
|
||||||
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
|
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||||
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
|
; CHECK-NEXT: Memory SSA
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
|
; CHECK-NEXT: LCSSA Verifier
|
||||||
|
; CHECK-NEXT: Loop-Closed SSA Form Pass
|
||||||
|
; CHECK-NEXT: Scalar Evolution Analysis
|
||||||
|
; CHECK-NEXT: Lazy Branch Probability Analysis
|
||||||
|
; CHECK-NEXT: Lazy Block Frequency Analysis
|
||||||
|
; CHECK-NEXT: Loop Pass Manager
|
||||||
|
; CHECK-NEXT: Loop Invariant Code Motion
|
||||||
|
; CHECK-NEXT: Module Verifier
|
||||||
; CHECK-NEXT: Loop Pass Manager
|
; CHECK-NEXT: Loop Pass Manager
|
||||||
; CHECK-NEXT: Canonicalize Freeze Instructions in Loops
|
; CHECK-NEXT: Canonicalize Freeze Instructions in Loops
|
||||||
; CHECK-NEXT: Induction Variable Users
|
; CHECK-NEXT: Induction Variable Users
|
||||||
|
|
|
@ -27,15 +27,16 @@ L2:
|
||||||
define void @test_add_cbz_multiple_use(i32 %a, i32 %b, i32* %ptr) {
|
define void @test_add_cbz_multiple_use(i32 %a, i32 %b, i32* %ptr) {
|
||||||
; CHECK-LABEL: test_add_cbz_multiple_use:
|
; CHECK-LABEL: test_add_cbz_multiple_use:
|
||||||
; CHECK: // %bb.0: // %common.ret
|
; CHECK: // %bb.0: // %common.ret
|
||||||
; CHECK-NEXT: adds w8, w0, w1
|
; CHECK-NEXT: mov w8, #10
|
||||||
; CHECK-NEXT: csel w8, wzr, w8, ne
|
; CHECK-NEXT: adds w9, w0, w1
|
||||||
|
; CHECK-NEXT: csel w8, w8, w9, ne
|
||||||
; CHECK-NEXT: str w8, [x2]
|
; CHECK-NEXT: str w8, [x2]
|
||||||
; CHECK-NEXT: ret
|
; CHECK-NEXT: ret
|
||||||
%c = add nsw i32 %a, %b
|
%c = add nsw i32 %a, %b
|
||||||
%d = icmp ne i32 %c, 0
|
%d = icmp ne i32 %c, 0
|
||||||
br i1 %d, label %L1, label %L2
|
br i1 %d, label %L1, label %L2
|
||||||
L1:
|
L1:
|
||||||
store i32 0, i32* %ptr, align 4
|
store i32 10, i32* %ptr, align 4
|
||||||
ret void
|
ret void
|
||||||
L2:
|
L2:
|
||||||
store i32 %c, i32* %ptr, align 4
|
store i32 %c, i32* %ptr, align 4
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
if not 'AArch64' in config.root.targets:
|
||||||
|
config.unsupported = True
|
|
@ -0,0 +1,32 @@
|
||||||
|
; RUN: llc < %s -O3 -mtriple=aarch64-linux-gnu | FileCheck %s
|
||||||
|
|
||||||
|
%struct = type { i32, i32, i32 }
|
||||||
|
|
||||||
|
define i32 @test1(%struct* %ptr, i64 %idx) {
|
||||||
|
; CHECK-LABEL: test1:
|
||||||
|
; CHECK: // %bb.0:
|
||||||
|
; CHECK-NEXT: mov w8, #12
|
||||||
|
; CHECK-NEXT: madd x8, x1, x8, x0
|
||||||
|
; CHECK-NEXT: ldr w9, [x8, #4]
|
||||||
|
; CHECK-NEXT: tbnz w9, #31, .LBB0_2
|
||||||
|
; CHECK-NEXT: // %bb.1:
|
||||||
|
; CHECK-NEXT: mov w0, wzr
|
||||||
|
; CHECK-NEXT: ret
|
||||||
|
; CHECK-NEXT: .LBB0_2: // %then
|
||||||
|
; CHECK-NEXT: ldr w8, [x8, #8]
|
||||||
|
; CHECK-NEXT: add w0, w9, w8
|
||||||
|
; CHECK-NEXT: ret
|
||||||
|
%gep.1 = getelementptr %struct, %struct* %ptr, i64 %idx, i32 1
|
||||||
|
%lv.1 = load i32, i32* %gep.1
|
||||||
|
%c = icmp slt i32 %lv.1, 0
|
||||||
|
br i1 %c, label %then, label %else
|
||||||
|
|
||||||
|
then:
|
||||||
|
%gep.2 = getelementptr %struct, %struct* %ptr, i64 %idx, i32 2
|
||||||
|
%lv.2 = load i32, i32* %gep.2
|
||||||
|
%res = add i32 %lv.1, %lv.2
|
||||||
|
ret i32 %res
|
||||||
|
|
||||||
|
else:
|
||||||
|
ret i32 0
|
||||||
|
}
|
Loading…
Reference in New Issue