llvm-project/llvm/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll

; Verifies where the split GEPs end up: llc is run at -O3 with the AArch64 GEP
; optimization enabled, the IR is printed after codegenprepare, and that dump
; (stdout and stderr are both redirected into %t) is then matched by FileCheck.
; RUN: llc -O3 -aarch64-enable-gep-opt=true  -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s
; NOTE(review): the asserts requirement is presumably needed because the checks
; match IR value names such as %uglygep -- confirm.
; REQUIRES: asserts
target triple = "aarch64--linux-android"

; Two i32 fields followed by a 256-element and a 257-element i32 array. Field 3
; therefore starts at byte offset 4 + 4 + 256*4 = 1032, which is the constant
; offset the checks in @test1 look for.
%typeD = type { i32, i32, [256 x i32], [257 x i32] }

; Endless interval-halving loop over field 3 of %s. Each iteration loads the
; element at the midpoint of (%1, %0), compares it with the value of field 0
; loaded once in the entry block, and narrows the interval accordingly. When the
; interval width reaches 1 the lower bound is stored to field 1 and the search
; restarts from (0, 256). The function contains no ret instruction.
;
; The checks expect the constant-offset part of the field-3 address (+1032
; bytes) to be computed in the entry block, with only the variable-offset GEP
; left inside the loop body.
; Function Attrs: noreturn nounwind uwtable
define i32 @test1(%typeD* nocapture %s) {
entry:
; The constant-offset GEP must appear here, before the branch into the loop.
; CHECK-LABEL: entry:
; CHECK:    %uglygep = getelementptr i8, i8* %0, i64 1032
; CHECK:    br label %do.body.i


  ; %tPos -> field 0 (the value compared against), %k0 -> field 1 (the result slot).
  %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0
  %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1
  ; Field 0 is loaded once, outside the loop.
  %.pre = load i32, i32* %tPos, align 4
  br label %do.body.i

do.body.i:
; Inside the loop only the variable-offset GEP based on %uglygep may remain;
; a GEP adding the constant 1032 here would mean it was not moved out.
; CHECK-LABEL: do.body.i:
; CHECK:          %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3
; CHECK-NEXT:     %4 = bitcast i8* %uglygep2 to i32*
; CHECK-NOT:      %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032


  ; %0 is the upper bound (starts at 256), %1 the lower bound (starts at 0).
  %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ]
  %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ]
  ; Midpoint = (lower + upper) >> 1, sign-extended to index the array.
  %add.i = add nsw i32 %1, %0
  %shr.i = ashr i32 %add.i, 1
  %idxprom.i = sext i32 %shr.i to i64
  ; Address of field 3 [midpoint]: this is the GEP the pass under test splits
  ; into a constant part and a variable part.
  %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i
  %2 = load i32, i32* %arrayidx.i, align 4
  ; If element <= field 0, raise the lower bound to the midpoint; otherwise
  ; lower the upper bound to the midpoint.
  %cmp.i = icmp sle i32 %2, %.pre
  %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i
  %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1
  ; Leave the inner search once the bounds are exactly 1 apart.
  %sub.i = sub nsw i32 %na.1.i, %nb.1.i
  %cmp1.i = icmp eq i32 %sub.i, 1
  br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge

do.body.i.backedge:
  ; Continue with the narrowed bounds, or restart from (0, 256) when arriving
  ; from %fooo.exit.
  %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ]
  %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ]
  br label %do.body.i

fooo.exit:                              ; preds = %do.body.i
  ; Record the final lower bound in field 1, then loop again.
  store i32 %nb.1.i, i32* %k0, align 4
  br label %do.body.i.backedge
}
[AArch64] Register passes so they can be run by llc Initialize all AArch64-specific passes in the TargetMachine so they can be run by llc. This can lead to conflicts in opt with some command line options that share the same name as the pass, so I took this opportunity to do some cleanups: * rename all relevant command line options from "aarch64-blah" to "aarch64-enable-blah" and update the tests accordingly * run clang-format on their declarations * move all these declarations to a common place (the TargetMachine) as opposed to having them scattered around (AArch64BranchRelaxation and AArch64AddressTypePromotion were the only offenders) llvm-svn: 277322 2016-08-01 13:56:57 +08:00			`; RUN: llc -O3 -aarch64-enable-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s`
Swap loop invariant GEP with loop variant GEP to allow more LICM. This patch changes the order of GEPs generated by Splitting GEPs pass, especially when one of the GEPs has constant and the base is loop invariant, then we will generate the GEP with constant first when beneficial, to expose more cases for LICM. If Splitting GEP originally generated the following: do.body.i: %idxprom.i = sext i32 %shr.i to i64 %2 = bitcast %typeD* %s to i8* %3 = shl i64 %idxprom.i, 2 %uglygep = getelementptr i8, i8* %2, i64 %3 %uglygep7 = getelementptr i8, i8* %uglygep, i64 1032 ... Now it generates: do.body.i: %idxprom.i = sext i32 %shr.i to i64 %2 = bitcast %typeD* %s to i8* %3 = shl i64 %idxprom.i, 2 %uglygep = getelementptr i8, i8* %2, i64 1032 %uglygep7 = getelementptr i8, i8* %uglygep, i64 %3 ... For no-loop cases, the original way of generating GEPs seems to expose more CSE cases, so we don't change the logic for no-loop cases, and only limit our change to the specific case we are interested in. llvm-svn: 248420 2015-09-24 03:25:30 +08:00			`; REQUIRES: asserts`
			`target triple = "aarch64--linux-android"`

			`%typeD = type { i32, i32, [256 x i32], [257 x i32] }`

			`; Function Attrs: noreturn nounwind uwtable`
			`define i32 @test1(%typeD* nocapture %s) {`
			`entry:`
			`; CHECK-LABEL: entry:`
			`; CHECK: %uglygep = getelementptr i8, i8* %0, i64 1032`
			`; CHECK: br label %do.body.i`


			`%tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0`
			`%k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1`
			`%.pre = load i32, i32* %tPos, align 4`
			`br label %do.body.i`

			`do.body.i:`
			`; CHECK-LABEL: do.body.i:`
[AArch64] Use LateSimplifyCFG after expanding atomic operations. Summary: After r308422 we defer optimizations that can destroy loop canonical forms to LateSimplifyCFG. Running LateSimplifyCFG after expanding atomic operations can exploit more control-flow opportunities. Reviewers: mcrosier, t.p.northover, efriedma Reviewed By: efriedma Subscribers: aemerson, rengolin, javed.absar, llvm-commits, kristof.beyls Differential Revision: https://reviews.llvm.org/D38262 llvm-svn: 314857 2017-10-04 06:39:24 +08:00			`; CHECK: %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3`
			`; CHECK-NEXT: %4 = bitcast i8* %uglygep2 to i32*`
			`; CHECK-NOT: %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032`
Swap loop invariant GEP with loop variant GEP to allow more LICM. This patch changes the order of GEPs generated by Splitting GEPs pass, especially when one of the GEPs has constant and the base is loop invariant, then we will generate the GEP with constant first when beneficial, to expose more cases for LICM. If Splitting GEP originally generated the following: do.body.i: %idxprom.i = sext i32 %shr.i to i64 %2 = bitcast %typeD* %s to i8* %3 = shl i64 %idxprom.i, 2 %uglygep = getelementptr i8, i8* %2, i64 %3 %uglygep7 = getelementptr i8, i8* %uglygep, i64 1032 ... Now it generates: do.body.i: %idxprom.i = sext i32 %shr.i to i64 %2 = bitcast %typeD* %s to i8* %3 = shl i64 %idxprom.i, 2 %uglygep = getelementptr i8, i8* %2, i64 1032 %uglygep7 = getelementptr i8, i8* %uglygep, i64 %3 ... For no-loop cases, the original way of generating GEPs seems to expose more CSE cases, so we don't change the logic for no-loop cases, and only limit our change to the specific case we are interested in. llvm-svn: 248420 2015-09-24 03:25:30 +08:00

			`%0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ]`
			`%1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ]`
			`%add.i = add nsw i32 %1, %0`
			`%shr.i = ashr i32 %add.i, 1`
			`%idxprom.i = sext i32 %shr.i to i64`
			`%arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i`
			`%2 = load i32, i32* %arrayidx.i, align 4`
			`%cmp.i = icmp sle i32 %2, %.pre`
			`%na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i`
			`%nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1`
			`%sub.i = sub nsw i32 %na.1.i, %nb.1.i`
			`%cmp1.i = icmp eq i32 %sub.i, 1`
			`br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge`

			`do.body.i.backedge:`
			`%.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ]`
			`%.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ]`
			`br label %do.body.i`

			`fooo.exit: ; preds = %do.body.i`
			`store i32 %nb.1.i, i32* %k0, align 4`
			`br label %do.body.i.backedge`
			`}`