llvm-project/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1171 lines
42 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; Shrink-wrapping test for AArch64. The pass was introduced by llvm-svn 236507
; (http://reviews.llvm.org/D9210); it looks for save/restore points for the
; prologue and epilogue that are cheaper than the entry and exit blocks.
;
; Both invocations below run llc on this file and differ in the value given to
; -enable-shrink-wrap: the ENABLE prefix checks the output with the option set
; to true, the DISABLE prefix checks it with the option set to false. The second
; invocation passes no -mtriple; the module's own target triple names the same
; arm64-apple-ios target.
;
; RUN: llc -debugify-and-strip-all-safe %s -o - -mtriple=arm64-apple-ios -enable-shrink-wrap=true -disable-post-ra -frame-pointer=non-leaf | FileCheck %s --check-prefix=ENABLE
; RUN: llc -debugify-and-strip-all-safe %s -o - -enable-shrink-wrap=false -disable-post-ra -frame-pointer=non-leaf | FileCheck %s --check-prefix=DISABLE
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios"
; Initial motivating example: a simple diamond with a call on just one side.
;
; Only the %true block calls @doSomething, and it passes the address of the
; stack slot %tmp. With the ENABLE prefix the frame setup and teardown are
; expected inside %true, after the cmp/b.ge, so the path that skips the call
; reaches the ret without touching the stack. With the DISABLE prefix the setup
; is expected before the compare and the teardown in %false.
define i32 @foo(i32 %a, i32 %b) {
; ENABLE-LABEL: foo:
; ENABLE: ; %bb.0:
; ENABLE-NEXT: cmp w0, w1
; ENABLE-NEXT: b.ge LBB0_2
; ENABLE-NEXT: ; %bb.1: ; %true
; ENABLE-NEXT: sub sp, sp, #32 ; =32
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: stur w0, [x29, #-4]
; ENABLE-NEXT: sub x1, x29, #4 ; =4
; ENABLE-NEXT: mov w0, wzr
; ENABLE-NEXT: bl _doSomething
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: add sp, sp, #32 ; =32
; ENABLE-NEXT: LBB0_2: ; %false
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: foo:
; DISABLE: ; %bb.0:
; DISABLE-NEXT: sub sp, sp, #32 ; =32
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: cmp w0, w1
; DISABLE-NEXT: b.ge LBB0_2
; DISABLE-NEXT: ; %bb.1: ; %true
; DISABLE-NEXT: stur w0, [x29, #-4]
; DISABLE-NEXT: sub x1, x29, #4 ; =4
; DISABLE-NEXT: mov w0, wzr
; DISABLE-NEXT: bl _doSomething
; DISABLE-NEXT: LBB0_2: ; %false
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: add sp, sp, #32 ; =32
; DISABLE-NEXT: ret
  %tmp = alloca i32, align 4
  %tmp2 = icmp slt i32 %a, %b
  br i1 %tmp2, label %true, label %false
true:
  store i32 %a, i32* %tmp, align 4
  %tmp4 = call i32 @doSomething(i32 0, i32* %tmp)
  br label %false
false:
  ; %0 is the unnamed entry block: on that edge the result is simply %a.
  %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ]
  ret i32 %tmp.0
}
; External function called from @foo with an i32 and the address of an i32.
; The declaration itself carries no function attributes.
declare i32 @doSomething(i32, i32*)
; Check that the restore is not placed inside the loop while the save is
; outside of it.
;
; %for.body calls @something ten times (the counter starts at 0 and the loop
; exits once the incremented value equals 10); %if.else performs no call.
; With the ENABLE prefix the cbz comes first, the prologue is expected in
; %for.body.preheader and the epilogue in %for.end, after the loop, while
; %if.else returns without any frame code. With the DISABLE prefix the prologue
; is expected in the entry block and a single epilogue in %if.end.
define i32 @freqSaveAndRestoreOutsideLoop(i32 %cond, i32 %N) {
; ENABLE-LABEL: freqSaveAndRestoreOutsideLoop:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbz w0, LBB1_4
; ENABLE-NEXT: ; %bb.1: ; %for.body.preheader
; ENABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: .cfi_offset w19, -24
; ENABLE-NEXT: .cfi_offset w20, -32
; ENABLE-NEXT: mov w19, wzr
; ENABLE-NEXT: mov w20, #10
; ENABLE-NEXT: LBB1_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: bl _something
; ENABLE-NEXT: subs w20, w20, #1 ; =1
; ENABLE-NEXT: add w19, w0, w19
; ENABLE-NEXT: b.ne LBB1_2
; ENABLE-NEXT: ; %bb.3: ; %for.end
; ENABLE-NEXT: lsl w0, w19, #3
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB1_4: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: freqSaveAndRestoreOutsideLoop:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: .cfi_offset w19, -24
; DISABLE-NEXT: .cfi_offset w20, -32
; DISABLE-NEXT: cbz w0, LBB1_4
; DISABLE-NEXT: ; %bb.1: ; %for.body.preheader
; DISABLE-NEXT: mov w19, wzr
; DISABLE-NEXT: mov w20, #10
; DISABLE-NEXT: LBB1_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: bl _something
; DISABLE-NEXT: subs w20, w20, #1 ; =1
; DISABLE-NEXT: add w19, w0, w19
; DISABLE-NEXT: b.ne LBB1_2
; DISABLE-NEXT: ; %bb.3: ; %for.end
; DISABLE-NEXT: lsl w0, w19, #3
; DISABLE-NEXT: b LBB1_5
; DISABLE-NEXT: LBB1_4: ; %if.else
; DISABLE-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: LBB1_5: ; %if.end
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %for.body
for.body: ; preds = %entry, %for.body
  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %sum.04 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  ; @something is declared with a (...) prototype, hence the bitcast to i32 ().
  %call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)()
  %add = add nsw i32 %call, %sum.04
  %inc = add nuw nsw i32 %i.05, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
  %shl = shl i32 %add, 3
  br label %if.end
if.else: ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end
if.end: ; preds = %if.else, %for.end
  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
  ret i32 %sum.1
}
; External function declared with a (...) prototype. The loop tests in this
; file call it with no arguments through a bitcast to i32 ().
declare i32 @something(...)
; Check that shrink-wrapping is not performed inside the loop even though that
; would be legal. The cost model must prevent it.
;
; The entry block branches straight into a loop that calls @something ten
; times; the %cond argument is not used by the body. The expected code is the
; same under both prefixes: prologue in the entry block, before the loop, and
; epilogue in %for.end, after it.
define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) {
; ENABLE-LABEL: freqSaveAndRestoreOutsideLoop2:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: .cfi_offset w19, -24
; ENABLE-NEXT: .cfi_offset w20, -32
; ENABLE-NEXT: mov w19, wzr
; ENABLE-NEXT: mov w20, #10
; ENABLE-NEXT: LBB2_1: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: bl _something
; ENABLE-NEXT: subs w20, w20, #1 ; =1
; ENABLE-NEXT: add w19, w0, w19
; ENABLE-NEXT: b.ne LBB2_1
; ENABLE-NEXT: ; %bb.2: ; %for.end
; ENABLE-NEXT: mov w0, w19
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: freqSaveAndRestoreOutsideLoop2:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: .cfi_offset w19, -24
; DISABLE-NEXT: .cfi_offset w20, -32
; DISABLE-NEXT: mov w19, wzr
; DISABLE-NEXT: mov w20, #10
; DISABLE-NEXT: LBB2_1: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: bl _something
; DISABLE-NEXT: subs w20, w20, #1 ; =1
; DISABLE-NEXT: add w19, w0, w19
; DISABLE-NEXT: b.ne LBB2_1
; DISABLE-NEXT: ; %bb.2: ; %for.end
; DISABLE-NEXT: mov w0, w19
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
entry:
  br label %for.body
for.body: ; preds = %for.body, %entry
  %i.04 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %sum.03 = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ; @something is declared with a (...) prototype, hence the bitcast to i32 ().
  %call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)()
  %add = add nsw i32 %call, %sum.03
  %inc = add nuw nsw i32 %i.04, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
  ret i32 %add
}
; Check, with a more complex case, that the save is not placed within the loop
; while the restore is outside of it.
;
; Same shape as @freqSaveAndRestoreOutsideLoop, plus a call to @somethingElse in
; %for.end, after the loop. With the ENABLE prefix the prologue is expected in
; %for.body.preheader, before the loop, and the epilogue in %for.end after the
; call to _somethingElse, while %if.else returns without any frame code. With
; the DISABLE prefix the prologue is expected in the entry block and a single
; epilogue in %if.end.
define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) {
; ENABLE-LABEL: loopInfoSaveOutsideLoop:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbz w0, LBB3_4
; ENABLE-NEXT: ; %bb.1: ; %for.body.preheader
; ENABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: .cfi_offset w19, -24
; ENABLE-NEXT: .cfi_offset w20, -32
; ENABLE-NEXT: mov w19, wzr
; ENABLE-NEXT: mov w20, #10
; ENABLE-NEXT: LBB3_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: bl _something
; ENABLE-NEXT: subs w20, w20, #1 ; =1
; ENABLE-NEXT: add w19, w0, w19
; ENABLE-NEXT: b.ne LBB3_2
; ENABLE-NEXT: ; %bb.3: ; %for.end
; ENABLE-NEXT: bl _somethingElse
; ENABLE-NEXT: lsl w0, w19, #3
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB3_4: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: loopInfoSaveOutsideLoop:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: .cfi_offset w19, -24
; DISABLE-NEXT: .cfi_offset w20, -32
; DISABLE-NEXT: cbz w0, LBB3_4
; DISABLE-NEXT: ; %bb.1: ; %for.body.preheader
; DISABLE-NEXT: mov w19, wzr
; DISABLE-NEXT: mov w20, #10
; DISABLE-NEXT: LBB3_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: bl _something
; DISABLE-NEXT: subs w20, w20, #1 ; =1
; DISABLE-NEXT: add w19, w0, w19
; DISABLE-NEXT: b.ne LBB3_2
; DISABLE-NEXT: ; %bb.3: ; %for.end
; DISABLE-NEXT: bl _somethingElse
; DISABLE-NEXT: lsl w0, w19, #3
; DISABLE-NEXT: b LBB3_5
; DISABLE-NEXT: LBB3_4: ; %if.else
; DISABLE-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: LBB3_5: ; %if.end
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %for.body
for.body: ; preds = %entry, %for.body
  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %sum.04 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  ; @something is declared with a (...) prototype, hence the bitcast to i32 ().
  %call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)()
  %add = add nsw i32 %call, %sum.04
  %inc = add nuw nsw i32 %i.05, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
  ; This call after the loop is what distinguishes the test from
  ; @freqSaveAndRestoreOutsideLoop.
  tail call void bitcast (void (...)* @somethingElse to void ()*)()
  %shl = shl i32 %add, 3
  br label %if.end
if.else: ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end
if.end: ; preds = %if.else, %for.end
  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
  ret i32 %sum.1
}
; External function declared with a (...) prototype and a void result. It is
; called with no arguments through a bitcast to void ().
declare void @somethingElse(...)
; Check with a more complex case that we do not have restore within the loop and
; save outside.
define i32 @loopInfoRestoreOutsideLoop(i32 %cond, i32 %N) nounwind {
; ENABLE-LABEL: loopInfoRestoreOutsideLoop:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbz w0, LBB4_4
; ENABLE-NEXT: ; %bb.1: ; %if.then
; ENABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: bl _somethingElse
; ENABLE-NEXT: mov w19, wzr
; ENABLE-NEXT: mov w20, #10
; ENABLE-NEXT: LBB4_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: bl _something
; ENABLE-NEXT: subs w20, w20, #1 ; =1
; ENABLE-NEXT: add w19, w0, w19
; ENABLE-NEXT: b.ne LBB4_2
; ENABLE-NEXT: ; %bb.3: ; %for.end
; ENABLE-NEXT: lsl w0, w19, #3
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB4_4: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: loopInfoRestoreOutsideLoop:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: cbz w0, LBB4_4
; DISABLE-NEXT: ; %bb.1: ; %if.then
; DISABLE-NEXT: bl _somethingElse
; DISABLE-NEXT: mov w19, wzr
; DISABLE-NEXT: mov w20, #10
; DISABLE-NEXT: LBB4_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: bl _something
; DISABLE-NEXT: subs w20, w20, #1 ; =1
; DISABLE-NEXT: add w19, w0, w19
; DISABLE-NEXT: b.ne LBB4_2
; DISABLE-NEXT: ; %bb.3: ; %for.end
; DISABLE-NEXT: lsl w0, w19, #3
; DISABLE-NEXT: b LBB4_5
; DISABLE-NEXT: LBB4_4: ; %if.else
; DISABLE-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: LBB4_5: ; %if.end
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
[ShrinkWrap] Add (a simplified version) of shrink-wrapping. This patch introduces a new pass that computes the safe point to insert the prologue and epilogue of the function. The interest is to find safe points that are cheaper than the entry and exits blocks. As an example and to avoid regressions to be introduce, this patch also implements the required bits to enable the shrink-wrapping pass for AArch64. ** Context ** Currently we insert the prologue and epilogue of the method/function in the entry and exits blocks. Although this is correct, we can do a better job when those are not immediately required and insert them at less frequently executed places. The job of the shrink-wrapping pass is to identify such places. ** Motivating example ** Let us consider the following function that perform a call only in one branch of a if: define i32 @f(i32 %a, i32 %b) { %tmp = alloca i32, align 4 %tmp2 = icmp slt i32 %a, %b br i1 %tmp2, label %true, label %false true: store i32 %a, i32* %tmp, align 4 %tmp4 = call i32 @doSomething(i32 0, i32* %tmp) br label %false false: %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ] ret i32 %tmp.0 } On AArch64 this code generates (removing the cfi directives to ease readabilities): _f: ; @f ; BB#0: stp x29, x30, [sp, #-16]! mov x29, sp sub sp, sp, #16 ; =16 cmp w0, w1 b.ge LBB0_2 ; BB#1: ; %true stur w0, [x29, #-4] sub x1, x29, #4 ; =4 mov w0, wzr bl _doSomething LBB0_2: ; %false mov sp, x29 ldp x29, x30, [sp], #16 ret With shrink-wrapping we could generate: _f: ; @f ; BB#0: cmp w0, w1 b.ge LBB0_2 ; BB#1: ; %true stp x29, x30, [sp, #-16]! mov x29, sp sub sp, sp, #16 ; =16 stur w0, [x29, #-4] sub x1, x29, #4 ; =4 mov w0, wzr bl _doSomething add sp, x29, #16 ; =16 ldp x29, x30, [sp], #16 LBB0_2: ; %false ret Therefore, we would pay the overhead of setting up/destroying the frame only if we actually do the call. 
** Proposed Solution ** This patch introduces a new machine pass that perform the shrink-wrapping analysis (See the comments at the beginning of ShrinkWrap.cpp for more details). It then stores the safe save and restore point into the MachineFrameInfo attached to the MachineFunction. This information is then used by the PrologEpilogInserter (PEI) to place the related code at the right place. This pass runs right before the PEI. Unlike the original paper of Chow from PLDI’88, this implementation of shrink-wrapping does not use expensive data-flow analysis and does not need hack to properly avoid frequently executed point. Instead, it relies on dominance and loop properties. The pass is off by default and each target can opt-in by setting the EnableShrinkWrap boolean to true in their derived class of TargetPassConfig. This setting can also be overwritten on the command line by using -enable-shrink-wrap. Before you try out the pass for your target, make sure you properly fix your emitProlog/emitEpilog/adjustForXXX method to cope with basic blocks that are not necessarily the entry block. ** Design Decisions ** 1. ShrinkWrap is its own pass right now. It could frankly be merged into PEI but for debugging and clarity I thought it was best to have its own file. 2. Right now, we only support one save point and one restore point. At some point we can expand this to several save point and restore point, the impacted component would then be: - The pass itself: New algorithm needed. - MachineFrameInfo: Hold a list or set of Save/Restore point instead of one pointer. - PEI: Should loop over the save point and restore point. Anyhow, at least for this first iteration, I do not believe this is interesting to support the complex cases. We should revisit that when we motivating examples. Differential Revision: http://reviews.llvm.org/D9210 <rdar://problem/3201744> llvm-svn: 236507
2015-05-06 01:38:16 +08:00
entry:
%tobool = icmp eq i32 %cond, 0
br i1 %tobool, label %if.else, label %if.then
if.then: ; preds = %entry
tail call void bitcast (void (...)* @somethingElse to void ()*)()
br label %for.body
for.body: ; preds = %for.body, %if.then
%i.05 = phi i32 [ 0, %if.then ], [ %inc, %for.body ]
%sum.04 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
%call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)()
%add = add nsw i32 %call, %sum.04
%inc = add nuw nsw i32 %i.05, 1
%exitcond = icmp eq i32 %inc, 10
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
%shl = shl i32 %add, 3
br label %if.end
if.else: ; preds = %entry
%mul = shl nsw i32 %N, 1
br label %if.end
if.end: ; preds = %if.else, %for.end
%sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
ret i32 %sum.1
}
; Check that we handle a function with no frame information correctly.
; Both runs expect the same two-instruction body with no prologue or epilogue.
define i32 @emptyFrame() {
; ENABLE-LABEL: emptyFrame:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: mov w0, wzr
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: emptyFrame:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: mov w0, wzr
; DISABLE-NEXT: ret
entry:
  ret i32 0
}
; Check that we handle variadic functions correctly.
; In the ENABLE run the stack adjustment for the va_list slot is expected in
; %if.then and is undone in %for.end, so %if.else returns without touching sp.
; In the DISABLE run sp is adjusted in %entry and restored on both exits.
define i32 @variadicFunc(i32 %cond, i32 %count, ...) nounwind {
; ENABLE-LABEL: variadicFunc:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbz w0, LBB6_4
; ENABLE-NEXT: ; %bb.1: ; %if.then
; ENABLE-NEXT: sub sp, sp, #16 ; =16
; ENABLE-NEXT: add x8, sp, #16 ; =16
; ENABLE-NEXT: cmp w1, #1 ; =1
; ENABLE-NEXT: str x8, [sp, #8]
; ENABLE-NEXT: mov w0, wzr
; ENABLE-NEXT: b.lt LBB6_3
; ENABLE-NEXT: LBB6_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: ldr x8, [sp, #8]
; ENABLE-NEXT: add x9, x8, #8 ; =8
; ENABLE-NEXT: str x9, [sp, #8]
; ENABLE-NEXT: ldr w8, [x8]
; ENABLE-NEXT: subs w1, w1, #1 ; =1
; ENABLE-NEXT: add w0, w0, w8
; ENABLE-NEXT: b.ne LBB6_2
; ENABLE-NEXT: LBB6_3: ; %for.end
; ENABLE-NEXT: add sp, sp, #16 ; =16
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB6_4: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: variadicFunc:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: sub sp, sp, #16 ; =16
; DISABLE-NEXT: cbz w0, LBB6_4
; DISABLE-NEXT: ; %bb.1: ; %if.then
; DISABLE-NEXT: add x8, sp, #16 ; =16
; DISABLE-NEXT: cmp w1, #1 ; =1
; DISABLE-NEXT: str x8, [sp, #8]
; DISABLE-NEXT: mov w0, wzr
; DISABLE-NEXT: b.lt LBB6_3
; DISABLE-NEXT: LBB6_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: ldr x8, [sp, #8]
; DISABLE-NEXT: add x9, x8, #8 ; =8
; DISABLE-NEXT: str x9, [sp, #8]
; DISABLE-NEXT: ldr w8, [x8]
; DISABLE-NEXT: subs w1, w1, #1 ; =1
; DISABLE-NEXT: add w0, w0, w8
; DISABLE-NEXT: b.ne LBB6_2
; DISABLE-NEXT: LBB6_3: ; %if.end
; DISABLE-NEXT: add sp, sp, #16 ; =16
; DISABLE-NEXT: ret
; DISABLE-NEXT: LBB6_4: ; %if.else
; DISABLE-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: add sp, sp, #16 ; =16
; DISABLE-NEXT: ret
entry:
  ; Storage for the va_list; it is only used on the %if.then path.
  %ap = alloca i8*, align 8
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %if.then

if.then: ; preds = %entry
  %ap1 = bitcast i8** %ap to i8*
  call void @llvm.va_start(i8* %ap1)
  ; Skip the loop entirely when %count is not positive.
  %cmp6 = icmp sgt i32 %count, 0
  br i1 %cmp6, label %for.body, label %for.end

for.body: ; preds = %if.then, %for.body
  ; Sum %count i32 variadic arguments.
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %if.then ]
  %sum.07 = phi i32 [ %add, %for.body ], [ 0, %if.then ]
  %0 = va_arg i8** %ap, i32
  %add = add nsw i32 %sum.07, %0
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body, %if.then
  %sum.0.lcssa = phi i32 [ 0, %if.then ], [ %add, %for.body ]
  call void @llvm.va_end(i8* %ap1)
  br label %if.end

if.else: ; preds = %entry
  %mul = shl nsw i32 %count, 1
  br label %if.end

if.end: ; preds = %if.else, %for.end
  %sum.1 = phi i32 [ %sum.0.lcssa, %for.end ], [ %mul, %if.else ]
  ret i32 %sum.1
}

declare void @llvm.va_start(i8*)
declare void @llvm.va_end(i8*)
; Check that we handle inline asm correctly.
; The inline asm in the loop clobbers x19, so x19 has to be spilled around it.
; In the ENABLE run the spill is expected in %for.body.preheader and the reload
; after the loop, leaving %if.else free of stack accesses. In the DISABLE run
; the spill is in %entry and both exits reload.
define i32 @inlineAsm(i32 %cond, i32 %N) {
; ENABLE-LABEL: inlineAsm:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbz w0, LBB7_4
; ENABLE-NEXT: ; %bb.1: ; %for.body.preheader
; ENABLE-NEXT: stp x20, x19, [sp, #-16]! ; 16-byte Folded Spill
; ENABLE-NEXT: .cfi_def_cfa_offset 16
; ENABLE-NEXT: .cfi_offset w19, -8
; ENABLE-NEXT: .cfi_offset w20, -16
; ENABLE-NEXT: mov w8, #10
; ENABLE-NEXT: LBB7_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: subs w8, w8, #1 ; =1
; ENABLE-NEXT: ; InlineAsm Start
; ENABLE-NEXT: add x19, x19, #1 ; =1
; ENABLE-NEXT: ; InlineAsm End
; ENABLE-NEXT: b.ne LBB7_2
; ENABLE-NEXT: ; %bb.3:
; ENABLE-NEXT: mov w0, wzr
; ENABLE-NEXT: ldp x20, x19, [sp], #16 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB7_4: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: inlineAsm:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-16]! ; 16-byte Folded Spill
; DISABLE-NEXT: .cfi_def_cfa_offset 16
; DISABLE-NEXT: .cfi_offset w19, -8
; DISABLE-NEXT: .cfi_offset w20, -16
; DISABLE-NEXT: cbz w0, LBB7_4
; DISABLE-NEXT: ; %bb.1: ; %for.body.preheader
; DISABLE-NEXT: mov w8, #10
; DISABLE-NEXT: LBB7_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: subs w8, w8, #1 ; =1
; DISABLE-NEXT: ; InlineAsm Start
; DISABLE-NEXT: add x19, x19, #1 ; =1
; DISABLE-NEXT: ; InlineAsm End
; DISABLE-NEXT: b.ne LBB7_2
; DISABLE-NEXT: ; %bb.3:
; DISABLE-NEXT: mov w0, wzr
; DISABLE-NEXT: ldp x20, x19, [sp], #16 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
; DISABLE-NEXT: LBB7_4: ; %if.else
; DISABLE-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: ldp x20, x19, [sp], #16 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %for.body

for.body: ; preds = %entry, %for.body
  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  ; The "~{x19}" clobber is what forces the x19 save/restore.
  tail call void asm sideeffect "add x19, x19, #1", "~{x19}"()
  %inc = add nuw nsw i32 %i.03, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %if.end, label %for.body

if.else: ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end: ; preds = %for.body, %if.else
  %sum.0 = phi i32 [ %mul, %if.else ], [ 0, %for.body ]
  ret i32 %sum.0
}
; Check that we handle calls to variadic functions correctly.
; In the ENABLE run the frame setup (stack allocation plus x29/x30 spill) is
; expected in %if.then around the call, and %if.else returns without it. In the
; DISABLE run the setup is in %entry and both paths share the epilogue in
; %if.end.
define i32 @callVariadicFunc(i32 %cond, i32 %N) {
; ENABLE-LABEL: callVariadicFunc:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: ; kill: def $w1 killed $w1 def $x1
; ENABLE-NEXT: cbz w0, LBB8_2
; ENABLE-NEXT: ; %bb.1: ; %if.then
; ENABLE-NEXT: sub sp, sp, #64 ; =64
; ENABLE-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #48 ; =48
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: stp x1, x1, [sp, #32]
; ENABLE-NEXT: stp x1, x1, [sp, #16]
; ENABLE-NEXT: stp x1, x1, [sp]
; ENABLE-NEXT: mov w0, w1
; ENABLE-NEXT: bl _someVariadicFunc
; ENABLE-NEXT: lsl w0, w0, #3
; ENABLE-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
; ENABLE-NEXT: add sp, sp, #64 ; =64
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB8_2: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: callVariadicFunc:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: sub sp, sp, #64 ; =64
; DISABLE-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #48 ; =48
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: ; kill: def $w1 killed $w1 def $x1
; DISABLE-NEXT: cbz w0, LBB8_2
; DISABLE-NEXT: ; %bb.1: ; %if.then
; DISABLE-NEXT: stp x1, x1, [sp, #32]
; DISABLE-NEXT: stp x1, x1, [sp, #16]
; DISABLE-NEXT: stp x1, x1, [sp]
; DISABLE-NEXT: mov w0, w1
; DISABLE-NEXT: bl _someVariadicFunc
; DISABLE-NEXT: lsl w0, w0, #3
; DISABLE-NEXT: b LBB8_3
; DISABLE-NEXT: LBB8_2: ; %if.else
; DISABLE-NEXT: lsl w0, w1, #1
; DISABLE-NEXT: LBB8_3: ; %if.end
; DISABLE-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
; DISABLE-NEXT: add sp, sp, #64 ; =64
; DISABLE-NEXT: ret
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %if.then

if.then: ; preds = %entry
  ; One fixed i32 plus six variadic i32 operands, all equal to %N; the checks
  ; above expect the six variadic ones to be stored to the stack.
  %call = tail call i32 (i32, ...) @someVariadicFunc(i32 %N, i32 %N, i32 %N, i32 %N, i32 %N, i32 %N, i32 %N)
  %shl = shl i32 %call, 3
  br label %if.end

if.else: ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end: ; preds = %if.else, %if.then
  %sum.0 = phi i32 [ %shl, %if.then ], [ %mul, %if.else ]
  ret i32 %sum.0
}

declare i32 @someVariadicFunc(i32, ...)
; Make sure we do not insert unreachable code after a noreturn function.
; Although it is not incorrect to insert such code, it is useless
; and it hurts the binary size.
; In the ENABLE run the frame setup is expected only in %if.abort, and nothing
; is expected after the call to abort. In the DISABLE run the setup is in
; %entry, the epilogue is in %if.end, and again nothing follows the call.
;
define i32 @noreturn(i8 signext %bad_thing) {
; ENABLE-LABEL: noreturn:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbnz w0, LBB9_2
; ENABLE-NEXT: ; %bb.1: ; %if.end
; ENABLE-NEXT: mov w0, #42
; ENABLE-NEXT: ret
; ENABLE-NEXT: LBB9_2: ; %if.abort
; ENABLE-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; ENABLE-NEXT: mov x29, sp
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: bl _abort
;
; DISABLE-LABEL: noreturn:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; DISABLE-NEXT: mov x29, sp
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: cbnz w0, LBB9_2
; DISABLE-NEXT: ; %bb.1: ; %if.end
; DISABLE-NEXT: mov w0, #42
; DISABLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
; DISABLE-NEXT: LBB9_2: ; %if.abort
; DISABLE-NEXT: bl _abort
entry:
%tobool = icmp eq i8 %bad_thing, 0
br i1 %tobool, label %if.end, label %if.abort
if.abort:
; @abort carries attribute group #0 (noreturn nounwind), so this block has no
; fall-through and ends in unreachable.
tail call void @abort() #0
unreachable
if.end:
ret i32 42
}
declare void @abort() #0
; Attribute group shared by the @abort declaration and its call site.
attributes #0 = { noreturn nounwind }
; Make sure that we handle infinite loops properly. When checking that the Save
; and Restore blocks are control flow equivalent, the loop searches for the
; immediate (post) dominator for the (restore) save blocks. When either the Save
; or Restore block is located in an infinite loop the only immediate (post)
; dominator is itself. In this case, we cannot perform shrink wrapping, but we
; should return gracefully and continue compilation.
; The only condition for this test is the compilation finishes correctly.
; The expected output is the same for both runs: the prologue stays in %entry.
;
define void @infiniteloop() {
; ENABLE-LABEL: infiniteloop:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: .cfi_offset w19, -24
; ENABLE-NEXT: .cfi_offset w20, -32
; ENABLE-NEXT: cbnz wzr, LBB10_3
; ENABLE-NEXT: ; %bb.1: ; %if.then
; ENABLE-NEXT: sub x19, sp, #16 ; =16
; ENABLE-NEXT: mov sp, x19
; ENABLE-NEXT: mov w20, wzr
; ENABLE-NEXT: LBB10_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: bl _something
; ENABLE-NEXT: add w20, w0, w20
; ENABLE-NEXT: str w20, [x19]
; ENABLE-NEXT: b LBB10_2
; ENABLE-NEXT: LBB10_3: ; %if.end
; ENABLE-NEXT: sub sp, x29, #16 ; =16
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: infiniteloop:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: .cfi_offset w19, -24
; DISABLE-NEXT: .cfi_offset w20, -32
; DISABLE-NEXT: cbnz wzr, LBB10_3
; DISABLE-NEXT: ; %bb.1: ; %if.then
; DISABLE-NEXT: sub x19, sp, #16 ; =16
; DISABLE-NEXT: mov sp, x19
; DISABLE-NEXT: mov w20, wzr
; DISABLE-NEXT: LBB10_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: bl _something
; DISABLE-NEXT: add w20, w0, w20
; DISABLE-NEXT: str w20, [x19]
; DISABLE-NEXT: b LBB10_2
; DISABLE-NEXT: LBB10_3: ; %if.end
; DISABLE-NEXT: sub sp, x29, #16 ; =16
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
entry:
br i1 undef, label %if.then, label %if.end
if.then:
; The alloca sits outside the entry block, on the path leading into the loop.
%ptr = alloca i32, i32 4
br label %for.body
for.body: ; preds = %for.body, %if.then
; This block only branches back to itself, so the loop never exits.
%sum.03 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
%call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)()
%add = add nsw i32 %call, %sum.03
store i32 %add, i32* %ptr
br label %for.body
if.end:
ret void
}
; Another infinite loop test this time with a body bigger than just one block.
define void @infiniteloop2() {
; ENABLE-LABEL: infiniteloop2:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #16 ; =16
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: .cfi_offset w19, -24
; ENABLE-NEXT: .cfi_offset w20, -32
; ENABLE-NEXT: cbnz wzr, LBB11_3
; ENABLE-NEXT: ; %bb.1: ; %if.then
; ENABLE-NEXT: sub x8, sp, #16 ; =16
; ENABLE-NEXT: mov sp, x8
; ENABLE-NEXT: mov w9, wzr
; ENABLE-NEXT: ; InlineAsm Start
; ENABLE-NEXT: mov x10, #0
; ENABLE-NEXT: ; InlineAsm End
; ENABLE-NEXT: LBB11_2: ; %for.body
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: add w9, w10, w9
; ENABLE-NEXT: str w9, [x8]
; ENABLE-NEXT: ; InlineAsm Start
; ENABLE-NEXT: nop
; ENABLE-NEXT: ; InlineAsm End
; ENABLE-NEXT: mov w9, #1
; ENABLE-NEXT: b LBB11_2
; ENABLE-NEXT: LBB11_3: ; %if.end
; ENABLE-NEXT: sub sp, x29, #16 ; =16
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: infiniteloop2:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #16 ; =16
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: .cfi_offset w19, -24
; DISABLE-NEXT: .cfi_offset w20, -32
; DISABLE-NEXT: cbnz wzr, LBB11_3
; DISABLE-NEXT: ; %bb.1: ; %if.then
; DISABLE-NEXT: sub x8, sp, #16 ; =16
; DISABLE-NEXT: mov sp, x8
; DISABLE-NEXT: mov w9, wzr
; DISABLE-NEXT: ; InlineAsm Start
; DISABLE-NEXT: mov x10, #0
; DISABLE-NEXT: ; InlineAsm End
; DISABLE-NEXT: LBB11_2: ; %for.body
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: add w9, w10, w9
; DISABLE-NEXT: str w9, [x8]
; DISABLE-NEXT: ; InlineAsm Start
; DISABLE-NEXT: nop
; DISABLE-NEXT: ; InlineAsm End
; DISABLE-NEXT: mov w9, #1
; DISABLE-NEXT: b LBB11_2
; DISABLE-NEXT: LBB11_3: ; %if.end
; DISABLE-NEXT: sub sp, x29, #16 ; =16
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
;
; Shape of the IR below: %entry picks between %if.then and %if.end on an
; undef condition. %if.then falls into a loop made of %for.body, %body1 and
; %body2. That loop has no exit: both %body1 and %body2 branch straight back
; to %for.body, and %if.end is reachable only from %entry.
; The expected assembly above is the same for both check prefixes, with the
; x19/x20 and x29/x30 spills emitted in the entry block.
entry:
br i1 undef, label %if.then, label %if.end
if.then:
; Dynamic-count alloca (4 x i32) placed outside the entry block.
%ptr = alloca i32, i32 4
br label %for.body
for.body: ; preds = %body2, %body1, %if.then
%sum.03 = phi i32 [ 0, %if.then ], [ %add, %body1 ], [ 1, %body2]
; Inline asm producing a register result; x19 is listed as clobbered.
%call = tail call i32 asm "mov $0, #0", "=r,~{x19}"()
%add = add nsw i32 %call, %sum.03
store i32 %add, i32* %ptr
br i1 undef, label %body1, label %body2
body1:
; Side-effecting asm that also lists x19 as clobbered.
tail call void asm sideeffect "nop", "~{x19}"()
br label %for.body
body2:
; Same asm as %body1; the two blocks differ only in the value they feed to
; the %sum.03 phi (%add from %body1, the constant 1 from %body2).
tail call void asm sideeffect "nop", "~{x19}"()
br label %for.body
if.end:
ret void
}
; Another infinite loop test, this time with two nested infinite loops.
define void @infiniteloop3() {
; ENABLE-LABEL: infiniteloop3:
; ENABLE: ; %bb.0: ; %entry
; ENABLE-NEXT: cbnz wzr, LBB12_5
; ENABLE-NEXT: ; %bb.1: ; %loop2a.preheader
; ENABLE-NEXT: mov x8, xzr
; ENABLE-NEXT: mov x9, xzr
; ENABLE-NEXT: mov x11, xzr
; ENABLE-NEXT: b LBB12_3
; ENABLE-NEXT: LBB12_2: ; %loop2b
; ENABLE-NEXT: ; in Loop: Header=BB12_3 Depth=1
; ENABLE-NEXT: str x10, [x11]
; ENABLE-NEXT: mov x11, x10
; ENABLE-NEXT: LBB12_3: ; %loop1
; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLE-NEXT: mov x10, x9
; ENABLE-NEXT: ldr x9, [x8]
; ENABLE-NEXT: cbnz x8, LBB12_2
; ENABLE-NEXT: ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
; ENABLE-NEXT: mov x8, x10
; ENABLE-NEXT: mov x11, x10
; ENABLE-NEXT: b LBB12_3
; ENABLE-NEXT: LBB12_5: ; %end
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: infiniteloop3:
; DISABLE: ; %bb.0: ; %entry
; DISABLE-NEXT: cbnz wzr, LBB12_5
; DISABLE-NEXT: ; %bb.1: ; %loop2a.preheader
; DISABLE-NEXT: mov x8, xzr
; DISABLE-NEXT: mov x9, xzr
; DISABLE-NEXT: mov x11, xzr
; DISABLE-NEXT: b LBB12_3
; DISABLE-NEXT: LBB12_2: ; %loop2b
; DISABLE-NEXT: ; in Loop: Header=BB12_3 Depth=1
; DISABLE-NEXT: str x10, [x11]
; DISABLE-NEXT: mov x11, x10
; DISABLE-NEXT: LBB12_3: ; %loop1
; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLE-NEXT: mov x10, x9
; DISABLE-NEXT: ldr x9, [x8]
; DISABLE-NEXT: cbnz x8, LBB12_2
; DISABLE-NEXT: ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
; DISABLE-NEXT: mov x8, x10
; DISABLE-NEXT: mov x11, x10
; DISABLE-NEXT: b LBB12_3
; DISABLE-NEXT: LBB12_5: ; %end
; DISABLE-NEXT: ret
;
; Shape of the IR below: %loop1 branches to either %loop2a or %loop2b, and
; both of those branch unconditionally back to %loop1, so no block inside the
; cycle leads to %end. %end is reachable only from %body.
; The function contains no calls and no allocas; the expected assembly above
; has no stack spills or reloads under either check prefix.
entry:
br i1 undef, label %loop2a, label %body
body: ; preds = %entry
br i1 undef, label %loop2a, label %end
; NOTE: %loop1 is listed before %loop2a, so %var and %next.var are referenced
; here ahead of their textual definitions in %loop2a.
loop1: ; preds = %loop2a, %loop2b
%var.phi = phi i32* [ %next.phi, %loop2b ], [ %var, %loop2a ]
%next.phi = phi i32* [ %next.load, %loop2b ], [ %next.var, %loop2a ]
%0 = icmp eq i32* %var, null
; Load through an undef pointer; its result feeds the phis on the next trip.
%next.load = load i32*, i32** undef
br i1 %0, label %loop2a, label %loop2b
loop2a: ; preds = %loop1, %body, %entry
%var = phi i32* [ null, %body ], [ null, %entry ], [ %next.phi, %loop1 ]
%next.var = phi i32* [ undef, %body ], [ null, %entry ], [ %next.load, %loop1 ]
br label %loop1
loop2b: ; preds = %loop1
; Same-type bitcast (i32* to i32*), then a reinterpretation as i32** so the
; pointer value %next.phi can be stored through it.
%gep1 = bitcast i32* %var.phi to i32*
%next.ptr = bitcast i32* %gep1 to i32**
store i32* %next.phi, i32** %next.ptr
br label %loop1
end:
ret void
}
; Re-aligned stack pointer. See bug 26642. Avoid clobbering live
; values in the prologue when re-aligning the stack pointer.
define i32 @stack_realign(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) {
; ENABLE-LABEL: stack_realign:
; ENABLE: ; %bb.0:
; ENABLE-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; ENABLE-NEXT: mov x29, sp
; ENABLE-NEXT: sub x9, sp, #16 ; =16
; ENABLE-NEXT: and sp, x9, #0xffffffffffffffe0
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: lsl w8, w0, w1
; ENABLE-NEXT: cmp w0, w1
; ENABLE-NEXT: lsl w9, w1, w0
; ENABLE-NEXT: b.ge LBB13_2
; ENABLE-NEXT: ; %bb.1: ; %true
; ENABLE-NEXT: str w0, [sp]
; ENABLE-NEXT: LBB13_2: ; %false
; ENABLE-NEXT: str w8, [x2]
; ENABLE-NEXT: str w9, [x3]
; ENABLE-NEXT: mov sp, x29
; ENABLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: stack_realign:
; DISABLE: ; %bb.0:
; DISABLE-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; DISABLE-NEXT: mov x29, sp
; DISABLE-NEXT: sub x9, sp, #16 ; =16
; DISABLE-NEXT: and sp, x9, #0xffffffffffffffe0
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: lsl w8, w0, w1
; DISABLE-NEXT: cmp w0, w1
; DISABLE-NEXT: lsl w9, w1, w0
; DISABLE-NEXT: b.ge LBB13_2
; DISABLE-NEXT: ; %bb.1: ; %true
; DISABLE-NEXT: str w0, [sp]
; DISABLE-NEXT: LBB13_2: ; %false
; DISABLE-NEXT: str w8, [x2]
; DISABLE-NEXT: str w9, [x3]
; DISABLE-NEXT: mov sp, x29
; DISABLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
;
; The expected prologue above is the same under both check prefixes: it sits
; in the entry block and realigns sp with an 'and' through x9, a register
; distinct from the argument registers (w0, w1, x2, x3) that the checked code
; reads afterwards.
; The 32-byte alignment requested here exceeds the 16-byte sp alignment.
%tmp = alloca i32, align 32
; Both shifts are computed in the entry block and only consumed in %false, so
; they are live across the conditional branch.
%shl1 = shl i32 %a, %b
%shl2 = shl i32 %b, %a
%tmp2 = icmp slt i32 %a, %b
br i1 %tmp2, label %true, label %false
true:
; The only block that touches the stack slot.
store i32 %a, i32* %tmp, align 4
%tmp4 = load i32, i32* %tmp
br label %false
false:
; %0 is the implicit name of the unlabeled entry block.
%tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ]
store i32 %shl1, i32* %ptr1
store i32 %shl2, i32* %ptr2
ret i32 %tmp.0
}
; Re-aligned stack pointer with all caller-save regs live. See bug
; 26642. In this case we currently avoid shrink wrapping because
; ensuring we have a scratch register to re-align the stack pointer is
; too complicated. Output should be the same for both enabled and
; disabled shrink wrapping.
define void @stack_realign2(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2, i32* %ptr3, i32* %ptr4, i32* %ptr5, i32* %ptr6) {
; ENABLE-LABEL: stack_realign2:
; ENABLE: ; %bb.0:
; ENABLE-NEXT: stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
; ENABLE-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill
; ENABLE-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill
; ENABLE-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill
; ENABLE-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill
; ENABLE-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
; ENABLE-NEXT: add x29, sp, #80 ; =80
; ENABLE-NEXT: sub x9, sp, #32 ; =32
; ENABLE-NEXT: and sp, x9, #0xffffffffffffffe0
; ENABLE-NEXT: .cfi_def_cfa w29, 16
; ENABLE-NEXT: .cfi_offset w30, -8
; ENABLE-NEXT: .cfi_offset w29, -16
; ENABLE-NEXT: .cfi_offset w19, -24
; ENABLE-NEXT: .cfi_offset w20, -32
; ENABLE-NEXT: .cfi_offset w21, -40
; ENABLE-NEXT: .cfi_offset w22, -48
; ENABLE-NEXT: .cfi_offset w23, -56
; ENABLE-NEXT: .cfi_offset w24, -64
; ENABLE-NEXT: .cfi_offset w25, -72
; ENABLE-NEXT: .cfi_offset w26, -80
; ENABLE-NEXT: .cfi_offset w27, -88
; ENABLE-NEXT: .cfi_offset w28, -96
; ENABLE-NEXT: lsl w8, w0, w1
; ENABLE-NEXT: lsl w9, w1, w0
; ENABLE-NEXT: lsr w10, w0, w1
; ENABLE-NEXT: lsr w12, w1, w0
; ENABLE-NEXT: add w15, w1, w0
; ENABLE-NEXT: subs w17, w1, w0
; ENABLE-NEXT: sub w11, w9, w10
; ENABLE-NEXT: add w16, w8, w9
; ENABLE-NEXT: add w13, w10, w12
; ENABLE-NEXT: add w14, w12, w15
; ENABLE-NEXT: b.le LBB14_2
; ENABLE-NEXT: ; %bb.1: ; %true
; ENABLE-NEXT: str w0, [sp]
; ENABLE-NEXT: ; InlineAsm Start
; ENABLE-NEXT: nop
; ENABLE-NEXT: ; InlineAsm End
; ENABLE-NEXT: LBB14_2: ; %false
; ENABLE-NEXT: str w8, [x2]
; ENABLE-NEXT: str w9, [x3]
; ENABLE-NEXT: str w10, [x4]
; ENABLE-NEXT: str w12, [x5]
; ENABLE-NEXT: str w15, [x6]
; ENABLE-NEXT: str w17, [x7]
; ENABLE-NEXT: stp w0, w1, [x2, #4]
; ENABLE-NEXT: stp w16, w11, [x2, #12]
; ENABLE-NEXT: stp w13, w14, [x2, #20]
; ENABLE-NEXT: sub sp, x29, #80 ; =80
; ENABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
; ENABLE-NEXT: ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
; ENABLE-NEXT: ret
;
; DISABLE-LABEL: stack_realign2:
; DISABLE: ; %bb.0:
; DISABLE-NEXT: stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
; DISABLE-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill
; DISABLE-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill
; DISABLE-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill
; DISABLE-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill
; DISABLE-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
; DISABLE-NEXT: add x29, sp, #80 ; =80
; DISABLE-NEXT: sub x9, sp, #32 ; =32
; DISABLE-NEXT: and sp, x9, #0xffffffffffffffe0
; DISABLE-NEXT: .cfi_def_cfa w29, 16
; DISABLE-NEXT: .cfi_offset w30, -8
; DISABLE-NEXT: .cfi_offset w29, -16
; DISABLE-NEXT: .cfi_offset w19, -24
; DISABLE-NEXT: .cfi_offset w20, -32
; DISABLE-NEXT: .cfi_offset w21, -40
; DISABLE-NEXT: .cfi_offset w22, -48
; DISABLE-NEXT: .cfi_offset w23, -56
; DISABLE-NEXT: .cfi_offset w24, -64
; DISABLE-NEXT: .cfi_offset w25, -72
; DISABLE-NEXT: .cfi_offset w26, -80
; DISABLE-NEXT: .cfi_offset w27, -88
; DISABLE-NEXT: .cfi_offset w28, -96
; DISABLE-NEXT: lsl w8, w0, w1
; DISABLE-NEXT: lsl w9, w1, w0
; DISABLE-NEXT: lsr w10, w0, w1
; DISABLE-NEXT: lsr w12, w1, w0
; DISABLE-NEXT: add w15, w1, w0
; DISABLE-NEXT: subs w17, w1, w0
; DISABLE-NEXT: sub w11, w9, w10
; DISABLE-NEXT: add w16, w8, w9
; DISABLE-NEXT: add w13, w10, w12
; DISABLE-NEXT: add w14, w12, w15
; DISABLE-NEXT: b.le LBB14_2
; DISABLE-NEXT: ; %bb.1: ; %true
; DISABLE-NEXT: str w0, [sp]
; DISABLE-NEXT: ; InlineAsm Start
; DISABLE-NEXT: nop
; DISABLE-NEXT: ; InlineAsm End
; DISABLE-NEXT: LBB14_2: ; %false
; DISABLE-NEXT: str w8, [x2]
; DISABLE-NEXT: str w9, [x3]
; DISABLE-NEXT: str w10, [x4]
; DISABLE-NEXT: str w12, [x5]
; DISABLE-NEXT: str w15, [x6]
; DISABLE-NEXT: str w17, [x7]
; DISABLE-NEXT: stp w0, w1, [x2, #4]
; DISABLE-NEXT: stp w16, w11, [x2, #12]
; DISABLE-NEXT: stp w13, w14, [x2, #20]
; DISABLE-NEXT: sub sp, x29, #80 ; =80
; DISABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
; DISABLE-NEXT: ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
; DISABLE-NEXT: ret
;
; The expected assembly above is the same under both check prefixes: x19-x28
; and x29/x30 are spilled in the entry block and sp is realigned there.
; The 32-byte alignment requested here exceeds the 16-byte sp alignment.
%tmp = alloca i32, align 32
; Ten values are computed in the entry block; every one of them is stored in
; %false, so together with the eight arguments they are all still in use
; across the conditional branch.
%tmp1 = shl i32 %a, %b
%tmp2 = shl i32 %b, %a
%tmp3 = lshr i32 %a, %b
%tmp4 = lshr i32 %b, %a
%tmp5 = add i32 %b, %a
%tmp6 = sub i32 %b, %a
%tmp7 = add i32 %tmp1, %tmp2
%tmp8 = sub i32 %tmp2, %tmp3
%tmp9 = add i32 %tmp3, %tmp4
%tmp10 = add i32 %tmp4, %tmp5
%cmp = icmp slt i32 %a, %b
br i1 %cmp, label %true, label %false
true:
; The only block that touches the stack slot. The asm names x19 through x28
; as clobbers, matching the registers spilled in the expected prologue.
store i32 %a, i32* %tmp, align 4
call void asm sideeffect "nop", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28}"() nounwind
br label %false
false:
; One store through each of the six pointer arguments ...
store i32 %tmp1, i32* %ptr1, align 4
store i32 %tmp2, i32* %ptr2, align 4
store i32 %tmp3, i32* %ptr3, align 4
store i32 %tmp4, i32* %ptr4, align 4
store i32 %tmp5, i32* %ptr5, align 4
store i32 %tmp6, i32* %ptr6, align 4
; ... followed by six more stores to %ptr1[1] through %ptr1[6], consuming
; %a, %b and the remaining computed values.
%idx1 = getelementptr inbounds i32, i32* %ptr1, i64 1
store i32 %a, i32* %idx1, align 4
%idx2 = getelementptr inbounds i32, i32* %ptr1, i64 2
store i32 %b, i32* %idx2, align 4
%idx3 = getelementptr inbounds i32, i32* %ptr1, i64 3
store i32 %tmp7, i32* %idx3, align 4
%idx4 = getelementptr inbounds i32, i32* %ptr1, i64 4
store i32 %tmp8, i32* %idx4, align 4
%idx5 = getelementptr inbounds i32, i32* %ptr1, i64 5
store i32 %tmp9, i32* %idx5, align 4
%idx6 = getelementptr inbounds i32, i32* %ptr1, i64 6
store i32 %tmp10, i32* %idx6, align 4
ret void
}