llvm-project/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll

; RUN: opt -S -hotcoldsplit < %s | FileCheck %s

; Source:
;
; extern void sideeffect(int);
; extern void __attribute__((cold)) sink();
; void foo(int cond) {
;   if (cond) { //< Start outlining here.
;     while (cond > 10) {
;       --cond;
;       sideeffect(0);
;     }
;     sink();
;   }
;   sideeffect(1);
; }

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"

; CHECK-LABEL: define {{.*}}@foo(
; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
; CHECK-LABEL: codeRepl:
; CHECK-NEXT: call void @foo.cold.1
; CHECK-LABEL: if.end:
; CHECK: call void @sideeffect(i32 1)
; IR for the C function `foo` from the "Source" comment at the top of this
; file. The %cond != 0 path (the while loop followed by the call to @sink,
; which is declared cold) is the region the FileCheck directives expect to be
; replaced by a call to @foo.cold.1; the %cond == 0 path via if.end stays put.
define void @foo(i32 %cond) {
entry:
  ; %cond == 0 skips the whole `if (cond)` body and goes straight to if.end.
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.end, label %while.cond.preheader

while.cond.preheader:                             ; preds = %entry
  ; Initial `cond > 10` test, evaluated once before the first iteration.
  %cmp3 = icmp sgt i32 %cond, 10
  br i1 %cmp3, label %while.body.preheader, label %while.end

while.body.preheader:                             ; preds = %while.cond.preheader
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  ; Running value of cond: %cond on loop entry, %dec on the back edge.
  %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
  %dec = add nsw i32 %cond.addr.04, -1
  ; NOTE(review): no attribute group #3 is defined in the visible part of this
  ; file -- confirm the reference is an intentional leftover.
  tail call void @sideeffect(i32 0) #3
  ; Keep looping while the decremented value is still greater than 10.
  %cmp = icmp sgt i32 %dec, 10
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:                               ; preds = %while.body
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %while.cond.preheader
  ; Both ways out of the loop converge here on the call to the cold @sink.
  tail call void (...) @sink()
  ret void

if.end:                                           ; preds = %entry
  ; Path taken when %cond == 0; it never reaches @sink.
  tail call void @sideeffect(i32 1)
  ret void
}

; This is the same as @foo, but the while loop comes after the sink block.
; CHECK-LABEL: define {{.*}}@while_loop_after_sink(
; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
; CHECK-LABEL: codeRepl:
; CHECK-NEXT: call void @while_loop_after_sink.cold.1
; CHECK-LABEL: if.end:
; CHECK: call void @sideeffect(i32 1)
; Variant of @foo in which the call to the cold @sink comes first and the
; while loop follows it. The FileCheck directives expect the %cond != 0 path
; to be replaced by a call to @while_loop_after_sink.cold.1, and expect that
; outlined function to contain the @sink call followed by the loop body.
define void @while_loop_after_sink(i32 %cond) {
entry:
  ; %cond == 0 bypasses both the @sink call and the loop.
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.end, label %sink

sink:                                             ; preds = %entry
  ; Call to the cold function; the loop below is reached only through here.
  tail call void (...) @sink()
  br label %while.cond.preheader

while.cond.preheader:                             ; preds = %sink
  ; Initial `cond > 10` test, evaluated once before the first iteration.
  %cmp3 = icmp sgt i32 %cond, 10
  br i1 %cmp3, label %while.body.preheader, label %while.end

while.body.preheader:                             ; preds = %while.cond.preheader
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  ; Running value of cond: %cond on loop entry, %dec on the back edge.
  %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
  %dec = add nsw i32 %cond.addr.04, -1
  ; NOTE(review): no attribute group #3 is defined in the visible part of this
  ; file -- confirm the reference is an intentional leftover.
  tail call void @sideeffect(i32 0) #3
  ; Keep looping while the decremented value is still greater than 10.
  %cmp = icmp sgt i32 %dec, 10
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:                               ; preds = %while.body
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %while.cond.preheader
  ; Unlike @foo, nothing follows the loop here: the function just returns.
  ret void

if.end:                                           ; preds = %entry
  ; Path taken when %cond == 0; it never reaches @sink.
  tail call void @sideeffect(i32 1)
  ret void
}

; CHECK-LABEL: define {{.*}}@foo.cold.1
; CHECK: phi i32
; CHECK-NEXT: add nsw i32
; CHECK-NEXT: call {{.*}}@sideeffect
; CHECK-NEXT: icmp
; CHECK-NEXT: br

; CHECK-LABEL: define {{.*}}@while_loop_after_sink.cold.1
; CHECK: call {{.*}}@sink
; CHECK: phi i32
; CHECK-NEXT: add nsw i32
; CHECK-NEXT: call {{.*}}@sideeffect
; CHECK-NEXT: icmp
; CHECK-NEXT: br

declare void @sideeffect(i32)

declare void @sink(...) cold
[HotColdSplitting] Identify larger cold regions using domtree queries The current splitting algorithm works in three stages: 1) Identify cold blocks, then 2) Use forward/backward propagation to mark hot blocks, then 3) Grow a SESE region of blocks outside of the set of hot blocks and start outlining. While testing this pass on Apple internal frameworks I noticed that some kinds of control flow (e.g. loops) are never outlined, even though they unconditionally lead to / follow cold blocks. I noticed two other issues related to how cold regions are identified: - An inconsistency can arise in the internal state of the hotness propagation stage, as a block may end up in both the ColdBlocks set and the HotBlocks set. Further inconsistencies can arise as these sets do not match what's in ProfileSummaryInfo. - It isn't necessary to limit outlining to single-exit regions. This patch teaches the splitting algorithm to identify maximal cold regions and outline them. A maximal cold region is defined as the set of blocks post-dominated by a cold sink block, or dominated by that sink block. This approach can successfully outline loops in the cold path. As a side benefit, it maintains less internal state than the current approach. Due to a limitation in CodeExtractor, blocks within the maximal cold region which aren't dominated by a single entry point (a so-called "max ancestor") are filtered out. Results: - X86 (LNT + -Os + externals): 134KB of TEXT were outlined compared to 47KB pre-patch, or a ~3x improvement. Did not see a performance impact across two runs. - AArch64 (LNT + -Os + externals + Apple-internal benchmarks): 149KB of TEXT were outlined. Ditto re: performance impact. - Outlining results improve marginally in the internal frameworks I tested. Follow-ups: - Outline more than once per function, outline large single basic blocks, & try to remove unconditional branches in outlined functions. 
Differential Revision: https://reviews.llvm.org/D53627 llvm-svn: 345209 2018-10-25 06:15:41 +08:00			`; RUN: opt -S -hotcoldsplit < %s \| FileCheck %s`

			`; Source:`
			`;`
			`; extern void sideeffect(int);`
			`; extern void __attribute__((cold)) sink();`
			`; void foo(int cond) {`
			`; if (cond) { //< Start outlining here.`
			`; while (cond > 10) {`
			`; --cond;`
			`; sideeffect(0);`
			`; }`
			`; sink();`
			`; }`
			`; sideeffect(1);`
			`; }`

			`target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"`
			`target triple = "x86_64-apple-macosx10.14.0"`

			`; CHECK-LABEL: define {{.*}}@foo(`
			`; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl`
			`; CHECK-LABEL: codeRepl:`
			`; CHECK-NEXT: call void @foo.cold.1`
			`; CHECK-LABEL: if.end:`
			`; CHECK: call void @sideeffect(i32 1)`
			`define void @foo(i32 %cond) {`
			`entry:`
			`%tobool = icmp eq i32 %cond, 0`
			`br i1 %tobool, label %if.end, label %while.cond.preheader`

			`while.cond.preheader: ; preds = %entry`
			`%cmp3 = icmp sgt i32 %cond, 10`
			`br i1 %cmp3, label %while.body.preheader, label %while.end`

			`while.body.preheader: ; preds = %while.cond.preheader`
			`br label %while.body`

			`while.body: ; preds = %while.body.preheader, %while.body`
			`%cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]`
			`%dec = add nsw i32 %cond.addr.04, -1`
			`tail call void @sideeffect(i32 0) #3`
			`%cmp = icmp sgt i32 %dec, 10`
			`br i1 %cmp, label %while.body, label %while.end.loopexit`

			`while.end.loopexit: ; preds = %while.body`
			`br label %while.end`

			`while.end: ; preds = %while.end.loopexit, %while.cond.preheader`
			`tail call void (...) @sink()`
			`ret void`

			`if.end: ; preds = %entry`
			`tail call void @sideeffect(i32 1)`
			`ret void`
			`}`

[HotColdSplitting] Outline more than once per function Algorithm: Identify maximal cold regions and put them in a worklist. If a candidate region overlaps with another, discard it. While the worklist is full, remove a single-entry sub-region from the worklist and attempt to outline it. By the non-overlap property, this should not invalidate parts of the domtree pertaining to other outlining regions. Testing: LNT results on X86 are clean. With test-suite + externals, llvm outlines 134KB pre-patch, and 352KB post-patch (+ ~2.6x). The file 483.xalancbmk/src/Constants.cpp stands out as an extreme case where llvm outlines over 100 times in some functions (mostly EH paths). There was not a significant performance impact pre vs. post-patch. Differential Revision: https://reviews.llvm.org/D53887 llvm-svn: 348639 2018-12-08 04:23:52 +08:00			`; This is the same as @foo, but the while loop comes after the sink block.`
			`; CHECK-LABEL: define {{.*}}@while_loop_after_sink(`
			`; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl`
			`; CHECK-LABEL: codeRepl:`
			`; CHECK-NEXT: call void @while_loop_after_sink.cold.1`
			`; CHECK-LABEL: if.end:`
			`; CHECK: call void @sideeffect(i32 1)`
			`define void @while_loop_after_sink(i32 %cond) {`
			`entry:`
			`%tobool = icmp eq i32 %cond, 0`
			`br i1 %tobool, label %if.end, label %sink`

			`sink:`
			`tail call void (...) @sink()`
			`br label %while.cond.preheader`

			`while.cond.preheader:`
			`%cmp3 = icmp sgt i32 %cond, 10`
			`br i1 %cmp3, label %while.body.preheader, label %while.end`

			`while.body.preheader: ; preds = %while.cond.preheader`
			`br label %while.body`

			`while.body: ; preds = %while.body.preheader, %while.body`
			`%cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]`
			`%dec = add nsw i32 %cond.addr.04, -1`
			`tail call void @sideeffect(i32 0) #3`
			`%cmp = icmp sgt i32 %dec, 10`
			`br i1 %cmp, label %while.body, label %while.end.loopexit`

			`while.end.loopexit: ; preds = %while.body`
			`br label %while.end`

			`while.end: ; preds = %while.end.loopexit, %while.cond.preheader`
			`ret void`

			`if.end: ; preds = %entry`
			`tail call void @sideeffect(i32 1)`
			`ret void`
			`}`

[HotColdSplitting] Identify larger cold regions using domtree queries The current splitting algorithm works in three stages: 1) Identify cold blocks, then 2) Use forward/backward propagation to mark hot blocks, then 3) Grow a SESE region of blocks outside of the set of hot blocks and start outlining. While testing this pass on Apple internal frameworks I noticed that some kinds of control flow (e.g. loops) are never outlined, even though they unconditionally lead to / follow cold blocks. I noticed two other issues related to how cold regions are identified: - An inconsistency can arise in the internal state of the hotness propagation stage, as a block may end up in both the ColdBlocks set and the HotBlocks set. Further inconsistencies can arise as these sets do not match what's in ProfileSummaryInfo. - It isn't necessary to limit outlining to single-exit regions. This patch teaches the splitting algorithm to identify maximal cold regions and outline them. A maximal cold region is defined as the set of blocks post-dominated by a cold sink block, or dominated by that sink block. This approach can successfully outline loops in the cold path. As a side benefit, it maintains less internal state than the current approach. Due to a limitation in CodeExtractor, blocks within the maximal cold region which aren't dominated by a single entry point (a so-called "max ancestor") are filtered out. Results: - X86 (LNT + -Os + externals): 134KB of TEXT were outlined compared to 47KB pre-patch, or a ~3x improvement. Did not see a performance impact across two runs. - AArch64 (LNT + -Os + externals + Apple-internal benchmarks): 149KB of TEXT were outlined. Ditto re: performance impact. - Outlining results improve marginally in the internal frameworks I tested. Follow-ups: - Outline more than once per function, outline large single basic blocks, & try to remove unconditional branches in outlined functions. 
Differential Revision: https://reviews.llvm.org/D53627 llvm-svn: 345209 2018-10-25 06:15:41 +08:00			`; CHECK-LABEL: define {{.*}}@foo.cold.1`
			`; CHECK: phi i32`
			`; CHECK-NEXT: add nsw i32`
			`; CHECK-NEXT: call {{.*}}@sideeffect`
			`; CHECK-NEXT: icmp`
			`; CHECK-NEXT: br`

[HotColdSplitting] Outline more than once per function Algorithm: Identify maximal cold regions and put them in a worklist. If a candidate region overlaps with another, discard it. While the worklist is full, remove a single-entry sub-region from the worklist and attempt to outline it. By the non-overlap property, this should not invalidate parts of the domtree pertaining to other outlining regions. Testing: LNT results on X86 are clean. With test-suite + externals, llvm outlines 134KB pre-patch, and 352KB post-patch (+ ~2.6x). The file 483.xalancbmk/src/Constants.cpp stands out as an extreme case where llvm outlines over 100 times in some functions (mostly EH paths). There was not a significant performance impact pre vs. post-patch. Differential Revision: https://reviews.llvm.org/D53887 llvm-svn: 348639 2018-12-08 04:23:52 +08:00			`; CHECK-LABEL: define {{.*}}@while_loop_after_sink.cold.1`
			`; CHECK: call {{.*}}@sink`
			`; CHECK: phi i32`
			`; CHECK-NEXT: add nsw i32`
			`; CHECK-NEXT: call {{.*}}@sideeffect`
			`; CHECK-NEXT: icmp`
			`; CHECK-NEXT: br`

[HotColdSplitting] Identify larger cold regions using domtree queries The current splitting algorithm works in three stages: 1) Identify cold blocks, then 2) Use forward/backward propagation to mark hot blocks, then 3) Grow a SESE region of blocks outside of the set of hot blocks and start outlining. While testing this pass on Apple internal frameworks I noticed that some kinds of control flow (e.g. loops) are never outlined, even though they unconditionally lead to / follow cold blocks. I noticed two other issues related to how cold regions are identified: - An inconsistency can arise in the internal state of the hotness propagation stage, as a block may end up in both the ColdBlocks set and the HotBlocks set. Further inconsistencies can arise as these sets do not match what's in ProfileSummaryInfo. - It isn't necessary to limit outlining to single-exit regions. This patch teaches the splitting algorithm to identify maximal cold regions and outline them. A maximal cold region is defined as the set of blocks post-dominated by a cold sink block, or dominated by that sink block. This approach can successfully outline loops in the cold path. As a side benefit, it maintains less internal state than the current approach. Due to a limitation in CodeExtractor, blocks within the maximal cold region which aren't dominated by a single entry point (a so-called "max ancestor") are filtered out. Results: - X86 (LNT + -Os + externals): 134KB of TEXT were outlined compared to 47KB pre-patch, or a ~3x improvement. Did not see a performance impact across two runs. - AArch64 (LNT + -Os + externals + Apple-internal benchmarks): 149KB of TEXT were outlined. Ditto re: performance impact. - Outlining results improve marginally in the internal frameworks I tested. Follow-ups: - Outline more than once per function, outline large single basic blocks, & try to remove unconditional branches in outlined functions. 
Differential Revision: https://reviews.llvm.org/D53627 llvm-svn: 345209 2018-10-25 06:15:41 +08:00			`declare void @sideeffect(i32)`

			`declare void @sink(...) cold`