; Source: llvm-project/llvm/test/Transforms/PGOProfile/chr.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -chr -instcombine -simplifycfg -S | FileCheck %s
; RUN: opt < %s -passes='require<profile-summary>,function(chr,instcombine,simplify-cfg)' -S | FileCheck %s
declare void @foo()
declare void @bar()
; Simple case.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; if ((t0 & 2) != 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; if ((t0 & 2) != 0)
; foo()
; }
; Input: two back-to-back single-bit tests on the same loaded value, each
; guarding a call to @foo and each annotated with !prof !15. The assertions
; below expect one combined `(%0 & 3) == 3` guard in front of a path that calls
; @foo twice, with the original per-bit tests preserved in the *.nonchr blocks.
define void @test_chr_1(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
entry:
%0 = load i32, i32* %i
; First test: bit 0 of the loaded value. %2 is true when the bit is clear,
; in which case the first call is skipped.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
; NOTE(review): !15 is defined outside this view; judging by the header
; comment it biases this branch toward %bb0 -- confirm against the metadata.
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; Second test: bit 1 of the same loaded value, with the same profile
; annotation as the first branch.
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Simple case with a cold block.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; if ((t0 & 2) == 0) // Likely false
; bar()
; if ((t0 & 4) != 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 7) == 7) { // Likely true
; foo()
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; if ((t0 & 2) == 0)
; bar()
; if ((t0 & 4) != 0)
; foo()
; }
; Input: three single-bit tests on the same loaded value. The middle one has
; its successors swapped relative to the other two, so a clear bit 1 leads to
; the @bar call. The assertions below expect a single `(%0 & 7) == 7` guard
; whose taken path calls @foo twice and never reaches @bar; the three original
; tests and the @bar call survive only in the *.nonchr blocks.
define void @test_chr_1_1(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_1_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 7
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 7
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB5:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB2_NONCHR:%.*]], label [[BB3_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[BB3_NONCHR]]
; CHECK: bb3.nonchr:
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[BB5]], label [[BB4_NONCHR:%.*]], !prof !16
; CHECK: bb4.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB5]]
; CHECK: bb5:
; CHECK-NEXT: ret void
;
entry:
%0 = load i32, i32* %i
; Bit 0 test: a clear bit skips the first @foo call.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
; NOTE(review): !15 is defined outside this view; the header comment implies
; it favours the second successor of each branch here -- confirm.
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; Bit 1 test: note the successor order. A clear bit goes to %bb2 (the @bar
; call), a set bit skips it.
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb2, label %bb3, !prof !15
bb2:
; Per the header comment this is the cold block of the test.
call void @bar()
br label %bb3
bb3:
; Bit 2 test: a clear bit skips the last @foo call.
%5 = and i32 %0, 4
%6 = icmp eq i32 %5, 0
br i1 %6, label %bb5, label %bb4, !prof !15
bb4:
call void @foo()
br label %bb5
bb5:
ret void
}
; With an aggregate bit check.
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) // Likely true
; if ((t0 & 1) != 0) // Likely true
; foo()
; if ((t0 & 2) != 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; foo()
; } else if ((t0 & 255) != 0) {
; if ((t0 & 1) != 0)
; foo()
; if ((t0 & 2) != 0)
; foo()
; }
; Input: an outer `(%0 & 255) == 0` early-exit test wrapping two single-bit
; tests (bits 0 and 1) that each guard a call to @foo. The assertions below
; expect the combined guard to test only `(%0 & 3) == 3`; the 255 mask appears
; solely on the *.nonchr path, followed by the two original per-bit tests.
define void @test_chr_2(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB4]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB2_NONCHR:%.*]], label [[BB1_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[BB4]], label [[BB3_NONCHR:%.*]], !prof !16
; CHECK: bb3.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB2_NONCHR]]
; CHECK: bb4:
; CHECK-NEXT: ret void
;
entry:
%0 = load i32, i32* %i
; Aggregate test: if none of the low 8 bits is set, jump straight to the exit
; block and skip both inner tests.
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
; NOTE(review): !15 is defined outside this view; the header comment implies
; it favours the second successor (%bb0) -- confirm.
br i1 %2, label %bb4, label %bb0, !prof !15
bb0:
; Inner test on bit 0, guarding the first @foo call.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb2, label %bb1, !prof !15
bb1:
call void @foo()
br label %bb2
bb2:
; Inner test on bit 1, guarding the second @foo call.
%5 = and i32 %0, 2
%6 = icmp eq i32 %5, 0
br i1 %6, label %bb4, label %bb3, !prof !15
bb3:
call void @foo()
br label %bb4
bb4:
ret void
}
; Split case.
; Roughly,
; t1 = *i
; if ((t1 & 1) != 0) // Likely true
; foo()
; if ((t1 & 2) != 0) // Likely true
; foo()
; t2 = *i
; if ((t2 & 4) != 0) // Likely true
; foo()
; if ((t2 & 8) != 0) // Likely true
; foo()
; ->
; t1 = *i
; if ((t1 & 3) == 3) { // Likely true
; foo()
; foo()
; } else {
; if ((t1 & 1) != 0)
; foo()
; if ((t1 & 2) != 0)
; foo()
; }
; t2 = *i
; if ((t2 & 12) == 12) { // Likely true
; foo()
; foo()
; } else {
; if ((t2 & 4) != 0)
; foo()
; if ((t2 & 8) != 0)
; foo()
; }
; Input: two pairs of single-bit tests. The first pair (bits 0 and 1) tests
; the value loaded in %entry; the second pair (bits 2 and 3) tests a second
; load of the same pointer performed in %bb3, after the earlier @foo calls.
; The assertions below expect two independent guarded regions, one per load:
; `(%0 & 3) == 3` in entry and `(reload & 12) == 12` in bb3, each with its own
; *.nonchr fallback. The second load stays in bb3 in the expected output.
define void @test_chr_3(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[I]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 12
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 12
; CHECK-NEXT: br i1 [[TMP9]], label [[BB4:%.*]], label [[BB3_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb4:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7:%.*]]
; CHECK: bb3.split.nonchr:
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP7]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: br i1 [[TMP11]], label [[BB5_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof !16
; CHECK: bb4.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB5_NONCHR]]
; CHECK: bb5.nonchr:
; CHECK-NEXT: [[TMP12:%.*]] = and i32 [[TMP7]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
; CHECK-NEXT: br i1 [[TMP13]], label [[BB7]], label [[BB6_NONCHR:%.*]], !prof !16
; CHECK: bb6.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb7:
; CHECK-NEXT: ret void
;
entry:
; First region: bits 0 and 1 of the first load.
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
; NOTE(review): !15 is defined outside this view; the header comment implies
; it favours the second successor of each branch in this function -- confirm.
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
; Second region: the pointer is loaded again here, and bits 2 and 3 of this
; new value (%5, not %0) drive the remaining two tests.
; NOTE(review): presumably the intervening @foo calls are what keep the two
; loads from being treated as one value -- confirm against the pass.
%5 = load i32, i32* %i
%6 = and i32 %5, 4
%7 = icmp eq i32 %6, 0
br i1 %7, label %bb5, label %bb4, !prof !15
bb4:
call void @foo()
br label %bb5
bb5:
%8 = and i32 %5, 8
%9 = icmp eq i32 %8, 0
br i1 %9, label %bb7, label %bb6, !prof !15
bb6:
call void @foo()
br label %bb7
bb7:
ret void
}
; Selects.
; Roughly,
; t0 = *i
; sum1 = (t0 & 1) ? sum0 : (sum0 + 42) // Likely false
; sum2 = (t0 & 2) ? sum1 : (sum1 + 43) // Likely false
; return sum2
; ->
; t0 = *i
; if ((t0 & 3) == 3)
; return sum0 + 85
; else {
; sum1 = (t0 & 1) ? sum0 : (sum0 + 42)
; sum2 = (t0 & 2) ? sum1 : (sum1 + 43)
; return sum2
; }
; Input: straight-line code with no branches. Two selects, each keyed on one
; bit of the loaded value and each annotated with !prof !15, conditionally add
; 42 and then 43 to %sum0. The assertions below expect a new
; `(%0 & 3) == 3` branch whose taken side returns `%sum0 + 85` (42 + 43)
; directly, while the fallthrough side keeps both selects (suffixed .nonchr).
define i32 @test_chr_4(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: entry.split:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
; CHECK-NEXT: ret i32 [[TMP3]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 42
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP6]], i32 [[SUM0]], i32 [[TMP4]], !prof !16
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM1_NONCHR]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM1_NONCHR]], i32 [[TMP9]], !prof !16
; CHECK-NEXT: ret i32 [[SUM2_NONCHR]]
;
entry:
%0 = load i32, i32* %i
; First select: keeps %sum0 when bit 0 is clear, otherwise takes %sum0 + 42.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
%3 = add i32 %sum0, 42
; NOTE(review): !15 is defined outside this view; the expected hot-path result
; of +85 implies it favours the second (added) operand -- confirm.
%sum1 = select i1 %2, i32 %sum0, i32 %3, !prof !15
; Second select: same shape on bit 1, adding 43 on top of %sum1.
%4 = and i32 %0, 2
%5 = icmp eq i32 %4, 0
%6 = add i32 %sum1, 43
%sum2 = select i1 %5, i32 %sum1, i32 %6, !prof !15
ret i32 %sum2
}
; Selects + Brs
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) { // Likely true
; sum = (t0 & 1) ? sum0 : (sum0 + 42) // Likely false
; sum = (t0 & 2) ? sum : (sum + 43) // Likely false
; if ((t0 & 4) != 0) { // Likely true
; sum3 = sum + 44
; sum = (t0 & 8) ? sum3 : (sum3 + 44) // Likely false
; }
; }
; return sum
; ->
; t0 = *i
; if ((t0 & 15) == 15) { // Likely true
; sum = sum0 + 173
; } else if ((t0 & 255) != 0) {
; sum = (t0 & 1) ? sum0 : (sum0 + 42)
; sum = (t0 & 2) ? sum : (sum + 43)
; if ((t0 & 4) != 0) {
; sum3 = sum + 44
; sum = (t0 & 8) ? sum3 : (sum3 + 44)
; }
; }
; return sum
; Input: a mix of branches and selects, all keyed on bits of one loaded value.
; An outer `(%0 & 255) == 0` early exit wraps two selects (bits 0 and 1, adding
; 42 and 43), a branch on bit 2, and inside it an unconditional +44 followed by
; a select on bit 3 that adds another 44. The assertions below expect a single
; `(%0 & 15) == 15` guard whose taken path yields `%sum0 + 173`
; (42 + 43 + 44 + 44); the fallback path keeps the 255 test and the original
; selects. The expected hot block also still contains an `add ..., 85` whose
; result is not used by the phi.
define i32 @test_chr_5(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 15
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 15
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 173
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM0]], 42
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM0]], i32 [[TMP9]], !prof !16
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SUM1_NONCHR]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP11]], i32 [[SUM1_NONCHR]], i32 [[TMP12]], !prof !16
; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP14]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
entry:
%0 = load i32, i32* %i
; Aggregate test: if none of the low 8 bits is set, return %sum0 unchanged
; through the phi in %bb3.
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
; NOTE(review): !15 is defined outside this view; the header comment implies
; it favours the second successor / second select operand -- confirm.
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
; Select on bit 0: keep %sum0 when clear, otherwise %sum0 + 42.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
; Select on bit 1: keep %sum1 when clear, otherwise %sum1 + 43.
%6 = and i32 %0, 2
%7 = icmp eq i32 %6, 0
%8 = add i32 %sum1, 43
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
; Branch on bit 2: a clear bit skips %bb1 entirely.
%9 = and i32 %0, 4
%10 = icmp eq i32 %9, 0
br i1 %10, label %bb2, label %bb1, !prof !15
bb1:
; Unconditional +44, then a select on bit 3 that adds a further 44 when set.
%sum3 = add i32 %sum2, 44
%11 = and i32 %0, 8
%12 = icmp eq i32 %11, 0
%13 = add i32 %sum3, 44
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
br label %bb2
bb2:
; Merge of the bit-2 branch: %sum2 if %bb1 was skipped, else %sum4.
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
; Final merge: %sum0 from the early exit, otherwise the accumulated sum.
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects + Brs with a scope split in the middle
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) { // Likely true
; sum = (t0 & 1) ? sum0 : (sum0 + 42) // Likely false
; sum = (t0 & 2) ? sum : (sum + 43) // Likely false
; if ((sum0 & 4) != 0) { // Likely true. The condition doesn't use t0.
; sum3 = sum + 44
; sum = (t0 & 8) ? sum3 : (sum3 + 44) // Likely false
; }
; }
; return sum
; ->
; t0 = *i
; if ((sum0 & 4) != 0 & (t0 & 11) == 11) { // Likely true
; sum = sum0 + 173
; } else if ((t0 & 255) != 0) {
; sum = (t0 & 1) ? sum0 : (sum0 + 42)
; sum = (t0 & 2) ? sum : (sum + 43)
; if ((sum0 & 4) != 0) {
; sum3 = sum + 44
; sum = (t0 & 8) ? sum3 : (sum3 + 44)
; }
; }
; return sum
define i32 @test_chr_5_1(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_5_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[SUM0:%.*]], 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 11
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 11
; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP5]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SUM0]], 85
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SUM0]], 173
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
; CHECK-NEXT: br i1 [[TMP9]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SUM0]], 42
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP11]], i32 [[SUM0]], i32 [[TMP12]], !prof !16
; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SUM1_NONCHR]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP14]], i32 [[SUM1_NONCHR]], i32 [[TMP15]], !prof !16
; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[SUM0]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0
; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP19]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP17]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP7]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
entry:
; Outer guard: control goes straight to bb3 (returning %sum0) when the low
; byte of the loaded value is zero.
; NOTE(review): !prof !15 is defined outside this function; it presumably
; marks the true side as unlikely -- confirm against the metadata definitions.
%0 = load i32, i32* %i
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
; First region: two profiled selects keyed on bits 0 and 1 of %0. Each keeps
; the running sum when its bit is clear and adds a constant otherwise.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = and i32 %0, 2
%7 = icmp eq i32 %6, 0
%8 = add i32 %sum1, 43
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
%9 = and i32 %sum0, 4 ; Split: this condition tests %sum0, not the loaded %0
%10 = icmp eq i32 %9, 0
br i1 %10, label %bb2, label %bb1, !prof !15
bb1:
; Second region: unconditional +44, then a profiled select keyed on bit 3 of %0.
%sum3 = add i32 %sum2, 44
%11 = and i32 %0, 8
%12 = icmp eq i32 %11, 0
%13 = add i32 %sum3, 44
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
br label %bb2
bb2:
; Merge the sum from bb0 (bb1 skipped) with the sum computed in bb1.
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
; Merge the early-exit value %sum0 from entry with the sum coming from bb2.
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects + Brs, non-matching bases
; Roughly,
; i0 = *i
; j0 = *j
; if ((i0 & 255) != 0) { // Likely true
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; if ((j0 & 4) != 0) { // Likely true. The condition uses j0, not i0.
; sum3 = sum + 44
; sum = ((i0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
; }
; }
; return sum
; ->
; i0 = *i
; j0 = *j
; if ((j0 & 4) != 0 & (i0 & 10) == 10) { // Likely true
; sum = sum0 + 131
; } else if ((i0 & 255) != 0) {
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43)
; if ((j0 & 4) != 0) {
; sum3 = sum + 44
; sum = ((i0 & 8) == 0) ? sum3 : (sum3 + 44)
; }
; }
; return sum
define i32 @test_chr_6(i32* %i, i32* %j, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp ne i32 [[V9]], 0
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I0]], 10
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 10
; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[V10]]
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[V13:%.*]] = add i32 [[SUM0]], 131
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[V1:%.*]] = and i32 [[I0]], 255
; CHECK-NEXT: [[V2:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT: br i1 [[V2]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[V3_NONCHR:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4_NONCHR:%.*]] = icmp eq i32 [[V3_NONCHR]], 0
; CHECK-NEXT: [[V8_NONCHR:%.*]] = add i32 [[SUM0]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4_NONCHR]], i32 [[SUM0]], i32 [[V8_NONCHR]], !prof !16
; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0
; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[I0]], 8
; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[V12_NONCHR]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[V10_NONCHR]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[V13]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
entry:
%i0 = load i32, i32* %i
%j0 = load i32, i32* %j
%v1 = and i32 %i0, 255
%v2 = icmp eq i32 %v1, 0
br i1 %v2, label %bb3, label %bb0, !prof !15
bb0:
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
%v9 = and i32 %j0, 4
%v10 = icmp eq i32 %v9, 0
br i1 %v10, label %bb2, label %bb1, !prof !15
bb1:
%sum3 = add i32 %sum2, 44
%v11 = and i32 %i0, 8
%v12 = icmp eq i32 %v11, 0
%v13 = add i32 %sum3, 44
%sum4 = select i1 %v12, i32 %sum3, i32 %v13, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects + Brs, the branch condition can't be hoisted to be merged with a
; select. No CHR happens.
; Roughly,
; i0 = *i
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; foo();
; j0 = *j
; if ((j0 & 4) != 0) { // Likely true
; foo();
; sum = sum + 44
; }
; return sum
; ->
; (no change)
define i32 @test_chr_7(i32* %i, i32* %j, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_7(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB2:%.*]], label [[BB1:%.*]], !prof !16
; CHECK: bb1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[SUM4:%.*]] = add i32 [[SUM2]], 44
; CHECK-NEXT: br label [[BB2]]
; CHECK: bb2:
; CHECK-NEXT: [[SUM5:%.*]] = phi i32 [ [[SUM2]], [[ENTRY:%.*]] ], [ [[SUM4]], [[BB1]] ]
; CHECK-NEXT: ret i32 [[SUM5]]
;
; Input IR: one profiled select on %i0, a call to @foo, then a profiled branch
; on a value (%j0) that is loaded only after that call.
; The expected output above keeps the select and the branch separate: there is
; no combined condition and there are no .nonchr clone blocks.
entry:
%i0 = load i32, i32* %i
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
; This call separates the select above from the load that feeds the branch.
call void @foo()
%j0 = load i32, i32* %j
%v9 = and i32 %j0, 4
%v10 = icmp eq i32 %v9, 0
br i1 %v10, label %bb2, label %bb1, !prof !15 ; %v10 can't be hoisted above the above select
bb1:
call void @foo()
%sum4 = add i32 %sum2, 44
br label %bb2
bb2:
; Merge point: %sum2 when bb1 was skipped, %sum4 when it ran.
%sum5 = phi i32 [ %sum2, %entry ], [ %sum4, %bb1 ]
ret i32 %sum5
}
; Selects + Brs, the branch condition can't be hoisted to be merged with the
; selects. Dropping the select.
; Roughly,
; i0 = *i
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; foo();
; j0 = *j
; if ((j0 & 4) != 0) // Likely true
; foo()
; if ((j0 & 8) != 0) // Likely true
; foo()
; return sum
; ->
; i0 = *i
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; foo();
; j0 = *j
; if ((j0 & 12) == 12) { // Likely true
; foo()
; foo()
; } else {
; if ((j0 & 4) != 0)
; foo()
; if ((j0 & 8) != 0)
; foo()
; }
; return sum
define i32 @test_chr_7_1(i32* %i, i32* %j, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_7_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[J0]], 12
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 12
; CHECK-NEXT: br i1 [[TMP1]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[J0]], 8
; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0
; CHECK-NEXT: br i1 [[V12_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: ret i32 [[SUM2]]
;
; Input IR: one profiled select on %i0, a call to @foo, then two profiled
; branches that both test bits of %j0.
; In the expected output above the two branch tests are combined into
; (%j0 & 12) == 12, the other path keeps the individual tests in .nonchr
; clones, and the select is not part of the combined condition (it shows up
; in bb3, next to its only use).
entry:
%i0 = load i32, i32* %i
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
; This call separates the select above from the load that feeds the branches.
call void @foo()
%j0 = load i32, i32* %j
%v9 = and i32 %j0, 4
%v10 = icmp eq i32 %v9, 0
br i1 %v10, label %bb1, label %bb0, !prof !15 ; %v10 can't be hoisted above the above select
bb0:
call void @foo()
br label %bb1
bb1:
; Second test on the same loaded value %j0 (mask 8).
%v11 = and i32 %j0, 8
%v12 = icmp eq i32 %v11, 0
br i1 %v12, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
; %sum2 is defined in entry and used only here.
ret i32 %sum2
}
; Branches aren't biased enough. No CHR happens.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Not biased
; foo()
; if ((t0 & 2) != 0) // Not biased
; foo()
; ->
; (no change)
define void @test_chr_8(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[BB0:%.*]], !prof !17
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2:%.*]], !prof !17
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; Input IR: two branches testing masks 1 and 2 of the same loaded value %0.
; Both branches carry !prof !16 (the metadata node is defined outside this
; function). The expected output above keeps the two branches on their own:
; there is no combined test and there are no .nonchr clone blocks.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !16
bb0:
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !16
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; With an existing phi at the exit.
; Roughly,
; t = *i
; if ((t & 1) != 0) // Likely true
; foo()
; if ((t & 2) != 0) { // Likely true
; t = *j
; foo()
; }
; // There's a phi for t here.
; return t
; ->
; t = *i
; if ((t & 3) == 3) { // Likely true
; foo()
; t = *j
; foo()
; } else {
; if ((t & 1) != 0)
; foo()
; if ((t & 2) != 0) {
; t = *j
; foo()
; }
; }
; // There's a phi for t here.
; return t
define i32 @test_chr_9(i32* %i, i32* %j) !prof !14 {
; CHECK-LABEL: @test_chr_9(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[J]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP3]], [[BB0]] ], [ [[TMP0]], [[BB1_NONCHR]] ], [ [[TMP8]], [[BB2_NONCHR]] ]
; CHECK-NEXT: ret i32 [[TMP9]]
;
; Input IR: two profiled branches testing masks 1 and 2 of %0, with a phi
; already present in the exit block bb3.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
; This load happens only when bb2 runs; the phi in bb3 merges it with %0.
%5 = load i32, i32* %j
call void @foo()
br label %bb3
bb3:
; Pre-existing phi at the exit. In the expected output above it has three
; incoming values: the load from the merged block bb0 plus the two values
; coming from the .nonchr clones.
%6 = phi i32 [ %0, %bb1 ], [ %5, %bb2 ]
ret i32 %6
}
; With no phi at the exit, but the exit needs a phi inserted after CHR.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; t1 = *j
; if ((t0 & 2) != 0) // Likely true
; foo()
; return (t1 * 42) + (t1 - 99)
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; t1 = *j
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; t1 = *j
; if ((t0 & 2) != 0)
; foo()
; }
; // A new phi for t1 is inserted here.
; return (t1 * 42) + (t1 - 99)
define i32 @test_chr_10(i32* %i, i32* %j) !prof !14 {
; CHECK-LABEL: @test_chr_10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[J]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP3]], [[BB0]] ], [ [[TMP6]], [[BB2_NONCHR]] ], [ [[TMP6]], [[BB1_NONCHR]] ]
; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 42
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP9]], -99
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
; CHECK-NEXT: ret i32 [[TMP12]]
;
; Input IR: two profiled branches testing masks 1 and 2 of %0; the exit block
; bb3 has no phi but uses a value (%3) defined between the two branches.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; %3 is loaded unconditionally here. Note that the branch below tests %0
; (the load from %i), not %3.
%3 = load i32, i32* %j
%4 = and i32 %0, 2
%5 = icmp eq i32 %4, 0
br i1 %5, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
; %3 is used directly, with no phi in the input. In the expected output above
; the load exists once in bb0 and once in bb1.nonchr, so a new phi
; ([[TMP9]]) is needed here. The `sub ..., 99` appears there as
; `add ..., -99`.
%6 = mul i32 %3, 42
%7 = sub i32 %3, 99
%8 = add i32 %6, %7
ret i32 %8
}
; Test a case where there are two use-def chain paths to the same value (t0)
; from the branch condition. This is a regression test for an old bug that
; caused a bad hoisting that moves (hoists) a value (%conv) twice to the end of
; the %entry block (once for %div and once for %mul16) and put a use ahead of
; its definition like:
; %entry:
; ...
; %div = fdiv double 1.000000e+00, %conv
; %conv = sitofp i32 %0 to double
; %mul16 = fmul double %div, %conv
;
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; // there are two use-def paths from the branch condition to t0.
; if ((int)((1.0 / t0) * t0) > 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 1) != 0 & (int)((1.0 / t0) * t0) > 0) { // Likely true
; foo()
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; if ((int)((1.0 / t0) * t0) > 0) // Likely true
; foo()
; }
define void @test_chr_11(i32* %i, i32 %x) !prof !14 {
; CHECK-LABEL: @test_chr_11(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; CHECK-NEXT: [[DIV:%.*]] = fdiv double 1.000000e+00, [[CONV]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul double [[DIV]], [[CONV]]
; CHECK-NEXT: [[CONV717:%.*]] = fptosi double [[MUL16]] to i32
; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i32 [[CONV717]], 0
; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[CMP18]]
; CHECK-NEXT: br i1 [[TMP3]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0_NONCHR:%.*]], label [[BB1_NONCHR:%.*]], !prof !18
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[CONV_NONCHR:%.*]] = sitofp i32 [[TMP0]] to double
; CHECK-NEXT: [[DIV_NONCHR:%.*]] = fdiv double 1.000000e+00, [[CONV_NONCHR]]
; CHECK-NEXT: [[MUL16_NONCHR:%.*]] = fmul double [[DIV_NONCHR]], [[CONV_NONCHR]]
; CHECK-NEXT: [[CONV717_NONCHR:%.*]] = fptosi double [[MUL16_NONCHR]] to i32
; CHECK-NEXT: [[CMP18_NONCHR:%.*]] = icmp slt i32 [[CONV717_NONCHR]], 1
; CHECK-NEXT: br i1 [[CMP18_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; Input IR: two profiled branches; the second one's condition is computed
; from %0 through a chain of floating-point instructions in bb1.
; NOTE(review): the parameter %x is never referenced in the body.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; %conv reaches %mul16 along two paths: through %div and directly as the
; second fmul operand. The expected output above has each of %conv, %div,
; %mul16 and %conv717 exactly once in entry, each defined before its uses.
%conv = sitofp i32 %0 to double
%div = fdiv double 1.000000e+00, %conv
%mul16 = fmul double %div, %conv
%conv717 = fptosi double %mul16 to i32
%cmp18 = icmp slt i32 %conv717, 1
br i1 %cmp18, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Selects + unrelated br only
define i32 @test_chr_12(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_12(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[BB3:%.*]], label [[BB0:%.*]], !prof !16
; CHECK: bb0:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SUM0:%.*]], 42
; CHECK-NEXT: [[SUM1:%.*]] = select i1 [[TMP4]], i32 [[SUM0]], i32 [[TMP5]], !prof !16
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SUM1]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[TMP7]], i32 [[SUM1]], i32 [[TMP8]], !prof !16
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP10]], [[TMP12]]
; CHECK-NEXT: br i1 [[TMP13]], label [[BB1:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1:
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SUM2]], 88
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb0.split.nonchr:
; CHECK-NEXT: br i1 [[TMP10]], label [[BB1_NONCHR:%.*]], label [[BB3]], !prof !18
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88, !prof !16
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[SUM0]], [[ENTRY:%.*]] ], [ [[TMP14]], [[BB1]] ], [ [[SUM2]], [[BB0_SPLIT_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
; Input IR: an outer profiled branch on %0, two profiled selects on %0 in bb0,
; then a profiled branch whose condition comes from a second load of %i, and
; one more profiled select on %0 in bb1.
; In the expected output above the outer branch and the two selects in bb0 are
; left as they are; only the branch on the reloaded value and the select in
; bb1 are combined into a single test.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
; Two selects keyed on masks 1 and 2 of %0.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = and i32 %0, 2
%7 = icmp eq i32 %6, 0
%8 = add i32 %sum1, 43
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
; %9 reloads from %i; the branch below depends on this reload, not on %0.
%9 = load i32, i32* %i
%10 = icmp eq i32 %9, 0
br i1 %10, label %bb2, label %bb1, !prof !15
bb1:
; Select keyed on mask 8 of %0 (the first load).
%sum3 = add i32 %sum2, 44
%11 = and i32 %0, 8
%12 = icmp eq i32 %11, 0
%13 = add i32 %sum3, 44
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; In the second CHR, a condition value depends on a trivial phi that's inserted
; by the first CHR.
; Roughly,
; i0 = *i
; v2 = (z != 1) ? pred : true // Likely false
; if (z == 0 & pred) // Likely false
; foo()
; j0 = *j
; sum2 = ((i0 & 2) == j0) ? sum0 : (sum0 + 43) // Likely false
; sum3 = (i0 == j0) ? sum0 : (sum0 + 43) // Likely false
; foo()
; if ((i0 & 4) != 0) // Unbiased
; foo()
; return i0 + sum3
; ->
; i0 = *i
; if (z != 1 & (z == 0 & pred)) // First CHR
; foo()
; // A trivial phi for i0 is inserted here by the first CHR (which gets removed
; // later) and the subsequent branch condition (for the second CHR) uses it.
; j0 = *j
; if ((i0 & 2) != j0 & i0 != j0) { // Second CHR
; sum3 = sum0 + 43
; foo()
; if ((i0 & 4) != 0)
; foo()
; } else {
; sum3 = (i0 == j0) ? sum0 : (sum0 + 43)
; foo()
; if ((i0 & 4) != 0)
; foo()
; }
; return i0 + sum3
define i32 @test_chr_14(i32* %i, i32* %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
; CHECK-LABEL: @test_chr_14(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[V1:%.*]] = icmp ne i32 [[Z:%.*]], 1
; CHECK-NEXT: [[V0:%.*]] = icmp eq i32 [[Z]], 0
; CHECK-NEXT: [[V3_NONCHR:%.*]] = and i1 [[V0]], [[PRED:%.*]]
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[V1]], [[V3_NONCHR]]
; CHECK-NEXT: br i1 [[OR_COND]], label [[BB0_NONCHR:%.*]], label [[BB1:%.*]], !prof !19
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp ne i32 [[V6]], [[J0]]
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[V5:%.*]] = icmp ne i32 [[I0]], [[J0]]
; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[V4]], [[V5]]
; CHECK-NEXT: br i1 [[TMP0]], label [[BB1_SPLIT:%.*]], label [[BB1_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1.split:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB3:%.*]], label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb1.split.nonchr:
; CHECK-NEXT: [[V5_NONCHR:%.*]] = icmp eq i32 [[I0]], [[J0]]
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = select i1 [[V5_NONCHR]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0
; CHECK-NEXT: br i1 [[V10_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]]
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[V8]], [[BB2]] ], [ [[V8]], [[BB1_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB2_NONCHR]] ], [ [[SUM3_NONCHR]], [[BB1_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[TMP1]]
; CHECK-NEXT: ret i32 [[V11]]
;
; Input IR: a profiled select and branch in entry, followed in bb1 by two
; profiled selects and an unprofiled branch. %i0 is loaded in entry and used
; in bb1 and bb3.
entry:
%i0 = load i32, i32* %i
%v0 = icmp eq i32 %z, 0
%v1 = icmp ne i32 %z, 1
; %v2 has no users in this function; the branch uses %v3 = %v0 & %pred.
%v2 = select i1 %v1, i1 %pred, i1 true, !prof !15
%v3 = and i1 %v0, %pred
br i1 %v3, label %bb0, label %bb1, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%j0 = load i32, i32* %j
%v6 = and i32 %i0, 2
%v4 = icmp eq i32 %v6, %j0
%v8 = add i32 %sum0, 43
; %sum2 has no users in this function; only %sum3 reaches the return value.
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
%v5 = icmp eq i32 %i0, %j0
%sum3 = select i1 %v5, i32 %sum0, i32 %v8, !prof !15
call void @foo()
%v9 = and i32 %i0, 4
%v10 = icmp eq i32 %v9, 0
; This branch carries no !prof metadata.
br i1 %v10, label %bb3, label %bb2
bb2:
call void @foo()
br label %bb3
bb3:
%v11 = add i32 %i0, %sum3
ret i32 %v11
}
; A branch or select depends on another select. No CHR happens.
; Roughly,
; i0 = *i
; if (z == 0 & ((z != 1) ? pred : true)) // Likely false
; foo()
; j0 = *j
; sum2 = ((i0 & 2) == j0) ? sum0 : (sum0 + 43) // Likely false
; sum3 = (i0 == sum2) ? sum2 : (sum0 + 43) // Likely false. This depends on the
; // previous select.
; foo()
; if ((i0 & 4) != 0) // Unbiased
; foo()
; return i0 + sum3
; ->
; (no change)
define i32 @test_chr_15(i32* %i, i32* %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
; CHECK-LABEL: @test_chr_15(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[V0:%.*]] = icmp eq i32 [[Z:%.*]], 0
; CHECK-NEXT: [[V3:%.*]] = and i1 [[V0]], [[PRED:%.*]]
; CHECK-NEXT: br i1 [[V3]], label [[BB0:%.*]], label [[BB1:%.*]], !prof !16
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]]
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB3:%.*]], label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[V5:%.*]] = icmp eq i32 [[I0]], [[SUM2]]
; CHECK-NEXT: [[SUM3:%.*]] = select i1 [[V5]], i32 [[SUM2]], i32 [[V8]], !prof !16
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[SUM3]]
; CHECK-NEXT: ret i32 [[V11]]
;
; Input IR: profiled selects and branches in which one condition is built from
; the result of another select. The expected output above contains no .nonchr
; clone blocks.
entry:
%i0 = load i32, i32* %i
%v0 = icmp eq i32 %z, 0
%v1 = icmp ne i32 %z, 1
%v2 = select i1 %v1, i1 %pred, i1 true, !prof !15
; The branch condition %v3 takes the select result %v2 as an operand.
%v3 = and i1 %v0, %v2
br i1 %v3, label %bb0, label %bb1, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%j0 = load i32, i32* %j
%v6 = and i32 %i0, 2
%v4 = icmp eq i32 %v6, %j0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
; %v5 compares against %sum2, so the select %sum3 depends on the select %sum2.
%v5 = icmp eq i32 %i0, %sum2
%sum3 = select i1 %v5, i32 %sum2, i32 %v8, !prof !15
call void @foo()
%v9 = and i32 %i0, 4
%v10 = icmp eq i32 %v9, 0
; This branch carries no !prof metadata.
br i1 %v10, label %bb3, label %bb2
bb2:
call void @foo()
br label %bb3
bb3:
%v11 = add i32 %i0, %sum3
ret i32 %v11
}
; With an existing phi at the exit but a value (%v40) is both alive and is an
; operand to a phi at the exit block.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; v40 = t0 + 44
; if ((t0 & 2) != 0) { // Likely true
; v41 = t0 + 99
; foo()
; }
; v42 = phi v40, v41
; return v42 + v40
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; v40 = t0 + 44
; v41 = t0 + 99
; foo()
; } else {
; if ((t0 & 1) != 0) // Likely true
; foo()
; v40_nc = t0 + 44
; if ((t0 & 2) != 0) { // Likely true
; v41_nc = t0 + 99
; foo()
; }
; }
; t7 = phi v40, v40_nc
; v42 = phi v41, v41_nc
; v43 = v42 + t7
; return v43
define i32 @test_chr_16(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V40:%.*]] = add i32 [[TMP0]], 44
; CHECK-NEXT: [[V41:%.*]] = add i32 [[TMP0]], 99
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[V40_NONCHR:%.*]] = add i32 [[TMP0]], 44
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: [[V41_NONCHR:%.*]] = add i32 [[TMP0]], 99
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[V40]], [[BB0]] ], [ [[V40_NONCHR]], [[BB2_NONCHR]] ], [ [[V40_NONCHR]], [[BB1_NONCHR]] ]
; CHECK-NEXT: [[V42:%.*]] = phi i32 [ [[V41]], [[BB0]] ], [ [[V41_NONCHR]], [[BB2_NONCHR]] ], [ [[V40_NONCHR]], [[BB1_NONCHR]] ]
; CHECK-NEXT: [[V43:%.*]] = add i32 [[V42]], [[TMP7]]
; CHECK-NEXT: ret i32 [[V43]]
;
; Input IR: two profiled branches testing masks 1 and 2 of %0, with a phi in
; the exit block bb3.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; %v40 is defined here and used twice in bb3: as an incoming value of the phi
; %v42 and as a direct operand of %v43.
%v40 = add i32 %0, 44
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
%v41 = add i32 %0, 99
call void @foo()
br label %bb3
bb3:
; In the expected output above %v40 gets its own new phi ([[TMP7]]) next to
; the pre-existing phi for %v42, and %v43 uses that new phi.
%v42 = phi i32 [ %v41, %bb2 ], [ %v40, %bb1 ]
%v43 = add i32 %v42, %v40
ret i32 %v43
}
; Two consecutive regions have an entry in the middle of them. No CHR happens.
; Roughly,
; if ((i & 4) != 0) {
; if (!j)
; goto bb1
; } else {
; t0 = (i & 1)
; if (t0 != 0) // Likely true
; foo()
; s = (i & 1) + i
; }
; bb1:
; p = phi i, t0, s
; if ((i & 2) != 0) { // Likely true
; foo()
; q = p + 2
; }
; r = phi p, q, i
; return r
; ->
; (no change)
define i32 @test_chr_17(i32 %i, i1 %j) !prof !14 {
; CHECK-LABEL: @test_chr_17(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[V0:%.*]] = and i32 [[I:%.*]], 4
; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0
; CHECK-NEXT: br i1 [[V1]], label [[BBE:%.*]], label [[BBQ:%.*]]
; CHECK: bbq:
; CHECK-NEXT: br i1 [[J:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bbe:
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I]], 1
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1]], label [[BB0:%.*]], !prof !16
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[S:%.*]] = add i32 [[TMP0]], [[I]]
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[I]], [[BBQ]] ], [ [[TMP0]], [[BBE]] ], [ [[S]], [[BB0]] ]
; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[I]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[BB3]], label [[BB2:%.*]], !prof !16
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[Q:%.*]] = add i32 [[P]], [[TMP2]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[P]], [[BB1]] ], [ [[Q]], [[BB2]] ], [ [[I]], [[BBQ]] ]
; CHECK-NEXT: ret i32 [[R]]
;
; Input IR: two profiled branch regions (bbe/bb0 and bb1/bb2) where the second
; one can also be entered from bbq. The expected output above has the same
; blocks and branches as the input, with no .nonchr clone blocks.
entry:
%v0 = and i32 %i, 4
%v1 = icmp eq i32 %v0, 0
; Neither this branch nor the one in bbq carries !prof metadata.
br i1 %v1, label %bbe, label %bbq
bbq:
; bbq goes either straight to the exit bb3 or into bb1.
br i1 %j, label %bb3, label %bb1
bbe:
%0 = and i32 %i, 1
%1 = icmp eq i32 %0, 0
br i1 %1, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
%s = add i32 %0, %i
br label %bb1
bb1:
; bb1 has three predecessors: bbq, bbe and bb0.
%p = phi i32 [ %i, %bbq ], [ %0, %bbe ], [ %s, %bb0 ]
%2 = and i32 %i, 2
%3 = icmp eq i32 %2, 0
br i1 %3, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
%q = add i32 %p, %2
br label %bb3
bb3:
%r = phi i32 [ %p, %bb1 ], [ %q, %bb2 ], [ %i, %bbq ]
ret i32 %r
}
; Select + br, there's a loop and we need to update the user of an inserted phi
; at the entry block. This is a regression test for a bug that's fixed.
; Roughly,
; do {
; inc1 = phi inc2, 0
; li = *i
; sum1 = sum0 + 42
; sum2 = ((li & 1) == 0) ? sum0 : sum1 // Likely false
; inc2 = inc1 + 1
; if ((li & 4) != 0) // Likely true
; sum3 = sum2 + 44
; sum4 = phi sum1, sum3
; } while (inc2 != 100) // Likely true (loop back)
; return sum4
; ->
; do {
; inc1 = phi tmp2, 0 // The first operand needed to be updated
; li = *i
; sum1 = sum0 + 42
; if ((li & 5) == 5) { // Likely true
; inc2 = inc1 + 1
; sum3 = sum0 + 86
; } else {
; inc2_nc = inc1 + 1
; if ((li & 4) != 0) {
; sum2_nc = ((li & 1) == 0) ? sum0 : sum1
; sum3_nc = sum2_nc + 44
; }
; }
; tmp2 = phi inc2, inc2_nc
; sum4 = phi sum3, sum3_nc, sum1
; } while (tmp2 != 100)
; return sum4
define i32 @test_chr_18(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_18(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[BB0:%.*]]
; CHECK: bb0:
; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ [[TMP2:%.*]], [[BB2:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[LI:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[SUM1:%.*]] = add i32 [[SUM0:%.*]], 42
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[LI]], 5
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 5
; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0.split:
; CHECK-NEXT: [[INC2:%.*]] = add i32 [[INC1]], 1
; CHECK-NEXT: [[SUM3:%.*]] = add i32 [[SUM0]], 86
; CHECK-NEXT: br label [[BB2]]
; CHECK: bb0.split.nonchr:
; CHECK-NEXT: [[A4_NONCHR:%.*]] = and i32 [[LI]], 4
; CHECK-NEXT: [[CMP4_NONCHR:%.*]] = icmp eq i32 [[A4_NONCHR]], 0
; CHECK-NEXT: [[INC2_NONCHR:%.*]] = add i32 [[INC1]], 1
; CHECK-NEXT: br i1 [[CMP4_NONCHR]], label [[BB2]], label [[BB1_NONCHR:%.*]], !prof !16
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[A1:%.*]] = and i32 [[LI]], 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A1]], 0
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[CMP1]], i32 [[SUM0]], i32 [[SUM1]], !prof !16
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], 44
; CHECK-NEXT: br label [[BB2]]
; CHECK: bb2:
; CHECK-NEXT: [[TMP2]] = phi i32 [ [[INC2]], [[BB0_SPLIT]] ], [ [[INC2_NONCHR]], [[BB1_NONCHR]] ], [ [[INC2_NONCHR]], [[BB0_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[SUM4:%.*]] = phi i32 [ [[SUM3]], [[BB0_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB1_NONCHR]] ], [ [[SUM1]], [[BB0_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 100
; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB0]], !prof !16
; CHECK: bb3:
; CHECK-NEXT: ret i32 [[SUM4]]
;
; Input IR: a loop (bb0 -> bb1 -> bb2 -> bb0) containing one profiled select
; and one profiled branch in the header bb0.
entry:
br label %bb0
bb0:
; Loop header: %inc1 takes %inc2 over the back edge from bb2 and 0 from entry.
%inc1 = phi i32 [ %inc2, %bb2 ], [ 0, %entry ]
%li = load i32, i32* %i
%a1 = and i32 %li, 1
%cmp1 = icmp eq i32 %a1, 0
%sum1 = add i32 %sum0, 42
%sum2 = select i1 %cmp1, i32 %sum0, i32 %sum1, !prof !15
%a4 = and i32 %li, 4
%cmp4 = icmp eq i32 %a4, 0
; %inc2 is defined here but used in bb2 (exit test) and by the header phi.
; In the expected output above it is duplicated into bb0.split and
; bb0.split.nonchr and merged by a new phi ([[TMP2]]) in bb2; the header
; phi's back-edge operand is that new phi.
%inc2 = add i32 %inc1, 1
br i1 %cmp4, label %bb2, label %bb1, !prof !15
bb1:
%sum3 = add i32 %sum2, 44
br label %bb2
bb2:
; Latch: leaves the loop to bb3 when %inc2 equals 100, else returns to bb0.
%sum4 = phi i32 [ %sum1, %bb0 ], [ %sum3, %bb1 ]
%cmp = icmp eq i32 %inc2, 100
br i1 %cmp, label %bb3, label %bb0, !prof !15
bb3:
ret i32 %sum4
}
; Selects + Brs. Those share the condition value, which causes the
; targets/operands of the branch/select to be flipped.
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) { // Likely true
; sum1 = ((t0 & 1) == 0) ? sum0 : (sum0 + 42) // Likely false
; sum2 = ((t0 & 1) == 0) ? sum1 : (sum1 + 43) // Likely false
; if ((t0 & 1) != 0) { // Likely true
; sum3 = sum2 + 44
; sum4 = ((t0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
; }
; sum5 = phi sum2, sum4
; }
; sum6 = phi sum0, sum5
; return sum6
; ->
; t0 = *i
; if ((t0 & 9) == 9) { // Likely true
; tmp3 = sum0 + 85 // Dead
; tmp4 = sum0 + 173
; } else {
; if ((t0 & 255) != 0) {
; sum2_nc = ((t0 & 1) == 0) ? sum0 : (sum0 + 85)
; sum4_nc_v = ((t0 & 8) == 0) ? 44 : 88
; sum4_nc = add sum2_nc + sum4_nc_v
; }
; }
; sum6 = phi tmp4, sum0, sum2_nc, sum4_nc
; return sum6
define i32 @test_chr_19(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_19(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 9
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 9
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 173
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM0]], 85
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM0]], i32 [[TMP9]], !prof !16
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP11]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
; Input IR. Both selects in bb0 and the branch that ends bb0 use the same
; condition value (%4).
entry:
; Outer guard: go straight to bb3 when the low byte of the loaded value is 0.
%0 = load i32, i32* %i
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
; %4 is true when bit mask 1 of %0 is clear. It feeds %sum1, %sum2 and the
; branch below.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
; When %4 is false: %sum1 = %sum0 + 42, then %sum2 = %sum1 + 43.
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = add i32 %sum1, 43
%sum2 = select i1 %4, i32 %sum1, i32 %6, !prof !15
br i1 %4, label %bb2, label %bb1, !prof !15
bb1:
; Always adds 44; adds a further 44 when bit mask 8 of %0 is set (%8 false).
%sum3 = add i32 %sum2, 44
%7 = and i32 %0, 8
%8 = icmp eq i32 %7, 0
%9 = add i32 %sum3, 44
%sum4 = select i1 %8, i32 %sum3, i32 %9, !prof !15
br label %bb2
bb2:
; Merge of the bb0 -> bb2 edge (%sum2) and the bb1 path (%sum4).
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
; Result is %sum0 unchanged when the outer guard skipped bb0.
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects. The exit block, which belongs to the top-level region, has a select
; and causes the top-level region to be the outermost CHR scope, with a
; subscope that includes the entry block with two selects. Because the entry
; block is in the subscope, the outermost CHR scope doesn't see the selects in
; the entry block. As a result, it incorrectly sets the CHR hoist point to the
; branch rather than to the first select in the entry block, which causes the
; CHR'ed selects ("select i1 false...") to be incorrectly positioned above the
; CHR branch. This tests against a quirk of how the region analysis handles the
; entry block.
; Roughly,
; i0 = *i
; sum2 = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; sum3 = ((i0 & 4) == 0) ? sum2 : (sum2 + 44) // Likely false
; if (j)
; foo()
; i5 = *i
; v13 = (i5 == 44) ? i5 : sum3
; return v13
; ->
; i0 = *i
; if ((i0 & 6) == 6) { // Likely true
; v9 = sum0 + 87
; if (j)
; foo()
; } else {
; sum2.nc = ((i0 & 2) == 0) ? sum0 : (sum0 + 43)
; sum3.nc = ((i0 & 4) == 0) ? sum2.nc : (sum2.nc + 44)
; if (j)
; foo()
; }
; t2 = phi v9, sum3.nc
; i5 = *i
; v13 = (i5 == 44) ? 44 : t2
; return v13
define i32 @test_chr_20(i32* %i, i32 %sum0, i1 %j) !prof !14 {
; CHECK-LABEL: @test_chr_20(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I0]], 6
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 6
; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: entry.split:
; CHECK-NEXT: [[V9:%.*]] = add i32 [[SUM0:%.*]], 87
; CHECK-NEXT: br i1 [[J:%.*]], label [[BB1:%.*]], label [[BB4:%.*]]
; CHECK: bb1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0]], 43
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: [[V6_NONCHR:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V5_NONCHR:%.*]] = icmp eq i32 [[V6_NONCHR]], 0
; CHECK-NEXT: [[V9_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], 44
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = select i1 [[V5_NONCHR]], i32 [[SUM2_NONCHR]], i32 [[V9_NONCHR]], !prof !16
; CHECK-NEXT: br i1 [[J]], label [[BB1_NONCHR:%.*]], label [[BB4]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb4:
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[V9]], [[BB1]] ], [ [[V9]], [[ENTRY_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB1_NONCHR]] ], [ [[SUM3_NONCHR]], [[ENTRY_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[I]], align 4
; CHECK-NEXT: [[V12:%.*]] = icmp eq i32 [[I5]], 44
; CHECK-NEXT: [[V13:%.*]] = select i1 [[V12]], i32 44, i32 [[TMP2]], !prof !16
; CHECK-NEXT: ret i32 [[V13]]
;
; Input IR: two chained selects in the entry block, an unprofiled branch on %j,
; and a third select in the exit block (bb4).
entry:
; %sum2 tests bit mask 2 of %i0; %sum3 tests bit mask 4 and takes %sum2 as its
; input. Both selects carry !prof !15 (metadata defined elsewhere in the file).
%i0 = load i32, i32* %i
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
%v6 = and i32 %i0, 4
%v5 = icmp eq i32 %v6, 0
%v9 = add i32 %sum2, 44
%sum3 = select i1 %v5, i32 %sum2, i32 %v9, !prof !15
; This branch on %j has no !prof metadata attached.
br i1 %j, label %bb1, label %bb4
bb1:
call void @foo()
br label %bb4
bb4:
; Exit block: reloads *%i and selects between the reloaded value and %sum3,
; which was computed in the entry block.
%i5 = load i32, i32* %i
%v12 = icmp eq i32 %i5, 44
%v13 = select i1 %v12, i32 %i5, i32 %sum3, !prof !15
ret i32 %v13
}
; Test the case where two scopes share a common instruction to hoist (%cmp.i).
; Both scopes would hoist it to their hoist points, but since the outer scope
; (entry/bb6-9) hoists it to its hoist point first, it'd be wrong (causing bad
; IR) for the inner scope (bb1-4) to hoist the same instruction to its hoist
; point.
; Roughly,
; if (j != k) {
; if (i != 2)
; foo();
; cmp.i = i == 86
; if (!cmp.i)
; foo();
; if (j != i)
; foo();
; if (!cmp.i)
; foo();
; }
; return 45;
define i32 @test_chr_21(i64 %i, i64 %k, i64 %j) !prof !14 {
; CHECK-LABEL: @test_chr_21(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i64 [[J:%.*]], [[K:%.*]]
; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[J]], [[I:%.*]]
; CHECK-NEXT: [[CMP_I:%.*]] = icmp ne i64 [[I]], 86
; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[CMP0]], [[CMP3]]
; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[CMP_I]]
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1:
; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[I]], 2
; CHECK-NEXT: switch i64 [[I]], label [[BB2:%.*]] [
; CHECK-NEXT: i64 2, label [[BB3_NONCHR2:%.*]]
; CHECK-NEXT: i64 86, label [[BB2_NONCHR1:%.*]]
; CHECK-NEXT: ], !prof !20
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7:%.*]]
; CHECK: bb2.nonchr1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3_NONCHR2]]
; CHECK: bb3.nonchr2:
; CHECK-NEXT: br i1 [[CMP_I]], label [[BB4_NONCHR3:%.*]], label [[BB7]], !prof !18
; CHECK: bb4.nonchr3:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb7:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB10:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: br i1 [[CMP0]], label [[BB1_NONCHR:%.*]], label [[BB10]], !prof !18
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[CMP2_NONCHR:%.*]] = icmp eq i64 [[I]], 2
; CHECK-NEXT: br i1 [[CMP2_NONCHR]], label [[BB3_NONCHR:%.*]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb3.nonchr:
; CHECK-NEXT: [[CMP_I_NONCHR:%.*]] = icmp eq i64 [[I]], 86
; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB6_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof !16
; CHECK: bb6.nonchr:
; CHECK-NEXT: [[CMP3_NONCHR:%.*]] = icmp eq i64 [[J]], [[I]]
; CHECK-NEXT: br i1 [[CMP3_NONCHR]], label [[BB8_NONCHR:%.*]], label [[BB7_NONCHR:%.*]], !prof !16
; CHECK: bb8.nonchr:
; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB10]], label [[BB9_NONCHR:%.*]], !prof !16
; CHECK: bb9.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB10]]
; CHECK: bb7.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB8_NONCHR]]
; CHECK: bb4.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB6_NONCHR]]
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3_NONCHR]]
; CHECK: bb10:
; CHECK-NEXT: ret i32 45
;
; Input IR: an outer guard around four consecutive if-then diamonds, each of
; which conditionally calls @foo. %cmp.i is defined in bb3 and reused in bb8.
entry:
; Outer guard: skip to bb10 when %j == %k.
%cmp0 = icmp eq i64 %j, %k
br i1 %cmp0, label %bb10, label %bb1, !prof !15
bb1:
; First diamond: bb2 calls @foo unless %i == 2.
%cmp2 = icmp eq i64 %i, 2
br i1 %cmp2, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
; Second diamond. %cmp.i is defined here and used again by the branch in bb8.
%cmp.i = icmp eq i64 %i, 86
br i1 %cmp.i, label %bb5, label %bb4, !prof !15
bb4:
call void @foo()
br label %bb5
bb5:
; Empty block that only forwards to bb6.
br label %bb6
bb6:
; Third diamond: bb7 calls @foo unless %j == %i.
%cmp3 = icmp eq i64 %j, %i
br i1 %cmp3, label %bb8, label %bb7, !prof !15
bb7:
call void @foo()
br label %bb8
bb8:
; Fourth diamond: second use of %cmp.i, which is defined in bb3.
br i1 %cmp.i, label %bb10, label %bb9, !prof !15
bb9:
call void @foo()
br label %bb10
bb10:
; Every path returns the same constant.
ret i32 45
}
; Test a case with a really long use-def chain. This test checks that it's not
; really slow and doesn't appear to be hanging.
define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 {
; CHECK-LABEL: @test_chr_22(
; CHECK-NEXT: bb0:
; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1
; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3
; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991
; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4
; CHECK-NEXT: ret i64 99
;
bb0:
%v1 = add i64 %v0, 3
%v2 = add i64 %v1, %v0
%c1 = icmp sgt i64 %v2, 99
%v3 = select i1 %c1, i64 %v1, i64 %v2, !prof !15
%v4 = add i64 %v2, %v2
%v5 = add i64 %v4, %v2
%v6 = add i64 %v5, %v4
%v7 = add i64 %v6, %v5
%v8 = add i64 %v7, %v6
%v9 = add i64 %v8, %v7
%v10 = add i64 %v9, %v8
%v11 = add i64 %v10, %v9
%v12 = add i64 %v11, %v10
%v13 = add i64 %v12, %v11
%v14 = add i64 %v13, %v12
%v15 = add i64 %v14, %v13
%v16 = add i64 %v15, %v14
%v17 = add i64 %v16, %v15
%v18 = add i64 %v17, %v16
%v19 = add i64 %v18, %v17
%v20 = add i64 %v19, %v18
%v21 = add i64 %v20, %v19
%v22 = add i64 %v21, %v20
%v23 = add i64 %v22, %v21
%v24 = add i64 %v23, %v22
%v25 = add i64 %v24, %v23
%v26 = add i64 %v25, %v24
%v27 = add i64 %v26, %v25
%v28 = add i64 %v27, %v26
%v29 = add i64 %v28, %v27
%v30 = add i64 %v29, %v28
%v31 = add i64 %v30, %v29
%v32 = add i64 %v31, %v30
%v33 = add i64 %v32, %v31
%v34 = add i64 %v33, %v32
%v35 = add i64 %v34, %v33
%v36 = add i64 %v35, %v34
%v37 = add i64 %v36, %v35
%v38 = add i64 %v37, %v36
%v39 = add i64 %v38, %v37
%v40 = add i64 %v39, %v38
%v41 = add i64 %v40, %v39
%v42 = add i64 %v41, %v40
%v43 = add i64 %v42, %v41
%v44 = add i64 %v43, %v42
%v45 = add i64 %v44, %v43
%v46 = add i64 %v45, %v44
%v47 = add i64 %v46, %v45
%v48 = add i64 %v47, %v46
%v49 = add i64 %v48, %v47
%v50 = add i64 %v49, %v48
%v51 = add i64 %v50, %v49
%v52 = add i64 %v51, %v50
%v53 = add i64 %v52, %v51
%v54 = add i64 %v53, %v52
%v55 = add i64 %v54, %v53
%v56 = add i64 %v55, %v54
%v57 = add i64 %v56, %v55
%v58 = add i64 %v57, %v56
%v59 = add i64 %v58, %v57
%v60 = add i64 %v59, %v58
%v61 = add i64 %v60, %v59
%v62 = add i64 %v61, %v60
%v63 = add i64 %v62, %v61
%v64 = add i64 %v63, %v62
%v65 = add i64 %v64, %v63
%v66 = add i64 %v65, %v64
%v67 = add i64 %v66, %v65
%v68 = add i64 %v67, %v66
%v69 = add i64 %v68, %v67
%v70 = add i64 %v69, %v68
%v71 = add i64 %v70, %v69
%v72 = add i64 %v71, %v70
%v73 = add i64 %v72, %v71
%v74 = add i64 %v73, %v72
%v75 = add i64 %v74, %v73
%v76 = add i64 %v75, %v74
%v77 = add i64 %v76, %v75
%v78 = add i64 %v77, %v76
%v79 = add i64 %v78, %v77
%v80 = add i64 %v79, %v78
%v81 = add i64 %v80, %v79
%v82 = add i64 %v81, %v80
%v83 = add i64 %v82, %v81
%v84 = add i64 %v83, %v82
%v85 = add i64 %v84, %v83
%v86 = add i64 %v85, %v84
%v87 = add i64 %v86, %v85
%v88 = add i64 %v87, %v86
%v89 = add i64 %v88, %v87
%v90 = add i64 %v89, %v88
%v91 = add i64 %v90, %v89
%v92 = add i64 %v91, %v90
%v93 = add i64 %v92, %v91
%v94 = add i64 %v93, %v92
%v95 = add i64 %v94, %v93
%v96 = add i64 %v95, %v94
%v97 = add i64 %v96, %v95
%v98 = add i64 %v97, %v96
%v99 = add i64 %v98, %v97
%v100 = add i64 %v99, %v98
%v101 = add i64 %v100, %v99
%v102 = add i64 %v101, %v100
%v103 = add i64 %v102, %v101
%v104 = add i64 %v103, %v102
%v105 = add i64 %v104, %v103
%v106 = add i64 %v105, %v104
%v107 = add i64 %v106, %v105
%v108 = add i64 %v107, %v106
%v109 = add i64 %v108, %v107
%v110 = add i64 %v109, %v108
%v111 = add i64 %v110, %v109
%v112 = add i64 %v111, %v110
%v113 = add i64 %v112, %v111
%v114 = add i64 %v113, %v112
%v115 = add i64 %v114, %v113
%v116 = add i64 %v115, %v114
%v117 = add i64 %v116, %v115
%v118 = add i64 %v117, %v116
%v119 = add i64 %v118, %v117
%v120 = add i64 %v119, %v118
%v121 = add i64 %v120, %v119
%v122 = add i64 %v121, %v120
%v123 = add i64 %v122, %v121
%v124 = add i64 %v123, %v122
%v125 = add i64 %v124, %v123
%v126 = add i64 %v125, %v124
%v127 = add i64 %v126, %v125
%v128 = add i64 %v127, %v126
%v129 = add i64 %v128, %v127
%v130 = add i64 %v129, %v128
%v131 = add i64 %v130, %v129
%v132 = add i64 %v131, %v130
%v133 = add i64 %v132, %v131
%v134 = add i64 %v133, %v132
%v135 = add i64 %v134, %v133
%v136 = add i64 %v135, %v134
%v137 = add i64 %v136, %v135
%v138 = add i64 %v137, %v136
%v139 = add i64 %v138, %v137
%v140 = add i64 %v139, %v138
%v141 = add i64 %v140, %v139
%v142 = add i64 %v141, %v140
%v143 = add i64 %v142, %v141
%v144 = add i64 %v143, %v142
%v145 = add i64 %v144, %v143
%v146 = add i64 %v145, %v144
%v147 = add i64 %v146, %v145
%v148 = add i64 %v147, %v146
%v149 = add i64 %v148, %v147
%v150 = add i64 %v149, %v148
%v151 = add i64 %v150, %v149
%v152 = add i64 %v151, %v150
%v153 = add i64 %v152, %v151
%v154 = add i64 %v153, %v152
%v155 = add i64 %v154, %v153
%v156 = add i64 %v155, %v154
%v157 = add i64 %v156, %v155
%v158 = add i64 %v157, %v156
%v159 = add i64 %v158, %v157
%v160 = add i64 %v159, %v158
%v161 = add i64 %v160, %v159
%v162 = add i64 %v161, %v160
%v163 = add i64 %v162, %v161
%v164 = add i64 %v163, %v162
%v165 = add i64 %v164, %v163
%v166 = add i64 %v165, %v164
%v167 = add i64 %v166, %v165
%v168 = add i64 %v167, %v166
%v169 = add i64 %v168, %v167
%v170 = add i64 %v169, %v168
%v171 = add i64 %v170, %v169
%v172 = add i64 %v171, %v170
%v173 = add i64 %v172, %v171
%v174 = add i64 %v173, %v172
%v175 = add i64 %v174, %v173
%v176 = add i64 %v175, %v174
%v177 = add i64 %v176, %v175
%v178 = add i64 %v177, %v176
%v179 = add i64 %v178, %v177
%v180 = add i64 %v179, %v178
%v181 = add i64 %v180, %v179
%v182 = add i64 %v181, %v180
%v183 = add i64 %v182, %v181
%v184 = add i64 %v183, %v182
%v185 = add i64 %v184, %v183
%v186 = add i64 %v185, %v184
%v187 = add i64 %v186, %v185
%v188 = add i64 %v187, %v186
%v189 = add i64 %v188, %v187
%v190 = add i64 %v189, %v188
%v191 = add i64 %v190, %v189
%v192 = add i64 %v191, %v190
%v193 = add i64 %v192, %v191
%v194 = add i64 %v193, %v192
%v195 = add i64 %v194, %v193
%v196 = add i64 %v195, %v194
%v197 = add i64 %v196, %v195
%v198 = add i64 %v197, %v196
%v199 = add i64 %v198, %v197
%v200 = add i64 %v199, %v198
%v201 = add i64 %v200, %v199
%v202 = add i64 %v201, %v200
%v203 = add i64 %v202, %v201
%v204 = add i64 %v203, %v202
%v205 = add i64 %v204, %v203
%v206 = add i64 %v205, %v204
%v207 = add i64 %v206, %v205
%v208 = add i64 %v207, %v206
%v209 = add i64 %v208, %v207
%v210 = add i64 %v209, %v208
%v211 = add i64 %v210, %v209
%v212 = add i64 %v211, %v210
%v213 = add i64 %v212, %v211
%v214 = add i64 %v213, %v212
%v215 = add i64 %v214, %v213
%v216 = add i64 %v215, %v214
%v217 = add i64 %v216, %v215
%v218 = add i64 %v217, %v216
%v219 = add i64 %v218, %v217
%v220 = add i64 %v219, %v218
%v221 = add i64 %v220, %v219
%v222 = add i64 %v221, %v220
%v223 = add i64 %v222, %v221
%v224 = add i64 %v223, %v222
%v225 = add i64 %v224, %v223
%v226 = add i64 %v225, %v224
%v227 = add i64 %v226, %v225
%v228 = add i64 %v227, %v226
%v229 = add i64 %v228, %v227
%v230 = add i64 %v229, %v228
%v231 = add i64 %v230, %v229
%v232 = add i64 %v231, %v230
%v233 = add i64 %v232, %v231
%v234 = add i64 %v233, %v232
%v235 = add i64 %v234, %v233
%v236 = add i64 %v235, %v234
%v237 = add i64 %v236, %v235
%v238 = add i64 %v237, %v236
%v239 = add i64 %v238, %v237
%v240 = add i64 %v239, %v238
%v241 = add i64 %v240, %v239
%v242 = add i64 %v241, %v240
%v243 = add i64 %v242, %v241
%v244 = add i64 %v243, %v242
%v245 = add i64 %v244, %v243
%v246 = add i64 %v245, %v244
%v247 = add i64 %v246, %v245
%v248 = add i64 %v247, %v246
%v249 = add i64 %v248, %v247
%v250 = add i64 %v249, %v248
%v251 = add i64 %v250, %v249
%v252 = add i64 %v251, %v250
%v253 = add i64 %v252, %v251
%v254 = add i64 %v253, %v252
%v255 = add i64 %v254, %v253
%v256 = add i64 %v255, %v254
%v257 = add i64 %v256, %v255
%v258 = add i64 %v257, %v256
%v259 = add i64 %v258, %v257
%v260 = add i64 %v259, %v258
%v261 = add i64 %v260, %v259
%v262 = add i64 %v261, %v260
%v263 = add i64 %v262, %v261
%v264 = add i64 %v263, %v262
%v265 = add i64 %v264, %v263
%v266 = add i64 %v265, %v264
%v267 = add i64 %v266, %v265
%v268 = add i64 %v267, %v266
%v269 = add i64 %v268, %v267
%v270 = add i64 %v269, %v268
%v271 = add i64 %v270, %v269
%v272 = add i64 %v271, %v270
%v273 = add i64 %v272, %v271
%v274 = add i64 %v273, %v272
%v275 = add i64 %v274, %v273
%v276 = add i64 %v275, %v274
%v277 = add i64 %v276, %v275
%v278 = add i64 %v277, %v276
%v279 = add i64 %v278, %v277
%v280 = add i64 %v279, %v278
%v281 = add i64 %v280, %v279
%v282 = add i64 %v281, %v280
%v283 = add i64 %v282, %v281
%v284 = add i64 %v283, %v282
%v285 = add i64 %v284, %v283
%v286 = add i64 %v285, %v284
%v287 = add i64 %v286, %v285
%v288 = add i64 %v287, %v286
%v289 = add i64 %v288, %v287
%v290 = add i64 %v289, %v288
%v291 = add i64 %v290, %v289
%v292 = add i64 %v291, %v290
%v293 = add i64 %v292, %v291
%v294 = add i64 %v293, %v292
%v295 = add i64 %v294, %v293
%v296 = add i64 %v295, %v294
%v297 = add i64 %v296, %v295
%v298 = add i64 %v297, %v296
%v299 = add i64 %v298, %v297
%v300 = add i64 %v299, %v298
%v301 = icmp eq i64 %v300, 100
%v302 = select i1 %v301, i64 %v298, i64 %v299, !prof !15
store i64 %v302, i64* %j
ret i64 99
}
; Test a case with really long use-def chains. This test checks that the pass
; is not really slow and doesn't appear to be hanging. This is different from
; test_chr_22 in that it has nested control structures (multiple scopes) and
; covers additional code.
;
; Input shape: %entry and each of %body, %body.1 .. %body.8 build the same
; nine-instruction add chain, compare the chain result against 100 and then
; either enter the next block or jump to %end. %body.9 builds the chain once
; more and branches to %end unconditionally. Every path returns 99.
define i64 @test_chr_23(i64 %v0) !prof !14 {
; CHECK-LABEL: @test_chr_23(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[V0:%.*]], 50
; CHECK-NEXT: [[V10:%.*]] = icmp ne i64 [[TMP0]], -50
; CHECK-NEXT: ret i64 99
;
; The assertions above expect the whole function to reduce to one multiply,
; one compare and the return.
;
; Chain arithmetic: with a = seed + 3, the successive values are
; a, 2a, 3a, 5a, 7a, 12a, 19a, 31a and 50a, so %v9 = 50 * (%v0 + 3) and
; "%v9 == 100" is the same as "50 * %v0 == -50" (wrapping i64 arithmetic),
; which matches the constants 50 and -50 in the expected output.
entry:
%v1 = add i64 %v0, 3
%v2 = add i64 %v1, %v1
%v3 = add i64 %v2, %v1
%v4 = add i64 %v2, %v3
%v5 = add i64 %v4, %v2
%v6 = add i64 %v5, %v4
%v7 = add i64 %v6, %v5
%v8 = add i64 %v7, %v6
%v9 = add i64 %v8, %v7
%v10 = icmp eq i64 %v9, 100
; !15 is branch_weights 0:1 (first listed successor : second listed successor).
br i1 %v10, label %body, label %end, !prof !15
; Each %body.N block below repeats the entry chain, seeded with the previous
; block's chain result (%v9, %v9_0, %v9_1, ...), so the use-def chain keeps
; growing across the nested conditions.
body:
%v1_0 = add i64 %v9, 3
%v2_0 = add i64 %v1_0, %v1_0
%v3_0 = add i64 %v2_0, %v1_0
%v4_0 = add i64 %v2_0, %v3_0
%v5_0 = add i64 %v4_0, %v2_0
%v6_0 = add i64 %v5_0, %v4_0
%v7_0 = add i64 %v6_0, %v5_0
%v8_0 = add i64 %v7_0, %v6_0
%v9_0 = add i64 %v8_0, %v7_0
%v10_0 = icmp eq i64 %v9_0, 100
br i1 %v10_0, label %body.1, label %end, !prof !15
body.1:
%v1_1 = add i64 %v9_0, 3
%v2_1 = add i64 %v1_1, %v1_1
%v3_1 = add i64 %v2_1, %v1_1
%v4_1 = add i64 %v2_1, %v3_1
%v5_1 = add i64 %v4_1, %v2_1
%v6_1 = add i64 %v5_1, %v4_1
%v7_1 = add i64 %v6_1, %v5_1
%v8_1 = add i64 %v7_1, %v6_1
%v9_1 = add i64 %v8_1, %v7_1
%v10_1 = icmp eq i64 %v9_1, 100
br i1 %v10_1, label %body.2, label %end, !prof !15
body.2:
%v1_2 = add i64 %v9_1, 3
%v2_2 = add i64 %v1_2, %v1_2
%v3_2 = add i64 %v2_2, %v1_2
%v4_2 = add i64 %v2_2, %v3_2
%v5_2 = add i64 %v4_2, %v2_2
%v6_2 = add i64 %v5_2, %v4_2
%v7_2 = add i64 %v6_2, %v5_2
%v8_2 = add i64 %v7_2, %v6_2
%v9_2 = add i64 %v8_2, %v7_2
%v10_2 = icmp eq i64 %v9_2, 100
br i1 %v10_2, label %body.3, label %end, !prof !15
body.3:
%v1_3 = add i64 %v9_2, 3
%v2_3 = add i64 %v1_3, %v1_3
%v3_3 = add i64 %v2_3, %v1_3
%v4_3 = add i64 %v2_3, %v3_3
%v5_3 = add i64 %v4_3, %v2_3
%v6_3 = add i64 %v5_3, %v4_3
%v7_3 = add i64 %v6_3, %v5_3
%v8_3 = add i64 %v7_3, %v6_3
%v9_3 = add i64 %v8_3, %v7_3
%v10_3 = icmp eq i64 %v9_3, 100
br i1 %v10_3, label %body.4, label %end, !prof !15
body.4:
%v1_4 = add i64 %v9_3, 3
%v2_4 = add i64 %v1_4, %v1_4
%v3_4 = add i64 %v2_4, %v1_4
%v4_4 = add i64 %v2_4, %v3_4
%v5_4 = add i64 %v4_4, %v2_4
%v6_4 = add i64 %v5_4, %v4_4
%v7_4 = add i64 %v6_4, %v5_4
%v8_4 = add i64 %v7_4, %v6_4
%v9_4 = add i64 %v8_4, %v7_4
%v10_4 = icmp eq i64 %v9_4, 100
br i1 %v10_4, label %body.5, label %end, !prof !15
body.5:
%v1_5 = add i64 %v9_4, 3
%v2_5 = add i64 %v1_5, %v1_5
%v3_5 = add i64 %v2_5, %v1_5
%v4_5 = add i64 %v2_5, %v3_5
%v5_5 = add i64 %v4_5, %v2_5
%v6_5 = add i64 %v5_5, %v4_5
%v7_5 = add i64 %v6_5, %v5_5
%v8_5 = add i64 %v7_5, %v6_5
%v9_5 = add i64 %v8_5, %v7_5
%v10_5 = icmp eq i64 %v9_5, 100
br i1 %v10_5, label %body.6, label %end, !prof !15
body.6:
%v1_6 = add i64 %v9_5, 3
%v2_6 = add i64 %v1_6, %v1_6
%v3_6 = add i64 %v2_6, %v1_6
%v4_6 = add i64 %v2_6, %v3_6
%v5_6 = add i64 %v4_6, %v2_6
%v6_6 = add i64 %v5_6, %v4_6
%v7_6 = add i64 %v6_6, %v5_6
%v8_6 = add i64 %v7_6, %v6_6
%v9_6 = add i64 %v8_6, %v7_6
%v10_6 = icmp eq i64 %v9_6, 100
br i1 %v10_6, label %body.7, label %end, !prof !15
body.7:
%v1_7 = add i64 %v9_6, 3
%v2_7 = add i64 %v1_7, %v1_7
%v3_7 = add i64 %v2_7, %v1_7
%v4_7 = add i64 %v2_7, %v3_7
%v5_7 = add i64 %v4_7, %v2_7
%v6_7 = add i64 %v5_7, %v4_7
%v7_7 = add i64 %v6_7, %v5_7
%v8_7 = add i64 %v7_7, %v6_7
%v9_7 = add i64 %v8_7, %v7_7
%v10_7 = icmp eq i64 %v9_7, 100
br i1 %v10_7, label %body.8, label %end, !prof !15
body.8:
%v1_8 = add i64 %v9_7, 3
%v2_8 = add i64 %v1_8, %v1_8
%v3_8 = add i64 %v2_8, %v1_8
%v4_8 = add i64 %v2_8, %v3_8
%v5_8 = add i64 %v4_8, %v2_8
%v6_8 = add i64 %v5_8, %v4_8
%v7_8 = add i64 %v6_8, %v5_8
%v8_8 = add i64 %v7_8, %v6_8
%v9_8 = add i64 %v8_8, %v7_8
%v10_8 = icmp eq i64 %v9_8, 100
br i1 %v10_8, label %body.9, label %end, !prof !15
; Innermost block: builds the chain one last time but has no compare; its
; result %v9_9 has no users in this function.
body.9:
%v1_9 = add i64 %v9_8, 3
%v2_9 = add i64 %v1_9, %v1_9
%v3_9 = add i64 %v2_9, %v1_9
%v4_9 = add i64 %v2_9, %v3_9
%v5_9 = add i64 %v4_9, %v2_9
%v6_9 = add i64 %v5_9, %v4_9
%v7_9 = add i64 %v6_9, %v5_9
%v8_9 = add i64 %v7_9, %v6_9
%v9_9 = add i64 %v8_9, %v7_9
br label %end
; Common exit for every block; the return value is a constant and does not
; depend on any of the chains.
end:
ret i64 99
}
; Test that the pass does not crash on 0:0 branch_weights metadata.
;
; The function loads *%i once, then tests bit 0 and bit 1 of the loaded value
; in two consecutive conditional regions, each calling @foo when its bit is
; set. Both branches carry !prof !17, which is branch_weights 0:0.
define void @test_chr_24(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_24(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[BB0:%.*]], !prof !21
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2:%.*]], !prof !21
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; The assertions above expect the output to keep both conditional branches and
; the same five-block layout as the input below, i.e. no merged hot path and no
; ".nonchr" clone.
; NOTE(review): the expected branches reference output metadata !21 by number;
; no assertion visible in this part of the file pins the contents of !21.
entry:
%0 = load i32, i32* %i
; Bit 0 test: skip the call in %bb0 when the bit is clear.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !17
bb0:
call void @foo()
br label %bb1
bb1:
; Bit 1 test on the same loaded value: skip the call in %bb2 when clear.
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !17
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Module-level metadata shared by the test functions in this file.
;
; !0 attaches a synthetic "ProfileSummary" (!1 .. !13) as a module flag; the
; new-pass-manager run line at the top of the file requires the
; profile-summary analysis before running the function passes.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
; Three detailed-summary entries.
; NOTE(review): presumably (cutoff, min count, number of counts) triples;
; confirm against the ProfileSummary documentation.
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
; !14: function entry count of 100, attached to the test functions through
; "!prof !14" on their define lines.
!14 = !{!"function_entry_count", i64 100}
; Input branch weights referenced by "!prof" on the branches and selects:
;   !15 = 0:1, !16 = 1:1, !17 = 0:0 (the degenerate case used by test_chr_24).
!15 = !{!"branch_weights", i32 0, i32 1}
!16 = !{!"branch_weights", i32 1, i32 1}
!17 = !{!"branch_weights", i32 0, i32 0}
; The FileCheck assertions below match branch_weights nodes in the OUTPUT
; module. Their !N numbers are output metadata IDs and do not correspond to the
; input nodes with the same numbers above (e.g. input !15 is 0:1, while the
; expected output !15 is 1000:0).
; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0}
; CHECK: !16 = !{!"branch_weights", i32 0, i32 1}
; CHECK: !17 = !{!"branch_weights", i32 1, i32 1}
; CHECK: !18 = !{!"branch_weights", i32 1, i32 0}
; CHECK: !19 = !{!"branch_weights", i32 0, i32 1000}