; Source: llvm-project/llvm/test/Transforms/PGOProfile/chr.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -chr -instcombine -simplifycfg -S | FileCheck %s
; RUN: opt < %s -passes='require<profile-summary>,function(chr,instcombine,simplify-cfg)' -S | FileCheck %s
; External functions with no body in this file chunk; the test functions below
; call them as placeholder work inside the conditionally executed blocks.
declare void @foo()
declare void @bar()
; Simple case.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; if ((t0 & 2) != 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; if ((t0 & 2) != 0)
; foo()
; }
; Input: one load of *%i followed by two consecutive single-bit tests (masks 1
; and 2), each guarding a call to @foo. Both branches carry !prof !15, which is
; defined outside this chunk.
define void @test_chr_1(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
entry:
%0 = load i32, i32* %i
; %2 is true when bit 0 of the loaded value is clear.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
; Bit clear -> skip the first call; bit set -> %bb0.
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; Same pattern for bit 1, reusing the value loaded in entry.
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Simple case with a cold block.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; if ((t0 & 2) == 0) // Likely false
; bar()
; if ((t0 & 4) != 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 7) == 7) { // Likely true
; foo()
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; if ((t0 & 2) == 0)
; bar()
; if ((t0 & 4) != 0)
; foo()
; }
; Input: three consecutive single-bit tests (masks 1, 2, 4) on one loaded
; value. The first and third guard calls to @foo; the middle one calls @bar
; when its bit is clear. All three branches carry !prof !15 (defined outside
; this chunk).
define void @test_chr_1_1(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_1_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 7
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 7
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB5:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB2_NONCHR:%.*]], label [[BB3_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[BB3_NONCHR]]
; CHECK: bb3.nonchr:
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[BB5]], label [[BB4_NONCHR:%.*]], !prof !16
; CHECK: bb4.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB5]]
; CHECK: bb5:
; CHECK-NEXT: ret void
;
entry:
%0 = load i32, i32* %i
; Bit 0: clear -> skip the call, set -> %bb0.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
; Bit 1: note the successor order is reversed relative to the other two
; branches, so the @bar block is entered when the bit is clear.
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb2, label %bb3, !prof !15
bb2:
call void @bar()
br label %bb3
bb3:
; Bit 2: clear -> skip the call, set -> %bb4.
%5 = and i32 %0, 4
%6 = icmp eq i32 %5, 0
br i1 %6, label %bb5, label %bb4, !prof !15
bb4:
call void @foo()
br label %bb5
bb5:
ret void
}
; With an aggregate bit check.
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) // Likely true
; if ((t0 & 1) != 0) // Likely true
; foo()
; if ((t0 & 2) != 0) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; foo()
; } else if ((t0 & 255) != 0) {
; if ((t0 & 1) != 0)
; foo()
; if ((t0 & 2) != 0)
; foo()
; }
; Input: an outer test of the low eight bits (mask 255) that guards two
; single-bit tests (masks 1 and 2), each of which guards a call to @foo. All
; three branches carry !prof !15 (defined outside this chunk).
define void @test_chr_2(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB4]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB2_NONCHR:%.*]], label [[BB1_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[BB4]], label [[BB3_NONCHR:%.*]], !prof !16
; CHECK: bb3.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB2_NONCHR]]
; CHECK: bb4:
; CHECK-NEXT: ret void
;
entry:
%0 = load i32, i32* %i
; Outer guard: if none of the low eight bits is set, go straight to the exit.
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb4, label %bb0, !prof !15
bb0:
; Bit 0: clear -> skip the call, set -> %bb1.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb2, label %bb1, !prof !15
bb1:
call void @foo()
br label %bb2
bb2:
; Bit 1: clear -> exit, set -> %bb3.
%5 = and i32 %0, 2
%6 = icmp eq i32 %5, 0
br i1 %6, label %bb4, label %bb3, !prof !15
bb3:
call void @foo()
br label %bb4
bb4:
ret void
}
; Split case.
; Roughly,
; t1 = *i
; if ((t1 & 1) != 0) // Likely true
; foo()
; if ((t1 & 2) != 0) // Likely true
; foo()
; t2 = *i
; if ((t2 & 4) != 0) // Likely true
; foo()
; if ((t2 & 8) != 0) // Likely true
; foo()
; ->
; t1 = *i
; if ((t1 & 3) == 3) { // Likely true
; foo()
; foo()
; } else {
; if ((t1 & 1) != 0)
; foo()
; if ((t1 & 2) != 0)
; foo()
; }
; t2 = *i
; if ((t2 & 12) == 12) { // Likely true
; foo()
; foo()
; } else {
; if ((t2 & 4) != 0)
; foo()
; if ((t2 & 8) != 0)
; foo()
; }
; Input: two groups of two single-bit tests. The first group (masks 1 and 2)
; tests the value loaded in entry; the second group (masks 4 and 8) tests a
; separate load of *%i performed in %bb3. Every branch carries !prof !15
; (defined outside this chunk).
define void @test_chr_3(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[I]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP6]], 12
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 12
; CHECK-NEXT: br i1 [[TMP8]], label [[BB4:%.*]], label [[BB3_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb4:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7:%.*]]
; CHECK: bb3.split.nonchr:
; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP6]], 4
; CHECK-NEXT: [[DOTNOT1:%.*]] = icmp eq i32 [[TMP9]], 0
; CHECK-NEXT: br i1 [[DOTNOT1]], label [[BB5_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof !16
; CHECK: bb4.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB5_NONCHR]]
; CHECK: bb5.nonchr:
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP6]], 8
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: br i1 [[TMP11]], label [[BB7]], label [[BB6_NONCHR:%.*]], !prof !16
; CHECK: bb6.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb7:
; CHECK-NEXT: ret void
;
entry:
; First group: tests on the value loaded here (%0).
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
; Second group: *%i is loaded again after the calls above, and the remaining
; tests use this second value (%5), not %0.
%5 = load i32, i32* %i
%6 = and i32 %5, 4
%7 = icmp eq i32 %6, 0
br i1 %7, label %bb5, label %bb4, !prof !15
bb4:
call void @foo()
br label %bb5
bb5:
%8 = and i32 %5, 8
%9 = icmp eq i32 %8, 0
br i1 %9, label %bb7, label %bb6, !prof !15
bb6:
call void @foo()
br label %bb7
bb7:
ret void
}
; Selects.
; Roughly,
; t0 = *i
; sum1 = ((t0 & 1) == 0) ? sum0 : (sum0 + 42) // Likely false
; sum2 = ((t0 & 2) == 0) ? sum1 : (sum1 + 43) // Likely false
; return sum2
; ->
; t0 = *i
; if ((t0 & 3) == 3)
; return sum0 + 85
; else {
; sum1 = ((t0 & 1) == 0) ? sum0 : (sum0 + 42)
; sum2 = ((t0 & 2) == 0) ? sum1 : (sum1 + 43)
; return sum2
; }
; Input: a single basic block with two chained selects, each keyed on one bit
; (masks 1 and 2) of the loaded value. There are no branches; both selects
; carry !prof !15 (defined outside this chunk).
define i32 @test_chr_4(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: entry.split:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
; CHECK-NEXT: ret i32 [[TMP3]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 42
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[DOTNOT]], i32 [[SUM0]], i32 [[TMP4]], !prof !16
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SUM1_NONCHR]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP7]], i32 [[SUM1_NONCHR]], i32 [[TMP8]], !prof !16
; CHECK-NEXT: ret i32 [[SUM2_NONCHR]]
;
entry:
%0 = load i32, i32* %i
; %sum1 = %sum0 when bit 0 is clear, otherwise %sum0 + 42.
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
%3 = add i32 %sum0, 42
%sum1 = select i1 %2, i32 %sum0, i32 %3, !prof !15
; %sum2 = %sum1 when bit 1 is clear, otherwise %sum1 + 43.
%4 = and i32 %0, 2
%5 = icmp eq i32 %4, 0
%6 = add i32 %sum1, 43
%sum2 = select i1 %5, i32 %sum1, i32 %6, !prof !15
ret i32 %sum2
}
; Selects + Brs
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) { // Likely true
; sum = ((t0 & 1) == 0) ? sum0 : (sum0 + 42) // Likely false
; sum = ((t0 & 2) == 0) ? sum : (sum + 43) // Likely false
; if ((t0 & 4) != 0) { // Likely true
; sum3 = sum + 44
; sum = ((t0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
; }
; }
; return sum
; ->
; t0 = *i
; if ((t0 & 15) == 15) { // Likely true
; sum = sum0 + 173
; } else if ((t0 & 255) != 0) {
; sum = ((t0 & 1) == 0) ? sum0 : (sum0 + 42)
; sum = ((t0 & 2) == 0) ? sum : (sum + 43)
; if ((t0 & 4) != 0) {
; sum3 = sum + 44
; sum = ((t0 & 8) == 0) ? sum3 : (sum3 + 44)
; }
; }
; return sum
; Input: a mix of branches and selects, all keyed on bits of the single value
; loaded in entry. An outer mask-255 branch guards two selects (masks 1, 2), an
; inner branch (mask 4), and one more select (mask 8). Every branch and select
; carries !prof !15 (defined outside this chunk).
define i32 @test_chr_5(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 15
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 15
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 173
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SUM0]], 42
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP7]], i32 [[SUM0]], i32 [[TMP8]], !prof !16
; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[SUM1_NONCHR]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP10]], i32 [[SUM1_NONCHR]], i32 [[TMP11]], !prof !16
; CHECK-NEXT: [[TMP12:%.*]] = and i32 [[TMP0]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP15]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP13]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
entry:
%0 = load i32, i32* %i
; Outer guard: with none of the low eight bits set, return %sum0 unchanged.
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
; Bit 0 set adds 42; bit 1 set adds 43.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = and i32 %0, 2
%7 = icmp eq i32 %6, 0
%8 = add i32 %sum1, 43
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
; Inner branch on bit 2: clear -> skip %bb1.
%9 = and i32 %0, 4
%10 = icmp eq i32 %9, 0
br i1 %10, label %bb2, label %bb1, !prof !15
bb1:
; Unconditionally adds 44, then another 44 when bit 3 is set.
%sum3 = add i32 %sum2, 44
%11 = and i32 %0, 8
%12 = icmp eq i32 %11, 0
%13 = add i32 %sum3, 44
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects + Brs with a scope split in the middle
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) { // Likely true
; sum = ((t0 & 1) == 0) ? sum0 : (sum0 + 42) // Likely false
; sum = ((t0 & 2) == 0) ? sum : (sum + 43) // Likely false
; if ((sum0 & 4) != 0) { // Likely true. The condition doesn't use t0.
; sum3 = sum + 44
; sum = ((t0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
; }
; }
; return sum
; ->
; t0 = *i
; if ((sum0 & 4) != 0 & (t0 & 11) == 11) { // Likely true
; sum = sum0 + 173
; } else if ((t0 & 255) != 0) {
; sum = ((t0 & 1) == 0) ? sum0 : (sum0 + 42)
; sum = ((t0 & 2) == 0) ? sum : (sum + 43)
; if ((sum0 & 4) != 0) {
; sum3 = sum + 44
; sum = ((t0 & 8) == 0) ? sum3 : (sum3 + 44)
; }
; }
; return sum
; Input: same shape as a branch/select chain on the loaded value, except that
; the inner branch tests bit 2 of the %sum0 argument instead of the loaded
; value. Every branch and select carries !prof !15 (defined outside this
; chunk).
define i32 @test_chr_5_1(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_5_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[SUM0:%.*]], 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 11
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 11
; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP5]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SUM0]], 85
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SUM0]], 173
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP8]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[SUM0]], 42
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP10]], i32 [[SUM0]], i32 [[TMP11]], !prof !16
; CHECK-NEXT: [[TMP12:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SUM1_NONCHR]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP13]], i32 [[SUM1_NONCHR]], i32 [[TMP14]], !prof !16
; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[SUM0]], 4
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP17]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP18]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP16]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP7]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
entry:
%0 = load i32, i32* %i
; Outer guard: with none of the low eight bits set, return %sum0 unchanged.
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
; Bit 0 of the loaded value set adds 42; bit 1 set adds 43.
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = and i32 %0, 2
%7 = icmp eq i32 %6, 0
%8 = add i32 %sum1, 43
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
; This condition is computed from %sum0, not from the loaded value %0.
%9 = and i32 %sum0, 4 ; Split
%10 = icmp eq i32 %9, 0
br i1 %10, label %bb2, label %bb1, !prof !15
bb1:
; Unconditionally adds 44, then another 44 when bit 3 of %0 is set.
%sum3 = add i32 %sum2, 44
%11 = and i32 %0, 8
%12 = icmp eq i32 %11, 0
%13 = add i32 %sum3, 44
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects + Brs, non-matching bases
; Roughly,
; i0 = *i
; j0 = *j
; if ((i0 & 255) != 0) { // Likely true
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; if ((j0 & 4) != 0) { // Likely true. The condition uses j0, not i0.
; sum3 = sum + 44
; sum = ((i0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
; }
; }
; return sum
; ->
; i0 = *i
; j0 = *j
; if ((j0 & 4) != 0 & (i0 & 10) == 10) { // Likely true
; sum = sum0 + 131
; } else if ((i0 & 255) != 0) {
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43)
; if ((j0 & 4) != 0) {
; sum3 = sum + 44
; sum = ((i0 & 8) == 0) ? sum3 : (sum3 + 44)
; }
; }
; return sum
; Input: two values are loaded up front, %i0 from *%i and %j0 from *%j. An
; outer mask-255 branch and two selects (masks 2 and 8) test %i0, while the
; inner branch tests bit 2 of %j0. Every branch and select carries !prof !15
; (defined outside this chunk).
define i32 @test_chr_6(i32* %i, i32* %j, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_6(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp ne i32 [[V9]], 0
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I0]], 10
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 10
; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[V10]]
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[V13:%.*]] = add i32 [[SUM0]], 131
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[V1:%.*]] = and i32 [[I0]], 255
; CHECK-NEXT: [[V2_NOT:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT: br i1 [[V2_NOT]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[V3_NONCHR:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4_NONCHR:%.*]] = icmp eq i32 [[V3_NONCHR]], 0
; CHECK-NEXT: [[V8_NONCHR:%.*]] = add i32 [[SUM0]], 43
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4_NONCHR]], i32 [[SUM0]], i32 [[V8_NONCHR]], !prof !16
; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0
; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[I0]], 8
; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[V12_NONCHR]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[V10_NONCHR]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[V13]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
entry:
%i0 = load i32, i32* %i
%j0 = load i32, i32* %j
; Outer guard on %i0: with none of the low eight bits set, return %sum0.
%v1 = and i32 %i0, 255
%v2 = icmp eq i32 %v1, 0
br i1 %v2, label %bb3, label %bb0, !prof !15
bb0:
; Bit 1 of %i0 set adds 43.
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
; Inner branch tests %j0 (the second load), not %i0.
%v9 = and i32 %j0, 4
%v10 = icmp eq i32 %v9, 0
br i1 %v10, label %bb2, label %bb1, !prof !15
bb1:
; Unconditionally adds 44, then another 44 when bit 3 of %i0 is set.
%sum3 = add i32 %sum2, 44
%v11 = and i32 %i0, 8
%v12 = icmp eq i32 %v11, 0
%v13 = add i32 %sum3, 44
%sum4 = select i1 %v12, i32 %sum3, i32 %v13, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects + Brs, the branch condition can't be hoisted to be merged with a
; select. No CHR happens.
; Roughly,
; i0 = *i
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; foo();
; j0 = *j
; if ((j0 & 4) != 0) { // Likely true
; foo();
; sum = sum + 44
; }
; return sum
; ->
; (no change)
define i32 @test_chr_7(i32* %i, i32* %j, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_7(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB2:%.*]], label [[BB1:%.*]], !prof !16
; CHECK: bb1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[SUM4:%.*]] = add i32 [[SUM2]], 44
; CHECK-NEXT: br label [[BB2]]
; CHECK: bb2:
; CHECK-NEXT: [[SUM5:%.*]] = phi i32 [ [[SUM2]], [[ENTRY:%.*]] ], [ [[SUM4]], [[BB1]] ]
; CHECK-NEXT: ret i32 [[SUM5]]
;
; Input IR: a single select on %i0, then a call and a load of %j, then a single
; branch on %j0. The expected output above has the same shape as this input.
entry:
%i0 = load i32, i32* %i
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
call void @foo()
; %j0 is only available after the select and the call above.
%j0 = load i32, i32* %j
%v9 = and i32 %j0, 4
%v10 = icmp eq i32 %v9, 0
br i1 %v10, label %bb2, label %bb1, !prof !15 ; %v10 depends on %j0 (loaded after the select), so it can't be hoisted above the select
bb1:
; Reached when (%j0 & 4) != 0.
call void @foo()
%sum4 = add i32 %sum2, 44
br label %bb2
bb2:
; Merges the value from the fall-through edge with the one updated in bb1.
%sum5 = phi i32 [ %sum2, %entry ], [ %sum4, %bb1 ]
ret i32 %sum5
}
; Selects + Brs, the branch condition can't be hoisted to be merged with the
; selects. Dropping the select.
; Roughly,
; i0 = *i
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; foo();
; j0 = *j
; if ((j0 & 4) != 0) // Likely true
; foo()
; if ((j0 & 8) != 0) // Likely true
; foo()
; return sum
; ->
; i0 = *i
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; foo();
; j0 = *j
; if ((j0 & 12) == 12) { // Likely true
; foo()
; foo()
; } else {
; if ((j0 & 4) != 0)
; foo()
; if ((j0 & 8) != 0)
; foo()
; }
; return sum
define i32 @test_chr_7_1(i32* %i, i32* %j, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_7_1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[J0]], 12
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 12
; CHECK-NEXT: br i1 [[TMP1]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
; CHECK-NEXT: [[V10_NOT:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10_NOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[J0]], 8
; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0
; CHECK-NEXT: br i1 [[V12_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: ret i32 [[SUM2]]
;
; Input IR: one select on %i0, then two branches that both test bits of %j0.
; In the expected output above only the two branches are merged (mask 12); the
; select is left as a select and ends up in bb3 next to its only user.
entry:
%i0 = load i32, i32* %i
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
call void @foo()
; %j0 is only available after the select and the call above.
%j0 = load i32, i32* %j
%v9 = and i32 %j0, 4
%v10 = icmp eq i32 %v9, 0
br i1 %v10, label %bb1, label %bb0, !prof !15 ; %v10 depends on %j0 (loaded after the select), so it can't be hoisted above the select
bb0:
; Reached when (%j0 & 4) != 0.
call void @foo()
br label %bb1
bb1:
%v11 = and i32 %j0, 8
%v12 = icmp eq i32 %v11, 0
br i1 %v12, label %bb3, label %bb2, !prof !15
bb2:
; Reached when (%j0 & 8) != 0.
call void @foo()
br label %bb3
bb3:
; The select result is the only value that survives to the return.
ret i32 %sum2
}
; Branches aren't biased enough. No CHR happens.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Not biased
; foo()
; if ((t0 & 2) != 0) // Not biased
; foo()
; ->
; (no change)
define void @test_chr_8(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[BB0:%.*]], !prof !17
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2:%.*]], !prof !17
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; Input IR: two branches on bits of the same loaded value. Both use !prof !16
; rather than the !15 used by the biased tests in this file.
; NOTE(review): !16 is presumably the "not biased" weight set; its definition is
; at the end of the file -- confirm there.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !16
bb0:
; Reached when (%0 & 1) != 0.
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !16
bb2:
; Reached when (%0 & 2) != 0.
call void @foo()
br label %bb3
bb3:
ret void
}
; With an existing phi at the exit.
; Roughly,
; t = *i
; if ((t & 1) != 0) // Likely true
; foo()
; if ((t & 2) != 0) { // Likely true
; t = *j
; foo()
; }
; // There's a phi for t here.
; return t
; ->
; t = *i
; if ((t & 3) == 3) { // Likely true
; foo()
; t = *j
; foo()
; } else {
; if ((t & 1) != 0)
; foo()
; if ((t & 2) != 0) {
; t = *j
; foo()
; }
; }
; // There's a phi for t here.
; return t
define i32 @test_chr_9(i32* %i, i32* %j) !prof !14 {
; CHECK-LABEL: @test_chr_9(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[J]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP3]], [[BB0]] ], [ [[TMP0]], [[BB1_NONCHR]] ], [ [[TMP7]], [[BB2_NONCHR]] ]
; CHECK-NEXT: ret i32 [[TMP8]]
;
; Input IR: two branches on bits of %0, with a phi already present in the exit
; block bb3. The expected output above extends that phi with one more incoming
; value for the merged hot path.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
; Reached when (%0 & 1) != 0.
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
; Reached when (%0 & 2) != 0; %5 is only defined on this path.
%5 = load i32, i32* %j
call void @foo()
br label %bb3
bb3:
; Existing exit phi: %0 when bb2 was skipped, %5 otherwise.
%6 = phi i32 [ %0, %bb1 ], [ %5, %bb2 ]
ret i32 %6
}
; With no phi at the exit, but the exit needs a phi inserted after CHR.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; t1 = *j
; if ((t0 & 2) != 0) // Likely true
; foo()
; return (t1 * 42) + (t1 - 99)
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; t1 = *j
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; t1 = *j
; if ((t0 & 2) != 0)
; foo()
; }
; // A new phi for t1 is inserted here.
; return (t1 * 42) + (t1 - 99)
define i32 @test_chr_10(i32* %i, i32* %j) !prof !14 {
; CHECK-LABEL: @test_chr_10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[J]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP3]], [[BB0]] ], [ [[TMP5]], [[BB2_NONCHR]] ], [ [[TMP5]], [[BB1_NONCHR]] ]
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 42
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], -99
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP9]], [[TMP10]]
; CHECK-NEXT: ret i32 [[TMP11]]
;
; Input IR: the exit block bb3 has no phi, but it uses %3, which is defined
; inside the region (in bb1). The expected output above duplicates that load on
; the hot and cold paths and joins the copies with a new phi in bb3.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
; Reached when (%0 & 1) != 0.
call void @foo()
br label %bb1
bb1:
; %3 is defined here and used after the region, in bb3.
%3 = load i32, i32* %j
; The second branch still tests %0, not the freshly loaded %3.
%4 = and i32 %0, 2
%5 = icmp eq i32 %4, 0
br i1 %5, label %bb3, label %bb2, !prof !15
bb2:
; Reached when (%0 & 2) != 0.
call void @foo()
br label %bb3
bb3:
; Returns (%3 * 42) + (%3 - 99).
%6 = mul i32 %3, 42
%7 = sub i32 %3, 99
%8 = add i32 %6, %7
ret i32 %8
}
; Test a case where there are two use-def chain paths to the same value (t0)
; from the branch condition. This is a regression test for an old bug that
; caused a bad hoisting that moved (hoisted) a value (%conv) twice to the end of
; the %entry block (once for %div and once for %mul16) and put a use ahead of
; its definition like:
; %entry:
; ...
; %div = fdiv double 1.000000e+00, %conv
; %conv = sitofp i32 %0 to double
; %mul16 = fmul double %div, %conv
;
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; // there are two use-def paths from the branch condition to t0.
; if ((1.0 / t0) * t0 >= 1) // Likely true
; foo()
; ->
; t0 = *i
; if ((t0 & 1) != 0 & (1.0 / t0) * t0 > 0) { // Likely true
; foo()
; foo()
; } else {
; if ((t0 & 1) != 0)
; foo()
; if ((1.0 / t0) * t0 >= 1) // Likely true
; foo()
; }
define void @test_chr_11(i32* %i, i32 %x) !prof !14 {
; CHECK-LABEL: @test_chr_11(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; CHECK-NEXT: [[DIV:%.*]] = fdiv double 1.000000e+00, [[CONV]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul double [[DIV]], [[CONV]]
; CHECK-NEXT: [[CONV717:%.*]] = fptosi double [[MUL16]] to i32
; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i32 [[CONV717]], 0
; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[CMP18]]
; CHECK-NEXT: br i1 [[TMP3]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0_NONCHR:%.*]], label [[BB1_NONCHR:%.*]], !prof !18
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[CONV_NONCHR:%.*]] = sitofp i32 [[TMP0]] to double
; CHECK-NEXT: [[DIV_NONCHR:%.*]] = fdiv double 1.000000e+00, [[CONV_NONCHR]]
; CHECK-NEXT: [[MUL16_NONCHR:%.*]] = fmul double [[DIV_NONCHR]], [[CONV_NONCHR]]
; CHECK-NEXT: [[CONV717_NONCHR:%.*]] = fptosi double [[MUL16_NONCHR]] to i32
; CHECK-NEXT: [[CMP18_NONCHR:%.*]] = icmp slt i32 [[CONV717_NONCHR]], 1
; CHECK-NEXT: br i1 [[CMP18_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; Input IR: the second branch condition reaches %0 along two use-def paths,
; because %conv feeds both %div and %mul16. The expected output above hoists the
; whole chain into the entry block with %conv appearing once, before its users.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
; Reached when (%0 & 1) != 0.
call void @foo()
br label %bb1
bb1:
; %conv is an operand of both %div and %mul16 below.
%conv = sitofp i32 %0 to double
%div = fdiv double 1.000000e+00, %conv
%mul16 = fmul double %div, %conv
%conv717 = fptosi double %mul16 to i32
%cmp18 = icmp slt i32 %conv717, 1
; bb2 (the call) is taken when %cmp18 is false, i.e. %conv717 >= 1.
br i1 %cmp18, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Selects + unrelated br only
define i32 @test_chr_12(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_12(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[BB3:%.*]], label [[BB0:%.*]], !prof !16
; CHECK: bb0:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SUM0:%.*]], 42
; CHECK-NEXT: [[SUM1:%.*]] = select i1 [[TMP4]], i32 [[SUM0]], i32 [[TMP5]], !prof !16
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SUM1]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[TMP7]], i32 [[SUM1]], i32 [[TMP8]], !prof !16
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP10]], [[TMP12]]
; CHECK-NEXT: br i1 [[TMP13]], label [[BB1:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1:
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SUM2]], 88
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb0.split.nonchr:
; CHECK-NEXT: br i1 [[TMP10]], label [[BB1_NONCHR:%.*]], label [[BB3]], !prof !18
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88, !prof !16
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[SUM0]], [[ENTRY:%.*]] ], [ [[TMP14]], [[BB1]] ], [ [[SUM2]], [[BB0_SPLIT_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
; Input IR: three selects keyed on bits of %0, plus an inner branch keyed on a
; second, separate load of %i (%9). In the expected output above the first two
; selects stay in bb0 and only the inner branch and the last select are merged.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
%3 = and i32 %0, 1
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = and i32 %0, 2
%7 = icmp eq i32 %6, 0
%8 = add i32 %sum1, 43
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
; Second load from %i; the branch below tests this value rather than %0.
%9 = load i32, i32* %i
%10 = icmp eq i32 %9, 0
br i1 %10, label %bb2, label %bb1, !prof !15
bb1:
; Reached when %9 != 0.
%sum3 = add i32 %sum2, 44
%11 = and i32 %0, 8
%12 = icmp eq i32 %11, 0
%13 = add i32 %sum3, 44
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; In the second CHR, a condition value depends on a trivial phi that's inserted
; by the first CHR.
; Roughly,
; i0 = *i
; v2 = (z != 1) ? pred : true // Likely false
; if (z == 0 & pred) // Likely false
; foo()
; j0 = *j
; sum2 = ((i0 & 2) == j0) ? sum0 : (sum0 + 43) // Likely false
; sum3 = (i0 == j0) ? sum0 : (sum0 + 43) // Likely false
; foo()
; if ((i0 & 4) == 0) // Unbiased
; foo()
; return i0 + sum3
; ->
; i0 = *i
; if (z != 1 & (z == 0 & pred)) // First CHR
; foo()
; // A trivial phi for i0 is inserted here by the first CHR (which gets removed
; // later) and the subsequent branch condition (for the second CHR) uses it.
; j0 = *j
; if ((i0 & 2) != j0 & i0 != j0) { // Second CHR
; sum3 = sum0 + 43
; foo()
; if ((i0 & 4) == 0)
; foo()
; } else {
; sum3 = (i0 == j0) ? sum0 : (sum0 + 43)
; foo()
; if ((i0 & 4) == 0)
; foo()
; }
; return i0 + sum3
define i32 @test_chr_14(i32* %i, i32* %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
; CHECK-LABEL: @test_chr_14(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[V1:%.*]] = icmp ne i32 [[Z:%.*]], 1
; CHECK-NEXT: [[V0:%.*]] = icmp eq i32 [[Z]], 0
; CHECK-NEXT: [[V3_NONCHR:%.*]] = and i1 [[V0]], [[PRED:%.*]]
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[V1]], [[V3_NONCHR]]
; CHECK-NEXT: br i1 [[OR_COND]], label [[BB0_NONCHR:%.*]], label [[BB1:%.*]], !prof !19
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp ne i32 [[V6]], [[J0]]
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[V5:%.*]] = icmp ne i32 [[I0]], [[J0]]
; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[V4]], [[V5]]
; CHECK-NEXT: br i1 [[TMP0]], label [[BB1_SPLIT:%.*]], label [[BB1_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1.split:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB3:%.*]], label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb1.split.nonchr:
; CHECK-NEXT: [[V5_NONCHR:%.*]] = icmp eq i32 [[I0]], [[J0]]
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = select i1 [[V5_NONCHR]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0
; CHECK-NEXT: br i1 [[V10_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]]
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[V8]], [[BB2]] ], [ [[V8]], [[BB1_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB2_NONCHR]] ], [ [[SUM3_NONCHR]], [[BB1_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[TMP1]]
; CHECK-NEXT: ret i32 [[V11]]
;
; Input IR: a select plus a branch in the entry block, then two selects in bb1
; whose conditions use %i0, followed by a branch without !prof metadata.
entry:
%i0 = load i32, i32* %i
%v0 = icmp eq i32 %z, 0
%v1 = icmp ne i32 %z, 1
; %v2 has no users in this function; it only contributes a profiled select.
%v2 = select i1 %v1, i1 %pred, i1 true, !prof !15
%v3 = and i1 %v0, %pred
br i1 %v3, label %bb0, label %bb1, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%j0 = load i32, i32* %j
%v6 = and i32 %i0, 2
%v4 = icmp eq i32 %v6, %j0
%v8 = add i32 %sum0, 43
; %sum2 has no users either; only %sum3 reaches the return value.
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
%v5 = icmp eq i32 %i0, %j0
%sum3 = select i1 %v5, i32 %sum0, i32 %v8, !prof !15
call void @foo()
%v9 = and i32 %i0, 4
%v10 = icmp eq i32 %v9, 0
; No !prof on this branch, so it carries no bias information.
br i1 %v10, label %bb3, label %bb2
bb2:
call void @foo()
br label %bb3
bb3:
%v11 = add i32 %i0, %sum3
ret i32 %v11
}
; Branch or selects depends on another select. No CHR happens.
; Roughly,
; i0 = *i
; if (z == 0 & ((z != 1) ? pred : true)) // Likely false
; foo()
; j0 = *j
; sum2 = ((i0 & 2) == j0) ? sum0 : (sum0 + 43) // Likely false
; sum3 = (i0 == sum2) ? sum2 : (sum0 + 43) // Likely false. This depends on the
; // previous select.
; foo()
; if ((i0 & 4) == 0) // Unbiased
; foo()
; return i0 + sum3
; ->
; (no change)
define i32 @test_chr_15(i32* %i, i32* %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
; CHECK-LABEL: @test_chr_15(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[V0:%.*]] = icmp eq i32 [[Z:%.*]], 0
; CHECK-NEXT: [[V3:%.*]] = and i1 [[V0]], [[PRED:%.*]]
; CHECK-NEXT: br i1 [[V3]], label [[BB0:%.*]], label [[BB1:%.*]], !prof !16
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]]
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: br i1 [[V10]], label [[BB3:%.*]], label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[V5:%.*]] = icmp eq i32 [[I0]], [[SUM2]]
; CHECK-NEXT: [[SUM3:%.*]] = select i1 [[V5]], i32 [[SUM2]], i32 [[V8]], !prof !16
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[SUM3]]
; CHECK-NEXT: ret i32 [[V11]]
;
; Input IR: same layout as @test_chr_14, except that the entry branch condition
; uses the result of a select (%v2) and the second select in bb1 uses the result
; of the first (%sum2). The expected output above keeps every select and branch.
entry:
%i0 = load i32, i32* %i
%v0 = icmp eq i32 %z, 0
%v1 = icmp ne i32 %z, 1
%v2 = select i1 %v1, i1 %pred, i1 true, !prof !15
; The branch condition depends on the select %v2.
%v3 = and i1 %v0, %v2
br i1 %v3, label %bb0, label %bb1, !prof !15
bb0:
call void @foo()
br label %bb1
bb1:
%j0 = load i32, i32* %j
%v6 = and i32 %i0, 2
%v4 = icmp eq i32 %v6, %j0
%v8 = add i32 %sum0, 43
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
; Both the condition and the true operand of %sum3 depend on the select %sum2.
%v5 = icmp eq i32 %i0, %sum2
%sum3 = select i1 %v5, i32 %sum2, i32 %v8, !prof !15
call void @foo()
%v9 = and i32 %i0, 4
%v10 = icmp eq i32 %v9, 0
; No !prof on this branch, so it carries no bias information.
br i1 %v10, label %bb3, label %bb2
bb2:
call void @foo()
br label %bb3
bb3:
%v11 = add i32 %i0, %sum3
ret i32 %v11
}
; With an existing phi at the exit but a value (%v40) is both alive and is an
; operand to a phi at the exit block.
; Roughly,
; t0 = *i
; if ((t0 & 1) != 0) // Likely true
; foo()
; v40 = t0 + 44
; if ((t0 & 2) != 0) { // Likely true
; v41 = t0 + 99
; foo()
; }
; v42 = phi v40, v41
; return v42 + v40
; ->
; t0 = *i
; if ((t0 & 3) == 3) { // Likely true
; foo()
; v40 = t0 + 44
; v41 = t0 + 99
; foo()
; } else {
; if ((t0 & 1) != 0) // Likely true
; foo()
; v40_nc = t0 + 44
; if ((t0 & 2) != 0) { // Likely true
; v41_nc = t0 + 99
; foo()
; }
; }
; t7 = phi v40, v40_nc
; v42 = phi v41, v41_nc, v40_nc
; v43 = v42 + t7
; return v43
define i32 @test_chr_16(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V40:%.*]] = add i32 [[TMP0]], 44
; CHECK-NEXT: [[V41:%.*]] = add i32 [[TMP0]], 99
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1_NONCHR]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[V40_NONCHR:%.*]] = add i32 [[TMP0]], 44
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb2.nonchr:
; CHECK-NEXT: [[V41_NONCHR:%.*]] = add i32 [[TMP0]], 99
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[V40]], [[BB0]] ], [ [[V40_NONCHR]], [[BB2_NONCHR]] ], [ [[V40_NONCHR]], [[BB1_NONCHR]] ]
; CHECK-NEXT: [[V42:%.*]] = phi i32 [ [[V41]], [[BB0]] ], [ [[V41_NONCHR]], [[BB2_NONCHR]] ], [ [[V40_NONCHR]], [[BB1_NONCHR]] ]
; CHECK-NEXT: [[V43:%.*]] = add i32 [[V42]], [[TMP6]]
; CHECK-NEXT: ret i32 [[V43]]
;
; Input IR: %v40 is defined inside the region and is used in the exit block bb3
; in two ways: as an incoming value of the existing phi %v42 and directly as an
; operand of %v43. The expected output above needs a new phi for the direct use
; in addition to the extended %v42.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb0, !prof !15
bb0:
; Reached when (%0 & 1) != 0.
call void @foo()
br label %bb1
bb1:
%v40 = add i32 %0, 44
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
br i1 %4, label %bb3, label %bb2, !prof !15
bb2:
; Reached when (%0 & 2) != 0.
%v41 = add i32 %0, 99
call void @foo()
br label %bb3
bb3:
; %v40 appears both as a phi operand and as a plain operand below.
%v42 = phi i32 [ %v41, %bb2 ], [ %v40, %bb1 ]
%v43 = add i32 %v42, %v40
ret i32 %v43
}
; Two consecutive regions have an entry in the middle of them. No CHR happens.
; Roughly,
; if ((i & 4) != 0) {
; if (!j)
; goto bb1
; } else {
; t0 = (i & 1)
; if (t0 != 0) // Likely true
; foo()
; s = (i & 1) + i
; }
; bb1:
; p = phi i, t0, s
; if ((i & 2) != 0) { // Likely true
; foo()
; q = p + 2
; }
; r = phi p, q, i
; return r
; ->
; (no change)
define i32 @test_chr_17(i32 %i, i1 %j) !prof !14 {
; CHECK-LABEL: @test_chr_17(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[V0:%.*]] = and i32 [[I:%.*]], 4
; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0
; CHECK-NEXT: br i1 [[V1]], label [[BBE:%.*]], label [[BBQ:%.*]]
; CHECK: bbq:
; CHECK-NEXT: br i1 [[J:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bbe:
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I]], 1
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1]], label [[BB0:%.*]], !prof !16
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[S:%.*]] = add i32 [[TMP0]], [[I]]
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[I]], [[BBQ]] ], [ [[TMP0]], [[BBE]] ], [ [[S]], [[BB0]] ]
; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[I]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[BB3]], label [[BB2:%.*]], !prof !16
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[Q:%.*]] = add i32 [[P]], [[TMP2]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[P]], [[BB1]] ], [ [[Q]], [[BB2]] ], [ [[I]], [[BBQ]] ]
; CHECK-NEXT: ret i32 [[R]]
;
; Input IR: two profiled branches (in bbe and in bb1) that would otherwise be
; consecutive, but bb1 is also entered from bbq, which bypasses bbe. The
; expected output above keeps the original control flow.
entry:
%v0 = and i32 %i, 4
%v1 = icmp eq i32 %v0, 0
; No !prof here: bbe is taken when (%i & 4) == 0, bbq otherwise.
br i1 %v1, label %bbe, label %bbq
bbq:
; Side entry: jumps either past both regions (bb3) or into the middle (bb1).
br i1 %j, label %bb3, label %bb1
bbe:
%0 = and i32 %i, 1
%1 = icmp eq i32 %0, 0
br i1 %1, label %bb1, label %bb0, !prof !15
bb0:
; Reached when (%i & 1) != 0.
call void @foo()
%s = add i32 %0, %i
br label %bb1
bb1:
; Three predecessors: the side entry bbq plus both exits of the first region.
%p = phi i32 [ %i, %bbq ], [ %0, %bbe ], [ %s, %bb0 ]
%2 = and i32 %i, 2
%3 = icmp eq i32 %2, 0
br i1 %3, label %bb3, label %bb2, !prof !15
bb2:
; Reached when (%i & 2) != 0.
call void @foo()
%q = add i32 %p, %2
br label %bb3
bb3:
%r = phi i32 [ %p, %bb1 ], [ %q, %bb2 ], [ %i, %bbq ]
ret i32 %r
}
; Select + br, there's a loop and we need to update the user of an inserted phi
; at the entry block. This is a regression test for a bug that's fixed.
; Roughly,
; do {
; inc1 = phi inc2, 0
; li = *i
; sum1 = sum0 + 42
; sum2 = ((li & 1) == 0) ? sum0 : sum1 // Likely false
; inc2 = inc1 + 1
; if ((li & 4) != 0) // Likely true
; sum3 = sum2 + 44
; sum4 = phi sum1, sum3
; } while (inc2 != 100) // Likely true (loop back)
; return sum4
; ->
; do {
; inc1 = phi tmp2, 0 // The first operand needed to be updated
; li = *i
; sum1 = sum0 + 42
; if ((li & 5) == 5) { // Likely true
; inc2 = inc1 + 1
; sum3 = sum0 + 86
; } else {
; inc2_nc = inc1 + 1
; if ((li & 4) != 0) {
; sum2_nc = ((li & 1) == 0) ? sum0 : sum1
; sum3_nc = sum2_nc + 44
; }
; }
; tmp2 = phi inc2, inc2_nc
; sum4 = phi sum3, sum3_nc, sum1
; } while (tmp2 != 100)
; return sum4
define i32 @test_chr_18(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_18(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[BB0:%.*]]
; CHECK: bb0:
; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ [[TMP2:%.*]], [[BB2:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[LI:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[SUM1:%.*]] = add i32 [[SUM0:%.*]], 42
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[LI]], 5
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 5
; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0.split:
; CHECK-NEXT: [[INC2:%.*]] = add i32 [[INC1]], 1
; CHECK-NEXT: [[SUM3:%.*]] = add i32 [[SUM0]], 86
; CHECK-NEXT: br label [[BB2]]
; CHECK: bb0.split.nonchr:
; CHECK-NEXT: [[A4_NONCHR:%.*]] = and i32 [[LI]], 4
; CHECK-NEXT: [[CMP4_NONCHR:%.*]] = icmp eq i32 [[A4_NONCHR]], 0
; CHECK-NEXT: [[INC2_NONCHR:%.*]] = add i32 [[INC1]], 1
; CHECK-NEXT: br i1 [[CMP4_NONCHR]], label [[BB2]], label [[BB1_NONCHR:%.*]], !prof !16
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[A1:%.*]] = and i32 [[LI]], 1
; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[A1]], 0
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[CMP1_NOT]], i32 [[SUM0]], i32 [[SUM1]], !prof !16
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], 44
; CHECK-NEXT: br label [[BB2]]
; CHECK: bb2:
; CHECK-NEXT: [[TMP2]] = phi i32 [ [[INC2]], [[BB0_SPLIT]] ], [ [[INC2_NONCHR]], [[BB1_NONCHR]] ], [ [[INC2_NONCHR]], [[BB0_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[SUM4:%.*]] = phi i32 [ [[SUM3]], [[BB0_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB1_NONCHR]] ], [ [[SUM1]], [[BB0_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 100
; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB0]], !prof !16
; CHECK: bb3:
; CHECK-NEXT: ret i32 [[SUM4]]
;
; Input IR: a loop (bb0 -> bb2 -> bb0) whose header bb0 holds a select and a
; branch on bits of the same load. %inc2 is defined in bb0 and used both by the
; loop-exit compare in bb2 and, around the back edge, by the phi %inc1 at the
; top of bb0. In the expected output above both of those uses refer to a new phi
; placed in bb2.
entry:
br label %bb0
bb0:
; Loop-carried counter; its back-edge operand %inc2 is defined further down.
%inc1 = phi i32 [ %inc2, %bb2 ], [ 0, %entry ]
%li = load i32, i32* %i
%a1 = and i32 %li, 1
%cmp1 = icmp eq i32 %a1, 0
%sum1 = add i32 %sum0, 42
%sum2 = select i1 %cmp1, i32 %sum0, i32 %sum1, !prof !15
%a4 = and i32 %li, 4
%cmp4 = icmp eq i32 %a4, 0
%inc2 = add i32 %inc1, 1
br i1 %cmp4, label %bb2, label %bb1, !prof !15
bb1:
; Reached when (%li & 4) != 0.
%sum3 = add i32 %sum2, 44
br label %bb2
bb2:
%sum4 = phi i32 [ %sum1, %bb0 ], [ %sum3, %bb1 ]
; Leaves the loop once the counter reaches 100.
%cmp = icmp eq i32 %inc2, 100
br i1 %cmp, label %bb3, label %bb0, !prof !15
bb3:
ret i32 %sum4
}
; Selects + Brs. Those share the condition value, which causes the
; targets/operands of the branch/select to be flipped.
; Roughly,
; t0 = *i
; if ((t0 & 255) != 0) { // Likely true
; sum1 = ((t0 & 1) == 0) ? sum0 : (sum0 + 42) // Likely false
; sum2 = ((t0 & 1) == 0) ? sum1 : (sum1 + 43) // Likely false
; if ((t0 & 1) != 0) { // Likely true
; sum3 = sum2 + 44
; sum4 = ((t0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
; }
; sum5 = phi sum2, sum4
; }
; sum6 = phi sum0, sum5
; return sum6
; ->
; t0 = *i
; if ((t0 & 9) == 9) { // Likely true
; tmp3 = sum0 + 85 // Dead
; tmp4 = sum0 + 173
; } else {
; if ((t0 & 255) != 0) {
; sum2_nc = ((t0 & 1) == 0) ? sum0 : (sum0 + 85)
; sum4_nc_v = ((t0 & 8) == 0) ? 44 : 88
; sum4_nc = add sum2_nc + sum4_nc_v
; }
; }
; sum6 = phi tmp4, sum0, sum2_nc, sum4_nc
; return sum6
define i32 @test_chr_19(i32* %i, i32 %sum0) !prof !14 {
; CHECK-LABEL: @test_chr_19(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 9
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 9
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0:
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 173
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 255
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
; CHECK: bb0.nonchr:
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SUM0]], 85
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP7]], i32 [[SUM0]], i32 [[TMP8]], !prof !16
; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP10]], i32 44, i32 88
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP7]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
; CHECK-NEXT: ret i32 [[SUM6]]
;
; Input IR: in bb0 the same i1 value %4 is the condition of two selects and of
; the branch that follows them. The expected output above merges the tests into
; a single (%0 & 9) == 9 check.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 255
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb3, label %bb0, !prof !15
bb0:
%3 = and i32 %0, 1
; %4 is shared by both selects and the branch at the end of this block.
%4 = icmp eq i32 %3, 0
%5 = add i32 %sum0, 42
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
%6 = add i32 %sum1, 43
%sum2 = select i1 %4, i32 %sum1, i32 %6, !prof !15
br i1 %4, label %bb2, label %bb1, !prof !15
bb1:
; Reached when (%0 & 1) != 0.
%sum3 = add i32 %sum2, 44
%7 = and i32 %0, 8
%8 = icmp eq i32 %7, 0
%9 = add i32 %sum3, 44
%sum4 = select i1 %8, i32 %sum3, i32 %9, !prof !15
br label %bb2
bb2:
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
br label %bb3
bb3:
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
ret i32 %sum6
}
; Selects. The exit block, which belongs to the top-level region, has a select
; and causes the top-level region to be the outermost CHR scope with the
; subscope that includes the entry block with two selects. The outermost CHR
; scope doesn't see the selects in the entry block as the entry block is in the
; subscope and incorrectly sets the CHR hoist point to the branch rather than
; the first select in the entry block and causes the CHR'ed selects ("select i1
; false...") to incorrectly position above the CHR branch. This is testing
; against a quirk of how the region analysis handles the entry block.
; Roughly,
; i0 = *i
; sum2 = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
; sum3 = ((i0 & 4) == 0) ? sum2 : (sum2 + 44) // Likely false
; if (j)
; foo()
; i5 = *i
; v13 = (i5 == 44) ? i5 : sum3
; return v13
; ->
; i0 = *i
; if ((i0 & 6) == 6) { // Likely true
; v9 = sum0 + 87
; if (j)
; foo()
; } else {
; sum2.nc = ((i0 & 2) == 0) ? sum0 : (sum0 + 43)
; sum3.nc = ((i0 & 4) == 0) ? sum2.nc : (sum2.nc + 44)
; if (j)
; foo()
; }
; t2 = phi v9, sum3.nc
; i5 = *i
; v13 = (i5 == 44) ? 44 : t2
; return v13
define i32 @test_chr_20(i32* %i, i32 %sum0, i1 %j) !prof !14 {
; CHECK-LABEL: @test_chr_20(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I0]], 6
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 6
; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: entry.split:
; CHECK-NEXT: [[V9:%.*]] = add i32 [[SUM0:%.*]], 87
; CHECK-NEXT: br i1 [[J:%.*]], label [[BB1:%.*]], label [[BB4:%.*]]
; CHECK: bb1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0]], 43
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
; CHECK-NEXT: [[V4_NOT:%.*]] = icmp eq i32 [[V3]], 0
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4_NOT]], i32 [[SUM0]], i32 [[V8]], !prof !16
; CHECK-NEXT: [[V6_NONCHR:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V5_NONCHR:%.*]] = icmp eq i32 [[V6_NONCHR]], 0
; CHECK-NEXT: [[V9_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], 44
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = select i1 [[V5_NONCHR]], i32 [[SUM2_NONCHR]], i32 [[V9_NONCHR]], !prof !16
; CHECK-NEXT: br i1 [[J]], label [[BB1_NONCHR:%.*]], label [[BB4]]
; CHECK: bb1.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb4:
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[V9]], [[BB1]] ], [ [[V9]], [[ENTRY_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB1_NONCHR]] ], [ [[SUM3_NONCHR]], [[ENTRY_SPLIT_NONCHR]] ]
; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[I]], align 4
; CHECK-NEXT: [[V12:%.*]] = icmp eq i32 [[I5]], 44
; CHECK-NEXT: [[V13:%.*]] = select i1 [[V12]], i32 44, i32 [[TMP2]], !prof !16
; CHECK-NEXT: ret i32 [[V13]]
;
; Input IR: the entry block holds two profiled selects followed by a branch on
; %j that carries no profile data; the exit block bb4 holds a third profiled
; select. The expected output above merges the two entry-block selects into a
; single test of (i0 & 6) == 6 and leaves the bb4 select as a select.
entry:
%i0 = load i32, i32* %i
%v3 = and i32 %i0, 2
%v4 = icmp eq i32 %v3, 0
%v8 = add i32 %sum0, 43
; sum2 = ((i0 & 2) == 0) ? sum0 : sum0 + 43. !15 puts all weight on the false
; operand, i.e. on sum0 + 43.
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
%v6 = and i32 %i0, 4
%v5 = icmp eq i32 %v6, 0
%v9 = add i32 %sum2, 44
; sum3 = ((i0 & 4) == 0) ? sum2 : sum2 + 44, again weighted towards the false
; operand. With both selects taking the false operand, sum3 == sum0 + 87.
%sum3 = select i1 %v5, i32 %sum2, i32 %v9, !prof !15
; Branch without !prof metadata.
br i1 %j, label %bb1, label %bb4
bb1:
call void @foo()
br label %bb4
bb4:
; The pointer is loaded a second time here, after the possible call to @foo.
%i5 = load i32, i32* %i
%v12 = icmp eq i32 %i5, 44
; v13 = (i5 == 44) ? i5 : sum3
%v13 = select i1 %v12, i32 %i5, i32 %sum3, !prof !15
ret i32 %v13
}
; Test the case where two scopes share a common instruction to hoist (%cmp.i).
; Both scopes would hoist it to their own hoist points, but since the outer
; scope (entry/bb6-9) hoists it first, to its hoist point, it would be wrong
; (causing bad IR) for the inner scope (bb1-4) to hoist the same instruction to
; its hoist point.
; Roughly,
; if (j != k) {
; if (i != 2)
; foo();
; cmp.i = i == 86
; if (!cmp.i)
; foo();
; if (j != i)
; foo();
; if (!cmp.i)
; foo();
; }
; return 45;
define i32 @test_chr_21(i64 %i, i64 %k, i64 %j) !prof !14 {
; CHECK-LABEL: @test_chr_21(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i64 [[J:%.*]], [[K:%.*]]
; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[J]], [[I:%.*]]
; CHECK-NEXT: [[CMP_I:%.*]] = icmp ne i64 [[I]], 86
; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[CMP0]], [[CMP3]]
; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[CMP_I]]
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb1:
; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[I]], 2
; CHECK-NEXT: switch i64 [[I]], label [[BB2:%.*]] [
; CHECK-NEXT: i64 2, label [[BB3_NONCHR2:%.*]]
; CHECK-NEXT: i64 86, label [[BB2_NONCHR1:%.*]]
; CHECK-NEXT: ], !prof !20
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7:%.*]]
; CHECK: bb2.nonchr1:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3_NONCHR2]]
; CHECK: bb3.nonchr2:
; CHECK-NEXT: br i1 [[CMP_I]], label [[BB4_NONCHR3:%.*]], label [[BB7]], !prof !18
; CHECK: bb4.nonchr3:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb7:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB10:%.*]]
; CHECK: entry.split.nonchr:
; CHECK-NEXT: br i1 [[CMP0]], label [[BB1_NONCHR:%.*]], label [[BB10]], !prof !18
; CHECK: bb1.nonchr:
; CHECK-NEXT: [[CMP2_NONCHR:%.*]] = icmp eq i64 [[I]], 2
; CHECK-NEXT: br i1 [[CMP2_NONCHR]], label [[BB3_NONCHR:%.*]], label [[BB2_NONCHR:%.*]], !prof !16
; CHECK: bb3.nonchr:
; CHECK-NEXT: [[CMP_I_NONCHR:%.*]] = icmp eq i64 [[I]], 86
; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB6_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof !16
; CHECK: bb6.nonchr:
; CHECK-NEXT: [[CMP3_NONCHR:%.*]] = icmp eq i64 [[J]], [[I]]
; CHECK-NEXT: br i1 [[CMP3_NONCHR]], label [[BB8_NONCHR:%.*]], label [[BB7_NONCHR:%.*]], !prof !16
; CHECK: bb8.nonchr:
; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB10]], label [[BB9_NONCHR:%.*]], !prof !16
; CHECK: bb9.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB10]]
; CHECK: bb7.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB8_NONCHR]]
; CHECK: bb4.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB6_NONCHR]]
; CHECK: bb2.nonchr:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3_NONCHR]]
; CHECK: bb10:
; CHECK-NEXT: ret i32 45
;
; Input IR: five conditional branches, all carrying !prof !15. Every path ends
; in bb10, which returns the constant 45.
entry:
; Outer guard: j == k skips the whole body and goes straight to bb10.
%cmp0 = icmp eq i64 %j, %k
br i1 %cmp0, label %bb10, label %bb1, !prof !15
bb1:
%cmp2 = icmp eq i64 %i, 2
br i1 %cmp2, label %bb3, label %bb2, !prof !15
bb2:
call void @foo()
br label %bb3
bb3:
; %cmp.i is defined here and is the condition of two branches: the one at the
; end of this block and the one in bb8. This is the shared instruction the
; test is about.
%cmp.i = icmp eq i64 %i, 86
br i1 %cmp.i, label %bb5, label %bb4, !prof !15
bb4:
call void @foo()
br label %bb5
bb5:
; Empty block that only forwards to bb6.
br label %bb6
bb6:
%cmp3 = icmp eq i64 %j, %i
br i1 %cmp3, label %bb8, label %bb7, !prof !15
bb7:
call void @foo()
br label %bb8
bb8:
; Second use of %cmp.i as a branch condition (first use is in bb3).
br i1 %cmp.i, label %bb10, label %bb9, !prof !15
bb9:
call void @foo()
br label %bb10
bb10:
ret i32 45
}
; Test a case with really long use-def chains. This test checks that it's not
; really slow and doesn't appear to be hanging.
define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 {
; CHECK-LABEL: @test_chr_22(
; CHECK-NEXT: bb0:
; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1
; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3
; CHECK-NEXT: [[C1:%.*]] = icmp slt i64 [[V2]], 100
; CHECK-NEXT: br i1 [[C1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
; CHECK: bb0.split:
; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991
; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4
; CHECK-NEXT: ret i64 99
; CHECK: bb0.split.nonchr:
; CHECK-NEXT: [[V299_NONCHR:%.*]] = mul i64 [[V2]], 7860086430977039991
; CHECK-NEXT: store i64 [[V299_NONCHR]], i64* [[J]], align 4
; CHECK-NEXT: ret i64 99
;
; Input IR: a single basic block with two profiled selects separated by a long
; chain of dependent adds. The first select (%v3) is keyed on %c1; the second
; (%v302) is keyed on a compare of the last value of the chain.
bb0:
%v1 = add i64 %v0, 3
%v2 = add i64 %v1, %v0
%c1 = icmp sgt i64 %v2, 99
; %v3 is not referenced by any later instruction in this function.
%v3 = select i1 %c1, i64 %v1, i64 %v2, !prof !15
; Start of the chain: v4 = v2 + v2, v5 = v4 + v2, and from v6 on every value is
; the sum of the two values defined immediately before it.
%v4 = add i64 %v2, %v2
%v5 = add i64 %v4, %v2
%v6 = add i64 %v5, %v4
%v7 = add i64 %v6, %v5
%v8 = add i64 %v7, %v6
%v9 = add i64 %v8, %v7
%v10 = add i64 %v9, %v8
%v11 = add i64 %v10, %v9
%v12 = add i64 %v11, %v10
%v13 = add i64 %v12, %v11
%v14 = add i64 %v13, %v12
%v15 = add i64 %v14, %v13
%v16 = add i64 %v15, %v14
%v17 = add i64 %v16, %v15
%v18 = add i64 %v17, %v16
%v19 = add i64 %v18, %v17
%v20 = add i64 %v19, %v18
%v21 = add i64 %v20, %v19
%v22 = add i64 %v21, %v20
%v23 = add i64 %v22, %v21
%v24 = add i64 %v23, %v22
%v25 = add i64 %v24, %v23
%v26 = add i64 %v25, %v24
%v27 = add i64 %v26, %v25
%v28 = add i64 %v27, %v26
%v29 = add i64 %v28, %v27
%v30 = add i64 %v29, %v28
%v31 = add i64 %v30, %v29
%v32 = add i64 %v31, %v30
%v33 = add i64 %v32, %v31
%v34 = add i64 %v33, %v32
%v35 = add i64 %v34, %v33
%v36 = add i64 %v35, %v34
%v37 = add i64 %v36, %v35
%v38 = add i64 %v37, %v36
%v39 = add i64 %v38, %v37
%v40 = add i64 %v39, %v38
%v41 = add i64 %v40, %v39
%v42 = add i64 %v41, %v40
%v43 = add i64 %v42, %v41
%v44 = add i64 %v43, %v42
%v45 = add i64 %v44, %v43
%v46 = add i64 %v45, %v44
%v47 = add i64 %v46, %v45
%v48 = add i64 %v47, %v46
%v49 = add i64 %v48, %v47
%v50 = add i64 %v49, %v48
%v51 = add i64 %v50, %v49
%v52 = add i64 %v51, %v50
%v53 = add i64 %v52, %v51
%v54 = add i64 %v53, %v52
%v55 = add i64 %v54, %v53
%v56 = add i64 %v55, %v54
%v57 = add i64 %v56, %v55
%v58 = add i64 %v57, %v56
%v59 = add i64 %v58, %v57
%v60 = add i64 %v59, %v58
%v61 = add i64 %v60, %v59
%v62 = add i64 %v61, %v60
%v63 = add i64 %v62, %v61
%v64 = add i64 %v63, %v62
%v65 = add i64 %v64, %v63
%v66 = add i64 %v65, %v64
%v67 = add i64 %v66, %v65
%v68 = add i64 %v67, %v66
%v69 = add i64 %v68, %v67
%v70 = add i64 %v69, %v68
%v71 = add i64 %v70, %v69
%v72 = add i64 %v71, %v70
%v73 = add i64 %v72, %v71
%v74 = add i64 %v73, %v72
%v75 = add i64 %v74, %v73
%v76 = add i64 %v75, %v74
%v77 = add i64 %v76, %v75
%v78 = add i64 %v77, %v76
%v79 = add i64 %v78, %v77
%v80 = add i64 %v79, %v78
%v81 = add i64 %v80, %v79
%v82 = add i64 %v81, %v80
%v83 = add i64 %v82, %v81
%v84 = add i64 %v83, %v82
%v85 = add i64 %v84, %v83
%v86 = add i64 %v85, %v84
%v87 = add i64 %v86, %v85
%v88 = add i64 %v87, %v86
%v89 = add i64 %v88, %v87
%v90 = add i64 %v89, %v88
%v91 = add i64 %v90, %v89
%v92 = add i64 %v91, %v90
%v93 = add i64 %v92, %v91
%v94 = add i64 %v93, %v92
%v95 = add i64 %v94, %v93
%v96 = add i64 %v95, %v94
%v97 = add i64 %v96, %v95
%v98 = add i64 %v97, %v96
%v99 = add i64 %v98, %v97
%v100 = add i64 %v99, %v98
%v101 = add i64 %v100, %v99
%v102 = add i64 %v101, %v100
%v103 = add i64 %v102, %v101
%v104 = add i64 %v103, %v102
%v105 = add i64 %v104, %v103
%v106 = add i64 %v105, %v104
%v107 = add i64 %v106, %v105
%v108 = add i64 %v107, %v106
%v109 = add i64 %v108, %v107
%v110 = add i64 %v109, %v108
%v111 = add i64 %v110, %v109
%v112 = add i64 %v111, %v110
%v113 = add i64 %v112, %v111
%v114 = add i64 %v113, %v112
%v115 = add i64 %v114, %v113
%v116 = add i64 %v115, %v114
%v117 = add i64 %v116, %v115
%v118 = add i64 %v117, %v116
%v119 = add i64 %v118, %v117
%v120 = add i64 %v119, %v118
%v121 = add i64 %v120, %v119
%v122 = add i64 %v121, %v120
%v123 = add i64 %v122, %v121
%v124 = add i64 %v123, %v122
%v125 = add i64 %v124, %v123
%v126 = add i64 %v125, %v124
%v127 = add i64 %v126, %v125
%v128 = add i64 %v127, %v126
%v129 = add i64 %v128, %v127
%v130 = add i64 %v129, %v128
%v131 = add i64 %v130, %v129
%v132 = add i64 %v131, %v130
%v133 = add i64 %v132, %v131
%v134 = add i64 %v133, %v132
%v135 = add i64 %v134, %v133
%v136 = add i64 %v135, %v134
%v137 = add i64 %v136, %v135
%v138 = add i64 %v137, %v136
%v139 = add i64 %v138, %v137
%v140 = add i64 %v139, %v138
%v141 = add i64 %v140, %v139
%v142 = add i64 %v141, %v140
%v143 = add i64 %v142, %v141
%v144 = add i64 %v143, %v142
%v145 = add i64 %v144, %v143
%v146 = add i64 %v145, %v144
%v147 = add i64 %v146, %v145
%v148 = add i64 %v147, %v146
%v149 = add i64 %v148, %v147
%v150 = add i64 %v149, %v148
%v151 = add i64 %v150, %v149
%v152 = add i64 %v151, %v150
%v153 = add i64 %v152, %v151
%v154 = add i64 %v153, %v152
%v155 = add i64 %v154, %v153
%v156 = add i64 %v155, %v154
%v157 = add i64 %v156, %v155
%v158 = add i64 %v157, %v156
%v159 = add i64 %v158, %v157
%v160 = add i64 %v159, %v158
%v161 = add i64 %v160, %v159
%v162 = add i64 %v161, %v160
%v163 = add i64 %v162, %v161
%v164 = add i64 %v163, %v162
%v165 = add i64 %v164, %v163
%v166 = add i64 %v165, %v164
%v167 = add i64 %v166, %v165
%v168 = add i64 %v167, %v166
%v169 = add i64 %v168, %v167
%v170 = add i64 %v169, %v168
%v171 = add i64 %v170, %v169
%v172 = add i64 %v171, %v170
%v173 = add i64 %v172, %v171
%v174 = add i64 %v173, %v172
%v175 = add i64 %v174, %v173
%v176 = add i64 %v175, %v174
%v177 = add i64 %v176, %v175
%v178 = add i64 %v177, %v176
%v179 = add i64 %v178, %v177
%v180 = add i64 %v179, %v178
%v181 = add i64 %v180, %v179
%v182 = add i64 %v181, %v180
%v183 = add i64 %v182, %v181
%v184 = add i64 %v183, %v182
%v185 = add i64 %v184, %v183
%v186 = add i64 %v185, %v184
%v187 = add i64 %v186, %v185
%v188 = add i64 %v187, %v186
%v189 = add i64 %v188, %v187
%v190 = add i64 %v189, %v188
%v191 = add i64 %v190, %v189
%v192 = add i64 %v191, %v190
%v193 = add i64 %v192, %v191
%v194 = add i64 %v193, %v192
%v195 = add i64 %v194, %v193
%v196 = add i64 %v195, %v194
%v197 = add i64 %v196, %v195
%v198 = add i64 %v197, %v196
%v199 = add i64 %v198, %v197
%v200 = add i64 %v199, %v198
%v201 = add i64 %v200, %v199
%v202 = add i64 %v201, %v200
%v203 = add i64 %v202, %v201
%v204 = add i64 %v203, %v202
%v205 = add i64 %v204, %v203
%v206 = add i64 %v205, %v204
%v207 = add i64 %v206, %v205
%v208 = add i64 %v207, %v206
%v209 = add i64 %v208, %v207
%v210 = add i64 %v209, %v208
%v211 = add i64 %v210, %v209
%v212 = add i64 %v211, %v210
%v213 = add i64 %v212, %v211
%v214 = add i64 %v213, %v212
%v215 = add i64 %v214, %v213
%v216 = add i64 %v215, %v214
%v217 = add i64 %v216, %v215
%v218 = add i64 %v217, %v216
%v219 = add i64 %v218, %v217
%v220 = add i64 %v219, %v218
%v221 = add i64 %v220, %v219
%v222 = add i64 %v221, %v220
%v223 = add i64 %v222, %v221
%v224 = add i64 %v223, %v222
%v225 = add i64 %v224, %v223
%v226 = add i64 %v225, %v224
%v227 = add i64 %v226, %v225
%v228 = add i64 %v227, %v226
%v229 = add i64 %v228, %v227
%v230 = add i64 %v229, %v228
%v231 = add i64 %v230, %v229
%v232 = add i64 %v231, %v230
%v233 = add i64 %v232, %v231
%v234 = add i64 %v233, %v232
%v235 = add i64 %v234, %v233
%v236 = add i64 %v235, %v234
%v237 = add i64 %v236, %v235
%v238 = add i64 %v237, %v236
%v239 = add i64 %v238, %v237
%v240 = add i64 %v239, %v238
%v241 = add i64 %v240, %v239
%v242 = add i64 %v241, %v240
%v243 = add i64 %v242, %v241
%v244 = add i64 %v243, %v242
%v245 = add i64 %v244, %v243
%v246 = add i64 %v245, %v244
%v247 = add i64 %v246, %v245
%v248 = add i64 %v247, %v246
%v249 = add i64 %v248, %v247
%v250 = add i64 %v249, %v248
%v251 = add i64 %v250, %v249
%v252 = add i64 %v251, %v250
%v253 = add i64 %v252, %v251
%v254 = add i64 %v253, %v252
%v255 = add i64 %v254, %v253
%v256 = add i64 %v255, %v254
%v257 = add i64 %v256, %v255
%v258 = add i64 %v257, %v256
%v259 = add i64 %v258, %v257
%v260 = add i64 %v259, %v258
%v261 = add i64 %v260, %v259
%v262 = add i64 %v261, %v260
%v263 = add i64 %v262, %v261
%v264 = add i64 %v263, %v262
%v265 = add i64 %v264, %v263
%v266 = add i64 %v265, %v264
%v267 = add i64 %v266, %v265
%v268 = add i64 %v267, %v266
%v269 = add i64 %v268, %v267
%v270 = add i64 %v269, %v268
%v271 = add i64 %v270, %v269
%v272 = add i64 %v271, %v270
%v273 = add i64 %v272, %v271
%v274 = add i64 %v273, %v272
%v275 = add i64 %v274, %v273
%v276 = add i64 %v275, %v274
%v277 = add i64 %v276, %v275
%v278 = add i64 %v277, %v276
%v279 = add i64 %v278, %v277
%v280 = add i64 %v279, %v278
%v281 = add i64 %v280, %v279
%v282 = add i64 %v281, %v280
%v283 = add i64 %v282, %v281
%v284 = add i64 %v283, %v282
%v285 = add i64 %v284, %v283
%v286 = add i64 %v285, %v284
%v287 = add i64 %v286, %v285
%v288 = add i64 %v287, %v286
%v289 = add i64 %v288, %v287
%v290 = add i64 %v289, %v288
%v291 = add i64 %v290, %v289
%v292 = add i64 %v291, %v290
%v293 = add i64 %v292, %v291
%v294 = add i64 %v293, %v292
%v295 = add i64 %v294, %v293
%v296 = add i64 %v295, %v294
%v297 = add i64 %v296, %v295
%v298 = add i64 %v297, %v296
%v299 = add i64 %v298, %v297
%v300 = add i64 %v299, %v298
; Second profiled select: its condition depends on the end of the chain
; (%v300) and it picks between the two values just before it.
%v301 = icmp eq i64 %v300, 100
%v302 = select i1 %v301, i64 %v298, i64 %v299, !prof !15
store i64 %v302, i64* %j
ret i64 99
}
; Test a case with really long use-def chains. This test checks that it's not
; really slow and doesn't appear to be hanging. This is different from
; test_chr_22 in that it has nested control structures (multiple scopes) and
; covers additional code.
define i64 @test_chr_23(i64 %v0) !prof !14 {
; CHECK-LABEL: @test_chr_23(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[V0:%.*]], 50
; CHECK-NEXT: [[V10_NOT:%.*]] = icmp eq i64 [[TMP0]], -50
; CHECK-NEXT: ret i64 99
;
; Input IR: eleven stages (entry, body, body.1 .. body.9). Each stage computes
; the same nine-add chain, seeded with the previous stage's last value plus 3
; (entry is seeded with %v0 + 3). Every stage except body.9 compares its last
; value with 100 and either falls into the next stage or jumps to %end; all of
; those branches carry !prof !15. No stage stores or returns a chain value;
; %end returns the constant 99, and the expected output above is a single
; block that returns 99.
entry:
%v1 = add i64 %v0, 3
%v2 = add i64 %v1, %v1
%v3 = add i64 %v2, %v1
%v4 = add i64 %v2, %v3
%v5 = add i64 %v4, %v2
%v6 = add i64 %v5, %v4
%v7 = add i64 %v6, %v5
%v8 = add i64 %v7, %v6
%v9 = add i64 %v8, %v7
%v10 = icmp eq i64 %v9, 100
br i1 %v10, label %body, label %end, !prof !15
body:
; Seeded from %v9 of the entry block.
%v1_0 = add i64 %v9, 3
%v2_0 = add i64 %v1_0, %v1_0
%v3_0 = add i64 %v2_0, %v1_0
%v4_0 = add i64 %v2_0, %v3_0
%v5_0 = add i64 %v4_0, %v2_0
%v6_0 = add i64 %v5_0, %v4_0
%v7_0 = add i64 %v6_0, %v5_0
%v8_0 = add i64 %v7_0, %v6_0
%v9_0 = add i64 %v8_0, %v7_0
%v10_0 = icmp eq i64 %v9_0, 100
br i1 %v10_0, label %body.1, label %end, !prof !15
body.1:
%v1_1 = add i64 %v9_0, 3
%v2_1 = add i64 %v1_1, %v1_1
%v3_1 = add i64 %v2_1, %v1_1
%v4_1 = add i64 %v2_1, %v3_1
%v5_1 = add i64 %v4_1, %v2_1
%v6_1 = add i64 %v5_1, %v4_1
%v7_1 = add i64 %v6_1, %v5_1
%v8_1 = add i64 %v7_1, %v6_1
%v9_1 = add i64 %v8_1, %v7_1
%v10_1 = icmp eq i64 %v9_1, 100
br i1 %v10_1, label %body.2, label %end, !prof !15
body.2:
%v1_2 = add i64 %v9_1, 3
%v2_2 = add i64 %v1_2, %v1_2
%v3_2 = add i64 %v2_2, %v1_2
%v4_2 = add i64 %v2_2, %v3_2
%v5_2 = add i64 %v4_2, %v2_2
%v6_2 = add i64 %v5_2, %v4_2
%v7_2 = add i64 %v6_2, %v5_2
%v8_2 = add i64 %v7_2, %v6_2
%v9_2 = add i64 %v8_2, %v7_2
%v10_2 = icmp eq i64 %v9_2, 100
br i1 %v10_2, label %body.3, label %end, !prof !15
body.3:
%v1_3 = add i64 %v9_2, 3
%v2_3 = add i64 %v1_3, %v1_3
%v3_3 = add i64 %v2_3, %v1_3
%v4_3 = add i64 %v2_3, %v3_3
%v5_3 = add i64 %v4_3, %v2_3
%v6_3 = add i64 %v5_3, %v4_3
%v7_3 = add i64 %v6_3, %v5_3
%v8_3 = add i64 %v7_3, %v6_3
%v9_3 = add i64 %v8_3, %v7_3
%v10_3 = icmp eq i64 %v9_3, 100
br i1 %v10_3, label %body.4, label %end, !prof !15
body.4:
%v1_4 = add i64 %v9_3, 3
%v2_4 = add i64 %v1_4, %v1_4
%v3_4 = add i64 %v2_4, %v1_4
%v4_4 = add i64 %v2_4, %v3_4
%v5_4 = add i64 %v4_4, %v2_4
%v6_4 = add i64 %v5_4, %v4_4
%v7_4 = add i64 %v6_4, %v5_4
%v8_4 = add i64 %v7_4, %v6_4
%v9_4 = add i64 %v8_4, %v7_4
%v10_4 = icmp eq i64 %v9_4, 100
br i1 %v10_4, label %body.5, label %end, !prof !15
body.5:
%v1_5 = add i64 %v9_4, 3
%v2_5 = add i64 %v1_5, %v1_5
%v3_5 = add i64 %v2_5, %v1_5
%v4_5 = add i64 %v2_5, %v3_5
%v5_5 = add i64 %v4_5, %v2_5
%v6_5 = add i64 %v5_5, %v4_5
%v7_5 = add i64 %v6_5, %v5_5
%v8_5 = add i64 %v7_5, %v6_5
%v9_5 = add i64 %v8_5, %v7_5
%v10_5 = icmp eq i64 %v9_5, 100
br i1 %v10_5, label %body.6, label %end, !prof !15
body.6:
%v1_6 = add i64 %v9_5, 3
%v2_6 = add i64 %v1_6, %v1_6
%v3_6 = add i64 %v2_6, %v1_6
%v4_6 = add i64 %v2_6, %v3_6
%v5_6 = add i64 %v4_6, %v2_6
%v6_6 = add i64 %v5_6, %v4_6
%v7_6 = add i64 %v6_6, %v5_6
%v8_6 = add i64 %v7_6, %v6_6
%v9_6 = add i64 %v8_6, %v7_6
%v10_6 = icmp eq i64 %v9_6, 100
br i1 %v10_6, label %body.7, label %end, !prof !15
body.7:
%v1_7 = add i64 %v9_6, 3
%v2_7 = add i64 %v1_7, %v1_7
%v3_7 = add i64 %v2_7, %v1_7
%v4_7 = add i64 %v2_7, %v3_7
%v5_7 = add i64 %v4_7, %v2_7
%v6_7 = add i64 %v5_7, %v4_7
%v7_7 = add i64 %v6_7, %v5_7
%v8_7 = add i64 %v7_7, %v6_7
%v9_7 = add i64 %v8_7, %v7_7
%v10_7 = icmp eq i64 %v9_7, 100
br i1 %v10_7, label %body.8, label %end, !prof !15
body.8:
%v1_8 = add i64 %v9_7, 3
%v2_8 = add i64 %v1_8, %v1_8
%v3_8 = add i64 %v2_8, %v1_8
%v4_8 = add i64 %v2_8, %v3_8
%v5_8 = add i64 %v4_8, %v2_8
%v6_8 = add i64 %v5_8, %v4_8
%v7_8 = add i64 %v6_8, %v5_8
%v8_8 = add i64 %v7_8, %v6_8
%v9_8 = add i64 %v8_8, %v7_8
%v10_8 = icmp eq i64 %v9_8, 100
br i1 %v10_8, label %body.9, label %end, !prof !15
body.9:
; Last stage: same chain, but no compare; it branches unconditionally to %end.
%v1_9 = add i64 %v9_8, 3
%v2_9 = add i64 %v1_9, %v1_9
%v3_9 = add i64 %v2_9, %v1_9
%v4_9 = add i64 %v2_9, %v3_9
%v5_9 = add i64 %v4_9, %v2_9
%v6_9 = add i64 %v5_9, %v4_9
%v7_9 = add i64 %v6_9, %v5_9
%v8_9 = add i64 %v7_9, %v6_9
%v9_9 = add i64 %v8_9, %v7_9
br label %end
end:
ret i64 99
}
; Test to not crash upon a 0:0 branch_weight metadata.
define void @test_chr_24(i32* %i) !prof !14 {
; CHECK-LABEL: @test_chr_24(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[BB0:%.*]], !prof !21
; CHECK: bb0:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2:%.*]], !prof !21
; CHECK: bb2:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; Input IR: two back-to-back conditional calls to @foo, testing bit 0 and bit 1
; of the loaded value. Both branches carry !prof !17, whose two weights are
; both zero. The expected output above keeps the two branches separate, with
; the same block structure as the input.
entry:
%0 = load i32, i32* %i
%1 = and i32 %0, 1
%2 = icmp eq i32 %1, 0
; 0:0 branch weights.
br i1 %2, label %bb1, label %bb0, !prof !17
bb0:
call void @foo()
br label %bb1
bb1:
%3 = and i32 %0, 2
%4 = icmp eq i32 %3, 0
; 0:0 branch weights.
br i1 %4, label %bb3, label %bb2, !prof !17
bb2:
call void @foo()
br label %bb3
bb3:
ret void
}
; Module-level metadata shared by the tests in this file.
; !0 attaches a "ProfileSummary" module flag whose payload is !1.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
; !1 lists the summary fields !2 .. !9 defined below.
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
; The detailed summary has three entries (!11, !12, !13).
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
; Function entry count of 100, attached to the test functions via !prof !14.
!14 = !{!"function_entry_count", i64 100}
; Branch weights 0:1 - all weight on the second successor of a branch, or on
; the false operand of a select.
!15 = !{!"branch_weights", i32 0, i32 1}
; Branch weights 1:1.
!16 = !{!"branch_weights", i32 1, i32 1}
; Branch weights 0:0, used by test_chr_24.
!17 = !{!"branch_weights", i32 0, i32 0}
; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0}
; CHECK: !16 = !{!"branch_weights", i32 0, i32 1}
; CHECK: !17 = !{!"branch_weights", i32 1, i32 1}
; CHECK: !18 = !{!"branch_weights", i32 1, i32 0}
; CHECK: !19 = !{!"branch_weights", i32 0, i32 1000}