2019-04-17 12:52:47 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
|
|
; RUN: opt < %s -chr -instcombine -simplifycfg -S | FileCheck %s
|
|
|
|
; RUN: opt < %s -passes='require<profile-summary>,function(chr,instcombine,simplify-cfg)' -S | FileCheck %s
|
|
|
|
|
|
|
|
; External functions called by the test functions; only their declarations appear here.
declare void @foo()
|
|
|
|
declare void @bar()
|
|
|
|
|
|
|
|
; Simple case.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 3) == 3) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t0 & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
define void @test_chr_1(i32* %i) !prof !14 {
; The CHECK lines expect one merged guard, (load & 3) == 3, in front of a hot path that
; calls @foo twice with no further tests; the *.nonchr blocks retain the per-bit tests.
|
|
|
|
; CHECK-LABEL: @test_chr_1(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
|
|
|
; Input IR: a single load feeds two bit tests (masks 1 and 2); each test skips its
; @foo call when the masked bit is zero.
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Simple case with a cold block.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) == 0) // Likely false
|
|
|
|
; bar()
|
|
|
|
; if ((t0 & 4) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 7) == 7) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t0 & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) == 0)
|
|
|
|
; bar()
|
|
|
|
; if ((t0 & 4) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
define void @test_chr_1_1(i32* %i) !prof !14 {
; The CHECK lines expect a merged guard, (load & 7) == 7, whose hot path calls @foo twice
; and never calls @bar; the @bar call survives only in bb2.nonchr.
|
|
|
|
; CHECK-LABEL: @test_chr_1_1(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 7
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 7
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB5:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB2_NONCHR:%.*]], label [[BB3_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @bar()
|
|
|
|
; CHECK-NEXT: br label [[BB3_NONCHR]]
|
|
|
|
; CHECK: bb3.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 4
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[BB5]], label [[BB4_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb4.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB5]]
|
|
|
|
; CHECK: bb5:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
|
|
|
; Input IR: three bit tests (masks 1, 2, 4) on one load. The mask-2 test differs from
; the other two: it runs @bar when the bit is zero, rather than @foo when the bit is set.
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb2, label %bb3, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @bar()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%5 = and i32 %0, 4
|
|
|
|
%6 = icmp eq i32 %5, 0
|
|
|
|
br i1 %6, label %bb5, label %bb4, !prof !15
|
|
|
|
|
|
|
|
bb4:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb5
|
|
|
|
|
|
|
|
bb5:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; With an aggregate bit check.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 255) != 0) // Likely true
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 3) == 3) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else if ((t0 & 255) != 0) {
|
|
|
|
; if ((t0 & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
define void @test_chr_2(i32* %i) !prof !14 {
; The CHECK lines expect the hot-path guard to be (load & 3) == 3 only; the (load & 255)
; test appears solely on the entry.split.nonchr path.
|
|
|
|
; CHECK-LABEL: @test_chr_2(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB4:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 255
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB4]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB2_NONCHR:%.*]], label [[BB1_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[BB4]], label [[BB3_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb3.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB4]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB2_NONCHR]]
|
|
|
|
; CHECK: bb4:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
|
|
|
; Input IR: an outer (load & 255) test jumps straight to bb4 when the result is zero;
; otherwise the mask-1 and mask-2 tests each guard one call to @foo.
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 255
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb4, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%3 = and i32 %0, 1
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%5 = and i32 %0, 2
|
|
|
|
%6 = icmp eq i32 %5, 0
|
|
|
|
br i1 %6, label %bb4, label %bb3, !prof !15
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb4
|
|
|
|
|
|
|
|
bb4:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Split case.
|
|
|
|
; Roughly,
|
|
|
|
; t1 = *i
|
|
|
|
; if ((t1 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((t1 & 2) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; t2 = *i
|
|
|
|
; if ((t2 & 4) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((t2 & 8) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; ->
|
|
|
|
; t1 = *i
|
|
|
|
; if ((t1 & 3) == 3) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t1 & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t1 & 2) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; t2 = *i
|
|
|
|
; if ((t2 & 12) == 12) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t2 & 4) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t2 & 8) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
define void @test_chr_3(i32* %i) !prof !14 {
; The CHECK lines expect two independent guards, one per load: (first load & 3) == 3 in
; entry and (second load & 12) == 12 in bb3, each with its own *.nonchr fallback blocks.
|
|
|
|
; CHECK-LABEL: @test_chr_3(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[I]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 12
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 12
|
|
|
|
; CHECK-NEXT: br i1 [[TMP9]], label [[BB4:%.*]], label [[BB3_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb4:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB7:%.*]]
|
|
|
|
; CHECK: bb3.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP7]], 4
|
|
|
|
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP11]], label [[BB5_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb4.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB5_NONCHR]]
|
|
|
|
; CHECK: bb5.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP12:%.*]] = and i32 [[TMP7]], 8
|
|
|
|
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP13]], label [[BB7]], label [[BB6_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb6.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB7]]
|
|
|
|
; CHECK: bb7:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
|
|
|
; Input IR: masks 1 and 2 test the value loaded in entry; bb3 reloads from %i, and
; masks 4 and 8 test that second value. Each test guards one call to @foo.
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%5 = load i32, i32* %i
|
|
|
|
%6 = and i32 %5, 4
|
|
|
|
%7 = icmp eq i32 %6, 0
|
|
|
|
br i1 %7, label %bb5, label %bb4, !prof !15
|
|
|
|
|
|
|
|
bb4:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb5
|
|
|
|
|
|
|
|
bb5:
|
|
|
|
%8 = and i32 %5, 8
|
|
|
|
%9 = icmp eq i32 %8, 0
|
|
|
|
br i1 %9, label %bb7, label %bb6, !prof !15
|
|
|
|
|
|
|
|
bb6:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb7
|
|
|
|
|
|
|
|
bb7:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; sum1 = (t0 & 1) ? sum0 : (sum0 + 42) // Likely false
|
|
|
|
; sum2 = (t0 & 2) ? sum1 : (sum1 + 43) // Likely false
|
|
|
|
; return sum2
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 3) == 3)
|
|
|
|
; return sum0 + 85
|
|
|
|
; else {
|
|
|
|
; sum1 = (t0 & 1) ? sum0 : (sum0 + 42)
|
|
|
|
; sum2 = (t0 & 2) ? sum1 : (sum1 + 43)
|
|
|
|
; return sum2
|
|
|
|
; }
|
|
|
|
define i32 @test_chr_4(i32* %i, i32 %sum0) !prof !14 {
; The CHECK lines expect the hot path (entry.split) to return %sum0 + 85 directly, while
; entry.split.nonchr keeps both selects.
|
|
|
|
; CHECK-LABEL: @test_chr_4(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: entry.split:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
|
|
|
|
; CHECK-NEXT: ret i32 [[TMP3]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 42
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP6]], i32 [[SUM0]], i32 [[TMP4]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM1_NONCHR]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM1_NONCHR]], i32 [[TMP9]], !prof !16
|
|
|
|
; CHECK-NEXT: ret i32 [[SUM2_NONCHR]]
|
|
|
|
;
|
|
|
|
; Input IR: a single block with no branches. Two selects keyed on bits 0 and 1 of the
; load add 42 and then 43 to the running sum when the corresponding bit is non-zero.
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
%3 = add i32 %sum0, 42
|
|
|
|
%sum1 = select i1 %2, i32 %sum0, i32 %3, !prof !15
|
|
|
|
%4 = and i32 %0, 2
|
|
|
|
%5 = icmp eq i32 %4, 0
|
|
|
|
%6 = add i32 %sum1, 43
|
|
|
|
%sum2 = select i1 %5, i32 %sum1, i32 %6, !prof !15
|
|
|
|
ret i32 %sum2
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects + Brs
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 255) != 0) { // Likely true
|
|
|
|
; sum = (t0 & 1) ? sum0 : (sum0 + 42) // Likely false
|
|
|
|
; sum = (t0 & 2) ? sum : (sum + 43) // Likely false
|
|
|
|
; if ((t0 & 4) != 0) { // Likely true
|
|
|
|
; sum3 = sum + 44
|
|
|
|
; sum = (t0 & 8) ? sum3 : (sum3 + 44) // Likely false
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 15) == 15) { // Likely true
|
|
|
|
; sum = sum0 + 173
|
|
|
|
; } else if ((t0 & 255) != 0) {
|
|
|
|
; sum = (t0 & 1) ? sum0 : (sum0 + 42)
|
|
|
|
; sum = (t0 & 2) ? sum : (sum + 43)
|
|
|
|
; if ((t0 & 4) != 0) {
|
|
|
|
; sum3 = sum + 44
|
|
|
|
; sum = (t0 & 8) ? sum3 : (sum3 + 44)
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
define i32 @test_chr_5(i32* %i, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_5(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 15
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 15
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 173
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 255
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM0]], 42
|
|
|
|
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM0]], i32 [[TMP9]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
|
|
|
|
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SUM1_NONCHR]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP11]], i32 [[SUM1_NONCHR]], i32 [[TMP12]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP0]], 4
|
|
|
|
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
|
|
|
|
; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP0]], 8
|
|
|
|
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of its inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add its own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of its inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add its own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP14]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of its inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add its own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: ret i32 [[SUM6]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 255
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb3, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%3 = and i32 %0, 1
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
%5 = add i32 %sum0, 42
|
|
|
|
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
|
|
|
|
%6 = and i32 %0, 2
|
|
|
|
%7 = icmp eq i32 %6, 0
|
|
|
|
%8 = add i32 %sum1, 43
|
|
|
|
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
|
|
|
|
%9 = and i32 %0, 4
|
|
|
|
%10 = icmp eq i32 %9, 0
|
|
|
|
br i1 %10, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%sum3 = add i32 %sum2, 44
|
|
|
|
%11 = and i32 %0, 8
|
|
|
|
%12 = icmp eq i32 %11, 0
|
|
|
|
%13 = add i32 %sum3, 44
|
|
|
|
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
|
|
|
|
ret i32 %sum6
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects + Brs with a scope split in the middle
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 255) != 0) { // Likely true
|
|
|
|
; sum = (t0 & 1) ? sum0 : (sum0 + 42) // Likely false
|
|
|
|
; sum = (t0 & 2) ? sum : (sum + 43) // Likely false
|
|
|
|
; if ((sum0 & 4) != 0) { // Likely true. The condition doesn't use t0.
|
|
|
|
; sum3 = sum + 44
|
|
|
|
; sum = (t0 & 8) ? sum3 : (sum3 + 44) // Likely false
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((sum0 & 4) != 0 & (t0 & 11) == 11) { // Likely true
|
|
|
|
; sum = sum0 + 173
|
|
|
|
; } else if ((t0 & 255) != 0) {
|
|
|
|
; sum = (t0 & 1) ? sum0 : (sum0 + 42)
|
|
|
|
; sum = (t0 & 2) ? sum : (sum + 43)
|
|
|
|
; if ((sum0 & 4) != 0) {
|
|
|
|
; sum3 = sum + 44
|
|
|
|
; sum = (t0 & 8) ? sum3 : (sum3 + 44)
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
define i32 @test_chr_5_1(i32* %i, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_5_1(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[SUM0:%.*]], 4
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 11
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 11
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP4]], [[TMP2]]
|
|
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SUM0]], 85
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SUM0]], 173
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP0]], 255
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP9]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
|
|
|
|
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SUM0]], 42
|
|
|
|
; CHECK-NEXT: [[SUM1_NONCHR:%.*]] = select i1 [[TMP11]], i32 [[SUM0]], i32 [[TMP12]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
|
|
|
|
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SUM1_NONCHR]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP14]], i32 [[SUM1_NONCHR]], i32 [[TMP15]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[SUM0]], 4
|
|
|
|
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0
|
|
|
|
; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP0]], 8
|
|
|
|
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and the other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of their inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add a dedicated `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
it will allow folding when there are 2 BBs each with cost=2.
And since the logic has changed, it will also allow folding when
one BB has cost=3 and the other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP19]], i32 44, i32 88
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and the other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of their inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add a dedicated `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
it will allow folding when there are 2 BBs each with cost=2.
And since the logic has changed, it will also allow folding when
one BB has cost=3 and the other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP17]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and the other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of their inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add a dedicated `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
it will allow folding when there are 2 BBs each with cost=2.
And since the logic has changed, it will also allow folding when
one BB has cost=3 and the other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP7]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: ret i32 [[SUM6]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 255
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb3, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%3 = and i32 %0, 1
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
%5 = add i32 %sum0, 42
|
|
|
|
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
|
|
|
|
%6 = and i32 %0, 2
|
|
|
|
%7 = icmp eq i32 %6, 0
|
|
|
|
%8 = add i32 %sum1, 43
|
|
|
|
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
|
|
|
|
%9 = and i32 %sum0, 4 ; Split
|
|
|
|
%10 = icmp eq i32 %9, 0
|
|
|
|
br i1 %10, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%sum3 = add i32 %sum2, 44
|
|
|
|
%11 = and i32 %0, 8
|
|
|
|
%12 = icmp eq i32 %11, 0
|
|
|
|
%13 = add i32 %sum3, 44
|
|
|
|
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
|
|
|
|
ret i32 %sum6
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects + Brs, non-matching bases
|
|
|
|
; Roughly,
|
|
|
|
; i0 = *i
|
|
|
|
; j0 = *j
|
|
|
|
; if ((i0 & 255) != 0) { // Likely true
|
|
|
|
; sum = (i0 & 2) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; if ((j0 & 4) != 0) { // Likely true. The condition uses j0, not i0.
|
|
|
|
; sum3 = sum + 44
|
|
|
|
; sum = (i0 & 8) ? sum3 : (sum3 + 44) // Likely false
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
; ->
|
|
|
|
; i0 = *i
|
|
|
|
; j0 = *j
|
|
|
|
; if ((j0 & 4) != 0 & (i0 & 10) == 10) { // Likely true
|
|
|
|
; sum = sum0 + 131
|
|
|
|
; } else if ((i0 & 255) != 0) {
|
|
|
|
; sum = (i0 & 2) ? sum0 : (sum0 + 43)
|
|
|
|
; if ((j0 & 4) != 0) {
|
|
|
|
; sum3 = sum + 44
|
|
|
|
; sum = (i0 & 8) ? sum3 : (sum3 + 44)
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
define i32 @test_chr_6(i32* %i, i32* %j, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_6(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
|
|
|
|
; CHECK-NEXT: [[V10:%.*]] = icmp ne i32 [[V9]], 0
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I0]], 10
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 10
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[V10]]
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
|
|
|
|
; CHECK-NEXT: [[V13:%.*]] = add i32 [[SUM0]], 131
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[V1:%.*]] = and i32 [[I0]], 255
|
|
|
|
; CHECK-NEXT: [[V2:%.*]] = icmp eq i32 [[V1]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V2]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: [[V3_NONCHR:%.*]] = and i32 [[I0]], 2
|
|
|
|
; CHECK-NEXT: [[V4_NONCHR:%.*]] = icmp eq i32 [[V3_NONCHR]], 0
|
|
|
|
; CHECK-NEXT: [[V8_NONCHR:%.*]] = add i32 [[SUM0]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4_NONCHR]], i32 [[SUM0]], i32 [[V8_NONCHR]], !prof !16
|
|
|
|
; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[J0]], 4
|
|
|
|
; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0
|
|
|
|
; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[I0]], 8
|
|
|
|
; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and the other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of their inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add a dedicated `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
it will allow folding when there are 2 BBs each with cost=2.
And since the logic has changed, it will also allow folding when
one BB has cost=3 and the other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[V12_NONCHR]], i32 44, i32 88
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and the other one had no instructions,
even though that would have a total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of their inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add a dedicated `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
it will allow folding when there are 2 BBs each with cost=2.
And since the logic has changed, it will also allow folding when
one BB has cost=3 and the other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[V10_NONCHR]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even though that would have total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of its inputs are available: https://godbolt.org/z/zgHePf
So I don't see why the existing behavior is the correct one.
Also, let's add its own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[V13]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: ret i32 [[SUM6]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* %i
|
|
|
|
%j0 = load i32, i32* %j
|
|
|
|
%v1 = and i32 %i0, 255
|
|
|
|
%v2 = icmp eq i32 %v1, 0
|
|
|
|
br i1 %v2, label %bb3, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%v3 = and i32 %i0, 2
|
|
|
|
%v4 = icmp eq i32 %v3, 0
|
|
|
|
%v8 = add i32 %sum0, 43
|
|
|
|
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
%v9 = and i32 %j0, 4
|
|
|
|
%v10 = icmp eq i32 %v9, 0
|
|
|
|
br i1 %v10, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%sum3 = add i32 %sum2, 44
|
|
|
|
%v11 = and i32 %i0, 8
|
|
|
|
%v12 = icmp eq i32 %v11, 0
|
|
|
|
%v13 = add i32 %sum3, 44
|
|
|
|
%sum4 = select i1 %v12, i32 %sum3, i32 %v13, !prof !15
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
|
|
|
|
ret i32 %sum6
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects + Brs, the branch condition can't be hoisted to be merged with a
|
|
|
|
; select. No CHR happens.
|
|
|
|
; Roughly,
|
|
|
|
; i0 = *i
|
|
|
|
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; foo();
|
|
|
|
; j0 = *j
|
|
|
|
; if ((j0 & 4) != 0) { // Likely true
|
|
|
|
; foo();
|
|
|
|
; sum = sum + 44
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
; ->
|
|
|
|
; (no change)
|
|
|
|
define i32 @test_chr_7(i32* %i, i32* %j, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_7(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
|
|
|
|
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
|
|
|
|
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
|
|
|
|
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V10]], label [[BB2:%.*]], label [[BB1:%.*]], !prof !16
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[SUM4:%.*]] = add i32 [[SUM2]], 44
|
|
|
|
; CHECK-NEXT: br label [[BB2]]
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: [[SUM5:%.*]] = phi i32 [ [[SUM2]], [[ENTRY:%.*]] ], [ [[SUM4]], [[BB1]] ]
|
|
|
|
; CHECK-NEXT: ret i32 [[SUM5]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* %i
|
|
|
|
%v3 = and i32 %i0, 2
|
|
|
|
%v4 = icmp eq i32 %v3, 0
|
|
|
|
%v8 = add i32 %sum0, 43
|
|
|
|
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
call void @foo()
|
|
|
|
%j0 = load i32, i32* %j
|
|
|
|
%v9 = and i32 %j0, 4
|
|
|
|
%v10 = icmp eq i32 %v9, 0
|
|
|
|
br i1 %v10, label %bb2, label %bb1, !prof !15 ; %v10 can't be hoisted above the above select
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
call void @foo()
|
|
|
|
%sum4 = add i32 %sum2, 44
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum5 = phi i32 [ %sum2, %entry ], [ %sum4, %bb1 ]
|
|
|
|
ret i32 %sum5
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects + Brs, the branch condition can't be hoisted to be merged with the
|
|
|
|
; selects. Dropping the select.
|
|
|
|
; Roughly,
|
|
|
|
; i0 = *i
|
|
|
|
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; foo();
|
|
|
|
; j0 = *j
|
|
|
|
; if ((j0 & 4) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((j0 & 8) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; return sum
|
|
|
|
; ->
|
|
|
|
; i0 = *i
|
|
|
|
; sum = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; foo();
|
|
|
|
; j0 = *j
|
|
|
|
; if ((j0 & 12) == 12) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((j0 & 4) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((j0 & 8) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; return sum
|
|
|
|
define i32 @test_chr_7_1(i32* %i, i32* %j, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_7_1(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[J0]], 12
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 12
|
|
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[V9:%.*]] = and i32 [[J0]], 4
|
|
|
|
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V10]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[J0]], 8
|
|
|
|
; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V12_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
2020-05-22 15:32:21 +08:00
|
|
|
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
|
|
|
|
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
|
|
|
|
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: ret i32 [[SUM2]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* %i
|
|
|
|
%v3 = and i32 %i0, 2
|
|
|
|
%v4 = icmp eq i32 %v3, 0
|
|
|
|
%v8 = add i32 %sum0, 43
|
|
|
|
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
call void @foo()
|
|
|
|
%j0 = load i32, i32* %j
|
|
|
|
%v9 = and i32 %j0, 4
|
|
|
|
%v10 = icmp eq i32 %v9, 0
|
|
|
|
br i1 %v10, label %bb1, label %bb0, !prof !15 ; %v10 can't be hoisted above the above select
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%v11 = and i32 %j0, 8
|
|
|
|
%v12 = icmp eq i32 %v11, 0
|
|
|
|
br i1 %v12, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
ret i32 %sum2
|
|
|
|
}
|
|
|
|
|
|
|
|
; Branches aren't biased enough. No CHR happens.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0) // Not biased
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) != 0) // Not biased
|
|
|
|
; foo()
|
|
|
|
; ->
|
|
|
|
; (no change)
|
|
|
|
define void @test_chr_8(i32* %i) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_8(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[BB0:%.*]], !prof !17
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1]]
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2:%.*]], !prof !17
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !16
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb3, label %bb2, !prof !16
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; With an existing phi at the exit.
|
|
|
|
; Roughly,
|
|
|
|
; t = *i
|
|
|
|
; if ((t & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; if ((t & 2) != 0) { // Likely true
|
|
|
|
; t = *j
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; // There's a phi for t here.
|
|
|
|
; return t
|
|
|
|
; ->
|
|
|
|
; t = *i
|
|
|
|
; if ((t & 3) == 3) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; t = *j
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t & 2) != 0) {
|
|
|
|
; t = *j
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; // There's a phi for t here.
|
|
|
|
; return t
|
|
|
|
define i32 @test_chr_9(i32* %i, i32* %j) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_9(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP7]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[J]], align 4
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP3]], [[BB0]] ], [ [[TMP0]], [[BB1_NONCHR]] ], [ [[TMP8]], [[BB2_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: ret i32 [[TMP9]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%5 = load i32, i32* %j
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%6 = phi i32 [ %0, %bb1 ], [ %5, %bb2 ]
|
|
|
|
ret i32 %6
|
|
|
|
}
|
|
|
|
|
|
|
|
; With no phi at the exit, but the exit needs a phi inserted after CHR.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; t1 = *j
|
|
|
|
; if ((t0 & 2) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; return (t1 * 42) + (t1 - 99)
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 3) == 3) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; t1 = *j
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t0 & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((t0 & 2) != 0) {
|
|
|
|
; t1 = *j
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; // A new phi for t1 is inserted here.
|
|
|
|
; return (t1 * 42) + (t1 - 99)
|
|
|
|
define i32 @test_chr_10(i32* %i, i32* %j) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_10(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP5]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[J]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP3]], [[BB0]] ], [ [[TMP6]], [[BB2_NONCHR]] ], [ [[TMP6]], [[BB1_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 42
|
|
|
|
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP9]], -99
|
|
|
|
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
|
|
|
|
; CHECK-NEXT: ret i32 [[TMP12]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = load i32, i32* %j
|
|
|
|
%4 = and i32 %0, 2
|
|
|
|
%5 = icmp eq i32 %4, 0
|
|
|
|
br i1 %5, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%6 = mul i32 %3, 42
|
|
|
|
%7 = sub i32 %3, 99
|
|
|
|
%8 = add i32 %6, %7
|
|
|
|
ret i32 %8
|
|
|
|
}
|
|
|
|
|
|
|
|
; Test a case where there are two use-def chain paths to the same value (t0)
|
|
|
|
; from the branch condition. This is a regression test for an old bug that
|
|
|
|
; caused a bad hoisting that moves (hoists) a value (%conv) twice to the end of
|
|
|
|
; the %entry block (once for %div and once for %mul16) and put a use ahead of
|
|
|
|
; its definition like:
|
|
|
|
; %entry:
|
|
|
|
; ...
|
|
|
|
; %div = fdiv double 1.000000e+00, %conv
|
|
|
|
; %conv = sitofp i32 %0 to double
|
|
|
|
; %mul16 = fmul double %div, %conv
|
|
|
|
;
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; // there are two use-def paths from the branch condition to t0.
|
|
|
|
; if ((1.0 / t0) * t0 > 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0 & (1.0 / t0) * t0 > 0) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t0 & 1) != 0)
|
|
|
|
; foo()
|
|
|
|
; if ((1.0 / t0) * t0 > 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
define void @test_chr_11(i32* %i, i32 %x) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_11(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
|
|
|
|
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
|
|
|
|
; CHECK-NEXT: [[DIV:%.*]] = fdiv double 1.000000e+00, [[CONV]]
|
|
|
|
; CHECK-NEXT: [[MUL16:%.*]] = fmul double [[DIV]], [[CONV]]
|
|
|
|
; CHECK-NEXT: [[CONV717:%.*]] = fptosi double [[MUL16]] to i32
|
|
|
|
; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i32 [[CONV717]], 0
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[CMP18]]
|
|
|
|
; CHECK-NEXT: br i1 [[TMP3]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0_NONCHR:%.*]], label [[BB1_NONCHR:%.*]], !prof !18
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[CONV_NONCHR:%.*]] = sitofp i32 [[TMP0]] to double
|
|
|
|
; CHECK-NEXT: [[DIV_NONCHR:%.*]] = fdiv double 1.000000e+00, [[CONV_NONCHR]]
|
|
|
|
; CHECK-NEXT: [[MUL16_NONCHR:%.*]] = fmul double [[DIV_NONCHR]], [[CONV_NONCHR]]
|
|
|
|
; CHECK-NEXT: [[CONV717_NONCHR:%.*]] = fptosi double [[MUL16_NONCHR]] to i32
|
|
|
|
; CHECK-NEXT: [[CMP18_NONCHR:%.*]] = icmp slt i32 [[CONV717_NONCHR]], 1
|
|
|
|
; CHECK-NEXT: br i1 [[CMP18_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%conv = sitofp i32 %0 to double
|
|
|
|
%div = fdiv double 1.000000e+00, %conv
|
|
|
|
%mul16 = fmul double %div, %conv
|
|
|
|
%conv717 = fptosi double %mul16 to i32
|
|
|
|
%cmp18 = icmp slt i32 %conv717, 1
|
|
|
|
br i1 %cmp18, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects + unrelated br only
|
|
|
|
define i32 @test_chr_12(i32* %i, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_12(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 255
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB3:%.*]], label [[BB0:%.*]], !prof !16
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SUM0:%.*]], 42
|
|
|
|
; CHECK-NEXT: [[SUM1:%.*]] = select i1 [[TMP4]], i32 [[SUM0]], i32 [[TMP5]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SUM1]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[TMP7]], i32 [[SUM1]], i32 [[TMP8]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
|
|
|
|
; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP0]], 8
|
|
|
|
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
|
|
|
|
; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP10]], [[TMP12]]
|
|
|
|
; CHECK-NEXT: br i1 [[TMP13]], label [[BB1:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SUM2]], 88
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb0.split.nonchr:
|
|
|
|
; CHECK-NEXT: br i1 [[TMP10]], label [[BB1_NONCHR:%.*]], label [[BB3]], !prof !18
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP0]], 8
|
|
|
|
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
|
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88, !prof !16
|
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2]], [[SUM4_NONCHR_V]]
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[SUM0]], [[ENTRY:%.*]] ], [ [[TMP14]], [[BB1]] ], [ [[SUM2]], [[BB0_SPLIT_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: ret i32 [[SUM6]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 255
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb3, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%3 = and i32 %0, 1
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
%5 = add i32 %sum0, 42
|
|
|
|
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
|
|
|
|
%6 = and i32 %0, 2
|
|
|
|
%7 = icmp eq i32 %6, 0
|
|
|
|
%8 = add i32 %sum1, 43
|
|
|
|
%sum2 = select i1 %7, i32 %sum1, i32 %8, !prof !15
|
|
|
|
%9 = load i32, i32* %i
|
|
|
|
%10 = icmp eq i32 %9, 0
|
|
|
|
br i1 %10, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%sum3 = add i32 %sum2, 44
|
|
|
|
%11 = and i32 %0, 8
|
|
|
|
%12 = icmp eq i32 %11, 0
|
|
|
|
%13 = add i32 %sum3, 44
|
|
|
|
%sum4 = select i1 %12, i32 %sum3, i32 %13, !prof !15
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
|
|
|
|
ret i32 %sum6
|
|
|
|
}
|
|
|
|
|
|
|
|
; In the second CHR, a condition value depends on a trivial phi that's inserted
|
|
|
|
; by the first CHR.
|
|
|
|
; Roughly,
|
|
|
|
; i0 = *i
|
|
|
|
; v2 = (z != 1) ? pred : true // Likely false
|
|
|
|
; if (z == 0 & pred) // Likely false
|
|
|
|
; foo()
|
|
|
|
; j0 = *j
|
|
|
|
; sum2 = ((i0 & 2) == j0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; sum3 = (i0 == j0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; foo()
|
|
|
|
; if ((i0 & 4) != 0) // Unbiased
|
|
|
|
; foo()
|
|
|
|
; return i0 + sum3
|
|
|
|
; ->
|
|
|
|
; i0 = *i
|
|
|
|
; if (z != 1 & (z == 0 & pred)) // First CHR
|
|
|
|
; foo()
|
|
|
|
; // A trivial phi for i0 is inserted here by the first CHR (which gets removed
|
|
|
|
; // later) and the subsequent branch condition (for the second CHR) uses it.
|
|
|
|
; j0 = *j
|
|
|
|
; if ((i0 & 2) != j0 & i0 != j0) { // Second CHR
|
|
|
|
; sum3 = sum0 + 43
|
|
|
|
; foo()
|
|
|
|
; if ((i0 & 4) != 0)
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; sum3 = (i0 == j0) ? sum0 : (sum0 + 43)
|
|
|
|
; foo()
|
|
|
|
; if ((i0 & 4) != 0)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; return i0 + sum3
|
|
|
|
define i32 @test_chr_14(i32* %i, i32* %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_14(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V1:%.*]] = icmp ne i32 [[Z:%.*]], 1
|
|
|
|
; CHECK-NEXT: [[V0:%.*]] = icmp eq i32 [[Z]], 0
|
|
|
|
; CHECK-NEXT: [[V3_NONCHR:%.*]] = and i1 [[V0]], [[PRED:%.*]]
|
|
|
|
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[V1]], [[V3_NONCHR]]
|
|
|
|
; CHECK-NEXT: br i1 [[OR_COND]], label [[BB0_NONCHR:%.*]], label [[BB1:%.*]], !prof !19
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1]]
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
|
|
|
|
; CHECK-NEXT: [[V4:%.*]] = icmp ne i32 [[V6]], [[J0]]
|
|
|
|
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
|
|
|
|
; CHECK-NEXT: [[V5:%.*]] = icmp ne i32 [[I0]], [[J0]]
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[V4]], [[V5]]
|
|
|
|
; CHECK-NEXT: br i1 [[TMP0]], label [[BB1_SPLIT:%.*]], label [[BB1_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb1.split:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
|
|
|
|
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V10]], label [[BB3:%.*]], label [[BB2:%.*]]
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb1.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[V5_NONCHR:%.*]] = icmp eq i32 [[I0]], [[J0]]
|
|
|
|
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = select i1 [[V5_NONCHR]], i32 [[SUM0]], i32 [[V8]], !prof !16
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[I0]], 4
|
|
|
|
; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V10_NONCHR]], label [[BB3]], label [[BB2_NONCHR:%.*]]
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[V8]], [[BB2]] ], [ [[V8]], [[BB1_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB2_NONCHR]] ], [ [[SUM3_NONCHR]], [[BB1_SPLIT_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[TMP1]]
|
|
|
|
; CHECK-NEXT: ret i32 [[V11]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* %i
|
|
|
|
%v0 = icmp eq i32 %z, 0
|
|
|
|
%v1 = icmp ne i32 %z, 1
|
|
|
|
%v2 = select i1 %v1, i1 %pred, i1 true, !prof !15
|
|
|
|
%v3 = and i1 %v0, %pred
|
|
|
|
br i1 %v3, label %bb0, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%j0 = load i32, i32* %j
|
|
|
|
%v6 = and i32 %i0, 2
|
|
|
|
%v4 = icmp eq i32 %v6, %j0
|
|
|
|
%v8 = add i32 %sum0, 43
|
|
|
|
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
%v5 = icmp eq i32 %i0, %j0
|
|
|
|
%sum3 = select i1 %v5, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
call void @foo()
|
|
|
|
%v9 = and i32 %i0, 4
|
|
|
|
%v10 = icmp eq i32 %v9, 0
|
|
|
|
br i1 %v10, label %bb3, label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%v11 = add i32 %i0, %sum3
|
|
|
|
ret i32 %v11
|
|
|
|
}
|
|
|
|
|
|
|
|
; A branch or select depends on another select. No CHR happens.
|
|
|
|
; Roughly,
|
|
|
|
; i0 = *i
|
|
|
|
; if (z == 0 & ((z != 1) ? pred : true)) // Likely false
|
|
|
|
; foo()
|
|
|
|
; j0 = *j
|
|
|
|
; sum2 = ((i0 & 2) == j0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; sum3 = (i0 == sum2) ? sum2 : (sum0 + 43) // Likely false. This depends on the
|
|
|
|
; // previous select.
|
|
|
|
; foo()
|
|
|
|
; if ((i0 & 4) == 0) // Unbiased
|
|
|
|
; foo()
|
|
|
|
; return i0 + sum3
|
|
|
|
; ->
|
|
|
|
; (no change)
|
|
|
|
define i32 @test_chr_15(i32* %i, i32* %j, i32 %sum0, i1 %pred, i32 %z) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_15(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V0:%.*]] = icmp eq i32 [[Z:%.*]], 0
|
|
|
|
; CHECK-NEXT: [[V3:%.*]] = and i1 [[V0]], [[PRED:%.*]]
|
|
|
|
; CHECK-NEXT: br i1 [[V3]], label [[BB0:%.*]], label [[BB1:%.*]], !prof !16
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1]]
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
|
|
|
|
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]]
|
|
|
|
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
|
|
|
|
; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
|
|
|
|
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V10]], label [[BB3:%.*]], label [[BB2:%.*]]
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
2020-05-22 15:32:21 +08:00
|
|
|
; CHECK-NEXT: [[V5:%.*]] = icmp eq i32 [[I0]], [[SUM2]]
|
|
|
|
; CHECK-NEXT: [[SUM3:%.*]] = select i1 [[V5]], i32 [[SUM2]], i32 [[V8]], !prof !16
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[SUM3]]
|
|
|
|
; CHECK-NEXT: ret i32 [[V11]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* %i
|
|
|
|
%v0 = icmp eq i32 %z, 0
|
|
|
|
%v1 = icmp ne i32 %z, 1
|
|
|
|
%v2 = select i1 %v1, i1 %pred, i1 true, !prof !15
|
|
|
|
%v3 = and i1 %v0, %v2
|
|
|
|
br i1 %v3, label %bb0, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%j0 = load i32, i32* %j
|
|
|
|
%v6 = and i32 %i0, 2
|
|
|
|
%v4 = icmp eq i32 %v6, %j0
|
|
|
|
%v8 = add i32 %sum0, 43
|
|
|
|
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
%v5 = icmp eq i32 %i0, %sum2
|
|
|
|
%sum3 = select i1 %v5, i32 %sum2, i32 %v8, !prof !15
|
|
|
|
call void @foo()
|
|
|
|
%v9 = and i32 %i0, 4
|
|
|
|
%v10 = icmp eq i32 %v9, 0
|
|
|
|
br i1 %v10, label %bb3, label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%v11 = add i32 %i0, %sum3
|
|
|
|
ret i32 %v11
|
|
|
|
}
|
|
|
|
|
|
|
|
; With an existing phi at the exit but a value (%v40) is both alive and is an
|
|
|
|
; operand to a phi at the exit block.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; v40 = t0 + 44
|
|
|
|
; if ((t0 & 2) != 0) { // Likely true
|
|
|
|
; v41 = t0 + 99
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; v42 = phi v40, v41
|
|
|
|
; return v42 + v40
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 3) == 3) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; v40 = t0 + 44
|
|
|
|
; v41 = t0 + 99
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; if ((t0 & 1) != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; v40_nc = t0 + 44
|
|
|
|
; if ((t0 & 2) != 0) { // Likely true
|
|
|
|
; v41_nc = t0 + 99
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; t7 = phi v40, v40_nc
|
|
|
|
; v42 = phi v41, v41_nc
|
|
|
|
; v43 = v42 + t7
|
|
|
|
; return v43
|
|
|
|
define i32 @test_chr_16(i32* %i) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_16(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 3
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 3
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[V40:%.*]] = add i32 [[TMP0]], 44
|
|
|
|
; CHECK-NEXT: [[V41:%.*]] = add i32 [[TMP0]], 99
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB1_NONCHR:%.*]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1_NONCHR]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[V40_NONCHR:%.*]] = add i32 [[TMP0]], 44
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: [[V41_NONCHR:%.*]] = add i32 [[TMP0]], 99
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[V40]], [[BB0]] ], [ [[V40_NONCHR]], [[BB2_NONCHR]] ], [ [[V40_NONCHR]], [[BB1_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[V42:%.*]] = phi i32 [ [[V41]], [[BB0]] ], [ [[V41_NONCHR]], [[BB2_NONCHR]] ], [ [[V40_NONCHR]], [[BB1_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[V43:%.*]] = add i32 [[V42]], [[TMP7]]
|
|
|
|
; CHECK-NEXT: ret i32 [[V43]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%v40 = add i32 %0, 44
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%v41 = add i32 %0, 99
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%v42 = phi i32 [ %v41, %bb2 ], [ %v40, %bb1 ]
|
|
|
|
%v43 = add i32 %v42, %v40
|
|
|
|
ret i32 %v43
|
|
|
|
}
|
|
|
|
|
|
|
|
; Two consecutive regions have an entry in the middle of them. No CHR happens.
|
|
|
|
; Roughly,
|
|
|
|
; if ((i & 4) != 0) {
|
|
|
|
; if (!j)
|
|
|
|
; goto bb1
|
|
|
|
; } else {
|
|
|
|
; t0 = (i & 1)
|
|
|
|
; if (t0 != 0) // Likely true
|
|
|
|
; foo()
|
|
|
|
; s = (i & 1) + i
|
|
|
|
; }
|
|
|
|
; bb1:
|
|
|
|
; p = phi i, t0, s
|
|
|
|
; if ((i & 2) != 0) { // Likely true
|
|
|
|
; foo()
|
|
|
|
; q = p + (i & 2)
|
|
|
|
; }
|
|
|
|
; r = phi p, q, i
|
|
|
|
; return r
|
|
|
|
; ->
|
|
|
|
; (no change)
|
|
|
|
define i32 @test_chr_17(i32 %i, i1 %j) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_17(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[V0:%.*]] = and i32 [[I:%.*]], 4
|
|
|
|
; CHECK-NEXT: [[V1:%.*]] = icmp eq i32 [[V0]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[V1]], label [[BBE:%.*]], label [[BBQ:%.*]]
|
|
|
|
; CHECK: bbq:
|
|
|
|
; CHECK-NEXT: br i1 [[J:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
|
|
|
|
; CHECK: bbe:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I]], 1
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1]], label [[BB0:%.*]], !prof !16
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[S:%.*]] = add i32 [[TMP0]], [[I]]
|
|
|
|
; CHECK-NEXT: br label [[BB1]]
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[I]], [[BBQ]] ], [ [[TMP0]], [[BBE]] ], [ [[S]], [[BB0]] ]
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[I]], 2
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP3]], label [[BB3]], label [[BB2:%.*]], !prof !16
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: [[Q:%.*]] = add i32 [[P]], [[TMP2]]
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[P]], [[BB1]] ], [ [[Q]], [[BB2]] ], [ [[I]], [[BBQ]] ]
|
|
|
|
; CHECK-NEXT: ret i32 [[R]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%v0 = and i32 %i, 4
|
|
|
|
%v1 = icmp eq i32 %v0, 0
|
|
|
|
br i1 %v1, label %bbe, label %bbq
|
|
|
|
|
|
|
|
bbq:
|
|
|
|
br i1 %j, label %bb3, label %bb1
|
|
|
|
|
|
|
|
bbe:
|
|
|
|
%0 = and i32 %i, 1
|
|
|
|
%1 = icmp eq i32 %0, 0
|
|
|
|
br i1 %1, label %bb1, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
%s = add i32 %0, %i
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%p = phi i32 [ %i, %bbq ], [ %0, %bbe ], [ %s, %bb0 ]
|
|
|
|
%2 = and i32 %i, 2
|
|
|
|
%3 = icmp eq i32 %2, 0
|
|
|
|
br i1 %3, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
%q = add i32 %p, %2
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%r = phi i32 [ %p, %bb1 ], [ %q, %bb2 ], [ %i, %bbq ]
|
|
|
|
ret i32 %r
|
|
|
|
}
|
|
|
|
|
|
|
|
; Select + br, there's a loop and we need to update the user of an inserted phi
|
|
|
|
; at the entry block. This is a regression test for a bug that's fixed.
|
|
|
|
; Roughly,
|
|
|
|
; do {
|
|
|
|
; inc1 = phi inc2, 0
|
|
|
|
; li = *i
|
|
|
|
; sum1 = sum0 + 42
|
|
|
|
; sum2 = ((li & 1) == 0) ? sum0 : sum1 // Likely false
|
|
|
|
; inc2 = inc1 + 1
|
|
|
|
; if ((li & 4) != 0) // Likely true
|
|
|
|
; sum3 = sum2 + 44
|
|
|
|
; sum4 = phi sum1, sum3
|
|
|
|
; } while (inc2 != 100) // Likely true (loop back)
|
|
|
|
; return sum4
|
|
|
|
; ->
|
|
|
|
; do {
|
|
|
|
; inc1 = phi tmp2, 0 // The first operand needed to be updated
|
|
|
|
; li = *i
|
|
|
|
; sum1 = sum0 + 42
|
|
|
|
; if ((li & 5) == 5) { // Likely true
|
|
|
|
; inc2 = inc1 + 1
|
|
|
|
; sum3 = sum0 + 86
|
|
|
|
; } else {
|
|
|
|
; inc2_nc = inc1 + 1
|
|
|
|
; if ((li & 4) != 0)
|
|
|
|
; sum2_nc = ((li & 1) == 0) ? sum0 : sum1
|
|
|
|
; sum3_nc = sum2_nc + 44
|
|
|
|
; }
|
|
|
|
; tmp2 = phi inc2, inc2_nc
|
|
|
|
; sum4 = phi sum3, sum3_nc, sum1
|
|
|
|
; } while (tmp2 != 100)
|
|
|
|
; return sum4
|
|
|
|
define i32 @test_chr_18(i32* %i, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_18(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: br label [[BB0:%.*]]
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: [[INC1:%.*]] = phi i32 [ [[TMP2:%.*]], [[BB2:%.*]] ], [ 0, [[ENTRY:%.*]] ]
|
|
|
|
; CHECK-NEXT: [[LI:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[SUM1:%.*]] = add i32 [[SUM0:%.*]], 42
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[LI]], 5
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 5
|
|
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0.split:
|
|
|
|
; CHECK-NEXT: [[INC2:%.*]] = add i32 [[INC1]], 1
|
|
|
|
; CHECK-NEXT: [[SUM3:%.*]] = add i32 [[SUM0]], 86
|
|
|
|
; CHECK-NEXT: br label [[BB2]]
|
|
|
|
; CHECK: bb0.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[A4_NONCHR:%.*]] = and i32 [[LI]], 4
|
|
|
|
; CHECK-NEXT: [[CMP4_NONCHR:%.*]] = icmp eq i32 [[A4_NONCHR]], 0
|
|
|
|
; CHECK-NEXT: [[INC2_NONCHR:%.*]] = add i32 [[INC1]], 1
|
|
|
|
; CHECK-NEXT: br i1 [[CMP4_NONCHR]], label [[BB2]], label [[BB1_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[A1:%.*]] = and i32 [[LI]], 1
|
|
|
|
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A1]], 0
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[CMP1]], i32 [[SUM0]], i32 [[SUM1]], !prof !16
|
|
|
|
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], 44
|
|
|
|
; CHECK-NEXT: br label [[BB2]]
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: [[TMP2]] = phi i32 [ [[INC2]], [[BB0_SPLIT]] ], [ [[INC2_NONCHR]], [[BB1_NONCHR]] ], [ [[INC2_NONCHR]], [[BB0_SPLIT_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[SUM4:%.*]] = phi i32 [ [[SUM3]], [[BB0_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB1_NONCHR]] ], [ [[SUM1]], [[BB0_SPLIT_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 100
|
|
|
|
; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB0]], !prof !16
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: ret i32 [[SUM4]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
br label %bb0
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%inc1 = phi i32 [ %inc2, %bb2 ], [ 0, %entry ]
|
|
|
|
%li = load i32, i32* %i
|
|
|
|
%a1 = and i32 %li, 1
|
|
|
|
%cmp1 = icmp eq i32 %a1, 0
|
|
|
|
%sum1 = add i32 %sum0, 42
|
|
|
|
%sum2 = select i1 %cmp1, i32 %sum0, i32 %sum1, !prof !15
|
|
|
|
%a4 = and i32 %li, 4
|
|
|
|
%cmp4 = icmp eq i32 %a4, 0
|
|
|
|
%inc2 = add i32 %inc1, 1
|
|
|
|
br i1 %cmp4, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%sum3 = add i32 %sum2, 44
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum4 = phi i32 [ %sum1, %bb0 ], [ %sum3, %bb1 ]
|
|
|
|
%cmp = icmp eq i32 %inc2, 100
|
|
|
|
br i1 %cmp, label %bb3, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
ret i32 %sum4
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
; Selects + Brs. Those share the condition value, which causes the
|
|
|
|
; targets/operands of the branch/select to be flipped.
|
|
|
|
; Roughly,
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 255) != 0) { // Likely true
|
|
|
|
; sum1 = ((t0 & 1) == 0) ? sum0 : (sum0 + 42) // Likely false
|
|
|
|
; sum2 = ((t0 & 1) == 0) ? sum1 : (sum1 + 42) // Likely false
|
|
|
|
; if ((t0 & 1) != 0) { // Likely true
|
|
|
|
; sum3 = sum2 + 44
|
|
|
|
; sum4 = ((t0 & 8) == 0) ? sum3 : (sum3 + 44) // Likely false
|
|
|
|
; }
|
|
|
|
; sum5 = phi sum2, sum4
|
|
|
|
; }
|
|
|
|
; sum6 = phi sum0, sum5
|
|
|
|
; return sum6
|
|
|
|
; ->
|
|
|
|
; t0 = *i
|
|
|
|
; if ((t0 & 9) == 9) { // Likely true
|
|
|
|
; tmp3 = sum0 + 85 // Dead
|
|
|
|
; tmp4 = sum0 + 173
|
|
|
|
; } else {
|
|
|
|
; if ((t0 & 255) != 0) {
|
|
|
|
; sum2_nc = ((t0 & 1) == 0) ? sum0 : (sum0 + 85)
|
|
|
|
; sum4_nc_v = ((t0 & 8) == 0) ? 44 : 88
|
|
|
|
; sum4_nc = add sum2_nc + sum4_nc_v
|
|
|
|
; }
|
|
|
|
; }
|
|
|
|
; sum6 = phi tmp4, sum0, sum2_nc, sum4_nc
|
|
|
|
; return sum6
|
|
|
|
define i32 @test_chr_19(i32* %i, i32 %sum0) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_19(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 9
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 9
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB0:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUM0:%.*]], 85
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SUM0]], 173
|
|
|
|
; CHECK-NEXT: br label [[BB3:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP0]], 255
|
|
|
|
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP6]], label [[BB3]], label [[BB0_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb0.nonchr:
|
|
|
|
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0
|
|
|
|
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM0]], 85
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM0]], i32 [[TMP9]], !prof !16
|
|
|
|
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 8
|
|
|
|
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even thought that would have total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of it's inputs are available: https://godbolt.org/z/zgHePf
So i don't see why the existing behavior is the correct one.
Also, let's add it's own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP11]], i32 44, i32 88
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]]
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even thought that would have total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of it's inputs are available: https://godbolt.org/z/zgHePf
So i don't see why the existing behavior is the correct one.
Also, let's add it's own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB cost
Summary:
Previously, if the threshold was 2, we were willing to speculatively
execute 2 cheap instructions in both basic blocks (thus we were willing
to speculatively execute cost = 4), but weren't willing to speculate
when one BB had 3 instructions and other one had no instructions,
even thought that would have total cost of 3.
This looks inconsistent to me.
I don't think `cmov`-like instructions will start executing
until both of it's inputs are available: https://godbolt.org/z/zgHePf
So i don't see why the existing behavior is the correct one.
Also, let's add it's own `cl::opt` for this threshold,
with default=4, so it is not stricter than the previous threshold:
will allow to fold when there are 2 BB's each with cost=2.
And since the logic has changed, it will also allow to fold when
one BB has cost=3 and other cost=1, or there is only one BB with cost=4.
This is an alternative solution to D65148:
This fix is mainly motivated by `signbit-like-value-extension.ll` test.
That pattern comes up in JPEG decoding, see e.g.
`Figure F.12 – Extending the sign bit of a decoded value in V`
of `ITU T.81` (JPEG specification).
That branch is not predictable, and it is within the innermost loop,
so the fact that that pattern ends up being stuck with a branch
instead of `select` (i.e. `CMOV` for x86) is unlikely to be beneficial.
This has great results on the final assembly (vanilla test-suite + RawSpeed): (metric pass - D67240)
| metric | old | new | delta | % |
| x86-mi-counting.NumMachineFunctions | 37720 | 37721 | 1 | 0.00% |
| x86-mi-counting.NumMachineBasicBlocks | 773545 | 771181 | -2364 | -0.31% |
| x86-mi-counting.NumMachineInstructions | 7488843 | 7486442 | -2401 | -0.03% |
| x86-mi-counting.NumUncondBR | 135770 | 135543 | -227 | -0.17% |
| x86-mi-counting.NumCondBR | 423753 | 422187 | -1566 | -0.37% |
| x86-mi-counting.NumCMOV | 24815 | 25731 | 916 | 3.69% |
| x86-mi-counting.NumVecBlend | 17 | 17 | 0 | 0.00% |
We significantly decrease basic block count, notably decrease instruction count,
significantly decrease branch count and very significantly increase `cmov` count.
Performance-wise, unsurprisingly, this has great effect on
target RawSpeed benchmark. I'm seeing 5 **major** improvements:
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_mean -0.3064 -0.3064 226.9913 157.4452 226.9800 157.4384
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_median -0.3057 -0.3057 226.8407 157.4926 226.8282 157.4828
Samsung/NX3000/_3184416.SRW/threads:8/process_time/real_time_stddev -0.4985 -0.4954 0.3051 0.1530 0.3040 0.1534
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_mean -0.1747 -0.1747 80.4787 66.4227 80.4771 66.4146
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_median -0.1742 -0.1743 80.4686 66.4542 80.4690 66.4436
Kodak/DCS760C/86L57188.DCR/threads:8/process_time/real_time_stddev +0.6089 +0.5797 0.0670 0.1078 0.0673 0.1062
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_mean -0.1598 -0.1598 171.6996 144.2575 171.6915 144.2538
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_median -0.1598 -0.1597 171.7109 144.2755 171.7018 144.2766
Sony/DSLR-A230/DSC08026.ARW/threads:8/process_time/real_time_stddev +0.4024 +0.3850 0.0847 0.1187 0.0848 0.1175
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_mean -0.0550 -0.0551 280.3046 264.8800 280.3017 264.8559
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_median -0.0554 -0.0554 280.2628 264.7360 280.2574 264.7297
Canon/EOS 77D/IMG_4049.CR2/threads:8/process_time/real_time_stddev +0.7005 +0.7041 0.2779 0.4725 0.2775 0.4729
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 49 vs 49
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_mean -0.0354 -0.0355 316.7396 305.5208 316.7342 305.4890
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_median -0.0354 -0.0356 316.6969 305.4798 316.6917 305.4324
Canon/EOS 5DS/2K4A9929.CR2/threads:8/process_time/real_time_stddev +0.0493 +0.0330 0.3562 0.3737 0.3563 0.3681
```
That being said, it's always best-effort, so there will likely
be cases where this worsens things.
Reviewers: efriedma, craig.topper, dmgreen, jmolloy, fhahn, Carrot, hfinkel, chandlerc
Reviewed By: jmolloy
Subscribers: xbolva00, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D67318
llvm-svn: 372009
2019-09-17 00:18:24 +08:00
|
|
|
; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ]
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK-NEXT: ret i32 [[SUM6]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 255
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb3, label %bb0, !prof !15
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
%3 = and i32 %0, 1
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
%5 = add i32 %sum0, 42
|
|
|
|
%sum1 = select i1 %4, i32 %sum0, i32 %5, !prof !15
|
|
|
|
%6 = add i32 %sum1, 43
|
|
|
|
%sum2 = select i1 %4, i32 %sum1, i32 %6, !prof !15
|
|
|
|
br i1 %4, label %bb2, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%sum3 = add i32 %sum2, 44
|
|
|
|
%7 = and i32 %0, 8
|
|
|
|
%8 = icmp eq i32 %7, 0
|
|
|
|
%9 = add i32 %sum3, 44
|
|
|
|
%sum4 = select i1 %8, i32 %sum3, i32 %9, !prof !15
|
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
%sum5 = phi i32 [ %sum2, %bb0 ], [ %sum4, %bb1 ]
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%sum6 = phi i32 [ %sum0, %entry ], [ %sum5, %bb2 ]
|
|
|
|
ret i32 %sum6
|
|
|
|
}
|
|
|
|
|
|
|
|
; Selects. The exit block, which belongs to the top-level region, has a select
|
|
|
|
; and causes the top-level region to be the outermost CHR scope with the
|
|
|
|
; subscope that includes the entry block with two selects. The outermost CHR
|
|
|
|
; scope doesn't see the selects in the entry block as the entry block is in the
|
|
|
|
; subscope and incorrectly sets the CHR hoist point to the branch rather than
|
|
|
|
; the first select in the entry block and causes the CHR'ed selects ("select i1
|
|
|
|
; false...") to incorrectly position above the CHR branch. This is testing
|
|
|
|
; against a quirk of how the region analysis handles the entry block.
|
|
|
|
; Roughly,
|
|
|
|
; i0 = *i
|
|
|
|
; sum2 = ((i0 & 2) == 0) ? sum0 : (sum0 + 43) // Likely false
|
|
|
|
; sum3 = ((i0 & 4) == 0) ? sum2 : (sum2 + 44) // Likely false
|
|
|
|
; if (j)
|
|
|
|
; foo()
|
|
|
|
; i5 = *i
|
|
|
|
; v13 = (i5 == 44) ? i5 : sum3
|
|
|
|
; return v13
|
|
|
|
; ->
|
|
|
|
; i0 = *i
|
|
|
|
; if ((i0 & 6) == 6) { // Likely true
|
|
|
|
; v9 = sum0 + 87
|
|
|
|
; if (j)
|
|
|
|
; foo()
|
|
|
|
; } else {
|
|
|
|
; sum2.nc = ((i0 & 2) == 0) ? sum0 : (sum0 + 43)
|
|
|
|
; sum3.nc = ((i0 & 4) == 0) ? sum2.nc : (sum2.nc + 44)
|
|
|
|
; if (j)
|
|
|
|
; foo()
|
|
|
|
; }
|
|
|
|
; t2 = phi v9, sum3.nc
|
|
|
|
; i5 = *i
|
|
|
|
; v13 = (i5 == 44) ? 44 : t2
|
|
|
|
; return v13
|
|
|
|
define i32 @test_chr_20(i32* %i, i32 %sum0, i1 %j) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_20(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[I0]], 6
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 6
|
|
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: entry.split:
|
|
|
|
; CHECK-NEXT: [[V9:%.*]] = add i32 [[SUM0:%.*]], 87
|
|
|
|
; CHECK-NEXT: br i1 [[J:%.*]], label [[BB1:%.*]], label [[BB4:%.*]]
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB4]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0]], 43
|
|
|
|
; CHECK-NEXT: [[V3:%.*]] = and i32 [[I0]], 2
|
|
|
|
; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V3]], 0
|
|
|
|
; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof !16
|
|
|
|
; CHECK-NEXT: [[V6_NONCHR:%.*]] = and i32 [[I0]], 4
|
|
|
|
; CHECK-NEXT: [[V5_NONCHR:%.*]] = icmp eq i32 [[V6_NONCHR]], 0
|
|
|
|
; CHECK-NEXT: [[V9_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], 44
|
|
|
|
; CHECK-NEXT: [[SUM3_NONCHR:%.*]] = select i1 [[V5_NONCHR]], i32 [[SUM2_NONCHR]], i32 [[V9_NONCHR]], !prof !16
|
|
|
|
; CHECK-NEXT: br i1 [[J]], label [[BB1_NONCHR:%.*]], label [[BB4]]
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB4]]
|
|
|
|
; CHECK: bb4:
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[V9]], [[BB1]] ], [ [[V9]], [[ENTRY_SPLIT]] ], [ [[SUM3_NONCHR]], [[BB1_NONCHR]] ], [ [[SUM3_NONCHR]], [[ENTRY_SPLIT_NONCHR]] ]
|
|
|
|
; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[I]], align 4
|
|
|
|
; CHECK-NEXT: [[V12:%.*]] = icmp eq i32 [[I5]], 44
|
|
|
|
; CHECK-NEXT: [[V13:%.*]] = select i1 [[V12]], i32 44, i32 [[TMP2]], !prof !16
|
|
|
|
; CHECK-NEXT: ret i32 [[V13]]
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* %i
|
|
|
|
%v3 = and i32 %i0, 2
|
|
|
|
%v4 = icmp eq i32 %v3, 0
|
|
|
|
%v8 = add i32 %sum0, 43
|
|
|
|
%sum2 = select i1 %v4, i32 %sum0, i32 %v8, !prof !15
|
|
|
|
%v6 = and i32 %i0, 4
|
|
|
|
%v5 = icmp eq i32 %v6, 0
|
|
|
|
%v9 = add i32 %sum2, 44
|
|
|
|
%sum3 = select i1 %v5, i32 %sum2, i32 %v9, !prof !15
|
|
|
|
br i1 %j, label %bb1, label %bb4
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb4
|
|
|
|
|
|
|
|
bb4:
|
|
|
|
%i5 = load i32, i32* %i
|
|
|
|
%v12 = icmp eq i32 %i5, 44
|
|
|
|
%v13 = select i1 %v12, i32 %i5, i32 %sum3, !prof !15
|
|
|
|
ret i32 %v13
|
|
|
|
}
|
|
|
|
|
2019-05-02 06:49:52 +08:00
|
|
|
; Test the case where two scopes share a common instruction to hoist (%cmp.i).
|
|
|
|
; Two scopes would hoist it to their hoist points, but since the outer scope
|
|
|
|
; (entry/bb6-9) hoists it first to its hoist point, it'd be wrong (causing bad
|
|
|
|
; IR) for the inner scope (bb1-4) to hoist the same instruction to its hoist
|
|
|
|
; point.
|
|
|
|
; Roughly,
|
|
|
|
; if (j != k) {
|
|
|
|
; if (i != 2)
|
|
|
|
; foo();
|
|
|
|
; cmp.i = i == 86
|
|
|
|
; if (!cmp.i)
|
|
|
|
; foo();
|
|
|
|
; if (j != i)
|
|
|
|
; foo();
|
|
|
|
; if (!cmp.i)
|
|
|
|
; foo();
|
|
|
|
; }
|
|
|
|
; return 45;
|
|
|
|
define i32 @test_chr_21(i64 %i, i64 %k, i64 %j) !prof !14 {
|
|
|
|
; CHECK-LABEL: @test_chr_21(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i64 [[J:%.*]], [[K:%.*]]
|
|
|
|
; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[J]], [[I:%.*]]
|
|
|
|
; CHECK-NEXT: [[CMP_I:%.*]] = icmp ne i64 [[I]], 86
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[CMP0]], [[CMP3]]
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[CMP_I]]
|
|
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof !15
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[I]], 2
|
|
|
|
; CHECK-NEXT: switch i64 [[I]], label [[BB2:%.*]] [
|
|
|
|
; CHECK-NEXT: i64 2, label [[BB3_NONCHR2:%.*]]
|
|
|
|
; CHECK-NEXT: i64 86, label [[BB2_NONCHR1:%.*]]
|
|
|
|
; CHECK-NEXT: ], !prof !20
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB7:%.*]]
|
|
|
|
; CHECK: bb2.nonchr1:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3_NONCHR2]]
|
|
|
|
; CHECK: bb3.nonchr2:
|
|
|
|
; CHECK-NEXT: br i1 [[CMP_I]], label [[BB4_NONCHR3:%.*]], label [[BB7]], !prof !18
|
|
|
|
; CHECK: bb4.nonchr3:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB7]]
|
|
|
|
; CHECK: bb7:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB10:%.*]]
|
|
|
|
; CHECK: entry.split.nonchr:
|
|
|
|
; CHECK-NEXT: br i1 [[CMP0]], label [[BB1_NONCHR:%.*]], label [[BB10]], !prof !18
|
|
|
|
; CHECK: bb1.nonchr:
|
|
|
|
; CHECK-NEXT: [[CMP2_NONCHR:%.*]] = icmp eq i64 [[I]], 2
|
|
|
|
; CHECK-NEXT: br i1 [[CMP2_NONCHR]], label [[BB3_NONCHR:%.*]], label [[BB2_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb3.nonchr:
|
|
|
|
; CHECK-NEXT: [[CMP_I_NONCHR:%.*]] = icmp eq i64 [[I]], 86
|
|
|
|
; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB6_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb6.nonchr:
|
|
|
|
; CHECK-NEXT: [[CMP3_NONCHR:%.*]] = icmp eq i64 [[J]], [[I]]
|
|
|
|
; CHECK-NEXT: br i1 [[CMP3_NONCHR]], label [[BB8_NONCHR:%.*]], label [[BB7_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb8.nonchr:
|
|
|
|
; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB10]], label [[BB9_NONCHR:%.*]], !prof !16
|
|
|
|
; CHECK: bb9.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB10]]
|
|
|
|
; CHECK: bb7.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB8_NONCHR]]
|
|
|
|
; CHECK: bb4.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB6_NONCHR]]
|
|
|
|
; CHECK: bb2.nonchr:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3_NONCHR]]
|
|
|
|
; CHECK: bb10:
|
|
|
|
; CHECK-NEXT: ret i32 45
|
|
|
|
;
|
|
|
|
entry:
|
|
|
|
%cmp0 = icmp eq i64 %j, %k
|
|
|
|
br i1 %cmp0, label %bb10, label %bb1, !prof !15
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%cmp2 = icmp eq i64 %i, 2
|
|
|
|
br i1 %cmp2, label %bb3, label %bb2, !prof !15
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
%cmp.i = icmp eq i64 %i, 86
|
|
|
|
br i1 %cmp.i, label %bb5, label %bb4, !prof !15
|
|
|
|
|
|
|
|
bb4:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb5
|
|
|
|
|
|
|
|
bb5:
|
|
|
|
br label %bb6
|
|
|
|
|
|
|
|
bb6:
|
|
|
|
%cmp3 = icmp eq i64 %j, %i
|
|
|
|
br i1 %cmp3, label %bb8, label %bb7, !prof !15
|
|
|
|
|
|
|
|
bb7:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb8
|
|
|
|
|
|
|
|
bb8:
|
|
|
|
br i1 %cmp.i, label %bb10, label %bb9, !prof !15
|
|
|
|
|
|
|
|
bb9:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb10
|
|
|
|
|
|
|
|
bb10:
|
|
|
|
ret i32 45
|
|
|
|
}
|
|
|
|
|
2019-05-23 02:37:34 +08:00
|
|
|
; Test a case with really long use-def chains. This test checks that it's not
|
|
|
|
; really slow and doesn't appear to be hanging.
|
|
|
|
define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 {
|
2020-05-20 13:02:55 +08:00
|
|
|
; CHECK-LABEL: @test_chr_22(
|
|
|
|
; CHECK-NEXT: bb0:
|
2020-05-22 23:37:58 +08:00
|
|
|
; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1
|
|
|
|
; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3
|
2020-05-20 13:02:55 +08:00
|
|
|
; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991
|
|
|
|
; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4
|
|
|
|
; CHECK-NEXT: ret i64 99
|
|
|
|
;
|
2019-05-23 02:37:34 +08:00
|
|
|
bb0:
|
|
|
|
%v1 = add i64 %v0, 3
|
|
|
|
%v2 = add i64 %v1, %v0
|
|
|
|
%c1 = icmp sgt i64 %v2, 99
|
|
|
|
%v3 = select i1 %c1, i64 %v1, i64 %v2, !prof !15
|
|
|
|
%v4 = add i64 %v2, %v2
|
|
|
|
%v5 = add i64 %v4, %v2
|
|
|
|
%v6 = add i64 %v5, %v4
|
|
|
|
%v7 = add i64 %v6, %v5
|
|
|
|
%v8 = add i64 %v7, %v6
|
|
|
|
%v9 = add i64 %v8, %v7
|
|
|
|
%v10 = add i64 %v9, %v8
|
|
|
|
%v11 = add i64 %v10, %v9
|
|
|
|
%v12 = add i64 %v11, %v10
|
|
|
|
%v13 = add i64 %v12, %v11
|
|
|
|
%v14 = add i64 %v13, %v12
|
|
|
|
%v15 = add i64 %v14, %v13
|
|
|
|
%v16 = add i64 %v15, %v14
|
|
|
|
%v17 = add i64 %v16, %v15
|
|
|
|
%v18 = add i64 %v17, %v16
|
|
|
|
%v19 = add i64 %v18, %v17
|
|
|
|
%v20 = add i64 %v19, %v18
|
|
|
|
%v21 = add i64 %v20, %v19
|
|
|
|
%v22 = add i64 %v21, %v20
|
|
|
|
%v23 = add i64 %v22, %v21
|
|
|
|
%v24 = add i64 %v23, %v22
|
|
|
|
%v25 = add i64 %v24, %v23
|
|
|
|
%v26 = add i64 %v25, %v24
|
|
|
|
%v27 = add i64 %v26, %v25
|
|
|
|
%v28 = add i64 %v27, %v26
|
|
|
|
%v29 = add i64 %v28, %v27
|
|
|
|
%v30 = add i64 %v29, %v28
|
|
|
|
%v31 = add i64 %v30, %v29
|
|
|
|
%v32 = add i64 %v31, %v30
|
|
|
|
%v33 = add i64 %v32, %v31
|
|
|
|
%v34 = add i64 %v33, %v32
|
|
|
|
%v35 = add i64 %v34, %v33
|
|
|
|
%v36 = add i64 %v35, %v34
|
|
|
|
%v37 = add i64 %v36, %v35
|
|
|
|
%v38 = add i64 %v37, %v36
|
|
|
|
%v39 = add i64 %v38, %v37
|
|
|
|
%v40 = add i64 %v39, %v38
|
|
|
|
%v41 = add i64 %v40, %v39
|
|
|
|
%v42 = add i64 %v41, %v40
|
|
|
|
%v43 = add i64 %v42, %v41
|
|
|
|
%v44 = add i64 %v43, %v42
|
|
|
|
%v45 = add i64 %v44, %v43
|
|
|
|
%v46 = add i64 %v45, %v44
|
|
|
|
%v47 = add i64 %v46, %v45
|
|
|
|
%v48 = add i64 %v47, %v46
|
|
|
|
%v49 = add i64 %v48, %v47
|
|
|
|
%v50 = add i64 %v49, %v48
|
|
|
|
%v51 = add i64 %v50, %v49
|
|
|
|
%v52 = add i64 %v51, %v50
|
|
|
|
%v53 = add i64 %v52, %v51
|
|
|
|
%v54 = add i64 %v53, %v52
|
|
|
|
%v55 = add i64 %v54, %v53
|
|
|
|
%v56 = add i64 %v55, %v54
|
|
|
|
%v57 = add i64 %v56, %v55
|
|
|
|
%v58 = add i64 %v57, %v56
|
|
|
|
%v59 = add i64 %v58, %v57
|
|
|
|
%v60 = add i64 %v59, %v58
|
|
|
|
%v61 = add i64 %v60, %v59
|
|
|
|
%v62 = add i64 %v61, %v60
|
|
|
|
%v63 = add i64 %v62, %v61
|
|
|
|
%v64 = add i64 %v63, %v62
|
|
|
|
%v65 = add i64 %v64, %v63
|
|
|
|
%v66 = add i64 %v65, %v64
|
|
|
|
%v67 = add i64 %v66, %v65
|
|
|
|
%v68 = add i64 %v67, %v66
|
|
|
|
%v69 = add i64 %v68, %v67
|
|
|
|
%v70 = add i64 %v69, %v68
|
|
|
|
%v71 = add i64 %v70, %v69
|
|
|
|
%v72 = add i64 %v71, %v70
|
|
|
|
%v73 = add i64 %v72, %v71
|
|
|
|
%v74 = add i64 %v73, %v72
|
|
|
|
%v75 = add i64 %v74, %v73
|
|
|
|
%v76 = add i64 %v75, %v74
|
|
|
|
%v77 = add i64 %v76, %v75
|
|
|
|
%v78 = add i64 %v77, %v76
|
|
|
|
%v79 = add i64 %v78, %v77
|
|
|
|
%v80 = add i64 %v79, %v78
|
|
|
|
%v81 = add i64 %v80, %v79
|
|
|
|
%v82 = add i64 %v81, %v80
|
|
|
|
%v83 = add i64 %v82, %v81
|
|
|
|
%v84 = add i64 %v83, %v82
|
|
|
|
%v85 = add i64 %v84, %v83
|
|
|
|
%v86 = add i64 %v85, %v84
|
|
|
|
%v87 = add i64 %v86, %v85
|
|
|
|
%v88 = add i64 %v87, %v86
|
|
|
|
%v89 = add i64 %v88, %v87
|
|
|
|
%v90 = add i64 %v89, %v88
|
|
|
|
%v91 = add i64 %v90, %v89
|
|
|
|
%v92 = add i64 %v91, %v90
|
|
|
|
%v93 = add i64 %v92, %v91
|
|
|
|
%v94 = add i64 %v93, %v92
|
|
|
|
%v95 = add i64 %v94, %v93
|
|
|
|
%v96 = add i64 %v95, %v94
|
|
|
|
%v97 = add i64 %v96, %v95
|
|
|
|
%v98 = add i64 %v97, %v96
|
|
|
|
%v99 = add i64 %v98, %v97
|
|
|
|
%v100 = add i64 %v99, %v98
|
|
|
|
%v101 = add i64 %v100, %v99
|
|
|
|
%v102 = add i64 %v101, %v100
|
|
|
|
%v103 = add i64 %v102, %v101
|
|
|
|
%v104 = add i64 %v103, %v102
|
|
|
|
%v105 = add i64 %v104, %v103
|
|
|
|
%v106 = add i64 %v105, %v104
|
|
|
|
%v107 = add i64 %v106, %v105
|
|
|
|
%v108 = add i64 %v107, %v106
|
|
|
|
%v109 = add i64 %v108, %v107
|
|
|
|
%v110 = add i64 %v109, %v108
|
|
|
|
%v111 = add i64 %v110, %v109
|
|
|
|
%v112 = add i64 %v111, %v110
|
|
|
|
%v113 = add i64 %v112, %v111
|
|
|
|
%v114 = add i64 %v113, %v112
|
|
|
|
%v115 = add i64 %v114, %v113
|
|
|
|
%v116 = add i64 %v115, %v114
|
|
|
|
%v117 = add i64 %v116, %v115
|
|
|
|
%v118 = add i64 %v117, %v116
|
|
|
|
%v119 = add i64 %v118, %v117
|
|
|
|
%v120 = add i64 %v119, %v118
|
|
|
|
%v121 = add i64 %v120, %v119
|
|
|
|
%v122 = add i64 %v121, %v120
|
|
|
|
%v123 = add i64 %v122, %v121
|
|
|
|
%v124 = add i64 %v123, %v122
|
|
|
|
%v125 = add i64 %v124, %v123
|
|
|
|
%v126 = add i64 %v125, %v124
|
|
|
|
%v127 = add i64 %v126, %v125
|
|
|
|
%v128 = add i64 %v127, %v126
|
|
|
|
%v129 = add i64 %v128, %v127
|
|
|
|
%v130 = add i64 %v129, %v128
|
|
|
|
%v131 = add i64 %v130, %v129
|
|
|
|
%v132 = add i64 %v131, %v130
|
|
|
|
%v133 = add i64 %v132, %v131
|
|
|
|
%v134 = add i64 %v133, %v132
|
|
|
|
%v135 = add i64 %v134, %v133
|
|
|
|
%v136 = add i64 %v135, %v134
|
|
|
|
%v137 = add i64 %v136, %v135
|
|
|
|
%v138 = add i64 %v137, %v136
|
|
|
|
%v139 = add i64 %v138, %v137
|
|
|
|
%v140 = add i64 %v139, %v138
|
|
|
|
%v141 = add i64 %v140, %v139
|
|
|
|
%v142 = add i64 %v141, %v140
|
|
|
|
%v143 = add i64 %v142, %v141
|
|
|
|
%v144 = add i64 %v143, %v142
|
|
|
|
%v145 = add i64 %v144, %v143
|
|
|
|
%v146 = add i64 %v145, %v144
|
|
|
|
%v147 = add i64 %v146, %v145
|
|
|
|
%v148 = add i64 %v147, %v146
|
|
|
|
%v149 = add i64 %v148, %v147
|
|
|
|
%v150 = add i64 %v149, %v148
|
|
|
|
%v151 = add i64 %v150, %v149
|
|
|
|
%v152 = add i64 %v151, %v150
|
|
|
|
%v153 = add i64 %v152, %v151
|
|
|
|
%v154 = add i64 %v153, %v152
|
|
|
|
%v155 = add i64 %v154, %v153
|
|
|
|
%v156 = add i64 %v155, %v154
|
|
|
|
%v157 = add i64 %v156, %v155
|
|
|
|
%v158 = add i64 %v157, %v156
|
|
|
|
%v159 = add i64 %v158, %v157
|
|
|
|
%v160 = add i64 %v159, %v158
|
|
|
|
%v161 = add i64 %v160, %v159
|
|
|
|
%v162 = add i64 %v161, %v160
|
|
|
|
%v163 = add i64 %v162, %v161
|
|
|
|
%v164 = add i64 %v163, %v162
|
|
|
|
%v165 = add i64 %v164, %v163
|
|
|
|
%v166 = add i64 %v165, %v164
|
|
|
|
%v167 = add i64 %v166, %v165
|
|
|
|
%v168 = add i64 %v167, %v166
|
|
|
|
%v169 = add i64 %v168, %v167
|
|
|
|
%v170 = add i64 %v169, %v168
|
|
|
|
%v171 = add i64 %v170, %v169
|
|
|
|
%v172 = add i64 %v171, %v170
|
|
|
|
%v173 = add i64 %v172, %v171
|
|
|
|
%v174 = add i64 %v173, %v172
|
|
|
|
%v175 = add i64 %v174, %v173
|
|
|
|
%v176 = add i64 %v175, %v174
|
|
|
|
%v177 = add i64 %v176, %v175
|
|
|
|
%v178 = add i64 %v177, %v176
|
|
|
|
%v179 = add i64 %v178, %v177
|
|
|
|
%v180 = add i64 %v179, %v178
|
|
|
|
%v181 = add i64 %v180, %v179
|
|
|
|
%v182 = add i64 %v181, %v180
|
|
|
|
%v183 = add i64 %v182, %v181
|
|
|
|
%v184 = add i64 %v183, %v182
|
|
|
|
%v185 = add i64 %v184, %v183
|
|
|
|
%v186 = add i64 %v185, %v184
|
|
|
|
%v187 = add i64 %v186, %v185
|
|
|
|
%v188 = add i64 %v187, %v186
|
|
|
|
%v189 = add i64 %v188, %v187
|
|
|
|
%v190 = add i64 %v189, %v188
|
|
|
|
%v191 = add i64 %v190, %v189
|
|
|
|
%v192 = add i64 %v191, %v190
|
|
|
|
%v193 = add i64 %v192, %v191
|
|
|
|
%v194 = add i64 %v193, %v192
|
|
|
|
%v195 = add i64 %v194, %v193
|
|
|
|
%v196 = add i64 %v195, %v194
|
|
|
|
%v197 = add i64 %v196, %v195
|
|
|
|
%v198 = add i64 %v197, %v196
|
|
|
|
%v199 = add i64 %v198, %v197
|
|
|
|
%v200 = add i64 %v199, %v198
|
|
|
|
%v201 = add i64 %v200, %v199
|
|
|
|
%v202 = add i64 %v201, %v200
|
|
|
|
%v203 = add i64 %v202, %v201
|
|
|
|
%v204 = add i64 %v203, %v202
|
|
|
|
%v205 = add i64 %v204, %v203
|
|
|
|
%v206 = add i64 %v205, %v204
|
|
|
|
%v207 = add i64 %v206, %v205
|
|
|
|
%v208 = add i64 %v207, %v206
|
|
|
|
%v209 = add i64 %v208, %v207
|
|
|
|
%v210 = add i64 %v209, %v208
|
|
|
|
%v211 = add i64 %v210, %v209
|
|
|
|
%v212 = add i64 %v211, %v210
|
|
|
|
%v213 = add i64 %v212, %v211
|
|
|
|
%v214 = add i64 %v213, %v212
|
|
|
|
%v215 = add i64 %v214, %v213
|
|
|
|
%v216 = add i64 %v215, %v214
|
|
|
|
%v217 = add i64 %v216, %v215
|
|
|
|
%v218 = add i64 %v217, %v216
|
|
|
|
%v219 = add i64 %v218, %v217
|
|
|
|
%v220 = add i64 %v219, %v218
|
|
|
|
%v221 = add i64 %v220, %v219
|
|
|
|
%v222 = add i64 %v221, %v220
|
|
|
|
%v223 = add i64 %v222, %v221
|
|
|
|
%v224 = add i64 %v223, %v222
|
|
|
|
%v225 = add i64 %v224, %v223
|
|
|
|
%v226 = add i64 %v225, %v224
|
|
|
|
%v227 = add i64 %v226, %v225
|
|
|
|
%v228 = add i64 %v227, %v226
|
|
|
|
%v229 = add i64 %v228, %v227
|
|
|
|
%v230 = add i64 %v229, %v228
|
|
|
|
%v231 = add i64 %v230, %v229
|
|
|
|
%v232 = add i64 %v231, %v230
|
|
|
|
%v233 = add i64 %v232, %v231
|
|
|
|
%v234 = add i64 %v233, %v232
|
|
|
|
%v235 = add i64 %v234, %v233
|
|
|
|
%v236 = add i64 %v235, %v234
|
|
|
|
%v237 = add i64 %v236, %v235
|
|
|
|
%v238 = add i64 %v237, %v236
|
|
|
|
%v239 = add i64 %v238, %v237
|
|
|
|
%v240 = add i64 %v239, %v238
|
|
|
|
%v241 = add i64 %v240, %v239
|
|
|
|
%v242 = add i64 %v241, %v240
|
|
|
|
%v243 = add i64 %v242, %v241
|
|
|
|
%v244 = add i64 %v243, %v242
|
|
|
|
%v245 = add i64 %v244, %v243
|
|
|
|
%v246 = add i64 %v245, %v244
|
|
|
|
%v247 = add i64 %v246, %v245
|
|
|
|
%v248 = add i64 %v247, %v246
|
|
|
|
%v249 = add i64 %v248, %v247
|
|
|
|
%v250 = add i64 %v249, %v248
|
|
|
|
%v251 = add i64 %v250, %v249
|
|
|
|
%v252 = add i64 %v251, %v250
|
|
|
|
%v253 = add i64 %v252, %v251
|
|
|
|
%v254 = add i64 %v253, %v252
|
|
|
|
%v255 = add i64 %v254, %v253
|
|
|
|
%v256 = add i64 %v255, %v254
|
|
|
|
%v257 = add i64 %v256, %v255
|
|
|
|
%v258 = add i64 %v257, %v256
|
|
|
|
%v259 = add i64 %v258, %v257
|
|
|
|
%v260 = add i64 %v259, %v258
|
|
|
|
%v261 = add i64 %v260, %v259
|
|
|
|
%v262 = add i64 %v261, %v260
|
|
|
|
%v263 = add i64 %v262, %v261
|
|
|
|
%v264 = add i64 %v263, %v262
|
|
|
|
%v265 = add i64 %v264, %v263
|
|
|
|
%v266 = add i64 %v265, %v264
|
|
|
|
%v267 = add i64 %v266, %v265
|
|
|
|
%v268 = add i64 %v267, %v266
|
|
|
|
%v269 = add i64 %v268, %v267
|
|
|
|
%v270 = add i64 %v269, %v268
|
|
|
|
%v271 = add i64 %v270, %v269
|
|
|
|
%v272 = add i64 %v271, %v270
|
|
|
|
%v273 = add i64 %v272, %v271
|
|
|
|
%v274 = add i64 %v273, %v272
|
|
|
|
%v275 = add i64 %v274, %v273
|
|
|
|
%v276 = add i64 %v275, %v274
|
|
|
|
%v277 = add i64 %v276, %v275
|
|
|
|
%v278 = add i64 %v277, %v276
|
|
|
|
%v279 = add i64 %v278, %v277
|
|
|
|
%v280 = add i64 %v279, %v278
|
|
|
|
%v281 = add i64 %v280, %v279
|
|
|
|
%v282 = add i64 %v281, %v280
|
|
|
|
%v283 = add i64 %v282, %v281
|
|
|
|
%v284 = add i64 %v283, %v282
|
|
|
|
%v285 = add i64 %v284, %v283
|
|
|
|
%v286 = add i64 %v285, %v284
|
|
|
|
%v287 = add i64 %v286, %v285
|
|
|
|
%v288 = add i64 %v287, %v286
|
|
|
|
%v289 = add i64 %v288, %v287
|
|
|
|
%v290 = add i64 %v289, %v288
|
|
|
|
%v291 = add i64 %v290, %v289
|
|
|
|
%v292 = add i64 %v291, %v290
|
|
|
|
%v293 = add i64 %v292, %v291
|
|
|
|
%v294 = add i64 %v293, %v292
|
|
|
|
%v295 = add i64 %v294, %v293
|
|
|
|
%v296 = add i64 %v295, %v294
|
|
|
|
%v297 = add i64 %v296, %v295
|
|
|
|
%v298 = add i64 %v297, %v296
|
|
|
|
%v299 = add i64 %v298, %v297
|
|
|
|
%v300 = add i64 %v299, %v298
|
|
|
|
%v301 = icmp eq i64 %v300, 100
|
|
|
|
%v302 = select i1 %v301, i64 %v298, i64 %v299, !prof !15
|
|
|
|
store i64 %v302, i64* %j
|
|
|
|
ret i64 99
|
|
|
|
}
|
|
|
|
|
2019-09-06 00:56:55 +08:00
|
|
|
; Test a case with really long use-def chains. This test checks that it's not
|
|
|
|
; really slow and doesn't appear to be hanging. This is different from
|
|
|
|
; test_chr_22 in that it has nested control structures (multiple scopes) and
|
|
|
|
; covers additional code.
|
|
|
|
define i64 @test_chr_23(i64 %v0) !prof !14 {
|
2020-05-20 13:02:55 +08:00
|
|
|
; CHECK-LABEL: @test_chr_23(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[V0:%.*]], 50
|
|
|
|
; CHECK-NEXT: [[V10:%.*]] = icmp ne i64 [[TMP0]], -50
|
|
|
|
; CHECK-NEXT: ret i64 99
|
|
|
|
;
|
2019-09-06 00:56:55 +08:00
|
|
|
entry:
|
|
|
|
%v1 = add i64 %v0, 3
|
|
|
|
%v2 = add i64 %v1, %v1
|
|
|
|
%v3 = add i64 %v2, %v1
|
|
|
|
%v4 = add i64 %v2, %v3
|
|
|
|
%v5 = add i64 %v4, %v2
|
|
|
|
%v6 = add i64 %v5, %v4
|
|
|
|
%v7 = add i64 %v6, %v5
|
|
|
|
%v8 = add i64 %v7, %v6
|
|
|
|
%v9 = add i64 %v8, %v7
|
|
|
|
%v10 = icmp eq i64 %v9, 100
|
|
|
|
br i1 %v10, label %body, label %end, !prof !15
|
|
|
|
|
|
|
|
body:
|
|
|
|
%v1_0 = add i64 %v9, 3
|
|
|
|
%v2_0 = add i64 %v1_0, %v1_0
|
|
|
|
%v3_0 = add i64 %v2_0, %v1_0
|
|
|
|
%v4_0 = add i64 %v2_0, %v3_0
|
|
|
|
%v5_0 = add i64 %v4_0, %v2_0
|
|
|
|
%v6_0 = add i64 %v5_0, %v4_0
|
|
|
|
%v7_0 = add i64 %v6_0, %v5_0
|
|
|
|
%v8_0 = add i64 %v7_0, %v6_0
|
|
|
|
%v9_0 = add i64 %v8_0, %v7_0
|
|
|
|
%v10_0 = icmp eq i64 %v9_0, 100
|
|
|
|
br i1 %v10_0, label %body.1, label %end, !prof !15
|
|
|
|
|
|
|
|
body.1:
|
|
|
|
%v1_1 = add i64 %v9_0, 3
|
|
|
|
%v2_1 = add i64 %v1_1, %v1_1
|
|
|
|
%v3_1 = add i64 %v2_1, %v1_1
|
|
|
|
%v4_1 = add i64 %v2_1, %v3_1
|
|
|
|
%v5_1 = add i64 %v4_1, %v2_1
|
|
|
|
%v6_1 = add i64 %v5_1, %v4_1
|
|
|
|
%v7_1 = add i64 %v6_1, %v5_1
|
|
|
|
%v8_1 = add i64 %v7_1, %v6_1
|
|
|
|
%v9_1 = add i64 %v8_1, %v7_1
|
|
|
|
%v10_1 = icmp eq i64 %v9_1, 100
|
|
|
|
br i1 %v10_1, label %body.2, label %end, !prof !15
|
|
|
|
|
|
|
|
body.2:
|
|
|
|
%v1_2 = add i64 %v9_1, 3
|
|
|
|
%v2_2 = add i64 %v1_2, %v1_2
|
|
|
|
%v3_2 = add i64 %v2_2, %v1_2
|
|
|
|
%v4_2 = add i64 %v2_2, %v3_2
|
|
|
|
%v5_2 = add i64 %v4_2, %v2_2
|
|
|
|
%v6_2 = add i64 %v5_2, %v4_2
|
|
|
|
%v7_2 = add i64 %v6_2, %v5_2
|
|
|
|
%v8_2 = add i64 %v7_2, %v6_2
|
|
|
|
%v9_2 = add i64 %v8_2, %v7_2
|
|
|
|
%v10_2 = icmp eq i64 %v9_2, 100
|
|
|
|
br i1 %v10_2, label %body.3, label %end, !prof !15
|
|
|
|
|
|
|
|
body.3:
|
|
|
|
%v1_3 = add i64 %v9_2, 3
|
|
|
|
%v2_3 = add i64 %v1_3, %v1_3
|
|
|
|
%v3_3 = add i64 %v2_3, %v1_3
|
|
|
|
%v4_3 = add i64 %v2_3, %v3_3
|
|
|
|
%v5_3 = add i64 %v4_3, %v2_3
|
|
|
|
%v6_3 = add i64 %v5_3, %v4_3
|
|
|
|
%v7_3 = add i64 %v6_3, %v5_3
|
|
|
|
%v8_3 = add i64 %v7_3, %v6_3
|
|
|
|
%v9_3 = add i64 %v8_3, %v7_3
|
|
|
|
%v10_3 = icmp eq i64 %v9_3, 100
|
|
|
|
br i1 %v10_3, label %body.4, label %end, !prof !15
|
|
|
|
|
|
|
|
body.4:
|
|
|
|
%v1_4 = add i64 %v9_3, 3
|
|
|
|
%v2_4 = add i64 %v1_4, %v1_4
|
|
|
|
%v3_4 = add i64 %v2_4, %v1_4
|
|
|
|
%v4_4 = add i64 %v2_4, %v3_4
|
|
|
|
%v5_4 = add i64 %v4_4, %v2_4
|
|
|
|
%v6_4 = add i64 %v5_4, %v4_4
|
|
|
|
%v7_4 = add i64 %v6_4, %v5_4
|
|
|
|
%v8_4 = add i64 %v7_4, %v6_4
|
|
|
|
%v9_4 = add i64 %v8_4, %v7_4
|
|
|
|
%v10_4 = icmp eq i64 %v9_4, 100
|
|
|
|
br i1 %v10_4, label %body.5, label %end, !prof !15
|
|
|
|
|
|
|
|
body.5:
|
|
|
|
%v1_5 = add i64 %v9_4, 3
|
|
|
|
%v2_5 = add i64 %v1_5, %v1_5
|
|
|
|
%v3_5 = add i64 %v2_5, %v1_5
|
|
|
|
%v4_5 = add i64 %v2_5, %v3_5
|
|
|
|
%v5_5 = add i64 %v4_5, %v2_5
|
|
|
|
%v6_5 = add i64 %v5_5, %v4_5
|
|
|
|
%v7_5 = add i64 %v6_5, %v5_5
|
|
|
|
%v8_5 = add i64 %v7_5, %v6_5
|
|
|
|
%v9_5 = add i64 %v8_5, %v7_5
|
|
|
|
%v10_5 = icmp eq i64 %v9_5, 100
|
|
|
|
br i1 %v10_5, label %body.6, label %end, !prof !15
|
|
|
|
|
|
|
|
body.6:
|
|
|
|
%v1_6 = add i64 %v9_5, 3
|
|
|
|
%v2_6 = add i64 %v1_6, %v1_6
|
|
|
|
%v3_6 = add i64 %v2_6, %v1_6
|
|
|
|
%v4_6 = add i64 %v2_6, %v3_6
|
|
|
|
%v5_6 = add i64 %v4_6, %v2_6
|
|
|
|
%v6_6 = add i64 %v5_6, %v4_6
|
|
|
|
%v7_6 = add i64 %v6_6, %v5_6
|
|
|
|
%v8_6 = add i64 %v7_6, %v6_6
|
|
|
|
%v9_6 = add i64 %v8_6, %v7_6
|
|
|
|
%v10_6 = icmp eq i64 %v9_6, 100
|
|
|
|
br i1 %v10_6, label %body.7, label %end, !prof !15
|
|
|
|
|
|
|
|
body.7:
|
|
|
|
%v1_7 = add i64 %v9_6, 3
|
|
|
|
%v2_7 = add i64 %v1_7, %v1_7
|
|
|
|
%v3_7 = add i64 %v2_7, %v1_7
|
|
|
|
%v4_7 = add i64 %v2_7, %v3_7
|
|
|
|
%v5_7 = add i64 %v4_7, %v2_7
|
|
|
|
%v6_7 = add i64 %v5_7, %v4_7
|
|
|
|
%v7_7 = add i64 %v6_7, %v5_7
|
|
|
|
%v8_7 = add i64 %v7_7, %v6_7
|
|
|
|
%v9_7 = add i64 %v8_7, %v7_7
|
|
|
|
%v10_7 = icmp eq i64 %v9_7, 100
|
|
|
|
br i1 %v10_7, label %body.8, label %end, !prof !15
|
|
|
|
|
|
|
|
body.8:
|
|
|
|
%v1_8 = add i64 %v9_7, 3
|
|
|
|
%v2_8 = add i64 %v1_8, %v1_8
|
|
|
|
%v3_8 = add i64 %v2_8, %v1_8
|
|
|
|
%v4_8 = add i64 %v2_8, %v3_8
|
|
|
|
%v5_8 = add i64 %v4_8, %v2_8
|
|
|
|
%v6_8 = add i64 %v5_8, %v4_8
|
|
|
|
%v7_8 = add i64 %v6_8, %v5_8
|
|
|
|
%v8_8 = add i64 %v7_8, %v6_8
|
|
|
|
%v9_8 = add i64 %v8_8, %v7_8
|
|
|
|
%v10_8 = icmp eq i64 %v9_8, 100
|
|
|
|
br i1 %v10_8, label %body.9, label %end, !prof !15
|
|
|
|
|
|
|
|
body.9:
|
|
|
|
%v1_9 = add i64 %v9_8, 3
|
|
|
|
%v2_9 = add i64 %v1_9, %v1_9
|
|
|
|
%v3_9 = add i64 %v2_9, %v1_9
|
|
|
|
%v4_9 = add i64 %v2_9, %v3_9
|
|
|
|
%v5_9 = add i64 %v4_9, %v2_9
|
|
|
|
%v6_9 = add i64 %v5_9, %v4_9
|
|
|
|
%v7_9 = add i64 %v6_9, %v5_9
|
|
|
|
%v8_9 = add i64 %v7_9, %v6_9
|
|
|
|
%v9_9 = add i64 %v8_9, %v7_9
|
|
|
|
br label %end
|
|
|
|
|
|
|
|
end:
|
|
|
|
ret i64 99
|
|
|
|
}
|
|
|
|
|
2020-01-14 06:19:45 +08:00
|
|
|
; Test that we do not crash upon 0:0 branch_weights metadata.
|
|
|
|
define void @test_chr_24(i32* %i) !prof !14 {
|
2020-05-20 13:02:55 +08:00
|
|
|
; CHECK-LABEL: @test_chr_24(
|
|
|
|
; CHECK-NEXT: entry:
|
|
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4
|
|
|
|
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1
|
|
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[BB0:%.*]], !prof !21
|
|
|
|
; CHECK: bb0:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB1]]
|
|
|
|
; CHECK: bb1:
|
|
|
|
; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 2
|
|
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
|
|
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2:%.*]], !prof !21
|
|
|
|
; CHECK: bb2:
|
|
|
|
; CHECK-NEXT: call void @foo()
|
|
|
|
; CHECK-NEXT: br label [[BB3]]
|
|
|
|
; CHECK: bb3:
|
|
|
|
; CHECK-NEXT: ret void
|
|
|
|
;
|
2020-01-14 06:19:45 +08:00
|
|
|
entry:
|
|
|
|
%0 = load i32, i32* %i
|
|
|
|
%1 = and i32 %0, 1
|
|
|
|
%2 = icmp eq i32 %1, 0
|
|
|
|
br i1 %2, label %bb1, label %bb0, !prof !17
|
|
|
|
|
|
|
|
bb0:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb1
|
|
|
|
|
|
|
|
bb1:
|
|
|
|
%3 = and i32 %0, 2
|
|
|
|
%4 = icmp eq i32 %3, 0
|
|
|
|
br i1 %4, label %bb3, label %bb2, !prof !17
|
|
|
|
|
|
|
|
bb2:
|
|
|
|
call void @foo()
|
|
|
|
br label %bb3
|
|
|
|
|
|
|
|
bb3:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2019-04-17 12:52:47 +08:00
|
|
|
!llvm.module.flags = !{!0}
|
|
|
|
!0 = !{i32 1, !"ProfileSummary", !1}
|
|
|
|
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
|
|
|
|
!2 = !{!"ProfileFormat", !"InstrProf"}
|
|
|
|
!3 = !{!"TotalCount", i64 10000}
|
|
|
|
!4 = !{!"MaxCount", i64 10}
|
|
|
|
!5 = !{!"MaxInternalCount", i64 1}
|
|
|
|
!6 = !{!"MaxFunctionCount", i64 1000}
|
|
|
|
!7 = !{!"NumCounts", i64 3}
|
|
|
|
!8 = !{!"NumFunctions", i64 3}
|
|
|
|
!9 = !{!"DetailedSummary", !10}
|
|
|
|
!10 = !{!11, !12, !13}
|
|
|
|
!11 = !{i32 10000, i64 100, i32 1}
|
|
|
|
!12 = !{i32 999000, i64 100, i32 1}
|
|
|
|
!13 = !{i32 999999, i64 1, i32 2}
|
|
|
|
|
|
|
|
!14 = !{!"function_entry_count", i64 100}
|
|
|
|
!15 = !{!"branch_weights", i32 0, i32 1}
|
|
|
|
!16 = !{!"branch_weights", i32 1, i32 1}
|
2020-01-14 06:19:45 +08:00
|
|
|
!17 = !{!"branch_weights", i32 0, i32 0}
|
2019-04-17 12:52:47 +08:00
|
|
|
; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0}
|
|
|
|
; CHECK: !16 = !{!"branch_weights", i32 0, i32 1}
|
|
|
|
; CHECK: !17 = !{!"branch_weights", i32 1, i32 1}
|
|
|
|
; CHECK: !18 = !{!"branch_weights", i32 1, i32 0}
|
|
|
|
; CHECK: !19 = !{!"branch_weights", i32 0, i32 1000}
|