2016-04-20 07:51:52 +08:00
|
|
|
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
|
In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.
In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality.
Consider a simple example shown below:
A
|
B⇆C
|
D
When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D.
This patch filters those cold blocks off from the loop chain by comparing the ratio:
LoopBBFreq / LoopFreq
to 20%: if it is less than 20%, we don't include this BB in the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop to a single node. In general, we have more cold blocks when the loop has few iterations, and vice versa.
Differential revision: http://reviews.llvm.org/D11662
llvm-svn: 251833
2015-11-03 05:24:00 +08:00
|
|
|
|
|
|
|
define void @foo() !prof !1 {
|
|
|
|
; Test if a cold block in a loop will be placed at the end of the function
|
|
|
|
; chain.
|
|
|
|
;
; The order below expects the call to d (block if.else) to come after the call
; to f (block end), i.e. if.else is laid out after the block that returns.
|
|
|
|
; CHECK-LABEL: foo:
|
|
|
|
; CHECK: callq b
|
|
|
|
; CHECK: callq c
|
|
|
|
; CHECK: callq e
|
|
|
|
; CHECK: callq f
|
|
|
|
; CHECK: callq d
|
|
|
|
|
|
|
|
entry:
|
|
|
|
br label %header
|
|
|
|
|
|
|
|
header:
|
|
|
|
call void @b()
|
|
|
|
%call = call zeroext i1 @a()
|
|
|
|
; !prof !4 carries branch weights 1000 and 1, so if.else is the cold successor.
br i1 %call, label %if.then, label %if.else, !prof !4
|
|
|
|
|
|
|
|
if.then:
|
|
|
|
call void @c()
|
|
|
|
br label %if.end
|
|
|
|
|
|
|
|
if.else:
|
|
|
|
call void @d()
|
|
|
|
br label %if.end
|
|
|
|
|
|
|
|
if.end:
|
|
|
|
call void @e()
|
|
|
|
%call2 = call zeroext i1 @a()
|
|
|
|
; !prof !5 carries branch weights 100 and 1: the back edge to header versus the
; exit to end.
br i1 %call2, label %header, label %end, !prof !5
|
|
|
|
|
|
|
|
end:
|
|
|
|
call void @f()
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
Revert Revert [MBP] do not rotate loop if it creates extra branch
This is a second attempt to land this patch.
The first one resulted in a crash of clang sanitizer buildbot.
The fix is here and regression test is added.
This is the last fix for the corner case of PR32214. Actually, this is not really a corner case in general.
We should not do a loop rotation if we create an additional branch due to it.
Consider the case where we have a loop chain H, M, B, C , where
H is header with viable fallthrough from pre-header and exit from the loop
M - some middle block
B - backedge to Header but with exit from the loop also.
C - some cold block of the loop.
Suppose H is determined to be the best exit. If we rotate the loop to M, B, C, H, we can introduce an extra branch.
Let's compute the change in number of branches:
+1 branch from pre-header to header
-1 branch from header to exit
+1 branch from header to middle block if there is such
-1 branch from cold block to header if there is one
So if C is not a predecessor of H, then we introduce an extra branch.
This change prohibits rotation of the loop if both of the following are true:
The best exit has the next element in the chain as a successor.
The last element in the chain is not a predecessor of the first element of the chain.
Reviewers: iteratee, xur, sammccall, chandlerc
Reviewed By: iteratee
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D34745
llvm-svn: 307631
2017-07-11 16:34:58 +08:00
|
|
|
define void @nested_loop_0(i1 %flag) !prof !1 {
|
In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.
In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality.
Consider a simple example shown below:
A
|
B⇆C
|
D
When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D.
This patch filters those cold blocks off from the loop chain by comparing the ratio:
LoopBBFreq / LoopFreq
to 20%: if it is less than 20%, we don't include this BB to the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop into a single node. In general we have more cold blocks when the loop has few iterations. And vice versa.
Differential revision: http://reviews.llvm.org/D11662
llvm-svn: 251833
2015-11-03 05:24:00 +08:00
|
|
|
; Test if a block that is cold in the inner loop but not cold in the outer loop
|
|
|
|
; will be merged into the outer loop chain.
|
|
|
|
;
; The order below expects the calls to c, d and e (blocks header2, if.then and
; if.else) ahead of the call to b (block header), with f (block end) last.
|
|
|
|
; CHECK-LABEL: nested_loop_0:
|
|
|
|
; CHECK: callq c
|
|
|
|
; CHECK: callq d
|
|
|
|
; CHECK: callq e
|
|
|
|
; CHECK: callq b
|
|
|
|
; CHECK: callq f
|
|
|
|
|
|
|
|
entry:
|
|
|
|
br label %header
|
|
|
|
|
|
|
|
header:
|
|
|
|
call void @b()
|
|
|
|
%call4 = call zeroext i1 @a()
|
|
|
|
; No !prof attachment on this branch.
br i1 %call4, label %header2, label %end
|
|
|
|
|
|
|
|
header2:
|
|
|
|
call void @c()
|
|
|
|
%call = call zeroext i1 @a()
|
|
|
|
; !prof !2 carries branch weights 100 and 1, so if.else is the rarely taken arm
; of this branch.
br i1 %call, label %if.then, label %if.else, !prof !2
|
|
|
|
|
|
|
|
if.then:
|
|
|
|
call void @d()
|
|
|
|
%call3 = call zeroext i1 @a()
|
|
|
|
; !prof !3 carries branch weights 1 and 10: returning to header2 versus
; returning to header.
br i1 %call3, label %header2, label %header, !prof !3
|
|
|
|
|
|
|
|
if.else:
|
|
|
|
call void @e()
|
Revert Revert [MBP] do not rotate loop if it creates extra branch
This is a second attempt to land this patch.
The first one resulted in a crash of clang sanitizer buildbot.
The fix is here and regression test is added.
This is a last fix for the corner case of PR32214. Actually this is not really corner case in general.
We should not do a loop rotation if we create an additional branch due to it.
Consider the case where we have a loop chain H, M, B, C , where
H is header with viable fallthrough from pre-header and exit from the loop
M - some middle block
B - backedge to Header but with exit from the loop also.
C - some cold block of the loop.
Let's H is determined as a best exit. If we do a loop rotation M, B, C, H we can introduce the extra branch.
Let's compute the change in number of branches:
+1 branch from pre-header to header
-1 branch from header to exit
+1 branch from header to middle block if there is such
-1 branch from cold bock to header if there is one
So if C is not a predecessor of H then we introduce extra branch.
This change actually prohibits rotation of the loop if both true
Best Exit has next element in chain as successor.
Last element in chain is not a predecessor of first element of chain.
Reviewers: iteratee, xur, sammccall, chandlerc
Reviewed By: iteratee
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D34745
llvm-svn: 307631
2017-07-11 16:34:58 +08:00
|
|
|
; Same !prof !3 weights as the branch that ends if.then, but conditioned on the
; %flag argument instead of a call to @a.
br i1 %flag, label %header2, label %header, !prof !3
|
In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.
In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality.
Consider a simple example shown below:
A
|
B⇆C
|
D
When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D.
This patch filters those cold blocks off from the loop chain by comparing the ratio:
LoopBBFreq / LoopFreq
to 20%: if it is less than 20%, we don't include this BB to the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop into a single node. In general we have more cold blocks when the loop has few iterations. And vice versa.
Differential revision: http://reviews.llvm.org/D11662
llvm-svn: 251833
2015-11-03 05:24:00 +08:00
|
|
|
|
|
|
|
end:
|
|
|
|
call void @f()
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
define void @nested_loop_1() !prof !1 {
|
|
|
|
; Test if a cold block in an inner loop will be placed at the end of the
|
|
|
|
; function chain.
|
|
|
|
;
; The order below expects the call to d (block if.else) to come after the call
; to e (block end), i.e. if.else is laid out after the block that returns.
|
|
|
|
; CHECK-LABEL: nested_loop_1:
|
|
|
|
; CHECK: callq b
|
|
|
|
; CHECK: callq c
|
|
|
|
; CHECK: callq e
|
|
|
|
; CHECK: callq d
|
|
|
|
|
|
|
|
entry:
|
|
|
|
br label %header
|
|
|
|
|
|
|
|
header:
|
|
|
|
call void @b()
|
|
|
|
br label %header2
|
|
|
|
|
|
|
|
header2:
|
|
|
|
call void @c()
|
|
|
|
%call = call zeroext i1 @a()
|
|
|
|
; !prof !4 carries branch weights 1000 and 1, so the edge to end is hot and
; if.else is the cold successor.
br i1 %call, label %end, label %if.else, !prof !4
|
|
|
|
|
|
|
|
if.else:
|
|
|
|
call void @d()
|
|
|
|
%call2 = call zeroext i1 @a()
|
|
|
|
; !prof !5 carries branch weights 100 and 1: back to header2 versus back to
; header.
br i1 %call2, label %header2, label %header, !prof !5
|
|
|
|
|
|
|
|
end:
|
|
|
|
call void @e()
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; External callees used by the test functions above. @a supplies the i1 values
; used as branch conditions; @b through @f are called once per block so that the
; expected callq order identifies the final block layout.
declare zeroext i1 @a()
|
|
|
|
declare void @b()
|
|
|
|
declare void @c()
|
|
|
|
declare void @d()
|
|
|
|
declare void @e()
|
|
|
|
declare void @f()
|
|
|
|
|
|
|
|
; Profile metadata referenced through !prof above. !1 is the function entry
; count attached to each define; !2 through !5 are branch weight pairs attached
; to conditional branches, listed in the same order as the branch's successors.
!1 = !{!"function_entry_count", i64 1}
|
|
|
|
!2 = !{!"branch_weights", i32 100, i32 1}
|
|
|
|
!3 = !{!"branch_weights", i32 1, i32 10}
|
|
|
|
!4 = !{!"branch_weights", i32 1000, i32 1}
|
|
|
|
!5 = !{!"branch_weights", i32 100, i32 1}
|