[X86] Improved X86::CMOV to Branch heuristic.

Resolved PR33954.
This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead.

Differential Revision: https://reviews.llvm.org/D36081

llvm-svn: 310352
This commit is contained in:
Amjad Aboud 2017-08-08 12:17:56 +00:00
parent 70a3511bd5
commit 6fa6813aec
3 changed files with 161 additions and 10 deletions

View File

@ -67,6 +67,11 @@ static cl::opt<bool>
cl::desc("Enable the X86 cmov-to-branch optimization."), cl::desc("Enable the X86 cmov-to-branch optimization."),
cl::init(true), cl::Hidden); cl::init(true), cl::Hidden);
// Minimum absolute gain (in cycles) per loop iteration that the CMOV-to-branch
// conversion must achieve; loops whose estimated gain after the 2nd iteration
// falls below this value are left untouched to avoid paying branch overhead
// for negligible wins. Tunable via -x86-cmov-converter-threshold (default 4).
static cl::opt<unsigned>
GainCycleThreshold("x86-cmov-converter-threshold",
cl::desc("Minimum gain per loop (in cycles) threshold."),
cl::init(4), cl::Hidden);
/// Converts X86 cmov instructions into branches when profitable. /// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass { class X86CmovConverterPass : public MachineFunctionPass {
public: public:
@ -389,19 +394,28 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// Critical-path is iteration dependent - there is dependency of // Critical-path is iteration dependent - there is dependency of
// critical-path instructions on critical-path instructions of // critical-path instructions on critical-path instructions of
// previous iteration. // previous iteration.
// Thus, it is required to check the gradient of the gain - the // Thus, check the gain percent of the 2nd iteration (similar to the
// change in Depth-Diff compared to the change in Loop-Depth between // previous case), but it is also required to check the gradient of
// 1st and 2nd iterations. // the gain - the change in Depth-Diff compared to the change in
// Loop-Depth between 1st and 2nd iterations.
// To be conservative, the gradient need to be at least 50%. // To be conservative, the gradient need to be at least 50%.
// //
// In addition, in order not to optimize loops with very small gain, the
// gain (in cycles) after 2nd iteration should not be less than a given
// threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
//
// If loop is not worth optimizing, remove all CMOV-group-candidates. // If loop is not worth optimizing, remove all CMOV-group-candidates.
//===--------------------------------------------------------------------===// //===--------------------------------------------------------------------===//
if (Diff[1] < GainCycleThreshold)
return false;
bool WorthOptLoop = false; bool WorthOptLoop = false;
if (Diff[1] == Diff[0]) if (Diff[1] == Diff[0])
WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
else if (Diff[1] > Diff[0]) else if (Diff[1] > Diff[0])
WorthOptLoop = WorthOptLoop =
(Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
(Diff[1] * 8 >= LoopDepth[1].Depth);
if (!WorthOptLoop) if (!WorthOptLoop)
return false; return false;

View File

@ -0,0 +1,91 @@
; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that x86-cmov-converter optimization does not transform CMOV
;; instruction when the gain (in cycles) of converting to branch is less than
;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;int bar(int *a, int *b, int n) {
;; int sum = 0;
;; for (int i = 0; i < n; ++i) {
;; int x = a[i] * a[i+1] * a[i+2];
;; int y = b[i] * b[i+1];
;; sum += y > x ? x : 0;
;; }
;; return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Adding a test to the above function shows code with CMOV is 25% faster than
;; the code with branch.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;#define N 10000
;;int A[N];
;;int B[N];
;;
;;
;;
;;int main () {
;; for (int i=0; i< N; ++i) {
;; A[i] = i%4;
;; B[i] = i%5;
;; }
;; int sum = 0;
;; for (int i=0; i< N*10; ++i)
;; sum += bar(A, B, N);
;; return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CHECK-NOT: jg
; CHECK: cmovle
; Sums, over i in [0, n), the value (b[i]*b[i+1] > a[i]*a[i+1]*a[i+2]) ? a[i]*a[i+1]*a[i+2] : 0.
; The select (%cond) is the CMOV candidate: per the CHECK lines above, the
; converter must keep the cmovle here because the gain is below the threshold.
define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
entry:
; Guard: skip straight to the exit block when n <= 0.
%cmp30 = icmp sgt i32 %n, 0
br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Pre-load a[0], a[1], and b[0] so the loop can carry them in phis.
%.pre = load i32, i32* %a, align 4
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
%.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
%.pre35 = load i32, i32* %b, align 4
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
; Result is 0 on the n <= 0 path, otherwise the final accumulator.
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
ret i32 %sum.0.lcssa
for.body: ; preds = %for.body, %for.body.preheader
; Rotating phis: %0 = b[i], %1 = a[i+1], %2 = a[i], reused from the previous
; iteration so each trip issues only two new loads (a[i+2] and b[i+1]).
%0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
%1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
%2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; x = a[i] * a[i+1] * a[i+2]
%mul = mul nsw i32 %1, %2
%3 = add nuw nsw i64 %indvars.iv, 2
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
%4 = load i32, i32* %arrayidx5, align 4
%mul6 = mul nsw i32 %mul, %4
; y = b[i+1] * b[i]
%arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
%5 = load i32, i32* %arrayidx11, align 4
%mul12 = mul nsw i32 %5, %0
; sum += (y > x) ? x : 0 -- the CMOV under test.
%cmp13 = icmp sgt i32 %mul12, %mul6
%cond = select i1 %cmp13, i32 %mul6, i32 0
%add14 = add nsw i32 %cond, %sum.032
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
attributes #0 = {"target-cpu"="skylake"}
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 5.0.0 (trunk)"}

View File

@ -3,13 +3,13 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that x86-cmov-converter optimization transform CMOV ;; This test checks that x86-cmov-converter optimization transform CMOV
;; instruction into branches when it is profitable. ;; instruction into branches when it is profitable.
;; There are 5 cases below: ;; There are 6 cases below:
;; 1. CmovInCriticalPath: ;; 1. CmovInCriticalPath:
;; CMOV depends on the condition and it is in the hot path. ;; CMOV depends on the condition and it is in the hot path.
;; Thus, it worths transforming. ;; Thus, it worths transforming.
;; ;;
;; 2. CmovNotInCriticalPath: ;; 2. CmovNotInCriticalPath:
;; similar test like in (1), just that CMOV is not in the hot path. ;; Similar test like in (1), just that CMOV is not in the hot path.
;; Thus, it does not worth transforming. ;; Thus, it does not worth transforming.
;; ;;
;; 3. MaxIndex: ;; 3. MaxIndex:
@ -26,16 +26,21 @@
;; Usually, binary search CMOV is not predicted. ;; Usually, binary search CMOV is not predicted.
;; Thus, it does not worth transforming. ;; Thus, it does not worth transforming.
;; ;;
;; 6. SmallGainPerLoop:
;; The gain percentage from converting CMOV into branch is acceptable,
;; however, the absolute gain is smaller than a threshold.
;; Thus, it is not worth transforming.
;;
;; Test was created using the following command line: ;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is: ;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) { ;; for (int i = 0; i < n; i++) {
;; int t = c[i]; ;; int t = c[i] + 1;
;; if (c[i] * a > b) ;; if (c[i] * a > b)
;; t = 10; ;; t = 10;
;; c[i] = t; ;; c[i] = (c[i] + 1) * t;
;; } ;; }
;;} ;;}
;; ;;
@ -87,6 +92,16 @@
;; } ;; }
;; return Curr->Val; ;; return Curr->Val;
;;} ;;}
;;
;;
;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) {
;; int t = c[i];
;; if (c[i] * a > b)
;; t = 10;
;; c[i] = t;
;; }
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%struct.Node = type { i32, %struct.Node*, %struct.Node* } %struct.Node = type { i32, %struct.Node*, %struct.Node* }
@ -111,10 +126,12 @@ for.body: ; preds = %for.body.preheader,
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4 %0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, 1
%mul = mul nsw i32 %0, %a %mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b %cmp3 = icmp sgt i32 %mul, %b
%. = select i1 %cmp3, i32 10, i32 %0 %. = select i1 %cmp3, i32 10, i32 %add
store i32 %., i32* %arrayidx, align 4 %mul7 = mul nsw i32 %., %add
store i32 %mul7, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body
@ -253,6 +270,35 @@ while.end: ; preds = %while.body, %entry
ret i32 %.lcssa ret i32 %.lcssa
} }
; CHECK-LABEL: SmallGainPerLoop
; CHECK-NOT: jg
; CHECK: cmovg
; For each i in [0, n): c[i] = (c[i] * a > b) ? 10 : c[i].
; The select (%.) is the CMOV candidate; per the CHECK lines above, the
; converter must keep the cmovg because the per-iteration gain, while an
; acceptable percentage, is below the absolute cycle threshold.
define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
entry:
; Guard: no work when n <= 0.
%cmp14 = icmp sgt i32 %n, 0
br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
; t = (c[i] * a > b) ? 10 : c[i] -- the CMOV under test.
%mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b
%. = select i1 %cmp3, i32 10, i32 %0
store i32 %., i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The following test checks that x86-cmov-converter optimization transforms ;; The following test checks that x86-cmov-converter optimization transforms
;; CMOV instructions into branch correctly. ;; CMOV instructions into branch correctly.