[X86] Improved X86::CMOV to Branch heuristic.

Resolved PR33954.
This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead.

Differential Revision: https://reviews.llvm.org/D36081

llvm-svn: 310352
This commit is contained in:
Amjad Aboud 2017-08-08 12:17:56 +00:00
parent 70a3511bd5
commit 6fa6813aec
3 changed files with 161 additions and 10 deletions

View File

@ -67,6 +67,11 @@ static cl::opt<bool>
cl::desc("Enable the X86 cmov-to-branch optimization."), cl::desc("Enable the X86 cmov-to-branch optimization."),
cl::init(true), cl::Hidden); cl::init(true), cl::Hidden);
// Minimum absolute gain (in cycles) per loop iteration that the CMOV-to-branch
// conversion must achieve; loops whose estimated gain after the 2nd iteration
// falls below this value are left untouched to avoid paying branch overhead
// for negligible wins. Tunable via -x86-cmov-converter-threshold (default 4).
static cl::opt<unsigned>
GainCycleThreshold("x86-cmov-converter-threshold",
cl::desc("Minimum gain per loop (in cycles) threshold."),
cl::init(4), cl::Hidden);
/// Converts X86 cmov instructions into branches when profitable. /// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass { class X86CmovConverterPass : public MachineFunctionPass {
public: public:
@ -389,19 +394,28 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// Critical-path is iteration dependent - there is dependency of // Critical-path is iteration dependent - there is dependency of
// critical-path instructions on critical-path instructions of // critical-path instructions on critical-path instructions of
// previous iteration. // previous iteration.
// Thus, it is required to check the gradient of the gain - the // Thus, check the gain percent of the 2nd iteration (similar to the
// change in Depth-Diff compared to the change in Loop-Depth between // previous case), but it is also required to check the gradient of
// 1st and 2nd iterations. // the gain - the change in Depth-Diff compared to the change in
// Loop-Depth between 1st and 2nd iterations.
// To be conservative, the gradient need to be at least 50%. // To be conservative, the gradient need to be at least 50%.
// //
// In addition, in order not to optimize loops with very small gain, the
// gain (in cycles) after 2nd iteration should not be less than a given
// threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
//
// If loop is not worth optimizing, remove all CMOV-group-candidates. // If loop is not worth optimizing, remove all CMOV-group-candidates.
//===--------------------------------------------------------------------===// //===--------------------------------------------------------------------===//
if (Diff[1] < GainCycleThreshold)
return false;
bool WorthOptLoop = false; bool WorthOptLoop = false;
if (Diff[1] == Diff[0]) if (Diff[1] == Diff[0])
WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
else if (Diff[1] > Diff[0]) else if (Diff[1] > Diff[0])
WorthOptLoop = WorthOptLoop =
(Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
(Diff[1] * 8 >= LoopDepth[1].Depth);
if (!WorthOptLoop) if (!WorthOptLoop)
return false; return false;

View File

@ -0,0 +1,91 @@
; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that x86-cmov-converter optimization does not transform CMOV
;; instruction when the gain (in cycles) of converting to branch is less than
;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;int bar(int *a, int *b, int n) {
;; int sum = 0;
;; for (int i = 0; i < n; ++i) {
;; int x = a[i] * a[i+1] * a[i+2];
;; int y = b[i] * b[i+1];
;; sum += y > x ? x : 0;
;; }
;; return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Adding a test to the above function shows code with CMOV is 25% faster than
;; the code with branch.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;#define N 10000
;;int A[N];
;;int B[N];
;;
;;
;;
;;int main () {
;; for (int i=0; i< N; ++i) {
;; A[i] = i%4;
;; B[i] = i%5;
;; }
;; int sum = 0;
;; for (int i=0; i< N*10; ++i)
;; sum += bar(A, B, N);
;; return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; CHECK-NOT: jg
; CHECK: cmovle
; Sums, over i in [0, n), the value (b[i]*b[i+1] > a[i]*a[i+1]*a[i+2]) ? a[i]*a[i+1]*a[i+2] : 0.
; The select (%cond) is the CMOV candidate: per the CHECK lines above, the
; converter must keep the cmovle here because the gain is below the threshold.
define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
entry:
; Guard: skip straight to the exit block when n <= 0.
%cmp30 = icmp sgt i32 %n, 0
br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Pre-load a[0], a[1], and b[0] so the loop can carry them in phis.
%.pre = load i32, i32* %a, align 4
%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
%.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
%.pre35 = load i32, i32* %b, align 4
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
; Result is 0 on the n <= 0 path, otherwise the final accumulator.
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
ret i32 %sum.0.lcssa
for.body: ; preds = %for.body, %for.body.preheader
; Rotating phis: %0 = b[i], %1 = a[i+1], %2 = a[i], reused from the previous
; iteration so each trip issues only two new loads (a[i+2] and b[i+1]).
%0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
%1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
%2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; x = a[i] * a[i+1] * a[i+2]
%mul = mul nsw i32 %1, %2
%3 = add nuw nsw i64 %indvars.iv, 2
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
%4 = load i32, i32* %arrayidx5, align 4
%mul6 = mul nsw i32 %mul, %4
; y = b[i+1] * b[i]
%arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
%5 = load i32, i32* %arrayidx11, align 4
%mul12 = mul nsw i32 %5, %0
; sum += (y > x) ? x : 0 -- the CMOV under test.
%cmp13 = icmp sgt i32 %mul12, %mul6
%cond = select i1 %cmp13, i32 %mul6, i32 0
%add14 = add nsw i32 %cond, %sum.032
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
attributes #0 = {"target-cpu"="skylake"}
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 5.0.0 (trunk)"}

View File

@ -3,13 +3,13 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that x86-cmov-converter optimization transform CMOV ;; This test checks that x86-cmov-converter optimization transform CMOV
;; instruction into branches when it is profitable. ;; instruction into branches when it is profitable.
;; There are 5 cases below: ;; There are 6 cases below:
;; 1. CmovInCriticalPath: ;; 1. CmovInCriticalPath:
;; CMOV depends on the condition and it is in the hot path. ;; CMOV depends on the condition and it is in the hot path.
;; Thus, it worths transforming. ;; Thus, it worths transforming.
;; ;;
;; 2. CmovNotInCriticalPath: ;; 2. CmovNotInCriticalPath:
;; similar test like in (1), just that CMOV is not in the hot path. ;; Similar test like in (1), just that CMOV is not in the hot path.
;; Thus, it does not worth transforming. ;; Thus, it does not worth transforming.
;; ;;
;; 3. MaxIndex: ;; 3. MaxIndex:
@ -26,16 +26,21 @@
;; Usually, binary search CMOV is not predicted. ;; Usually, binary search CMOV is not predicted.
;; Thus, it does not worth transforming. ;; Thus, it does not worth transforming.
;; ;;
;; 6. SmallGainPerLoop:
;; The gain percentage from converting CMOV into branch is acceptable,
;; however, the absolute gain is smaller than a threshold.
;; Thus, it is not worth transforming.
;;
;; Test was created using the following command line: ;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is: ;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) { ;; for (int i = 0; i < n; i++) {
;; int t = c[i]; ;; int t = c[i] + 1;
;; if (c[i] * a > b) ;; if (c[i] * a > b)
;; t = 10; ;; t = 10;
;; c[i] = t; ;; c[i] = (c[i] + 1) * t;
;; } ;; }
;;} ;;}
;; ;;
@ -87,6 +92,16 @@
;; } ;; }
;; return Curr->Val; ;; return Curr->Val;
;;} ;;}
;;
;;
;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) {
;; int t = c[i];
;; if (c[i] * a > b)
;; t = 10;
;; c[i] = t;
;; }
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%struct.Node = type { i32, %struct.Node*, %struct.Node* } %struct.Node = type { i32, %struct.Node*, %struct.Node* }
@ -111,10 +126,12 @@ for.body: ; preds = %for.body.preheader,
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4 %0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, 1
%mul = mul nsw i32 %0, %a %mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b %cmp3 = icmp sgt i32 %mul, %b
%. = select i1 %cmp3, i32 10, i32 %0 %. = select i1 %cmp3, i32 10, i32 %add
store i32 %., i32* %arrayidx, align 4 %mul7 = mul nsw i32 %., %add
store i32 %mul7, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body
@ -253,6 +270,35 @@ while.end: ; preds = %while.body, %entry
ret i32 %.lcssa ret i32 %.lcssa
} }
; CHECK-LABEL: SmallGainPerLoop
; CHECK-NOT: jg
; CHECK: cmovg
; For each i in [0, n): c[i] = (c[i] * a > b) ? 10 : c[i].
; The select (%.) is the CMOV candidate; per the CHECK lines above, the
; converter must keep the cmovg because the per-iteration gain, while an
; acceptable percentage, is below the absolute cycle threshold.
define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
entry:
; Guard: no work when n <= 0.
%cmp14 = icmp sgt i32 %n, 0
br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
; t = (c[i] * a > b) ? 10 : c[i] -- the CMOV under test.
%mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b
%. = select i1 %cmp3, i32 10, i32 %0
store i32 %., i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The following test checks that x86-cmov-converter optimization transforms ;; The following test checks that x86-cmov-converter optimization transforms
;; CMOV instructions into branch correctly. ;; CMOV instructions into branch correctly.