From 6fa6813aeccadb8f0780aa0ee4468ccc443122d8 Mon Sep 17 00:00:00 2001 From: Amjad Aboud Date: Tue, 8 Aug 2017 12:17:56 +0000 Subject: [PATCH] [X86] Improved X86::CMOV to Branch heuristic. Resolved PR33954. This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead. Differential Revision: https://reviews.llvm.org/D36081 llvm-svn: 310352 --- llvm/lib/Target/X86/X86CmovConversion.cpp | 22 ++++- llvm/test/CodeGen/X86/pr33954.ll | 91 +++++++++++++++++++++ llvm/test/CodeGen/X86/x86-cmov-converter.ll | 58 +++++++++++-- 3 files changed, 161 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr33954.ll diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp index bfc834435de5..b9b91d629db9 100644 --- a/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -67,6 +67,11 @@ static cl::opt cl::desc("Enable the X86 cmov-to-branch optimization."), cl::init(true), cl::Hidden); +static cl::opt<unsigned> + GainCycleThreshold("x86-cmov-converter-threshold", + cl::desc("Minimum gain per loop (in cycles) threshold."), + cl::init(4), cl::Hidden); + /// Converts X86 cmov instructions into branches when profitable. class X86CmovConverterPass : public MachineFunctionPass { public: @@ -389,19 +394,28 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( // Critical-path is iteration dependent - there is dependency of // critical-path instructions on critical-path instructions of // previous iteration. - // Thus, it is required to check the gradient of the gain - the - // change in Depth-Diff compared to the change in Loop-Depth between - // 1st and 2nd iterations. 
+ // Thus, check the gain percent of the 2nd iteration (similar to the + // previous case), but it is also required to check the gradient of + // the gain - the change in Depth-Diff compared to the change in + // Loop-Depth between 1st and 2nd iterations. // To be conservative, the gradient need to be at least 50%. // + // In addition, in order not to optimize loops with very small gain, the + // gain (in cycles) after 2nd iteration should not be less than a given + // threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply. + // // If loop is not worth optimizing, remove all CMOV-group-candidates. //===--------------------------------------------------------------------===// + if (Diff[1] < GainCycleThreshold) + return false; + bool WorthOptLoop = false; if (Diff[1] == Diff[0]) WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; else if (Diff[1] > Diff[0]) WorthOptLoop = - (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); + (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) && + (Diff[1] * 8 >= LoopDepth[1].Depth); if (!WorthOptLoop) return false; diff --git a/llvm/test/CodeGen/X86/pr33954.ll b/llvm/test/CodeGen/X86/pr33954.ll new file mode 100644 index 000000000000..084873a7dc55 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr33954.ll @@ -0,0 +1,91 @@ +; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; This test checks that x86-cmov-converter optimization does not transform CMOV +;; instruction when the gain (in cycles) of converting to branch is less than +;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4"). 
+;; +;; Test was created using the following command line: +;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - +;; Where foo.c is: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;int bar(int *a, int *b, int n) { +;; int sum = 0; +;; for (int i = 0; i < n; ++i) { +;; int x = a[i] * a[i+1] * a[i+2]; +;; int y = b[i] * b[i+1]; +;; sum += y > x ? x : 0; +;; } +;; return sum; +;;} +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Adding a test to the above function shows code with CMOV is 25% faster than +;; the code with branch. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;#define N 10000 +;;int A[N]; +;;int B[N]; +;; +;; +;; +;;int main () { +;; for (int i=0; i< N; ++i) { +;; A[i] = i%4; +;; B[i] = i%5; +;; } +;; int sum = 0; +;; for (int i=0; i< N*10; ++i) +;; sum += bar(A, B, N); +;; return sum; +;;} +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-NOT: jg +; CHECK: cmovle +define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 { +entry: + %cmp30 = icmp sgt i32 %n, 0 + br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %.pre = load i32, i32* %a, align 4 + %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1 + %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4 + %.pre35 = load i32, i32* %b, align 4 + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %for.body, %for.body.preheader + %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ] + %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ] + %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ] + %indvars.iv = phi i64 [ 0, 
%for.body.preheader ], [ %indvars.iv.next, %for.body ] + %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %mul = mul nsw i32 %1, %2 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3 + %4 = load i32, i32* %arrayidx5, align 4 + %mul6 = mul nsw i32 %mul, %4 + %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next + %5 = load i32, i32* %arrayidx11, align 4 + %mul12 = mul nsw i32 %5, %0 + %cmp13 = icmp sgt i32 %mul12, %mul6 + %cond = select i1 %cmp13, i32 %mul6, i32 0 + %add14 = add nsw i32 %cond, %sum.032 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = {"target-cpu"="skylake"} + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 2} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 5.0.0 (trunk)"} diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll index 39877c14429f..1837a8d49851 100644 --- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll +++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll @@ -3,13 +3,13 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This test checks that x86-cmov-converter optimization transform CMOV ;; instruction into branches when it is profitable. -;; There are 5 cases below: +;; There are 6 cases below: ;; 1. CmovInCriticalPath: ;; CMOV depends on the condition and it is in the hot path. ;; Thus, it worths transforming. ;; ;; 2. CmovNotInCriticalPath: -;; similar test like in (1), just that CMOV is not in the hot path. +;; Similar test like in (1), just that CMOV is not in the hot path. ;; Thus, it does not worth transforming. ;; ;; 3. MaxIndex: @@ -26,16 +26,21 @@ ;; Usually, binary search CMOV is not predicted. ;; Thus, it does not worth transforming. ;; +;; 6. 
SmallGainPerLoop: +;; The gain percentage from converting CMOV into branch is acceptable, +;; however, the absolute gain is smaller than a threshold. +;; Thus, it does not worth transforming. +;; ;; Test was created using the following command line: ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - ;; Where foo.c is: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { ;; for (int i = 0; i < n; i++) { -;; int t = c[i]; +;; int t = c[i] + 1; ;; if (c[i] * a > b) ;; t = 10; -;; c[i] = t; +;; c[i] = (c[i] + 1) * t; ;; } ;;} ;; @@ -87,6 +92,16 @@ ;; } ;; return Curr->Val; ;;} +;; +;; +;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; } +;;} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %struct.Node = type { i32, %struct.Node*, %struct.Node* } @@ -111,10 +126,12 @@ for.body: ; preds = %for.body.preheader, %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, 1 %mul = mul nsw i32 %0, %a %cmp3 = icmp sgt i32 %mul, %b - %. = select i1 %cmp3, i32 10, i32 %0 - store i32 %., i32* %arrayidx, align 4 + %. 
= select i1 %cmp3, i32 10, i32 %add + %mul7 = mul nsw i32 %., %add + store i32 %mul7, i32* %arrayidx, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond, label %for.cond.cleanup, label %for.body @@ -253,6 +270,35 @@ while.end: ; preds = %while.body, %entry ret i32 %.lcssa } +; CHECK-LABEL: SmallGainPerLoop +; CHECK-NOT: jg +; CHECK: cmovg + +define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 0 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. = select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; The following test checks that x86-cmov-converter optimization transforms ;; CMOV instructions into branch correctly.