llvm-project/llvm/test/CodeGen/X86/pr33954.ll

; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This test checks that the x86-cmov-converter optimization does not transform
;; a CMOV instruction when the gain (in cycles) of converting it to a branch is
;; less than a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
;;
;; Test was created using the following command line:
;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
;; Where foo.c is:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;int bar(int *a, int *b, int n) {
;;  int sum = 0;
;;  for (int i = 0; i < n; ++i) {
;;    int x = a[i] * a[i+1] * a[i+2];
;;    int y = b[i] * b[i+1];
;;    sum += y > x ? x : 0;
;;  }
;;  return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Running the above function from the test driver below shows that the code
;; with CMOV is 25% faster than the code with a branch.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;#define N 10000
;;int A[N];
;;int B[N];
;;
;;
;;
;;int main () {
;;  for (int i=0; i< N; ++i) {
;;    A[i] = i%4;
;;    B[i] = i%5;
;;  }
;;  int sum = 0;
;;  for (int i=0; i< N*10; ++i)
;;    sum += bar(A, B, N);
;;  return sum;
;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; CHECK-NOT: jg
; CHECK: cmovle
; @bar - IR for the C function `bar` quoted in the header comment of this file.
;
; Parameters:
;   %a, %b - pointers to i32 arrays (marked nocapture readonly).
;   %n     - loop trip count (signed i32).
; Returns:
;   The i32 sum, over i in [0, n), of (y > x ? x : 0) where
;   x = a[i] * a[i+1] * a[i+2] and y = b[i] * b[i+1]. Returns 0 when n <= 0.
;
; The `select` in %for.body is the instruction under test: the FileCheck
; directives directly above this function expect it to be emitted as a
; cmovle, with no jg branch appearing before it in the output.
; Attribute group #0 (defined after the function) selects the target CPU.
define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
entry:
  ; Skip the loop entirely when %n <= 0.
  %cmp30 = icmp sgt i32 %n, 0
  br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Preload a[0], a[1] and b[0] so the loop only has to load one new element
  ; of each array per iteration.
  %.pre = load i32, i32* %a, align 4
  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
  %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
  %.pre35 = load i32, i32* %b, align 4
  ; Widen the trip count to i64 to match the 64-bit induction variable.
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ; Result is 0 if the loop was skipped, otherwise the final accumulated sum.
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
  ret i32 %sum.0.lcssa

for.body:                                         ; preds = %for.body, %for.body.preheader
  ; Values carried across iterations (i == %indvars.iv):
  ;   %0 = b[i]    (previous iteration's %5)
  ;   %1 = a[i+1]  (previous iteration's %4)
  ;   %2 = a[i]    (previous iteration's %1)
  %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
  %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
  %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; x = a[i] * a[i+1] * a[i+2]; only a[i+2] is loaded in this iteration.
  %mul = mul nsw i32 %1, %2
  %3 = add nuw nsw i64 %indvars.iv, 2
  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
  %4 = load i32, i32* %arrayidx5, align 4
  %mul6 = mul nsw i32 %mul, %4
  ; y = b[i] * b[i+1]; only b[i+1] is loaded in this iteration.
  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
  %5 = load i32, i32* %arrayidx11, align 4
  %mul12 = mul nsw i32 %5, %0
  ; sum += (y > x) ? x : 0  -- this select is the CMOV candidate under test.
  %cmp13 = icmp sgt i32 %mul12, %mul6
  %cond = select i1 %cmp13, i32 %mul6, i32 0
  %add14 = add nsw i32 %cond, %sum.032
  ; Leave the loop once i+1 reaches the widened trip count.
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Attribute group #0 is attached to @bar and sets its target CPU to skylake.
attributes #0 = {"target-cpu"="skylake"}

; Module-level metadata: two module flags (!0, !1) and one producer
; identification string (!2).
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

; Each module flag is a triple of {merge behavior, flag name, value}.
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{i32 7, !"PIC Level", i32 2}
; Version string of the clang that produced this IR.
!2 = !{!"clang version 5.0.0 (trunk)"}
[X86] Improved X86::CMOV to Branch heuristic. Resolved PR33954.
This patch contains two more constraints that aim to reduce the noise cases where we convert CMOV into branch for small gain, and end up spending more cycles due to overhead.
Differential Revision: https://reviews.llvm.org/D36081
llvm-svn: 310352
2017-08-08 20:17:56 +08:00

			`; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s`

			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`;; This test checks that the x86-cmov-converter optimization does not transform`
			`;; a CMOV instruction when the gain (in cycles) of converting it to a branch is`
			`;; less than a fixed threshold (measured for "-x86-cmov-converter-threshold=4").`
			`;;`
			`;; Test was created using the following command line:`
			`;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -`
			`;; Where foo.c is:`
			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`;;int bar(int *a, int *b, int n) {`
			`;;  int sum = 0;`
			`;;  for (int i = 0; i < n; ++i) {`
			`;;    int x = a[i] * a[i+1] * a[i+2];`
			`;;    int y = b[i] * b[i+1];`
			`;;    sum += y > x ? x : 0;`
			`;;  }`
			`;;  return sum;`
			`;;}`
			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`;; Running the above function from the test driver below shows that the code`
			`;; with CMOV is 25% faster than the code with a branch.`
			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`;;#define N 10000`
			`;;int A[N];`
			`;;int B[N];`
			`;;`
			`;;`
			`;;`
			`;;int main () {`
			`;; for (int i=0; i< N; ++i) {`
			`;; A[i] = i%4;`
			`;; B[i] = i%5;`
			`;; }`
			`;; int sum = 0;`
			`;; for (int i=0; i< N*10; ++i)`
			`;; sum += bar(A, B, N);`
			`;; return sum;`
			`;;}`
			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`

			`; CHECK-NOT: jg`
			`; CHECK: cmovle`
			`define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {`
			`entry:`
			`%cmp30 = icmp sgt i32 %n, 0`
			`br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup`

			`for.body.preheader: ; preds = %entry`
			`%.pre = load i32, i32* %a, align 4`
			`%arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1`
			`%.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4`
			`%.pre35 = load i32, i32* %b, align 4`
			`%wide.trip.count = zext i32 %n to i64`
			`br label %for.body`

			`for.cond.cleanup: ; preds = %for.body, %entry`
			`%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]`
			`ret i32 %sum.0.lcssa`

			`for.body: ; preds = %for.body, %for.body.preheader`
			`%0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]`
			`%1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]`
			`%2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]`
			`%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]`
			`%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]`
			`%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1`
			`%mul = mul nsw i32 %1, %2`
			`%3 = add nuw nsw i64 %indvars.iv, 2`
			`%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3`
			`%4 = load i32, i32* %arrayidx5, align 4`
			`%mul6 = mul nsw i32 %mul, %4`
			`%arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next`
			`%5 = load i32, i32* %arrayidx11, align 4`
			`%mul12 = mul nsw i32 %5, %0`
			`%cmp13 = icmp sgt i32 %mul12, %mul6`
			`%cond = select i1 %cmp13, i32 %mul6, i32 0`
			`%add14 = add nsw i32 %cond, %sum.032`
			`%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count`
			`br i1 %exitcond, label %for.cond.cleanup, label %for.body`
			`}`

			`attributes #0 = {"target-cpu"="skylake"}`

			`!llvm.module.flags = !{!0, !1}`
			`!llvm.ident = !{!2}`

			`!0 = !{i32 1, !"wchar_size", i32 2}`
			`!1 = !{i32 7, !"PIC Level", i32 2}`
			`!2 = !{!"clang version 5.0.0 (trunk)"}`