; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
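; In test1 and test2 the loaded value should be folded into the cmov as a
; memory operand (cmovael/cmovbl (%rcx), %eax) rather than emitted as a
; separate load.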
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
; CHECK-LABEL: test1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: cmovael (%rcx), %eax
; CHECK-NEXT: retq
entry:
%0 = lshr i32 %x, %n
%1 = and i32 %0, 1
%toBool = icmp eq i32 %1, 0
%v = load i32, i32* %vp
%.0 = select i1 %toBool, i32 %v, i32 12
ret i32 %.0
}
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
; CHECK-LABEL: test2:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: cmovbl (%rcx), %eax
; CHECK-NEXT: retq
entry:
%0 = lshr i32 %x, %n
%1 = and i32 %0, 1
%toBool = icmp eq i32 %1, 0
%v = load i32, i32* %vp
%.0 = select i1 %toBool, i32 12, i32 %v
ret i32 %.0
}
; x86's 32-bit cmov zeroes the high 32 bits of the destination. Make
; sure CodeGen takes advantage of that to avoid an unnecessary
; zero-extend (movl) after the cmov.
declare void @bar(i64) nounwind
define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; CHECK-LABEL: test3:
; CHECK: # BB#0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: testb $1, %dl
; CHECK-NEXT: cmovel %esi, %edi
; CHECK-NEXT: callq bar
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
%c = trunc i64 %a to i32
%d = trunc i64 %b to i32
%e = select i1 %p, i32 %c, i32 %d
%f = zext i32 %e to i64
call void @bar(i64 %f)
ret void
}
; CodeGen shouldn't try to do a setne after an expanded 8-bit conditional
; move without recomputing EFLAGS, because the expansion of the conditional
; move with control flow may clobber EFLAGS (e.g., with xor, to set the
; register to zero).
; The test is a little awkward; the important part is that there's a test before the
; setne.
; PR4814
@g_3 = external global i8
@g_96 = external global i8
@g_100 = external global i8
@_2E_str = external constant [15 x i8], align 1
define i1 @test4() nounwind {
; CHECK-LABEL: test4:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movsbl {{.*}}(%rip), %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shrb $7, %al
; CHECK-NEXT: movzbl %al, %ecx
; CHECK-NEXT: xorl $1, %ecx
; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
; CHECK-NEXT: sarl %cl, %edx
; CHECK-NEXT: movb {{.*}}(%rip), %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB3_2
; CHECK-NEXT: # BB#1: # %bb.i.i.i
; CHECK-NEXT: movb {{.*}}(%rip), %cl
; CHECK-NEXT: .LBB3_2: # %func_4.exit.i
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: setne %bl
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: je .LBB3_4
; CHECK-NEXT: # BB#3: # %func_4.exit.i
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: .LBB3_4: # %func_4.exit.i
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB3_7
; CHECK-NEXT: # BB#5: # %func_4.exit.i
; CHECK-NEXT: testb %bl, %bl
; CHECK-NEXT: jne .LBB3_7
; CHECK-NEXT: # BB#6: # %bb.i.i
; CHECK-NEXT: movb {{.*}}(%rip), %cl
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: .LBB3_7: # %func_1.exit
; CHECK-NEXT: movb %cl, {{.*}}(%rip)
; CHECK-NEXT: movzbl %cl, %esi
; CHECK-NEXT: movl $_2E_str, %edi
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: callq printf
; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
entry:
%0 = load i8, i8* @g_3, align 1
%1 = sext i8 %0 to i32
%.lobit.i = lshr i8 %0, 7
%tmp.i = zext i8 %.lobit.i to i32
%tmp.not.i = xor i32 %tmp.i, 1
%iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i
%retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8
%2 = icmp eq i8 %retval56.i.i, 0
%g_96.promoted.i = load i8, i8* @g_96
%3 = icmp eq i8 %g_96.promoted.i, 0
br i1 %3, label %func_4.exit.i, label %bb.i.i.i
bb.i.i.i:
%4 = load volatile i8, i8* @g_100, align 1
br label %func_4.exit.i
func_4.exit.i:
%.not.i = xor i1 %2, true
%brmerge.i = or i1 %3, %.not.i
%.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0
br i1 %brmerge.i, label %func_1.exit, label %bb.i.i
bb.i.i:
%5 = load volatile i8, i8* @g_100, align 1
br label %func_1.exit
func_1.exit:
%g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ]
%ret = phi i1 [ 0, %bb.i.i ], [ %.not.i, %func_4.exit.i ]
store i8 %g_96.tmp.0.i, i8* @g_96
%6 = zext i8 %g_96.tmp.0.i to i32
%7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind
ret i1 %ret
}
declare i32 @printf(i8* nocapture, ...) nounwind
; Should compile to setcc | -2.
; rdar://6668608
define i32 @test5(i32* nocapture %P) nounwind readonly {
; CHECK-LABEL: test5:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $41, (%rdi)
; CHECK-NEXT: setg %al
; CHECK-NEXT: orl $-2, %eax
; CHECK-NEXT: retq
entry:
%0 = load i32, i32* %P, align 4
%1 = icmp sgt i32 %0, 41
%iftmp.0.0 = select i1 %1, i32 -1, i32 -2
ret i32 %iftmp.0.0
}
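; Same idea as test5, but selecting between 4 and 13: this should compile to
; a setl feeding an LEA (4 + 9*setl) rather than a cmov or a branch.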
define i32 @test6(i32* nocapture %P) nounwind readonly {
; CHECK-LABEL: test6:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $42, (%rdi)
; CHECK-NEXT: setl %al
; CHECK-NEXT: leal 4(%rax,%rax,8), %eax
; CHECK-NEXT: retq
entry:
%0 = load i32, i32* %P, align 4
%1 = icmp sgt i32 %0, 41
%iftmp.0.0 = select i1 %1, i32 4, i32 13
ret i32 %iftmp.0.0
}
; Don't try to use a 16-bit conditional move to do an 8-bit select,
; because it isn't worth it. Just use a branch instead.
define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
; CHECK-LABEL: test7:
; CHECK: # BB#0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: jne .LBB6_2
; CHECK-NEXT: # BB#1:
; CHECK-NEXT: movl %edx, %esi
; CHECK-NEXT: .LBB6_2:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
%d = select i1 %c, i8 %a, i8 %b
ret i8 %d
}
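; Select of a NOT'd value against -1 (a signed-min pattern): this should still
; be lowered to not + test + cmovs rather than a branch.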
define i32 @smin(i32 %x) {
; CHECK-LABEL: smin:
; CHECK: # BB#0:
; CHECK-NEXT: notl %edi
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: cmovsl %edi, %eax
; CHECK-NEXT: retq
%not_x = xor i32 %x, -1
%1 = icmp slt i32 %not_x, -1
%sel = select i1 %1, i32 %not_x, i32 -1
ret i32 %sel
}