; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s --check-prefixes=32-ALL,32-GOOD-RA
; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=32-ALL,32-FAST-RA
; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA
; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=64-ALL,64-FAST-RA
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=64-ALL,64-FAST-RA-SAHF
; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s --check-prefixes=64-ALL,64-GOOD-RA-SAHF
; TODO: Re-enable -verify-machineinstrs once the 'if (!AXDead) // FIXME'
; in X86InstrInfo::copyPhysReg() is resolved.
declare i32 @foo()
declare i32 @bar(i64)
; In the following case, when using fast scheduling, we get a long chain of
; EFLAGS save/restore code due to a sequence of:
; cmpxchg8b (implicit-def eflags)
; eax = copy eflags
; adjcallstackdown32
; ...
; use of eax
; During PEI the adjcallstackdown32 is replaced with a subl, which
; clobbers eflags and effectively interferes with the eflags liveness
; interval. Is this a case we care about? Maybe not, considering the
; issue only shows up when the fast pre-regalloc scheduler is enforced.
; A more performant scheduler would move the adjcallstackdown32 out of
; the eflags liveness interval.
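;
; For reference, the EFLAGS spill idiom exercised throughout this file (a
; sketch of the expected pattern, not literal checked output; %reg stands
; for whichever register the allocator picks):
;   seto %al         # OF -> AL, since LAHF does not capture OF
;   lahf             # SF/ZF/AF/PF/CF -> AH
;   movl %eax, %reg  # stash the packed flags across the clobber
; and the matching restore:
;   movl %reg, %eax
;   addb $127, %al   # 127 + 1 overflows iff AL == 1, restoring OF
;   sahf             # AH -> SF/ZF/AF/PF/CF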
define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) nounwind {
; 32-GOOD-RA-LABEL: test_intervening_call:
; 32-GOOD-RA: # %bb.0: # %entry
; 32-GOOD-RA-NEXT: pushl %ebp
; 32-GOOD-RA-NEXT: movl %esp, %ebp
; 32-GOOD-RA-NEXT: pushl %ebx
; 32-GOOD-RA-NEXT: pushl %esi
; 32-GOOD-RA-NEXT: movl 12(%ebp), %eax
; 32-GOOD-RA-NEXT: movl 16(%ebp), %edx
; 32-GOOD-RA-NEXT: movl 20(%ebp), %ebx
; 32-GOOD-RA-NEXT: movl 24(%ebp), %ecx
; 32-GOOD-RA-NEXT: movl 8(%ebp), %esi
; 32-GOOD-RA-NEXT: lock cmpxchg8b (%esi)
; 32-GOOD-RA-NEXT: pushl %eax
; 32-GOOD-RA-NEXT: seto %al
; 32-GOOD-RA-NEXT: lahf
; 32-GOOD-RA-NEXT: movl %eax, %esi
; 32-GOOD-RA-NEXT: popl %eax
; 32-GOOD-RA-NEXT: subl $8, %esp
; 32-GOOD-RA-NEXT: pushl %edx
; 32-GOOD-RA-NEXT: pushl %eax
; 32-GOOD-RA-NEXT: calll bar
; 32-GOOD-RA-NEXT: addl $16, %esp
; 32-GOOD-RA-NEXT: movl %esi, %eax
; 32-GOOD-RA-NEXT: addb $127, %al
; 32-GOOD-RA-NEXT: sahf
; 32-GOOD-RA-NEXT: jne .LBB0_3
; 32-GOOD-RA-NEXT: # %bb.1: # %t
; 32-GOOD-RA-NEXT: movl $42, %eax
; 32-GOOD-RA-NEXT: jmp .LBB0_2
; 32-GOOD-RA-NEXT: .LBB0_3: # %f
; 32-GOOD-RA-NEXT: xorl %eax, %eax
; 32-GOOD-RA-NEXT: .LBB0_2: # %t
; 32-GOOD-RA-NEXT: xorl %edx, %edx
; 32-GOOD-RA-NEXT: popl %esi
; 32-GOOD-RA-NEXT: popl %ebx
; 32-GOOD-RA-NEXT: popl %ebp
; 32-GOOD-RA-NEXT: retl
;
; 32-FAST-RA-LABEL: test_intervening_call:
; 32-FAST-RA: # %bb.0: # %entry
; 32-FAST-RA-NEXT: pushl %ebp
; 32-FAST-RA-NEXT: movl %esp, %ebp
; 32-FAST-RA-NEXT: pushl %ebx
; 32-FAST-RA-NEXT: pushl %esi
; 32-FAST-RA-NEXT: movl 8(%ebp), %esi
; 32-FAST-RA-NEXT: movl 20(%ebp), %ebx
; 32-FAST-RA-NEXT: movl 24(%ebp), %ecx
; 32-FAST-RA-NEXT: movl 12(%ebp), %eax
; 32-FAST-RA-NEXT: movl 16(%ebp), %edx
; 32-FAST-RA-NEXT: lock cmpxchg8b (%esi)
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: seto %al
; 32-FAST-RA-NEXT: lahf
; 32-FAST-RA-NEXT: movl %eax, %ecx
; 32-FAST-RA-NEXT: popl %eax
; 32-FAST-RA-NEXT: subl $8, %esp
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: movl %ecx, %eax
; 32-FAST-RA-NEXT: addb $127, %al
; 32-FAST-RA-NEXT: sahf
; 32-FAST-RA-NEXT: popl %eax
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: seto %al
; 32-FAST-RA-NEXT: lahf
; 32-FAST-RA-NEXT: movl %eax, %esi
; 32-FAST-RA-NEXT: popl %eax
; 32-FAST-RA-NEXT: pushl %edx
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: calll bar
; 32-FAST-RA-NEXT: addl $16, %esp
; 32-FAST-RA-NEXT: movl %esi, %eax
; 32-FAST-RA-NEXT: addb $127, %al
; 32-FAST-RA-NEXT: sahf
; 32-FAST-RA-NEXT: jne .LBB0_3
; 32-FAST-RA-NEXT: # %bb.1: # %t
; 32-FAST-RA-NEXT: movl $42, %eax
; 32-FAST-RA-NEXT: jmp .LBB0_2
; 32-FAST-RA-NEXT: .LBB0_3: # %f
; 32-FAST-RA-NEXT: xorl %eax, %eax
; 32-FAST-RA-NEXT: .LBB0_2: # %t
; 32-FAST-RA-NEXT: xorl %edx, %edx
; 32-FAST-RA-NEXT: popl %esi
; 32-FAST-RA-NEXT: popl %ebx
; 32-FAST-RA-NEXT: popl %ebp
; 32-FAST-RA-NEXT: retl
;
; 64-GOOD-RA-LABEL: test_intervening_call:
; 64-GOOD-RA: # %bb.0: # %entry
; 64-GOOD-RA-NEXT: pushq %rbp
; 64-GOOD-RA-NEXT: movq %rsp, %rbp
; 64-GOOD-RA-NEXT: pushq %rbx
; 64-GOOD-RA-NEXT: pushq %rax
; 64-GOOD-RA-NEXT: movq %rsi, %rax
; 64-GOOD-RA-NEXT: lock cmpxchgq %rdx, (%rdi)
; 64-GOOD-RA-NEXT: pushfq
; 64-GOOD-RA-NEXT: popq %rbx
; 64-GOOD-RA-NEXT: movq %rax, %rdi
; 64-GOOD-RA-NEXT: callq bar
; 64-GOOD-RA-NEXT: pushq %rbx
; 64-GOOD-RA-NEXT: popfq
; 64-GOOD-RA-NEXT: jne .LBB0_3
; 64-GOOD-RA-NEXT: # %bb.1: # %t
; 64-GOOD-RA-NEXT: movl $42, %eax
; 64-GOOD-RA-NEXT: jmp .LBB0_2
; 64-GOOD-RA-NEXT: .LBB0_3: # %f
; 64-GOOD-RA-NEXT: xorl %eax, %eax
; 64-GOOD-RA-NEXT: .LBB0_2: # %t
; 64-GOOD-RA-NEXT: addq $8, %rsp
; 64-GOOD-RA-NEXT: popq %rbx
; 64-GOOD-RA-NEXT: popq %rbp
; 64-GOOD-RA-NEXT: retq
;
; 64-FAST-RA-LABEL: test_intervening_call:
; 64-FAST-RA: # %bb.0: # %entry
; 64-FAST-RA-NEXT: pushq %rbp
; 64-FAST-RA-NEXT: movq %rsp, %rbp
; 64-FAST-RA-NEXT: pushq %rbx
; 64-FAST-RA-NEXT: pushq %rax
; 64-FAST-RA-NEXT: movq %rsi, %rax
; 64-FAST-RA-NEXT: lock cmpxchgq %rdx, (%rdi)
; 64-FAST-RA-NEXT: pushfq
; 64-FAST-RA-NEXT: popq %rbx
; 64-FAST-RA-NEXT: movq %rax, %rdi
; 64-FAST-RA-NEXT: callq bar
; 64-FAST-RA-NEXT: pushq %rbx
; 64-FAST-RA-NEXT: popfq
; 64-FAST-RA-NEXT: jne .LBB0_3
; 64-FAST-RA-NEXT: # %bb.1: # %t
; 64-FAST-RA-NEXT: movl $42, %eax
; 64-FAST-RA-NEXT: jmp .LBB0_2
; 64-FAST-RA-NEXT: .LBB0_3: # %f
; 64-FAST-RA-NEXT: xorl %eax, %eax
; 64-FAST-RA-NEXT: .LBB0_2: # %t
; 64-FAST-RA-NEXT: addq $8, %rsp
; 64-FAST-RA-NEXT: popq %rbx
; 64-FAST-RA-NEXT: popq %rbp
; 64-FAST-RA-NEXT: retq
;
; 64-GOOD-RA-SAHF-LABEL: test_intervening_call:
; 64-GOOD-RA-SAHF: # %bb.0: # %entry
; 64-GOOD-RA-SAHF-NEXT: pushq %rbp
; 64-GOOD-RA-SAHF-NEXT: movq %rsp, %rbp
; 64-GOOD-RA-SAHF-NEXT: pushq %rbx
; 64-GOOD-RA-SAHF-NEXT: pushq %rax
; 64-GOOD-RA-SAHF-NEXT: movq %rsi, %rax
; 64-GOOD-RA-SAHF-NEXT: lock cmpxchgq %rdx, (%rdi)
; 64-GOOD-RA-SAHF-NEXT: pushq %rax
; 64-GOOD-RA-SAHF-NEXT: seto %al
; 64-GOOD-RA-SAHF-NEXT: lahf
; 64-GOOD-RA-SAHF-NEXT: movq %rax, %rbx
; 64-GOOD-RA-SAHF-NEXT: popq %rax
; 64-GOOD-RA-SAHF-NEXT: movq %rax, %rdi
; 64-GOOD-RA-SAHF-NEXT: callq bar
; 64-GOOD-RA-SAHF-NEXT: movq %rbx, %rax
; 64-GOOD-RA-SAHF-NEXT: addb $127, %al
; 64-GOOD-RA-SAHF-NEXT: sahf
; 64-GOOD-RA-SAHF-NEXT: jne .LBB0_3
; 64-GOOD-RA-SAHF-NEXT: # %bb.1: # %t
; 64-GOOD-RA-SAHF-NEXT: movl $42, %eax
; 64-GOOD-RA-SAHF-NEXT: jmp .LBB0_2
; 64-GOOD-RA-SAHF-NEXT: .LBB0_3: # %f
; 64-GOOD-RA-SAHF-NEXT: xorl %eax, %eax
; 64-GOOD-RA-SAHF-NEXT: .LBB0_2: # %t
; 64-GOOD-RA-SAHF-NEXT: addq $8, %rsp
; 64-GOOD-RA-SAHF-NEXT: popq %rbx
; 64-GOOD-RA-SAHF-NEXT: popq %rbp
; 64-GOOD-RA-SAHF-NEXT: retq
;
; 64-FAST-RA-SAHF-LABEL: test_intervening_call:
; 64-FAST-RA-SAHF: # %bb.0: # %entry
; 64-FAST-RA-SAHF-NEXT: pushq %rbp
; 64-FAST-RA-SAHF-NEXT: movq %rsp, %rbp
; 64-FAST-RA-SAHF-NEXT: pushq %rbx
; 64-FAST-RA-SAHF-NEXT: pushq %rax
; 64-FAST-RA-SAHF-NEXT: movq %rsi, %rax
; 64-FAST-RA-SAHF-NEXT: lock cmpxchgq %rdx, (%rdi)
; 64-FAST-RA-SAHF-NEXT: pushq %rax
; 64-FAST-RA-SAHF-NEXT: seto %al
; 64-FAST-RA-SAHF-NEXT: lahf
; 64-FAST-RA-SAHF-NEXT: movq %rax, %rbx
; 64-FAST-RA-SAHF-NEXT: popq %rax
; 64-FAST-RA-SAHF-NEXT: movq %rax, %rdi
; 64-FAST-RA-SAHF-NEXT: callq bar
; 64-FAST-RA-SAHF-NEXT: movq %rbx, %rax
; 64-FAST-RA-SAHF-NEXT: addb $127, %al
; 64-FAST-RA-SAHF-NEXT: sahf
; 64-FAST-RA-SAHF-NEXT: jne .LBB0_3
; 64-FAST-RA-SAHF-NEXT: # %bb.1: # %t
; 64-FAST-RA-SAHF-NEXT: movl $42, %eax
; 64-FAST-RA-SAHF-NEXT: jmp .LBB0_2
; 64-FAST-RA-SAHF-NEXT: .LBB0_3: # %f
; 64-FAST-RA-SAHF-NEXT: xorl %eax, %eax
; 64-FAST-RA-SAHF-NEXT: .LBB0_2: # %t
; 64-FAST-RA-SAHF-NEXT: addq $8, %rsp
; 64-FAST-RA-SAHF-NEXT: popq %rbx
; 64-FAST-RA-SAHF-NEXT: popq %rbp
; 64-FAST-RA-SAHF-NEXT: retq
entry:
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
%v = extractvalue { i64, i1 } %cx, 0
%p = extractvalue { i64, i1 } %cx, 1
call i32 @bar(i64 %v)
br i1 %p, label %t, label %f
t:
ret i64 42
f:
ret i64 0
}
; Interesting because it produces an EFLAGS clobber without any function calls.
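;
; A sketch of the interesting shape (expected pattern, not literal checked
; output): the xor clobbers EFLAGS inside the loop, yet no flags spill is
; needed because the jne consumes the cmpxchg flags immediately.
;   .LBB_loop:
;     movl %ecx, %eax
;     xorl %ecx, %ecx            # clobbers EFLAGS, no call involved
;     testl %eax, %eax
;     je .LBB_loop
;     lock cmpxchgl %eax, (%rdi)
;     jne ...                    # branches on the cmpxchg flags directly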
define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) nounwind {
; 32-ALL-LABEL: test_control_flow:
; 32-ALL: # %bb.0: # %entry
; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-ALL-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; 32-ALL-NEXT: jle .LBB1_6
; 32-ALL-NEXT: # %bb.1: # %loop_start
; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-ALL-NEXT: .p2align 4, 0x90
; 32-ALL-NEXT: .LBB1_2: # %while.condthread-pre-split.i
; 32-ALL-NEXT: # =>This Loop Header: Depth=1
; 32-ALL-NEXT: # Child Loop BB1_3 Depth 2
; 32-ALL-NEXT: movl (%ecx), %edx
; 32-ALL-NEXT: .p2align 4, 0x90
; 32-ALL-NEXT: .LBB1_3: # %while.cond.i
; 32-ALL-NEXT: # Parent Loop BB1_2 Depth=1
; 32-ALL-NEXT: # => This Inner Loop Header: Depth=2
; 32-ALL-NEXT: movl %edx, %eax
; 32-ALL-NEXT: xorl %edx, %edx
; 32-ALL-NEXT: testl %eax, %eax
; 32-ALL-NEXT: je .LBB1_3
; 32-ALL-NEXT: # %bb.4: # %while.body.i
; 32-ALL-NEXT: # in Loop: Header=BB1_2 Depth=1
; 32-ALL-NEXT: lock cmpxchgl %eax, (%ecx)
; 32-ALL-NEXT: jne .LBB1_2
; 32-ALL-NEXT: # %bb.5:
; 32-ALL-NEXT: xorl %eax, %eax
; 32-ALL-NEXT: .LBB1_6: # %cond.end
; 32-ALL-NEXT: retl
;
; 64-ALL-LABEL: test_control_flow:
; 64-ALL: # %bb.0: # %entry
; 64-ALL-NEXT: cmpl %edx, %esi
; 64-ALL-NEXT: jle .LBB1_5
; 64-ALL-NEXT: .p2align 4, 0x90
; 64-ALL-NEXT: .LBB1_1: # %while.condthread-pre-split.i
; 64-ALL-NEXT: # =>This Loop Header: Depth=1
; 64-ALL-NEXT: # Child Loop BB1_2 Depth 2
; 64-ALL-NEXT: movl (%rdi), %ecx
; 64-ALL-NEXT: .p2align 4, 0x90
; 64-ALL-NEXT: .LBB1_2: # %while.cond.i
; 64-ALL-NEXT: # Parent Loop BB1_1 Depth=1
; 64-ALL-NEXT: # => This Inner Loop Header: Depth=2
; 64-ALL-NEXT: movl %ecx, %eax
; 64-ALL-NEXT: xorl %ecx, %ecx
; 64-ALL-NEXT: testl %eax, %eax
; 64-ALL-NEXT: je .LBB1_2
; 64-ALL-NEXT: # %bb.3: # %while.body.i
; 64-ALL-NEXT: # in Loop: Header=BB1_1 Depth=1
; 64-ALL-NEXT: lock cmpxchgl %eax, (%rdi)
; 64-ALL-NEXT: jne .LBB1_1
; 64-ALL-NEXT: # %bb.4:
; 64-ALL-NEXT: xorl %esi, %esi
; 64-ALL-NEXT: .LBB1_5: # %cond.end
; 64-ALL-NEXT: movl %esi, %eax
; 64-ALL-NEXT: retq
entry:
%cmp = icmp sgt i32 %i, %j
br i1 %cmp, label %loop_start, label %cond.end
loop_start:
br label %while.condthread-pre-split.i
while.condthread-pre-split.i:
%.pr.i = load i32, i32* %p, align 4
br label %while.cond.i
while.cond.i:
%0 = phi i32 [ %.pr.i, %while.condthread-pre-split.i ], [ 0, %while.cond.i ]
%tobool.i = icmp eq i32 %0, 0
br i1 %tobool.i, label %while.cond.i, label %while.body.i
while.body.i:
%.lcssa = phi i32 [ %0, %while.cond.i ]
%1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst
%2 = extractvalue { i32, i1 } %1, 1
br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i
cond.end.loopexit:
br label %cond.end
cond.end:
%cond = phi i32 [ %i, %entry ], [ 0, %cond.end.loopexit ]
ret i32 %cond
}
; This one is an interesting case because CMOV doesn't have a chain
; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here.
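;
; Because the cmov consumes EFLAGS as a plain data dependency and the call
; to foo clobbers them, the flags must be carried across the call. A sketch
; of the expected 64-bit pattern without SAHF support (mirroring the checks
; below, not literal output):
;   lock cmpxchgl %edx, (%rdi)
;   pushfq                     # save EFLAGS before the clobbering call
;   popq %r14
;   callq foo
;   pushq %r14
;   popfq                      # rematerialize EFLAGS for the cmov
;   cmovel %ebx, %eax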
define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) nounwind {
; 32-GOOD-RA-LABEL: test_feed_cmov:
; 32-GOOD-RA: # %bb.0: # %entry
; 32-GOOD-RA-NEXT: pushl %ebp
; 32-GOOD-RA-NEXT: movl %esp, %ebp
; 32-GOOD-RA-NEXT: pushl %edi
; 32-GOOD-RA-NEXT: pushl %esi
; 32-GOOD-RA-NEXT: movl 12(%ebp), %eax
; 32-GOOD-RA-NEXT: movl 16(%ebp), %esi
; 32-GOOD-RA-NEXT: movl 8(%ebp), %ecx
; 32-GOOD-RA-NEXT: lock cmpxchgl %esi, (%ecx)
; 32-GOOD-RA-NEXT: seto %al
; 32-GOOD-RA-NEXT: lahf
; 32-GOOD-RA-NEXT: movl %eax, %edi
; 32-GOOD-RA-NEXT: calll foo
; 32-GOOD-RA-NEXT: pushl %eax
; 32-GOOD-RA-NEXT: movl %edi, %eax
; 32-GOOD-RA-NEXT: addb $127, %al
; 32-GOOD-RA-NEXT: sahf
; 32-GOOD-RA-NEXT: popl %eax
; 32-GOOD-RA-NEXT: je .LBB2_2
; 32-GOOD-RA-NEXT: # %bb.1: # %entry
; 32-GOOD-RA-NEXT: movl %eax, %esi
; 32-GOOD-RA-NEXT: .LBB2_2: # %entry
; 32-GOOD-RA-NEXT: movl %esi, %eax
; 32-GOOD-RA-NEXT: popl %esi
; 32-GOOD-RA-NEXT: popl %edi
; 32-GOOD-RA-NEXT: popl %ebp
; 32-GOOD-RA-NEXT: retl
;
; 32-FAST-RA-LABEL: test_feed_cmov:
; 32-FAST-RA: # %bb.0: # %entry
; 32-FAST-RA-NEXT: pushl %ebp
; 32-FAST-RA-NEXT: movl %esp, %ebp
; 32-FAST-RA-NEXT: pushl %edi
; 32-FAST-RA-NEXT: pushl %esi
; 32-FAST-RA-NEXT: movl 8(%ebp), %ecx
; 32-FAST-RA-NEXT: movl 16(%ebp), %esi
; 32-FAST-RA-NEXT: movl 12(%ebp), %eax
; 32-FAST-RA-NEXT: lock cmpxchgl %esi, (%ecx)
; 32-FAST-RA-NEXT: seto %al
; 32-FAST-RA-NEXT: lahf
; 32-FAST-RA-NEXT: movl %eax, %edi
; 32-FAST-RA-NEXT: calll foo
; 32-FAST-RA-NEXT: pushl %eax
; 32-FAST-RA-NEXT: movl %edi, %eax
; 32-FAST-RA-NEXT: addb $127, %al
; 32-FAST-RA-NEXT: sahf
; 32-FAST-RA-NEXT: popl %eax
; 32-FAST-RA-NEXT: je .LBB2_2
; 32-FAST-RA-NEXT: # %bb.1: # %entry
; 32-FAST-RA-NEXT: movl %eax, %esi
; 32-FAST-RA-NEXT: .LBB2_2: # %entry
; 32-FAST-RA-NEXT: movl %esi, %eax
; 32-FAST-RA-NEXT: popl %esi
; 32-FAST-RA-NEXT: popl %edi
; 32-FAST-RA-NEXT: popl %ebp
; 32-FAST-RA-NEXT: retl
;
; 64-GOOD-RA-LABEL: test_feed_cmov:
; 64-GOOD-RA: # %bb.0: # %entry
; 64-GOOD-RA-NEXT: pushq %rbp
; 64-GOOD-RA-NEXT: movq %rsp, %rbp
; 64-GOOD-RA-NEXT: pushq %r14
; 64-GOOD-RA-NEXT: pushq %rbx
; 64-GOOD-RA-NEXT: movl %edx, %ebx
; 64-GOOD-RA-NEXT: movl %esi, %eax
; 64-GOOD-RA-NEXT: lock cmpxchgl %edx, (%rdi)
; 64-GOOD-RA-NEXT: pushfq
; 64-GOOD-RA-NEXT: popq %r14
; 64-GOOD-RA-NEXT: callq foo
; 64-GOOD-RA-NEXT: pushq %r14
; 64-GOOD-RA-NEXT: popfq
; 64-GOOD-RA-NEXT: cmovel %ebx, %eax
; 64-GOOD-RA-NEXT: popq %rbx
; 64-GOOD-RA-NEXT: popq %r14
; 64-GOOD-RA-NEXT: popq %rbp
; 64-GOOD-RA-NEXT: retq
;
; 64-FAST-RA-LABEL: test_feed_cmov:
; 64-FAST-RA: # %bb.0: # %entry
; 64-FAST-RA-NEXT: pushq %rbp
; 64-FAST-RA-NEXT: movq %rsp, %rbp
; 64-FAST-RA-NEXT: pushq %r14
; 64-FAST-RA-NEXT: pushq %rbx
; 64-FAST-RA-NEXT: movl %edx, %ebx
; 64-FAST-RA-NEXT: movl %esi, %eax
; 64-FAST-RA-NEXT: lock cmpxchgl %edx, (%rdi)
; 64-FAST-RA-NEXT: pushfq
; 64-FAST-RA-NEXT: popq %r14
; 64-FAST-RA-NEXT: callq foo
; 64-FAST-RA-NEXT: pushq %r14
; 64-FAST-RA-NEXT: popfq
; 64-FAST-RA-NEXT: cmovel %ebx, %eax
; 64-FAST-RA-NEXT: popq %rbx
; 64-FAST-RA-NEXT: popq %r14
; 64-FAST-RA-NEXT: popq %rbp
; 64-FAST-RA-NEXT: retq
;
; 64-GOOD-RA-SAHF-LABEL: test_feed_cmov:
; 64-GOOD-RA-SAHF: # %bb.0: # %entry
; 64-GOOD-RA-SAHF-NEXT: pushq %rbp
; 64-GOOD-RA-SAHF-NEXT: movq %rsp, %rbp
; 64-GOOD-RA-SAHF-NEXT: pushq %r14
; 64-GOOD-RA-SAHF-NEXT: pushq %rbx
; 64-GOOD-RA-SAHF-NEXT: movl %edx, %ebx
; 64-GOOD-RA-SAHF-NEXT: movl %esi, %eax
; 64-GOOD-RA-SAHF-NEXT: lock cmpxchgl %edx, (%rdi)
; 64-GOOD-RA-SAHF-NEXT: seto %al
; 64-GOOD-RA-SAHF-NEXT: lahf
; 64-GOOD-RA-SAHF-NEXT: movq %rax, %r14
; 64-GOOD-RA-SAHF-NEXT: callq foo
; 64-GOOD-RA-SAHF-NEXT: pushq %rax
; 64-GOOD-RA-SAHF-NEXT: movq %r14, %rax
; 64-GOOD-RA-SAHF-NEXT: addb $127, %al
; 64-GOOD-RA-SAHF-NEXT: sahf
; 64-GOOD-RA-SAHF-NEXT: popq %rax
; 64-GOOD-RA-SAHF-NEXT: cmovel %ebx, %eax
; 64-GOOD-RA-SAHF-NEXT: popq %rbx
; 64-GOOD-RA-SAHF-NEXT: popq %r14
; 64-GOOD-RA-SAHF-NEXT: popq %rbp
; 64-GOOD-RA-SAHF-NEXT: retq
;
; 64-FAST-RA-SAHF-LABEL: test_feed_cmov:
; 64-FAST-RA-SAHF: # %bb.0: # %entry
; 64-FAST-RA-SAHF-NEXT: pushq %rbp
; 64-FAST-RA-SAHF-NEXT: movq %rsp, %rbp
; 64-FAST-RA-SAHF-NEXT: pushq %r14
; 64-FAST-RA-SAHF-NEXT: pushq %rbx
; 64-FAST-RA-SAHF-NEXT: movl %edx, %ebx
; 64-FAST-RA-SAHF-NEXT: movl %esi, %eax
; 64-FAST-RA-SAHF-NEXT: lock cmpxchgl %edx, (%rdi)
; 64-FAST-RA-SAHF-NEXT: seto %al
; 64-FAST-RA-SAHF-NEXT: lahf
; 64-FAST-RA-SAHF-NEXT: movq %rax, %r14
; 64-FAST-RA-SAHF-NEXT: callq foo
; 64-FAST-RA-SAHF-NEXT: pushq %rax
; 64-FAST-RA-SAHF-NEXT: movq %r14, %rax
; 64-FAST-RA-SAHF-NEXT: addb $127, %al
; 64-FAST-RA-SAHF-NEXT: sahf
; 64-FAST-RA-SAHF-NEXT: popq %rax
; 64-FAST-RA-SAHF-NEXT: cmovel %ebx, %eax
; 64-FAST-RA-SAHF-NEXT: popq %rbx
; 64-FAST-RA-SAHF-NEXT: popq %r14
; 64-FAST-RA-SAHF-NEXT: popq %rbp
; 64-FAST-RA-SAHF-NEXT: retq
entry:
%res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
%success = extractvalue { i32, i1 } %res, 1
%rhs = call i32 @foo()
%ret = select i1 %success, i32 %new, i32 %rhs
ret i32 %ret
}