; llvm-project/llvm/test/CodeGen/X86/atomic_idempotent.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32

; On x86, an atomic rmw operation that does not modify the value in memory
; (such as atomic add 0) can be replaced by an mfence followed by a mov.
; This is explained (with the motivation for such an optimization) in
; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf

; add8 - `atomicrmw add` of the constant 0 on an i8 location, monotonic ordering.
; Adding 0 leaves the value in memory unchanged, so the assertions below expect
; an mfence followed by a plain byte load instead of a locked instruction. The
; i686 run additionally loads the pointer argument from the stack first.
; Returns the value read from %p.
define i8 @add8(i8* %p) {
; X64-LABEL: add8:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movb (%rdi), %al
; X64-NEXT:    retq
;
; X32-LABEL: add8:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    mfence
; X32-NEXT:    movb (%eax), %al
; X32-NEXT:    retl
  %1 = atomicrmw add i8* %p, i8 0 monotonic
  ret i8 %1
}

; or16 - `atomicrmw or` of the constant 0 on an i16 location, acquire ordering.
; OR with 0 leaves the value in memory unchanged, so the assertions below expect
; an mfence followed by a zero-extending 16-bit load (movzwl). The i686 run
; additionally loads the pointer argument from the stack first.
; Returns the value read from %p.
define i16 @or16(i16* %p) {
; X64-LABEL: or16:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    retq
;
; X32-LABEL: or16:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    mfence
; X32-NEXT:    movzwl (%eax), %eax
; X32-NEXT:    retl
  %1 = atomicrmw or i16* %p, i16 0 acquire
  ret i16 %1
}

; xor32 - `atomicrmw xor` of the constant 0 on an i32 location, release ordering.
; XOR with 0 leaves the value in memory unchanged, so the assertions below
; expect an mfence followed by a plain 32-bit load. The i686 run additionally
; loads the pointer argument from the stack first.
; Returns the value read from %p.
define i32 @xor32(i32* %p) {
; X64-LABEL: xor32:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    retq
;
; X32-LABEL: xor32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    mfence
; X32-NEXT:    movl (%eax), %eax
; X32-NEXT:    retl
  %1 = atomicrmw xor i32* %p, i32 0 release
  ret i32 %1
}

; sub64 - `atomicrmw sub` of the constant 0 on an i64 location, seq_cst ordering.
; Subtracting 0 leaves the value in memory unchanged.
;  * The x86-64 run expects an mfence followed by a plain 64-bit load.
;  * The i686 run does NOT get the fence+load form: the assertions expect a
;    `lock cmpxchg8b` retry loop. Inside the loop the current value in edx:eax
;    is copied to ecx:ebx, so a successful exchange stores back the value that
;    was already there. ebx and esi are saved and restored around the loop.
; Returns the value read from %p.
define i64 @sub64(i64* %p) {
; X64-LABEL: sub64:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    retq
;
; X32-LABEL: sub64:
; X32:       # %bb.0:
; X32-NEXT:    pushl %ebx
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    pushl %esi
; X32-NEXT:    .cfi_def_cfa_offset 12
; X32-NEXT:    .cfi_offset %esi, -12
; X32-NEXT:    .cfi_offset %ebx, -8
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl (%esi), %eax
; X32-NEXT:    movl 4(%esi), %edx
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB3_1: # %atomicrmw.start
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movl %edx, %ecx
; X32-NEXT:    movl %eax, %ebx
; X32-NEXT:    lock cmpxchg8b (%esi)
; X32-NEXT:    jne .LBB3_1
; X32-NEXT:  # %bb.2: # %atomicrmw.end
; X32-NEXT:    popl %esi
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    popl %ebx
; X32-NEXT:    .cfi_def_cfa_offset 4
; X32-NEXT:    retl
  %1 = atomicrmw sub i64* %p, i64 0 seq_cst
  ret i64 %1
}

; or128 - `atomicrmw or` of the constant 0 on an i128 location, monotonic
; ordering. OR with 0 leaves the value in memory unchanged, but neither run
; gets the fence+load form here: both sets of assertions expect a call to
; __sync_fetch_and_or_16 with an all-zero operand.
;  * x86-64 run: the two 64-bit halves of the zero operand are materialised
;    with xorl into esi and edx before the call.
;  * i686 run: the zero operand is passed as four `pushl $0`, the call writes
;    its result into a stack temporary, and that result is then copied word by
;    word to the address read from 8(%ebp), which is also returned in eax
;    (the function returns with `retl $4`).
define i128 @or128(i128* %p) {
; X64-LABEL: or128:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    callq __sync_fetch_and_or_16
; X64-NEXT:    popq %rcx
; X64-NEXT:    .cfi_def_cfa_offset 8
; X64-NEXT:    retq
;
; X32-LABEL: or128:
; X32:       # %bb.0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %ebp, -8
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    .cfi_def_cfa_register %ebp
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $16, %esp
; X32-NEXT:    .cfi_offset %esi, -16
; X32-NEXT:    .cfi_offset %edi, -12
; X32-NEXT:    movl 8(%ebp), %esi
; X32-NEXT:    movl %esp, %eax
; X32-NEXT:    pushl $0
; X32-NEXT:    pushl $0
; X32-NEXT:    pushl $0
; X32-NEXT:    pushl $0
; X32-NEXT:    pushl 12(%ebp)
; X32-NEXT:    pushl %eax
; X32-NEXT:    calll __sync_fetch_and_or_16
; X32-NEXT:    addl $20, %esp
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X32-NEXT:    movl %edi, 12(%esi)
; X32-NEXT:    movl %edx, 8(%esi)
; X32-NEXT:    movl %ecx, 4(%esi)
; X32-NEXT:    movl %eax, (%esi)
; X32-NEXT:    movl %esi, %eax
; X32-NEXT:    leal -8(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebp
; X32-NEXT:    .cfi_def_cfa %esp, 4
; X32-NEXT:    retl $4
  %1 = atomicrmw or i128* %p, i128 0 monotonic
  ret i128 %1
}

; and32 - `atomicrmw and` on an i32 location, acq_rel ordering.
; For 'and', the idempotent operand is -1 (all bits set) rather than 0: ANDing
; with it leaves the value in memory unchanged. The assertions below therefore
; expect an mfence followed by a plain 32-bit load; the i686 run additionally
; loads the pointer argument from the stack first.
; Returns the value read from %p.
define i32 @and32 (i32* %p) {
; X64-LABEL: and32:
; X64:       # %bb.0:
; X64-NEXT:    mfence
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    retq
;
; X32-LABEL: and32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    mfence
; X32-NEXT:    movl (%eax), %eax
; X32-NEXT:    retl
  %1 = atomicrmw and i32* %p, i32 -1 acq_rel
  ret i32 %1
}
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
X86: Do not use llc -march in tests. `llc -march` is problematic because it only switches the target architecture, but leaves the operating system unchanged. This occasionally leads to indeterministic tests because the OS from LLVM_DEFAULT_TARGET_TRIPLE is used. However we can simply always use `llc -mtriple` instead. This changes all the tests to do this to avoid people using -march when they copy and paste parts of tests. See also the discussion in https://reviews.llvm.org/D35287 llvm-svn: 309774 2017-08-02 08:28:10 +08:00			`; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=X64`
			`; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=X32`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00
			`; On x86, an atomic rmw operation that does not modify the value in memory`
			`; (such as atomic add 0) can be replaced by an mfence followed by a mov.`
			`; This is explained (with the motivation for such an optimization) in`
			`; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf`

			`define i8 @add8(i8* %p) {`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-LABEL: add8:`
			`; X64: # %bb.0:`
			`; X64-NEXT: mfence`
			`; X64-NEXT: movb (%rdi), %al`
			`; X64-NEXT: retq`
			`;`
			`; X32-LABEL: add8:`
			`; X32: # %bb.0:`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X32-NEXT: mfence`
			`; X32-NEXT: movb (%eax), %al`
			`; X32-NEXT: retl`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00			`%1 = atomicrmw add i8* %p, i8 0 monotonic`
			`ret i8 %1`
			`}`

			`define i16 @or16(i16* %p) {`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-LABEL: or16:`
			`; X64: # %bb.0:`
			`; X64-NEXT: mfence`
[X86] Remove RELEASE_ and ACQUIRE_ pseudo instructions. Use isel patterns and the normal instructions instead At one point in time acquire implied mayLoad and mayStore as did release. Thus we needed separate pseudos that also carried that property. This appears to no longer be the case. I believe it was changed in 2012 with a comment saying that atomic memory accesses are marked volatile which preserves the ordering. So from what I can tell we shouldn't need additional pseudos since they aren't carry any flags that are different from the normal instructions. The only thing I can think of is that we may consider them for load folding candidates in the peephole pass now where we didn't before. If that's important hopefully there's something in the memory operand we can check to prevent the folding without relying on pseudo instructions. Differential Revision: https://reviews.llvm.org/D50212 llvm-svn: 338925 2018-08-04 05:40:44 +08:00			`; X64-NEXT: movzwl (%rdi), %eax`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-NEXT: retq`
			`;`
			`; X32-LABEL: or16:`
			`; X32: # %bb.0:`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X32-NEXT: mfence`
[X86] Remove RELEASE_ and ACQUIRE_ pseudo instructions. Use isel patterns and the normal instructions instead At one point in time acquire implied mayLoad and mayStore as did release. Thus we needed separate pseudos that also carried that property. This appears to no longer be the case. I believe it was changed in 2012 with a comment saying that atomic memory accesses are marked volatile which preserves the ordering. So from what I can tell we shouldn't need additional pseudos since they aren't carry any flags that are different from the normal instructions. The only thing I can think of is that we may consider them for load folding candidates in the peephole pass now where we didn't before. If that's important hopefully there's something in the memory operand we can check to prevent the folding without relying on pseudo instructions. Differential Revision: https://reviews.llvm.org/D50212 llvm-svn: 338925 2018-08-04 05:40:44 +08:00			`; X32-NEXT: movzwl (%eax), %eax`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X32-NEXT: retl`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00			`%1 = atomicrmw or i16* %p, i16 0 acquire`
			`ret i16 %1`
			`}`

			`define i32 @xor32(i32* %p) {`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-LABEL: xor32:`
			`; X64: # %bb.0:`
			`; X64-NEXT: mfence`
			`; X64-NEXT: movl (%rdi), %eax`
			`; X64-NEXT: retq`
			`;`
			`; X32-LABEL: xor32:`
			`; X32: # %bb.0:`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X32-NEXT: mfence`
			`; X32-NEXT: movl (%eax), %eax`
			`; X32-NEXT: retl`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00			`%1 = atomicrmw xor i32* %p, i32 0 release`
			`ret i32 %1`
			`}`

			`define i64 @sub64(i64* %p) {`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-LABEL: sub64:`
			`; X64: # %bb.0:`
			`; X64-NEXT: mfence`
			`; X64-NEXT: movq (%rdi), %rax`
			`; X64-NEXT: retq`
			`;`
			`; X32-LABEL: sub64:`
			`; X32: # %bb.0:`
			`; X32-NEXT: pushl %ebx`
			`; X32-NEXT: .cfi_def_cfa_offset 8`
			`; X32-NEXT: pushl %esi`
			`; X32-NEXT: .cfi_def_cfa_offset 12`
			`; X32-NEXT: .cfi_offset %esi, -12`
			`; X32-NEXT: .cfi_offset %ebx, -8`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %esi`
			`; X32-NEXT: movl (%esi), %eax`
			`; X32-NEXT: movl 4(%esi), %edx`
			`; X32-NEXT: .p2align 4, 0x90`
			`; X32-NEXT: .LBB3_1: # %atomicrmw.start`
			`; X32-NEXT: # =>This Inner Loop Header: Depth=1`
			`; X32-NEXT: movl %edx, %ecx`
			`; X32-NEXT: movl %eax, %ebx`
			`; X32-NEXT: lock cmpxchg8b (%esi)`
			`; X32-NEXT: jne .LBB3_1`
			`; X32-NEXT: # %bb.2: # %atomicrmw.end`
			`; X32-NEXT: popl %esi`
			`; X32-NEXT: .cfi_def_cfa_offset 8`
			`; X32-NEXT: popl %ebx`
			`; X32-NEXT: .cfi_def_cfa_offset 4`
			`; X32-NEXT: retl`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00			`%1 = atomicrmw sub i64* %p, i64 0 seq_cst`
			`ret i64 %1`
			`}`

			`define i128 @or128(i128* %p) {`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-LABEL: or128:`
			`; X64: # %bb.0:`
			`; X64-NEXT: pushq %rax`
			`; X64-NEXT: .cfi_def_cfa_offset 16`
			`; X64-NEXT: xorl %esi, %esi`
			`; X64-NEXT: xorl %edx, %edx`
			`; X64-NEXT: callq __sync_fetch_and_or_16`
			`; X64-NEXT: popq %rcx`
			`; X64-NEXT: .cfi_def_cfa_offset 8`
			`; X64-NEXT: retq`
			`;`
			`; X32-LABEL: or128:`
			`; X32: # %bb.0:`
			`; X32-NEXT: pushl %ebp`
			`; X32-NEXT: .cfi_def_cfa_offset 8`
			`; X32-NEXT: .cfi_offset %ebp, -8`
			`; X32-NEXT: movl %esp, %ebp`
			`; X32-NEXT: .cfi_def_cfa_register %ebp`
			`; X32-NEXT: pushl %edi`
			`; X32-NEXT: pushl %esi`
			`; X32-NEXT: andl $-8, %esp`
			`; X32-NEXT: subl $16, %esp`
			`; X32-NEXT: .cfi_offset %esi, -16`
			`; X32-NEXT: .cfi_offset %edi, -12`
			`; X32-NEXT: movl 8(%ebp), %esi`
			`; X32-NEXT: movl %esp, %eax`
			`; X32-NEXT: pushl $0`
			`; X32-NEXT: pushl $0`
			`; X32-NEXT: pushl $0`
			`; X32-NEXT: pushl $0`
			`; X32-NEXT: pushl 12(%ebp)`
			`; X32-NEXT: pushl %eax`
			`; X32-NEXT: calll __sync_fetch_and_or_16`
			`; X32-NEXT: addl $20, %esp`
			`; X32-NEXT: movl (%esp), %eax`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %edx`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %edi`
			`; X32-NEXT: movl %edi, 12(%esi)`
			`; X32-NEXT: movl %edx, 8(%esi)`
			`; X32-NEXT: movl %ecx, 4(%esi)`
			`; X32-NEXT: movl %eax, (%esi)`
			`; X32-NEXT: movl %esi, %eax`
			`; X32-NEXT: leal -8(%ebp), %esp`
			`; X32-NEXT: popl %esi`
			`; X32-NEXT: popl %edi`
			`; X32-NEXT: popl %ebp`
			`; X32-NEXT: .cfi_def_cfa %esp, 4`
			`; X32-NEXT: retl $4`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00			`%1 = atomicrmw or i128* %p, i128 0 monotonic`
			`ret i128 %1`
			`}`

			`; For 'and', the idempotent value is (-1)`
			`define i32 @and32 (i32* %p) {`
[X86] Autogenerate complete checks. NFC llvm-svn: 338799 2018-08-03 09:20:32 +08:00			`; X64-LABEL: and32:`
			`; X64: # %bb.0:`
			`; X64-NEXT: mfence`
			`; X64-NEXT: movl (%rdi), %eax`
			`; X64-NEXT: retq`
			`;`
			`; X32-LABEL: and32:`
			`; X32: # %bb.0:`
			`; X32-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X32-NEXT: mfence`
			`; X32-NEXT: movl (%eax), %eax`
			`; X32-NEXT: retl`
Lower idempotent RMWs to fence+load Summary: I originally tried doing this specifically for X86 in the backend in D5091, but it was rather brittle and generally running too late to be general. Furthermore, other targets may want to implement similar optimizations. So I reimplemented it at the IR-level, fitting it into AtomicExpandPass as it interacts with that pass (which could not be cleanly done before at the backend level). This optimization relies on a new target hook, which is only used by X86 for now, as the correctness of the optimization on other targets remains an open question. If it is found correct on other targets, it should be trivial to enable for them. Details of the optimization are discussed in D5091. Test Plan: make check-all + a new test Reviewers: jfb Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D5422 llvm-svn: 218455 2014-09-26 01:27:43 +08:00			`%1 = atomicrmw and i32* %p, i32 -1 acq_rel`
			`ret i32 %1`
			`}`