; Source path: llvm-project/llvm/test/CodeGen/X86/dag-merge-fast-accesses.ll

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-slow-unaligned-mem-16 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+slow-unaligned-mem-16 | FileCheck %s --check-prefix=SLOW

; Verify that the DAGCombiner is creating unaligned 16-byte loads and stores
; if and only if those are fast.

; Writes the constant 0 to two adjacent i64 slots of %ptr (elements 0 and 1),
; each store carrying only 8-byte alignment. The expectations below want a
; single 16-byte movups of a zeroed xmm register when slow-unaligned-mem-16 is
; disabled, and two independent 8-byte movq stores when it is enabled.
define void @merge_const_vec_store(i64* %ptr) {
; FAST-LABEL: merge_const_vec_store:
; FAST:       # BB#0:
; FAST-NEXT:    xorps %xmm0, %xmm0
; FAST-NEXT:    movups %xmm0, (%rdi)
; FAST-NEXT:    retq
;
; SLOW-LABEL: merge_const_vec_store:
; SLOW:       # BB#0:
; SLOW-NEXT:    movq $0, (%rdi)
; SLOW-NEXT:    movq $0, 8(%rdi)
; SLOW-NEXT:    retq

  ; Addresses of the low and high halves of the 16-byte region.
  %lo.addr = getelementptr i64, i64* %ptr, i64 0
  %hi.addr = getelementptr i64, i64* %ptr, i64 1

  ; Two consecutive zero stores: the merge candidates.
  store i64 0, i64* %lo.addr, align 8
  store i64 0, i64* %hi.addr, align 8
  ret void
}


; Extracts lanes 0 and 1 of the <4 x double> argument and stores them to two
; adjacent double slots of %ptr, each store carrying only 8-byte alignment.
; The expectations below want one 16-byte movups of xmm0 when
; slow-unaligned-mem-16 is disabled, and a movlpd/movhpd pair when it is
; enabled.
define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {
; FAST-LABEL: merge_vec_element_store:
; FAST:       # BB#0:
; FAST-NEXT:    movups %xmm0, (%rdi)
; FAST-NEXT:    retq
;
; SLOW-LABEL: merge_vec_element_store:
; SLOW:       # BB#0:
; SLOW-NEXT:    movlpd %xmm0, (%rdi)
; SLOW-NEXT:    movhpd %xmm0, 8(%rdi)
; SLOW-NEXT:    retq

  ; The two lowest lanes of the input vector.
  %lane0 = extractelement <4 x double> %v, i32 0
  %lane1 = extractelement <4 x double> %v, i32 1

  ; Destination slots: elements 0 and 1 of %ptr.
  %dst0 = getelementptr double, double* %ptr, i64 0
  %dst1 = getelementptr double, double* %ptr, i64 1

  ; Two consecutive element stores: the merge candidates.
  store double %lane0, double* %dst0, align 8
  store double %lane1, double* %dst1, align 8
  ret void
}


;; TODO: FAST *should* be:
;;    movups (%rdi), %xmm0
;;    movups %xmm0, 40(%rdi)
;; ..but is not currently. See the UseAA FIXME in DAGCombiner.cpp
;; visitSTORE.

; Copies two adjacent i64 values from elements 0 and 1 of %ptr to elements 5
; and 6 of the same pointer; every load and store carries only 4-byte
; alignment. Both configurations are currently expected to emit the same
; scalar sequence: two movq loads followed by two movq stores.
define void @merge_vec_load_and_stores(i64 *%ptr) {
; FAST-LABEL: merge_vec_load_and_stores:
; FAST:       # BB#0:
; FAST-NEXT:    movq (%rdi), %rax
; FAST-NEXT:    movq 8(%rdi), %rcx
; FAST-NEXT:    movq %rax, 40(%rdi)
; FAST-NEXT:    movq %rcx, 48(%rdi)
; FAST-NEXT:    retq
;
; SLOW-LABEL: merge_vec_load_and_stores:
; SLOW:       # BB#0:
; SLOW-NEXT:    movq (%rdi), %rax
; SLOW-NEXT:    movq 8(%rdi), %rcx
; SLOW-NEXT:    movq %rax, 40(%rdi)
; SLOW-NEXT:    movq %rcx, 48(%rdi)
; SLOW-NEXT:    retq

  ; Source slots: elements 0 and 1 (byte offsets 0 and 8).
  %src0 = getelementptr i64, i64* %ptr, i64 0
  %src1 = getelementptr i64, i64* %ptr, i64 1

  %val0 = load i64, i64* %src0, align 4
  %val1 = load i64, i64* %src1, align 4

  ; Destination slots: elements 5 and 6 (byte offsets 40 and 48).
  %dst0 = getelementptr i64, i64* %ptr, i64 5
  %dst1 = getelementptr i64, i64* %ptr, i64 6

  store i64 %val0, i64* %dst0, align 4
  store i64 %val1, i64* %dst1, align 4
  ret void
}
Check for fastness before merging in DAGCombiner::MergeConsecutiveStores(). Use and check the 'IsFast' optional parameter to TLI.allowsMemoryAccess() any time we have a merged access candidate. Without this patch, we were generating unaligned 16-byte (SSE) memops for x86 targets where those accesses are slow. This change was mentioned in http://reviews.llvm.org/D10662 and http://reviews.llvm.org/D10905 and will help solve PR21711. Differential Revision: http://reviews.llvm.org/D12573. llvm-svn: 246771 (2015-09-03 23:03:19 +08:00)			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-slow-unaligned-mem-16 | FileCheck %s --check-prefix=FAST`
			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+slow-unaligned-mem-16 | FileCheck %s --check-prefix=SLOW`

			`; Verify that the DAGCombiner is creating unaligned 16-byte loads and stores`
			`; if and only if those are fast.`

			`define void @merge_const_vec_store(i64* %ptr) {`
			`; FAST-LABEL: merge_const_vec_store:`
			`; FAST: # BB#0:`
			`; FAST-NEXT: xorps %xmm0, %xmm0`
			`; FAST-NEXT: movups %xmm0, (%rdi)`
			`; FAST-NEXT: retq`
			`;`
			`; SLOW-LABEL: merge_const_vec_store:`
			`; SLOW: # BB#0:`
			`; SLOW-NEXT: movq $0, (%rdi)`
			`; SLOW-NEXT: movq $0, 8(%rdi)`
			`; SLOW-NEXT: retq`

			`%idx0 = getelementptr i64, i64* %ptr, i64 0`
			`%idx1 = getelementptr i64, i64* %ptr, i64 1`

			`store i64 0, i64* %idx0, align 8`
			`store i64 0, i64* %idx1, align 8`
			`ret void`
			`}`


			`define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {`
			`; FAST-LABEL: merge_vec_element_store:`
			`; FAST: # BB#0:`
			`; FAST-NEXT: movups %xmm0, (%rdi)`
			`; FAST-NEXT: retq`
			`;`
			`; SLOW-LABEL: merge_vec_element_store:`
			`; SLOW: # BB#0:`
			`; SLOW-NEXT: movlpd %xmm0, (%rdi)`
			`; SLOW-NEXT: movhpd %xmm0, 8(%rdi)`
			`; SLOW-NEXT: retq`

			`%vecext0 = extractelement <4 x double> %v, i32 0`
			`%vecext1 = extractelement <4 x double> %v, i32 1`

			`%idx0 = getelementptr double, double* %ptr, i64 0`
			`%idx1 = getelementptr double, double* %ptr, i64 1`

			`store double %vecext0, double* %idx0, align 8`
			`store double %vecext1, double* %idx1, align 8`
			`ret void`
			`}`


Fix two issues in MergeConsecutiveStores: 1) PR25154. This is basically a repeat of PR18102, which was fixed in r200201 and broken again by r234430. The latter changed which of the store nodes was merged into, from the first to the last. Thus, we now also need to prefer merging a later store at a given address into the target node, instead of an earlier one. 2) While investigating that, I also realized I'd introduced a bug in r236850. There, I removed a check for alignment, not realizing that nothing except the alignment check was ensuring that none of the stores were overlapping! This is a really bogus way to ensure there are no aliased stores. A better solution to both of these issues is likely to always use the code added in the 'if (UseAA)' branches, which rearrange the chain based on a more principled analysis. I'll look into whether that can be used always, but in the interest of getting things back to working, I think a minimal change makes sense. llvm-svn: 251816 (2015-11-03 02:48:08 +08:00)			`;; TODO: FAST *should* be:`
			`;; movups (%rdi), %xmm0`
			`;; movups %xmm0, 40(%rdi)`
			`;; ..but is not currently. See the UseAA FIXME in DAGCombiner.cpp`
			`;; visitSTORE.`

Check for fastness before merging in DAGCombiner::MergeConsecutiveStores(). Use and check the 'IsFast' optional parameter to TLI.allowsMemoryAccess() any time we have a merged access candidate. Without this patch, we were generating unaligned 16-byte (SSE) memops for x86 targets where those accesses are slow. This change was mentioned in http://reviews.llvm.org/D10662 and http://reviews.llvm.org/D10905 and will help solve PR21711. Differential Revision: http://reviews.llvm.org/D12573. llvm-svn: 246771 (2015-09-03 23:03:19 +08:00)			`define void @merge_vec_load_and_stores(i64 *%ptr) {`
			`; FAST-LABEL: merge_vec_load_and_stores:`
			`; FAST: # BB#0:`
Fix two issues in MergeConsecutiveStores: 1) PR25154. This is basically a repeat of PR18102, which was fixed in r200201 and broken again by r234430. The latter changed which of the store nodes was merged into, from the first to the last. Thus, we now also need to prefer merging a later store at a given address into the target node, instead of an earlier one. 2) While investigating that, I also realized I'd introduced a bug in r236850. There, I removed a check for alignment, not realizing that nothing except the alignment check was ensuring that none of the stores were overlapping! This is a really bogus way to ensure there are no aliased stores. A better solution to both of these issues is likely to always use the code added in the 'if (UseAA)' branches, which rearrange the chain based on a more principled analysis. I'll look into whether that can be used always, but in the interest of getting things back to working, I think a minimal change makes sense. llvm-svn: 251816 (2015-11-03 02:48:08 +08:00)			`; FAST-NEXT: movq (%rdi), %rax`
			`; FAST-NEXT: movq 8(%rdi), %rcx`
			`; FAST-NEXT: movq %rax, 40(%rdi)`
			`; FAST-NEXT: movq %rcx, 48(%rdi)`
Check for fastness before merging in DAGCombiner::MergeConsecutiveStores(). Use and check the 'IsFast' optional parameter to TLI.allowsMemoryAccess() any time we have a merged access candidate. Without this patch, we were generating unaligned 16-byte (SSE) memops for x86 targets where those accesses are slow. This change was mentioned in http://reviews.llvm.org/D10662 and http://reviews.llvm.org/D10905 and will help solve PR21711. Differential Revision: http://reviews.llvm.org/D12573. llvm-svn: 246771 (2015-09-03 23:03:19 +08:00)			`; FAST-NEXT: retq`
			`;`
			`; SLOW-LABEL: merge_vec_load_and_stores:`
			`; SLOW: # BB#0:`
			`; SLOW-NEXT: movq (%rdi), %rax`
			`; SLOW-NEXT: movq 8(%rdi), %rcx`
			`; SLOW-NEXT: movq %rax, 40(%rdi)`
			`; SLOW-NEXT: movq %rcx, 48(%rdi)`
			`; SLOW-NEXT: retq`

			`%idx0 = getelementptr i64, i64* %ptr, i64 0`
			`%idx1 = getelementptr i64, i64* %ptr, i64 1`

			`%ld0 = load i64, i64* %idx0, align 4`
			`%ld1 = load i64, i64* %idx1, align 4`

			`%idx4 = getelementptr i64, i64* %ptr, i64 5`
			`%idx5 = getelementptr i64, i64* %ptr, i64 6`

			`store i64 %ld0, i64* %idx4, align 4`
			`store i64 %ld1, i64* %idx5, align 4`
			`ret void`
			`}`