; Optimize merging of scalar loads for 32-byte vectors [X86, AVX]
;
; Fix the poor codegen seen in PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 ).
; Before we crack 32-byte build vectors into smaller chunks (and then subsequently
; glue them back together), we should look for the easy case where we can just load
; all elements in a single op.
;
; An example of the codegen change is:
; From:
;   vmovss 16(%rdi), %xmm1
;   vmovups (%rdi), %xmm0
;   vinsertps $16, 20(%rdi), %xmm1, %xmm1
;   vinsertps $32, 24(%rdi), %xmm1, %xmm1
;   vinsertps $48, 28(%rdi), %xmm1, %xmm1
;   vinsertf128 $1, %xmm1, %ymm0, %ymm0
;   retq
; To:
;   vmovups (%rdi), %ymm0
;   retq
;
; Differential Revision: http://reviews.llvm.org/D6536
; llvm-svn: 223518
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32
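; Rough sketch (illustrative only; the function name below is made up and nothing
; in it is checked by the RUN lines above): "loading all elements in a single op"
; means that a chain of scalar loads feeding insertelements, as in the tests that
; follow, is treated like one wide vector load of the same bytes.
define <8 x float> @single_wide_load_sketch(float* %ptr) {
  %cast = bitcast float* %ptr to <8 x float>*
  %v = load <8 x float>* %cast, align 4
  ret <8 x float> %v
}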
define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
  %tmp1 = load float* %p
  %vecins = insertelement <4 x float> undef, float %tmp1, i32 0
  %add.ptr = getelementptr float* %p, i32 1
  %tmp5 = load float* %add.ptr
  %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
  ret <4 x float> %vecins7

; ALL-LABEL: merge_2_floats
; ALL: vmovq
; ALL-NEXT: retq
}

; Test-case generated due to a crash when trying to treat loading the first
; two i64s of a <4 x i64> as a load of two i32s.
define <4 x i64> @merge_2_floats_into_4() {
  %1 = load i64** undef, align 8
  %2 = getelementptr inbounds i64* %1, i64 0
  %3 = load i64* %2
  %4 = insertelement <4 x i64> undef, i64 %3, i32 0
  %5 = load i64** undef, align 8
  %6 = getelementptr inbounds i64* %5, i64 1
  %7 = load i64* %6
  %8 = insertelement <4 x i64> %4, i64 %7, i32 1
  %9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %9

; ALL-LABEL: merge_2_floats_into_4
; ALL: vmovups
; ALL-NEXT: retq
}
define <4 x float> @merge_4_floats(float* %ptr) {
  %a = load float* %ptr, align 8
  %vec = insertelement <4 x float> undef, float %a, i32 0
  %idx1 = getelementptr inbounds float* %ptr, i64 1
  %b = load float* %idx1, align 8
  %vec2 = insertelement <4 x float> %vec, float %b, i32 1
  %idx3 = getelementptr inbounds float* %ptr, i64 2
  %c = load float* %idx3, align 8
  %vec4 = insertelement <4 x float> %vec2, float %c, i32 2
  %idx5 = getelementptr inbounds float* %ptr, i64 3
  %d = load float* %idx5, align 8
  %vec6 = insertelement <4 x float> %vec4, float %d, i32 3
  ret <4 x float> %vec6

; ALL-LABEL: merge_4_floats
; ALL: vmovups
; ALL-NEXT: retq
}
; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
; Make sure that 32-byte vectors are handled efficiently.
; If the target has slow 32-byte accesses, we should still generate
; 16-byte loads.
define <8 x float> @merge_8_floats(float* %ptr) {
  %a = load float* %ptr, align 4
  %vec = insertelement <8 x float> undef, float %a, i32 0
  %idx1 = getelementptr inbounds float* %ptr, i64 1
  %b = load float* %idx1, align 4
  %vec2 = insertelement <8 x float> %vec, float %b, i32 1
  %idx3 = getelementptr inbounds float* %ptr, i64 2
  %c = load float* %idx3, align 4
  %vec4 = insertelement <8 x float> %vec2, float %c, i32 2
  %idx5 = getelementptr inbounds float* %ptr, i64 3
  %d = load float* %idx5, align 4
  %vec6 = insertelement <8 x float> %vec4, float %d, i32 3
  %idx7 = getelementptr inbounds float* %ptr, i64 4
  %e = load float* %idx7, align 4
  %vec8 = insertelement <8 x float> %vec6, float %e, i32 4
  %idx9 = getelementptr inbounds float* %ptr, i64 5
  %f = load float* %idx9, align 4
  %vec10 = insertelement <8 x float> %vec8, float %f, i32 5
  %idx11 = getelementptr inbounds float* %ptr, i64 6
  %g = load float* %idx11, align 4
  %vec12 = insertelement <8 x float> %vec10, float %g, i32 6
  %idx13 = getelementptr inbounds float* %ptr, i64 7
  %h = load float* %idx13, align 4
  %vec14 = insertelement <8 x float> %vec12, float %h, i32 7
  ret <8 x float> %vec14

; ALL-LABEL: merge_8_floats

; FAST32: vmovups
; FAST32-NEXT: retq

; SLOW32: vmovups
; SLOW32-NEXT: vinsertf128
; SLOW32-NEXT: retq
}
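; Rough sketch (illustrative only; the function name below is made up and nothing
; in it is checked by the RUN lines): on a +slow-unaligned-mem-32 target the
; 32-byte access above is still split into two 16-byte halves that get glued
; back together (vmovups + vinsertf128), which at the IR level is roughly:
define <8 x float> @split_32_byte_load_sketch(float* %ptr) {
  %lo.cast = bitcast float* %ptr to <4 x float>*
  %hi.gep = getelementptr float* %ptr, i64 4
  %hi.cast = bitcast float* %hi.gep to <4 x float>*
  %lo = load <4 x float>* %lo.cast, align 4
  %hi = load <4 x float>* %hi.cast, align 4
  %v = shufflevector <4 x float> %lo, <4 x float> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v
}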
define <4 x double> @merge_4_doubles(double* %ptr) {
  %a = load double* %ptr, align 8
  %vec = insertelement <4 x double> undef, double %a, i32 0
  %idx1 = getelementptr inbounds double* %ptr, i64 1
  %b = load double* %idx1, align 8
  %vec2 = insertelement <4 x double> %vec, double %b, i32 1
  %idx3 = getelementptr inbounds double* %ptr, i64 2
  %c = load double* %idx3, align 8
  %vec4 = insertelement <4 x double> %vec2, double %c, i32 2
  %idx5 = getelementptr inbounds double* %ptr, i64 3
  %d = load double* %idx5, align 8
  %vec6 = insertelement <4 x double> %vec4, double %d, i32 3
  ret <4 x double> %vec6

; ALL-LABEL: merge_4_doubles

; FAST32: vmovups
; FAST32-NEXT: retq

; SLOW32: vmovups
; SLOW32-NEXT: vinsertf128
; SLOW32-NEXT: retq
}
; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
; Recognize and combine consecutive loads even when the
; first of the combined loads is offset from the base address.
; SelectionDAG::isConsecutiveLoad() was not detecting consecutive loads when the
; first load was offset from a base address; the fix subtracts that offset before
; comparing the second load to see if it is consecutive
; ( http://reviews.llvm.org/D6642 ). With the fix, the loads below collapse to a
; single vmovups 32(%rdi), %ymm0 on fast-unaligned-mem-32 targets.
define <4 x double> @merge_4_doubles_offset(double* %ptr) {
  %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
  %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
  %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
  %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
  %e = load double* %arrayidx4, align 8
  %f = load double* %arrayidx5, align 8
  %g = load double* %arrayidx6, align 8
  %h = load double* %arrayidx7, align 8
  %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
  %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
  %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
  %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
  ret <4 x double> %vecinit7

; ALL-LABEL: merge_4_doubles_offset

; FAST32: vmovups
; FAST32-NEXT: retq

; SLOW32: vmovups
; SLOW32-NEXT: vinsertf128
; SLOW32-NEXT: retq
}
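; Rough sketch (illustrative only; the function name below is made up and nothing
; in it is checked by the RUN lines): with the PR21771 fix, the four loads in
; merge_4_doubles_offset are recognized as consecutive even though the first one
; starts 32 bytes past the base pointer, so on a fast-unaligned-mem-32 target they
; behave like the single offset wide load here.
define <4 x double> @offset_wide_load_sketch(double* %ptr) {
  %base = getelementptr inbounds double* %ptr, i64 4
  %cast = bitcast double* %base to <4 x double>*
  %v = load <4 x double>* %cast, align 8
  ret <4 x double> %v
}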