llvm-project/llvm/test/CodeGen/X86/vec_set.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64

; Builds an <8 x i16> vector from the eight scalar arguments %a0..%a7
; (lane i receives %a<i>) and stores the result through %b.
;
; The autogenerated assertions inside the function pin the expected lowering
; for both llc invocations at the top of this file (SSE2 on, SSE4.1 off):
;   - each scalar is placed in an XMM register with movd,
;   - pairs of scalars are interleaved with punpcklwd,
;   - pairs of those results are merged with punpckldq,
;   - the two halves are joined with punpcklqdq,
;   - the full vector is written with a single movdqa.
; In the 32-bit expectations all eight scalars are loaded from memory; in the
; 64-bit expectations five come from general-purpose registers and three from
; memory.
;
; NOTE(review): the assertion lines are produced by the update script named at
; the top of the file - regenerate them rather than editing them by hand.
define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; X86-LABEL: test:
; X86:       # BB#0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X86-NEXT:    movdqa %xmm3, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test:
; X64:       # BB#0:
; X64-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    movd %r9d, %xmm0
; X64-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movd %r8d, %xmm1
; X64-NEXT:    movd %ecx, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X64-NEXT:    movd %edx, %xmm1
; X64-NEXT:    movd %esi, %xmm3
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; X64-NEXT:    movdqa %xmm3, (%rdi)
; X64-NEXT:    retq
  ; Start from an all-zero vector and overwrite lanes 0 through 7 in order, so
  ; every lane of the final value comes from one of the arguments.
  %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0
  %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1
  %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2
  %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3
  %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4
  %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5
  %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6
  %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7
  ; The store carries no explicit alignment; the assertions above expect the
  ; aligned movdqa form for it.
  store <8 x i16> %tmp14, <8 x i16>* %b
  ret void
}
[X86][SSE] Regenerated the vec_set tests. Replaced lots of dodgy greps with actual codegen llvm-svn: 265163 2016-04-02 01:40:25 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 \| FileCheck %s --check-prefix=X86`
			`; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 \| FileCheck %s --check-prefix=X64`
Add a BUILD_VECTOR with unpack and interleave testcase. llvm-svn: 27121 2006-03-25 17:48:14 +08:00
Add nounwind. llvm-svn: 50837 2008-05-08 06:59:08 +08:00			`define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X86-LABEL: test:`
			`; X86: # BB#0:`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero`
			`; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]`
			`; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero`
			`; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero`
			`; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]`
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> Step 2: unpcklps X, Y ==> <3, 2, 1, 0> The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc. Instead, this patch unpacks progressively larger sequential vector elements together: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 2> Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree. Differential Revision: https://reviews.llvm.org/D33864 llvm-svn: 304688 2017-06-05 04:12:04 +08:00			`; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero`
			`; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]`
			`; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero`
			`; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero`
			`; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]`
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> Step 2: unpcklps X, Y ==> <3, 2, 1, 0> The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc. Instead, this patch unpacks progressively larger sequential vector elements together: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 2> Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree. Differential Revision: https://reviews.llvm.org/D33864 llvm-svn: 304688 2017-06-05 04:12:04 +08:00			`; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]`
			`; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X86-NEXT: movdqa %xmm3, (%eax)`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: test:`
			`; X64: # BB#0:`
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> Step 2: unpcklps X, Y ==> <3, 2, 1, 0> The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc. Instead, this patch unpacks progressively larger sequential vector elements together: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 2> Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree. Differential Revision: https://reviews.llvm.org/D33864 llvm-svn: 304688 2017-06-05 04:12:04 +08:00			`; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero`
			`; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]`
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> Step 2: unpcklps X, Y ==> <3, 2, 1, 0> The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc. Instead, this patch unpacks progressively larger sequential vector elements together: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 2> Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree. Differential Revision: https://reviews.llvm.org/D33864 llvm-svn: 304688 2017-06-05 04:12:04 +08:00			`; X64-NEXT: movd %r9d, %xmm0`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero`
			`; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]`
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> Step 2: unpcklps X, Y ==> <3, 2, 1, 0> The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc. Instead, this patch unpacks progressively larger sequential vector elements together: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 2> Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree. Differential Revision: https://reviews.llvm.org/D33864 llvm-svn: 304688 2017-06-05 04:12:04 +08:00			`; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]`
			`; X64-NEXT: movd %r8d, %xmm1`
			`; X64-NEXT: movd %ecx, %xmm2`
			`; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]`
			`; X64-NEXT: movd %edx, %xmm1`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X64-NEXT: movd %esi, %xmm3`
			`; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]`
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> Step 2: unpcklps X, Y ==> <3, 2, 1, 0> The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc. Instead, this patch unpacks progressively larger sequential vector elements together: e.g. for v4f32: Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0> : unpcklps 1, 3 ==> Y: <?, ?, 3, 2> Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree. Differential Revision: https://reviews.llvm.org/D33864 llvm-svn: 304688 2017-06-05 04:12:04 +08:00			`; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]`
			`; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]`
[X86][SSE] Check vec_set BUILD_VECTOR tests on both 32 and 64-bit targets llvm-svn: 302683 2017-05-10 23:52:59 +08:00			`; X64-NEXT: movdqa %xmm3, (%rdi)`
			`; X64-NEXT: retq`
[X86][SSE] Regenerated the vec_set tests. Replaced lots of dodgy greps with actual codegen llvm-svn: 265163 2016-04-02 01:40:25 +08:00			`%tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0`
			`%tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1`
			`%tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2`
			`%tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3`
			`%tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4`
			`%tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5`
			`%tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6`
			`%tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7`
			`store <8 x i16> %tmp14, <8 x i16>* %b`
			`ret void`
Add a BUILD_VECTOR with unpack and interleave testcase. llvm-svn: 27121 2006-03-25 17:48:14 +08:00			`}`
Remove llvm-upgrade and update tests. llvm-svn: 47432 2008-02-21 15:42:26 +08:00