llvm-project/llvm/test/CodeGen/X86/widen_shuffle-1.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64

; Widening of a <3 x float> shuffle followed by an fadd.
;
; The shuffle mask <0, 1, 2> selects every lane of %src1 unchanged, so the
; function stores %src1 + %src2 to %dst.addr. The assertions below expect a
; single packed addps on the xmm registers, after which the three result lanes
; are written out: one lane at a time on the 32-bit target, and as one 8-byte
; movlps plus an extractps of lane 2 on the 64-bit target.
define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
; X86-LABEL: shuf:
; X86:       # BB#0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    addps %xmm1, %xmm0
; X86-NEXT:    extractps $2, %xmm0, 8(%eax)
; X86-NEXT:    extractps $1, %xmm0, 4(%eax)
; X86-NEXT:    movss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shuf:
; X64:       # BB#0: # %entry
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    extractps $2, %xmm0, 8(%rdi)
; X64-NEXT:    movlps %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
	; Mask indices 0-2 all refer to the first operand, so %x equals %src1.
	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>
	%val = fadd <3 x float> %x, %src2
	; Three-element store: only 12 bytes at %dst.addr are written.
	store <3 x float> %val, <3 x float>* %dst.addr
	ret void
}


; Widening of a <3 x float> shuffle with a two-input mask, followed by an fadd.
;
; The mask <0, 4, 2> takes lanes 0 and 2 from %src1 and lane 1 from %src2
; (index 4 is element 1 of the second operand). The assertions below expect
; this to become a single blendps before the packed addps; the stores of the
; three result lanes are the same as in @shuf.
define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
; X86-LABEL: shuf2:
; X86:       # BB#0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-NEXT:    addps %xmm1, %xmm0
; X86-NEXT:    extractps $2, %xmm0, 8(%eax)
; X86-NEXT:    extractps $1, %xmm0, 4(%eax)
; X86-NEXT:    movss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shuf2:
; X64:       # BB#0: # %entry
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    extractps $2, %xmm0, 8(%rdi)
; X64-NEXT:    movlps %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
	; %x = < %src1[0], %src2[1], %src1[2] >
	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>
	%val = fadd <3 x float> %x, %src2
	store <3 x float> %val, <3 x float>* %dst.addr
	ret void
}

; Example of a case where widening a v3float operation causes the DAG to
; replace a node with the very operation that is currently being widened,
; i.e. while replacing opA with opB, the DAG produces new operations that
; use opA.
;
; The IR repeatedly narrows a <4 x float> to <3 x float> and widens it back.
; Following the masks, the value finally stored to %dst is element 0 of
; %vecinit15 replicated into all four lanes, which is what the single
; shufps xmm1[0,0,0,0] in the assertions below corresponds to.
define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
; X86-LABEL: shuf3:
; X86:       # BB#0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT:    movaps %xmm1, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shuf3:
; X64:       # BB#0: # %entry
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT:    movaps %xmm1, (%rdi)
; X64-NEXT:    retq
entry:
  ; < %tmp10[0], %tmp10[1], %vecinit15[0], %vecinit15[1] >
  %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ; Narrow to three lanes, then widen again; mask index 3 picks element 0 of
  ; the zeroinitializer operand, so the fourth lane becomes 0.0.
  %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; The same narrow/widen round trip a second time.
  %tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>>
  %tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp97.i = shufflevector <4 x float> %tmp6.i14, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ; Widen by duplicating lane 2:
  ; < %tmp10[0], %tmp10[1], %vecinit15[0], %vecinit15[0] >
  %tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; NOTE(review): %t5, %shr.i.i19 and %and.i.i20 have no users in this
  ; function; presumably they are left over from the original reduced
  ; reproducer -- confirm before removing.
  %t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32>
  %shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19>
  %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
  ; Lanes 2 and 3 of %tmp2.i18 both hold %vecinit15[0], so this is a splat.
  %shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst
  ret void
}

; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS.
;
; The mask <0..7> places the four elements of %a in lanes 0-3 and the four
; elements of %b in lanes 4-7, i.e. it concatenates the two <4 x i8> inputs
; into one <8 x i8> result. The assertions below expect the same pshufb
; control vector to be applied to both inputs, with punpcklqdq then joining
; the low halves of the two registers.
define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; X86-LABEL: shuf4:
; X86:       # BB#0:
; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X86-NEXT:    pshufb %xmm2, %xmm1
; X86-NEXT:    pshufb %xmm2, %xmm0
; X86-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    retl
;
; X64-LABEL: shuf4:
; X64:       # BB#0:
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-NEXT:    pshufb %xmm2, %xmm1
; X64-NEXT:    pshufb %xmm2, %xmm0
; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
  ; Indices 0-3 select from %a, indices 4-7 select from %b.
  %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %vshuf
}

; PR11389: another CONCAT_VECTORS case.
;
; A constant <2 x i8> is shuffled into an <8 x i8> whose lanes 0 and 1 both
; take element 1 of the constant (the value 33); the remaining six lanes are
; undef. The result is stored to %p with 8-byte alignment. The assertions
; below expect no shuffle instruction at all: just an 8-byte load from memory
; followed by an 8-byte store.
define void @shuf5(<8 x i8>* %p) nounwind {
; X86-LABEL: shuf5:
; X86:       # BB#0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    movsd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shuf5:
; X64:       # BB#0:
; X64-NEXT:    movq {{.*}}(%rip), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  ; %v = < 33, 33, undef, undef, undef, undef, undef, undef >
  %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <8 x i8> %v, <8 x i8>* %p, align 8
  ret void
}
Make utils/update_llc_test_checks.py note that the assertions are autogenerated. Also update existing test cases which appear to be generated by it and weren't modified (other than addition of the header) by rerunning it. llvm-svn: 253917 2015-11-24 05:33:58 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 \| FileCheck %s --check-prefix=X86`
			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 \| FileCheck %s --check-prefix=X64`
[x86] Add two more triples to stabilize the precise assembly syntax across platforms. llvm-svn: 218973 2014-10-03 17:43:23 +08:00
Added some basic test cases for r61209 llvm-svn: 61210 2008-12-19 04:05:58 +08:00			`; widening shuffle v3float and then a add`
			`define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X86-LABEL: shuf:`
			`; X86: # BB#0: # %entry`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: addps %xmm1, %xmm0`
			`; X86-NEXT: extractps $2, %xmm0, 8(%eax)`
			`; X86-NEXT: extractps $1, %xmm0, 4(%eax)`
			`; X86-NEXT: movss %xmm0, (%eax)`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: shuf:`
			`; X64: # BB#0: # %entry`
			`; X64-NEXT: addps %xmm1, %xmm0`
			`; X64-NEXT: extractps $2, %xmm0, 8(%rdi)`
			`; X64-NEXT: movlps %xmm0, (%rdi)`
			`; X64-NEXT: retq`
[x86] Regenerate precise FileCheck lines for the last batch of test cases. llvm-svn: 218954 2014-10-03 09:57:38 +08:00			`entry:`
Added some basic test cases for r61209 llvm-svn: 61210 2008-12-19 04:05:58 +08:00			`%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>`
Delete useless trailing semicolons. llvm-svn: 92740 2010-01-06 01:55:26 +08:00			`%val = fadd <3 x float> %x, %src2`
Added some basic test cases for r61209 llvm-svn: 61210 2008-12-19 04:05:58 +08:00			`store <3 x float> %val, <3 x float>* %dst.addr`
			`ret void`
			`}`
Fixed a bug during widening where we would avoid legalizing a node. When we replace an OpA with a widened OpB, it is possible to get new uses of OpA due to CSE when recursively updating nodes. Since OpA has been processed, the new uses are not examined again. The patch checks if this occurred and if it did, updates the new uses of OpA to use OpB. llvm-svn: 105453 2010-06-04 09:20:10 +08:00

			`; widening shuffle v3float with a different mask and then a add`
			`define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X86-LABEL: shuf2:`
			`; X86: # BB#0: # %entry`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]`
			`; X86-NEXT: addps %xmm1, %xmm0`
			`; X86-NEXT: extractps $2, %xmm0, 8(%eax)`
			`; X86-NEXT: extractps $1, %xmm0, 4(%eax)`
			`; X86-NEXT: movss %xmm0, (%eax)`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: shuf2:`
			`; X64: # BB#0: # %entry`
			`; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]`
			`; X64-NEXT: addps %xmm1, %xmm0`
			`; X64-NEXT: extractps $2, %xmm0, 8(%rdi)`
			`; X64-NEXT: movlps %xmm0, (%rdi)`
			`; X64-NEXT: retq`
[x86] Regenerate precise FileCheck lines for the last batch of test cases. llvm-svn: 218954 2014-10-03 09:57:38 +08:00			`entry:`
Fixed a bug during widening where we would avoid legalizing a node. When we replace an OpA with a widened OpB, it is possible to get new uses of OpA due to CSE when recursively updating nodes. Since OpA has been processed, the new uses are not examined again. The patch checks if this occurred and if it did, updates the new uses of OpA to use OpB. llvm-svn: 105453 2010-06-04 09:20:10 +08:00			`%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>`
			`%val = fadd <3 x float> %x, %src2`
			`store <3 x float> %val, <3 x float>* %dst.addr`
			`ret void`
			`}`

			`; Example of when widening a v3float operation causes the DAG to replace a node`
			`; with the operation that we are currently widening, i.e. when replacing`
			`; opA with opB, the DAG will produce new operations with opA.`
Change handling of illegal vector types to widen when possible instead of expanding: e.g. <2 x float> -> <4 x float> instead of -> 2 floats. This affects two places in the code: handling cross block values and handling function return and arguments. Since vectors are already widened by legalizetypes, this gives us much better code and unblocks x86-64 abi and SPU abi work. For example, this (which is a silly example of a cross-block value): define <4 x float> @test2(<4 x float> %A) nounwind { %B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1> %C = fadd <2 x float> %B, %B br label %BB BB: %D = fadd <2 x float> %C, %C %E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ret <4 x float> %E } Now compiles into: _test2: ## @test2 ## BB#0: addps %xmm0, %xmm0 addps %xmm0, %xmm0 ret previously it compiled into: _test2: ## @test2 ## BB#0: addps %xmm0, %xmm0 pshufd $1, %xmm0, %xmm1 ## kill: XMM0<def> XMM0<kill> XMM0<def> insertps $0, %xmm0, %xmm0 insertps $16, %xmm1, %xmm0 addps %xmm0, %xmm0 ret This implements rdar://8230384 llvm-svn: 112101 2010-08-26 06:49:25 +08:00			`define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X86-LABEL: shuf3:`
			`; X86: # BB#0: # %entry`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]`
			`; X86-NEXT: movaps %xmm1, (%eax)`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: shuf3:`
			`; X64: # BB#0: # %entry`
			`; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]`
			`; X64-NEXT: movaps %xmm1, (%rdi)`
			`; X64-NEXT: retq`
[x86] Regenerate precise FileCheck lines for the last batch of test cases. llvm-svn: 218954 2014-10-03 09:57:38 +08:00			`entry:`
Fixed a bug during widening where we would avoid legalizing a node. When we replace an OpA with a widened OpB, it is possible to get new uses of OpA due to CSE when recursively updating nodes. Since OpA has been processed, the new uses are not examined again. The patch checks if this occurred and if it did, updates the new uses of OpA to use OpB. llvm-svn: 105453 2010-06-04 09:20:10 +08:00			`%shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>`
[x86] Regenerate precise FileCheck lines for the last batch of test cases. llvm-svn: 218954 2014-10-03 09:57:38 +08:00			`%tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>`
Fixed a bug during widening where we would avoid legalizing a node. When we replace an OpA with a widened OpB, it is possible to get new uses of OpA due to CSE when recursively updating nodes. Since OpA has been processed, the new uses are not examined again. The patch checks if this occurred and if it did, updates the new uses of OpA to use OpB. llvm-svn: 105453 2010-06-04 09:20:10 +08:00			`%tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>>`
			`%tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>`
			`%tmp97.i = shufflevector <4 x float> %tmp6.i14, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>`
			`%tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>`
			`%t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32>`
			`%shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19>`
[x86] Regenerate precise FileCheck lines for the last batch of test cases. llvm-svn: 218954 2014-10-03 09:57:38 +08:00			`%and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>`
Fixed a bug during widening where we would avoid legalizing a node. When we replace an OpA with a widened OpB, it is possible to get new uses of OpA due to CSE when recursively updating nodes. Since OpA has been processed, the new uses are not examined again. The patch checks if this occurred and if it did, updates the new uses of OpA to use OpB. llvm-svn: 105453 2010-06-04 09:20:10 +08:00			`%shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>`
			`store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst`
			`ret void`
			`}`

PR10421: Fix a straightforward bug in the widening logic for CONCAT_VECTORS. llvm-svn: 135595 2011-07-21 02:14:33 +08:00			`; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS`
			`define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X86-LABEL: shuf4:`
			`; X86: # BB#0:`
			`; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]`
			`; X86-NEXT: pshufb %xmm2, %xmm1`
			`; X86-NEXT: pshufb %xmm2, %xmm0`
			`; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]`
			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: shuf4:`
			`; X64: # BB#0:`
			`; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]`
			`; X64-NEXT: pshufb %xmm2, %xmm1`
			`; X64-NEXT: pshufb %xmm2, %xmm0`
			`; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]`
			`; X64-NEXT: retq`
PR10421: Fix a straightforward bug in the widening logic for CONCAT_VECTORS. llvm-svn: 135595 2011-07-21 02:14:33 +08:00			`%vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>`
			`ret <8 x i8> %vshuf`
			`}`
CONCAT_VECTORS can have more than two operands. PR11389. llvm-svn: 144768 2011-11-16 10:52:39 +08:00
			`; PR11389: another CONCAT_VECTORS case`
			`define void @shuf5(<8 x i8>* %p) nounwind {`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X86-LABEL: shuf5:`
			`; X86: # BB#0:`
			`; X86-NEXT: movl {{[0-9]+}}(%esp), %eax`
[X86][SSE] Add support for target shuffle constant folding Initial support for target shuffle constant folding in cases where all shuffle inputs are constant. We may be able to relax this and merge shuffles with only some constant inputs in the future. I've added the helper function getTargetConstantBitsFromNode (based off a similar function in X86ShuffleDecodeConstantPool.cpp) that could be reused for other cases requiring constant vector extraction. Differential Revision: https://reviews.llvm.org/D27220 llvm-svn: 288250 2016-12-01 00:33:46 +08:00			`; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero`
			`; X86-NEXT: movsd %xmm0, (%eax)`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X86-NEXT: retl`
			`;`
			`; X64-LABEL: shuf5:`
			`; X64: # BB#0:`
[X86][SSE] Add support for target shuffle constant folding Initial support for target shuffle constant folding in cases where all shuffle inputs are constant. We may be able to relax this and merge shuffles with only some constant inputs in the future. I've added the helper function getTargetConstantBitsFromNode (based off a similar function in X86ShuffleDecodeConstantPool.cpp) that could be reused for other cases requiring constant vector extraction. Differential Revision: https://reviews.llvm.org/D27220 llvm-svn: 288250 2016-12-01 00:33:46 +08:00			`; X64-NEXT: movq {{.*}}(%rip), %rax`
			`; X64-NEXT: movq %rax, (%rdi)`
[X86][SSE] Regenerate and add 32-bit tests to widening tests llvm-svn: 283672 2016-10-09 03:54:28 +08:00			`; X64-NEXT: retq`
CONCAT_VECTORS can have more than two operands. PR11389. llvm-svn: 144768 2011-11-16 10:52:39 +08:00			`%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>`
			`store <8 x i8> %v, <8 x i8>* %p, align 8`
			`ret void`
			`}`