; llvm-project/llvm/test/CodeGen/X86/vsplit-and.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn |  FileCheck %s

; t0: AND of two <2 x i64> "not equal to zero" masks, sign-extended back to
; <2 x i64> and stored to %dst.
;
; The expected assembly below compares each source against zero with pcmpeqq
; (yielding the inverted "== 0" masks), rebuilds (%src2 != 0) by XOR-ing with
; an all-ones register, and uses pandn to invert the %src1 mask while AND-ing
; the two together; the result is written with a single 16-byte movdqa.
;
; The function writes memory through %dst, so it must not carry the
; `readonly` attribute; only `nounwind` is kept.
define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind {
; CHECK-LABEL: t0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    pcmpeqq %xmm2, %xmm0
; CHECK-NEXT:    pcmpeqq %xmm2, %xmm1
; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
; CHECK-NEXT:    pxor %xmm1, %xmm2
; CHECK-NEXT:    pandn %xmm2, %xmm0
; CHECK-NEXT:    movdqa %xmm0, (%rdi)
; CHECK-NEXT:    retq
  ; Per-lane masks: true where the corresponding source lane is non-zero.
  %cmp1 = icmp ne <2 x i64> %src1, zeroinitializer
  %cmp2 = icmp ne <2 x i64> %src2, zeroinitializer
  ; Lane-wise AND on the i1 vectors, then widen each i1 to 0 / -1.
  %t1 = and <2 x i1> %cmp1, %cmp2
  %t2 = sext <2 x i1> %t1 to <2 x i64>
  store <2 x i64> %t2, <2 x i64>* %dst
  ret void
}

; t2: same computation as t0 but on <3 x i64>, i.e. a vector whose element
; count is not a power of two.
;
; In the expected assembly below the source elements arrive in general-purpose
; registers (%rsi, %rdx, %rcx, %r8, %r9) plus one element loaded from memory,
; are compared against zero with pcmpeqq and inverted with an all-ones XOR,
; narrowed to 32-bit lanes with shufps, AND-ed with andps, and then
; sign-extended back to 64-bit lanes (psllq $63 / psrad $31 / pshufd). The
; result is stored as a 16-byte movdqa at (%rdi) plus an 8-byte movq at
; 16(%rdi).
;
; The function writes memory through %dst, so it must not carry the
; `readonly` attribute; only `nounwind` is kept.
define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind {
; CHECK-LABEL: t2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %r9, %xmm1
; CHECK-NEXT:    movq %r8, %xmm0
; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    movq %rdx, %xmm1
; CHECK-NEXT:    movq %rsi, %xmm2
; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; CHECK-NEXT:    movq %rcx, %xmm1
; CHECK-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT:    pxor %xmm4, %xmm4
; CHECK-NEXT:    pcmpeqq %xmm4, %xmm1
; CHECK-NEXT:    pcmpeqd %xmm5, %xmm5
; CHECK-NEXT:    pxor %xmm5, %xmm1
; CHECK-NEXT:    pcmpeqq %xmm4, %xmm2
; CHECK-NEXT:    pxor %xmm5, %xmm2
; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; CHECK-NEXT:    pcmpeqq %xmm4, %xmm3
; CHECK-NEXT:    pxor %xmm5, %xmm3
; CHECK-NEXT:    pcmpeqq %xmm4, %xmm0
; CHECK-NEXT:    pxor %xmm5, %xmm0
; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; CHECK-NEXT:    andps %xmm2, %xmm0
; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT:    psllq $63, %xmm1
; CHECK-NEXT:    psrad $31, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT:    psllq $63, %xmm0
; CHECK-NEXT:    psrad $31, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT:    movq %xmm0, 16(%rdi)
; CHECK-NEXT:    movdqa %xmm1, (%rdi)
; CHECK-NEXT:    retq
  ; Per-lane masks: true where the corresponding source lane is non-zero.
  %cmp1 = icmp ne <3 x i64> %src1, zeroinitializer
  %cmp2 = icmp ne <3 x i64> %src2, zeroinitializer
  ; Lane-wise AND on the i1 vectors, then widen each i1 to 0 / -1.
  %t1 = and <3 x i1> %cmp1, %cmp2
  %t2 = sext <3 x i1> %t1 to <3 x i64>
  store <3 x i64> %t2, <3 x i64>* %dst
  ret void
}
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
Harden test so it's not affected by changes to compare lowering. This only failed on hosts that don't have SSE41. llvm-svn: 171066 2012-12-25 21:23:23 +08:00			`; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn \| FileCheck %s`
It seems better to scalarize vectors of size 1 instead of widening them. Add support to widen SETCC. llvm-svn: 94342 2010-01-24 08:24:43 +08:00
Clean the triple, add check lines. llvm-svn: 142183 2011-10-17 15:07:51 +08:00			`define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind readonly {`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-LABEL: t0:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0:`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-NEXT: pxor %xmm2, %xmm2`
			`; CHECK-NEXT: pcmpeqq %xmm2, %xmm0`
			`; CHECK-NEXT: pcmpeqq %xmm2, %xmm1`
[X86] Fix vector ANDN matching to work correctly when both inputs to the AND are XORs. llvm-svn: 293403 2017-01-29 07:52:09 +08:00			`; CHECK-NEXT: pcmpeqd %xmm2, %xmm2`
			`; CHECK-NEXT: pxor %xmm1, %xmm2`
			`; CHECK-NEXT: pandn %xmm2, %xmm0`
			`; CHECK-NEXT: movdqa %xmm0, (%rdi)`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-NEXT: retq`
It seems better to scalarize vectors of size 1 instead of widening them. Add support to widen SETCC. llvm-svn: 94342 2010-01-24 08:24:43 +08:00			`%cmp1 = icmp ne <2 x i64> %src1, zeroinitializer`
			`%cmp2 = icmp ne <2 x i64> %src2, zeroinitializer`
			`%t1 = and <2 x i1> %cmp1, %cmp2`
			`%t2 = sext <2 x i1> %t1 to <2 x i64>`
			`store <2 x i64> %t2, <2 x i64>* %dst`
			`ret void`
			`}`

			`define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-LABEL: t2:`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; CHECK: # %bb.0:`
[X86][SSE2] Fix asm string for movq (Move Quadword) instruction. Replace "mov{d\|q}" with "movq". Differential Revision: https://reviews.llvm.org/D32220 llvm-svn: 301386 2017-04-26 15:08:44 +08:00			`; CHECK-NEXT: movq %r9, %xmm1`
			`; CHECK-NEXT: movq %r8, %xmm0`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]`
[SelectionDAG] Add BITCAST handling to ComputeNumSignBits for splatted sign bits. For cases where we are BITCASTing to vectors of smaller elements, then if the entire source was a splatted sign (src's NumSignBits == SrcBitWidth) we can say that the dst's NumSignBit == DstBitWidth, as we're just splitting those sign bits across multiple elements. We could generalize this but at the moment the only use case I have is to peek through bitcasts to vector comparison results. Differential Revision: https://reviews.llvm.org/D37849 llvm-svn: 313543 2017-09-19 00:45:05 +08:00			`; CHECK-NEXT: movq %rdx, %xmm1`
			`; CHECK-NEXT: movq %rsi, %xmm2`
			`; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]`
			`; CHECK-NEXT: movq %rcx, %xmm1`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero`
			`; CHECK-NEXT: pxor %xmm4, %xmm4`
[x86] use a single shufps when it can save instructions This is a tiny patch with a big pile of test changes. This partially fixes PR27885: https://llvm.org/bugs/show_bug.cgi?id=27885 My motivating case looks like this: - vpshufd {{.#+}} xmm1 = xmm1[0,1,0,2] - vpshufd {{.#+}} xmm0 = xmm0[0,2,2,3] - vpblendw {{.#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] + vshufps {{.#+}} xmm0 = xmm0[0,2],xmm1[0,2] And this happens several times in the diffs. For chips with domain-crossing penalties, the instruction count and size reduction should usually overcome any potential domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so using shufps is a pure win. So the test case diffs all appear to be improvements except one test in vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate zero elements and one test in combine-sra.ll where multiple uses prevent the expected shuffle combining. Differential Revision: https://reviews.llvm.org/D27692 llvm-svn: 289837 2016-12-16 02:03:38 +08:00			`; CHECK-NEXT: pcmpeqq %xmm4, %xmm1`
[SelectionDAG] Add BITCAST handling to ComputeNumSignBits for splatted sign bits. For cases where we are BITCASTing to vectors of smaller elements, then if the entire source was a splatted sign (src's NumSignBits == SrcBitWidth) we can say that the dst's NumSignBit == DstBitWidth, as we're just splitting those sign bits across multiple elements. We could generalize this but at the moment the only use case I have is to peek through bitcasts to vector comparison results. Differential Revision: https://reviews.llvm.org/D37849 llvm-svn: 313543 2017-09-19 00:45:05 +08:00			`; CHECK-NEXT: pcmpeqd %xmm5, %xmm5`
[x86] use a single shufps when it can save instructions This is a tiny patch with a big pile of test changes. This partially fixes PR27885: https://llvm.org/bugs/show_bug.cgi?id=27885 My motivating case looks like this: - vpshufd {{.#+}} xmm1 = xmm1[0,1,0,2] - vpshufd {{.#+}} xmm0 = xmm0[0,2,2,3] - vpblendw {{.#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] + vshufps {{.#+}} xmm0 = xmm0[0,2],xmm1[0,2] And this happens several times in the diffs. For chips with domain-crossing penalties, the instruction count and size reduction should usually overcome any potential domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so using shufps is a pure win. So the test case diffs all appear to be improvements except one test in vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate zero elements and one test in combine-sra.ll where multiple uses prevent the expected shuffle combining. Differential Revision: https://reviews.llvm.org/D27692 llvm-svn: 289837 2016-12-16 02:03:38 +08:00			`; CHECK-NEXT: pxor %xmm5, %xmm1`
[SelectionDAG] Add BITCAST handling to ComputeNumSignBits for splatted sign bits. For cases where we are BITCASTing to vectors of smaller elements, then if the entire source was a splatted sign (src's NumSignBits == SrcBitWidth) we can say that the dst's NumSignBit == DstBitWidth, as we're just splitting those sign bits across multiple elements. We could generalize this but at the moment the only use case I have is to peek through bitcasts to vector comparison results. Differential Revision: https://reviews.llvm.org/D37849 llvm-svn: 313543 2017-09-19 00:45:05 +08:00			`; CHECK-NEXT: pcmpeqq %xmm4, %xmm2`
			`; CHECK-NEXT: pxor %xmm5, %xmm2`
			`; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-NEXT: pcmpeqq %xmm4, %xmm3`
			`; CHECK-NEXT: pxor %xmm5, %xmm3`
			`; CHECK-NEXT: pcmpeqq %xmm4, %xmm0`
			`; CHECK-NEXT: pxor %xmm5, %xmm0`
[x86] use a single shufps when it can save instructions This is a tiny patch with a big pile of test changes. This partially fixes PR27885: https://llvm.org/bugs/show_bug.cgi?id=27885 My motivating case looks like this: - vpshufd {{.#+}} xmm1 = xmm1[0,1,0,2] - vpshufd {{.#+}} xmm0 = xmm0[0,2,2,3] - vpblendw {{.#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] + vshufps {{.#+}} xmm0 = xmm0[0,2],xmm1[0,2] And this happens several times in the diffs. For chips with domain-crossing penalties, the instruction count and size reduction should usually overcome any potential domain-crossing penalty due to using an FP op in a sequence of int ops. For chips such as recent Intel big cores and Atom, there is no domain-crossing penalty for shufps, so using shufps is a pure win. So the test case diffs all appear to be improvements except one test in vector-shuffle-combining.ll where we miss an opportunity to use a shift to generate zero elements and one test in combine-sra.ll where multiple uses prevent the expected shuffle combining. Differential Revision: https://reviews.llvm.org/D27692 llvm-svn: 289837 2016-12-16 02:03:38 +08:00			`; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]`
[SelectionDAG] Add BITCAST handling to ComputeNumSignBits for splatted sign bits. For cases where we are BITCASTing to vectors of smaller elements, then if the entire source was a splatted sign (src's NumSignBits == SrcBitWidth) we can say that the dst's NumSignBit == DstBitWidth, as we're just splitting those sign bits across multiple elements. We could generalize this but at the moment the only use case I have is to peek through bitcasts to vector comparison results. Differential Revision: https://reviews.llvm.org/D37849 llvm-svn: 313543 2017-09-19 00:45:05 +08:00			`; CHECK-NEXT: andps %xmm2, %xmm0`
[X86][SSE] Regenerate vsplit and tests To make it more obvious how bad some of that truncation code is.... llvm-svn: 283880 2016-10-11 21:51:44 +08:00			`; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero`
			`; CHECK-NEXT: psllq $63, %xmm1`
			`; CHECK-NEXT: psrad $31, %xmm1`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]`
			`; CHECK-NEXT: psllq $63, %xmm0`
			`; CHECK-NEXT: psrad $31, %xmm0`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]`
			`; CHECK-NEXT: movq %xmm0, 16(%rdi)`
			`; CHECK-NEXT: movdqa %xmm1, (%rdi)`
			`; CHECK-NEXT: retq`
It seems better to scalarize vectors of size 1 instead of widening them. Add support to widen SETCC. llvm-svn: 94342 2010-01-24 08:24:43 +08:00			`%cmp1 = icmp ne <3 x i64> %src1, zeroinitializer`
			`%cmp2 = icmp ne <3 x i64> %src2, zeroinitializer`
			`%t1 = and <3 x i1> %cmp1, %cmp2`
			`%t2 = sext <3 x i1> %t1 to <3 x i64>`
			`store <3 x i64> %t2, <3 x i64>* %dst`
			`ret void`
			`}`