; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
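; The x86_64-linux and x86_64-win32 RUN lines share the CHECK prefix, which is why
; the pointer argument below is matched as either %rdi or %rcx; the 32-bit -O0 RUN
; line is checked separately under the CHECK_O0 prefix.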
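
; t00: the mask <0, 1, 2, 4> keeps lanes 0-2 of the vector loaded from %a0 and puts
; element 0 of the other load into lane 3; the CHECK lines below expect this blend to
; fold into a movss plus shufps sequence.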
define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
entry:
; CHECK: movaps ({{%rdi|%rcx}}), %xmm0
; CHECK: movaps %xmm0, %xmm1
; CHECK-NEXT: movss %xmm2, %xmm1
; CHECK-NEXT: shufps $36, %xmm1, %xmm0
%0 = load <4 x i32>* undef, align 16
%1 = load <4 x i32>* %a0, align 16
%2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %2
}
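
; t01: a scalar double is loaded and inserted into the high element of a
; <2 x double>; at -O0 the CHECK_O0 lines expect the value to be materialized with
; movsd and then duplicated with unpcklpd.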
define void @t01(double* %a0) nounwind ssp {
entry:
; CHECK_O0: movsd (%eax), %xmm0
; CHECK_O0: unpcklpd %xmm0, %xmm0
%tmp93 = load double* %a0, align 8
%vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
store <2 x double> %vecinit94, <2 x double>* undef
ret void
}
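
; t02: element 0 of one vector load and element 1 of another are combined into a
; <2 x i32> and stored; the CHECK lines below expect plain scalar moves (four movs
; followed by ret) with no vector shuffles.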
define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
entry:
; CHECK: t02
; CHECK: mov
; CHECK-NEXT: mov
; CHECK-NEXT: mov
; CHECK-NEXT: mov
; CHECK-NEXT: ret
%0 = bitcast <8 x i32>* %source to <4 x i32>*
%arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
%tmp2 = load <4 x i32>* %arrayidx, align 16
%tmp3 = extractelement <4 x i32> %tmp2, i32 0
%tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0
%arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
%1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
%tmp8 = load <4 x i32>* %1, align 16
%tmp9 = extractelement <4 x i32> %tmp8, i32 1
%tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
ret void
}