; llvm-project/llvm/test/CodeGen/X86/sse1.ll

; Tests for SSE1 and below, without SSE2+.
; RUN: llc < %s -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s

; Returns an all-zero <8 x i16>; the <8 x i32> argument is unused. The only
; requirement is that llc emits the function at all under the SSE1-only
; configurations selected at the top of this file.
define <8 x i16> @test1(<8 x i32> %a) nounwind {
; Anchor on the emitted function label (as @test4 does) rather than on the bare
; substring, which could match elsewhere in the output.
; CHECK-LABEL: test1:
  ret <8 x i16> zeroinitializer
}

; Truncates each lane of an <8 x i32> to i16 and returns the <8 x i16> result.
; The only requirement is that llc emits the function at all under the
; SSE1-only configurations selected at the top of this file.
define <8 x i16> @test2(<8 x i32> %a) nounwind {
; Anchor on the emitted function label (as @test4 does) rather than on the bare
; substring, which could match elsewhere in the output.
; CHECK-LABEL: test2:
  %c = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %c
}

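; PR7993: @test3 is commented out as a workaround, because it was causing
; failures on x86 builders that lack SSE2.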
;define <4 x i32> @test3(<4 x i16> %a) nounwind {
;  %c = sext <4 x i16> %a to <4 x i32>             ; <<4 x i32>> [#uses=1]
;  ret <4 x i32> %c
;}

; This should not emit shuffles to populate the top 2 elements of the 4-element
; vector that this ends up returning.
; rdar://8368414
define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
entry:
  ; Pull both lanes out of each <2 x float> operand.
  %a.lane0 = extractelement <2 x float> %A, i32 0
  %a.lane1 = extractelement <2 x float> %A, i32 1
  %b.lane0 = extractelement <2 x float> %B, i32 0
  %b.lane1 = extractelement <2 x float> %B, i32 1
  ; Lane 0 of the result is a sum, lane 1 is a difference.
  %sum.lane0 = fadd float %a.lane0, %b.lane0
  %diff.lane1 = fsub float %a.lane1, %b.lane1
  ; Reassemble the two scalars into the returned <2 x float>.
  %partial = insertelement <2 x float> undef, float %sum.lane0, i32 0
  %result = insertelement <2 x float> %partial, float %diff.lane1, i32 1
  ret <2 x float> %result
; Expected output: two "shufps $1" and one unpcklps before the return, and no
; "shufps $16" anywhere in the function.
; CHECK-LABEL: test4:
; CHECK-NOT: shufps	$16
; CHECK: shufps	$1, 
; CHECK-NOT: shufps	$16
; CHECK: shufps	$1, 
; CHECK-NOT: shufps	$16
; CHECK: unpcklps
; CHECK-NOT: shufps	$16
; CHECK: ret
}
; -----------------------------------------------------------------------------
; Revision notes, recovered from per-line blame annotations and reformatted as
; comments. Each entry names the part of this file the revision annotated.
;
; llvm-svn: 112168 (2010-08-26 13:24:29 +08:00) -- file header comment, @test1
;   fix sse1 only codegen in x86-64 mode, which is something we apparently try
;   to support.
;
; llvm-svn: 112169 (2010-08-26 13:25:05 +08:00) -- 32-bit pentium3 llc command
;   Make sure this forces the x86 targets
;
; llvm-svn: 134753 (2011-07-09 06:16:47 +08:00) -- x86-64 llc command
;   Default 64-bit target features and SSE2 on when a triple specifies x86-64.
;   Clean up all the other hacks which are now unnecessary.
;
; llvm-svn: 112171 (2010-08-26 13:51:22 +08:00) -- @test2
;   implement SplitVecOp_CONCAT_VECTORS, fixing the included testcase with
;   SSE1.
;
; llvm-svn: 112175 (2010-08-26 14:57:07 +08:00) -- commented-out @test3
;   Add a hackaround for PR7993 which is causing failures on x86 builders that
;   lack sse2.
;
; llvm-svn: 112378 (2010-08-29 01:28:30 +08:00) -- @test4 and its shuffle checks
;   fix the BuildVector -> unpcklps logic to not do pointless shuffles when the
;   top elements of a vector are undefined. This happens all the time for
;   X86-64 ABI stuff because only the low 2 elements of a 4 element vector are
;   defined. For example, on:
;
;     _Complex float f32(_Complex float A, _Complex float B) { return A+B; }
;
;   We used to produce (with SSE2, SSE4.1+ uses insertps):
;
;     _f32: ## @f32
;       movdqa %xmm0, %xmm2
;       addss %xmm1, %xmm2
;       pshufd $16, %xmm2, %xmm2
;       pshufd $1, %xmm1, %xmm1
;       pshufd $1, %xmm0, %xmm0
;       addss %xmm1, %xmm0
;       pshufd $16, %xmm0, %xmm1
;       movdqa %xmm2, %xmm0
;       unpcklps %xmm1, %xmm0
;       ret
;
;   We now produce:
;
;     _f32: ## @f32
;       movdqa %xmm0, %xmm2
;       addss %xmm1, %xmm2
;       pshufd $1, %xmm1, %xmm1
;       pshufd $1, %xmm0, %xmm3
;       addss %xmm1, %xmm3
;       movaps %xmm2, %xmm0
;       unpcklps %xmm3, %xmm0
;       ret
;
;   This implements rdar://8368414
;
; llvm-svn: 186258 (2013-07-14 04:38:47 +08:00) -- the label check in @test4
;   Convert CodeGen//.ll tests to use the new CHECK-LABEL for easier debugging.
;   No functionality change and all tests pass after conversion. This was done
;   with the following sed invocation to catch label lines demarking function
;   boundaries (reproduced as recovered; some characters appear to have been
;   lost in extraction):
;
;     sed -i '' "s/^;\( \)\([A-Z0-9_]\):\( \)test\([A-Za-z0-9_-]\):\( \)$/;\1\2-LABEL:\3test\4:\5/g" test/CodeGen//*.ll
;
;   which was written conservatively to avoid false positives rather than
;   false negatives. I scanned through all the changes and everything looks
;   correct.
; -----------------------------------------------------------------------------