; llvm-project/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s

; Upgrade test for the removed llvm.x86.sse2.psll.dq.bs intrinsic, whose shift
; amount is given in bytes: a left shift by 7 must still select one pslldq
; that fills the low seven bytes with zero.
define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq_bs:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
; CHECK-NEXT:    retl
  %shl = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7)
  ret <2 x i64> %shl
}
declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.psrl.dq.bs intrinsic, whose shift
; amount is given in bytes: a right shift by 7 must still select one psrldq
; that fills the high seven bytes with zero.
define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq_bs:
; CHECK:       ## BB#0:
; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retl
  %shr = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7)
  ret <2 x i64> %shr
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.psll.dq intrinsic. The operand 8
; yields a one-byte shift in the expected asm (a single leading zero byte),
; i.e. this variant's amount is expressed in bits rather than bytes.
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT:    retl
  %shl = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8)
  ret <2 x i64> %shl
}
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.psrl.dq intrinsic. The operand 8
; yields a one-byte shift in the expected asm (a single trailing zero byte),
; i.e. this variant's amount is expressed in bits rather than bytes.
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; CHECK-NEXT:    retl
  %shr = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8)
  ret <2 x i64> %shr
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.cvtdq2pd intrinsic
; (<4 x i32> -> <2 x double>): the call must still select cvtdq2pd.
define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %cvt = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
  ret <2 x double> %cvt
}
declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.cvtps2pd intrinsic
; (<4 x float> -> <2 x double>): the call must still select cvtps2pd.
define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtps2pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    cvtps2pd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %cvt = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
  ret <2 x double> %cvt
}
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.storel.dq intrinsic: the pointer
; argument is loaded from the stack and the store must be emitted as movlps.
define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_sse2_storel_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movlps %xmm0, (%eax)
; CHECK-NEXT:    retl
  call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
  ret void
}
declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind


; Upgrade test for the removed llvm.x86.sse2.storeu.dq intrinsic. The vector
; increment ahead of the store forces the integer execution domain, so the
; unaligned store is expected to come out as movdqu. The +1 itself is
; expected as a subtract of an all-ones register (pcmpeqd + psubb).
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubb %xmm1, %xmm0
; CHECK-NEXT:    movdqu %xmm0, (%eax)
; CHECK-NEXT:    retl
  %inc = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %inc)
  ret void
}
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind


; Upgrade test for the removed llvm.x86.sse2.storeu.pd intrinsic. The fadd
; ahead of the store forces the double-precision execution domain, so the
; unaligned store is expected to come out as movupd.
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse2_storeu_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    xorpd %xmm1, %xmm1
; CHECK-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; CHECK-NEXT:    addpd %xmm0, %xmm1
; CHECK-NEXT:    movupd %xmm1, (%eax)
; CHECK-NEXT:    retl
  %sum = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %sum)
  ret void
}
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind

; Upgrade test for the removed llvm.x86.sse2.pshuf.d intrinsic: immediate 27
; must still produce a pshufd that reverses the four dwords ([3,2,1,0]).
; The "entry" label is part of the expected asm comment and must be kept.
define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
; CHECK-LABEL: test_x86_sse2_pshuf_d:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT:    retl
entry:
  %shuf = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone
  ret <4 x i32> %shuf
}
declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.pshufl.w intrinsic: immediate 27
; must still produce a pshuflw that reverses the low four words and leaves
; words 4-7 in place. The "entry" label is part of the expected asm comment.
define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
; CHECK-LABEL: test_x86_sse2_pshufl_w:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NEXT:    retl
entry:
  %shuf = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone
  ret <8 x i16> %shuf
}
declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.pshufh.w intrinsic: immediate 27
; must still produce a pshufhw that reverses the high four words and leaves
; words 0-3 in place. The "entry" label is part of the expected asm comment.
define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
; CHECK-LABEL: test_x86_sse2_pshufh_w:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NEXT:    retl
entry:
  %shuf = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone
  ret <8 x i16> %shuf
}
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.pmaxu.b intrinsic: the call must
; still select a single pmaxub.
define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: max_epu8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pmaxub %xmm1, %xmm0
; CHECK-NEXT:    retl
  %umax = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %umax
}
declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.pminu.b intrinsic: the call must
; still select a single pminub.
define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: min_epu8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pminub %xmm1, %xmm0
; CHECK-NEXT:    retl
  %umin = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %umin
}
declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.pmaxs.w intrinsic: the call must
; still select a single pmaxsw.
define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: max_epi16:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pmaxsw %xmm1, %xmm0
; CHECK-NEXT:    retl
  %smax = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %smax
}
declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.pmins.w intrinsic: the call must
; still select a single pminsw.
define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: min_epi16:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pminsw %xmm1, %xmm0
; CHECK-NEXT:    retl
  %smin = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %smin
}
declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone

; Upgrade test for the removed llvm.x86.sse2.add.sd scalar intrinsic: on the
; SSE2-only configuration used by this file's RUN line the call must still
; select a single addsd.
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse2_add_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    addsd %xmm1, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.sub.sd scalar intrinsic: on the
; SSE2-only configuration used by this file's RUN line the call must still
; select a single subsd.
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse2_sub_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    subsd %xmm1, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.mul.sd scalar intrinsic: on the
; SSE2-only configuration used by this file's RUN line the call must still
; select a single mulsd.
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse2_mul_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    mulsd %xmm1, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone


; Upgrade test for the removed llvm.x86.sse2.div.sd scalar intrinsic: on the
; SSE2-only configuration used by this file's RUN line the call must still
; select a single divsd.
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse2_div_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    divsd %xmm1, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone

; Upgrade test for the llvm.x86.sse2.pavg.b intrinsic: the call must still
; select a single pavgb.
define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: mm_avg_epu8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pavgb %xmm1, %xmm0
; CHECK-NEXT:    retl
  %avg = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %avg
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone

; Upgrade test for the llvm.x86.sse2.pavg.w intrinsic: the call must still
; select a single pavgw.
define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: mm_avg_epu16:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pavgw %xmm1, %xmm0
; CHECK-NEXT:    retl
  %avg = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %avg
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
[X86][SSE] Fix domains for VZEXT_LOAD type instructions Add the missing domain equivalences for movss, movsd, movd and movq zero extending loading instructions. Differential Revision: https://reviews.llvm.org/D27684 llvm-svn: 289825 2016-12-16 00:05:29 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
[X86][SSE2] Regenerated sse2 upgraded intrinsics tests llvm-svn: 270423 2016-05-23 20:40:11 +08:00			`; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 \| FileCheck %s`
[X86] Remove AVX2 and SSE2 pslldq and psrldq intrinsics. We can represent them in IR with vector shuffles now. All their uses have been removed from clang in favor of shuffles. llvm-svn: 229640 2015-02-18 14:24:44 +08:00
			`define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {`
[X86][SSE2] Regenerated sse2 upgraded intrinsics tests llvm-svn: 270423 2016-05-23 20:40:11 +08:00			`; CHECK-LABEL: test_x86_sse2_psll_dq_bs:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]`
			`; CHECK-NEXT: retl`
[X86] Remove AVX2 and SSE2 pslldq and psrldq intrinsics. We can represent them in IR with vector shuffles now. All their uses have been removed from clang in favor of shuffles. llvm-svn: 229640 2015-02-18 14:24:44 +08:00			`%res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]`
			`ret <2 x i64> %res`
			`}`
			`declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone`


			`define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {`
[X86][SSE2] Regenerated sse2 upgraded intrinsics tests llvm-svn: 270423 2016-05-23 20:40:11 +08:00			`; CHECK-LABEL: test_x86_sse2_psrl_dq_bs:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero`
			`; CHECK-NEXT: retl`
[X86] Remove AVX2 and SSE2 pslldq and psrldq intrinsics. We can represent them in IR with vector shuffles now. All their uses have been removed from clang in favor of shuffles. llvm-svn: 229640 2015-02-18 14:24:44 +08:00			`%res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]`
			`ret <2 x i64> %res`
			`}`
			`declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone`

			`define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {`
[X86][SSE2] Regenerated sse2 upgraded intrinsics tests llvm-svn: 270423 2016-05-23 20:40:11 +08:00			`; CHECK-LABEL: test_x86_sse2_psll_dq:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]`
			`; CHECK-NEXT: retl`
[X86] Remove AVX2 and SSE2 pslldq and psrldq intrinsics. We can represent them in IR with vector shuffles now. All their uses have been removed from clang in favor of shuffles. llvm-svn: 229640 2015-02-18 14:24:44 +08:00			`%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]`
			`ret <2 x i64> %res`
			`}`
			`declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone`


			`define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {`
[X86][SSE2] Regenerated sse2 upgraded intrinsics tests llvm-svn: 270423 2016-05-23 20:40:11 +08:00			`; CHECK-LABEL: test_x86_sse2_psrl_dq:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero`
			`; CHECK-NEXT: retl`
[X86] Remove AVX2 and SSE2 pslldq and psrldq intrinsics. We can represent them in IR with vector shuffles now. All their uses have been removed from clang in favor of shuffles. llvm-svn: 229640 2015-02-18 14:24:44 +08:00			`%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]`
			`ret <2 x i64> %res`
			`}`
			`declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone`
[X86] Remove the llvm.x86.sse2.storel.dq intrinsic. It hasn't been used in a long time. llvm-svn: 270677 2016-05-25 14:56:32 +08:00

[X86][SSE] Replace (V)CVTDQ2PD(Y) and (V)CVTPS2PD(Y) lossless conversion intrinsics with generic IR Followup to D20528 clang patch, this removes the (V)CVTDQ2PD(Y) and (V)CVTPS2PD(Y) llvm intrinsics and auto-upgrades to sitofp/fpext instead. Differential Revision: http://reviews.llvm.org/D20568 llvm-svn: 270678 2016-05-25 16:59:18 +08:00			`define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {`
			`; CHECK-LABEL: test_x86_sse2_cvtdq2pd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]`
			`ret <2 x double> %res`
			`}`
			`declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone`


			`define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {`
			`; CHECK-LABEL: test_x86_sse2_cvtps2pd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: cvtps2pd %xmm0, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]`
			`ret <2 x double> %res`
			`}`
			`declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone`


[X86] Remove the llvm.x86.sse2.storel.dq intrinsic. It hasn't been used in a long time. llvm-svn: 270677 2016-05-25 14:56:32 +08:00			`define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {`
			`; CHECK-LABEL: test_x86_sse2_storel_dq:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; CHECK-NEXT: movlps %xmm0, (%eax)`
			`; CHECK-NEXT: retl`
			`call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)`
			`ret void`
			`}`
			`declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind`


[X86] Remove SSE/AVX unaligned store intrinsics as clang no longer uses them. Auto upgrade to native unaligned store instructions. llvm-svn: 271236 2016-05-31 07:15:56 +08:00			`define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {`
			`; add operation forces the execution domain.`
			`; CHECK-LABEL: test_x86_sse2_storeu_dq:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax`
[x86] transform vector inc/dec to use -1 constant (PR33483) Convert vector increment or decrement to sub/add with an all-ones constant: add X, <1, 1...> --> sub X, <-1, -1...> sub X, <1, 1...> --> add X, <-1, -1...> The all-ones vector constant can be materialized using a pcmpeq instruction that is commonly recognized as an idiom (has no register dependency), so that's better than loading a splat 1 constant. AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better way to produce 512 one-bits. The general advantages of this lowering are: 1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables, so in theory, this could be better for perf, but... 2. That seems unlikely to affect any OOO implementation, and I can't measure any real perf difference from this transform on Haswell or Jaguar, but... 3. It doesn't look like it from the diffs, but this is an overall size win because we eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting a scalar load (which might itself be a bug), then we're replacing a scalar constant load + broadcast with a single cheap op, so that should always be smaller/better too. 4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1 and psub x, -1, so we should use that form for +1 too because we can. If there's some reason to favor a constant load on some CPU, let's make the reverse transform for all of these cases (either here in the DAG or in a later machine pass). This should fix: https://bugs.llvm.org/show_bug.cgi?id=33483 Differential Revision: https://reviews.llvm.org/D34336 llvm-svn: 306289 2017-06-26 22:19:26 +08:00			`; CHECK-NEXT: pcmpeqd %xmm1, %xmm1`
			`; CHECK-NEXT: psubb %xmm1, %xmm0`
[X86] Remove SSE/AVX unaligned store intrinsics as clang no longer uses them. Auto upgrade to native unaligned store instructions. llvm-svn: 271236 2016-05-31 07:15:56 +08:00			`; CHECK-NEXT: movdqu %xmm0, (%eax)`
			`; CHECK-NEXT: retl`
			`%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>`
			`call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)`
			`ret void`
			`}`
			`declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind`


			`define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {`
			`; fadd operation forces the execution domain.`
			`; CHECK-LABEL: test_x86_sse2_storeu_pd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax`
[X86][SSE] Allow matchVectorShuffleWithUNPCK to recognise ZERO inputs Add support for specifying an UNPCK input as ZERO, particularly improves ZEXT cases with non-zero offsets llvm-svn: 295169 2017-02-15 19:46:15 +08:00			`; CHECK-NEXT: xorpd %xmm1, %xmm1`
			`; CHECK-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]`
[X86] Remove SSE/AVX unaligned store intrinsics as clang no longer uses them. Auto upgrade to native unaligned store instructions. llvm-svn: 271236 2016-05-31 07:15:56 +08:00			`; CHECK-NEXT: addpd %xmm0, %xmm1`
			`; CHECK-NEXT: movupd %xmm1, (%eax)`
			`; CHECK-NEXT: retl`
			`%a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>`
			`call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)`
			`ret void`
			`}`
			`declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind`

[X86] Remove sse2 pshufd/pshuflw/pshufhw intrinsics and upgrade them to shufflevector. llvm-svn: 272510 2016-06-12 22:11:32 +08:00			`define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {`
			`; CHECK-LABEL: test_x86_sse2_pshuf_d:`
			`; CHECK: ## BB#0: ## %entry`
			`; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]`
			`; CHECK-NEXT: retl`
			`entry:`
[x86] autoupgrade and remove SSE2/SSE41 integer min/max intrinsics Follow-up to: http://reviews.llvm.org/rL272806 http://reviews.llvm.org/rL272807 llvm-svn: 272907 2016-06-16 23:48:30 +08:00			`%res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone`
			`ret <4 x i32> %res`
[X86] Remove sse2 pshufd/pshuflw/pshufhw intrinsics and upgrade them to shufflevector. llvm-svn: 272510 2016-06-12 22:11:32 +08:00			`}`
			`declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone`
[X86] Remove the llvm.x86.sse2.storel.dq intrinsic. It hasn't been used in a long time. llvm-svn: 270677 2016-05-25 14:56:32 +08:00
[X86] Remove sse2 pshufd/pshuflw/pshufhw intrinsics and upgrade them to shufflevector. llvm-svn: 272510 2016-06-12 22:11:32 +08:00			`define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {`
			`; CHECK-LABEL: test_x86_sse2_pshufl_w:`
			`; CHECK: ## BB#0: ## %entry`
			`; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]`
			`; CHECK-NEXT: retl`
			`entry:`
[x86] autoupgrade and remove SSE2/SSE41 integer min/max intrinsics Follow-up to: http://reviews.llvm.org/rL272806 http://reviews.llvm.org/rL272807 llvm-svn: 272907 2016-06-16 23:48:30 +08:00			`%res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone`
			`ret <8 x i16> %res`
[X86] Remove sse2 pshufd/pshuflw/pshufhw intrinsics and upgrade them to shufflevector. llvm-svn: 272510 2016-06-12 22:11:32 +08:00			`}`
			`declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone`

			`define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {`
			`; CHECK-LABEL: test_x86_sse2_pshufh_w:`
			`; CHECK: ## BB#0: ## %entry`
			`; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]`
			`; CHECK-NEXT: retl`
			`entry:`
[x86] autoupgrade and remove SSE2/SSE41 integer min/max intrinsics Follow-up to: http://reviews.llvm.org/rL272806 http://reviews.llvm.org/rL272807 llvm-svn: 272907 2016-06-16 23:48:30 +08:00			`%res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone`
			`ret <8 x i16> %res`
[X86] Remove sse2 pshufd/pshuflw/pshufhw intrinsics and upgrade them to shufflevector. llvm-svn: 272510 2016-06-12 22:11:32 +08:00			`}`
			`declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone`
[x86] autoupgrade and remove SSE2/SSE41 integer min/max intrinsics Follow-up to: http://reviews.llvm.org/rL272806 http://reviews.llvm.org/rL272807 llvm-svn: 272907 2016-06-16 23:48:30 +08:00
			`define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) {`
			`; CHECK-LABEL: max_epu8:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pmaxub %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)`
			`ret <16 x i8> %res`
			`}`
			`declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone`

			`define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) {`
			`; CHECK-LABEL: min_epu8:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pminub %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)`
			`ret <16 x i8> %res`
			`}`
			`declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone`

			`define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) {`
			`; CHECK-LABEL: max_epi16:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pmaxsw %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)`
			`ret <8 x i16> %res`
			`}`
			`declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone`

			`define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {`
			`; CHECK-LABEL: min_epi16:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pminsw %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)`
			`ret <8 x i16> %res`
			`}`
			`declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone`

[X86] Remove the scalar intrinsics for fadd/fsub/fdiv/fmul Summary: These intrinsics have been unused for clang for a while. This patch removes them. We auto upgrade them to extractelements, a scalar operation and then an insertelement. This matches the sequence used by clangs intrinsic file. Reviewers: zvi, delena, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D26660 llvm-svn: 287083 2016-11-16 13:24:10 +08:00			`define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {`
			`; SSE-LABEL: test_x86_sse2_add_sd:`
			`; SSE: ## BB#0:`
			`; SSE-NEXT: addsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x58,0xc1]`
			`; SSE-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; AVX2-LABEL: test_x86_sse2_add_sd:`
			`; AVX2: ## BB#0:`
			`; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x58,0xc1]`
			`; AVX2-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; SKX-LABEL: test_x86_sse2_add_sd:`
			`; SKX: ## BB#0:`
			`; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1]`
			`; SKX-NEXT: retl ## encoding: [0xc3]`
			`; CHECK-LABEL: test_x86_sse2_add_sd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: addsd %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]`
			`ret <2 x double> %res`
			`}`
			`declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone`


			`define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {`
			`; SSE-LABEL: test_x86_sse2_sub_sd:`
			`; SSE: ## BB#0:`
			`; SSE-NEXT: subsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5c,0xc1]`
			`; SSE-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; AVX2-LABEL: test_x86_sse2_sub_sd:`
			`; AVX2: ## BB#0:`
			`; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5c,0xc1]`
			`; AVX2-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; SKX-LABEL: test_x86_sse2_sub_sd:`
			`; SKX: ## BB#0:`
			`; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1]`
			`; SKX-NEXT: retl ## encoding: [0xc3]`
			`; CHECK-LABEL: test_x86_sse2_sub_sd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: subsd %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]`
			`ret <2 x double> %res`
			`}`
			`declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone`


			`define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {`
			`; SSE-LABEL: test_x86_sse2_mul_sd:`
			`; SSE: ## BB#0:`
			`; SSE-NEXT: mulsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x59,0xc1]`
			`; SSE-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; AVX2-LABEL: test_x86_sse2_mul_sd:`
			`; AVX2: ## BB#0:`
			`; AVX2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x59,0xc1]`
			`; AVX2-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; SKX-LABEL: test_x86_sse2_mul_sd:`
			`; SKX: ## BB#0:`
			`; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1]`
			`; SKX-NEXT: retl ## encoding: [0xc3]`
			`; CHECK-LABEL: test_x86_sse2_mul_sd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: mulsd %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]`
			`ret <2 x double> %res`
			`}`
			`declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone`


			`define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {`
			`; SSE-LABEL: test_x86_sse2_div_sd:`
			`; SSE: ## BB#0:`
			`; SSE-NEXT: divsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5e,0xc1]`
			`; SSE-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; AVX2-LABEL: test_x86_sse2_div_sd:`
			`; AVX2: ## BB#0:`
			`; AVX2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5e,0xc1]`
			`; AVX2-NEXT: retl ## encoding: [0xc3]`
			`;`
			`; SKX-LABEL: test_x86_sse2_div_sd:`
			`; SKX: ## BB#0:`
			`; SKX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1]`
			`; SKX-NEXT: retl ## encoding: [0xc3]`
			`; CHECK-LABEL: test_x86_sse2_div_sd:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: divsd %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]`
			`ret <2 x double> %res`
			`}`
			`declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone`

[X86] Lower _mm[256\|512]_[mask[z]]_avg_epu[8\|16] intrinsics to native llvm IR Differential Revision: https://reviews.llvm.org/D37560 llvm-svn: 313013 2017-09-12 15:50:35 +08:00			`define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {`
			`; CHECK-LABEL: mm_avg_epu8:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pavgb %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]`
			`ret <16 x i8> %res`
			`}`
			`declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone`

			`define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {`
			`; CHECK-LABEL: mm_avg_epu16:`
			`; CHECK: ## BB#0:`
			`; CHECK-NEXT: pavgw %xmm1, %xmm0`
			`; CHECK-NEXT: retl`
			`%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]`
			`ret <8 x i16> %res`
			`}`
			`declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone`
[X86] Remove the scalar intrinsics for fadd/fsub/fdiv/fmul Summary: These intrinsics have been unused for clang for a while. This patch removes them. We auto upgrade them to extractelements, a scalar operation and then an insertelement. This matches the sequence used by clangs intrinsic file. Reviewers: zvi, delena, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D26660 llvm-svn: 287083 2016-11-16 13:24:10 +08:00