; llvm-project/llvm/test/CodeGen/X86/mmx-build-vector.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx | FileCheck %s --check-prefixes=X86,X86-MMX
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s --check-prefixes=X86,X86-SSE,X86-SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
;
; v2i32
;
; Build <2 x i32> from both scalar args (%a0 in lane 0, %a1 in lane 1),
; bitcast to x86_mmx, and run it through paddd so the value must be
; materialized in an MMX register before the store.
define void @build_v2i32_01(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-MMX-LABEL: build_v2i32_01:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 12(%ebp), %ecx
; X86-MMX-NEXT: movl 16(%ebp), %edx
; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movl %ecx, (%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2i32_01:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $8, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT: movlps %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: build_v2i32_01:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: movd %esi, %xmm1
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT: paddd %mm0, %mm0
; X64-SSE-NEXT: movq %mm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: build_v2i32_01:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovd %esi, %xmm0
; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT: paddd %mm0, %mm0
; X64-AVX-NEXT: movq %mm0, (%rdi)
; X64-AVX-NEXT: retq
  %1 = insertelement <2 x i32> undef, i32 %a0, i32 0   ; lane 0 = %a0
  %2 = insertelement <2 x i32> %1, i32 %a1, i32 1      ; lane 1 = %a1
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) ; %4 = %3 + %3
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build <2 x i32> with %a0 in lane 0 and a constant zero in lane 1; the
; zero upper lane should fold into a single movd (implicit zero-extend).
define void @build_v2i32_0z(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-LABEL: build_v2i32_0z:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: build_v2i32_0z:
; X64: # %bb.0:
; X64-NEXT: movd %esi, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %1 = insertelement <2 x i32> undef, i32 %a0, i32 0   ; lane 0 = %a0
  %2 = insertelement <2 x i32> %1, i32 0, i32 1        ; lane 1 = 0
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build <2 x i32> with an undef lane 0 and %a1 in lane 1; checks how each
; subtarget exploits the undef lane (e.g. broadcast on SSE/AVX2/AVX512).
define void @build_v2i32_u1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-MMX-LABEL: build_v2i32_u1:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 16(%ebp), %ecx
; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2i32_u1:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $8, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT: movq %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: build_v2i32_u1:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT: paddd %mm0, %mm0
; X64-SSE-NEXT: movq %mm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: build_v2i32_u1:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovd %edx, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT: paddd %mm0, %mm0
; X64-AVX1-NEXT: movq %mm0, (%rdi)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: build_v2i32_u1:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovd %edx, %xmm0
; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT: paddd %mm0, %mm0
; X64-AVX2-NEXT: movq %mm0, (%rdi)
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: build_v2i32_u1:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vmovd %edx, %xmm0
; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT: paddd %mm0, %mm0
; X64-AVX512-NEXT: movq %mm0, (%rdi)
; X64-AVX512-NEXT: retq
  %1 = insertelement <2 x i32> undef, i32 undef, i32 0 ; lane 0 = undef
  %2 = insertelement <2 x i32> %1, i32 %a1, i32 1      ; lane 1 = %a1
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build <2 x i32> with a constant zero in lane 0 and %a1 in lane 1; unlike
; the undef case above, the zero lane must actually be materialized.
define void @build_v2i32_z1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-MMX-LABEL: build_v2i32_z1:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 16(%ebp), %ecx
; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movl $0, (%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2i32_z1:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $8, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; X86-SSE-NEXT: movq %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: build_v2i32_z1:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: # kill: def $edx killed $edx def $rdx
; X64-SSE-NEXT: movq %rdx, %xmm0
; X64-SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT: paddd %mm0, %mm0
; X64-SSE-NEXT: movq %mm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: build_v2i32_z1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: # kill: def $edx killed $edx def $rdx
; X64-AVX-NEXT: vmovq %rdx, %xmm0
; X64-AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT: paddd %mm0, %mm0
; X64-AVX-NEXT: movq %mm0, (%rdi)
; X64-AVX-NEXT: retq
  %1 = insertelement <2 x i32> undef, i32 0, i32 0     ; lane 0 = 0
  %2 = insertelement <2 x i32> %1, i32 %a1, i32 1      ; lane 1 = %a1
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build <2 x i32> as a splat of %a0 in both lanes; AVX2/AVX512 should
; select vpbroadcastd, earlier subtargets a shuffle/duplicated store.
define void @build_v2i32_00(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-MMX-LABEL: build_v2i32_00:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 12(%ebp), %ecx
; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movl %ecx, (%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2i32_00:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $8, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X86-SSE-NEXT: movq %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: build_v2i32_00:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %esi, %xmm0
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT: paddd %mm0, %mm0
; X64-SSE-NEXT: movq %mm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: build_v2i32_00:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovd %esi, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT: paddd %mm0, %mm0
; X64-AVX1-NEXT: movq %mm0, (%rdi)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: build_v2i32_00:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovd %esi, %xmm0
; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT: paddd %mm0, %mm0
; X64-AVX2-NEXT: movq %mm0, (%rdi)
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: build_v2i32_00:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vmovd %esi, %xmm0
; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT: paddd %mm0, %mm0
; X64-AVX512-NEXT: movq %mm0, (%rdi)
; X64-AVX512-NEXT: retq
  %1 = insertelement <2 x i32> undef, i32 %a0, i32 0   ; lane 0 = %a0
  %2 = insertelement <2 x i32> %1, i32 %a0, i32 1      ; lane 1 = %a0 (splat)
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
;
; v4i16
;
; Build <4 x i16> from all four scalar args in order, bitcast to x86_mmx
; and double it with paddd; exercises 16-bit element packing per subtarget.
define void @build_v4i16_0123(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-MMX-LABEL: build_v4i16_0123:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 24(%ebp), %ecx
; X86-MMX-NEXT: shll $16, %ecx
; X86-MMX-NEXT: movzwl 20(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movl 16(%ebp), %ecx
; X86-MMX-NEXT: shll $16, %ecx
; X86-MMX-NEXT: movzwl 12(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, (%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v4i16_0123:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $8, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0
; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0
; X86-SSE-NEXT: pinsrw $3, 24(%ebp), %xmm0
; X86-SSE-NEXT: movq %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE2-LABEL: build_v4i16_0123:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movd %r8d, %xmm0
; X64-SSE2-NEXT: movd %ecx, %xmm1
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE2-NEXT: movd %edx, %xmm0
; X64-SSE2-NEXT: movd %esi, %xmm2
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT: paddd %mm0, %mm0
; X64-SSE2-NEXT: movq %mm0, (%rdi)
; X64-SSE2-NEXT: retq
;
; X64-SSSE3-LABEL: build_v4i16_0123:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movd %r8d, %xmm0
; X64-SSSE3-NEXT: movd %ecx, %xmm1
; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSSE3-NEXT: movd %edx, %xmm0
; X64-SSSE3-NEXT: movd %esi, %xmm2
; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-SSSE3-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT: paddd %mm0, %mm0
; X64-SSSE3-NEXT: movq %mm0, (%rdi)
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: build_v4i16_0123:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovd %esi, %xmm0
; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrd $3, %r8d, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT: paddd %mm0, %mm0
; X64-AVX-NEXT: movq %mm0, (%rdi)
; X64-AVX-NEXT: retq
  %1 = insertelement <4 x i16> undef, i16 %a0, i32 0   ; lanes: a0,a1,a2,a3
  %2 = insertelement <4 x i16> %1, i16 %a1, i32 1
  %3 = insertelement <4 x i16> %2, i16 %a2, i32 2
  %4 = insertelement <4 x i16> %3, i16 %a3, i32 3
  %5 = bitcast <4 x i16> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  store x86_mmx %6, x86_mmx *%p0
  ret void
}
; Build <4 x i16> with %a0,%a1 in the low lanes and constant zeros in the
; upper two lanes; the zero upper half should simplify the packing.
define void @build_v4i16_01zz(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-MMX-LABEL: build_v4i16_01zz:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 16(%ebp), %ecx
; X86-MMX-NEXT: shll $16, %ecx
; X86-MMX-NEXT: movzwl 12(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, (%esp)
; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE2-LABEL: build_v4i16_01zz:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: andl $-8, %esp
; X86-SSE2-NEXT: subl $8, %esp
; X86-SSE2-NEXT: movl 8(%ebp), %eax
; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT: movq %xmm0, (%esp)
; X86-SSE2-NEXT: movq (%esp), %mm0
; X86-SSE2-NEXT: paddd %mm0, %mm0
; X86-SSE2-NEXT: movq %mm0, (%eax)
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSSE3-LABEL: build_v4i16_01zz:
; X86-SSSE3: # %bb.0:
; X86-SSSE3-NEXT: pushl %ebp
; X86-SSSE3-NEXT: movl %esp, %ebp
; X86-SSSE3-NEXT: andl $-8, %esp
; X86-SSSE3-NEXT: subl $8, %esp
; X86-SSSE3-NEXT: movl 8(%ebp), %eax
; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSSE3-NEXT: movq %xmm1, (%esp)
; X86-SSSE3-NEXT: movq (%esp), %mm0
; X86-SSSE3-NEXT: paddd %mm0, %mm0
; X86-SSSE3-NEXT: movq %mm0, (%eax)
; X86-SSSE3-NEXT: movl %ebp, %esp
; X86-SSSE3-NEXT: popl %ebp
; X86-SSSE3-NEXT: retl
;
; X64-SSE2-LABEL: build_v4i16_01zz:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movd %edx, %xmm0
; X64-SSE2-NEXT: movd %esi, %xmm1
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT: paddd %mm0, %mm0
; X64-SSE2-NEXT: movq %mm0, (%rdi)
; X64-SSE2-NEXT: retq
;
; X64-SSSE3-LABEL: build_v4i16_01zz:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movd %edx, %xmm0
; X64-SSSE3-NEXT: movd %esi, %xmm1
; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT: paddd %mm0, %mm0
; X64-SSSE3-NEXT: movq %mm0, (%rdi)
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: build_v4i16_01zz:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovd %edx, %xmm0
; X64-AVX-NEXT: vmovd %esi, %xmm1
; X64-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT: paddd %mm0, %mm0
; X64-AVX-NEXT: movq %mm0, (%rdi)
; X64-AVX-NEXT: retq
  %1 = insertelement <4 x i16> undef, i16 %a0, i32 0   ; lanes: a0,a1,0,0
  %2 = insertelement <4 x i16> %1, i16 %a1, i32 1
  %3 = insertelement <4 x i16> %2, i16 0, i32 2
  %4 = insertelement <4 x i16> %3, i16 0, i32 3
  %5 = bitcast <4 x i16> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  store x86_mmx %6, x86_mmx *%p0
  ret void
}
; Build <4 x i16> with %a0 in lane 0, undef in lanes 1-2, zero in lane 3;
; the undef/zero tail should collapse to a single movd of %a0.
define void @build_v4i16_0uuz(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-LABEL: build_v4i16_0uuz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: build_v4i16_0uuz:
; X64: # %bb.0:
; X64-NEXT: movd %esi, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %1 = insertelement <4 x i16> undef, i16 %a0, i32 0   ; lanes: a0,u,u,0
  %2 = insertelement <4 x i16> %1, i16 undef, i32 1
  %3 = insertelement <4 x i16> %2, i16 undef, i32 2
  %4 = insertelement <4 x i16> %3, i16 0, i32 3
  %5 = bitcast <4 x i16> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  store x86_mmx %6, x86_mmx *%p0
  ret void
}
; Build <4 x i16> with %a0 in lane 0, zero in lane 1, undef in lane 2,
; zero in lane 3; the explicit zero in lane 1 forces a movzwl of %a0.
define void @build_v4i16_0zuz(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-LABEL: build_v4i16_0zuz:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd %eax, %mm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: build_v4i16_0zuz:
; X64: # %bb.0:
; X64-NEXT: movzwl %si, %eax
; X64-NEXT: movd %eax, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %1 = insertelement <4 x i16> undef, i16 %a0, i32 0   ; lanes: a0,0,u,0
  %2 = insertelement <4 x i16> %1, i16 0, i32 1
  %3 = insertelement <4 x i16> %2, i16 undef, i32 2
  %4 = insertelement <4 x i16> %3, i16 0, i32 3
  %5 = bitcast <4 x i16> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  store x86_mmx %6, x86_mmx *%p0
  ret void
}
; Build <4 x i16> from %a0,%a1,%a2 with an undef lane 3; codegen is free
; to leave any value in the top lane.
define void @build_v4i16_012u(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-MMX-LABEL: build_v4i16_012u:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 16(%ebp), %ecx
; X86-MMX-NEXT: shll $16, %ecx
; X86-MMX-NEXT: movzwl 12(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, (%esp)
; X86-MMX-NEXT: shll $16, %ecx
; X86-MMX-NEXT: movzwl 20(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v4i16_012u:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $8, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0
; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0
; X86-SSE-NEXT: movq %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: movl %ebp, %esp
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: build_v4i16_012u:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: movd %esi, %xmm1
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT: paddd %mm0, %mm0
; X64-SSE-NEXT: movq %mm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: build_v4i16_012u:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovd %esi, %xmm0
; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; X64-AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT: paddd %mm0, %mm0
; X64-AVX-NEXT: movq %mm0, (%rdi)
; X64-AVX-NEXT: retq
  %1 = insertelement <4 x i16> undef, i16 %a0, i32 0   ; lanes: a0,a1,a2,u
  %2 = insertelement <4 x i16> %1, i16 %a1, i32 1
  %3 = insertelement <4 x i16> %2, i16 %a2, i32 2
  %4 = insertelement <4 x i16> %3, i16 undef, i32 3
  %5 = bitcast <4 x i16> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  store x86_mmx %6, x86_mmx *%p0
  ret void
}
; Build <4 x i16> as a partial splat of %a0 (lanes 0, 2, 3) with an undef
; lane 1; checks splat-with-undef handling across subtargets.
define void @build_v4i16_0u00(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-MMX-LABEL: build_v4i16_0u00:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $8, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movzwl 12(%ebp), %ecx
; X86-MMX-NEXT: movl %ecx, %edx
; X86-MMX-NEXT: shll $16, %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: shll $16, %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: movl %edx, (%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: movl %ebp, %esp
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE2-LABEL: build_v4i16_0u00:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: andl $-8, %esp
; X86-SSE2-NEXT: subl $8, %esp
; X86-SSE2-NEXT: movl 8(%ebp), %eax
; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT: movq %xmm0, (%esp)
; X86-SSE2-NEXT: movq (%esp), %mm0
; X86-SSE2-NEXT: paddd %mm0, %mm0
; X86-SSE2-NEXT: movq %mm0, (%eax)
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSSE3-LABEL: build_v4i16_0u00:
; X86-SSSE3: # %bb.0:
; X86-SSSE3-NEXT: pushl %ebp
; X86-SSSE3-NEXT: movl %esp, %ebp
; X86-SSSE3-NEXT: andl $-8, %esp
; X86-SSSE3-NEXT: subl $8, %esp
; X86-SSSE3-NEXT: movl 8(%ebp), %eax
; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
; X86-SSSE3-NEXT: movq %xmm0, (%esp)
; X86-SSSE3-NEXT: movq (%esp), %mm0
; X86-SSSE3-NEXT: paddd %mm0, %mm0
; X86-SSSE3-NEXT: movq %mm0, (%eax)
; X86-SSSE3-NEXT: movl %ebp, %esp
; X86-SSSE3-NEXT: popl %ebp
; X86-SSSE3-NEXT: retl
;
; X64-SSE2-LABEL: build_v4i16_0u00:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movd %esi, %xmm0
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT: paddd %mm0, %mm0
; X64-SSE2-NEXT: movq %mm0, (%rdi)
; X64-SSE2-NEXT: retq
;
; X64-SSSE3-LABEL: build_v4i16_0u00:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movd %esi, %xmm0
; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT: paddd %mm0, %mm0
; X64-SSSE3-NEXT: movq %mm0, (%rdi)
; X64-SSSE3-NEXT: retq
;
; X64-AVX1-LABEL: build_v4i16_0u00:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovd %esi, %xmm0
; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT: paddd %mm0, %mm0
; X64-AVX1-NEXT: movq %mm0, (%rdi)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: build_v4i16_0u00:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovd %esi, %xmm0
; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT: paddd %mm0, %mm0
; X64-AVX2-NEXT: movq %mm0, (%rdi)
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: build_v4i16_0u00:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vmovd %esi, %xmm0
; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT: paddd %mm0, %mm0
; X64-AVX512-NEXT: movq %mm0, (%rdi)
; X64-AVX512-NEXT: retq
  %1 = insertelement <4 x i16> undef, i16 %a0, i32 0   ; lanes: a0,u,a0,a0
  %2 = insertelement <4 x i16> %1, i16 undef, i32 1
  %3 = insertelement <4 x i16> %2, i16 %a0, i32 2
  %4 = insertelement <4 x i16> %3, i16 %a0, i32 3
  %5 = bitcast <4 x i16> %4 to x86_mmx
  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
  store x86_mmx %6, x86_mmx *%p0
  ret void
}
;
; v8i8
;
; Build <8 x i8> from all eight scalar args in order; exercises byte-level
; packing (shift/or scalarization on MMX-only, pinsrw/pinsrb on SSE/AVX).
define void @build_v8i8_01234567(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-MMX-LABEL: build_v8i8_01234567:
; X86-MMX: # %bb.0:
; X86-MMX-NEXT: pushl %ebp
; X86-MMX-NEXT: movl %esp, %ebp
; X86-MMX-NEXT: pushl %esi
; X86-MMX-NEXT: andl $-8, %esp
; X86-MMX-NEXT: subl $16, %esp
; X86-MMX-NEXT: movl 8(%ebp), %eax
; X86-MMX-NEXT: movl 40(%ebp), %ecx
; X86-MMX-NEXT: shll $8, %ecx
; X86-MMX-NEXT: movzbl 36(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: shll $16, %edx
; X86-MMX-NEXT: movl 32(%ebp), %ecx
; X86-MMX-NEXT: shll $8, %ecx
; X86-MMX-NEXT: movzbl 28(%ebp), %esi
; X86-MMX-NEXT: orl %ecx, %esi
; X86-MMX-NEXT: movzwl %si, %ecx
; X86-MMX-NEXT: orl %edx, %ecx
; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT: movl 24(%ebp), %ecx
; X86-MMX-NEXT: shll $8, %ecx
; X86-MMX-NEXT: movzbl 20(%ebp), %edx
; X86-MMX-NEXT: orl %ecx, %edx
; X86-MMX-NEXT: shll $16, %edx
; X86-MMX-NEXT: movl 16(%ebp), %ecx
; X86-MMX-NEXT: shll $8, %ecx
; X86-MMX-NEXT: movzbl 12(%ebp), %esi
; X86-MMX-NEXT: orl %ecx, %esi
; X86-MMX-NEXT: movzwl %si, %ecx
; X86-MMX-NEXT: orl %edx, %ecx
; X86-MMX-NEXT: movl %ecx, (%esp)
; X86-MMX-NEXT: movq (%esp), %mm0
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
; X86-MMX-NEXT: leal -4(%ebp), %esp
; X86-MMX-NEXT: popl %esi
; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v8i8_01234567:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: andl $-8, %esp
; X86-SSE-NEXT: subl $16, %esp
; X86-SSE-NEXT: movl 8(%ebp), %eax
; X86-SSE-NEXT: movl 24(%ebp), %ecx
; X86-SSE-NEXT: shll $8, %ecx
; X86-SSE-NEXT: movzbl 20(%ebp), %edx
; X86-SSE-NEXT: orl %ecx, %edx
; X86-SSE-NEXT: movl 16(%ebp), %ecx
; X86-SSE-NEXT: shll $8, %ecx
; X86-SSE-NEXT: movzbl 12(%ebp), %esi
; X86-SSE-NEXT: orl %ecx, %esi
; X86-SSE-NEXT: movd %esi, %xmm0
; X86-SSE-NEXT: pinsrw $1, %edx, %xmm0
; X86-SSE-NEXT: movl 32(%ebp), %ecx
; X86-SSE-NEXT: shll $8, %ecx
; X86-SSE-NEXT: movzbl 28(%ebp), %edx
; X86-SSE-NEXT: orl %ecx, %edx
; X86-SSE-NEXT: pinsrw $2, %edx, %xmm0
; X86-SSE-NEXT: movl 40(%ebp), %ecx
; X86-SSE-NEXT: shll $8, %ecx
; X86-SSE-NEXT: movzbl 36(%ebp), %edx
; X86-SSE-NEXT: orl %ecx, %edx
; X86-SSE-NEXT: pinsrw $3, %edx, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%esp)
; X86-SSE-NEXT: movq (%esp), %mm0
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
; X86-SSE-NEXT: leal -4(%ebp), %esp
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: build_v8i8_01234567:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: shll $8, %r8d
; X64-SSE-NEXT: movzbl %cl, %eax
; X64-SSE-NEXT: orl %r8d, %eax
; X64-SSE-NEXT: shll $8, %edx
; X64-SSE-NEXT: movzbl %sil, %ecx
; X64-SSE-NEXT: orl %edx, %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
; X64-SSE-NEXT: shll $8, %eax
; X64-SSE-NEXT: movzbl %r9b, %ecx
; X64-SSE-NEXT: orl %eax, %ecx
; X64-SSE-NEXT: pinsrw $2, %ecx, %xmm0
; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
; X64-SSE-NEXT: shll $8, %eax
; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; X64-SSE-NEXT: orl %eax, %ecx
; X64-SSE-NEXT: pinsrw $3, %ecx, %xmm0
; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT: paddd %mm0, %mm0
; X64-SSE-NEXT: movq %mm0, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: build_v8i8_01234567:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovd %esi, %xmm0
; X64-AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrb $4, %r9d, %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT: paddd %mm0, %mm0
; X64-AVX-NEXT: movq %mm0, (%rdi)
; X64-AVX-NEXT: retq
  %1 = insertelement <8 x i8> undef, i8 %a0, i32 0     ; lanes: a0..a7
  %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
  %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
  %4 = insertelement <8 x i8> %3, i8 %a3, i32 3
  %5 = insertelement <8 x i8> %4, i8 %a4, i32 4
  %6 = insertelement <8 x i8> %5, i8 %a5, i32 5
  %7 = insertelement <8 x i8> %6, i8 %a6, i32 6
  %8 = insertelement <8 x i8> %7, i8 %a7, i32 7
  %9 = bitcast <8 x i8> %8 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
  store x86_mmx %10, x86_mmx *%p0
  ret void
}
; Build a <8 x i8> where lane 1 is undef ('u') and lane 6 is zero ('z'),
; bitcast it to x86_mmx, double it via MMX paddd and store to %p0.
; CHECK lines below are autogenerated by update_llc_test_checks.py -- do not
; hand-edit them; regenerate instead.
define void @build_v8i8_0u2345z7(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-MMX-LABEL: build_v8i8_0u2345z7:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    movl 24(%ebp), %ecx
; X86-MMX-NEXT:    shll $8, %ecx
; X86-MMX-NEXT:    movzbl 20(%ebp), %edx
; X86-MMX-NEXT:    orl %ecx, %edx
; X86-MMX-NEXT:    shll $16, %edx
; X86-MMX-NEXT:    movzbl 12(%ebp), %ecx
; X86-MMX-NEXT:    orl %edx, %ecx
; X86-MMX-NEXT:    movl %ecx, (%esp)
; X86-MMX-NEXT:    movl 32(%ebp), %ecx
; X86-MMX-NEXT:    shll $8, %ecx
; X86-MMX-NEXT:    movzbl 28(%ebp), %edx
; X86-MMX-NEXT:    orl %ecx, %edx
; X86-MMX-NEXT:    movzwl %dx, %ecx
; X86-MMX-NEXT:    movl 40(%ebp), %edx
; X86-MMX-NEXT:    shll $24, %edx
; X86-MMX-NEXT:    orl %ecx, %edx
; X86-MMX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE2-LABEL: build_v8i8_0u2345z7:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    movl %esp, %ebp
; X86-SSE2-NEXT:    andl $-8, %esp
; X86-SSE2-NEXT:    subl $8, %esp
; X86-SSE2-NEXT:    movl 8(%ebp), %eax
; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSE2-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT:    packuswb %xmm0, %xmm0
; X86-SSE2-NEXT:    movq %xmm0, (%esp)
; X86-SSE2-NEXT:    movq (%esp), %mm0
; X86-SSE2-NEXT:    paddd %mm0, %mm0
; X86-SSE2-NEXT:    movq %mm0, (%eax)
; X86-SSE2-NEXT:    movl %ebp, %esp
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSSE3-LABEL: build_v8i8_0u2345z7:
; X86-SSSE3:       # %bb.0:
; X86-SSSE3-NEXT:    pushl %ebp
; X86-SSSE3-NEXT:    movl %esp, %ebp
; X86-SSSE3-NEXT:    andl $-8, %esp
; X86-SSSE3-NEXT:    subl $8, %esp
; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    pxor %xmm1, %xmm1
; X86-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; X86-SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
; X86-SSSE3-NEXT:    movq %xmm0, (%esp)
; X86-SSSE3-NEXT:    movq (%esp), %mm0
; X86-SSSE3-NEXT:    paddd %mm0, %mm0
; X86-SSSE3-NEXT:    movq %mm0, (%eax)
; X86-SSSE3-NEXT:    movl %ebp, %esp
; X86-SSSE3-NEXT:    popl %ebp
; X86-SSSE3-NEXT:    retl
;
; X64-SSE2-LABEL: build_v8i8_0u2345z7:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-SSE2-NEXT:    movd %r9d, %xmm0
; X64-SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    movd %r8d, %xmm1
; X64-SSE2-NEXT:    movd %ecx, %xmm2
; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X64-SSE2-NEXT:    movd %esi, %xmm1
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; X64-SSE2-NEXT:    packuswb %xmm1, %xmm1
; X64-SSE2-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT:    paddd %mm0, %mm0
; X64-SSE2-NEXT:    movq %mm0, (%rdi)
; X64-SSE2-NEXT:    retq
;
; X64-SSSE3-LABEL: build_v8i8_0u2345z7:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    pxor %xmm1, %xmm1
; X64-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-SSSE3-NEXT:    movd %r9d, %xmm0
; X64-SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSSE3-NEXT:    movd %r8d, %xmm1
; X64-SSSE3-NEXT:    movd %ecx, %xmm2
; X64-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X64-SSSE3-NEXT:    movd %esi, %xmm1
; X64-SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,u,4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u]
; X64-SSSE3-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT:    paddd %mm0, %mm0
; X64-SSSE3-NEXT:    movq %mm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: build_v8i8_0u2345z7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $0, %esi, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $4, %r9d, %xmm0, %xmm0
; X64-AVX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; X64-AVX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X64-AVX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; X64-AVX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
; X64-AVX-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT:    paddd %mm0, %mm0
; X64-AVX-NEXT:    movq %mm0, (%rdi)
; X64-AVX-NEXT:    retq
  ; Lane 1 is left undef and lane 6 is forced to zero; %a1 and %a6 are
  ; intentionally unused so codegen may exploit the undef/zero lanes.
  %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
  %2 = insertelement <8 x i8> %1, i8 undef, i32 1
  %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
  %4 = insertelement <8 x i8> %3, i8 %a3, i32 3
  %5 = insertelement <8 x i8> %4, i8 %a4, i32 4
  %6 = insertelement <8 x i8> %5, i8 %a5, i32 5
  %7 = insertelement <8 x i8> %6, i8 0, i32 6
  %8 = insertelement <8 x i8> %7, i8 %a7, i32 7
  %9 = bitcast <8 x i8> %8 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
  store x86_mmx %10, x86_mmx *%p0
  ret void
}
; Build a <8 x i8> from the first four args, with lanes 4-6 zero ('z') and
; lane 7 undef ('u'), then paddd it with itself and store to %p0.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v8i8_0123zzzu(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-MMX-LABEL: build_v8i8_0123zzzu:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    pushl %esi
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $16, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    movl 24(%ebp), %ecx
; X86-MMX-NEXT:    shll $8, %ecx
; X86-MMX-NEXT:    movzbl 20(%ebp), %edx
; X86-MMX-NEXT:    orl %ecx, %edx
; X86-MMX-NEXT:    shll $16, %edx
; X86-MMX-NEXT:    movl 16(%ebp), %ecx
; X86-MMX-NEXT:    shll $8, %ecx
; X86-MMX-NEXT:    movzbl 12(%ebp), %esi
; X86-MMX-NEXT:    orl %ecx, %esi
; X86-MMX-NEXT:    movzwl %si, %ecx
; X86-MMX-NEXT:    orl %edx, %ecx
; X86-MMX-NEXT:    movl %ecx, (%esp)
; X86-MMX-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    leal -4(%ebp), %esp
; X86-MMX-NEXT:    popl %esi
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE2-LABEL: build_v8i8_0123zzzu:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    movl %esp, %ebp
; X86-SSE2-NEXT:    andl $-8, %esp
; X86-SSE2-NEXT:    subl $8, %esp
; X86-SSE2-NEXT:    movl 8(%ebp), %eax
; X86-SSE2-NEXT:    movl 12(%ebp), %ecx
; X86-SSE2-NEXT:    pxor %xmm0, %xmm0
; X86-SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
; X86-SSE2-NEXT:    movl 16(%ebp), %ecx
; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
; X86-SSE2-NEXT:    pinsrw $2, %ecx, %xmm0
; X86-SSE2-NEXT:    movl 24(%ebp), %ecx
; X86-SSE2-NEXT:    pinsrw $3, %ecx, %xmm0
; X86-SSE2-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT:    packuswb %xmm0, %xmm0
; X86-SSE2-NEXT:    movq %xmm0, (%esp)
; X86-SSE2-NEXT:    movq (%esp), %mm0
; X86-SSE2-NEXT:    paddd %mm0, %mm0
; X86-SSE2-NEXT:    movq %mm0, (%eax)
; X86-SSE2-NEXT:    movl %ebp, %esp
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSSE3-LABEL: build_v8i8_0123zzzu:
; X86-SSSE3:       # %bb.0:
; X86-SSSE3-NEXT:    pushl %ebp
; X86-SSSE3-NEXT:    movl %esp, %ebp
; X86-SSSE3-NEXT:    andl $-8, %esp
; X86-SSSE3-NEXT:    subl $8, %esp
; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
; X86-SSSE3-NEXT:    movl 12(%ebp), %ecx
; X86-SSSE3-NEXT:    pxor %xmm0, %xmm0
; X86-SSSE3-NEXT:    pinsrw $0, %ecx, %xmm0
; X86-SSSE3-NEXT:    movl 16(%ebp), %ecx
; X86-SSSE3-NEXT:    pinsrw $1, %ecx, %xmm0
; X86-SSSE3-NEXT:    movl 20(%ebp), %ecx
; X86-SSSE3-NEXT:    pinsrw $2, %ecx, %xmm0
; X86-SSSE3-NEXT:    movl 24(%ebp), %ecx
; X86-SSSE3-NEXT:    pinsrw $3, %ecx, %xmm0
; X86-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
; X86-SSSE3-NEXT:    movq %xmm0, (%esp)
; X86-SSSE3-NEXT:    movq (%esp), %mm0
; X86-SSSE3-NEXT:    paddd %mm0, %mm0
; X86-SSSE3-NEXT:    movq %mm0, (%eax)
; X86-SSSE3-NEXT:    movl %ebp, %esp
; X86-SSSE3-NEXT:    popl %ebp
; X86-SSSE3-NEXT:    retl
;
; X64-SSE2-LABEL: build_v8i8_0123zzzu:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pxor %xmm0, %xmm0
; X64-SSE2-NEXT:    pinsrw $0, %esi, %xmm0
; X64-SSE2-NEXT:    pinsrw $1, %edx, %xmm0
; X64-SSE2-NEXT:    pinsrw $2, %ecx, %xmm0
; X64-SSE2-NEXT:    pinsrw $3, %r8d, %xmm0
; X64-SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT:    packuswb %xmm0, %xmm0
; X64-SSE2-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT:    paddd %mm0, %mm0
; X64-SSE2-NEXT:    movq %mm0, (%rdi)
; X64-SSE2-NEXT:    retq
;
; X64-SSSE3-LABEL: build_v8i8_0123zzzu:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    pxor %xmm0, %xmm0
; X64-SSSE3-NEXT:    pinsrw $0, %esi, %xmm0
; X64-SSSE3-NEXT:    pinsrw $1, %edx, %xmm0
; X64-SSSE3-NEXT:    pinsrw $2, %ecx, %xmm0
; X64-SSSE3-NEXT:    pinsrw $3, %r8d, %xmm0
; X64-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
; X64-SSSE3-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT:    paddd %mm0, %mm0
; X64-SSSE3-NEXT:    movq %mm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: build_v8i8_0123zzzu:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $0, %esi, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $1, %edx, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
; X64-AVX-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
; X64-AVX-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT:    paddd %mm0, %mm0
; X64-AVX-NEXT:    movq %mm0, (%rdi)
; X64-AVX-NEXT:    retq
  ; Only %a0-%a3 are used; the upper half of the vector is zero except the
  ; final undef lane, so codegen only needs to materialize the low 32 bits.
  %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
  %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
  %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
  %4 = insertelement <8 x i8> %3, i8 %a3, i32 3
  %5 = insertelement <8 x i8> %4, i8 0, i32 4
  %6 = insertelement <8 x i8> %5, i8 0, i32 5
  %7 = insertelement <8 x i8> %6, i8 0, i32 6
  %8 = insertelement <8 x i8> %7, i8 undef, i32 7
  %9 = bitcast <8 x i8> %8 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
  store x86_mmx %10, x86_mmx *%p0
  ret void
}
; Build a <8 x i8> with only lane 0 defined (%a0), lanes 1-4 undef and
; lanes 5-7 zero; all targets can collapse this to a single 32-bit movd.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v8i8_0uuuuzzz(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-LABEL: build_v8i8_0uuuuzzz:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: build_v8i8_0uuuuzzz:
; X64:       # %bb.0:
; X64-NEXT:    movd %esi, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
  %2 = insertelement <8 x i8> %1, i8 undef, i32 1
  %3 = insertelement <8 x i8> %2, i8 undef, i32 2
  %4 = insertelement <8 x i8> %3, i8 undef, i32 3
  %5 = insertelement <8 x i8> %4, i8 undef, i32 4
  %6 = insertelement <8 x i8> %5, i8 0, i32 5
  %7 = insertelement <8 x i8> %6, i8 0, i32 6
  %8 = insertelement <8 x i8> %7, i8 0, i32 7
  %9 = bitcast <8 x i8> %8 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
  store x86_mmx %10, x86_mmx *%p0
  ret void
}
; Build a <8 x i8> with lane 0 = %a0, lanes 1-6 zero and lane 7 undef;
; expected to lower to a zero-extended byte load into an MMX register.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v8i8_0zzzzzzu(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-LABEL: build_v8i8_0zzzzzzu:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movd %eax, %mm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    paddd %mm0, %mm0
; X86-NEXT:    movq %mm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: build_v8i8_0zzzzzzu:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movd %eax, %mm0
; X64-NEXT:    paddd %mm0, %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    retq
  %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
  %2 = insertelement <8 x i8> %1, i8 0, i32 1
  %3 = insertelement <8 x i8> %2, i8 0, i32 2
  %4 = insertelement <8 x i8> %3, i8 0, i32 3
  %5 = insertelement <8 x i8> %4, i8 0, i32 4
  %6 = insertelement <8 x i8> %5, i8 0, i32 5
  %7 = insertelement <8 x i8> %6, i8 0, i32 6
  %8 = insertelement <8 x i8> %7, i8 undef, i32 7
  %9 = bitcast <8 x i8> %8 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
  store x86_mmx %10, x86_mmx *%p0
  ret void
}
; Broadcast/splat: all eight i8 lanes are %a0. Checks each target's splat
; lowering (scalar or-chains, pshuflw/pshufd, pshufb, vpbroadcastb).
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v8i8_00000000(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-MMX-LABEL: build_v8i8_00000000:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    movzbl 12(%ebp), %ecx
; X86-MMX-NEXT:    movl %ecx, %edx
; X86-MMX-NEXT:    shll $8, %edx
; X86-MMX-NEXT:    orl %ecx, %edx
; X86-MMX-NEXT:    movl %edx, %ecx
; X86-MMX-NEXT:    shll $16, %ecx
; X86-MMX-NEXT:    orl %edx, %ecx
; X86-MMX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    movl %ecx, (%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE2-LABEL: build_v8i8_00000000:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    movl %esp, %ebp
; X86-SSE2-NEXT:    andl $-8, %esp
; X86-SSE2-NEXT:    subl $8, %esp
; X86-SSE2-NEXT:    movl 8(%ebp), %eax
; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE2-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT:    packuswb %xmm0, %xmm0
; X86-SSE2-NEXT:    movq %xmm0, (%esp)
; X86-SSE2-NEXT:    movq (%esp), %mm0
; X86-SSE2-NEXT:    paddd %mm0, %mm0
; X86-SSE2-NEXT:    movq %mm0, (%eax)
; X86-SSE2-NEXT:    movl %ebp, %esp
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSSE3-LABEL: build_v8i8_00000000:
; X86-SSSE3:       # %bb.0:
; X86-SSSE3-NEXT:    pushl %ebp
; X86-SSSE3-NEXT:    movl %esp, %ebp
; X86-SSSE3-NEXT:    andl $-8, %esp
; X86-SSSE3-NEXT:    subl $8, %esp
; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
; X86-SSSE3-NEXT:    pxor %xmm0, %xmm0
; X86-SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    pshufb %xmm0, %xmm1
; X86-SSSE3-NEXT:    movq %xmm1, (%esp)
; X86-SSSE3-NEXT:    movq (%esp), %mm0
; X86-SSSE3-NEXT:    paddd %mm0, %mm0
; X86-SSSE3-NEXT:    movq %mm0, (%eax)
; X86-SSSE3-NEXT:    movl %ebp, %esp
; X86-SSSE3-NEXT:    popl %ebp
; X86-SSSE3-NEXT:    retl
;
; X64-SSE2-LABEL: build_v8i8_00000000:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movd %esi, %xmm0
; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT:    packuswb %xmm0, %xmm0
; X64-SSE2-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT:    paddd %mm0, %mm0
; X64-SSE2-NEXT:    movq %mm0, (%rdi)
; X64-SSE2-NEXT:    retq
;
; X64-SSSE3-LABEL: build_v8i8_00000000:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movd %esi, %xmm0
; X64-SSSE3-NEXT:    pxor %xmm1, %xmm1
; X64-SSSE3-NEXT:    pshufb %xmm1, %xmm0
; X64-SSSE3-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT:    paddd %mm0, %mm0
; X64-SSSE3-NEXT:    movq %mm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: build_v8i8_00000000:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovd %esi, %xmm0
; X64-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT:    paddd %mm0, %mm0
; X64-AVX1-NEXT:    movq %mm0, (%rdi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: build_v8i8_00000000:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovd %esi, %xmm0
; X64-AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-AVX2-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT:    paddd %mm0, %mm0
; X64-AVX2-NEXT:    movq %mm0, (%rdi)
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: build_v8i8_00000000:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovd %esi, %xmm0
; X64-AVX512-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-AVX512-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT:    paddd %mm0, %mm0
; X64-AVX512-NEXT:    movq %mm0, (%rdi)
; X64-AVX512-NEXT:    retq
  ; Every lane takes %a0; %a1-%a7 are deliberately unused.
  %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
  %2 = insertelement <8 x i8> %1, i8 %a0, i32 1
  %3 = insertelement <8 x i8> %2, i8 %a0, i32 2
  %4 = insertelement <8 x i8> %3, i8 %a0, i32 3
  %5 = insertelement <8 x i8> %4, i8 %a0, i32 4
  %6 = insertelement <8 x i8> %5, i8 %a0, i32 5
  %7 = insertelement <8 x i8> %6, i8 %a0, i32 6
  %8 = insertelement <8 x i8> %7, i8 %a0, i32 7
  %9 = bitcast <8 x i8> %8 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
  store x86_mmx %10, x86_mmx *%p0
  ret void
}
;
; v2f32
;
; Build a <2 x float> from both float args, bitcast to x86_mmx and treat the
; bits as 2 x i32 for paddd (the floats are never used arithmetically).
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v2f32_01(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_01:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    flds 12(%ebp)
; X86-MMX-NEXT:    flds 16(%ebp)
; X86-MMX-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    fstps (%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE-LABEL: build_v2f32_01:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $32, %esp
; X86-SSE-NEXT:    movl 8(%ebp), %eax
; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT:    movaps %xmm0, (%esp)
; X86-SSE-NEXT:    movq (%esp), %mm0
; X86-SSE-NEXT:    paddd %mm0, %mm0
; X86-SSE-NEXT:    movq %mm0, (%eax)
; X86-SSE-NEXT:    movl %ebp, %esp
; X86-SSE-NEXT:    popl %ebp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: build_v2f32_01:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT:    paddd %mm0, %mm0
; X64-SSE-NEXT:    movq %mm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: build_v2f32_01:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT:    paddd %mm0, %mm0
; X64-AVX-NEXT:    movq %mm0, (%rdi)
; X64-AVX-NEXT:    retq
  %1 = insertelement <2 x float> undef, float %a0, i32 0
  %2 = insertelement <2 x float> %1, float %a1, i32 1
  %3 = bitcast <2 x float> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build a <2 x float> with lane 0 = %a0 and lane 1 = +0.0 ('z'); %a1 is
; intentionally unused. Bitcast to x86_mmx and paddd with itself.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v2f32_0z(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_0z:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    flds 12(%ebp)
; X86-MMX-NEXT:    fstps (%esp)
; X86-MMX-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE-LABEL: build_v2f32_0z:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $32, %esp
; X86-SSE-NEXT:    movl 8(%ebp), %eax
; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movaps %xmm0, (%esp)
; X86-SSE-NEXT:    movq (%esp), %mm0
; X86-SSE-NEXT:    paddd %mm0, %mm0
; X86-SSE-NEXT:    movq %mm0, (%eax)
; X86-SSE-NEXT:    movl %ebp, %esp
; X86-SSE-NEXT:    popl %ebp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: build_v2f32_0z:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    xorps %xmm1, %xmm1
; X64-SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X64-SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT:    paddd %mm0, %mm0
; X64-SSE-NEXT:    movq %mm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: build_v2f32_0z:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT:    paddd %mm0, %mm0
; X64-AVX1-NEXT:    movq %mm0, (%rdi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: build_v2f32_0z:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT:    paddd %mm0, %mm0
; X64-AVX2-NEXT:    movq %mm0, (%rdi)
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: build_v2f32_0z:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT:    paddd %mm0, %mm0
; X64-AVX512-NEXT:    movq %mm0, (%rdi)
; X64-AVX512-NEXT:    retq
  %1 = insertelement <2 x float> undef, float %a0, i32 0
  %2 = insertelement <2 x float> %1, float 0.0, i32 1
  %3 = bitcast <2 x float> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build a <2 x float> with lane 0 undef ('u') and lane 1 = %a1; codegen is
; free to splat %a1 into the undef lane. %a0 is intentionally unused.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v2f32_u1(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_u1:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    flds 16(%ebp)
; X86-MMX-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE2-LABEL: build_v2f32_u1:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    movl %esp, %ebp
; X86-SSE2-NEXT:    andl $-16, %esp
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movl 8(%ebp), %eax
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
; X86-SSE2-NEXT:    movq (%esp), %mm0
; X86-SSE2-NEXT:    paddd %mm0, %mm0
; X86-SSE2-NEXT:    movq %mm0, (%eax)
; X86-SSE2-NEXT:    movl %ebp, %esp
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSSE3-LABEL: build_v2f32_u1:
; X86-SSSE3:       # %bb.0:
; X86-SSSE3-NEXT:    pushl %ebp
; X86-SSSE3-NEXT:    movl %esp, %ebp
; X86-SSSE3-NEXT:    andl $-16, %esp
; X86-SSSE3-NEXT:    subl $32, %esp
; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
; X86-SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X86-SSSE3-NEXT:    movaps %xmm0, (%esp)
; X86-SSSE3-NEXT:    movq (%esp), %mm0
; X86-SSSE3-NEXT:    paddd %mm0, %mm0
; X86-SSSE3-NEXT:    movq %mm0, (%eax)
; X86-SSSE3-NEXT:    movl %ebp, %esp
; X86-SSSE3-NEXT:    popl %ebp
; X86-SSSE3-NEXT:    retl
;
; X64-SSE2-LABEL: build_v2f32_u1:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,2,3]
; X64-SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT:    paddd %mm0, %mm0
; X64-SSE2-NEXT:    movq %mm0, (%rdi)
; X64-SSE2-NEXT:    retq
;
; X64-SSSE3-LABEL: build_v2f32_u1:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; X64-SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT:    paddd %mm0, %mm0
; X64-SSSE3-NEXT:    movq %mm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: build_v2f32_u1:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; X64-AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT:    paddd %mm0, %mm0
; X64-AVX1-NEXT:    movq %mm0, (%rdi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: build_v2f32_u1:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vbroadcastss %xmm1, %xmm0
; X64-AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT:    paddd %mm0, %mm0
; X64-AVX2-NEXT:    movq %mm0, (%rdi)
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: build_v2f32_u1:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm1, %xmm0
; X64-AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT:    paddd %mm0, %mm0
; X64-AVX512-NEXT:    movq %mm0, (%rdi)
; X64-AVX512-NEXT:    retq
  %1 = insertelement <2 x float> undef, float undef, i32 0
  %2 = insertelement <2 x float> %1, float %a1, i32 1
  %3 = bitcast <2 x float> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Build a <2 x float> with lane 0 = +0.0 ('z') and lane 1 = %a1; unlike the
; _u1 case the low lane must really be zero. %a0 is intentionally unused.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v2f32_z1(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_z1:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    flds 16(%ebp)
; X86-MMX-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    movl $0, (%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE-LABEL: build_v2f32_z1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $32, %esp
; X86-SSE-NEXT:    movl 8(%ebp), %eax
; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    xorps %xmm1, %xmm1
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X86-SSE-NEXT:    movaps %xmm0, (%esp)
; X86-SSE-NEXT:    movq (%esp), %mm0
; X86-SSE-NEXT:    paddd %mm0, %mm0
; X86-SSE-NEXT:    movq %mm0, (%eax)
; X86-SSE-NEXT:    movl %ebp, %esp
; X86-SSE-NEXT:    popl %ebp
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: build_v2f32_z1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    xorps %xmm0, %xmm0
; X64-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; X64-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; X64-SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE-NEXT:    paddd %mm0, %mm0
; X64-SSE-NEXT:    movq %mm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: build_v2f32_z1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX-NEXT:    paddd %mm0, %mm0
; X64-AVX-NEXT:    movq %mm0, (%rdi)
; X64-AVX-NEXT:    retq
  %1 = insertelement <2 x float> undef, float 0.0, i32 0
  %2 = insertelement <2 x float> %1, float %a1, i32 1
  %3 = bitcast <2 x float> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}
; Splat: both <2 x float> lanes take %a0. Checks each target's float splat
; lowering (fsts, shufps, movsldup, vbroadcastss). %a1 is intentionally unused.
; CHECK lines are autogenerated by update_llc_test_checks.py.
define void @build_v2f32_00(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_00:
; X86-MMX:       # %bb.0:
; X86-MMX-NEXT:    pushl %ebp
; X86-MMX-NEXT:    movl %esp, %ebp
; X86-MMX-NEXT:    andl $-8, %esp
; X86-MMX-NEXT:    subl $8, %esp
; X86-MMX-NEXT:    movl 8(%ebp), %eax
; X86-MMX-NEXT:    flds 12(%ebp)
; X86-MMX-NEXT:    fsts {{[0-9]+}}(%esp)
; X86-MMX-NEXT:    fstps (%esp)
; X86-MMX-NEXT:    movq (%esp), %mm0
; X86-MMX-NEXT:    paddd %mm0, %mm0
; X86-MMX-NEXT:    movq %mm0, (%eax)
; X86-MMX-NEXT:    movl %ebp, %esp
; X86-MMX-NEXT:    popl %ebp
; X86-MMX-NEXT:    retl
;
; X86-SSE2-LABEL: build_v2f32_00:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    movl %esp, %ebp
; X86-SSE2-NEXT:    andl $-16, %esp
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movl 8(%ebp), %eax
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
; X86-SSE2-NEXT:    movq (%esp), %mm0
; X86-SSE2-NEXT:    paddd %mm0, %mm0
; X86-SSE2-NEXT:    movq %mm0, (%eax)
; X86-SSE2-NEXT:    movl %ebp, %esp
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSSE3-LABEL: build_v2f32_00:
; X86-SSSE3:       # %bb.0:
; X86-SSSE3-NEXT:    pushl %ebp
; X86-SSSE3-NEXT:    movl %esp, %ebp
; X86-SSSE3-NEXT:    andl $-16, %esp
; X86-SSSE3-NEXT:    subl $32, %esp
; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
; X86-SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X86-SSSE3-NEXT:    movaps %xmm0, (%esp)
; X86-SSSE3-NEXT:    movq (%esp), %mm0
; X86-SSSE3-NEXT:    paddd %mm0, %mm0
; X86-SSSE3-NEXT:    movq %mm0, (%eax)
; X86-SSSE3-NEXT:    movl %ebp, %esp
; X86-SSSE3-NEXT:    popl %ebp
; X86-SSSE3-NEXT:    retl
;
; X64-SSE2-LABEL: build_v2f32_00:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSE2-NEXT:    paddd %mm0, %mm0
; X64-SSE2-NEXT:    movq %mm0, (%rdi)
; X64-SSE2-NEXT:    retq
;
; X64-SSSE3-LABEL: build_v2f32_00:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-SSSE3-NEXT:    paddd %mm0, %mm0
; X64-SSSE3-NEXT:    movq %mm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: build_v2f32_00:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX1-NEXT:    paddd %mm0, %mm0
; X64-AVX1-NEXT:    movq %mm0, (%rdi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: build_v2f32_00:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX2-NEXT:    paddd %mm0, %mm0
; X64-AVX2-NEXT:    movq %mm0, (%rdi)
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: build_v2f32_00:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-AVX512-NEXT:    paddd %mm0, %mm0
; X64-AVX512-NEXT:    movq %mm0, (%rdi)
; X64-AVX512-NEXT:    retq
  %1 = insertelement <2 x float> undef, float %a0, i32 0
  %2 = insertelement <2 x float> %1, float %a0, i32 1
  %3 = bitcast <2 x float> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}