; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
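;
; Illustrative sketch (an assumption, not copied from sse-builtins.c) of the kind of C
; source clang lowers to the IR patterns tested below:
;   __m128 add_ps(__m128 a, __m128 b) { return _mm_add_ps(a, b); } // lowers to 'fadd <4 x float>'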
define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ps:
; X32: # BB#0:
; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_ps:
; X64: # BB#0:
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: retq
  %res = fadd <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ss:
; X32: # BB#0:
; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_ss:
; X64: # BB#0:
; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fadd = fadd float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fadd, i32 0
  ret <4 x float> %res
}
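; Note (added): with only SSE1 enabled (-sse2), v4i32 bitwise operations are not legal, so the
; 'and'/'andnot' tests below end up scalarized through GPRs and the stack, as the checks show.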
define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_and_ps:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %esi
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_and_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: andl %eax, %edx
; X64-NEXT: shrq $32, %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: movq %rcx, %rdi
; X64-NEXT: andl %r8d, %ecx
; X64-NEXT: shrq $32, %r8
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %r8d, %edi
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %eax, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}
define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_andnot_ps:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %esi
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: notl %edx
; X32-NEXT: notl %esi
; X32-NEXT: notl %ecx
; X32-NEXT: notl %eax
; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_andnot_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: shrq $32, %rdx
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT: notl %eax
; X64-NEXT: andl %edi, %eax
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: notl %ecx
; X64-NEXT: andl %r8d, %ecx
; X64-NEXT: shrq $32, %r8
; X64-NEXT: notl %esi
; X64-NEXT: notl %edx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %r8d, %edx
; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl %edi, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}
define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ps:
; X32: # BB#0:
; X32-NEXT: cmpeqps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_ps:
; X64: # BB#0:
; X64-NEXT: cmpeqps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ss:
; X32: # BB#0:
; X32-NEXT: cmpeqss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_ss:
; X64: # BB#0:
; X64-NEXT: cmpeqss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
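;
; Note (added): the trailing i8 immediate of @llvm.x86.sse.cmp.ss selects the SSE compare
; predicate: 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord. The ge/gt tests below
; have no direct encoding, so the operands are swapped and le/lt (or nle/nlt) are used, with
; the _ss forms re-inserting the low lane into %a0 via shufflevector.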
define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ps:
; X32: # BB#0:
; X32-NEXT: cmpleps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_ps:
; X64: # BB#0:
; X64-NEXT: cmpleps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp ole <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ss:
; X32: # BB#0:
; X32-NEXT: cmpless %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_ss:
; X64: # BB#0:
; X64-NEXT: cmpless %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ps:
; X32: # BB#0:
; X32-NEXT: cmpltps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_ps:
; X64: # BB#0:
; X64-NEXT: cmpltps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp olt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ss:
; X32: # BB#0:
; X32-NEXT: cmpltss %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_ss:
; X64: # BB#0:
; X64-NEXT: cmpltss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}
define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ps:
; X32: # BB#0:
; X32-NEXT: cmpleps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_ps:
; X64: # BB#0:
; X64-NEXT: cmpleps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp ole <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ss:
; X32: # BB#0:
; X32-NEXT: cmpless %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_ss:
; X64: # BB#0:
; X64-NEXT: cmpless %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ps:
; X32: # BB#0:
; X32-NEXT: cmpltps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_ps:
; X64: # BB#0:
; X64-NEXT: cmpltps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp olt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ss:
; X32: # BB#0:
; X32-NEXT: cmpltss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_ss:
; X64: # BB#0:
; X64-NEXT: cmpltss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ps:
; X32: # BB#0:
; X32-NEXT: cmpneqps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_ps:
; X64: # BB#0:
; X64-NEXT: cmpneqps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp une <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}
define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ss:
; X32: # BB#0:
; X32-NEXT: cmpneqss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_ss:
; X64: # BB#0:
; X64-NEXT: cmpneqss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ps:
; X32: # BB#0:
; X32-NEXT: cmpnleps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_ps:
; X64: # BB#0:
; X64-NEXT: cmpnleps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp ugt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ss:
; X32: # BB#0:
; X32-NEXT: cmpnless %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_ss:
; X64: # BB#0:
; X64-NEXT: cmpnless %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ps:
; X32: # BB#0:
; X32-NEXT: cmpnltps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_ps:
; X64: # BB#0:
; X64-NEXT: cmpnltps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp uge <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ss:
; X32: # BB#0:
; X32-NEXT: cmpnltss %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_ss:
; X64: # BB#0:
; X64-NEXT: cmpnltss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}
define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ps:
; X32: # BB#0:
; X32-NEXT: cmpnleps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_ps:
; X64: # BB#0:
; X64-NEXT: cmpnleps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp ugt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ss:
; X32: # BB#0:
; X32-NEXT: cmpnless %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_ss:
; X64: # BB#0:
; X64-NEXT: cmpnless %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ps:
; X32: # BB#0:
; X32-NEXT: cmpnltps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_ps:
; X64: # BB#0:
; X64-NEXT: cmpnltps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp uge <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ss:
; X32: # BB#0:
; X32-NEXT: cmpnltss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_ss:
; X64: # BB#0:
; X64-NEXT: cmpnltss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
  ret <4 x float> %res
}
define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ps:
; X32: # BB#0:
; X32-NEXT: cmpordps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_ps:
; X64: # BB#0:
; X64-NEXT: cmpordps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp ord <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ss:
; X32: # BB#0:
; X32-NEXT: cmpordss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_ss:
; X64: # BB#0:
; X64-NEXT: cmpordss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ps:
; X32: # BB#0:
; X32-NEXT: cmpunordps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_ps:
; X64: # BB#0:
; X64-NEXT: cmpunordps %xmm1, %xmm0
; X64-NEXT: retq
  %cmp = fcmp uno <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ss:
; X32: # BB#0:
; X32-NEXT: cmpunordss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_ss:
; X64: # BB#0:
; X64-NEXT: cmpunordss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
  ret <4 x float> %res
}
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comieq_ss:
; X32: # BB#0:
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
; X32-NEXT: andb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comieq_ss:
; X64: # BB#0:
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
; X64-NEXT: andb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comige_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comige_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comigt_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comigt_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: seta %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comile_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comile_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm0, %xmm1
; X64-NEXT: setae %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comilt_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comilt_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm0, %xmm1
; X64-NEXT: seta %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comineq_ss:
; X32: # BB#0:
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
; X32-NEXT: orb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comineq_ss:
; X64: # BB#0:
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
; X64-NEXT: orb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvt_ss2si:
; X32: # BB#0:
; X32-NEXT: cvtss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvt_ss2si:
; X64: # BB#0:
; X64-NEXT: cvtss2si %xmm0, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32: # BB#0:
; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64: # BB#0:
; X64-NEXT: cvtsi2ssl %edi, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
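;
; Note (added): on x86-64 the float return value of @test_mm_cvtss_f32 is already in %xmm0,
; so the body is empty; the i386 ABI returns float in st(0), hence the store/flds pair below.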
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_f32:
; X64: # BB#0:
; X64-NEXT: retq
  %res = extractelement <4 x float> %a0, i32 0
  ret float %res
}

define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_si32:
; X32: # BB#0:
; X32-NEXT: cvtss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_si32:
; X64: # BB#0:
; X64-NEXT: cvtss2si %xmm0, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}

define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si:
; X32: # BB#0:
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttss_si:
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
; X32: # BB#0:
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttss_si32:
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %res
}

define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ps:
; X32: # BB#0:
; X32-NEXT: divps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_ps:
; X64: # BB#0:
; X64-NEXT: divps %xmm1, %xmm0
; X64-NEXT: retq
  %res = fdiv <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ss:
; X32: # BB#0:
; X32-NEXT: divss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_ss:
; X64: # BB#0:
; X64-NEXT: divss %xmm1, %xmm0
; X64-NEXT: retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fdiv = fdiv float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
  ret <4 x float> %res
}
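;
; Note (added): the _MM_GET_* helpers below read MXCSR via @llvm.x86.sse.stmxcsr through a
; stack slot and then mask the field of interest: 0x1F80 exception mask, 0x3F exception
; state, 0x8000 flush-to-zero, 0x6000 rounding control.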
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: andl $8064, %eax # imm = 0x1F80
; X32-NEXT: popl %ecx
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: andl $8064, %eax # imm = 0x1F80
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 8064
  ret i32 %4
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: andl $63, %eax
; X32-NEXT: popl %ecx
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: andl $63, %eax
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 63
  ret i32 %4
}
define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: andl $32768, %eax # imm = 0x8000
; X32-NEXT: popl %ecx
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: andl $32768, %eax # imm = 0x8000
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 32768
  ret i32 %4
}

define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X32-LABEL: test_MM_GET_ROUNDING_MODE:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: andl $24576, %eax # imm = 0x6000
; X32-NEXT: popl %ecx
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_ROUNDING_MODE:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: andl $24576, %eax # imm = 0x6000
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 24576
  ret i32 %4
}

define i32 @test_mm_getcsr() nounwind {
; X32-LABEL: test_mm_getcsr:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: popl %ecx
; X32-NEXT: retl
;
; X64-LABEL: test_mm_getcsr:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  ret i32 %3
}
define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_ps:
; X64: # BB#0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 16
  ret <4 x float> %res
}

define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps1:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_ps1:
; X64: # BB#0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ss:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_ss:
; X64: # BB#0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: retq
  %ld = load float, float* %a0, align 1
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load1_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load1_ps:
; X64: # BB#0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}
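;
; Note (added): _mm_loadh_pi/_mm_loadl_pi are expressed below as a <2 x float> load widened by
; shufflevector; with only SSE1 available, fast-isel on x86-64 bounces the 64-bit value through
; the stack before rebuilding the vector, as the checks show.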
define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadh_pi:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadh_pi:
; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: shrq $32, %rax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadl_pi:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadl_pi:
; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: shrq $32, %rax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %res
}
define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadr_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadr_ps:
; X64: # BB#0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %ld = load <4 x float>, <4 x float>* %arg0, align 16
  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadu_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadu_ps:
; X64: # BB#0:
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 1
  ret <4 x float> %res
}

define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ps:
; X32: # BB#0:
; X32-NEXT: maxps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_ps:
; X64: # BB#0:
; X64-NEXT: maxps %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ss:
; X32: # BB#0:
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_ss:
; X64: # BB#0:
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ps:
; X32: # BB#0:
; X32-NEXT: minps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_ps:
; X64: # BB#0:
; X64-NEXT: minps %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ss:
; X32: # BB#0:
; X32-NEXT: minss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_ss:
; X64: # BB#0:
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
|
|
|
|
|
|
define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_move_ss:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_move_ss:
; X64: # BB#0:
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movehl_ps:
; X32: # BB#0:
; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movehl_ps:
; X64: # BB#0:
; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X64-NEXT: retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movelh_ps:
; X32: # BB#0:
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movelh_ps:
; X64: # BB#0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_movemask_ps:
; X32: # BB#0:
; X32-NEXT: movmskps %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movemask_ps:
; X64: # BB#0:
; X64-NEXT: movmskps %xmm0, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ps:
; X32: # BB#0:
; X32-NEXT: mulps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_ps:
; X64: # BB#0:
; X64-NEXT: mulps %xmm1, %xmm0
; X64-NEXT: retq
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ss:
; X32: # BB#0:
; X32-NEXT: mulss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_ss:
; X64: # BB#0:
; X64-NEXT: mulss %xmm1, %xmm0
; X64-NEXT: retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fmul = fmul float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fmul, i32 0
  ret <4 x float> %res
}

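; Note: with SSE1 but not SSE2 enabled, <4 x i32> is not a legal type, so the
; integer bitwise or below is scalarised through the stack rather than using orps.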
define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_or_ps:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %esi
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: orl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: orl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_or_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: orl %eax, %edx
; X64-NEXT: shrq $32, %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: movq %rcx, %rdi
; X64-NEXT: orl %r8d, %ecx
; X64-NEXT: shrq $32, %r8
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: orl %r8d, %edi
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: orl %eax, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = or <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

define void @test_mm_prefetch(i8* %a0) {
; X32-LABEL: test_mm_prefetch:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: prefetchnta (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_prefetch:
; X64: # BB#0:
; X64-NEXT: prefetchnta (%rdi)
; X64-NEXT: retq
  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
  ret void
}
declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone

define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ps:
; X32: # BB#0:
; X32-NEXT: rcpps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rcp_ps:
; X64: # BB#0:
; X64-NEXT: rcpps %xmm0, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ss:
; X32: # BB#0:
; X32-NEXT: rcpss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rcp_ss:
; X64: # BB#0:
; X64-NEXT: rcpss %xmm0, %xmm0
; X64-NEXT: retq
  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %rcp, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ps:
; X32: # BB#0:
; X32-NEXT: rsqrtps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rsqrt_ps:
; X64: # BB#0:
; X64-NEXT: rsqrtps %xmm0, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ss:
; X32: # BB#0:
; X32-NEXT: rsqrtss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rsqrt_ss:
; X64: # BB#0:
; X64-NEXT: rsqrtss %xmm0, %xmm0
; X64-NEXT: retq
  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %rsqrt, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

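; The _MM_SET_* MXCSR helper tests below all follow the same read-modify-write
; pattern: stmxcsr to a stack slot, mask the relevant field, or in the new value,
; then reload the control register with ldmxcsr.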
define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
; X32-NEXT: stmxcsr (%ecx)
; X32-NEXT: movl (%esp), %edx
; X32-NEXT: andl $-8065, %edx # imm = 0xE07F
; X32-NEXT: orl %eax, %edx
; X32-NEXT: movl %edx, (%esp)
; X32-NEXT: ldmxcsr (%ecx)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT: andl $-8065, %ecx # imm = 0xE07F
; X64-NEXT: orl %edi, %ecx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: ldmxcsr (%rax)
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -8065
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone

define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
; X32-NEXT: stmxcsr (%ecx)
; X32-NEXT: movl (%esp), %edx
; X32-NEXT: andl $-64, %edx
; X32-NEXT: orl %eax, %edx
; X32-NEXT: movl %edx, (%esp)
; X32-NEXT: ldmxcsr (%ecx)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT: andl $-64, %ecx
; X64-NEXT: orl %edi, %ecx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: ldmxcsr (%rax)
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -64
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
; X32-NEXT: stmxcsr (%ecx)
; X32-NEXT: movl (%esp), %edx
; X32-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF
; X32-NEXT: orl %eax, %edx
; X32-NEXT: movl %edx, (%esp)
; X32-NEXT: ldmxcsr (%ecx)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF
; X64-NEXT: orl %edi, %ecx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: ldmxcsr (%rax)
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -32769
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

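; The set/setr tests build a <4 x float> from scalar arguments; on X32 the values
; are loaded from the stack and interleaved with unpcklps/movlhps.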
define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_set_ps:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ps:
; X64: # BB#0:
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; X64-NEXT: movaps %xmm3, %xmm0
; X64-NEXT: retq
  %res0 = insertelement <4 x float> undef, float %a3, i32 0
  %res1 = insertelement <4 x float> %res0, float %a2, i32 1
  %res2 = insertelement <4 x float> %res1, float %a1, i32 2
  %res3 = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; X32-LABEL: test_mm_set_ps1:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ps1:
; X64: # BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
  %res0 = insertelement <4 x float> undef, float %a0, i32 0
  %res1 = insertelement <4 x float> %res0, float %a0, i32 1
  %res2 = insertelement <4 x float> %res1, float %a0, i32 2
  %res3 = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_ROUNDING_MODE:
; X32: # BB#0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
; X32-NEXT: stmxcsr (%ecx)
; X32-NEXT: movl (%esp), %edx
; X32-NEXT: andl $-24577, %edx # imm = 0x9FFF
; X32-NEXT: orl %eax, %edx
; X32-NEXT: movl %edx, (%esp)
; X32-NEXT: ldmxcsr (%ecx)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_ROUNDING_MODE:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT: andl $-24577, %ecx # imm = 0x9FFF
; X64-NEXT: orl %edi, %ecx
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: ldmxcsr (%rax)
; X64-NEXT: retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -24577
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X32-LABEL: test_mm_set_ss:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ss:
; X64: # BB#0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
  %res0 = insertelement <4 x float> undef, float %a0, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; X32-LABEL: test_mm_set1_ps:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_ps:
; X64: # BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
  %res0 = insertelement <4 x float> undef, float %a0, i32 0
  %res1 = insertelement <4 x float> %res0, float %a0, i32 1
  %res2 = insertelement <4 x float> %res1, float %a0, i32 2
  %res3 = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define void @test_mm_setcsr(i32 %a0) nounwind {
; X32-LABEL: test_mm_setcsr:
; X32: # BB#0:
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: ldmxcsr (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setcsr:
; X64: # BB#0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT: ldmxcsr (%rax)
; X64-NEXT: retq
  %st = alloca i32, align 4
  store i32 %a0, i32* %st, align 4
  %bc = bitcast i32* %st to i8*
  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
  ret void
}

define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_setr_ps:
; X32: # BB#0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_ps:
; X64: # BB#0:
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
  %res0 = insertelement <4 x float> undef, float %a0, i32 0
  %res1 = insertelement <4 x float> %res0, float %a1, i32 1
  %res2 = insertelement <4 x float> %res1, float %a2, i32 2
  %res3 = insertelement <4 x float> %res2, float %a3, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_setzero_ps() {
; X32-LABEL: test_mm_setzero_ps:
; X32: # BB#0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setzero_ps:
; X64: # BB#0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
  ret <4 x float> zeroinitializer
}

define void @test_mm_sfence() nounwind {
; X32-LABEL: test_mm_sfence:
; X32: # BB#0:
; X32-NEXT: sfence
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sfence:
; X64: # BB#0:
; X64-NEXT: sfence
; X64-NEXT: retq
  call void @llvm.x86.sse.sfence()
  ret void
}
declare void @llvm.x86.sse.sfence() nounwind readnone

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_shuffle_ps:
; X32: # BB#0:
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64: # BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X64-NEXT: retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ps:
; X32: # BB#0:
; X32-NEXT: sqrtps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sqrt_ps:
; X64: # BB#0:
; X64-NEXT: sqrtps %xmm0, %xmm0
; X64-NEXT: retq
  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ss:
; X32: # BB#0:
; X32-NEXT: sqrtss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sqrt_ss:
; X64: # BB#0:
; X64-NEXT: sqrtss %xmm0, %xmm0
; X64-NEXT: retq
  %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %sqrt, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps1:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_ps1:
; X64: # BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ss:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_ss:
; X64: # BB#0:
; X64-NEXT: movss %xmm0, (%rdi)
; X64-NEXT: retq
  %ext = extractelement <4 x float> %a1, i32 0
  store float %ext, float* %a0, align 1
  ret void
}

define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store1_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store1_ps:
; X64: # BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storeh_ps:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, 4(%eax)
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeh_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 1
  store i64 %ext, i64* %ptr
  ret void
}

define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storel_ps:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl (%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, 4(%eax)
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storel_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 0
  store i64 %ext, i64* %ptr
  ret void
}

define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storer_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storer_ps:
; X64: # BB#0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

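; Unaligned stores below use movups; the nontemporal store test checks movntps.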
define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storeu_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeu_ps:
; X64: # BB#0:
; X64-NEXT: movups %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 1
  ret void
}

define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_stream_ps:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_ps:
; X64: # BB#0:
; X64-NEXT: movntps %xmm0, (%rdi)
; X64-NEXT: retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
  ret void
}

define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ps:
; X32: # BB#0:
; X32-NEXT: subps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_ps:
; X64: # BB#0:
; X64-NEXT: subps %xmm1, %xmm0
; X64-NEXT: retq
  %res = fsub <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ss:
; X32: # BB#0:
; X32-NEXT: subss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_ss:
; X64: # BB#0:
; X64-NEXT: subss %xmm1, %xmm0
; X64-NEXT: retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fsub = fsub float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fsub, i32 0
  ret <4 x float> %res
}

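; _MM_TRANSPOSE4_PS: the 4x4 transpose is expressed as unpcklps/unpckhps pairs
; followed by movlhps/movhlps to recombine the rows.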
define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X32-LABEL: test_MM_TRANSPOSE4_PS:
; X32: # BB#0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movaps (%esi), %xmm0
; X32-NEXT: movaps (%edx), %xmm1
; X32-NEXT: movaps (%ecx), %xmm2
; X32-NEXT: movaps (%eax), %xmm3
; X32-NEXT: movaps %xmm0, %xmm4
; X32-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X32-NEXT: movaps %xmm2, %xmm5
; X32-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X32-NEXT: movaps %xmm4, %xmm1
; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X32-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X32-NEXT: movaps %xmm0, %xmm3
; X32-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X32-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X32-NEXT: movaps %xmm1, (%esi)
; X32-NEXT: movaps %xmm5, (%edx)
; X32-NEXT: movaps %xmm3, (%ecx)
; X32-NEXT: movaps %xmm2, (%eax)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: test_MM_TRANSPOSE4_PS:
; X64: # BB#0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: movaps (%rsi), %xmm1
; X64-NEXT: movaps (%rdx), %xmm2
; X64-NEXT: movaps (%rcx), %xmm3
; X64-NEXT: movaps %xmm0, %xmm4
; X64-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X64-NEXT: movaps %xmm2, %xmm5
; X64-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: movaps %xmm4, %xmm1
; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X64-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X64-NEXT: movaps %xmm0, %xmm3
; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X64-NEXT: movaps %xmm1, (%rdi)
; X64-NEXT: movaps %xmm5, (%rsi)
; X64-NEXT: movaps %xmm3, (%rdx)
; X64-NEXT: movaps %xmm2, (%rcx)
; X64-NEXT: retq
  %row0 = load <4 x float>, <4 x float>* %a0, align 16
  %row1 = load <4 x float>, <4 x float>* %a1, align 16
  %row2 = load <4 x float>, <4 x float>* %a2, align 16
  %row3 = load <4 x float>, <4 x float>* %a3, align 16
  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  store <4 x float> %res0, <4 x float>* %a0, align 16
  store <4 x float> %res1, <4 x float>* %a1, align 16
  store <4 x float> %res2, <4 x float>* %a2, align 16
  store <4 x float> %res3, <4 x float>* %a3, align 16
  ret void
}

define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_ss:
; X32: # BB#0:
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
; X32-NEXT: andb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomieq_ss:
; X64: # BB#0:
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
; X64-NEXT: andb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomige_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomigt_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: seta %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_ss:
; X32: # BB#0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomile_ss:
; X64: # BB#0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm1
; X64-NEXT: setae %al
; X64-NEXT: retq
  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
|
|
|
|
; X32-LABEL: test_mm_ucomilt_ss:
|
|
|
|
; X32: # BB#0:
|
2016-07-08 06:50:23 +08:00
|
|
|
; X32-NEXT: xorl %eax, %eax
|
2016-05-20 00:55:52 +08:00
|
|
|
; X32-NEXT: ucomiss %xmm0, %xmm1
|
|
|
|
; X32-NEXT: seta %al
|
|
|
|
; X32-NEXT: retl
|
|
|
|
;
|
|
|
|
; X64-LABEL: test_mm_ucomilt_ss:
|
|
|
|
; X64: # BB#0:
|
2016-07-08 06:50:23 +08:00
|
|
|
; X64-NEXT: xorl %eax, %eax
|
2016-05-20 00:55:52 +08:00
|
|
|
; X64-NEXT: ucomiss %xmm0, %xmm1
|
|
|
|
; X64-NEXT: seta %al
|
|
|
|
; X64-NEXT: retq
|
|
|
|
%res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
|
|
|
|
ret i32 %res
|
|
|
|
}
|
|
|
|
declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
|
|
|
|
|
|
|
|
define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_ss:
; X32: # BB#0:
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
; X32-NEXT: orb %al, %cl
; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomineq_ss:
; X64: # BB#0:
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
; X64-NEXT: orb %al, %cl
; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
ret i32 %res
}
declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone

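; NOTE: _mm_undefined_ps only has to yield an undef value, so no instructions
; are expected before the return.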
define <4 x float> @test_mm_undefined_ps() {
; X32-LABEL: test_mm_undefined_ps:
; X32: # BB#0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_ps:
; X64: # BB#0:
; X64-NEXT: retq
ret <4 x float> undef
}

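; NOTE: The unpackhi/unpacklo shuffle masks (<2,6,3,7> and <0,4,1,5>)
; interleave the upper and lower element pairs of the two sources, which maps
; directly onto unpckhps/unpcklps.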
define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpackhi_ps:
; X32: # BB#0:
; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_ps:
; X64: # BB#0:
; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
ret <4 x float> %res
}

define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpacklo_ps:
; X32: # BB#0:
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_ps:
; X64: # BB#0:
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x float> %res
}

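; NOTE: With only SSE1 enabled the <4 x i32> xor is presumably not legal as a
; vector operation here, so both operands are spilled to the stack, xor'd as
; scalar integer pieces, and the result is rebuilt with movss/unpcklps/movlhps.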
define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_xor_ps:
; X32: # BB#0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %esi
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: xorl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: leal -4(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test_mm_xor_ps:
; X64: # BB#0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: xorl %eax, %edx
; X64-NEXT: shrq $32, %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: movq %rcx, %rdi
; X64-NEXT: xorl %r8d, %ecx
; X64-NEXT: shrq $32, %r8
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: shrq $32, %rdi
; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %r8d, %edi
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %eax, %esi
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
%res = xor <4 x i32> %arg0, %arg1
%bc = bitcast <4 x i32> %res to <4 x float>
ret <4 x float> %bc
}

!0 = !{i32 1}